def parse_data(data_file_full_path):
    """Parse the data file into the X matrix and the Y vector.

    Every line is split on ", ". Lines holding a missing field ("?") are
    skipped; each remaining line is handed to ``parse`` and three summary
    statistics are appended to its feature vector as extra features.

    Args:
        data_file_full_path: Path of the data file to read.

    Returns:
        A ``(final_x_matrix, final_y_vector)`` tuple of parallel lists. When
        reading or parsing fails, the error is printed and whatever was
        collected up to that point is returned.
    """
    final_x_matrix = []
    final_y_vector = []
    try:
        # The context manager closes the file even if a row fails to parse.
        with open(data_file_full_path) as data_file:
            for line in data_file:
                split_line = line.split(', ')
                # A row with a missing field cannot yield a complete feature
                # vector, so it is left out of the result.
                if "?" in split_line:
                    continue
                x_value, y_value = parse(split_line)
                # Each statistic is computed over the vector as it stands at
                # that moment: the mean also covers the appended median, and
                # the variance covers both appended values.
                x_value.append(statistics.median(x_value))
                x_value.append(statistics.mean(x_value))
                x_value.append(statistics.variance(x_value))
                final_x_matrix.append(x_value)
                final_y_vector.append(y_value)
    except Exception as err:
        print("Error: ", err)
    # Returning after the try block (not from ``finally``) lets
    # KeyboardInterrupt / SystemExit propagate instead of being discarded.
    return final_x_matrix, final_y_vector
def parse_data(data_file_full_path):
    """Parse the data file into the X matrix and the Y vector.

    Args:
        data_file_full_path: Path of the data file to read.

    Returns:
        A 4-tuple ``(data_size, kept, final_x_matrix, final_y_vector)``:
        the number of lines in the file, the number of lines that were
        actually parsed, the list of feature vectors and the list of labels.
    """
    final_x_matrix = []
    final_y_vector = []

    # Read everything up front so the handle is released before parsing.
    with open(data_file_full_path) as f:
        lines = f.readlines()
    data_size = len(lines)  # original data size (number of objects provided)

    for line in lines:
        # Drop commas and the newline, then split on single spaces to get
        # the list of raw feature values for this object.
        line_list_presentation = line.replace(',', '').replace('\n', '').split(' ')
        try:
            # Validate the object and repair feature values where possible.
            line_list_presentation = data_valid_fixer(line_list_presentation)
        except ValueError:
            # One (or more) feature holds a non-valid value: ignore the row.
            continue
        x, y_value = parse(line_list_presentation)
        final_x_matrix.append(x)
        final_y_vector.append(y_value)

    return data_size, len(final_y_vector), final_x_matrix, final_y_vector
def parse_test_data(test_file_full_path, means_and_frequents):
    """Parse the test file into the X matrix and the Y vector.

    The file is split into rows on newlines and each row into fields on
    ", ". The first and the last row are discarded, the remaining rows are
    passed through ``addMissingValues`` together with
    ``means_and_frequents``, and every resulting row is parsed.

    Args:
        test_file_full_path: Path of the test file to read.
        means_and_frequents: Replacement values forwarded unchanged to
            ``addMissingValues``.

    Returns:
        A ``(final_x_matrix, final_y_vector)`` tuple of parallel lists. Both
        are empty when the file holds no rows besides the first and last.
    """
    with open(test_file_full_path) as f:
        rows = [row.split(', ') for row in f.read().split('\n')]

    # Drop the first and the last entry. Slicing (unlike remove()/pop())
    # does not raise on a file with fewer than two rows.
    data_rows = rows[1:-1]
    if not data_rows:
        return [], []

    # Replace the missing values with the means and most frequent values.
    filled_rows = addMissingValues(data_rows, means_and_frequents)

    final_x_matrix = []
    final_y_vector = []
    for row in filled_rows:
        parsed = parse(row)
        final_x_matrix.append(parsed[0])
        final_y_vector.append(parsed[1])
    return final_x_matrix, final_y_vector
def parse_data(data_file_full_path):
    """Parse the data file into the X matrix and the Y vector.

    Each line is split on ", " and handed to ``parse``; the first element
    of the result goes into the X matrix and the second into the Y vector.

    Args:
        data_file_full_path: Path of the data file to read.

    Returns:
        A ``(final_x_matrix, final_y_vector)`` tuple of parallel lists.
    """
    final_x_matrix = []
    final_y_vector = []
    # ``with`` guarantees the file is closed even when parse() raises.
    with open(data_file_full_path) as f:
        for line in f:
            ans = parse(line.split(", "))
            final_x_matrix.append(ans[0])
            final_y_vector.append(ans[1])
    return final_x_matrix, final_y_vector
def parse_data(data_file_full_path):
    """Parse the data file into the X matrix and the Y vector.

    Lines starting with '|' are ignored. Every other line is split on ", "
    and parsed. Rows whose line contains a "?" are collected separately;
    once the whole file has been read, every negative entry in those rows
    is replaced by the average of that feature, and the repaired rows are
    appended after the complete ones.

    Args:
        data_file_full_path: Path of the data file to read.

    Returns:
        A ``(final_x_matrix, final_y_vector)`` tuple of parallel lists:
        complete rows first, rows with replaced values last. Both lists are
        empty when the file holds no data rows.
    """
    feature_count = 14
    final_x_matrix = []
    final_y_vector = []
    missing_vectors_x = []
    missing_vectors_y = []
    sums = [0] * feature_count
    # Per-feature number of values that went into ``sums``; dividing by the
    # total row count would pull the average down, because rows where the
    # feature is missing contribute nothing to the sum.
    observed = [0] * feature_count

    with open(data_file_full_path, "r") as f:
        for line in f:
            if line.startswith('|'):
                continue
            # ``in`` also catches a "?" in the very first column.
            is_missed = "?" in line
            x, y = parse(line.split(', '))
            for col, value in enumerate(x):
                as_int = int(value)
                # Negative entries are treated as missing and are excluded
                # from the average.
                if as_int > -1:
                    sums[col] += as_int
                    observed[col] += 1
            if is_missed:
                missing_vectors_x.append(x)
                missing_vectors_y.append(y)
            else:
                final_x_matrix.append(x)
                final_y_vector.append(y)

    # Average of each feature; a feature with no usable value averages to 0.
    avg = [
        total / seen if seen else 0.0
        for total, seen in zip(sums, observed)
    ]

    # Put the feature average in place of every missing value.
    for row in missing_vectors_x:
        for col, value in enumerate(row):
            if value < 0:
                row[col] = avg[col]

    final_x_matrix += missing_vectors_x
    final_y_vector += missing_vectors_y
    return final_x_matrix, final_y_vector
def parse_data(data_file_full_path):
    """Build the X matrix and the Y vector from the data file.

    Each line is split on commas and every field is stripped of surrounding
    whitespace. A line is used only when it has exactly 15 fields and none
    of them is the missing-value marker "?"; all other lines are skipped.

    Returns:
        A ``(final_x_matrix, final_y_vector)`` tuple of parallel lists.
    """
    missing_marker = "?"
    expected_fields = 15
    features = []
    labels = []
    with open(data_file_full_path) as handle:
        for raw_line in handle:
            # TODO - think about readlines() in order to access the file just once
            fields = list(map(str.strip, raw_line.split(',')))
            # Guard clause: drop rows with a missing field or a wrong width.
            if missing_marker in fields or len(fields) != expected_fields:
                continue
            feature_row, label = parse(fields)
            features.append(feature_row)
            labels.append(label)
    return features, labels
def parse_data(data_file_full_path):
    """Parse the data file into the X matrix and the Y vector.

    Lines containing ``MISSING_DATA`` are skipped, as are lines for which
    ``parse_line`` does not yield exactly ``DATA_FEATURES`` items.

    Args:
        data_file_full_path: Path of the data file to read.

    Returns:
        A 3-tuple ``(final_x_matrix, final_y_vector, corrupted_data)``.
        ``corrupted_data`` is True when ``parse`` returned ``None`` for the
        features or the label of at least one row.
    """
    corrupted_data = False
    final_x_matrix = []
    final_y_vector = []
    # ``with`` releases the file handle even if parsing raises.
    with open(data_file_full_path) as f:
        for line in f:
            # Skip lines with partial data.
            if MISSING_DATA in line:
                continue
            data = parse_line(line)
            # Make sure the data line has all the features.
            if len(data) != DATA_FEATURES:
                continue
            x, y = parse(data)
            if x is None or y is None:
                # NOTE(review): the flag is raised only for rows parse()
                # rejects; rows of the wrong length are skipped silently.
                # Confirm this is the intended meaning of the flag.
                corrupted_data = True
                continue
            final_x_matrix.append(x)
            final_y_vector.append(y)
    return final_x_matrix, final_y_vector, corrupted_data
def parse_data(data_file_full_path):
    """Parse the data file into the X matrix and the Y vector.

    The file is read twice: once as raw text, split into rows on newlines
    and into fields on ", ", and once through ``pd.read_csv`` with explicit
    column names to obtain the replacement values for missing entries.

    Args:
        data_file_full_path: Path of the data file to read.

    Returns:
        A 3-tuple ``(final_x_matrix, final_y_vector,
        means_and_most_frequents)``. The third item is whatever
        ``getMeansAndMostFrequent`` produced, returned so it can be reused
        when parsing the test data.
    """
    columns = [
        "age", "workclass", "fnlwgt", "education", "education-num",
        "martial-status", "occupation", "relationship", "race", "sex",
        "capital-gain", "capital-loss", "hours-per-week", "native-country",
        "salary",
    ]

    # Split the data into rows, and every row into entries on ", ". The
    # context manager closes the handle as soon as the text is read.
    with open(data_file_full_path) as f:
        missing_matrix = [row.split(', ') for row in f.read().split('\n')]

    # Read the data as csv with the column names, to be fed into the
    # means / most-frequent computation.
    data = pd.read_csv(data_file_full_path, names=columns)
    means_and_most_frequents = getMeansAndMostFrequent(data)

    # Replace the missing values with the means and most frequent values.
    filled_matrix = addMissingValues(missing_matrix, means_and_most_frequents)
    # Pop the last row. NOTE(review): this assumes the file ends with a
    # newline so that the last split row is empty - confirm.
    filled_matrix.pop()

    # Parse each row into its numeric representation.
    final_x_matrix = []
    final_y_vector = []
    for row in filled_matrix:
        parsed = parse(row)
        final_x_matrix.append(parsed[0])
        final_y_vector.append(parsed[1])

    return final_x_matrix, final_y_vector, means_and_most_frequents