def process_house_votes_data(file_path):
    original_data = FileManager.get_csv_file_data_array(file_path)

    #Move the class column (originally column 0) to the final column
    original_col_with_class = 0
    columns_moved_data = DataManipulator.move_column_to_end(original_data,
                                                            original_col_with_class)

    #Group input values into bins
    data_in_bins = _bin_input_attributes(columns_moved_data)

    #Turn input features into binary values (0, 1)
    num_bins = 3
    col_idx = 0
    final_data = DataManipulator.expand_attributes_to_binary_values(
        data_in_bins, col_idx, num_bins)
    for col in data_in_bins[0]:
        col_idx += num_bins  #Columns are inserted on each iteration
        if col_idx == (len(final_data[0]) - 1):
            #Skip the final column because it holds the classification value
            break
        final_data = DataManipulator.expand_attributes_to_binary_values(
            final_data, col_idx, num_bins)
    return final_data
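#Illustrative sketch (an assumption about expand_attributes_to_binary_values,
#not the repo's DataManipulator implementation): the helper is assumed to
#replace one binned column with num_bins binary columns, one per bin. A toy,
#self-contained version of that idea for a single row:
def _example_expand_row_to_binary(row, col_idx, num_bins):
    #Build the one-hot encoding for the binned value at col_idx
    one_hot = [1 if b == row[col_idx] else 0 for b in range(num_bins)]
    #Splice the binary columns in place of the original column
    return row[:col_idx] + one_hot + row[col_idx + 1:]

#Example: bin value 2 out of 3 bins, class label kept in the last column
#_example_expand_row_to_binary([2, 1], 0, 3) -> [0, 0, 1, 1]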
def process_house_votes_data(file_path):
    #Bins the four measurement columns (sepal/petal length and width) using
    #their known value ranges, then one-hot codes them.
    original_data = FileManager.get_csv_file_data_array(file_path)

    #Group input values into bins
    num_bins = 4
    column_ranges = [
        (0, 4.3, 7.9),  #sepal length
        (1, 2.0, 4.4),  #sepal width
        (2, 1.0, 6.9),  #petal length
        (3, 0.1, 2.5),  #petal width
    ]
    data_in_bins = original_data
    for column, min_val, max_val in column_ranges:
        #Chain the result so every column's binning is kept
        data_in_bins = _bin_input_attribute(data_in_bins, column,
                                            min_val, max_val, num_bins)

    #Turn input features into binary values (0, 1)
    col_idx = 0
    final_data = DataManipulator.expand_attributes_to_binary_values(
        data_in_bins, col_idx, num_bins)
    for col in data_in_bins[0]:
        col_idx += num_bins  #Columns are inserted on each iteration
        if col_idx == (len(final_data[0]) - 1):
            #Skip the final column because it holds the classification value
            break
        final_data = DataManipulator.expand_attributes_to_binary_values(
            final_data, col_idx, num_bins)
    return final_data
def process_house_votes_data(file_path):
    original_data = FileManager.get_csv_file_data_array(file_path)

    #Drop the first column, convert the class label from {2, 4} to {0, 1},
    #and shift the bin values from 1-10 down to 0-9
    modified_data = list()
    for row in original_data:
        tmplist = row[1:]
        number_of_cols = len(tmplist)
        for col_idx in range(number_of_cols):
            if col_idx == (number_of_cols - 1):
                #Classification value: {2, 4} -> {0, 1}
                if tmplist[col_idx] == 2:
                    tmplist[col_idx] = 0
                elif tmplist[col_idx] == 4:
                    tmplist[col_idx] = 1
                else:
                    #No modification made
                    print('ERROR: Expected {2,4} but instead got ', tmplist[col_idx])
            else:
                #Normal bin value: 1-10 -> 0-9
                tmplist[col_idx] = tmplist[col_idx] - 1
        modified_data.append(tmplist)

    #Turn input features into binary values (0, 1)
    num_bins = 10
    col_idx = 0
    final_data = DataManipulator.expand_attributes_to_binary_values(
        modified_data, col_idx, num_bins)
    for col in modified_data[0]:
        col_idx += num_bins  #Columns are inserted on each iteration
        if col_idx == (len(final_data[0]) - 1):
            #Skip the final column because it holds the classification value
            break
        final_data = DataManipulator.expand_attributes_to_binary_values(
            final_data, col_idx, num_bins)
    return final_data
def main():
    parser = argparse.ArgumentParser(description='One-hot-code data')
    parser.add_argument('input_path', type=str, help='full path to input file')
    parser.add_argument('output_path', type=str, help='full path to output file')
    parser.add_argument('number_max_bins', type=int,
                        help='maximum number of bins for the output data')
    args = parser.parse_args()

    input_path = args.input_path
    output_file = args.output_path
    num_bins = args.number_max_bins

    data_frame = FileManager.get_csv_file_data_pandas(input_path)
    print('data frame head:')
    print(data_frame.head(3))

    hot_coded_data_frame = DataManipulator.one_hot_code(data_frame, num_bins)
    print('one-hot-coded data head:')
    print(hot_coded_data_frame.head(3))

    hot_coded_data_frame.to_csv(output_file, header=None, index=None,
                                sep=',', mode='a')
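#A minimal pandas sketch of the one-hot-coding step (an assumption about what
#DataManipulator.one_hot_code produces, not the repo's implementation): bin
#each feature column, expand the bins into 0/1 columns, and keep the class
#label (assumed to be the last column) untouched.
def _example_one_hot_code(data_frame, num_bins):
    import pandas as pd
    features = data_frame.iloc[:, :-1]
    binned = features.apply(lambda col: pd.cut(col, bins=num_bins, labels=False))
    hot_coded = pd.get_dummies(binned, columns=list(binned.columns))
    return pd.concat([hot_coded, data_frame.iloc[:, -1]], axis=1)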
def main(): print("Crypto Currency Trade Suggestion Bot") print("*Starting Pulling Data*") os.remove('currency_data.txt') os.system('scrapy runspider currency_attribute_getter.py') print("Data has been Updated") print('Best Trade Options') DataManipulator()
def split_data(file_path, is_random, num_groups=5, separator=','):
    #GET DATA
    original_data = FileManager.get_csv_file_data_pandas(file_path, separator)

    #Split the data into num_groups groups
    num_groups = int(num_groups)
    if is_random:
        #Use a basic random split
        groups = DataManipulator.split_data_randomly(original_data, num_groups)
    else:
        #Use a split that accounts for the class distribution across groups
        groups = DataManipulator.split_data_randomly_accounting_for_class(
            original_data, num_groups)
    return groups
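#Example usage sketch (the file path is hypothetical): build five groups and
#iterate them in a cross-validation style loop, holding one group out per pass.
def _example_cross_validation_groups(file_path='data/example.csv'):
    groups = split_data(file_path, is_random=False, num_groups=5)
    for held_out_idx in range(len(groups)):
        test_group = groups[held_out_idx]
        training_groups = [g for i, g in enumerate(groups) if i != held_out_idx]
        #...train on training_groups, evaluate on test_group...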
def main():
    #Move the class column to be the final column
    parser = argparse.ArgumentParser(
        description='Move the given column to be the last column')
    parser.add_argument('file_path_in', type=str, help='full path to input file')
    parser.add_argument('column', type=int, help='the column to move to the end')
    parser.add_argument('file_path_out', type=str, help='full path to output file')
    args = parser.parse_args()

    column = args.column
    data = FileManager.get_csv_file_data_numpy(args.file_path_in, ',')
    data = data.astype(float)  #use the builtin; np.float is removed in newer NumPy
    data_as_np = DataManipulator.move_np_column_to_end(data, column)
    np.savetxt(args.file_path_out, data_as_np, delimiter=',')
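#Illustrative numpy sketch (an assumption about what
#DataManipulator.move_np_column_to_end does, not the repo's implementation):
#drop the chosen column from its position and append it after all the others.
def _example_move_np_column_to_end(data, column):
    import numpy as np
    return np.column_stack((np.delete(data, column, axis=1), data[:, column]))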
def main():
    parser = argparse.ArgumentParser(
        description='Learn & Verify machine learning algorithms')
    parser.add_argument('all_class_filepath', type=str,
                        help='full path to input file')
    parser.add_argument('class_0_filepath', type=str,
                        help='full path to input file')
    parser.add_argument('class_1_filepath', type=str,
                        help='full path to input file')
    parser.add_argument('class_2_filepath', type=str,
                        help='full path to input file')
    parser.add_argument('fraction', type=float, default=0.66, nargs='?',
                        help='fraction of data to learn')
    parser.add_argument('num_classes', type=int, default=3, nargs='?',
                        help='number of total classes')
    args = parser.parse_args()

    #INPUTS
    print()
    print('INPUTS')
    allclass_filepath = args.all_class_filepath
    print('overall_model_file_path: ' + allclass_filepath)
    class0_filepath = args.class_0_filepath
    print('class0_filepath: ' + class0_filepath)
    class1_filepath = args.class_1_filepath
    print('class1_filepath: ' + class1_filepath)
    class2_filepath = args.class_2_filepath
    print('class2_filepath: ' + class2_filepath)
    fraction_of_data_for_learning = args.fraction
    print('fraction of data to train: ', fraction_of_data_for_learning)
    number_of_classes = args.num_classes
    print('number of classes: ', number_of_classes)

    #READ INPUT DATA
    allclass_data = FileManager.get_csv_file_data_array(allclass_filepath)
    print('number of input vectors: ', len(allclass_data))
    class0_data = FileManager.get_csv_file_data_array(class0_filepath)
    class1_data = FileManager.get_csv_file_data_array(class1_filepath)
    class2_data = FileManager.get_csv_file_data_array(class2_filepath)

    #SPLIT INPUT DATA (learning & test sets)
    print()
    allclass_data_sets = DataManipulator.split_data_in_2_randomly(
        allclass_data, fraction_of_data_for_learning)
    class0_data_sets = DataManipulator.split_data_in_2_randomly(
        class0_data, fraction_of_data_for_learning)
    class1_data_sets = DataManipulator.split_data_in_2_randomly(
        class1_data, fraction_of_data_for_learning)
    class2_data_sets = DataManipulator.split_data_in_2_randomly(
        class2_data, fraction_of_data_for_learning)
    allclass_learning_data = allclass_data_sets[0]
    print('learning data size: ', len(allclass_learning_data))
    allclass_test_data = allclass_data_sets[1]
    print('test data size: ', len(allclass_test_data))
    print()
    class0_learning_data = class0_data_sets[0]
    class0_test_data = class0_data_sets[1]
    class1_learning_data = class1_data_sets[0]
    class1_test_data = class1_data_sets[1]
    class2_learning_data = class2_data_sets[0]
    class2_test_data = class2_data_sets[1]

    #LEARN THE MODELS
    #Winnow2: one model per class
    winnow2_0 = Winnow2()
    winnow2_1 = Winnow2()
    winnow2_2 = Winnow2()
    winnow2_0_learned_weights = winnow2_0.learn_winnow2_model(class0_learning_data)
    winnow2_1_learned_weights = winnow2_1.learn_winnow2_model(class1_learning_data)
    winnow2_2_learned_weights = winnow2_2.learn_winnow2_model(class2_learning_data)
    print('Winnow2 learned weights for class 0:')
    print(winnow2_0_learned_weights)
    print()
    print('Winnow2 learned weights for class 1:')
    print(winnow2_1_learned_weights)
    print()
    print('Winnow2 learned weights for class 2:')
    print(winnow2_2_learned_weights)
    print()

    #Naive Bayes
    naive_bayes = NaiveBayes(number_of_classes)
    naive_bayes_learned_percents = naive_bayes.learn_naive_bayes_model(
        allclass_learning_data)
    print('Naive Bayes learned percentages as input[ class[ (prob0, prob1) ] ]')
    print(naive_bayes_learned_percents)
    print()

    #TEST THE MODELS
    #Winnow2
    print('Testing Winnow2 model')
    winnow2_multi_model_test_results = Winnow2.test_multiple_winnow2_models(
        allclass_test_data, [winnow2_0, winnow2_1, winnow2_2])
    print('classification attempts(', winnow2_multi_model_test_results[0],
          '), fails(', winnow2_multi_model_test_results[1],
          '), successes(', winnow2_multi_model_test_results[2], ')')
    print()

    #Naive Bayes
    print('Testing Naive Bayes model')
    naive_bayes_test_results = naive_bayes.test_naive_bayes_model(
        allclass_test_data)
    print('classification attempts(', naive_bayes_test_results[0],
          '), fails(', naive_bayes_test_results[1],
          '), successes(', naive_bayes_test_results[2], ')')
    print()
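#Hedged sketch of how the three per-class Winnow2 models could be combined for
#a multi-class decision (an assumption about test_multiple_winnow2_models, not
#the repo's implementation): score a vector with each per-class model and
#predict the class whose model scores highest.
def _example_one_vs_rest_predict(scores):
    #scores: one value per class, e.g. each model's weighted sum for the same vector
    return max(range(len(scores)), key=lambda class_idx: scores[class_idx])

#_example_one_vs_rest_predict([0.2, 1.7, 0.9]) -> 1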
def main():
    parser = argparse.ArgumentParser(
        description='Learn & Verify machine learning algorithms')
    parser.add_argument('file_path', type=str, help='full path to input file')
    parser.add_argument('fraction', type=float, default=0.66, nargs='?',
                        help='fraction of data to learn')
    parser.add_argument('num_classes', type=int, default=2, nargs='?',
                        help='number of total classes')
    args = parser.parse_args()

    #INPUTS
    print()
    print('INPUTS')
    file_path = args.file_path
    print('filepath: ' + file_path)
    fraction_of_data_for_learning = args.fraction
    print('fraction of data to train: ', fraction_of_data_for_learning)
    number_of_classes = args.num_classes
    print('number of classes: ', number_of_classes)

    #READ INPUT DATA
    input_data = FileManager.get_csv_file_data_array(file_path)
    print('number of input vectors: ', len(input_data))

    #SPLIT INPUT DATA (learning & test sets)
    print()
    data_sets = DataManipulator.split_data_in_2_randomly(
        input_data, fraction_of_data_for_learning)
    learning_data = data_sets[0]
    print('learning data size: ', len(learning_data))
    test_data = data_sets[1]
    print('test data size: ', len(test_data))
    print()

    #LEARN THE MODELS
    #Winnow2
    winnow2 = Winnow2()  #default values for alpha, threshold, & start weight
    winnow2_learned_weights = winnow2.learn_winnow2_model(learning_data)
    print('Winnow2 learned weights:')
    print(winnow2_learned_weights)
    print()

    #Naive Bayes
    naive_bayes = NaiveBayes(number_of_classes)
    naive_bayes_learned_percents = naive_bayes.learn_naive_bayes_model(
        learning_data)
    print('Naive Bayes learned percentages as input[ class[ (prob0, prob1) ] ]')
    print(naive_bayes_learned_percents)
    print()

    #TEST THE MODELS
    #Winnow2
    print('Testing Winnow2 model')
    winnow2_test_results = winnow2.test_winnow2_model(test_data)
    print('classification attempts(', winnow2_test_results[0],
          '), fails(', winnow2_test_results[1],
          '), successes(', winnow2_test_results[2], ')')
    print()

    #Naive Bayes
    print('Testing Naive Bayes model')
    naive_bayes_test_results = naive_bayes.test_naive_bayes_model(test_data)
    print('classification attempts(', naive_bayes_test_results[0],
          '), fails(', naive_bayes_test_results[1],
          '), successes(', naive_bayes_test_results[2], ')')
    print()
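#The test routines report (attempts, fails, successes); a simple accuracy
#figure can be derived from that tuple, for example:
def _example_accuracy(test_results):
    attempts, fails, successes = test_results
    return successes / attempts if attempts else 0.0

#_example_accuracy((100, 12, 88)) -> 0.88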