def main():
    """Command-line entry point: one-hot code a CSV file.

    Positional command-line arguments:
        input_path: full path to the input file.
        output_path: full path to the output file.
        number_max_bins: maximum number of bins, forwarded to
            ``DataManipulator.one_hot_code``.

    The first three rows of both the loaded data and the encoded data are
    printed, then the encoded data is written to ``output_path``.
    """
    cli = argparse.ArgumentParser(description='One-hot-code data')
    cli.add_argument('input_path', type=str,
                     help='full path to input file')
    cli.add_argument('output_path', type=str,
                     help='full path to output file')
    cli.add_argument('number_max_bins', type=int,
                     help='number of max bins for output data')
    options = cli.parse_args()

    # Load the raw data and show a short preview.
    raw_frame = FileManager.get_csv_file_data_pandas(options.input_path)
    print('data frame head:')
    print(raw_frame.head(3))

    # Encode the data and preview the result.
    encoded_frame = DataManipulator.one_hot_code(
        raw_frame, options.number_max_bins)
    print('data one hot coded head:')
    print(encoded_frame.head(3))

    # NOTE: mode='a' appends, so an existing output file is extended
    # rather than replaced.
    encoded_frame.to_csv(
        options.output_path,
        header=None,
        index=None,
        sep=',',
        mode='a',
    )
def main():
    """Command-line entry point: feature selection plus K-Means clustering.

    Positional command-line arguments:
        file_path: full path to the input file.
        num_clusters: optional number of total clusters (defaults to 2).

    Echoes the inputs, previews the loaded data, runs
    ``StepwiseForwardSelection.run_sfs`` and prints the chosen features,
    the chosen data, the best silhouette coefficient, and the clusters and
    centroids of the chosen model.
    """
    cli = argparse.ArgumentParser(
        description=(
            'Run Stepwise Forward Selection && K-Means clustering '
            'on given data'
        )
    )
    cli.add_argument('file_path', type=str, help='full path to input file')
    cli.add_argument('num_clusters', type=int, default=2, nargs='?',
                     help='number of total clusters')
    options = cli.parse_args()
    source_path = options.file_path
    cluster_count = options.num_clusters

    # Echo the inputs.
    print()
    print('INPUTS')
    print('input file path:', source_path)
    print('number of clusters: ', cluster_count)
    print()

    # Read the input data and preview it.
    frame = FileManager.get_csv_file_data_pandas(source_path)
    print('head of input data')
    print(frame.head())
    print()

    # No preprocessing is done here: according to the 'names' files and
    # visual inspection, these data sets contain 0 missing values, so no
    # rows are dropped.

    # Run feature selection together with K-Means.
    selector = StepwiseForwardSelection(frame)
    picked_features, picked_data = selector.run_sfs(cluster_count)

    # Report the outcome.
    print()
    print('RESULT:')
    print('Chosen features (indexed from 0):')
    print(picked_features)
    print('Chosen data:')
    print(picked_data)
    print('Best silhouette coefficient:')
    print(selector.base_performance)
    print('Clusters:')
    print(selector.chosen_model.clusters)
    print('Centroids:')
    print(selector.chosen_model.centroids)
    print()
def split_data(file_path, is_random, num_groups=5, separator=','):
    """Load a CSV file and split its contents into groups.

    Args:
        file_path: path passed to ``FileManager.get_csv_file_data_pandas``.
        is_random: when this compares equal to ``True`` the data is split
            with ``DataManipulator.split_data_randomly``; otherwise
            ``DataManipulator.split_data_randomly_accounting_for_class``
            is used.
        num_groups: number of groups to produce; converted with ``int()``
            before use. Defaults to 5.
        separator: field separator forwarded to the CSV loader.
            Defaults to ','.

    Returns:
        The groups produced by the selected ``DataManipulator`` function.
    """
    loaded = FileManager.get_csv_file_data_pandas(file_path, separator)
    group_count = int(num_groups)

    # Equality with True (not general truthiness) selects the basic
    # random split; any other value takes the class-aware split.
    if is_random == True:
        return DataManipulator.split_data_randomly(loaded, group_count)

    return DataManipulator.split_data_randomly_accounting_for_class(
        loaded, group_count)