def main():
    """Normalize every CSV in the configured folder and build a combined CSV.

    Reads folder paths and column configuration from the JSON file named on
    the command line, writes one normalized CSV per input file into the
    NORMALIZED_CSV_FOLDER_NAME subfolder, then concatenates all outputs into
    constants.COMBINED_CSV_FILENAME.
    """
    json_file_path = process_command_line_args()
    json_manager = JsonManager(json_file_path)
    csv_folder = json_manager.get_csv_path()
    normalized_folder = json_manager.get_normalized_path()
    feature_columns = json_manager.get_feature_columns()
    label_columns = json_manager.get_label_columns()
    lag_features = json_manager.get_lag_features()
    lag_window_length = json_manager.get_sliding_window_length()

    destination_path = constants.add_folder_to_directory(
        constants.NORMALIZED_CSV_FOLDER_NAME, normalized_folder)

    for file in os.listdir(csv_folder):
        complete_file_path = os.fsdecode(os.path.join(csv_folder, file))
        if not is_file_CSV(file):
            continue
        normalized_filename = make_modified_filename(file, CSV_NAME_EXTENSION)
        normalized_file_path = os.fsdecode(os.path.join(
            destination_path, normalized_filename))
        # Context managers guarantee both handles are closed even if a
        # malformed row raises mid-file (the original leaked them on error).
        with open(complete_file_path) as current_csv_obj, \
                open(normalized_file_path, mode='w') as normalized_csv_obj:
            csv_reader = csv.reader(
                current_csv_obj, delimiter=constants.CSV_DELIMITER)
            csv_writer = csv.writer(
                normalized_csv_obj,
                delimiter=constants.CSV_DELIMITER,
                quotechar=constants.CSV_QUOTECHAR,
                quoting=csv.QUOTE_MINIMAL)

            # One sliding-window queue per lag feature, pre-filled with empty
            # strings so the first rows emit blanks until the window fills.
            all_lag_queues = [
                [""] * lag_window_length for _ in lag_features]

            header_row = list(feature_columns)
            header_row.append(constants.LABEL_COLUMN_NAME)
            csv_writer.writerow(header_row)

            label_indices = list(label_columns.values())
            header_row_being_read = True
            for timeseries_row in csv_reader:
                if header_row_being_read:
                    # Skip the source file's own header line.
                    header_row_being_read = False
                    continue
                label_values = [
                    timeseries_row[index] for index in label_indices]
                # First non-empty label column wins; None means "unlabeled".
                label_value = next(
                    (value for value in label_values if value), None)
                if label_value:
                    new_normalize_row = []
                    for column_name, column_index in feature_columns.items():
                        if column_name in lag_features:
                            index = lag_features.index(column_name)
                            lagged_feature = update_lag_feature_queue(
                                all_lag_queues[index],
                                timeseries_row[column_index])
                            new_normalize_row.append(lagged_feature)
                        else:
                            new_normalize_row.append(
                                timeseries_row[feature_columns[column_name]])
                    new_normalize_row.append(label_value)
                    csv_writer.writerow(new_normalize_row)
                else:
                    # Unlabeled row: still advance the lag queues so the
                    # sliding windows stay aligned with the timeseries.
                    for column_index, column_name in enumerate(lag_features):
                        value = timeseries_row[feature_columns[column_name]]
                        update_lag_feature_queue(
                            all_lag_queues[column_index], value)

    # Rebuild the combined CSV from scratch each run so stale data never
    # gets re-concatenated into itself.
    combined_csv_file_path = os.path.join(
        destination_path, constants.COMBINED_CSV_FILENAME)
    if os.path.exists(combined_csv_file_path):
        os.remove(combined_csv_file_path)
    combined_csv = pd.concat(
        [pd.read_csv(os.fsdecode(os.path.join(destination_path, f)))
         for f in os.listdir(destination_path)])
    combined_csv.to_csv(os.fsdecode(combined_csv_file_path),
                        index=False, encoding='utf-8-sig')
def run_normalize(json_file_path):
    """Normalize every CSV named by *json_file_path*'s config and combine them.

    Like the command-line entry point, but callable as a library function.
    Column layout is derived from each file's header row via
    generate_feature_col_dictionary, and (optionally) a "last action taken"
    column is carried forward from the most recent label value.

    Side effects: sets the module-level ``add_last_action_taken`` flag,
    recreates the normalized output folder, and writes the combined CSV.
    """
    global add_last_action_taken
    print(f"Normalizing started using {json_file_path}")
    json_manager = JsonManager(json_file_path)
    csv_folder = json_manager.get_csv_path()
    normalized_folder = json_manager.get_normalized_path()
    feature_list = json_manager.get_feature_columns()
    label_columns = json_manager.get_label_columns()
    lag_features = json_manager.get_lag_features()
    lag_window_length = json_manager.get_sliding_window_length()
    add_last_action_taken = json_manager.get_add_last_action_taken()

    # Start from a clean output folder so stale per-file results never
    # leak into the combined CSV.
    constants.remove_folder_if_exists(
        constants.NORMALIZED_CSV_FOLDER_NAME, normalized_folder)
    destination_path = constants.add_folder_to_directory(
        constants.NORMALIZED_CSV_FOLDER_NAME, normalized_folder)

    for file in os.listdir(csv_folder):
        complete_file_path = os.fsdecode(os.path.join(csv_folder, file))
        # Reset per input file: the first labeled row of a file has no
        # preceding action.
        last_action_taken = None
        if not is_file_CSV(file):
            continue
        print(f"Reading in csv: {complete_file_path}")
        normalized_filename = make_modified_filename(file, CSV_NAME_EXTENSION)
        normalized_file_path = os.fsdecode(os.path.join(
            destination_path, normalized_filename))
        # Context managers close both handles even if a row raises
        # (the original leaked the handles on error).
        with open(complete_file_path) as current_csv_obj, \
                open(normalized_file_path, mode='w') as normalized_csv_obj:
            csv_reader = csv.reader(
                current_csv_obj, delimiter=constants.CSV_DELIMITER)
            csv_writer = csv.writer(
                normalized_csv_obj,
                delimiter=constants.CSV_DELIMITER,
                quotechar=constants.CSV_QUOTECHAR,
                quoting=csv.QUOTE_MINIMAL)

            # One sliding-window queue per lag feature, pre-filled with
            # empty strings until the window fills.
            all_lag_queues = [
                [""] * lag_window_length for _ in lag_features]

            header_row = list(feature_list)
            if add_last_action_taken:
                header_row.append(constants.LAST_ACTION_TAKEN_COLUMN_NAME)
            header_row.append(constants.LABEL_COLUMN_NAME)
            csv_writer.writerow(header_row)

            header_row_being_read = True
            for timeseries_row in csv_reader:
                if header_row_being_read:
                    # Resolve column indices from this file's header row.
                    feature_columns = generate_feature_col_dictionary(
                        timeseries_row, feature_list, False)
                    label_indices = list(
                        generate_feature_col_dictionary(
                            timeseries_row, label_columns, True).values())
                    header_row_being_read = False
                    continue
                label_values = [
                    timeseries_row[index] for index in label_indices]
                # First non-empty label column wins; None means "unlabeled".
                label_value = next(
                    (value for value in label_values if value), None)
                if label_value:
                    new_normalize_row = []
                    for column_name, column_index in feature_columns.items():
                        if column_name in lag_features:
                            index = lag_features.index(column_name)
                            lagged_feature = update_lag_feature_queue(
                                all_lag_queues[index],
                                timeseries_row[column_index])
                            new_normalize_row.append(lagged_feature)
                        elif column_name == \
                                constants.LAST_ACTION_TAKEN_COLUMN_NAME:
                            new_normalize_row.append(last_action_taken)
                        else:
                            new_normalize_row.append(
                                timeseries_row[feature_columns[column_name]])
                    new_normalize_row.append(label_value)
                    # The label just emitted becomes the "last action" for
                    # the next labeled row.
                    last_action_taken = label_value
                    csv_writer.writerow(new_normalize_row)
                else:
                    # Unlabeled row: still advance the lag queues so the
                    # sliding windows stay aligned with the timeseries.
                    for column_index, column_name in enumerate(lag_features):
                        value = timeseries_row[feature_columns[column_name]]
                        update_lag_feature_queue(
                            all_lag_queues[column_index], value)

    combined_csv_file_path = os.path.join(
        destination_path, constants.COMBINED_CSV_FILENAME)
    if os.path.exists(combined_csv_file_path):
        os.remove(combined_csv_file_path)
    combined_csv = pd.concat(
        [pd.read_csv(os.fsdecode(os.path.join(destination_path, f)))
         for f in os.listdir(destination_path)])
    combined_csv.to_csv(os.fsdecode(combined_csv_file_path),
                        index=False, encoding='utf-8-sig')
    # NOTE(review): normalized_file_path is unbound if csv_folder held no
    # CSV files — original behavior kept; confirm callers guarantee input.
    print(f"Normalizing finished, results in {normalized_file_path}")
def main():
    """Hot-encode the combined normalized CSV into a model-ready matrix.

    Loads the combined CSV written by the normalize step, converts binary
    features to -1/0/1 ints, one-hot encodes categorical features, label-
    encodes the label column, and writes the concatenated result with
    numpy.savetxt. Also writes the row format string and the label classes
    to OUTPUT_LOG_FILE for downstream decoding.
    """
    json_file_path = process_command_line_args()
    json_manager = JsonManager(json_file_path)
    feature_columns = json_manager.get_feature_columns()
    categorical_features = json_manager.get_categorical_features()
    binary_features = json_manager.get_binary_features()
    hot_encoded_path = json_manager.get_hot_encoded_path()
    normalized_folder = os.fsdecode(os.path.join(
        json_manager.get_normalized_path(),
        constants.NORMALIZED_CSV_FOLDER_NAME))
    combined_csv_file = os.fsdecode(os.path.join(
        normalized_folder,
        constants.COMBINED_CSV_FILENAME))

    features_data = pd.read_csv(combined_csv_file, usecols=feature_columns)
    for binary_variable in binary_features:
        # Missing binary values become -1; "* 1" coerces booleans to 0/1.
        features_data[binary_variable] = features_data[binary_variable].fillna(
            value=-1)
        features_data[binary_variable] = features_data[binary_variable] * 1
    # BUG FIX: this array was previously stored as true_false_columns_array,
    # leaving binary_columns_array (referenced in np.concatenate below)
    # undefined and raising NameError at runtime.
    binary_columns_array = features_data[binary_features].to_numpy()

    # hot encoded features
    hot_encoded_array, hot_encoded_header = hot_encode_features(
        features_data, categorical_features)

    # remove hot encoded and binary features from features_data dataframe
    features_data = features_data.drop(
        columns=categorical_features + binary_features)
    features_data_array = features_data.to_numpy()

    # encode labels
    labels_data = pd.read_csv(
        combined_csv_file, usecols=[constants.LABEL_COLUMN_NAME])
    label_encoder, labels_column_array = encode_label_column(labels_data)

    # hot-encoded columns, then binary, then numerical, then encoded labels
    final_csv = np.concatenate(
        (hot_encoded_array, binary_columns_array,
         features_data_array, labels_column_array),
        axis=constants.COLUMN_AXIS)

    hot_encoded_folder = constants.add_folder_to_directory(
        constants.HOT_ENCODED_CSV_FOLDER_NAME, hot_encoded_path)
    hot_encoded_file_path = os.fsdecode(os.path.join(
        hot_encoded_folder, constants.HOT_ENCODED_CSV_FILENAME))
    if os.path.exists(hot_encoded_file_path):
        os.remove(hot_encoded_file_path)

    # make_formatter_string(hot_encoded_header, numerical_columns, label_column)
    hot_encode_fmt = "%i," * len(
        hot_encoded_header + binary_features)  # format hot encoded columns to ints
    feature_data_fmt = "%1.3f," * len(
        features_data.columns)  # format numerical columns to doubles
    total_fmt = hot_encode_fmt + feature_data_fmt + "%i"  # for label

    final_header = ','.join(
        str(i) for i in (hot_encoded_header + binary_features
                         + list(features_data.columns)))
    final_header += "," + constants.LABEL_COLUMN_NAME  # for label

    np.savetxt(hot_encoded_file_path, final_csv,
               fmt=total_fmt,
               header=final_header,
               delimiter=constants.CSV_DELIMITER,
               comments='')

    # Record the row format and the label classes so consumers can decode.
    # "with" closes the log file even if a write fails.
    with open(OUTPUT_LOG_FILE, "w") as f:
        f.write("{}\n".format(total_fmt))
        f.write(str((label_encoder.classes_).tolist()))
def run_hotencode(json_file_path):
    """Hot-encode the combined normalized CSV (library entry point).

    Like the command-line variant, but resolves feature column indices from
    the combined CSV's header and optionally treats the "last action taken"
    column as an extra categorical feature. Recreates the hot-encoded output
    folder, writes the encoded matrix with numpy.savetxt, and logs the row
    format plus label classes to OUTPUT_LOG_FILE.

    Side effect: sets the module-level ``add_last_action_taken`` flag.
    """
    global add_last_action_taken
    print(f"Hot encoding started using {json_file_path}")
    json_manager = JsonManager(json_file_path)
    feature_list = json_manager.get_feature_columns()
    categorical_features = json_manager.get_categorical_features()
    add_last_action_taken = json_manager.get_add_last_action_taken()
    if add_last_action_taken:
        # The carried-forward action column is encoded like any other
        # categorical feature.
        categorical_features.append(constants.LAST_ACTION_TAKEN_COLUMN_NAME)
    binary_features = json_manager.get_binary_features()
    hot_encoded_path = json_manager.get_hot_encoded_path()

    # Start from a clean output folder.
    constants.remove_folder_if_exists(
        constants.HOT_ENCODED_CSV_FOLDER_NAME, hot_encoded_path)
    hot_encoded_folder = constants.add_folder_to_directory(
        constants.HOT_ENCODED_CSV_FOLDER_NAME, hot_encoded_path)
    hot_encoded_file_path = os.fsdecode(os.path.join(
        hot_encoded_folder, constants.HOT_ENCODED_CSV_FILENAME))

    normalized_folder = os.fsdecode(os.path.join(
        json_manager.get_normalized_path(),
        constants.NORMALIZED_CSV_FOLDER_NAME))
    combined_csv_file = os.fsdecode(os.path.join(
        normalized_folder,
        constants.COMBINED_CSV_FILENAME))

    feature_columns = generate_feature_col_dictionary(
        get_header_row(combined_csv_file), feature_list, False)
    features_data = pd.read_csv(combined_csv_file, usecols=feature_columns)
    # Missing binary values become 0, then the columns are cast to bool.
    features_data[binary_features] = features_data[binary_features].fillna(0)
    features_data[binary_features] = features_data[binary_features].astype(
        bool)
    binary_columns_array = features_data[binary_features].to_numpy()

    # hot encoded features
    hot_encoded_array, hot_encoded_header = hot_encode_features(
        features_data, categorical_features)

    # remove hot encoded and binary features from features_data dataframe
    features_data = features_data.drop(
        columns=categorical_features + binary_features)
    features_data_array = features_data.to_numpy()

    # encode labels
    labels_data = pd.read_csv(
        combined_csv_file, usecols=[constants.LABEL_COLUMN_NAME])
    label_encoder, labels_column_array = encode_label_column(labels_data)

    # hot-encoded columns, then binary, then numerical, then encoded labels
    final_csv = np.concatenate(
        (hot_encoded_array, binary_columns_array,
         features_data_array, labels_column_array),
        axis=constants.COLUMN_AXIS)

    # make_formatter_string(hot_encoded_header, numerical_columns, label_column)
    hot_encode_fmt = "%s," * len(
        hot_encoded_header + binary_features)  # format hot encoded columns to binary features
    feature_data_fmt = "%1.3f," * len(
        features_data.columns)  # format numerical columns to doubles
    total_fmt = hot_encode_fmt + feature_data_fmt + "%i"  # for label

    final_header = ','.join(
        str(i) for i in (hot_encoded_header + binary_features
                         + list(features_data.columns)))
    final_header += "," + constants.LABEL_COLUMN_NAME  # for label

    np.savetxt(hot_encoded_file_path, final_csv,
               fmt=total_fmt,
               header=final_header,
               delimiter=constants.CSV_DELIMITER,
               comments='')

    # Record the row format and the label classes so consumers can decode.
    # "with" closes the log file even if a write fails (original used a
    # bare open/close pair that leaked on error).
    with open(OUTPUT_LOG_FILE, "w") as f:
        f.write("{}\n".format(total_fmt))
        f.write(str((label_encoder.classes_).tolist()))
    print(f"Hot Encoding finished, results in {hot_encoded_file_path}")