def main():
    """Normalize every CSV in the configured input folder and combine results.

    For each input CSV: selected features are lagged through fixed-length
    sliding-window queues, each labeled row (first non-empty label column
    wins) is written to a per-file normalized CSV, and finally all normalized
    files are concatenated into one combined CSV via pandas.

    Configuration (paths, feature/label column maps, lag settings) comes from
    the JSON file named on the command line.
    """
    json_file_path = process_command_line_args()
    json_manager = JsonManager(json_file_path)
    csv_folder = json_manager.get_csv_path()
    normalized_folder = json_manager.get_normalized_path()
    feature_columns = json_manager.get_feature_columns()
    label_columns = json_manager.get_label_columns()
    lag_features = json_manager.get_lag_features()
    lag_window_length = json_manager.get_sliding_window_length()

    destination_path = constants.add_folder_to_directory(
        constants.NORMALIZED_CSV_FOLDER_NAME, normalized_folder)

    for file in os.listdir(csv_folder):
        # Guard clause: skip non-CSV entries instead of nesting the whole body.
        if not is_file_CSV(file):
            continue
        complete_file_path = os.fsdecode(os.path.join(csv_folder, file))
        normalized_filename = make_modified_filename(file, CSV_NAME_EXTENSION)
        normalized_file_path = os.fsdecode(
            os.path.join(destination_path, normalized_filename))

        # Context managers guarantee both handles are closed even if a row
        # raises mid-processing; the original open()/close() pair leaked
        # file handles on any exception.
        with open(complete_file_path) as current_csv_obj, \
                open(normalized_file_path, mode='w') as normalized_csv_obj:
            csv_reader = csv.reader(
                current_csv_obj, delimiter=constants.CSV_DELIMITER)
            csv_writer = csv.writer(
                normalized_csv_obj,
                delimiter=constants.CSV_DELIMITER,
                quotechar=constants.CSV_QUOTECHAR,
                quoting=csv.QUOTE_MINIMAL)

            # One sliding-window queue per lagged feature, pre-filled with
            # empty strings so early rows emit blanks rather than raising.
            all_lag_queues = [[""] * lag_window_length for _ in lag_features]

            header_row = list(feature_columns)
            header_row.append(constants.LABEL_COLUMN_NAME)
            csv_writer.writerow(header_row)

            label_indices = list(label_columns.values())
            header_row_being_read = True
            for timeseries_row in csv_reader:
                if header_row_being_read:
                    # Skip the input file's own header line.
                    header_row_being_read = False
                    continue
                # First non-empty label value wins; None when unlabeled.
                label_values = [
                    timeseries_row[index] for index in label_indices]
                label_value = next(
                    (value for value in label_values if value), None)
                if label_value:
                    new_normalize_row = []
                    for column_name, column_index in feature_columns.items():
                        if column_name in lag_features:
                            # Push the current value; emit the lagged one.
                            index = lag_features.index(column_name)
                            lagged_feature = update_lag_feature_queue(
                                all_lag_queues[index],
                                timeseries_row[column_index])
                            new_normalize_row.append(lagged_feature)
                        else:
                            # column_index is already the feature's index;
                            # the original re-looked it up in the dict.
                            new_normalize_row.append(
                                timeseries_row[column_index])
                    new_normalize_row.append(label_value)
                    csv_writer.writerow(new_normalize_row)
                else:
                    # Unlabeled row: still advance every lag queue so the
                    # sliding windows stay aligned with the timeseries.
                    for column_index, column_name in enumerate(lag_features):
                        value = timeseries_row[feature_columns[column_name]]
                        update_lag_feature_queue(
                            all_lag_queues[column_index], value)

    # Rebuild the combined CSV from scratch; delete any stale copy first so
    # it is not concatenated into itself.
    combined_csv_file_path = os.path.join(
        destination_path, constants.COMBINED_CSV_FILENAME)
    if os.path.exists(combined_csv_file_path):
        os.remove(combined_csv_file_path)
    combined_csv = pd.concat(
        [pd.read_csv(os.fsdecode(os.path.join(destination_path, f)))
         for f in os.listdir(destination_path)])
    combined_csv.to_csv(
        os.fsdecode(combined_csv_file_path),
        index=False, encoding='utf-8-sig')
def run_normalize(json_file_path):
    """Normalize every CSV in the configured folder, threading last-action state.

    Like the command-line pipeline, but: the normalized output folder is
    recreated from scratch; feature/label column indices are derived from each
    file's header row; and, when enabled in config, each labeled row also
    carries the previously emitted label as a "last action taken" feature.

    Args:
        json_file_path: Path to the JSON configuration file.
    """
    # Module-level flag read elsewhere in this module; set from config here.
    global add_last_action_taken
    print(f"Normalizing started using {json_file_path}")
    json_manager = JsonManager(json_file_path)
    csv_folder = json_manager.get_csv_path()
    normalized_folder = json_manager.get_normalized_path()
    feature_list = json_manager.get_feature_columns()
    label_columns = json_manager.get_label_columns()
    lag_features = json_manager.get_lag_features()
    lag_window_length = json_manager.get_sliding_window_length()
    add_last_action_taken = json_manager.get_add_last_action_taken()

    # Start from a clean output folder so stale normalized files from a
    # previous run cannot leak into the combined CSV.
    constants.remove_folder_if_exists(
        constants.NORMALIZED_CSV_FOLDER_NAME, normalized_folder)
    destination_path = constants.add_folder_to_directory(
        constants.NORMALIZED_CSV_FOLDER_NAME, normalized_folder)

    for file in os.listdir(csv_folder):
        # Guard clause: skip non-CSV entries instead of nesting the body.
        if not is_file_CSV(file):
            continue
        complete_file_path = os.fsdecode(os.path.join(csv_folder, file))
        # Reset per file: the first labeled row of each file has no
        # preceding action.
        last_action_taken = None
        print(f"Reading in csv: {complete_file_path}")
        normalized_filename = make_modified_filename(file, CSV_NAME_EXTENSION)
        normalized_file_path = os.fsdecode(
            os.path.join(destination_path, normalized_filename))

        # Context managers guarantee both handles are closed even if a row
        # raises mid-processing; the original open()/close() pair leaked
        # file handles on any exception.
        with open(complete_file_path) as current_csv_obj, \
                open(normalized_file_path, mode='w') as normalized_csv_obj:
            csv_reader = csv.reader(
                current_csv_obj, delimiter=constants.CSV_DELIMITER)
            csv_writer = csv.writer(
                normalized_csv_obj,
                delimiter=constants.CSV_DELIMITER,
                quotechar=constants.CSV_QUOTECHAR,
                quoting=csv.QUOTE_MINIMAL)

            # One sliding-window queue per lagged feature, pre-filled with
            # empty strings so early rows emit blanks rather than raising.
            all_lag_queues = [[""] * lag_window_length for _ in lag_features]

            header_row = list(feature_list)
            if add_last_action_taken:
                header_row.append(constants.LAST_ACTION_TAKEN_COLUMN_NAME)
            header_row.append(constants.LABEL_COLUMN_NAME)
            csv_writer.writerow(header_row)

            header_row_being_read = True
            for timeseries_row in csv_reader:
                if header_row_being_read:
                    # Derive column indices from this file's header row
                    # rather than assuming a fixed layout across files.
                    feature_columns = generate_feature_col_dictionary(
                        timeseries_row, feature_list, False)
                    label_indices = list(
                        generate_feature_col_dictionary(
                            timeseries_row, label_columns, True).values())
                    header_row_being_read = False
                    continue
                # First non-empty label value wins; None when unlabeled.
                label_values = [
                    timeseries_row[index] for index in label_indices]
                label_value = next(
                    (value for value in label_values if value), None)
                if label_value:
                    new_normalize_row = []
                    for column_name, column_index in feature_columns.items():
                        if column_name in lag_features:
                            # Push the current value; emit the lagged one.
                            index = lag_features.index(column_name)
                            lagged_feature = update_lag_feature_queue(
                                all_lag_queues[index],
                                timeseries_row[column_index])
                            new_normalize_row.append(lagged_feature)
                        elif column_name == \
                                constants.LAST_ACTION_TAKEN_COLUMN_NAME:
                            # Synthetic column: the label emitted for the
                            # previous labeled row of this file (or None).
                            new_normalize_row.append(last_action_taken)
                        else:
                            # column_index is already the feature's index;
                            # the original re-looked it up in the dict.
                            new_normalize_row.append(
                                timeseries_row[column_index])
                    new_normalize_row.append(label_value)
                    last_action_taken = label_value
                    csv_writer.writerow(new_normalize_row)
                else:
                    # Unlabeled row: still advance every lag queue so the
                    # sliding windows stay aligned with the timeseries.
                    for column_index, column_name in enumerate(lag_features):
                        value = timeseries_row[feature_columns[column_name]]
                        update_lag_feature_queue(
                            all_lag_queues[column_index], value)

    # Rebuild the combined CSV from scratch; delete any stale copy first so
    # it is not concatenated into itself.
    combined_csv_file_path = os.path.join(
        destination_path, constants.COMBINED_CSV_FILENAME)
    if os.path.exists(combined_csv_file_path):
        os.remove(combined_csv_file_path)
    combined_csv = pd.concat(
        [pd.read_csv(os.fsdecode(os.path.join(destination_path, f)))
         for f in os.listdir(destination_path)])
    combined_csv.to_csv(
        os.fsdecode(combined_csv_file_path),
        index=False, encoding='utf-8-sig')
    # NOTE(review): normalized_file_path is unbound if csv_folder had no CSV
    # files; preserved from the original — confirm callers always provide one.
    print(f"Normalizing finished, results in {normalized_file_path}")