def get_user_playlists(ploads_json, playlist_json):
    """Request the current user's playlists and save the response to disk.

    Args:
        ploads_json: Path of the payloads JSON file. Its
            ``'using_access_token'`` entry is sent as the query parameters.
        playlist_json: Path of the JSON file that receives the decoded
            response body.
    """
    all_payloads = JsonManager.load_json_from_file(ploads_json)
    query_params = all_payloads['using_access_token']
    endpoint = 'https://api.spotify.com/v1/me/playlists'
    response = requests.get(endpoint, params=query_params)
    # The status code is printed right-aligned in a 12 character column.
    print(f"get_users_playlists {response.status_code:12}")
    body = response.json()
    JsonManager.dump_into_json_file(playlist_json, body)
def get_token_with_code(ploads_json, tokens_json):
    """Exchange the stored authorization code for tokens.

    The ``'using_code'`` entry of the payloads file is posted as form data
    to the token endpoint. The decoded response is written to
    ``tokens_json`` and then moved into the payloads file.

    Args:
        ploads_json: Path of the payloads JSON file.
        tokens_json: Path of the JSON file that receives the token response.
    """
    all_payloads = JsonManager.load_json_from_file(ploads_json)
    form_data = all_payloads['using_code']
    token_url = 'https://accounts.spotify.com/api/token'
    response = requests.post(token_url, data=form_data)
    print(f"get_token_with_code {response.status_code:12}")
    token_body = response.json()
    JsonManager.dump_into_json_file(tokens_json, token_body)
    JsonManager.move_token_to_ploads(ploads_json, tokens_json)
def get_user_info(ploads_json, user_info_json):
    """Request the current user's profile and record the user id.

    Args:
        ploads_json: Path of the payloads JSON file. Its
            ``'using_access_token'`` entry is sent as the query parameters.
        user_info_json: Path of the JSON file that receives the decoded
            profile response.
    """
    all_payloads = JsonManager.load_json_from_file(ploads_json)
    query_params = all_payloads['using_access_token']
    profile_url = 'https://api.spotify.com/v1/me'
    response = requests.get(profile_url, params=query_params)
    print(f"get_user_info {response.status_code:18}")
    profile = response.json()
    JsonManager.dump_into_json_file(user_info_json, profile)
    # Copy the user id from the saved profile into the payloads file.
    JsonManager.move_user_id_to_ploads(ploads_json, user_info_json)
def __init__(self, json_file_path, log_file_path):
    """Create the JSON manager and open the log file for reading.

    Args:
        json_file_path (str): Full filepath to config.json
        log_file_path (str): Full filepath to output.log file
    """
    start_message = (
        f"BehaviorTree building started using {json_file_path} and {log_file_path}"
    )
    print(start_message)
    self.json_manager = JsonManager(json_file_path)
    # The handle is stored on the instance and is not closed here.
    self.log_file = open(log_file_path, mode="r")
def get_user_playlist(ploads_json, playlist_json, playlist_name):
    """Return the playlist entry named ``playlist_name`` and save it to disk.

    Args:
        ploads_json: Path of the payloads JSON file.
        playlist_json: Path of the JSON file holding the playlists response.
        playlist_name: Exact playlist name to look up.

    Returns:
        The matching entry of ``playlists['items']``, or ``None`` when no
        playlist has that name. A found entry is also written to
        ``jsons/playlist.json``.
    """
    found = SpotifyRequests.check_if_a_playlist_exists(
        ploads_json, playlist_json, playlist_name)
    if not found:
        print("Error: Playlist doesn't exist")
        return None

    stored = JsonManager.load_json_from_file(playlist_json)
    # First entry whose name matches, in file order.
    match = next(
        (entry for entry in stored['items'] if entry['name'] == playlist_name),
        None)
    if match is None:
        return None
    JsonManager.dump_into_json_file('jsons/playlist.json', match)
    return match
def do_GET(self):
    """Handle the authorization redirect and stop the local server.

    When the request path contains ``/callback?c`` the authorization code
    is stored in ``self.code``, written to ``jsons/code.json`` and moved
    into ``jsons/ploads.json``. When it contains ``/callback?e`` an error
    is printed. In both cases the server is shut down; any other path is
    ignored.
    """
    from urllib.parse import parse_qs, urlsplit

    if '/callback?c' in self.path:
        # Read the code from the parsed query string so that additional
        # parameters on the redirect (for example ``&state=...``) are not
        # appended to it. parse_qs also percent-decodes the value.
        code_values = parse_qs(urlsplit(self.path).query).get('code')
        # Keep the previous fixed-offset slice as a fallback when the
        # query string has no ``code`` key.
        self.code = code_values[0] if code_values else self.path[15:]
        code_json = {"code": self.code}
        JsonManager.dump_into_json_file('jsons/code.json', code_json)
        JsonManager.move_code_to_ploads('jsons/ploads.json',
                                        'jsons/code.json')
        HttpServer.shutdown_server()
    elif '/callback?e' in self.path:
        print("Error: Access Denied")
        HttpServer.shutdown_server()
def print_user_playlists(ploads_json, playlist_json):
    """Refresh the stored playlists and print them as a numbered list.

    Args:
        ploads_json: Path of the payloads JSON file.
        playlist_json: Path of the JSON file holding the playlists response.
    """
    SpotifyRequests.get_user_playlists(ploads_json, playlist_json)
    stored = JsonManager.load_json_from_file(playlist_json)
    # Numbering starts at 1.
    for position, entry in enumerate(stored['items'], start=1):
        print(position, entry['name'])
def get_token(ploads_json, tokens_json):
    """Obtain tokens, using the stored refresh token when there is one.

    Args:
        ploads_json: Path of the payloads JSON file.
        tokens_json: Path of the JSON file that receives the token response.
    """
    all_payloads = JsonManager.load_json_from_file(ploads_json)
    refresh_token = all_payloads['using_refresh_token']['refresh_token']
    if refresh_token == "":
        # No refresh token stored yet: go through the authorization code flow.
        SpotifyRequests.get_token_without_code(ploads_json, tokens_json)
    else:
        SpotifyRequests.get_token_with_refresh_token(ploads_json, tokens_json)
def get_code(ploads_json):
    """Open the authorization page in a browser and start the local server.

    Args:
        ploads_json: Path of the payloads JSON file. Its ``'GET_code'``
            entry is sent as the query parameters.
    """
    all_payloads = JsonManager.load_json_from_file(ploads_json)
    query_params = all_payloads['GET_code']
    authorize_url = 'https://accounts.spotify.com/authorize'
    response = requests.get(authorize_url, params=query_params)
    print(f"get_code {response.status_code:23}")
    # Open the URL the request ended up at, then start the server.
    webbrowser.open(response.url)
    HttpServer.start_server()
def create_a_playlist(ploads_json, user_info_json, playlist_json,
                      playlist_name):
    """Create a playlist named ``playlist_name`` unless one already exists.

    Args:
        ploads_json: Path of the payloads JSON file.
        user_info_json: Path of the JSON file that receives the user profile.
        playlist_json: Path of the JSON file holding the playlists response.
        playlist_name: Name of the playlist to create.

    Returns:
        Always ``None``. A message is printed and nothing is posted when a
        playlist with that name already exists.
    """
    already_there = SpotifyRequests.check_if_a_playlist_exists(
        ploads_json, playlist_json, playlist_name)
    if already_there:
        print("Playlist already exists")
        return None

    # Refresh the user info first; the user id is read from the payloads
    # file right afterwards.
    SpotifyRequests.get_user_info(ploads_json, user_info_json)
    all_payloads = JsonManager.load_json_from_file(ploads_json)
    user_id = all_payloads['using_access_token']['user_id']

    request_body = all_payloads['create_playlist_body']
    request_body['name'] = playlist_name

    url = f'https://api.spotify.com/v1/users/{user_id}/playlists'
    response = requests.post(
        url,
        data=JsonManager.dump_into_json_string(request_body),
        headers=all_payloads['using_authorization_token'])
    print(f"create_a_playlist {response.status_code:14}")
def check_if_a_playlist_exists(ploads_json, playlist_json, playlist_name):
    """Report whether the user has a playlist called ``playlist_name``.

    The stored playlists are refreshed before the lookup.

    Args:
        ploads_json: Path of the payloads JSON file.
        playlist_json: Path of the JSON file holding the playlists response.
        playlist_name: Exact playlist name to look for.

    Returns:
        bool: ``True`` if any entry of ``playlists['items']`` has that name.
    """
    SpotifyRequests.get_user_playlists(ploads_json, playlist_json)
    stored = JsonManager.load_json_from_file(playlist_json)
    return any(playlist_name == entry['name'] for entry in stored['items'])
def get_token_with_refresh_token(ploads_json, tokens_json):
    """Request new tokens using the stored refresh token.

    The ``'using_refresh_token'`` entry of the payloads file is posted as
    form data. If the response carries no ``refresh_token`` the one that
    was sent is added before saving, so the saved file always has one.

    Args:
        ploads_json: Path of the payloads JSON file.
        tokens_json: Path of the JSON file that receives the token response.
    """
    all_payloads = JsonManager.load_json_from_file(ploads_json)
    form_data = all_payloads['using_refresh_token']
    token_url = 'https://accounts.spotify.com/api/token'
    response = requests.post(token_url, data=form_data)
    print(f"get_token_with_refresh_token {response.status_code}")

    token_body = response.json()
    if 'refresh_token' not in token_body:
        token_body['refresh_token'] = form_data['refresh_token']
    JsonManager.dump_into_json_file(tokens_json, token_body)
    JsonManager.move_token_to_ploads(ploads_json, tokens_json)
def unfollow_user_playlist(ploads_json, playlist_json, playlist_name):
    """Unfollow the playlist called ``playlist_name``.

    Args:
        ploads_json: Path of the payloads JSON file.
        playlist_json: Path of the JSON file holding the playlists response.
        playlist_name: Exact name of the playlist to unfollow.

    Returns:
        Always ``None``. An error is printed and no request is sent when
        the playlist does not exist.
    """
    found = SpotifyRequests.check_if_a_playlist_exists(
        ploads_json, playlist_json, playlist_name)
    if not found:
        print("Error: Playlist doesn't exist")
        return None

    target = SpotifyRequests.get_user_playlist(ploads_json, playlist_json,
                                               playlist_name)
    all_payloads = JsonManager.load_json_from_file(ploads_json)
    auth_headers = all_payloads['using_authorization_token']
    playlist_id = target['id']
    url = f'https://api.spotify.com/v1/playlists/{playlist_id}/followers'
    response = requests.delete(url, headers=auth_headers)
    print(f"unfollow_user_playlist {response.status_code:9}")
def run_upsample(json_file_path, fmt_file_path):
    """Upsample the hot-encoded CSV with SVMSMOTE and write the result.

    Nothing happens when the configuration's upsample status is falsy.

    Args:
        json_file_path: Path of the configuration file given to
            ``JsonManager``.
        fmt_file_path: Path of a text file whose first line is the
            ``numpy.savetxt`` row format for the output CSV.
    """
    json_manager = JsonManager(json_file_path)
    if not json_manager.get_upsample_status():
        return

    print(f"Upsampling started using {json_file_path} and {fmt_file_path}")
    upsampled_path = json_manager.get_upsampled_path()
    constants.remove_folder_if_exists(
        constants.UPSAMPLED_CSV_FOLDER_NAME, upsampled_path)

    hot_encoded_folder = os.fsdecode(os.path.join(
        json_manager.get_hot_encoded_path(),
        constants.HOT_ENCODED_CSV_FOLDER_NAME))
    hot_encoded_file = os.fsdecode(os.path.join(
        hot_encoded_folder,
        constants.HOT_ENCODED_CSV_FILENAME))

    # Parse the CSV once and split it by position instead of re-reading
    # the same file for the features and for the label.
    hot_encoded_data = pd.read_csv(hot_encoded_file)
    features_data = hot_encoded_data.iloc[:, :-1]  # everything except label
    labels_data = hot_encoded_data.iloc[:, [-1]]  # label

    sm = SVMSMOTE(random_state=json_manager.get_random_state())
    X_res, y_res = sm.fit_resample(features_data, labels_data)
    csv_ready = np.append(X_res, y_res, axis=constants.COLUMN_AXIS)

    upsampled_folder = constants.add_folder_to_directory(
        constants.UPSAMPLED_CSV_FOLDER_NAME, upsampled_path)
    upsampled_file_path = os.fsdecode(os.path.join(
        upsampled_folder, constants.UPSAMPLED_CSV_FILENAME))
    if os.path.exists(upsampled_file_path):
        os.remove(upsampled_file_path)

    with open(fmt_file_path, "r") as fmt_file:
        # readline() keeps the line terminator. savetxt uses a multi-column
        # format string verbatim and appends its own newline, so leaving
        # the terminator in would write a blank line after every row.
        fmt = fmt_file.readline().rstrip("\r\n")

    header = ','.join(str(i) for i in hot_encoded_data.columns)
    np.savetxt(upsampled_file_path, csv_ready,
               fmt=fmt,
               delimiter=constants.CSV_DELIMITER,
               header=header,
               comments='')
    print(f"Upsampling finished, results in {upsampled_file_path}")
def run_hotencode(json_file_path):
    """Hot-encode the combined normalized CSV and write the encoded CSV.

    Output columns are ordered as: hot-encoded categorical features, binary
    features, remaining features, encoded label. The ``numpy.savetxt``
    format string and the label encoder classes are written to
    ``OUTPUT_LOG_FILE``.

    Sets the module-level ``add_last_action_taken`` from the configuration.

    Args:
        json_file_path: Path of the configuration file given to
            ``JsonManager``.
    """
    global add_last_action_taken
    print(f"Hot encoding started using {json_file_path}")
    json_manager = JsonManager(json_file_path)
    feature_list = json_manager.get_feature_columns()
    categorical_features = json_manager.get_categorical_features()
    add_last_action_taken = json_manager.get_add_last_action_taken()
    if add_last_action_taken:
        categorical_features.append(constants.LAST_ACTION_TAKEN_COLUMN_NAME)
    binary_features = json_manager.get_binary_features()
    hot_encoded_path = json_manager.get_hot_encoded_path()

    constants.remove_folder_if_exists(
        constants.HOT_ENCODED_CSV_FOLDER_NAME, hot_encoded_path)
    hot_encoded_folder = constants.add_folder_to_directory(
        constants.HOT_ENCODED_CSV_FOLDER_NAME, hot_encoded_path)
    hot_encoded_file_path = os.fsdecode(os.path.join(
        hot_encoded_folder, constants.HOT_ENCODED_CSV_FILENAME))

    normalized_folder = os.fsdecode(os.path.join(
        json_manager.get_normalized_path(),
        constants.NORMALIZED_CSV_FOLDER_NAME))
    combined_csv_file = os.fsdecode(os.path.join(
        normalized_folder,
        constants.COMBINED_CSV_FILENAME))

    feature_columns = generate_feature_col_dictionary(
        get_header_row(combined_csv_file), feature_list, False)
    features_data = pd.read_csv(combined_csv_file, usecols=feature_columns)

    # Missing binary values count as False.
    features_data[binary_features] = features_data[binary_features].fillna(0)
    features_data[binary_features] = features_data[binary_features].astype(
        bool)
    binary_columns_array = features_data[binary_features].to_numpy()

    # hot encoded features
    hot_encoded_array, hot_encoded_header = hot_encode_features(
        features_data, categorical_features)

    # remove hot encoded features from features_data dataframe
    features_data = features_data.drop(columns=categorical_features +
                                       binary_features)
    features_data_array = features_data.to_numpy()

    # encode labels
    labels_data = pd.read_csv(combined_csv_file,
                              usecols=[constants.LABEL_COLUMN_NAME])
    label_encoder, labels_column_array = encode_label_column(labels_data)

    # add hot_encoded columns, then binary columns, then numerical columns,
    # then encoded labels to one array
    final_csv = np.concatenate(
        (hot_encoded_array, binary_columns_array,
         features_data_array, labels_column_array),
        axis=constants.COLUMN_AXIS)

    # format hot encoded and binary columns as strings
    hot_encode_fmt = "%s," * len(hot_encoded_header + binary_features)
    # format numerical columns to doubles
    feature_data_fmt = "%1.3f," * len(features_data.columns)
    total_fmt = hot_encode_fmt + feature_data_fmt + "%i"  # for label

    final_header = ','.join(
        str(i) for i in (hot_encoded_header + binary_features +
                         list(features_data.columns)))
    final_header += "," + constants.LABEL_COLUMN_NAME  # for label

    np.savetxt(hot_encoded_file_path, final_csv,
               fmt=total_fmt,
               header=final_header,
               delimiter=constants.CSV_DELIMITER,
               comments='')

    # The context manager closes the log even if a write raises.
    with open(OUTPUT_LOG_FILE, "w") as f:
        f.write("{}\n".format(total_fmt))
        f.write(str((label_encoder.classes_).tolist()))
    print(f"Hot Encoding finished, results in {hot_encoded_file_path}")
def run_normalize(json_file_path):
    """Normalize every CSV in the configured folder and combine the results.

    For each input CSV a normalized CSV is written holding only the rows
    that have a non-empty label value. Rows without a label only update the
    lag feature queues. All files in the destination folder are then
    concatenated into the combined CSV.

    Sets the module-level ``add_last_action_taken`` from the configuration.

    Args:
        json_file_path: Path of the configuration file given to
            ``JsonManager``.
    """
    global add_last_action_taken
    print(f"Normalizing started using {json_file_path}")
    json_manager = JsonManager(json_file_path)
    csv_folder = json_manager.get_csv_path()
    normalized_folder = json_manager.get_normalized_path()
    feature_list = json_manager.get_feature_columns()
    label_columns = json_manager.get_label_columns()
    lag_features = json_manager.get_lag_features()
    lag_window_length = json_manager.get_sliding_window_length()
    add_last_action_taken = json_manager.get_add_last_action_taken()

    constants.remove_folder_if_exists(
        constants.NORMALIZED_CSV_FOLDER_NAME, normalized_folder)
    destination_path = constants.add_folder_to_directory(
        constants.NORMALIZED_CSV_FOLDER_NAME, normalized_folder)

    for file in os.listdir(csv_folder):
        complete_file_path = os.fsdecode(os.path.join(csv_folder, file))
        last_action_taken = None

        if is_file_CSV(file):
            print(f"Reading in csv: {complete_file_path}")
            normalized_filename = make_modified_filename(
                file, CSV_NAME_EXTENSION)
            normalized_file_path = os.fsdecode(os.path.join(
                destination_path, normalized_filename))

            # newline='' is what the csv module asks for on files it reads
            # and writes. The context manager closes both files even when a
            # row fails to process.
            with open(complete_file_path, newline='') as current_csv_obj, \
                    open(normalized_file_path, mode='w',
                         newline='') as normalized_csv_obj:
                csv_reader = csv.reader(current_csv_obj,
                                        delimiter=constants.CSV_DELIMITER)
                csv_writer = csv.writer(normalized_csv_obj,
                                        delimiter=constants.CSV_DELIMITER,
                                        quotechar=constants.CSV_QUOTECHAR,
                                        quoting=csv.QUOTE_MINIMAL)

                # One queue of lag_window_length empty strings per lag feature.
                all_lag_queues = [[""] * lag_window_length
                                  for lag_feature in lag_features]

                header_row = list(feature_list)
                if (add_last_action_taken):
                    header_row.append(constants.LAST_ACTION_TAKEN_COLUMN_NAME)
                header_row.append(constants.LABEL_COLUMN_NAME)
                csv_writer.writerow(header_row)

                header_row_being_read = True
                for timeseries_row in csv_reader:
                    if header_row_being_read:
                        # Column lookups are built from this file's header.
                        feature_columns = generate_feature_col_dictionary(
                            timeseries_row, feature_list, False)
                        label_indices = list(
                            generate_feature_col_dictionary(
                                timeseries_row, label_columns, True).values())
                        header_row_being_read = False
                        continue

                    label_values = [
                        timeseries_row[index] for index in label_indices
                    ]
                    # First non-empty label value in the row, if any.
                    label_value = next(
                        (label_value for label_value in label_values
                         if label_value), None)

                    if label_value:
                        new_normalize_row = []
                        for column_name, column_index in feature_columns.items():
                            if column_name in lag_features:
                                index = lag_features.index(column_name)
                                lagged_feature = update_lag_feature_queue(
                                    all_lag_queues[index],
                                    timeseries_row[column_index])
                                new_normalize_row.append(lagged_feature)
                            elif column_name == constants.LAST_ACTION_TAKEN_COLUMN_NAME:
                                new_normalize_row.append(last_action_taken)
                            else:
                                new_normalize_row.append(
                                    timeseries_row[feature_columns[column_name]])
                        new_normalize_row.append(label_value)
                        last_action_taken = label_value
                        csv_writer.writerow(new_normalize_row)
                    else:
                        # Unlabeled row: only advance the lag queues.
                        for column_index, column_name in enumerate(lag_features):
                            value = timeseries_row[feature_columns[column_name]]
                            update_lag_feature_queue(
                                all_lag_queues[column_index], value)

    combined_csv_file_path = os.path.join(destination_path,
                                          constants.COMBINED_CSV_FILENAME)
    if os.path.exists(combined_csv_file_path):
        os.remove(combined_csv_file_path)

    combined_csv = pd.concat(
        [pd.read_csv(os.fsdecode(os.path.join(destination_path, f)))
         for f in os.listdir(destination_path)])
    combined_csv.to_csv(os.fsdecode(combined_csv_file_path),
                        index=False, encoding='utf-8-sig')
    print(f"Normalizing finished, results in {normalized_file_path}")
def main():
    """Hot-encode the combined normalized CSV and write the encoded CSV.

    Output columns are ordered as: hot-encoded categorical features, binary
    features, remaining features, encoded label. The ``numpy.savetxt``
    format string and the label encoder classes are written to
    ``OUTPUT_LOG_FILE``.
    """
    json_file_path = process_command_line_args()
    json_manager = JsonManager(json_file_path)
    feature_columns = json_manager.get_feature_columns()
    categorical_features = json_manager.get_categorical_features()
    binary_features = json_manager.get_binary_features()
    hot_encoded_path = json_manager.get_hot_encoded_path()

    normalized_folder = os.fsdecode(os.path.join(
        json_manager.get_normalized_path(),
        constants.NORMALIZED_CSV_FOLDER_NAME))
    combined_csv_file = os.fsdecode(os.path.join(
        normalized_folder,
        constants.COMBINED_CSV_FILENAME))

    features_data = pd.read_csv(combined_csv_file, usecols=feature_columns)

    for binary_variable in binary_features:
        # Missing values become -1; multiplying by 1 turns booleans into ints.
        features_data[binary_variable] = features_data[binary_variable].fillna(
            value=-1)
        features_data[binary_variable] = features_data[binary_variable] * 1
    true_false_columns = features_data[binary_features]
    true_false_columns_array = true_false_columns.to_numpy()

    # hot encoded features
    hot_encoded_array, hot_encoded_header = hot_encode_features(
        features_data, categorical_features)

    # remove hot encoded features from features_data dataframe
    features_data = features_data.drop(columns=categorical_features +
                                       binary_features)
    features_data_array = features_data.to_numpy()

    # encode labels
    labels_data = pd.read_csv(combined_csv_file,
                              usecols=[constants.LABEL_COLUMN_NAME])
    label_encoder, labels_column_array = encode_label_column(labels_data)

    # add hot_encoded columns, then binary columns, then numerical columns,
    # then encoded labels to one array. The binary columns are the array
    # built above; the previous name `binary_columns_array` was never bound
    # in this function.
    final_csv = np.concatenate(
        (hot_encoded_array, true_false_columns_array,
         features_data_array, labels_column_array),
        axis=constants.COLUMN_AXIS)

    hot_encoded_folder = constants.add_folder_to_directory(
        constants.HOT_ENCODED_CSV_FOLDER_NAME, hot_encoded_path)
    hot_encoded_file_path = os.fsdecode(os.path.join(
        hot_encoded_folder, constants.HOT_ENCODED_CSV_FILENAME))

    if os.path.exists(hot_encoded_file_path):
        os.remove(hot_encoded_file_path)

    # format hot encoded and binary columns to ints
    hot_encode_fmt = "%i," * len(hot_encoded_header + binary_features)
    # format numerical columns to doubles
    feature_data_fmt = "%1.3f," * len(features_data.columns)
    total_fmt = hot_encode_fmt + feature_data_fmt + "%i"  # for label

    final_header = ','.join(
        str(i) for i in (hot_encoded_header + binary_features +
                         list(features_data.columns)))
    final_header += "," + constants.LABEL_COLUMN_NAME  # for label

    np.savetxt(hot_encoded_file_path, final_csv,
               fmt=total_fmt,
               header=final_header,
               delimiter=constants.CSV_DELIMITER,
               comments='')

    # The context manager closes the log even if a write raises.
    with open(OUTPUT_LOG_FILE, "w") as f:
        f.write("{}\n".format(total_fmt))
        f.write(str((label_encoder.classes_).tolist()))
class SpotifyRequests:
    """Static helpers that call the Spotify Web API.

    Request payloads are read from, and results written to, JSON files
    through ``JsonManager``. Defining the class moves the ids from
    ``jsons/ids.json`` into ``jsons/ploads.json``.
    """

    # Runs once, when the class body is executed.
    JsonManager.move_ids_to_ploads('jsons/ploads.json', 'jsons/ids.json')

    @staticmethod
    def get_code(ploads_json):
        """Open the authorization page in a browser and start the server."""
        all_payloads = JsonManager.load_json_from_file(ploads_json)
        query_params = all_payloads['GET_code']
        authorize_url = 'https://accounts.spotify.com/authorize'
        response = requests.get(authorize_url, params=query_params)
        print(f"get_code {response.status_code:23}")
        webbrowser.open(response.url)
        HttpServer.start_server()

    @staticmethod
    def get_token_with_code(ploads_json, tokens_json):
        """Exchange the stored authorization code for tokens and save them."""
        all_payloads = JsonManager.load_json_from_file(ploads_json)
        form_data = all_payloads['using_code']
        token_url = 'https://accounts.spotify.com/api/token'
        response = requests.post(token_url, data=form_data)
        print(f"get_token_with_code {response.status_code:12}")
        token_body = response.json()
        JsonManager.dump_into_json_file(tokens_json, token_body)
        JsonManager.move_token_to_ploads(ploads_json, tokens_json)

    @staticmethod
    def get_token_without_code(ploads_json, tokens_json):
        """Run the full flow: obtain a code, then exchange it for tokens."""
        SpotifyRequests.get_code(ploads_json)
        SpotifyRequests.get_token_with_code(ploads_json, tokens_json)

    @staticmethod
    def get_token_with_refresh_token(ploads_json, tokens_json):
        """Request new tokens with the stored refresh token and save them.

        If the response carries no ``refresh_token`` the one that was sent
        is added before saving.
        """
        all_payloads = JsonManager.load_json_from_file(ploads_json)
        form_data = all_payloads['using_refresh_token']
        token_url = 'https://accounts.spotify.com/api/token'
        response = requests.post(token_url, data=form_data)
        print(f"get_token_with_refresh_token {response.status_code}")

        token_body = response.json()
        if 'refresh_token' not in token_body:
            token_body['refresh_token'] = form_data['refresh_token']
        JsonManager.dump_into_json_file(tokens_json, token_body)
        JsonManager.move_token_to_ploads(ploads_json, tokens_json)

    @staticmethod
    def get_token(ploads_json, tokens_json):
        """Obtain tokens, using the stored refresh token when there is one."""
        all_payloads = JsonManager.load_json_from_file(ploads_json)
        refresh_token = all_payloads['using_refresh_token']['refresh_token']
        if refresh_token == "":
            SpotifyRequests.get_token_without_code(ploads_json, tokens_json)
        else:
            SpotifyRequests.get_token_with_refresh_token(
                ploads_json, tokens_json)

    @staticmethod
    def get_user_info(ploads_json, user_info_json):
        """Request the user's profile, save it and record the user id."""
        all_payloads = JsonManager.load_json_from_file(ploads_json)
        query_params = all_payloads['using_access_token']
        profile_url = 'https://api.spotify.com/v1/me'
        response = requests.get(profile_url, params=query_params)
        print(f"get_user_info {response.status_code:18}")
        profile = response.json()
        JsonManager.dump_into_json_file(user_info_json, profile)
        JsonManager.move_user_id_to_ploads(ploads_json, user_info_json)

    @staticmethod
    def get_user_playlists(ploads_json, playlist_json):
        """Request the user's playlists and save the response."""
        all_payloads = JsonManager.load_json_from_file(ploads_json)
        query_params = all_payloads['using_access_token']
        endpoint = 'https://api.spotify.com/v1/me/playlists'
        response = requests.get(endpoint, params=query_params)
        print(f"get_users_playlists {response.status_code:12}")
        body = response.json()
        JsonManager.dump_into_json_file(playlist_json, body)

    @staticmethod
    def check_if_a_playlist_exists(ploads_json, playlist_json, playlist_name):
        """Return True if a playlist named ``playlist_name`` exists.

        The stored playlists are refreshed before the lookup.
        """
        SpotifyRequests.get_user_playlists(ploads_json, playlist_json)
        stored = JsonManager.load_json_from_file(playlist_json)
        return any(playlist_name == entry['name']
                   for entry in stored['items'])

    @staticmethod
    def create_a_playlist(ploads_json, user_info_json, playlist_json,
                          playlist_name):
        """Create a playlist unless one with that name already exists.

        Always returns ``None``.
        """
        already_there = SpotifyRequests.check_if_a_playlist_exists(
            ploads_json, playlist_json, playlist_name)
        if already_there:
            print("Playlist already exists")
            return None

        SpotifyRequests.get_user_info(ploads_json, user_info_json)
        all_payloads = JsonManager.load_json_from_file(ploads_json)
        user_id = all_payloads['using_access_token']['user_id']

        request_body = all_payloads['create_playlist_body']
        request_body['name'] = playlist_name

        url = f'https://api.spotify.com/v1/users/{user_id}/playlists'
        response = requests.post(
            url,
            data=JsonManager.dump_into_json_string(request_body),
            headers=all_payloads['using_authorization_token'])
        print(f"create_a_playlist {response.status_code:14}")

    @staticmethod
    def print_user_playlists(ploads_json, playlist_json):
        """Refresh the stored playlists and print them numbered from 1."""
        SpotifyRequests.get_user_playlists(ploads_json, playlist_json)
        stored = JsonManager.load_json_from_file(playlist_json)
        for position, entry in enumerate(stored['items'], start=1):
            print(position, entry['name'])

    @staticmethod
    def get_user_playlist(ploads_json, playlist_json, playlist_name):
        """Return the playlist entry with that name, or ``None``.

        A found entry is also written to ``jsons/playlist.json``.
        """
        found = SpotifyRequests.check_if_a_playlist_exists(
            ploads_json, playlist_json, playlist_name)
        if not found:
            print("Error: Playlist doesn't exist")
            return None

        stored = JsonManager.load_json_from_file(playlist_json)
        match = next(
            (entry for entry in stored['items']
             if entry['name'] == playlist_name),
            None)
        if match is None:
            return None
        JsonManager.dump_into_json_file('jsons/playlist.json', match)
        return match

    @staticmethod
    def unfollow_user_playlist(ploads_json, playlist_json, playlist_name):
        """Unfollow the playlist with that name; always returns ``None``."""
        found = SpotifyRequests.check_if_a_playlist_exists(
            ploads_json, playlist_json, playlist_name)
        if not found:
            print("Error: Playlist doesn't exist")
            return None

        target = SpotifyRequests.get_user_playlist(ploads_json, playlist_json,
                                                   playlist_name)
        all_payloads = JsonManager.load_json_from_file(ploads_json)
        auth_headers = all_payloads['using_authorization_token']
        playlist_id = target['id']
        url = f'https://api.spotify.com/v1/playlists/{playlist_id}/followers'
        response = requests.delete(url, headers=auth_headers)
        print(f"unfollow_user_playlist {response.status_code:9}")
def main():
    """Train decision trees, prune them and export behavior trees.

    Steps:
      1. Read the label encoding from the second line of the log file.
      2. Load the upsampled or the hot-encoded CSV, depending on the
         configuration's upsample status. The last column is the label.
      3. Score a depth-limited tree with k-fold cross validation.
      4. Fit a tree on all data, then one tree per cost-complexity alpha,
         saving each decision tree and behavior tree.
      5. Write a text report and a pruning graph to the output folder.
    """
    json_file_path, log_file_path = process_command_line_args()
    json_manager = JsonManager(json_file_path)

    with open(log_file_path, "r") as log_file:
        log_file.readline()  # first line is not used here
        # NOTE(security): eval executes whatever the log file contains.
        # Only run this on log files produced by a trusted pipeline step.
        label_encoding = eval(log_file.readline())

    if json_manager.get_upsample_status():
        upsampled_folder = os.fsdecode(os.path.join(
            json_manager.get_upsampled_path(),
            constants.UPSAMPLED_CSV_FOLDER_NAME))
        supervised_learning_data = os.fsdecode(os.path.join(
            upsampled_folder, constants.UPSAMPLED_CSV_FILENAME))
    else:
        hot_encoded_folder = os.fsdecode(os.path.join(
            json_manager.get_hot_encoded_path(),
            constants.HOT_ENCODED_CSV_FOLDER_NAME))
        supervised_learning_data = os.fsdecode(os.path.join(
            hot_encoded_folder, constants.HOT_ENCODED_CSV_FILENAME))

    # Parse the CSV once and split it by position instead of re-reading
    # the same file for the features and for the label.
    supervised_learning_dataframe = pd.read_csv(supervised_learning_data)
    features_data = supervised_learning_dataframe.iloc[:, :-1]
    labels_data = supervised_learning_dataframe.iloc[:, [-1]]

    kFold = json_manager.get_kfold()
    max_depth = json_manager.get_decision_tree_depth()
    output_folder = constants.add_folder_to_directory(
        constants.OUTPUT_FOLDER_NAME, json_manager.get_output_path())
    folder_name = "{}_kFold_{}_maxDepth".format(kFold, max_depth)
    output_full_path = constants.add_folder_to_directory(folder_name,
                                                         output_folder)

    trains_accu = []
    test_accu = []
    kf = KFold(shuffle=True, n_splits=kFold)
    for train_index, test_index in kf.split(features_data):
        X_train = features_data.iloc[train_index]
        X_test = features_data.iloc[test_index]
        y_train = labels_data.iloc[train_index]
        y_test = labels_data.iloc[test_index]

        clf = tree.DecisionTreeClassifier(
            random_state=json_manager.get_random_state(),
            max_depth=max_depth)
        clf = clf.fit(X_train, y_train)
        trains_accu.append(clf.score(X_train, y_train))
        test_accu.append(clf.score(X_test, y_test))

    report_file = "{}_kFold_{}_maxDepth.txt".format(kFold, max_depth)
    dot_pdf_header = "{}_kFold_{}_maxDepth".format(kFold, max_depth)
    report_file_path = os.path.join(output_full_path, report_file)

    # The context manager closes the report even if training or plotting
    # raises part-way through.
    with open(report_file_path, "w") as report_file_obj:
        report_file_obj.write(
            "Decision Tree with max_depth: {}, and kFold: {}\n".format(
                max_depth, kFold))
        # NOTE(review): the values written here are mean clf.score()
        # results, although the label says "error".
        report_file_obj.write(
            " Average train error with {} fold: {}\n".format(
                kFold, sum(trains_accu) / len(trains_accu)))
        report_file_obj.write(
            " Average test error with {} fold: {}\n".format(
                kFold, sum(test_accu) / len(test_accu)))
        report_file_obj.write(
            " Decision Tree (DOT format) saved to: {}\n".format(
                dot_pdf_header))
        report_file_obj.write(
            " Decision Tree (PDF format) saved to: {}.pdf\n".format(
                dot_pdf_header))
        report_file_obj.write(
            "Check {} for appropriate pruning.\n\n\n".format(
                PRUNING_GRAPH_FILENAME))

        # Tree fitted on all data; it supplies the pruning alphas.
        clf = tree.DecisionTreeClassifier(
            random_state=json_manager.get_random_state(),
            max_depth=max_depth)
        clf = clf.fit(features_data, labels_data)
        dot_pdf_full_path = os.fsdecode(
            os.path.join(output_full_path, dot_pdf_header))
        plot_decision_tree(clf, dot_pdf_full_path, features_data.columns)

        prune_path = clf.cost_complexity_pruning_path(features_data,
                                                      labels_data)
        ccp_alphas = prune_path.ccp_alphas
        pruning_folder = constants.add_folder_to_directory(
            constants.PRUNE_FOLDER_NAME, output_full_path)

        train_scores = []
        for i, ccp_alpha in enumerate(ccp_alphas):
            clf = tree.DecisionTreeClassifier(
                random_state=json_manager.get_random_state(),
                max_depth=max_depth, ccp_alpha=ccp_alpha)
            clf.fit(features_data, labels_data)
            score = clf.score(features_data, labels_data)
            train_scores.append(score)

            newPrunePath = constants.add_folder_to_directory(
                "Pruning_{}".format(i), pruning_folder)
            decision_tree_path = os.fsdecode(os.path.join(
                newPrunePath,
                "{}_kFold_{}_maxDepth_{}_prune".format(kFold, max_depth, i)))
            plot_decision_tree(clf, decision_tree_path, features_data.columns)

            decision_tree_obj = clf.tree_
            behavior_tree_obj = btBuilder.bt_espresso_mod(
                decision_tree_obj, features_data.columns, label_encoding)
            behaviot_tree_full_path = os.fsdecode(os.path.join(
                newPrunePath, constants.BEHAVIOR_TREE_XML_FILENAME))
            # save_tree receives the folder; the full path is only reported.
            btBuilder.save_tree(behavior_tree_obj, newPrunePath)

            report_file_obj.write("prune: {} \n".format(i))
            report_file_obj.write(
                " ccp_alpha: {}, train score: {}\n".format(ccp_alpha, score))
            report_file_obj.write(
                " Decision Tree saved to {}\n".format(decision_tree_path))
            report_file_obj.write(
                " Behavior Tree saved to {}\n\n".format(
                    behaviot_tree_full_path))

        fig, ax = plt.subplots()
        ax.set_xlabel("alpha")
        ax.set_ylabel("accuracy")
        ax.set_title("Accuracy vs alpha for training sets")
        ax.plot(ccp_alphas, train_scores, marker='o', label="train",
                drawstyle="steps-post")
        ax.legend()
        graph_path = os.fsdecode(
            os.path.join(output_full_path, PRUNING_GRAPH_FILENAME))
        plt.savefig(graph_path)
class Runner:
    """Main runner class for this file

    Attributes:
        json_manager (json_manager.JsonManager): JSON Manager for config.json
        log_file (_io.TextIOWrapper): Log file used for formatting
    """

    def __init__(self, json_file_path, log_file_path):
        """Create the JSON manager and open the log file for reading.

        Args:
            json_file_path (str): Full filepath to config.json
            log_file_path (str): Full filepath to output.log file
        """
        print(
            f"BehaviorTree building started using {json_file_path} and {log_file_path}"
        )
        self.json_manager = JsonManager(json_file_path)
        # Closed by get_file_fmt_and_label_encoding().
        self.log_file = open(log_file_path, "r")

    def get_file_fmt_and_label_encoding(self):
        """Read the format string and label encoding, then close the log.

        Returns:
            tuple(str, list<str>): Tuple containing the output file format
                string (first line of the log, including its line
                terminator) and the list of label encodings (second line)
        """
        try:
            fmt = self.log_file.readline()
            # NOTE(security): eval executes whatever the log file contains.
            # Only run this on log files produced by a trusted pipeline step.
            label_encoding = eval(self.log_file.readline())
        finally:
            # Close the handle even when the second line cannot be evaluated.
            self.log_file.close()
        return fmt, label_encoding

    def get_supervised_data_csv_filepath(self):
        """Returns filepath of data, uses one hot encoded if upsample = false in config.json

        Returns:
            string: filepath to data csv
        """
        data_folder = os.fsdecode(os.path.join(
            self.json_manager.get_hot_encoded_path(),
            constants.HOT_ENCODED_CSV_FOLDER_NAME))
        filename = constants.HOT_ENCODED_CSV_FILENAME

        if self.json_manager.get_upsample_status():
            data_folder = os.fsdecode(os.path.join(
                self.json_manager.get_upsampled_path(),
                constants.UPSAMPLED_CSV_FOLDER_NAME))
            filename = constants.UPSAMPLED_CSV_FILENAME

        return os.fsdecode(os.path.join(data_folder, filename))

    def get_output_folder(self, kFold, max_depth):
        """Return the output folder path for this kFold / max_depth pair.

        Args:
            kFold: Number of folds, used in the folder name.
            max_depth: Maximum tree depth, used in the folder name.
        """
        path = constants.combine_folder_and_working_dir(
            constants.PIPELINE_OUTPUT_FOLDER_NAME,
            self.json_manager.get_output_path())
        return constants.combine_folder_and_working_dir(
            "{}_kFold_{}_maxDepth".format(kFold, max_depth), path)

    def create_output_folder(self, kFold, max_depth):
        """Add the output folder for this kFold / max_depth pair.

        Args:
            kFold: Number of folds, used in the folder name.
            max_depth: Maximum tree depth, used in the folder name.

        Returns:
            Whatever ``constants.add_folder_to_directory`` returns for the
            innermost folder.
        """
        output_folder = constants.add_folder_to_directory(
            constants.PIPELINE_OUTPUT_FOLDER_NAME,
            self.json_manager.get_output_path())
        folder_name = "{}_kFold_{}_maxDepth".format(kFold, max_depth)
        return constants.add_folder_to_directory(folder_name, output_folder)

    def format_float_list_to_precision(self, list_in, precision):
        """Format each number with a fixed count of decimal places.

        Args:
            list_in: Iterable of numbers.
            precision: Number of digits after the decimal point.

        Returns:
            list<str>: One formatted string per input value.
        """
        prec_str = "{0:0." + str(precision) + "f}"
        return [prec_str.format(i) for i in list_in]

    def k_fold_train_decision_tree_w_max_depth(self, num_k_folds, max_depth,
                                               output_full_path):
        """Cross-validate one tree per pruning alpha.

        Fills ``self.train_scores`` and ``self.test_scores`` with the mean
        score over the folds, one entry per alpha of the cost-complexity
        pruning path of a tree fitted on all data.

        Args:
            num_k_folds: Number of folds.
            max_depth: Maximum depth given to every tree.
            output_full_path: Not used by this method.
        """
        kf = KFold(shuffle=True, n_splits=num_k_folds)

        # build full tree on all data
        full_tree = tree.DecisionTreeClassifier(
            random_state=self.json_manager.get_random_state(),
            max_depth=max_depth).fit(self.features_data, self.labels_data)

        # get set of alphas from cost_complexity_pruning
        prune_path = full_tree.cost_complexity_pruning_path(
            self.features_data, self.labels_data)
        ccp_alphas = prune_path.ccp_alphas

        self.train_scores = [0] * len(ccp_alphas)
        self.test_scores = [0] * len(ccp_alphas)

        # split data into train/test
        for train_index, test_index in kf.split(self.features_data):
            X_train = self.features_data.iloc[train_index]
            X_test = self.features_data.iloc[test_index]
            y_train = self.labels_data.iloc[train_index]
            y_test = self.labels_data.iloc[test_index]

            # create tree on each alpha
            for i, alpha in enumerate(ccp_alphas):
                clf = tree.DecisionTreeClassifier(
                    random_state=self.json_manager.get_random_state(),
                    max_depth=max_depth,
                    ccp_alpha=alpha)
                clf = clf.fit(X_train, y_train)
                # Dividing each fold's score by the fold count makes the
                # running sum the mean over folds.
                self.train_scores[i] += clf.score(X_train,
                                                  y_train) / num_k_folds
                self.test_scores[i] += clf.score(X_test, y_test) / num_k_folds

    def generate_full_binary_set(self):
        """Read the binary and categorical feature lists.

        NOTE(review): looks unfinished. The two lists are read but not
        combined, and the method returns ``None``.
        """
        bin_set = self.json_manager.get_binary_features()
        # categorical
        cat_set = self.json_manager.get_categorical_features()
        # LAT

    def run(self):
        """Reads in data, trains, and reports results
        """
        kFold = self.json_manager.get_kfold()
        max_depth = self.json_manager.get_decision_tree_depth()
        # NOTE(review): called here with a single full path; confirm that
        # remove_folder_if_exists accepts this form.
        constants.remove_folder_if_exists(
            self.get_output_folder(kFold, max_depth))

        fmt, label_encoding = self.get_file_fmt_and_label_encoding()
        self.supervised_learning_dataframe = pd.read_csv(
            self.get_supervised_data_csv_filepath())

        # Split on the label column by name.
        is_label = (self.supervised_learning_dataframe.columns ==
                    constants.LABEL_COLUMN_NAME)
        self.features_data = self.supervised_learning_dataframe.loc[
            :, self.supervised_learning_dataframe.columns !=
            constants.LABEL_COLUMN_NAME]
        self.labels_data = self.supervised_learning_dataframe.loc[:, is_label]

        output_full_path = self.create_output_folder(kFold, max_depth)
        self.k_fold_train_decision_tree_w_max_depth(kFold, max_depth,
                                                    output_full_path)

        report_file = "{}_kFold_{}_maxDepth.txt".format(kFold, max_depth)
        dot_pdf_header = "{}_kFold_{}_maxDepth".format(kFold, max_depth)
        report_file_path = os.path.join(output_full_path, report_file)

        # The context manager closes the report even if training or
        # plotting raises part-way through.
        with open(report_file_path, "w") as report_file_obj:
            report_file_obj.write(
                "Decision Tree with max_depth: {}, and kFold: {}\n".format(
                    max_depth, kFold))
            report_file_obj.write(
                " Decision Tree (DOT format) saved to: {}\n".format(
                    dot_pdf_header))
            report_file_obj.write(
                " Decision Tree (PDF format) saved to: {}.pdf\n".format(
                    dot_pdf_header))
            report_file_obj.write(
                "Check {} for appropriate pruning.\n\n\n".format(
                    PRUNING_GRAPH_FILENAME))

            # Tree fitted on all data; it supplies the pruning alphas.
            clf = tree.DecisionTreeClassifier(
                random_state=self.json_manager.get_random_state(),
                max_depth=max_depth)
            clf = clf.fit(self.features_data, self.labels_data)
            dot_pdf_full_path = os.fsdecode(
                os.path.join(output_full_path, dot_pdf_header))
            plot_decision_tree(clf, dot_pdf_full_path,
                               self.features_data.columns)

            prune_path = clf.cost_complexity_pruning_path(
                self.features_data, self.labels_data)
            ccp_alphas = prune_path.ccp_alphas
            pruning_folder = constants.add_folder_to_directory(
                constants.PRUNE_FOLDER_NAME, output_full_path)

            for i, ccp_alpha in enumerate(ccp_alphas):
                clf = tree.DecisionTreeClassifier(
                    random_state=self.json_manager.get_random_state(),
                    max_depth=max_depth, ccp_alpha=ccp_alpha)
                clf.fit(self.features_data, self.labels_data)
                score = clf.score(self.features_data, self.labels_data)

                newPrunePath = constants.add_folder_to_directory(
                    "Pruning_{0}_{1:.6g}".format(i, ccp_alpha),
                    pruning_folder)
                decision_tree_path = os.fsdecode(os.path.join(
                    newPrunePath,
                    "{0}_kFold_{1}_maxDepth_{2}_{3:.6g}_prune".format(
                        kFold, max_depth, i, ccp_alpha)))
                plot_decision_tree(clf, decision_tree_path,
                                   self.features_data.columns)

                decision_tree_obj = clf.tree_

                # TODO: should we include categorical features as binary?
                self.generate_full_binary_set()
                behavior_tree_obj = btBuilder.bt_espresso_mod(
                    decision_tree_obj,
                    self.features_data.columns,
                    label_encoding,
                    self.json_manager.get_binary_features())

                behaviot_tree_full_path = os.fsdecode(os.path.join(
                    newPrunePath, constants.BEHAVIOR_TREE_XML_FILENAME))

                # save_tree receives the folder; the full path is only
                # reported.
                btBuilder.save_tree(behavior_tree_obj, newPrunePath)

                report_file_obj.write("prune: {} \n".format(i))
                report_file_obj.write(
                    " ccp_alpha: {}, train score: {}\n".format(
                        ccp_alpha, score))
                report_file_obj.write(
                    " Decision Tree saved to {}\n".format(decision_tree_path))
                report_file_obj.write(
                    " Behavior Tree saved to {}\n\n".format(
                        behaviot_tree_full_path))

            # The plotted scores are the k-fold means.
            fig, ax = plt.subplots()
            ax.set_xlabel("alpha")
            ax.set_ylabel("accuracy")
            ax.set_title(
                "Accuracy vs alpha for Final Tree Prunes (note: uses all data for final training)"
            )
            ax.plot(ccp_alphas, self.train_scores, marker='o', label="train",
                    drawstyle="steps-post")
            ax.plot(ccp_alphas, self.test_scores, marker='x', label="test",
                    drawstyle="steps-post")
            ax.legend()
            graph_path = os.fsdecode(
                os.path.join(output_full_path, PRUNING_GRAPH_FILENAME))
            plt.savefig(graph_path)

            alist = ccp_alphas.flatten().tolist()
            acc_diffs = [
                a_i - b_i
                for a_i, b_i in zip(self.train_scores, self.test_scores)
            ]
            float_precision = 6
            results_path = os.fsdecode(
                os.path.join(output_full_path, RESULTS_TEXT_FILENAME))
            with open(results_path, "w") as results_txt_file:
                results_txt_file.write(
                    f"alphas:\t\t{self.format_float_list_to_precision(alist, float_precision)}\n"
                )
                results_txt_file.write(
                    f"train acc:\t{self.format_float_list_to_precision(self.train_scores, float_precision)}\n"
                )
                results_txt_file.write(
                    f"test acc:\t{self.format_float_list_to_precision(self.test_scores, float_precision)}\n"
                )
                results_txt_file.write(
                    f"acc diff:\t{self.format_float_list_to_precision(acc_diffs, float_precision)}\n"
                )

        print(f"BehaviorTree buidling finished, results in {output_full_path}")
def main():
    """Normalize every raw CSV in the configured folder, then merge them.

    The JSON configuration file is located via ``process_command_line_args``.
    For each CSV in the configured CSV folder a normalized copy is written
    into the normalized output folder; afterwards every file found in that
    output folder is concatenated into ``constants.COMBINED_CSV_FILENAME``.
    """
    json_file_path = process_command_line_args()
    json_manager = JsonManager(json_file_path)
    csv_folder = json_manager.get_csv_path()
    normalized_folder = json_manager.get_normalized_path()
    feature_columns = json_manager.get_feature_columns()
    label_columns = json_manager.get_label_columns()
    lag_features = json_manager.get_lag_features()
    lag_window_length = json_manager.get_sliding_window_length()

    destination_path = constants.add_folder_to_directory(
        constants.NORMALIZED_CSV_FOLDER_NAME, normalized_folder)
    # The label column indices do not depend on the file being processed.
    label_indices = list(label_columns.values())

    for file_name in os.listdir(csv_folder):
        if not is_file_CSV(file_name):
            continue
        complete_file_path = os.fsdecode(os.path.join(csv_folder, file_name))
        normalized_filename = make_modified_filename(
            file_name, CSV_NAME_EXTENSION)
        normalized_file_path = os.fsdecode(
            os.path.join(destination_path, normalized_filename))
        _normalize_csv_file(complete_file_path, normalized_file_path,
                            feature_columns, label_indices, lag_features,
                            lag_window_length)

    combined_csv_file_path = os.path.join(destination_path,
                                          constants.COMBINED_CSV_FILENAME)
    # Remove a stale combined file first so it is not read back into itself.
    if os.path.exists(combined_csv_file_path):
        os.remove(combined_csv_file_path)

    # NOTE: os.listdir returns entries in arbitrary order, so the row order of
    # the combined file follows whatever order the filesystem reports.
    combined_csv = pd.concat([
        pd.read_csv(os.fsdecode(os.path.join(destination_path, entry)))
        for entry in os.listdir(destination_path)
    ])
    combined_csv.to_csv(os.fsdecode(combined_csv_file_path),
                        index=False, encoding='utf-8-sig')


def _normalize_csv_file(source_path, target_path, feature_columns,
                        label_indices, lag_features, lag_window_length):
    """Write the labelled rows of one raw CSV into a normalized CSV.

    Args:
        source_path (str): Path of the raw CSV to read.
        target_path (str): Path of the normalized CSV to (over)write.
        feature_columns (dict): Maps feature column name to its index in a
            raw row.
        label_indices (list): Row indices of the candidate label columns; the
            first non-empty one is used as the row's label.
        lag_features (list): Names of features that are routed through
            ``update_lag_feature_queue`` instead of being copied directly.
        lag_window_length (int): Initial length of each lag queue.
    """
    # Position of each lag feature's queue; setdefault keeps the first
    # occurrence, matching what lag_features.index() would return.
    lag_queue_positions = {}
    for position, name in enumerate(lag_features):
        lag_queue_positions.setdefault(name, position)

    # The context manager closes both files even when a row raises, and
    # newline='' is what the csv module requires of the files it is given
    # (otherwise the writer emits '\r\r\n' line endings on Windows).
    with open(source_path, newline='') as current_csv_obj, \
            open(target_path, mode='w', newline='') as normalized_csv_obj:
        csv_reader = csv.reader(current_csv_obj,
                                delimiter=constants.CSV_DELIMITER)
        csv_writer = csv.writer(normalized_csv_obj,
                                delimiter=constants.CSV_DELIMITER,
                                quotechar=constants.CSV_QUOTECHAR,
                                quoting=csv.QUOTE_MINIMAL)

        all_lag_queues = [[""] * lag_window_length for _ in lag_features]

        header_row = list(feature_columns)
        header_row.append(constants.LABEL_COLUMN_NAME)
        csv_writer.writerow(header_row)

        # Skip the source header row (no-op for an empty file).
        next(csv_reader, None)

        for timeseries_row in csv_reader:
            label_values = [timeseries_row[index] for index in label_indices]
            label_value = next(
                (candidate for candidate in label_values if candidate), None)

            if label_value:
                new_normalize_row = []
                for column_name, column_index in feature_columns.items():
                    queue_position = lag_queue_positions.get(column_name)
                    if queue_position is not None:
                        lagged_feature = update_lag_feature_queue(
                            all_lag_queues[queue_position],
                            timeseries_row[column_index])
                        new_normalize_row.append(lagged_feature)
                    else:
                        new_normalize_row.append(timeseries_row[column_index])
                new_normalize_row.append(label_value)
                csv_writer.writerow(new_normalize_row)
            else:
                # Unlabelled rows are not written, but their values still
                # advance the lag queues.
                for queue_position, column_name in enumerate(lag_features):
                    value = timeseries_row[feature_columns[column_name]]
                    update_lag_feature_queue(all_lag_queues[queue_position],
                                             value)