def main(args):
    artists_filename = args.i_path
    chunk_filename = args.i_chunk

    global output_path
    output_path = args.output_path
    if output_path[-1] != '/':
        output_path += '/'

    global artists
    print('LOADING PKL...', end='')
    artists = load_data(filename=artists_filename)
    print('DONE')

    global chunk
    print('LOADING CHUNKS...')
    chunk = load_data(filename=chunk_filename)
    print('DONE')

    print('COMPUTE RANKING of selection ', chunk_filename)
    chunk_level_ranking = compute_ranking_master()

    output_filename = os.path.basename(chunk_filename)
    output_filename += '_OUT.pkl'
    output_path += output_filename
    save_data(chunk_level_ranking, filename=output_path)
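# NOTE (illustrative sketch, not part of the original sources): load_data and
# save_data come from primary.data_io and are assumed here to be thin pickle
# wrappers. The signatures below are inferred from the call sites in these
# scripts; treat this as an assumption, not the project's actual implementation.
import pickle

def load_data(filename=None):
    # Deserialize a pickled object from disk.
    with open(filename, 'rb') as f:
        return pickle.load(f)

def save_data(dict=None, filename=None):
    # Serialize an arbitrary object; the parameter name 'dict' mirrors the
    # keyword used at the call sites (save_data(dict=..., filename=...)).
    with open(filename, 'wb') as f:
        pickle.dump(dict, f, protocol=pickle.HIGHEST_PROTOCOL)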
def main(args):
    input_pkl = args.i_path
    input_ranking = args.i_ranking
    output_path = args.output_path
    if output_path[-1] != '/':
        output_path += '/'

    global heatmap_metric
    global ranking_metric
    global peak_thresh
    heatmap_metric = args.heatmap_metric
    ranking_metric = args.ranking_metric
    peak_thresh = args.peak_thresh

    print('LOADING PKL ARTISTS...', end='')
    artists = load_data(filename=input_pkl)
    print('DONE')

    if input_pkl != 'NO':  # update artists file
        print('LOADING PKL RANKING...', end='')
        ranking = load_data(filename=input_ranking)
        print('DONE')
        for k, v in ranking.items():
            arr = np.array(v)
            artists[k].my_similar_artists = arr[:, 0]
            artists[k].my_similar_artists_distances = arr[:, 1]

    ranking_pathname = output_path + 'ranking.txt'
    print_rankings_verbose(artists=artists, filename=ranking_pathname,
                           output_path=output_path, heatmap_metric=heatmap_metric,
                           ranking_metric=ranking_metric, peak_thresh=peak_thresh)
    compute_ranking_score(artists=artists, ranking_metric=ranking_metric,
                          heatmap_metric=heatmap_metric)
def main(args):
    input_folder = args.i_path
    threshold = args.threshold
    mode = args.mode

    print('LOADING PKL...')
    artists = load_data(filename=input_folder)

    print('PREPROCESSING ', d[mode])
    X, y = gen_dataset(artists=artists, mode=mode)
    X, y = remove_outlier(X=X, y=y, thresh=threshold)
    X = normalize(X=X)

    print('TSNE')
    X = tsne(X=X, lr=1000)
    artists = optimize_artists_dictionary(artists)
    artists = attach_tsne_to_art_dict(artists=artists, X=X, y=y)
    tsne_min = np.amin(X, axis=0)
    tsne_max = np.amax(X, axis=0)

    print('[TSNE-1 - TSNE-2]')
    print('min values')
    print(np.amin(X, axis=0))
    print('max values')
    print(np.amax(X, axis=0))
    print('mean values')
    print(np.mean(X, axis=0))
    print('variance values')
    print(np.var(X, axis=0))

    artists = clean_similar_artists(artists=artists)
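# NOTE (illustrative sketch, assumption): the tsne() helper called above and in
# the learning-rate sweep below is not defined in these listings. A minimal
# wrapper around scikit-learn's TSNE that matches the call sites tsne(X=X, lr=...)
# and tsne(X, lr=...) could look like this; the project's real helper may differ.
from sklearn.manifold import TSNE

def tsne(X, lr=1000):
    # Project the feature matrix onto 2 dimensions with t-SNE,
    # using the given learning rate.
    return TSNE(n_components=2, learning_rate=lr).fit_transform(X)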
def main(args):
    input_path = args.input_pkl
    output_path = args.output_path

    global metric
    metric = args.metric

    global artists
    artists = load_data(input_path)

    chunk_filename = args.input_chunk
    print('LOADING CHUNKS...')
    chunk = load_data(filename=chunk_filename)
    print('DONE')

    d = build_matrix_master(chunk=chunk)
    save_data(filename=output_path, dict=d)
def main(args):
    n_chunks = args.n_chunks
    chunk_folder = args.chunk_folder
    if chunk_folder[-1] != '/':
        chunk_folder += '/'

    # group all chunk-level rankings into a single ranking file
    dictionary = dict()
    for i in range(n_chunks):
        chunk_filename = 'chunk_' + str(i) + '_OUT.pkl'
        chunk_pathname = chunk_folder + chunk_filename
        chunk_out = load_data(filename=chunk_pathname)
        for k, v in chunk_out.items():
            dictionary[k] = v
        del chunk_out
        print('chunk ', str(i), 'Memory (GB) : ', getCurrentMemoryUsage() / (2**20))

    final_pathname = chunk_folder + 'merged_OUT.pkl'
    print('before gc Memory (GB) : ', getCurrentMemoryUsage() / (2**20))
    gc.collect()
    print('after gc Memory (GB) : ', getCurrentMemoryUsage() / (2**20))

    df = pd.DataFrame.from_dict(dictionary)
    save_data(dict=df, filename=final_pathname)
    print('final Memory (GB) : ', getCurrentMemoryUsage() / (2**20))
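# NOTE (illustrative sketch, assumption): getCurrentMemoryUsage is not defined in
# these listings. A common Linux-only implementation returns the resident set
# size in kilobytes by parsing /proc/self/status, which is consistent with the
# division by 2**20 (kB -> GB) in the memory prints above.
def getCurrentMemoryUsage():
    # Return the process resident set size (VmRSS) in kilobytes.
    with open('/proc/self/status') as f:
        for line in f:
            if line.startswith('VmRSS:'):
                return int(line.split()[1])  # value is reported in kB
    return 0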
def main(args):
    input_folder = args.i_path
    threshold = args.threshold
    mode = args.mode
    intervals = args.resolution

    print('LOADING PKL...')
    artists = load_data(filename=input_folder)

    print('PREPROCESSING ', d[mode])
    X, y = gen_dataset(artists=artists, mode=mode)
    X, y = remove_outlier(X=X, y=y, thresh=threshold)
    X = normalize(X=X)

    for lr in [10, 100, 500, 1000]:
        print('TSNE with learning rate =', lr)
        X_emb = tsne(X, lr=lr)
        print('[TSNE-1 - TSNE-2]')
        print('min values')
        print(np.amin(X_emb, axis=0))
        print('max values')
        print(np.amax(X_emb, axis=0))
        print('mean values')
        print(np.mean(X_emb, axis=0))
        print('variance values')
        print(np.var(X_emb, axis=0))

    # artists = optimize_artists_dictionary(artists)
    # artists = attach_tsne_to_art_dict(artists=artists, X=X, y=y)
    tsne_min = np.amin(X, axis=0)
    tsne_max = np.amax(X, axis=0)
def main(args):
    input_filename = args.i_path
    output_folder = args.o_folder
    type = args.type
    if output_folder[-1] != '/':
        output_folder += '/'

    artists = load_data(filename=input_filename)

    if type == 'only_tsne':
        out_pathname = output_folder + 'dataset_tsne.xlsx'
        columns = [
            'SONG_ID', 'SONG_NAME', 'ARTIST_ID', 'ARTIST_NAME',
            'TSNE_1', 'TSNE_2', 'SIMILAR_ARTISTS'
        ]
        n_out = 0
        data = []
        for a in artists.values():
            for s in a.song_list.values():
                try:
                    row = [
                        s.id, s.name, a.id, a.name,
                        s.tsne[0], s.tsne[1], a.similar_artists
                    ]
                    data.append(row)
                except Exception:
                    # songs without t-SNE coordinates are counted and skipped
                    n_out += 1

        df = pd.DataFrame(data=data, columns=columns)
        writer = pd.ExcelWriter(out_pathname, engine='xlsxwriter')
        # Convert the dataframe to an XlsxWriter Excel object.
        df.to_excel(writer, sheet_name='Sheet1')
        # Close the Pandas Excel writer and output the Excel file.
        writer.save()
def main(args):
    distances_filename = args.distances
    note = args.note

    distances = load_data(filename=distances_filename)
    max_length_ranking = build_max_length_ranking(distances=distances)

    output_path = os.path.dirname(distances_filename)
    basename = 'max_length_ranking_' + note + '.pkl'
    final_pathname = os.path.join(output_path, basename)
    save_data(filename=final_pathname, dict=max_length_ranking)
def main(args):
    input_folder = args.i_path

    global output_path
    output_path = args.o_path
    if output_path[-1] != '/':
        output_path += '/'

    global artists
    print('LOADING PKL...')
    artists = load_data(filename=input_folder)

    print('PREPROCESSING')
    X, y = gen_dataset(artists=artists, mode=3)
    for t in np.arange(1, 3, 0.2):
        A, b = remove_outlier(X=X, y=y, thresh=t)
def main(args):
    n_chunks = args.n_chunks
    chunk_folder = args.chunk_folder
    if chunk_folder[-1] != '/':
        chunk_folder += '/'

    # group all chunk-level rankings into a single ranking file
    ranking = dict()
    for i in range(n_chunks):
        chunk_filename = 'chunk_' + str(i) + '.pkl_OUT.pkl'
        chunk_pathname = chunk_folder + chunk_filename
        chunk_out = load_data(filename=chunk_pathname)
        for k, v in chunk_out.items():
            ranking[k] = v

    final_pathname = chunk_folder + 'merged_OUT.pkl'
    save_data(ranking, filename=final_pathname)
def main(args):
    input_folder = args.i_path
    threshold = args.threshold
    output_pkl = args.output_pkl

    global output_path
    output_path = args.o_path
    if output_path[-1] != '/':
        output_path += '/'
    mode = args.mode

    global artists
    print('LOADING PKL...')
    artists = load_data(filename=input_folder)

    print('PREPROCESSING ', d[mode])
    X, y = gen_dataset(artists=artists, mode=mode)
    X, y = remove_outlier(X=X, y=y, thresh=threshold)
    X = normalize(X=X)

    print('TSNE')
    X = tsne(X=X, lr=1000)
    artists = optimize_artists_dictionary(artists)
    artists = attach_tsne_to_art_dict(artists=artists, X=X, y=y)

    tsne_min = np.amin(X, axis=0)
    tsne_max = np.amax(X, axis=0)
    dimension = 20

    print('[TSNE-1 - TSNE-2]')
    print('min values')
    print(np.amin(X, axis=0))
    print('max values')
    print(np.amax(X, axis=0))
    print('mean values')
    print(np.mean(X, axis=0))
    print('variance values')
    print(np.var(X, axis=0))

    artists = clean_similar_artists(artists=artists)

    print('GENERATE HEATMAPS')
    gen_heatmaps_master(dimension=dimension, min=tsne_min, max=tsne_max)

    print('SAVING DATA')
    save_data(artists, filename=output_pkl)

    print('PLOT HEATMAPS in ', output_path)
    plot_heatmaps_master(dimension=dimension, min=tsne_min, max=tsne_max)
def main(args):
    input_folder = args.i_path
    if args.o_path[-1] == '/':
        output_filename = args.o_path + args.o_name
    else:
        output_filename = args.o_path + '/' + args.o_name

    artists = load_data(filename=input_folder)

    print('PREPROCESSING')
    X, y = prepare_dataset(artists=artists, remove_outliers=True, mode=3,
                           local_outlier=False, print_stats=args.stats,
                           print_outlier_percentage_p_feature=True,
                           outlier_trheshold=args.outlier_trheshold)
    '''
    X = tsne(X)
    artists = optimize_artists_dictionary(artists)
    artists = attach_tsne_to_art_dict(artists=artists, X=X, y=y)
    tsne_min = np.amin(X, axis=0)
    tsne_max = np.amax(X, axis=0)
    if args.verbosity > 0:
        print('[TSNE-1 - TSNE-2]')
        print('min values')
        print(np.amin(X, axis=0))
        print('max values')
        print(np.amax(X, axis=0))
        print('mean values')
        print(np.mean(X, axis=0))
        print('variance values')
        print(np.var(X, axis=0))
    artists = clean_similar_artists(artists=artists)
    save_data(artists, filename=output_filename)
    '''
    return
def main(args):
    input_pkl = args.i_path
    n_chunks = args.n_chunks
    output_folder = args.o_path
    if output_folder[-1] != '/':
        output_folder += '/'

    print('LOADING PKL ', input_pkl)
    artists = load_data(filename=input_pkl)

    # remove from the lists those artists that don't have a heatmap
    clean_list = dict()
    for k, v in artists.items():
        if v.tsne_heatmap is not None:
            clean_list[k] = k

    divide_dict_in_N_parts(artists=clean_list, n=n_chunks, save_to_pkl=True,
                           output_path=output_folder)
    print(str(len(clean_list)), ' over ', str(len(artists)),
          ' artists; those with no heatmap were not added to the lists')
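# NOTE (illustrative sketch, assumption): divide_dict_in_N_parts is called above
# but not defined in these listings. One simple way to split the artist ids into
# n roughly equal chunks and pickle each one as chunk_<i>.pkl (the naming the
# merge scripts expect) is shown below; the project's real helper may differ.
import numpy as np

def divide_dict_in_N_parts(artists=None, n=1, save_to_pkl=False, output_path='./'):
    # Split the keys into n roughly equal parts and optionally pickle each part.
    parts = np.array_split(list(artists.keys()), n)
    chunks = [{k: artists[k] for k in part} for part in parts]
    if save_to_pkl:
        for i, c in enumerate(chunks):
            save_data(dict=c, filename=output_path + 'chunk_' + str(i) + '.pkl')
    return chunks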
def main(args):
    input_folder = args.i_path
    threshold = args.threshold

    artists = load_data(filename=input_folder)
    X, y = gen_dataset(artists=artists)

    if PRINT_DISTRIBUTION:
        # print feature distributions BEFORE outlier removal
        feat_names = get_features_dict()
        x = np.array(X)
        for i in range(x.shape[1]):
            ax = plt.hist(x[:, i], bins=200)
            filename = args.o_path + '/BEFORE/' + feat_names[i] + '.png'
            title = feat_names[i] + ' BEFORE outlier removal'
            plt.title(title)
            plt.savefig(filename)
            plt.close('all')

    X, y = remove_outlier(X=X, y=y, thresh=threshold, verbose=False, save_histogram=True)
    # X = normalize(X=X)
    # X, y = remove_outliers_lof(data=X, y=y)

    if PRINT_DISTRIBUTION:
        # print feature distributions AFTER outlier removal
        x = np.array(X)
        for i in range(x.shape[1]):
            ax = plt.hist(x[:, i], bins=200)
            filename = args.o_path + '/AFTER/' + feat_names[i] + '.png'
            title = feat_names[i] + ' AFTER outlier removal'
            plt.title(title)
            plt.savefig(filename)
            plt.close('all')
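# NOTE (illustrative sketch, assumption): remove_outlier is used throughout these
# scripts but not defined here. A common z-score filter that drops any sample
# whose features deviate more than `thresh` standard deviations from the mean is
# shown below; the project's actual helper (including its verbose and
# save_histogram behavior) is not reproduced.
def remove_outlier(X=None, y=None, thresh=3.0, **kwargs):
    # Keep only the rows whose standardized feature values all lie within thresh.
    X = np.array(X, dtype=float)
    y = np.array(y)
    mu = X.mean(axis=0)
    sigma = X.std(axis=0)
    sigma[sigma == 0] = 1.0  # avoid division by zero on constant features
    keep = (np.abs((X - mu) / sigma) <= thresh).all(axis=1)
    return X[keep], y[keep]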
def main(args):
    input_path = args.input_pkl
    output_path = args.output_path
    if output_path[-1] != '/':
        output_path += '/'

    output_names = output_path + 'names.pkl'
    output_heatmaps = output_path + 'heatmaps.pkl'
    output_gt = output_path + 'ground_truth.pkl'

    artists = load_data(filename=input_path)

    names = dict()
    heatmaps = dict()
    ground_truth = dict()
    for id_, artist in artists.items():
        names[id_] = artist.id
        heatmaps[id_] = artist.tsne_heatmap
        ground_truth[id_] = artist.similar_artists

    save_data(filename=output_heatmaps, dict=heatmaps)
    save_data(filename=output_names, dict=names)
    save_data(filename=output_gt, dict=ground_truth)
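# Usage sketch (assumption): the three pickles written above can be reloaded
# independently by the evaluation scripts, without touching the full artists
# object, e.g.:
#
#     names = load_data(filename=output_path + 'names.pkl')
#     heatmaps = load_data(filename=output_path + 'heatmaps.pkl')
#     ground_truth = load_data(filename=output_path + 'ground_truth.pkl')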
                    help='path to pkl name file')
parser.add_argument('--ranking', '-r', required=False, type=str,
                    default='./.pkl', help='path to ranking file')
parser.add_argument('--output_folder', '-o', required=False, type=str,
                    default='./OUTPUT', help='output folder')
args = parser.parse_args()

names = load_data(filename=args.names)
heatmaps = load_data(filename=args.heatmaps)
ground_truth = load_data(filename=args.ground_truth)
distances = load_data(filename=args.distances)
ranking = load_data(filename=args.ranking)
output_folder = args.output_folder

print_histograms(gt_distances=distance_vs_gt_position(
    ground_truth=ground_truth, distances=distances), folder=output_folder)

'''
names = load_data(filename='/home/gigi/PycharmProjects/TESI_BIS/OUTPUT/names.pkl')
heatmaps = load_data(filename='/home/gigi/PycharmProjects/TESI_BIS/OUTPUT/heatmaps.pkl')
ground_truth = load_data(filename='/home/gigi/PycharmProjects/TESI_BIS/OUTPUT/ground_truth.pkl')
distances = load_data(filename='/home/gigi/PycharmProjects/TESI_BIS/OUTPUT/distances_cc_peak_2.pkl')
ranking = load_data(filename=)
'''
import sys
# insert at 1, 0 is the script path (or '' in a REPL);
# the path must be added before importing the primary package
sys.path.insert(1, '/home/crottondi/PIRISI_TESI/TESI_BIS/')

import argparse
import numpy as np
import pandas as pd

from primary.heatmap import compute_heatmap_distance
from primary.data_io import save_data, load_data
import primary.rbo as rbo

if __name__ == '__main__':
    artists = load_data(
        filename='/home/gigi/PycharmProjects/TESI_BIS/PKL/artists_subset_hm.pkl'
    )
    # find the artist with the largest song list
    max_songs = 0
    max_id = ''
    for id_, a_ in artists.items():
        if len(a_.song_list) > max_songs:
            max_id = id_
            max_songs = len(a_.song_list)
    print(max_id, max_songs)