def main(args):
    artists_filename = args.i_path
    chunk_filename = args.i_chunk
    global output_path
    output_path = args.output_path
    if output_path[-1] != '/':
        output_path += '/'

    global artists
    print('LOADING PKL...', end='')
    artists = load_data(filename=artists_filename)
    print('DONE')

    global chunk
    print('LOADING CHUNKS...')
    chunk = load_data(filename=chunk_filename)
    print('DONE')



    print('COMPUTE RANKING of selection ', chunk_filename)
    chunk_level_ranking = compute_ranking_master()
    output_filename = os.path.basename(chunk_filename)
    output_filename += '_OUT.pkl'
    output_path += output_filename

    save_data(chunk_level_ranking, filename=output_path)
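These examples all call load_data and save_data from primary.data_io, which are not part of this listing. A minimal pickle-based sketch with compatible call signatures (an assumption, not the project's actual implementation) could look like this:

import pickle

def load_data(filename):
    # Load a pickled object (e.g. the artists dictionary) from disk.
    with open(filename, 'rb') as f:
        return pickle.load(f)

def save_data(dict=None, filename=None):
    # Pickle an object to disk; the parameter is named 'dict' only to mirror
    # the keyword used at the call sites above (it shadows the builtin).
    with open(filename, 'wb') as f:
        pickle.dump(dict, f)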
Example #2
def main(args):
    input_pkl = args.i_path
    input_ranking = args.i_ranking
    output_path = args.output_path

    if output_path[-1] != '/':
        output_path += '/'

    global heatmap_metric
    global ranking_metric
    global peak_thresh
    heatmap_metric = args.heatmap_metric
    ranking_metric = args.ranking_metric
    peak_thresh = args.peak_thresh

    print('LOADING PKL ARTISTS...', end='')
    artists = load_data(filename=input_pkl)
    print('DONE')

    if input_ranking != 'NO':
        # optionally update the artists file with a precomputed ranking
        print('LOADING PKL RANKING...', end='')
        ranking = load_data(filename=input_ranking)
        print('DONE')

        for k, v in ranking.items():
            arr = np.array(v)
            artists[k].my_similar_artists = arr[:, 0]
            artists[k].my_similar_artists_distances = arr[:, 1]

    ranking_pathname = output_path + 'ranking.txt'
    print_rankings_verbose(artists=artists, filename=ranking_pathname,
                           output_path=output_path, heatmap_metric=heatmap_metric,
                           ranking_metric=ranking_metric, peak_thresh=peak_thresh)
    compute_ranking_score(artists=artists, ranking_metric=ranking_metric, heatmap_metric=heatmap_metric)
Example #3
def main(args):
    input_folder = args.i_path
    threshold = args.threshold
    mode = args.mode

    print('LOADING PKL...')
    artists = load_data(filename=input_folder)

    print('PREPROCESSING ', d[mode])

    X, y = gen_dataset(artists=artists, mode=mode)
    X, y = remove_outlier(X=X, y=y, thresh=threshold)
    X = normalize(X=X)
    print('TSNE')

    X = tsne(X=X, lr=1000)

    artists = optimize_artists_dictionary(artists)
    artists = attach_tsne_to_art_dict(artists=artists, X=X, y=y)

    tsne_min = np.amin(X, axis=0)
    tsne_max = np.amax(X, axis=0)

    print('[TSNE-1 - TSNE-2]')
    print('min values')
    print(np.amin(X, axis=0))
    print('max values')
    print(np.amax(X, axis=0))
    print('mean values')
    print(np.mean(X, axis=0))
    print('variance values')
    print(np.var(X, axis=0))

    artists = clean_similar_artists(artists=artists)
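The tsne(X=X, lr=1000) helper is not shown in this listing; it presumably wraps scikit-learn's t-SNE with a configurable learning rate. A minimal sketch under that assumption:

import numpy as np
from sklearn.manifold import TSNE

def tsne(X, lr=1000):
    # Project the feature matrix to 2 dimensions with the given learning rate.
    return TSNE(n_components=2, learning_rate=lr).fit_transform(np.asarray(X))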
Example #4
def main(args):
    input_path = args.input_pkl
    output_path = args.output_path
    global metric
    metric = args.metric
    global artists
    artists = load_data(input_path)

    chunk_filename = args.input_chunk
    print('LOADING CHUNKS...')
    chunk = load_data(filename=chunk_filename)
    print('DONE')

    d = build_matrix_master(chunk=chunk)

    save_data(filename=output_path, dict=d)
Example #5
def main(args):
    n_chunks = args.n_chunks
    chunk_folder = args.chunk_folder
    if chunk_folder[-1] != '/':
        chunk_folder += '/'

    #group all chunk level ranking in a single ranking file
    dictionary = dict()
    for i in range(n_chunks):
        chunk_filename = 'chunk_' + str(i) + '_OUT.pkl'
        chunk_pathname = chunk_folder + chunk_filename
        chunk_out = load_data(filename=chunk_pathname)

        for k, v in chunk_out.items():
            dictionary[k] = v
        del chunk_out

        print('chunk ', str(i), 'Memory (GB) : ',
              getCurrentMemoryUsage() / (2**20))
    final_pathname = chunk_folder + 'merged_OUT.pkl'
    print('before gc Memory (GB) : ', getCurrentMemoryUsage() / (2**20))
    gc.collect()
    print('after gc Memory (GB) : ', getCurrentMemoryUsage() / (2**20))
    df = pd.DataFrame.from_dict(dictionary)
    save_data(dict=df, filename=final_pathname)
    print('chunk ', str(i), 'Memory (GB) : ',
          getCurrentMemoryUsage() / (2**20))
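getCurrentMemoryUsage() is also not included in this listing; given the division by 2**20 to report GB, it is assumed to return the resident set size in kB. A common Linux-only sketch that matches that assumption:

def getCurrentMemoryUsage():
    # Parse VmRSS (resident set size, reported in kB) from /proc/self/status.
    with open('/proc/self/status') as f:
        for line in f:
            if line.startswith('VmRSS:'):
                return int(line.split()[1])
    return 0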
Example #6
def main(args):

    input_folder = args.i_path
    threshold = args.threshold
    mode = args.mode
    intervals = args.resolution
    print('LOADING PKL...')
    artists = load_data(filename=input_folder)

    print('PREPROCESSING ', d[mode])

    X, y = gen_dataset(artists=artists, mode=mode)
    X, y = remove_outlier(X=X, y=y, thresh=threshold)
    X = normalize(X=X)

    for lr in [10, 100, 500, 1000]:
        print('TSNE with learning rate =', lr)
        X_emb = tsne(X, lr=lr)

        print('[TSNE-1 - TSNE-2]')
        print('min values')
        print(np.amin(X_emb, axis=0))
        print('max values')
        print(np.amax(X_emb, axis=0))
        print('mean values')
        print(np.mean(X_emb, axis=0))
        print('variance values')
        print(np.var(X_emb, axis=0))

    #artists = optimize_artists_dictionary(artists)
    #artists = attach_tsne_to_art_dict(artists=artists, X=X, y=y)

    tsne_min = np.amin(X, axis=0)
    tsne_max = np.amax(X, axis=0)
Example #7
def main(args):
    input_filename = args.i_path
    output_folder = args.o_folder
    type = args.type
    if output_folder[-1] != '/':
        output_folder += '/'

    artists = load_data(filename=input_filename)

    if type == 'only_tsne':
        out_pathname = output_folder + 'dataset_tsne.xlsx'
        columns = [
            'SONG_ID', 'SONG_NAME', 'ARTIST_ID', 'ARTIST_NAME', 'TSNE_1',
            'TSNE_2', 'SIMILAR_ARTISTS'
        ]
        n_out = 0
        data = []
        for a in artists.values():
            for s in a.song_list.values():
                try:
                    row = [
                        s.id, s.name, a.id, a.name, s.tsne[0], s.tsne[1],
                        a.similar_artists
                    ]
                    data.append(row)
                except Exception:
                    # skip songs without t-SNE coordinates or similar-artist info
                    n_out += 1
        df = pd.DataFrame(data=data, columns=columns)
        writer = pd.ExcelWriter(out_pathname, engine='xlsxwriter')

        # Convert the dataframe to an XlsxWriter Excel object.
        df.to_excel(writer, sheet_name='Sheet1')

        # Close the Pandas Excel writer and output the Excel file.
        writer.save()
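Note that ExcelWriter.save() has been deprecated and removed in recent pandas releases; on current versions the same export can be written with a context manager, which closes the writer automatically:

with pd.ExcelWriter(out_pathname, engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='Sheet1')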
Example #8
def main(args):
    distances_filename = args.distances
    note = args.note
    distances = load_data(filename=distances_filename)

    max_length_ranking = build_max_length_ranking(distances=distances)

    output_path = os.path.dirname(distances_filename)
    basename = 'max_length_ranking_'+note+'.pkl'
    final_pathname = os.path.join(output_path,basename)
    save_data(filename=final_pathname, dict=max_length_ranking)
Example #9
def main(args):
    input_folder = args.i_path
    global output_path
    output_path = args.o_path
    if output_path[-1] != '/':
        output_path += '/'
    global artists
    print('LOADING PKL...')
    artists = load_data(filename=input_folder)

    print('PREPROCESSING')

    X, y = gen_dataset(artists=artists, mode=3)

    for t in np.arange(1, 3, 0.2):
        A, b = remove_outlier(X=X, y=y, thresh=t)
Example #10
def main(args):
    n_chunks = args.n_chunks
    chunk_folder = args.chunk_folder
    if chunk_folder[-1] != '/':
        chunk_folder += '/'

    #group all chunk level ranking in a single ranking file
    ranking = dict()
    for i in range(n_chunks):
        chunk_filename = 'chunk_' + str(i) + '.pkl_OUT.pkl'
        chunk_pathname = chunk_folder + chunk_filename
        chunk_out = load_data(filename=chunk_pathname)

        for k, v in chunk_out.items():
            ranking[k] = v

    final_pathname = chunk_folder + 'merged_OUT.pkl'

    save_data(ranking, filename=final_pathname)
Example #11
def main(args):
    input_folder = args.i_path
    threshold = args.threshold
    output_pkl = args.output_pkl
    global output_path
    output_path = args.o_path
    if output_path[-1] != '/':
        output_path += '/'
    mode = args.mode
    global artists
    print('LOADING PKL...')
    artists = load_data(filename=input_folder)


    print('PREPROCESSING ', d[mode])
    X, y = gen_dataset(artists=artists, mode=mode)
    X, y = remove_outlier(X=X, y=y, thresh=threshold)
    X = normalize(X=X)
    print('TSNE')
    X = tsne(X=X, lr=1000)
    artists = optimize_artists_dictionary(artists)
    artists = attach_tsne_to_art_dict(artists=artists, X=X, y=y)
    min = np.amin(X, axis=0)
    max = np.amax(X, axis=0)
    dimension = 20
    print('[TSNE-1 - TSNE-2]')
    print('min values')
    print(np.amin(X, axis=0))
    print('max values')
    print(np.amax(X, axis=0))
    print('mean values')
    print(np.mean(X, axis=0))
    print('variance values')
    print(np.var(X, axis=0))
    artists = clean_similar_artists(artists=artists)
    print('GENERATE HEATMAPS')
    gen_heatmaps_master(dimension=dimension, min=min, max=max)
    print('SAVING DATA')
    save_data(artists, filename=output_pkl)


    print('PLOT HEATMAPS in ', output_path)
    plot_heatmaps_master(dimension=dimension, min=min, max=max)
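gen_heatmaps_master(dimension=..., min=..., max=...) does not appear in this listing. A plausible per-artist sketch, assuming each heatmap is a dimension x dimension 2D histogram of that artist's song t-SNE coordinates over the global [min, max] range (an assumption about the project's actual logic, not a copy of it):

import numpy as np

def gen_heatmap_for_artist(points, dimension, min_xy, max_xy):
    # points: (n_songs, 2) array of t-SNE coordinates for one artist.
    hist, _, _ = np.histogram2d(points[:, 0], points[:, 1],
                                bins=dimension,
                                range=[[min_xy[0], max_xy[0]],
                                       [min_xy[1], max_xy[1]]])
    return hist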
Example #12
def main(args):
    input_folder = args.i_path

    if args.o_path[-1] == '/':
        output_filename = args.o_path + args.o_name
    else:
        output_filename = args.o_path + '/' + args.o_name

    artists = load_data(filename=input_folder)
    print('PREPROCESSING')
    X, y = prepare_dataset(artists=artists, remove_outliers=True, mode=3,
                           local_outlier=False, print_stats=args.stats,
                           print_outlier_percentage_p_feature=True,
                           outlier_trheshold=args.outlier_trheshold)

    '''
    X = tsne(X)

    artists = optimize_artists_dictionary(artists)
    artists = attach_tsne_to_art_dict(artists=artists, X=X, y=y)

    tsne_min = np.amin(X, axis=0)
    tsne_max = np.amax(X, axis=0)

    if args.verbosity > 0:
        print('[TSNE-1 - TSNE-2]')
        print('min values')
        print(np.amin(X, axis=0))
        print('max values')
        print(np.amax(X, axis=0))
        print('mean values')
        print(np.mean(X, axis=0))
        print('variance values')
        print(np.var(X, axis=0))

    artists = clean_similar_artists(artists=artists)

    save_data(artists, filename=output_filename)
    '''


    return
Example #13
def main(args):
    input_pkl = args.i_path
    n_chunks = args.n_chunks
    output_folder = args.o_path
    if output_folder[-1] != '/':
        output_folder += '/'

    print('LOADING PKL ', input_pkl)

    artists = load_data(filename=input_pkl)
    clean_list = dict()
    # remove from the lists those artists that don't have a heatmap
    for k, v in artists.items():
        if v.tsne_heatmap is not None:
            clean_list[k] = k

    divide_dict_in_N_parts(artists=clean_list,
                           n=n_chunks,
                           save_to_pkl=True,
                           output_path=output_folder)

    print(str(len(clean_list)), ' of ', str(len(artists)),
          ' artists kept; those with no heatmap were not added to the lists')
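divide_dict_in_N_parts is another helper that is not part of this listing. A minimal sketch, assuming it splits the keys into n roughly equal chunks and pickles each part as chunk_<i>.pkl (the chunk filenames used by the merge scripts above):

import pickle

def divide_dict_in_N_parts(artists, n, save_to_pkl=True, output_path='./'):
    # Round-robin split of the keys into n roughly equal groups.
    keys = list(artists.keys())
    chunks = [keys[i::n] for i in range(n)]
    for i, chunk_keys in enumerate(chunks):
        part = {k: artists[k] for k in chunk_keys}
        if save_to_pkl:
            with open(output_path + 'chunk_' + str(i) + '.pkl', 'wb') as f:
                pickle.dump(part, f)
    return chunks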
Example #14
def main(args):
    input_folder = args.i_path
    threshold = args.threshold
    artists = load_data(filename=input_folder)

    X, y = gen_dataset(artists=artists)

    if PRINT_DISTRIBUTION:
        # PRINT VALUES BEFORE OUTLIER REMOVAL
        feat_names = get_features_dict()
        x = np.array(X)
        for i in range(x.shape[1]):
            ax = plt.hist(x[:, i], bins=200)
            filename = args.o_path + '/BEFORE/' + feat_names[i] + '.png'
            title = feat_names[i] + ' BEFORE outlier removal'
            plt.title(title)
            plt.savefig(filename)
            plt.close('all')

    X, y = remove_outlier(X=X,
                          y=y,
                          thresh=threshold,
                          verbose=False,
                          save_histogram=True)
    #X = normalize(X=X)
    #X, y = remove_outliers_lof(data=X, y=y)

    if PRINT_DISTRIBUTION:
        # PRINT VALUES AFTER OUTLIER REMOVAL
        x = np.array(X)
        for i in range(x.shape[1]):
            ax = plt.hist(x[:, i], bins=200)
            filename = args.o_path + '/AFTER/' + feat_names[i] + '.png'
            title = feat_names[i] + ' AFTER outlier removal'
            plt.title(title)
            plt.savefig(filename)
            plt.close('all')
Example #15
def main(args):
    input_path = args.input_pkl
    output_path = args.output_path
    if output_path[-1] != '/':
        output_path += '/'

    output_names = output_path + 'names.pkl'
    output_heatmaps = output_path + 'heatmaps.pkl'
    output_gt = output_path + 'ground_truth.pkl'

    artists = load_data(filename=input_path)

    names = dict()
    heatmaps = dict()
    ground_truth = dict()

    for id_, artist in artists.items():
        names[id_] = artist.id
        heatmaps[id_] = artist.tsne_heatmap
        ground_truth[id_] = artist.similar_artists

    save_data(filename=output_heatmaps, dict=heatmaps)
    save_data(filename=output_names, dict=names)
    save_data(filename=output_gt, dict=ground_truth)
Example #16
                        help='path to pkl name file')
    parser.add_argument('--ranking',
                        '-r',
                        required=False,
                        type=str,
                        default='./.pkl',
                        help='path to ranking file')
    parser.add_argument('--output_folder',
                        '-o',
                        required=False,
                        type=str,
                        default='./OUTPUT',
                        help='output folder')
    args = parser.parse_args()

    names = load_data(filename=args.names)
    heatmaps = load_data(filename=args.heatmaps)
    ground_truth = load_data(filename=args.ground_truth)
    distances = load_data(filename=args.distances)
    ranking = load_data(filename=args.ranking)
    output_folder = args.output_folder
    print_histograms(gt_distances=distance_vs_gt_position(
        ground_truth=ground_truth, distances=distances),
                     folder=output_folder)
'''
    names = load_data(filename='/home/gigi/PycharmProjects/TESI_BIS/OUTPUT/names.pkl')
    heatmaps = load_data(filename='/home/gigi/PycharmProjects/TESI_BIS/OUTPUT/heatmaps.pkl')
    ground_truth = load_data(filename='/home/gigi/PycharmProjects/TESI_BIS/OUTPUT/ground_truth.pkl')
    distances = load_data(filename='/home/gigi/PycharmProjects/TESI_BIS/OUTPUT/distances_cc_peak_2.pkl')
    ranking = load_data(filename=)'''
Example #17
File: prova.py Project: gigpir/TESI_BIS
import sys

# insert at 1, 0 is the script path (or '' in REPL); make the project package
# importable before importing from it
sys.path.insert(1, '/home/crottondi/PIRISI_TESI/TESI_BIS/')

import argparse
import numpy as np
import pandas as pd

from primary.heatmap import compute_heatmap_distance
from primary.data_io import save_data, load_data
import primary.rbo as rbo

if __name__ == '__main__':

    artists = load_data(
        filename='/home/gigi/PycharmProjects/TESI_BIS/PKL/artists_subset_hm.pkl'
    )
    max_songs = 0
    max_id = ''
    # find the artist with the largest song list
    for id_, artist in artists.items():
        if len(artist.song_list) > max_songs:
            max_id = id_
            max_songs = len(artist.song_list)
    print(max_id, max_songs)