Пример #1
0
def main(fpath, out_folder, use_idf=True):
    """Vectorize the songs in ``fpath`` at several filter levels and dump them.

    For every ``bottom_filter`` value in ``(0, 0.05, 0.1, 0.5)`` a sub-folder
    of ``out_folder`` named after that value receives two text files:

    * ``tags``: the non-zero entries of the document matrix, written as
      ``column:value`` pairs (six decimals), each followed by a space.  A
      line break is emitted every time the row index changes, so rows
      without any non-zero entry produce no line at all.
    * ``vocab``: one ``term term_id`` pair per line.

    Args:
        fpath: Input path, forwarded untouched to ``vectorize_songs``.
        out_folder: Directory under which the per-filter folders are created.
        use_idf: Forwarded to ``vectorize_songs`` as ``use_idf``.
    """
    for min_filter in 0, 0.05, 0.1, 0.5:
        folder = os.path.join(out_folder, str(min_filter))
        # Create missing parents and tolerate an already existing folder so
        # the script can be re-run against the same ``out_folder``.  Existing
        # ``tags``/``vocab`` files are overwritten by the 'w' opens below.
        if not os.path.isdir(folder):
            os.makedirs(folder)

        out_fpath_tags = os.path.join(folder, 'tags')
        out_fpath_vocab = os.path.join(folder, 'vocab')

        doc_mat, vocabulary = vectorize_songs(fpath, use_idf=use_idf,
                                              bottom_filter=min_filter)
        with open(out_fpath_tags, 'w') as tags_file:
            rows, cols = doc_mat.nonzero()
            # An all-zero matrix has no first row to seed from; in that case
            # the loop below does not run and the tags file stays empty.
            last_row = rows[0] if len(rows) else None
            for row, col in zip(rows, cols):
                if row != last_row:
                    # The one-line-per-row layout relies on the entries
                    # arriving grouped by increasing row index.
                    assert row > last_row
                    print(file=tags_file)
                    last_row = row

                print('%d:%.6f' % (col, doc_mat[row, col]),
                      file=tags_file, end=' ')

        with open(out_fpath_vocab, 'w') as vocab_file:
            for term in vocabulary:
                term_id = vocabulary[term]
                print(term, term_id, file=vocab_file)
def main(fpath):
    """Plot the CCDF of the number of non-zero entries per matrix row.

    The document matrix returned by ``vectorize_songs(fpath)`` is reduced to
    one count per row that has at least one non-zero entry (the plot labels
    this as the number of tags per song).  A low percentile score of those
    counts and the matrix shape are printed, then ``1 - ecdf`` is drawn on
    log-log axes and shown.

    Args:
        fpath: Input path, forwarded untouched to ``vectorize_songs``.
    """
    doc_mat = vectorize_songs(fpath)[0]
    rows = doc_mat.nonzero()[0]

    # Materialize the counts: on Python 3 ``Counter.values()`` is a dict
    # view, which numpy turns into a 0-d object array instead of numbers.
    to_plot = list(Counter(rows).values())
    # presumably ``ecdf`` returns array-likes supporting ``1 - cdf_y`` --
    # verify against its definition.
    x, cdf_y = ecdf(to_plot)
    ccdf_y = 1 - cdf_y

    # NOTE(review): ``scoreatpercentile`` takes ``per`` on a 0-100 scale, so
    # 0.1 is the 0.1th percentile -- confirm whether 10 was intended.
    print(stats.scoreatpercentile(to_plot, 0.1))
    print(doc_mat.shape)
    ax = plt.gca()
    ax.set_yscale("log")
    ax.set_xscale("log")

    plt.plot(x, ccdf_y, "bo")
    plt.xlabel("Number Tags per Song (x)")
    plt.ylabel("Prob(Num. Tags per Song > x)")
    plt.title("CCDF of Tags per Song")
    plt.show()
Пример #3
0
def main(fpath):
    """Plot the CCDF of the number of non-zero entries per matrix column.

    The document matrix returned by ``vectorize_songs(fpath)`` is reduced to
    one count per column that has at least one non-zero entry (the plot
    labels this as the number of songs carrying each tag).  A low percentile
    score of those counts and the matrix shape are printed, then
    ``1 - ecdf`` is drawn on log-log axes and shown.

    Args:
        fpath: Input path, forwarded untouched to ``vectorize_songs``.
    """
    doc_mat = vectorize_songs(fpath)[0]
    cols = doc_mat.nonzero()[1]

    # Materialize the counts: on Python 3 ``Counter.values()`` is a dict
    # view, which numpy turns into a 0-d object array instead of numbers.
    to_plot = list(Counter(cols).values())
    # presumably ``ecdf`` returns array-likes supporting ``1 - cdf_y`` --
    # verify against its definition.
    x, cdf_y = ecdf(to_plot)
    ccdf_y = 1 - cdf_y

    # NOTE(review): ``scoreatpercentile`` takes ``per`` on a 0-100 scale, so
    # 0.5 is the 0.5th percentile, not the median -- confirm whether 50 was
    # intended.
    print(stats.scoreatpercentile(to_plot, 0.5))
    print(doc_mat.shape)
    ax = plt.gca()
    ax.set_yscale('log')
    ax.set_xscale('log')

    plt.plot(x, ccdf_y, 'bo')
    plt.xlabel('Number of songs with tag (x)')
    plt.ylabel('Prob(Num. Songs with Tag > x)')
    plt.title('CCDF of Tag Popularity')
    plt.show()