Exemplo n.º 1
0
    def test_d1_3_oov_steve(self):
        """Check the sizes of the ``compute_oov`` results for dev and train.

        Reads both lyrics files, aggregates the counts of each, calls
        ``preproc.compute_oov`` in both directions, and asserts the expected
        number of entries in each result.
        """
        _, lyrics_dev = preproc.read_data(LYRICS_DEV_CSV)
        _, lyrics_train = preproc.read_data(LYRICS_TRAIN_CSV)

        counts_dev = preproc.aggregate_counts(lyrics_dev)
        counts_train = preproc.aggregate_counts(lyrics_train)

        # First argument is the collection being checked, second is the
        # reference it is compared against (argument order as in the calls).
        oov_dev = preproc.compute_oov(counts_dev, counts_train)
        oov_train = preproc.compute_oov(counts_train, counts_dev)

        eq_(len(oov_dev), 2677)
        # NOTE(review): the original trailing comment also mentioned 30442,
        # presumably an earlier expected value -- confirm which is current.
        eq_(len(oov_train), 30459)
Exemplo n.º 2
0
def setup_module():
    """Load the train/dev lyrics once and publish them as module globals.

    Populates ``y_train``/``x_train`` and ``y_dev``/``x_dev`` from the two
    CSV files, then stores the aggregated counts of each split in
    ``counts_train`` and ``counts_dev``.
    """
    global x_train, y_train
    global x_dev, y_dev
    global counts_train, counts_dev

    # read_data returns the labels first and the data second.
    train_pair = preproc.read_data('lyrics-train.csv')
    y_train, x_train = train_pair
    dev_pair = preproc.read_data('lyrics-dev.csv')
    y_dev, x_dev = dev_pair

    counts_train = preproc.aggregate_counts(x_train)
    counts_dev = preproc.aggregate_counts(x_dev)
Exemplo n.º 3
0
    def setUp(self):
        """Load train/dev data and build the pruned fixtures as module globals.

        Reads both splits with ``preproc.bag_of_words`` as the preprocessor,
        aggregates the training counts, and prunes both splits against those
        training counts with a threshold argument of 10.
        """
        global x_tr, y_tr, x_dv, y_dv
        global counts_tr, x_dv_pruned, x_tr_pruned, x_bl_pruned
        global labels, vocab

        bow = preproc.bag_of_words

        train_pair = preproc.read_data(LYRICS_TRAIN_CSV, preprocessor=bow)
        y_tr, x_tr = train_pair
        labels = set(y_tr)

        counts_tr = preproc.aggregate_counts(x_tr)

        dev_pair = preproc.read_data(LYRICS_DEV_CSV, preprocessor=bow)
        y_dv, x_dv = dev_pair

        # Both splits are pruned with the *training* counts; only the
        # vocabulary returned by the training call is kept.
        pruned_train = preproc.prune_vocabulary(counts_tr, x_tr, 10)
        x_tr_pruned, vocab = pruned_train
        pruned_dev = preproc.prune_vocabulary(counts_tr, x_dv, 10)
        x_dv_pruned, _ = pruned_dev
Exemplo n.º 4
0
def setup_module():
    """Load train/dev data and build the pruned fixtures as module globals.

    Reads both CSV files with ``preproc.bag_of_words`` as the preprocessor,
    aggregates the training counts, and prunes both splits against those
    training counts with a threshold argument of 10.
    """
    global x_tr, y_tr, x_dv, y_dv
    global counts_tr, x_dv_pruned, x_tr_pruned, x_bl_pruned
    global labels, vocab

    bow = preproc.bag_of_words

    train_pair = preproc.read_data('lyrics-train.csv', preprocessor=bow)
    y_tr, x_tr = train_pair
    labels = set(y_tr)

    counts_tr = preproc.aggregate_counts(x_tr)

    dev_pair = preproc.read_data('lyrics-dev.csv', preprocessor=bow)
    y_dv, x_dv = dev_pair

    # Both splits are pruned with the *training* counts; only the vocabulary
    # returned by the training call is kept.
    pruned_train = preproc.prune_vocabulary(counts_tr, x_tr, 10)
    x_tr_pruned, vocab = pruned_train
    pruned_dev = preproc.prune_vocabulary(counts_tr, x_dv, 10)
    x_dv_pruned, _ = pruned_dev
Exemplo n.º 5
0
    def test_d1_2_agg_steve(self):
        """Spot-check the aggregated counts computed from the dev lyrics."""
        _labels, dev_bows = preproc.read_data(LYRICS_DEV_CSV)
        totals = preproc.aggregate_counts(dev_bows)

        print(totals)

        # Expected totals for individual words, followed by the expected
        # number of distinct entries overall.
        for word, expected in (('you', 5542), ('money', 92)):
            eq_(totals[word], expected)
        eq_(len(totals), 9006)
Exemplo n.º 6
0
def setup_module():
    """Build the pruned training fixtures shared by the tests in this module.

    Publishes three module globals:
      * ``x_tr_pruned`` and ``vocab`` -- the pair returned by
        ``preproc.prune_vocabulary`` for the training data (threshold
        argument 10).
      * ``label_set`` -- the distinct training labels in sorted order.
    """
    global vocab, label_set, x_tr_pruned

    y_tr, x_tr = preproc.read_data(LYRICS_TRAIN_CSV,
                                   preprocessor=preproc.bag_of_words)

    counts_tr = preproc.aggregate_counts(x_tr)

    x_tr_pruned, vocab = preproc.prune_vocabulary(counts_tr, x_tr, 10)

    # sorted() accepts any iterable, so no intermediate list is needed.
    label_set = sorted(set(y_tr))
Exemplo n.º 7
0
    def test_d1_1_bow_steve(self):
        """Spot-check the per-document data returned by ``read_data`` for train."""
        train_labels, train_lyrics = preproc.read_data(LYRICS_TRAIN_CSV)

        # x (data) and y (label) vectors should be the same length
        eq_(len(train_lyrics), len(train_labels))

        # spot-check some counts:
        eq_(train_lyrics[4]['all'], 5)
        eq_(train_lyrics[41]['angels'], 1)
        # An expected value of 0 here presumably relies on absent words
        # reading as zero (Counter-like entries) -- confirm against read_data.
        eq_(train_lyrics[410]['angels'], 0)

        # Number of distinct entries in one particular document.
        eq_(len(train_lyrics[1144]), 124)
Exemplo n.º 8
0
def setup_module():
    """Build the shared fixtures for this module and publish them as globals.

    Publishes:
      * ``x_tr_pruned`` and ``vocab`` -- the pair returned by
        ``preproc.prune_vocabulary`` for the training data (threshold
        argument 10).
      * ``label_set`` -- the distinct training labels in sorted order.
      * ``df_train``, ``df_dev``, ``df_test`` -- the three CSV files loaded
        with ``pd.read_csv``.
      * ``count_vec`` -- a ``CountVectorizer`` constructed with the pruned
        vocabulary.
    """
    global vocab, label_set, x_tr_pruned, df_train, df_dev, df_test, count_vec

    y_tr, x_tr = preproc.read_data(LYRICS_TRAIN_CSV,
                                   preprocessor=preproc.bag_of_words)

    counts_tr = preproc.aggregate_counts(x_tr)

    x_tr_pruned, vocab = preproc.prune_vocabulary(counts_tr, x_tr, 10)

    # sorted() accepts any iterable, so no intermediate list is needed.
    label_set = sorted(set(y_tr))

    df_train = pd.read_csv(LYRICS_TRAIN_CSV)
    df_dev = pd.read_csv(LYRICS_DEV_CSV)
    df_test = pd.read_csv(LYRICS_TEST_CSV)

    count_vec = CountVectorizer(vocabulary=vocab)
Exemplo n.º 9
0
def setup_module():
    """Build the shared fixtures for this module and publish them as globals.

    Publishes:
      * ``x_tr_pruned`` and ``vocab`` -- the pair returned by
        ``preproc.prune_vocabulary`` for the training data (threshold
        argument 10).
      * ``label_set`` -- the distinct training labels in sorted order.
      * ``df_train``, ``df_dev``, ``df_test`` -- the three CSV files loaded
        with ``pd.read_csv``.
      * ``count_vec`` -- a ``CountVectorizer`` constructed with the pruned
        vocabulary.
    """
    global vocab, label_set, x_tr_pruned, df_train, df_dev, df_test, count_vec

    y_tr, x_tr = preproc.read_data('lyrics-train.csv',
                                   preprocessor=preproc.bag_of_words)

    counts_tr = preproc.aggregate_counts(x_tr)

    x_tr_pruned, vocab = preproc.prune_vocabulary(counts_tr, x_tr, 10)

    # sorted() accepts any iterable, so no intermediate list is needed.
    label_set = sorted(set(y_tr))

    df_train = pd.read_csv('lyrics-train.csv')
    df_dev = pd.read_csv('lyrics-dev.csv')
    df_test = pd.read_csv('lyrics-test-hidden.csv')

    count_vec = CountVectorizer(vocabulary=vocab)
Exemplo n.º 10
0
    def test_d1_4_prune_steve_dev(self):
        """Check how many entries pruning removes from dev document 95.

        The dev lyrics are pruned against their *own* aggregated counts with a
        threshold argument of 3, and the test compares the number of entries
        in document 95 before and after pruning.
        """
        _, dev_lyrics = preproc.read_data(LYRICS_DEV_CSV)
        dev_counts = preproc.aggregate_counts(dev_lyrics)
        dev_lyrics_pruned, _ = preproc.prune_vocabulary(dev_counts, dev_lyrics, 3)

        # On the 95th line in the dev file, we have the following words that were removed
        # Because the total count for these words in the entire document was < 3
        # heyyeah: 1, stands: 1, disco: 1, wasdancin: 1, diei: 1, diegonna: 1, electified: 1
        # yeahhey: 1, changin: 1, shaky: 1, funking: 1, diethey: 1

        # heyyeah: 1, stands: 1, disco: 2, wasdancin: 1, diei: 1, diegonna: 1, electified: 1
        # yeahhey: 2, changin: 2, shaky: 1, funking: 1, diethey: 1
        removed = len(dev_lyrics[95]) - len(dev_lyrics_pruned[95])

        # NOTE(review): this test previously asserted that the same difference
        # equals 13 and then 8, which can never both hold. Only the first
        # expectation is kept; the lists above name 12 words, so confirm
        # whether 13 (or 8) is the intended value.
        eq_(removed, 13)
Exemplo n.º 11
0
    def test_d1_4_prune_steve_train(self):
        """Check the vocabulary size after pruning the training lyrics."""
        _labels, lyrics = preproc.read_data(LYRICS_TRAIN_CSV)
        totals = preproc.aggregate_counts(lyrics)

        # Only the vocabulary half of the returned pair is inspected.
        pruned_pair = preproc.prune_vocabulary(totals, lyrics, 3)
        _pruned_lyrics, vocab = pruned_pair

        # 11824 contains instrumental, NA and corrupted lyrics
        eq_(len(vocab), 11820)