def test_d1_3_oov_steve(self):
    _, lyrics_dev = preproc.read_data(LYRICS_DEV_CSV)
    _, lyrics_train = preproc.read_data(LYRICS_TRAIN_CSV)
    counts_dev = preproc.aggregate_counts(lyrics_dev)
    counts_train = preproc.aggregate_counts(lyrics_train)
    # OOV in each direction: words seen in one split but never in the other
    oov_dev = preproc.compute_oov(counts_dev, counts_train)
    oov_train = preproc.compute_oov(counts_train, counts_dev)
    eq_(len(oov_dev), 2677)
    eq_(len(oov_train), 30459)  # 30442
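# Illustrative sketch only, not the graded preproc implementation: the assertions above
# treat compute_oov(bow1, bow2) as returning the set of words that occur in the first
# aggregate count but never in the second. _reference_compute_oov is a hypothetical name.
def _reference_compute_oov(bow1, bow2):
    """Sketch: out-of-vocabulary words of bow1 with respect to bow2."""
    return {word for word in bow1 if bow2.get(word, 0) == 0}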
def setup_module():
    global x_train, y_train, x_dev, y_dev, counts_dev, counts_train
    y_train, x_train = preproc.read_data('lyrics-train.csv')
    y_dev, x_dev = preproc.read_data('lyrics-dev.csv')
    counts_train = preproc.aggregate_counts(x_train)
    counts_dev = preproc.aggregate_counts(x_dev)
def setUp(self):
    global x_tr, y_tr, x_dv, y_dv, counts_tr, x_dv_pruned, x_tr_pruned, x_bl_pruned
    global labels
    global vocab
    y_tr, x_tr = preproc.read_data(LYRICS_TRAIN_CSV, preprocessor=preproc.bag_of_words)
    labels = set(y_tr)
    counts_tr = preproc.aggregate_counts(x_tr)
    y_dv, x_dv = preproc.read_data(LYRICS_DEV_CSV, preprocessor=preproc.bag_of_words)
    x_tr_pruned, vocab = preproc.prune_vocabulary(counts_tr, x_tr, 10)
    x_dv_pruned, _ = preproc.prune_vocabulary(counts_tr, x_dv, 10)
def setup_module():
    global x_tr, y_tr, x_dv, y_dv, counts_tr, x_dv_pruned, x_tr_pruned, x_bl_pruned
    global labels
    global vocab
    y_tr, x_tr = preproc.read_data('lyrics-train.csv', preprocessor=preproc.bag_of_words)
    labels = set(y_tr)
    counts_tr = preproc.aggregate_counts(x_tr)
    y_dv, x_dv = preproc.read_data('lyrics-dev.csv', preprocessor=preproc.bag_of_words)
    x_tr_pruned, vocab = preproc.prune_vocabulary(counts_tr, x_tr, 10)
    x_dv_pruned, _ = preproc.prune_vocabulary(counts_tr, x_dv, 10)
def test_d1_2_agg_steve(self):
    _, bow_list = preproc.read_data(LYRICS_DEV_CSV)
    counts = preproc.aggregate_counts(bow_list)
    # spot-check aggregate counts and the dev-set vocabulary size
    eq_(counts['you'], 5542)
    eq_(counts['money'], 92)
    eq_(len(counts), 9006)
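# Illustrative sketch only (an assumption about behaviour, not preproc's actual code):
# aggregate_counts is exercised above as summing the per-song bags of words into one
# corpus-level Counter, so counts['you'] is the total count of "you" across the dev set.
def _reference_aggregate_counts(bags_of_words):
    """Sketch: merge an iterable of per-document Counters into one corpus-wide Counter."""
    total = collections.Counter()
    for bow in bags_of_words:
        total.update(bow)
    return total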
def setup_module():
    global vocab, label_set, x_tr_pruned
    y_tr, x_tr = preproc.read_data(LYRICS_TRAIN_CSV, preprocessor=preproc.bag_of_words)
    labels = set(y_tr)
    counts_tr = preproc.aggregate_counts(x_tr)
    x_tr_pruned, vocab = preproc.prune_vocabulary(counts_tr, x_tr, 10)
    label_set = sorted(labels)
def test_d1_1_bow_steve(self):
    train_labels, train_lyrics = preproc.read_data(LYRICS_TRAIN_CSV)
    # x (data) and y (label) vectors should be the same length
    eq_(len(train_lyrics), len(train_labels))
    # spot-check some counts:
    eq_(train_lyrics[4]['all'], 5)
    eq_(train_lyrics[41]['angels'], 1)
    eq_(train_lyrics[410]['angels'], 0)
    # the song at index 1144 should contain exactly 124 distinct word types
    eq_(len(train_lyrics[1144]), 124)
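# Illustrative sketch only (an assumption): bag_of_words, the preprocessor passed to
# read_data elsewhere in these tests, is exercised as mapping a lyric string to a Counter
# of its tokens, so indexing a missing word (e.g. 'angels' at row 410) yields 0. The real
# tokenization rules may differ.
def _reference_bag_of_words(text):
    """Sketch: count whitespace-separated tokens in one lyric string."""
    return collections.Counter(text.split())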
def setup_module():
    global vocab, label_set, x_tr_pruned, df_train, df_dev, df_test, count_vec
    y_tr, x_tr = preproc.read_data(LYRICS_TRAIN_CSV, preprocessor=preproc.bag_of_words)
    labels = set(y_tr)
    counts_tr = preproc.aggregate_counts(x_tr)
    x_tr_pruned, vocab = preproc.prune_vocabulary(counts_tr, x_tr, 10)
    label_set = sorted(labels)
    df_train = pd.read_csv(LYRICS_TRAIN_CSV)
    df_dev = pd.read_csv(LYRICS_DEV_CSV)
    df_test = pd.read_csv(LYRICS_TEST_CSV)
    count_vec = CountVectorizer(vocabulary=vocab)
def setup_module():
    global vocab, label_set, x_tr_pruned, df_train, df_dev, df_test, count_vec
    y_tr, x_tr = preproc.read_data('lyrics-train.csv', preprocessor=preproc.bag_of_words)
    labels = set(y_tr)
    counts_tr = preproc.aggregate_counts(x_tr)
    x_tr_pruned, vocab = preproc.prune_vocabulary(counts_tr, x_tr, 10)
    label_set = sorted(labels)
    df_train = pd.read_csv('lyrics-train.csv')
    df_dev = pd.read_csv('lyrics-dev.csv')
    df_test = pd.read_csv('lyrics-test-hidden.csv')
    count_vec = CountVectorizer(vocabulary=vocab)
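# Usage sketch for count_vec (assumption: the raw lyrics live in a dataframe column named
# 'Lyrics'; the real column name may differ). With a fixed vocabulary, CountVectorizer
# produces a sparse document-term matrix whose columns line up with the pruned vocab.
def _vectorize_lyrics(df, vectorizer, text_column='Lyrics'):
    """Sketch: map a lyrics dataframe to a sparse matrix with one row per song."""
    return vectorizer.fit_transform(df[text_column].astype(str))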
def test_d1_4_prune_steve_dev(self):
    _, dev_lyrics = preproc.read_data(LYRICS_DEV_CSV)
    dev_counts = preproc.aggregate_counts(dev_lyrics)
    dev_lyrics_pruned, vocab = preproc.prune_vocabulary(dev_counts, dev_lyrics, 3)
    vocab_sorted = collections.OrderedDict(sorted(vocab.items(), key=lambda t: t[0]))
    # In the song at index 95 of the dev file, the following words were removed
    # because their total count across the whole dev set was < 3:
    # heyyeah: 1, stands: 1, disco: 2, wasdancin: 1, diei: 1, diegonna: 1, electified: 1,
    # yeahhey: 2, changin: 2, shaky: 1, funking: 1, diethey: 1
    eq_(len(dev_lyrics[95].keys()) - len(dev_lyrics_pruned[95].keys()), 13)
    # How did we get 13? The list above has 12 words; 8 was another candidate value.
def test_d1_4_prune_steve_train(self):
    _, train_lyrics = preproc.read_data(LYRICS_TRAIN_CSV)
    counts_train = preproc.aggregate_counts(train_lyrics)
    train_lyrics_pruned, vocab = preproc.prune_vocabulary(counts_train, train_lyrics, 3)
    # 11824 if tokens from instrumental, NA, and corrupted lyrics are also counted
    eq_(len(vocab), 11820)
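# Illustrative sketch only (an assumption, with a hypothetical name): prune_vocabulary is
# exercised above as dropping every word whose count in training_counts falls below the
# threshold, returning both the pruned bags of words and the retained vocabulary. The real
# return type of the vocabulary may differ (here a dict of word -> corpus count).
def _reference_prune_vocabulary(training_counts, target_data, min_counts):
    """Sketch: remove rare words from each bag of words and report the kept vocabulary."""
    kept = {word: count for word, count in training_counts.items() if count >= min_counts}
    pruned = [collections.Counter({w: c for w, c in bow.items() if w in kept})
              for bow in target_data]
    return pruned, kept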