def setup_module():
    global x_train, y_train, x_dev, y_dev, counts_dev, counts_train
    y_train, x_train = preproc.read_data('lyrics-train.csv')
    y_dev, x_dev = preproc.read_data('lyrics-dev.csv')
    counts_train = preproc.aggregate_counts(x_train)
    counts_dev = preproc.aggregate_counts(x_dev)
def test_d1_3_oov_steve(self):
    _, lyrics_dev = preproc.read_data(LYRICS_DEV_CSV)
    _, lyrics_train = preproc.read_data(LYRICS_TRAIN_CSV)
    counts_dev = preproc.aggregate_counts(lyrics_dev)
    counts_train = preproc.aggregate_counts(lyrics_train)
    oov_dev = preproc.compute_oov(counts_dev, counts_train)
    oov_train = preproc.compute_oov(counts_train, counts_dev)
    # Diagnostic only: words that are OOV in one direction but not the other
    oov_diff_dev_wo_train = set(oov_dev) - set(oov_train)
    oov_diff_train_wo_dev = set(oov_train) - set(oov_dev)
    eq_(len(oov_dev), 2677)
    eq_(len(oov_train), 30459)  # 30442
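
# For reference, a minimal sketch of the contract the assertions above assume
# for compute_oov: the words present in the first count dictionary but absent
# from the second. The real preproc.compute_oov may differ in return type; the
# test only relies on len() and conversion to a set.
def compute_oov_sketch(target_counts, background_counts):
    """Words counted in target_counts that never occur in background_counts."""
    return {word for word in target_counts if word not in background_counts}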
def test_d1_3_oov_nonascii(self):
    # Lyrics deliberately containing a non-ASCII character (the mis-encoded
    # apostrophe in "̢cause" / "can̢t") to exercise the OOV computation.
    nonascii_txt1 = 'you were laughing laughing ̢cause youre doin it to me laughing'
    nonascii_txt2 = 'you were laughing laughing ̢cause youre doin it to me laughing one two'
    nonascii_txt3 = 'you were laughing laughing ̢cause can̢t youre doin it to me laughing one two three'
    nonascii_txt4 = 'you were laughing laughing ̢cause youre doin it to me laughing one two three four'
    nonascii_bag1 = preproc.bag_of_words(nonascii_txt1)
    nonascii_bag2 = preproc.bag_of_words(nonascii_txt2)
    nonascii_bag3 = preproc.bag_of_words(nonascii_txt3)
    nonascii_bag4 = preproc.bag_of_words(nonascii_txt4)
    counts1 = preproc.aggregate_counts([nonascii_bag1, nonascii_bag2])
    counts2 = preproc.aggregate_counts([nonascii_bag3, nonascii_bag4])
    oov1 = preproc.compute_oov(counts1, counts2)
    oov2 = preproc.compute_oov(counts2, counts1)
    # Exploratory: inspect the OOV sets rather than asserting on them
    print(oov1)
    print(oov2)
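
# A minimal sketch of the bag-of-words behavior this test exercises: split on
# whitespace and count tokens, so the non-ASCII "̢cause" survives as its own
# token. The real preproc.bag_of_words may normalize case or punctuation
# differently.
from collections import Counter

def bag_of_words_sketch(text):
    """Map a lyric string to a Counter over its whitespace-separated tokens."""
    return Counter(text.split())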
def test_d1_2_agg_steve(self):
    _, bow_list = preproc.read_data(LYRICS_DEV_CSV)
    counts = preproc.aggregate_counts(bow_list)
    print(counts)
    eq_(counts['you'], 5542)
    eq_(counts['money'], 92)
    eq_(len(counts), 9006)
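
# A minimal sketch of the aggregation this test checks: sum the per-document
# bags of words into one corpus-level Counter. The expected values (e.g.
# counts['you'] == 5542) are specific to lyrics-dev.csv.
from collections import Counter

def aggregate_counts_sketch(bags_of_words):
    """Sum a list of per-document Counters into a single corpus Counter."""
    total = Counter()
    for bag in bags_of_words:
        total.update(bag)
    return total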
def setup_module():
    global vocab, label_set, x_tr_pruned
    y_tr, x_tr = preproc.read_data(LYRICS_TRAIN_CSV, preprocessor=preproc.bag_of_words)
    counts_tr = preproc.aggregate_counts(x_tr)
    x_tr_pruned, vocab = preproc.prune_vocabulary(counts_tr, x_tr, 10)
    label_set = sorted(set(y_tr))
def setUp(self):
    global x_tr, y_tr, x_dv, y_dv, counts_tr, x_dv_pruned, x_tr_pruned, x_bl_pruned
    global labels
    global vocab
    y_tr, x_tr = preproc.read_data(LYRICS_TRAIN_CSV, preprocessor=preproc.bag_of_words)
    labels = set(y_tr)
    counts_tr = preproc.aggregate_counts(x_tr)
    y_dv, x_dv = preproc.read_data(LYRICS_DEV_CSV, preprocessor=preproc.bag_of_words)
    x_tr_pruned, vocab = preproc.prune_vocabulary(counts_tr, x_tr, 10)
    x_dv_pruned, _ = preproc.prune_vocabulary(counts_tr, x_dv, 10)
def setup_module():
    global x_tr, y_tr, x_dv, y_dv, counts_tr, x_dv_pruned, x_tr_pruned, x_bl_pruned
    global labels
    global vocab
    y_tr, x_tr = preproc.read_data('lyrics-train.csv', preprocessor=preproc.bag_of_words)
    labels = set(y_tr)
    counts_tr = preproc.aggregate_counts(x_tr)
    y_dv, x_dv = preproc.read_data('lyrics-dev.csv', preprocessor=preproc.bag_of_words)
    x_tr_pruned, vocab = preproc.prune_vocabulary(counts_tr, x_tr, 10)
    x_dv_pruned, _ = preproc.prune_vocabulary(counts_tr, x_dv, 10)
def setup_module():
    global vocab, label_set, x_tr_pruned, df_train, df_dev, df_test, count_vec
    y_tr, x_tr = preproc.read_data(LYRICS_TRAIN_CSV, preprocessor=preproc.bag_of_words)
    counts_tr = preproc.aggregate_counts(x_tr)
    x_tr_pruned, vocab = preproc.prune_vocabulary(counts_tr, x_tr, 10)
    label_set = sorted(set(y_tr))

    df_train = pd.read_csv(LYRICS_TRAIN_CSV)
    df_dev = pd.read_csv(LYRICS_DEV_CSV)
    df_test = pd.read_csv(LYRICS_TEST_CSV)
    count_vec = CountVectorizer(vocabulary=vocab)
def setup_module():
    global vocab, label_set, x_tr_pruned, df_train, df_dev, df_test, count_vec
    y_tr, x_tr = preproc.read_data('lyrics-train.csv', preprocessor=preproc.bag_of_words)
    counts_tr = preproc.aggregate_counts(x_tr)
    x_tr_pruned, vocab = preproc.prune_vocabulary(counts_tr, x_tr, 10)
    label_set = sorted(set(y_tr))

    df_train = pd.read_csv('lyrics-train.csv')
    df_dev = pd.read_csv('lyrics-dev.csv')
    df_test = pd.read_csv('lyrics-test-hidden.csv')
    count_vec = CountVectorizer(vocabulary=vocab)
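
# Because the vocabulary is fixed up front, CountVectorizer needs no fit step:
# count_vec.transform(...) maps raw documents straight onto the pruned vocab,
# returning a scipy sparse matrix of shape (num_docs, len(vocab)). A typical
# later use, assuming the CSVs expose the song text in a 'Lyrics' column
# (the column name here is an assumption):
#     X_tr = count_vec.transform(df_train['Lyrics'])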
def test_d1_4_prune_steve_dev(self):
    _, dev_lyrics = preproc.read_data(LYRICS_DEV_CSV)
    dev_counts = preproc.aggregate_counts(dev_lyrics)
    dev_lyrics_pruned, vocab = preproc.prune_vocabulary(dev_counts, dev_lyrics, 3)
    # Diagnostic: the pruned vocabulary in alphabetical order
    vocab_sorted = collections.OrderedDict(sorted(vocab.items(), key=lambda t: t[0]))
    # In document 95 of the dev file, these words are removed because their
    # corpus-wide counts fall below the threshold of 3:
    # heyyeah: 1, stands: 1, disco: 2, wasdancin: 1, diei: 1, diegonna: 1,
    # electified: 1, yeahhey: 2, changin: 2, shaky: 1, funking: 1, diethey: 1
    # eq_(len(dev_lyrics[95].keys()) - len(dev_lyrics_pruned[95].keys()), 13)  # How did we get 13?
    eq_(len(dev_lyrics[95].keys()) - len(dev_lyrics_pruned[95].keys()), 8)
def estimate_nb(x, y, smoothing):
    """
    Estimate a naive bayes model

    :param x: list of dictionaries of base feature counts
    :param y: list of labels
    :param smoothing: smoothing constant
    :returns: weights, as a default dict where the keys are (label, word)
        tuples and values are smoothed log-probs of P(word|label)
    :rtype: defaultdict
    """
    # Naming follows Algorithm 4.14 in Jurafsky & Martin: documents D,
    # classes C, vocabulary V.
    labels = set(y)
    # The vocabulary is shared across classes, so aggregate it once, not per class
    vocab = preproc.aggregate_counts(x)

    weights = defaultdict(float)
    for label in labels:
        # Smoothed log-likelihoods log P(word|label); note that estimate_pxy
        # takes the full label list y, not the set of classes
        p_xy = estimate_pxy(x, y, label, smoothing, vocab)
        for word, log_prob in p_xy.items():
            weights[(label, word)] = log_prob
        # The OFFSET feature always fires with value 1, so its weight is the
        # log prior log P(label); scoring a document then adds the log prior
        # to the sum of per-word log-likelihoods
        weights[(label, OFFSET)] = np.log(y.count(label) / len(y))
    return weights
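
# estimate_nb relies on estimate_pxy; a minimal sketch of the usual contract,
# with add-alpha smoothing over the whole vocabulary. Assumes vocab is
# iterable over the word types; the real estimate_pxy may differ.
import numpy as np
from collections import defaultdict

def estimate_pxy_sketch(x, y, label, smoothing, vocab):
    """Smoothed log-probabilities log P(word|label) for every word in vocab."""
    label_counts = defaultdict(float)
    for bag, doc_label in zip(x, y):
        if doc_label == label:
            for word, count in bag.items():
                label_counts[word] += count
    denom = sum(label_counts.values()) + smoothing * len(vocab)
    return defaultdict(float,
                       {word: np.log((label_counts[word] + smoothing) / denom)
                        for word in vocab})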
def test_d1_4_prune_steve_train(self):
    _, train_lyrics = preproc.read_data(LYRICS_TRAIN_CSV)
    counts_train = preproc.aggregate_counts(train_lyrics)
    train_lyrics_pruned, vocab = preproc.prune_vocabulary(counts_train, train_lyrics, 3)
    eq_(len(vocab), 11820)  # 11824 if instrumental, NA, and corrupted lyrics are included
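
# A minimal sketch of the pruning contract the two prune tests assume: keep
# only words whose corpus-level count meets the threshold, and filter each
# document's bag down to that vocabulary. vocab is kept as a word->count dict
# because test_d1_4_prune_steve_dev calls vocab.items(); the real
# preproc.prune_vocabulary may represent it differently.
def prune_vocabulary_sketch(training_counts, target_data, min_counts):
    """Drop words with corpus count below min_counts from every document."""
    vocab = {word: count for word, count in training_counts.items()
             if count >= min_counts}
    pruned = [{word: count for word, count in bag.items() if word in vocab}
              for bag in target_data]
    return pruned, vocab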