Example #1
def setup_module():
    global x_train, y_train, x_dev, y_dev, counts_dev, counts_train
    y_train, x_train = preproc.read_data('lyrics-train.csv')
    y_dev, x_dev = preproc.read_data('lyrics-dev.csv')

    counts_train = preproc.aggregate_counts(x_train)
    counts_dev = preproc.aggregate_counts(x_dev)
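
Every example here exercises `preproc.aggregate_counts`, whose implementation is not shown. From how it is used (a list of per-document bags of words in, one corpus-level count out, supporting lookups like `counts['you']` and `len(counts)` in Example #4 below), a minimal sketch might look like this; only the function name comes from the examples, the rest is an assumption:

from collections import Counter

def aggregate_counts(bags_of_words):
    """Sum per-document word counts into one corpus-level Counter (sketch)."""
    counts = Counter()
    for bag in bags_of_words:
        counts.update(bag)  # add this document's counts to the running total
    return counts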
Example #2
    def test_d1_3_oov_steve(self):
        _, lyrics_dev   = preproc.read_data(LYRICS_DEV_CSV)
        _, lyrics_train = preproc.read_data(LYRICS_TRAIN_CSV)

        counts_dev   = preproc.aggregate_counts(lyrics_dev)
        counts_train = preproc.aggregate_counts(lyrics_train)

        oov_dev   = preproc.compute_oov(counts_dev,   counts_train)
        oov_train = preproc.compute_oov(counts_train, counts_dev)

        #oov_dev.remove(None)
        #oov_train.remove(None)

        # Sets make it easy to inspect how the two OOV collections differ
        set_oov_dev = set(oov_dev)
        set_oov_train = set(oov_train)

        oov_diff_dev_wo_train = set_oov_dev - set_oov_train
        oov_diff_train_wo_dev = set_oov_train - set_oov_dev
        # print(oov_diff_dev_wo_train, oov_diff_train_wo_dev)

        eq_(len(oov_dev), 2677)
        eq_(len(oov_train), 30459)  # previously 30442
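
`compute_oov` is not defined in any of these snippets. Judging from the calls above (two aggregated count dicts in, a sized collection of out-of-vocabulary words out, with the dev-vs-train result much smaller than the train-vs-dev one), a plausible sketch, assuming "OOV" means a word present in the first count dict but absent from the second:

def compute_oov(target_counts, background_counts):
    """Words that occur in target_counts but not in background_counts
    (a sketch; the real assignment code may differ)."""
    return set(target_counts) - set(background_counts)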
Example #3
    def test_d1_3_oov_nonascii(self):
        # 1: you were laughing laughing ̢cause youre doin it to me laughing
        # 2: you were laughing laughing ̢cause youre doin it to me laughing one two

        nonascii_txt1 = 'you were laughing laughing ̢cause youre doin it to me laughing'
        nonascii_txt2 = 'you were laughing laughing ̢cause youre doin it to me laughing one two'
        nonascii_txt3 = 'you were laughing laughing ̢cause can̢t youre doin it to me laughing one two three'
        nonascii_txt4 = 'you were laughing laughing ̢cause youre doin it to me laughing one two three four'

        nonascii_bag1 = preproc.bag_of_words(nonascii_txt1)
        nonascii_bag2 = preproc.bag_of_words(nonascii_txt2)
        nonascii_bag3 = preproc.bag_of_words(nonascii_txt3)
        nonascii_bag4 = preproc.bag_of_words(nonascii_txt4)

        nonascii_list1 = [nonascii_bag1, nonascii_bag2]
        nonascii_list2 = [nonascii_bag3, nonascii_bag4]

        counts1 = preproc.aggregate_counts(nonascii_list1)
        counts2 = preproc.aggregate_counts(nonascii_list2)

        oov1 = preproc.compute_oov(counts1, counts2)
        oov2 = preproc.compute_oov(counts2, counts1)

        print(oov1)
        print(oov2)
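
`bag_of_words` turns one lyric string into the per-document count dict that `aggregate_counts` sums. A minimal sketch, assuming plain whitespace tokenization (the real preprocessor may lowercase or handle punctuation differently):

from collections import Counter

def bag_of_words(text):
    """Count whitespace-separated tokens in a single lyric (sketch)."""
    return Counter(text.split())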
Example #4
    def test_d1_2_agg_steve(self):
        _, bow_list = preproc.read_data(LYRICS_DEV_CSV)
        counts = preproc.aggregate_counts(bow_list)

        print(counts)

        eq_(counts['you'], 5542)
        eq_(counts['money'], 92)
        eq_(len(counts), 9006)
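
`read_data` is also left undefined; it evidently returns one label and one preprocessed bag per CSV row (Examples #8 and #9 load the same files with pd.read_csv). A sketch under explicit assumptions: the column names 'Era' and 'Lyrics' are guesses, not taken from these examples:

import csv

def read_data(csv_file, preprocessor=bag_of_words):
    """Read labels and preprocessed bags of words from a CSV file (sketch)."""
    labels, bags = [], []
    with open(csv_file, encoding='utf-8') as f:
        for row in csv.DictReader(f):
            labels.append(row['Era'])                 # assumed label column
            bags.append(preprocessor(row['Lyrics']))  # assumed text column
    return labels, bags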
Example #5
def setup_module():
    global vocab, label_set, x_tr_pruned

    y_tr,x_tr = preproc.read_data(LYRICS_TRAIN_CSV, preprocessor=preproc.bag_of_words)
    labels = set(y_tr)

    counts_tr = preproc.aggregate_counts(x_tr)

    x_tr_pruned, vocab = preproc.prune_vocabulary(counts_tr, x_tr, 10)

    label_set = sorted(labels)
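
`prune_vocabulary(counts, docs, threshold)` appears throughout these examples; Example #10 below shows that it returns pruned copies of the documents plus a vocabulary that supports .items(), and Examples #8 and #9 pass that vocabulary straight to CountVectorizer. A minimal sketch consistent with both usages, where the word-to-index layout is an inference, not taken from the assignment:

def prune_vocabulary(training_counts, target_data, min_counts):
    """Drop words whose total training count is below min_counts (sketch).

    Returns pruned copies of the target documents and the retained
    vocabulary as a dict of word -> column index, the form that
    CountVectorizer(vocabulary=...) also accepts.
    """
    kept = sorted(word for word, count in training_counts.items()
                  if count >= min_counts)
    vocab = {word: idx for idx, word in enumerate(kept)}
    pruned = [{word: count for word, count in doc.items() if word in vocab}
              for doc in target_data]
    return pruned, vocab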
Example #6
    def setUp(self):
        global x_tr, y_tr, x_dv, y_dv, counts_tr, x_dv_pruned, x_tr_pruned, x_bl_pruned
        global labels
        global vocab

        y_tr,x_tr = preproc.read_data(LYRICS_TRAIN_CSV, preprocessor=preproc.bag_of_words)
        labels = set(y_tr)

        counts_tr = preproc.aggregate_counts(x_tr)

        y_dv,x_dv = preproc.read_data(LYRICS_DEV_CSV, preprocessor=preproc.bag_of_words)

        x_tr_pruned, vocab = preproc.prune_vocabulary(counts_tr, x_tr, 10)
        x_dv_pruned, _ = preproc.prune_vocabulary(counts_tr, x_dv, 10)
Example #7
def setup_module():
    global x_tr, y_tr, x_dv, y_dv, counts_tr, x_dv_pruned, x_tr_pruned, x_bl_pruned
    global labels
    global vocab

    y_tr, x_tr = preproc.read_data('lyrics-train.csv',
                                   preprocessor=preproc.bag_of_words)
    labels = set(y_tr)

    counts_tr = preproc.aggregate_counts(x_tr)

    y_dv, x_dv = preproc.read_data('lyrics-dev.csv',
                                   preprocessor=preproc.bag_of_words)

    x_tr_pruned, vocab = preproc.prune_vocabulary(counts_tr, x_tr, 10)
    x_dv_pruned, _ = preproc.prune_vocabulary(counts_tr, x_dv, 10)
Example #8
def setup_module():
    global vocab, label_set, x_tr_pruned, df_train, df_dev, df_test, count_vec

    y_tr, x_tr = preproc.read_data(LYRICS_TRAIN_CSV,
                                   preprocessor=preproc.bag_of_words)
    labels = set(y_tr)

    counts_tr = preproc.aggregate_counts(x_tr)

    x_tr_pruned, vocab = preproc.prune_vocabulary(counts_tr, x_tr, 10)

    label_set = sorted(labels)

    df_train = pd.read_csv(LYRICS_TRAIN_CSV)
    df_dev = pd.read_csv(LYRICS_DEV_CSV)
    df_test = pd.read_csv(LYRICS_TEST_CSV)

    count_vec = CountVectorizer(vocabulary=vocab)
Example #9
def setup_module():
    global vocab, label_set, x_tr_pruned, df_train, df_dev, df_test, count_vec

    y_tr, x_tr = preproc.read_data('lyrics-train.csv',
                                   preprocessor=preproc.bag_of_words)
    labels = set(y_tr)

    counts_tr = preproc.aggregate_counts(x_tr)

    x_tr_pruned, vocab = preproc.prune_vocabulary(counts_tr, x_tr, 10)

    label_set = sorted(labels)

    df_train = pd.read_csv('lyrics-train.csv')
    df_dev = pd.read_csv('lyrics-dev.csv')
    df_test = pd.read_csv('lyrics-test-hidden.csv')

    count_vec = CountVectorizer(vocabulary=vocab)
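
For reference, scikit-learn's CountVectorizer skips fitting a vocabulary when a fixed one is supplied: every document is mapped onto exactly those columns, in vocabulary order. A small self-contained usage sketch with a made-up toy vocabulary:

from sklearn.feature_extraction.text import CountVectorizer

toy_vocab = ['you', 'money', 'laughing']  # toy vocabulary, not the real one
count_vec = CountVectorizer(vocabulary=toy_vocab)
X = count_vec.fit_transform(['you were laughing', 'money money'])
print(X.toarray())  # rows are documents, columns follow the vocabulary order
# [[1 0 1]
#  [0 2 0]]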
Example #10
    def test_d1_4_prune_steve_dev(self):
        _, dev_lyrics = preproc.read_data(LYRICS_DEV_CSV)
        dev_counts = preproc.aggregate_counts(dev_lyrics)
        dev_lyrics_pruned, vocab = preproc.prune_vocabulary(dev_counts, dev_lyrics, 3)

        vocab_sorted = collections.OrderedDict(sorted(vocab.items(), key=lambda t: t[0]))

        # On line 95 of the dev file, the following words were removed because
        # their total count across the entire corpus was below the threshold of 3:
        # heyyeah: 1, stands: 1, disco: 2, wasdancin: 1, diei: 1, diegonna: 1,
        # electified: 1, yeahhey: 2, changin: 2, shaky: 1, funking: 1, diethey: 1
        # (the list above accounts for 12 of the 13 pruned word types)
        eq_(len(dev_lyrics[95].keys()) - len(dev_lyrics_pruned[95].keys()), 13)
Example #11
def estimate_nb(x, y, smoothing):
    """
    Estimate a naive Bayes model.

    :param x: list of dictionaries of base feature counts
    :param y: list of labels
    :param smoothing: smoothing constant
    :returns: weights, as a defaultdict where the keys are (label, word) tuples and values are smoothed log-probs of P(word|label)
    :rtype: defaultdict

    """
    labels = set(y)

    # These names mirror the formulas in Figure 4.14 of Jurafsky & Martin.
    # Yes, they violate Python naming conventions, but understanding
    # naive Bayes is more important.
    D = x       # all documents
    C = labels  # the set of classes
    V = preproc.aggregate_counts(D)  # vocabulary counts; identical for every
                                     # class, so compute it once, not per class

    weights = defaultdict(float)
    for c in C:
        # Smoothed log P(word | label=c). Pass the full label list y (not the
        # label set C) so estimate_pxy can pick out the documents with label c.
        p_xy = estimate_pxy(D, y, c, smoothing, V)
        weights.update(clf_base.make_feature_vector(p_xy, c))

    # A (c, OFFSET) weight would normally carry the class prior log P(c);
    # it is left out here because the docstring only promises P(word|label).
    return weights
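
`estimate_pxy` does the actual probability estimation in the function above but is not defined in these snippets. The following is a minimal sketch of the standard add-alpha estimate of log P(word|label), with the signature inferred from the call site; all names are assumptions:

import numpy as np
from collections import defaultdict

def estimate_pxy(x, y, label, smoothing, vocab):
    """Smoothed log P(word | label) for every word in the vocabulary (sketch).

    x: list of per-document word-count dicts, aligned with the labels in y;
    vocab: corpus-level counts, of which only the keys are used here.
    """
    # Total count of each word over the documents carrying this label
    label_counts = defaultdict(float)
    for doc, doc_label in zip(x, y):
        if doc_label == label:
            for word, count in doc.items():
                label_counts[word] += count

    total = sum(label_counts.values())
    denom = total + smoothing * len(vocab)
    # log[(count(w, label) + alpha) / (total + alpha * |V|)]
    return {word: np.log((label_counts[word] + smoothing) / denom)
            for word in vocab}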
Example #12
    def test_d1_4_prune_steve_train(self):
        _, train_lyrics = preproc.read_data(LYRICS_TRAIN_CSV)
        counts_train = preproc.aggregate_counts(train_lyrics)
        train_lyrics_pruned, vocab = preproc.prune_vocabulary(counts_train, train_lyrics, 3)

        eq_(len(vocab), 11820)  # 11824 when "instrumental", NA, and corrupted lyrics are included