Example #1
from snlp import preproc  # assumed import path, mirroring Example #4

def setup_module():
    # shared test fixtures: labeled train/dev splits of the lyrics data,
    # read as bag-of-words representations
    global x_tr, y_tr, x_dv, y_dv, counts_tr, counts_dv, counts_bl, x_dv_pruned, x_tr_pruned
    global vocab
    y_tr, x_tr = preproc.read_data('data/rock-lyrics-train.csv',
                                   preprocessor=preproc.bag_of_words)
    y_dv, x_dv = preproc.read_data('data/rock-lyrics-dev.csv',
                                   preprocessor=preproc.bag_of_words)
Example #2

from snlp import preproc  # assumed import path, mirroring Example #4

def setup_module():

    global x_tr, y_tr, x_dv, y_dv, counts_tr, x_dv_pruned, x_tr_pruned, x_bl_pruned
    global labels
    global vocab

    # labeled train/dev splits of the lyrics data, as bag-of-words representations
    y_tr, x_tr = preproc.read_data('data/rock-lyrics-train.csv',
                                   preprocessor=preproc.bag_of_words)
    labels = set(y_tr)

    # corpus-level word counts on the training set
    counts_tr = preproc.aggregate_counts(x_tr)

    y_dv, x_dv = preproc.read_data('data/rock-lyrics-dev.csv',
                                   preprocessor=preproc.bag_of_words)

    # restrict both splits to words seen at least 10 times in training
    x_tr_pruned, vocab = preproc.prune_vocabulary(counts_tr, x_tr, 10)
    x_dv_pruned, _ = preproc.prune_vocabulary(counts_tr, x_dv, 10)
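
A rough sketch of what the two vocabulary helpers used above might do; the real preproc implementations may differ. aggregate_counts is assumed to sum the per-document Counters, and prune_vocabulary to keep only words whose training-set count meets the threshold (10 here), returning the pruned documents together with the retained vocabulary.

from collections import Counter

def aggregate_counts(bags_of_words):
    # total corpus-level counts across all bag-of-words documents
    total = Counter()
    for bow in bags_of_words:
        total.update(bow)
    return total

def prune_vocabulary(training_counts, target_data, min_counts):
    # keep words seen at least min_counts times in training,
    # then filter every target document down to that vocabulary
    vocab = {word for word, count in training_counts.items() if count >= min_counts}
    pruned = [Counter({w: c for w, c in bow.items() if w in vocab})
              for bow in target_data]
    return pruned, vocab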
Example #3
from snlp import preproc  # assumed import path, mirroring Example #4

def setup_module():
    # shared test fixtures: whitespace-tokenized train and dev corpora (no labels)
    global x_tr, x_dev
    global vocab
    x_tr = preproc.read_data('data/corpus.csv', preprocessor=preproc.space_tokenizer)
    x_dev = preproc.read_data('data/corpus_dev.csv', preprocessor=preproc.space_tokenizer)
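
Unlike Examples #1 and #2, read_data is called here with a single return value, so this corpus is presumably unlabeled. A minimal sketch under that assumption follows; space_tokenizer is assumed to split each line on whitespace, and read_unlabeled is a hypothetical name, not part of the real module.

def space_tokenizer(text):
    # split a raw line into whitespace-separated tokens
    return text.split()

def read_unlabeled(filename, preprocessor=space_tokenizer):
    # hypothetical reader: one tokenized document per line, no labels
    with open(filename, encoding='utf-8') as f:
        return [preprocessor(line.strip()) for line in f]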
Example #4
! nosetests tests/test_environment.py

import pandas as pd  # needed for read_csv below

df_train = pd.read_csv('data/corpus.csv')
df_train.head()

! cat data/corpus.csv | wc -l

assert len(df_train) == 7

# ----------------------------------
# 1.1

from importlib import reload  # in Python 3, reload lives in importlib
from snlp import preproc

reload(preproc);
x_train = preproc.read_data('data/corpus.csv', preprocessor=preproc.space_tokenizer)

! nosetests tests/test_preproc.py:test_space_tok


# ----------------------------------
# 1.2

reload(preproc);
! nosetests tests/test_preproc.py:test_create_vocab

print(preproc.create_vocab(x_train))
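
# A guess at what create_vocab computes, based only on the call above: the set of
# distinct tokens across the tokenized training documents. The real snlp.preproc
# implementation may return a different structure (e.g. a sorted list).
def create_vocab(tokenized_docs):
    return {token for doc in tokenized_docs for token in doc}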

# ----------------------------------
# 2.1