def setup_module():
    """Module-level nose fixture: load the rock-lyrics train/dev splits once.

    Populates module globals with (labels, bag-of-words instances) for the
    training and dev sets.  The remaining globals are declared here so later
    tests may bind them.  Assumes `preproc` is imported at module scope.
    """
    global x_tr, y_tr, x_dv, y_dv, counts_tr, counts_dv, counts_bl, x_dv_pruned, x_tr_pruned
    global vocab
    # Each read_data call returns (labels, bag-of-words feature dicts).
    y_tr, x_tr = preproc.read_data('data/rock-lyrics-train.csv', preprocessor=preproc.bag_of_words)
    y_dv, x_dv = preproc.read_data('data/rock-lyrics-dev.csv', preprocessor=preproc.bag_of_words)
def setup_module():
    """Module-level nose fixture: load data, count words, and prune vocab.

    Reads the train and dev rock-lyrics splits as bag-of-words, collects the
    label set and aggregate training counts, then prunes BOTH splits against
    the training counts (threshold 10) so they share one vocabulary.
    Assumes `preproc` is imported at module scope.
    """
    global x_tr, y_tr, x_dv, y_dv, counts_tr, x_dv_pruned, x_tr_pruned, x_bl_pruned
    global labels
    global vocab

    # Training split: instances, label inventory, and per-word counts.
    y_tr, x_tr = preproc.read_data('data/rock-lyrics-train.csv', preprocessor=preproc.bag_of_words)
    labels = set(y_tr)
    counts_tr = preproc.aggregate_counts(x_tr)

    # Dev split, then prune both splits with the *training* counts so the
    # dev vocabulary never leaks information from the dev set.
    y_dv, x_dv = preproc.read_data('data/rock-lyrics-dev.csv', preprocessor=preproc.bag_of_words)
    x_tr_pruned, vocab = preproc.prune_vocabulary(counts_tr, x_tr, 10)
    x_dv_pruned, _ = preproc.prune_vocabulary(counts_tr, x_dv, 10)
def setup_module():
    """Module-level nose fixture: load the corpus train/dev files.

    Tokenizes both files on whitespace via `preproc.space_tokenizer` and
    stores the results in module globals; `vocab` is declared for later use.
    Assumes `preproc` is imported at module scope.
    """
    global x_tr, x_dev
    global vocab
    x_tr = preproc.read_data('data/corpus.csv', preprocessor=preproc.space_tokenizer)
    x_dev = preproc.read_data('data/corpus_dev.csv', preprocessor=preproc.space_tokenizer)
! nosetests tests/test_environment.py df_train = pd.read_csv('data/corpus.csv') df_train.head() ! cat data/corpus.csv | wc -l assert(len(df_train)==7) # ---------------------------------- # 1.1 from snlp import preproc reload(preproc); x_train = preproc.read_data('data/corpus.csv',preprocessor=preproc.space_tokenizer) ! nosetests tests/test_preproc.py:test_space_tok # ---------------------------------- # 1.2 reload(preproc); ! nosetests tests/test_preproc.py:test_create_vocab print(preproc.create_vocab(x_train)) # ---------------------------------- # 2.1