# Imports assumed by these tests; vocab and lang_id are project-local modules.
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from nose.tools import eq_, assert_in, assert_greater, assert_less

import vocab
import lang_id


def test_train_model_untrained_multi(self):
    multi_text = pd.read_csv("../../data/sentences_multilingual.csv")
    multi_text_train, multi_text_test = train_test_split(multi_text, test_size=0.2)
    _c2i, _i2c = vocab.build_vocab(multi_text.sentence.values)
    _l2i, _i2l = vocab.build_label_vocab(multi_text.lang.values)
    _untrained_multi_class = lang_id.LangID(input_vocab_n=len(_c2i), embedding_dims=10,
                                            hidden_dims=20, lstm_layers=1, output_class_n=5)
    acc_untrained_multi, y_hat_untrained_multi = lang_id.eval_acc(
        _untrained_multi_class, multi_text_test, _c2i, _i2c, _l2i, _i2l)
    print(f"Untrained Multi Accuracy: {acc_untrained_multi}")

    y_multi = multi_text_test.lang.values
    print(classification_report(y_multi, y_hat_untrained_multi))
    cm = confusion_matrix(y_multi, y_hat_untrained_multi)
    lang_id.pretty_conf_matrix(cm, ['deu', 'eng', 'fra', 'ita', 'spa'])

    assert_greater(acc_untrained_multi, 0.4)
    assert_less(acc_untrained_multi, 0.6)
def test_d1_1_char_vocab():
    global tiny_corpus
    c2i, i2c = vocab.build_vocab(tiny_corpus)

    # Are BOS_SYM and EOS_SYM in there?
    assert_in(vocab.BOS_SYM, c2i)
    assert_in(vocab.EOS_SYM, c2i)

    # Does it map things correctly?
    t_idx = c2i["T"]
    eq_(i2c[t_idx], "T")

    # Are the two mappings the same length?
    eq_(len(c2i), len(i2c))

    # Is it exhaustive? Count every distinct character in the corpus.
    all_seen = defaultdict(int)
    for doc in tiny_corpus:
        for c in list(doc):
            all_seen[c] = 1
    char_count = len(all_seen)

    # Account for BOS_SYM and EOS_SYM.
    char_count += 2
    eq_(char_count, len(c2i))
def setUp(self):
    global bi_text, bi_text_train, bi_text_test, bt_c2i, bt_i2c, bt_l2i, bt_i2l
    bi_text = pd.read_csv("../../data/sentences_bilingual.csv")
    bt_c2i, bt_i2c = vocab.build_vocab(bi_text.sentence.values)
    bt_l2i, bt_i2l = vocab.build_label_vocab(bi_text.lang.values)
    bi_text_train, bi_text_test = train_test_split(bi_text, test_size=0.2)
def test_untrained_model(self):
    # Local variables get a _ prefix: scores vary from run to run, and we don't
    # want to pollute the module globals when running the whole suite.
    _bi_text = pd.read_csv("../../data/sentences_bilingual.csv")
    _bi_text_train, _bi_text_test = train_test_split(_bi_text, test_size=0.2)
    _c2i, _i2c = vocab.build_vocab(_bi_text.sentence.values)
    _l2i, _i2l = vocab.build_label_vocab(_bi_text.lang.values)
    _untrained = lang_id.LangID(input_vocab_n=len(_c2i), embedding_dims=10,
                                hidden_dims=20, lstm_layers=1, output_class_n=2)
    _acc_untrained, _y_hat_untrained = lang_id.eval_acc(
        _untrained, _bi_text_test, _c2i, _i2c, _l2i, _i2l)
    print(f"Untrained Accuracy: {_acc_untrained}")

    # Save the untrained predictions alongside the gold labels.
    pred_label_list = np.asarray(_y_hat_untrained)
    pd.options.mode.chained_assignment = None  # silence SettingWithCopyWarning
    _bi_text_test["predicted"] = pred_label_list
    _bi_text_test = _bi_text_test[["sentence", "lang", "predicted"]]
    _bi_text_test.to_csv("../../data/deliverable_2.4_untrained.csv", index=False)

    # An untrained binary classifier should sit near chance (~0.5).
    assert_greater(_acc_untrained, 0.4)
    assert_less(_acc_untrained, 0.6)
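# The run-to-run variation noted in the tests above comes from random weight
# initialization and the random train/test split. A minimal sketch of how one
# might pin the seeds, assuming the model is PyTorch-based; set_seed is a
# hypothetical helper, not part of lang_id or vocab.
import random

import torch


def set_seed(seed=42):
    """Seed Python, NumPy, and PyTorch RNGs for more repeatable test scores."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)


# Usage: call set_seed() at the top of a test, and pass random_state to the
# split, e.g. train_test_split(_bi_text, test_size=0.2, random_state=42).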
def test_d1_3_sentence_tensor():
    global tiny_corpus
    c2i, i2c = vocab.build_vocab(tiny_corpus)
    s = "This is a sentence."
    s_tens = vocab.sentence_to_tensor(s, c2i)
    eq_(s_tens.shape[0], 1)
    eq_(s_tens.shape[1], len(s))
    eq_(len(s_tens.shape), 2)

    # Does BOS/EOS padding work?
    with_padding = vocab.sentence_to_tensor(s, c2i, True)
    eq_(with_padding.shape[1], len(s) + 2)
    eq_(with_padding[0, 0].item(), c2i[vocab.BOS_SYM])
def test_d1_2_sentence_vector():
    global tiny_corpus
    c2i, i2c = vocab.build_vocab(tiny_corpus)
    test_sentence = "Here is a sentence."
    sent_vec = vocab.sentence_to_vector(test_sentence, c2i)
    eq_(len(sent_vec), len(test_sentence))

    # Round-trip: mapping indices back to characters should reproduce the sentence.
    back_to_chars = [i2c[c] for c in sent_vec]
    eq_(test_sentence, ''.join(back_to_chars))

    # Does the BOS/EOS padding work?
    with_padding = vocab.sentence_to_vector(test_sentence, c2i, True)
    eq_(len(with_padding), len(test_sentence) + 2)
    eq_(with_padding[0], c2i[vocab.BOS_SYM])
    eq_(with_padding[-1], c2i[vocab.EOS_SYM])
def test_train_model_multi_embed20_hidden40(self):
    multi_text = pd.read_csv("../../data/sentences_multilingual.csv")
    multi_text_train, multi_text_test = train_test_split(multi_text, test_size=0.2)
    _c2i, _i2c = vocab.build_vocab(multi_text.sentence.values)
    _l2i, _i2l = vocab.build_label_vocab(multi_text.lang.values)
    multi_class = lang_id.LangID(input_vocab_n=len(_c2i), embedding_dims=20,
                                 hidden_dims=40, lstm_layers=1, output_class_n=5)
    lang_id.train_model(model=multi_class, n_epochs=1, training_data=multi_text_train,
                        c2i=_c2i, i2c=_i2c, l2i=_l2i, i2l=_i2l)
    print("done")
    acc_multi, y_hat_multi = lang_id.eval_acc(multi_class, multi_text_test,
                                              _c2i, _i2c, _l2i, _i2l)
    # Jupyter reported Accuracy: 0.6954; Run 1: Accuracy: 0.6954
    print(f"Accuracy: {acc_multi}")

    y_multi = multi_text_test.lang.values
    print(classification_report(y_multi, y_hat_multi))
    cm = confusion_matrix(y_multi, y_hat_multi)
    lang_id.pretty_conf_matrix(cm, ['deu', 'eng', 'fra', 'ita', 'spa'])

    assert_greater(acc_multi, 0.60)
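# Beyond pretty_conf_matrix, per-class recall falls straight out of the
# confusion matrix: diagonal counts over row sums (sklearn puts gold labels on
# rows). A small sketch; per_class_recall is a hypothetical helper, and the
# label order just mirrors the list passed above.
def per_class_recall(cm, labels):
    """Return {label: recall} from a sklearn confusion matrix (rows = gold)."""
    recalls = cm.diagonal() / cm.sum(axis=1)
    return dict(zip(labels, recalls))


# Usage: per_class_recall(cm, ['deu', 'eng', 'fra', 'ita', 'spa'])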
def test_train_model(self):
    # Local variables get a _ prefix: scores vary from run to run, and we don't
    # want to pollute the module globals when running the whole suite.
    _bi_text = pd.read_csv("../../data/sentences_bilingual.csv")
    _bi_text_train, _bi_text_test = train_test_split(_bi_text, test_size=0.2)
    _c2i, _i2c = vocab.build_vocab(_bi_text.sentence.values)
    _l2i, _i2l = vocab.build_label_vocab(_bi_text.lang.values)
    _li = lang_id.LangID(input_vocab_n=len(_c2i), embedding_dims=10, hidden_dims=20,
                         lstm_layers=1, output_class_n=2)
    _trained_model = lang_id.train_model(model=_li, n_epochs=1, training_data=_bi_text_train,
                                         c2i=_c2i, i2c=_i2c, l2i=_l2i, i2l=_i2l)

    # Smoke-test the forward pass on one English and one Spanish sentence.
    _trained_model(vocab.sentence_to_tensor("this is a sentence", _c2i))
    _trained_model(vocab.sentence_to_tensor("quien estas", _c2i))

    _acc, _y_hat = lang_id.eval_acc(_trained_model, _bi_text_test, _c2i, _i2c, _l2i, _i2l)
    print(f"Trained Accuracy: {_acc}")

    # Compare against an untrained model with the same architecture.
    _untrained = lang_id.LangID(input_vocab_n=len(_c2i), embedding_dims=10, hidden_dims=20,
                                lstm_layers=1, output_class_n=2)
    _acc_untrained, _y_hat_untrained = lang_id.eval_acc(
        _untrained, _bi_text_test, _c2i, _i2c, _l2i, _i2l)
    print(f"Untrained Accuracy: {_acc_untrained}")

    assert_greater(_acc, 0.89)
def test_train_model_embed2_hidden2(self):
    # Local variables get a _ prefix: scores vary from run to run, and we don't
    # want to pollute the module globals when running the whole suite.
    _bi_text = pd.read_csv("../../data/sentences_bilingual.csv")
    _bi_text_train, _bi_text_test = train_test_split(_bi_text, test_size=0.2)
    _c2i, _i2c = vocab.build_vocab(_bi_text.sentence.values)
    _l2i, _i2l = vocab.build_label_vocab(_bi_text.lang.values)
    _li = lang_id.LangID(input_vocab_n=len(_c2i), embedding_dims=2, hidden_dims=2,
                         lstm_layers=1, output_class_n=2)
    _trained_model = lang_id.train_model(model=_li, n_epochs=1, training_data=_bi_text_train,
                                         c2i=_c2i, i2c=_i2c, l2i=_l2i, i2l=_i2l)

    # Smoke-test the forward pass on one English and one Spanish sentence.
    _trained_model(vocab.sentence_to_tensor("this is a sentence", _c2i))
    _trained_model(vocab.sentence_to_tensor("quien estas", _c2i))

    _acc, _y_hat = lang_id.eval_acc(_trained_model, _bi_text_test, _c2i, _i2c, _l2i, _i2l)
    print(f"Trained Accuracy: {_acc}")

    # Save predictions alongside the gold labels for the deliverable.
    pred_label_list = np.asarray(_y_hat)
    pd.options.mode.chained_assignment = None  # silence SettingWithCopyWarning
    _bi_text_test["predicted"] = pred_label_list
    _bi_text_test = _bi_text_test[["sentence", "lang", "predicted"]]
    _bi_text_test.to_csv("../../data/deliverable_2.4.csv", index=False)

    assert_greater(_acc, 0.89)
def setup_module():
    global corpus, c2i, i2c
    corpus = [l.strip() for l in open('data/towns_clean.txt')]
    c2i, i2c = vocab.build_vocab(corpus)
def setUp(self):
    global corpus, c2i, i2c
    corpus = [l.strip() for l in open('../../data/towns_clean.txt')]
    c2i, i2c = vocab.build_vocab(corpus)
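# The two setup functions above load towns_clean.txt through different relative
# paths ('data/...' vs '../../data/...'), so each only works from a particular
# working directory. A minimal sketch of a more robust alternative that
# resolves paths against this test file instead of the CWD; data_path is a
# hypothetical helper, not part of the project.
import os


def data_path(filename):
    """Resolve a data file relative to this test module, not the CWD."""
    here = os.path.dirname(os.path.abspath(__file__))
    return os.path.join(here, "..", "..", "data", filename)


# Usage: corpus = [l.strip() for l in open(data_path('towns_clean.txt'))]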