Example #1
    def test_train_model_untrained_multi(self):
        multi_text = pd.read_csv("../../data/sentences_multilingual.csv")
        multi_text_train, multi_text_test = train_test_split(multi_text,
                                                             test_size=0.2)

        # Inspect the class balance of the full set and the training split.
        print(multi_text.groupby('lang').count())
        print(multi_text_train.groupby('lang').count())

        _c2i, _i2c = vocab.build_vocab(multi_text.sentence.values)
        _l2i, _i2l = vocab.build_label_vocab(multi_text.lang.values)

        _untrained_multi_class = lang_id.LangID(input_vocab_n=len(_c2i),
                                                embedding_dims=10,
                                                hidden_dims=20,
                                                lstm_layers=1,
                                                output_class_n=5)

        acc_untrained_multi, y_hat_untrained_multi = lang_id.eval_acc(
            _untrained_multi_class, multi_text_test, _c2i, _i2c, _l2i, _i2l)
        print(f"Untrained Multi Accuracy: {acc_untrained_multi}")

        from sklearn.metrics import classification_report, confusion_matrix
        y_multi = multi_text_test.lang.values
        print(classification_report(y_multi, y_hat_untrained_multi))

        cm = confusion_matrix(y_multi, y_hat_untrained_multi)

        lang_id.pretty_conf_matrix(cm, ['deu', 'eng', 'fra', 'ita', 'spa'])

        assert_greater(acc_untrained_multi, 0.4)
        assert_less(acc_untrained_multi, 0.6)
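
The snippets call lang_id.eval_acc without showing it. Below is a minimal sketch of what such a helper could look like, assuming the vocab module is imported and the model maps a 1 x seq_len character tensor to a (1, n_classes) row of log-probabilities; the signature is copied from the call sites, i2c and l2i go unused here, and the real implementation may differ.

# Hypothetical sketch of lang_id.eval_acc (not the actual implementation).
import torch

def eval_acc(model, test_df, c2i, i2c, l2i, i2l):
    """Score each test sentence and return (accuracy, list of predicted labels)."""
    y_hat, correct = [], 0
    for sentence, gold in zip(test_df.sentence.values, test_df.lang.values):
        with torch.no_grad():
            scores = model(vocab.sentence_to_tensor(sentence, c2i))
        pred = i2l[scores.argmax(dim=-1).item()]  # index of best class -> label
        y_hat.append(pred)
        correct += int(pred == gold)
    return correct / len(y_hat), y_hat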
Example #2
def test_d1_1_char_vocab():
    global tiny_corpus
    c2i, i2c = vocab.build_vocab(tiny_corpus)

    # are BOS_SYM and EOS_SYM in there?
    assert_in(vocab.BOS_SYM, c2i)
    assert_in(vocab.EOS_SYM, c2i)

    # does it map things correctly?

    t_idx = c2i["T"]
    eq_(i2c[t_idx], "T")

    # are they the same length?
    eq_(len(c2i), len(i2c))

    # is it exhaustive?
    all_seen = set()
    for doc in tiny_corpus:
        all_seen.update(doc)
    char_count = len(all_seen)

    # account for BOS_SYM and EOS_SYM
    char_count += 2

    eq_(char_count, len(c2i))
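
This test fully pins down the contract of vocab.build_vocab: the mapping contains BOS_SYM and EOS_SYM, c2i and i2c invert each other, and the vocabulary is exhaustive over the corpus. A minimal sketch that would satisfy those assertions (the symbol spellings are placeholders; the real module defines its own):

# Hypothetical sketch of vocab.build_vocab, consistent with the assertions above.
BOS_SYM, EOS_SYM = "<BOS>", "<EOS>"  # placeholder spellings

def build_vocab(corpus):
    """Index every character seen in the corpus, plus BOS/EOS markers."""
    chars = {BOS_SYM, EOS_SYM}
    for doc in corpus:
        chars.update(doc)                      # one entry per distinct character
    c2i = {c: i for i, c in enumerate(sorted(chars))}
    i2c = {i: c for c, i in c2i.items()}       # exact inverse, so lengths match
    return c2i, i2c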
Example #3
    def setUp(self):
        global bi_text, bi_text_train, bi_text_test, bt_c2i, bt_i2c, bt_l2i, bt_i2l

        bi_text = pd.read_csv("../../data/sentences_bilingual.csv")
        bt_c2i, bt_i2c = vocab.build_vocab(bi_text.sentence.values)
        bt_l2i, bt_i2l = vocab.build_label_vocab(bi_text.lang.values)
        bi_text_train, bi_text_test = train_test_split(bi_text, test_size=0.2)
Example #4
    def test_untrained_model(self):
        # Prefix locals with _ to avoid clobbering the module-level globals
        # when the whole suite runs; scores vary a little from run to run.
        _bi_text = pd.read_csv("../../data/sentences_bilingual.csv")
        _bi_text_train, _bi_text_test = train_test_split(_bi_text,
                                                         test_size=0.2)

        _c2i, _i2c = vocab.build_vocab(_bi_text.sentence.values)
        _l2i, _i2l = vocab.build_label_vocab(_bi_text.lang.values)

        _untrained = lang_id.LangID(input_vocab_n=len(_c2i),
                                    embedding_dims=10,
                                    hidden_dims=20,
                                    lstm_layers=1,
                                    output_class_n=2)

        _acc_untrained, _y_hat_untrained = lang_id.eval_acc(
            _untrained, _bi_text_test, _c2i, _i2c, _l2i, _i2l)
        print(f"Untrained Accuracy: {_acc_untrained}")

        pred_label_list = np.asarray(_y_hat_untrained)
        pd.options.mode.chained_assignment = None
        _bi_text_test["predicted"] = pred_label_list
        _bi_text_test = _bi_text_test[["sentence", "lang", "predicted"]]
        _bi_text_test.to_csv("../../data/deliverable_2.4_untrained.csv",
                             index=False)

        assert_greater(_acc_untrained, 0.4)
        assert_less(_acc_untrained, 0.6)
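
Setting pd.options.mode.chained_assignment = None silences the SettingWithCopyWarning globally; taking an explicit copy of the split avoids the warning at the source instead:

_bi_text_test = _bi_text_test.copy()           # own the data, not a view of _bi_text
_bi_text_test["predicted"] = pred_label_list   # assigns without the warning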
Example #5
def test_d1_3_sentence_tensor():
    global tiny_corpus
    c2i, i2c = vocab.build_vocab(tiny_corpus)

    s = "This is a sentence."
    s_tens = vocab.sentence_to_tensor(s, c2i)
    eq_(s_tens.shape[0], 1)
    eq_(s_tens.shape[1], len(s))
    eq_(len(s_tens.shape), 2)

    # does BOS/EOS padding work?
    with_padding = vocab.sentence_to_tensor(s, c2i, True)
    eq_(with_padding.shape[1], len(s) + 2)
    eq_(with_padding[0, 0].item(), c2i[vocab.BOS_SYM])
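
The shape checks imply vocab.sentence_to_tensor returns a 1 x seq_len tensor of character indices, growing by two when BOS/EOS padding is requested. A plausible sketch (the padding flag's name and the symbol spellings are assumptions):

# Hypothetical sketch of vocab.sentence_to_tensor.
import torch

BOS_SYM, EOS_SYM = "<BOS>", "<EOS>"  # placeholder spellings

def sentence_to_tensor(sentence, c2i, pad=False):
    """Return a 1 x seq_len LongTensor of character indices, optionally BOS/EOS-padded."""
    chars = [BOS_SYM] + list(sentence) + [EOS_SYM] if pad else list(sentence)
    return torch.tensor([c2i[c] for c in chars], dtype=torch.long).unsqueeze(0)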
Example #6
def test_d1_2_sentence_vector():
    global tiny_corpus
    c2i, i2c = vocab.build_vocab(tiny_corpus)

    test_sentence = "Here is a sentence."

    sent_vec = vocab.sentence_to_vector(test_sentence, c2i)

    eq_(len(sent_vec), len(test_sentence))

    back_to_chars = [i2c[c] for c in sent_vec]
    eq_(test_sentence, ''.join(back_to_chars))

    # does the BOS/EOS padding work?
    with_padding = vocab.sentence_to_vector(test_sentence, c2i, True)
    eq_(len(with_padding), len(test_sentence) + 2)
    eq_(with_padding[0], c2i[vocab.BOS_SYM])
    eq_(with_padding[-1], c2i[vocab.EOS_SYM])
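
Likewise, the round-trip check here determines vocab.sentence_to_vector up to the symbol spellings: a plain list of indices, optionally wrapped in BOS/EOS. A matching sketch:

# Hypothetical sketch of vocab.sentence_to_vector.
BOS_SYM, EOS_SYM = "<BOS>", "<EOS>"  # placeholder spellings

def sentence_to_vector(sentence, c2i, pad=False):
    """Map a sentence to a list of character indices; optionally wrap in BOS/EOS."""
    indices = [c2i[c] for c in sentence]
    return [c2i[BOS_SYM]] + indices + [c2i[EOS_SYM]] if pad else indices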
Example #7
    def test_train_model_multi_embed20_hidden40(self):
        multi_text = pd.read_csv("../../data/sentences_multilingual.csv")
        multi_text_train, multi_text_test = train_test_split(multi_text,
                                                             test_size=0.2)

        # Inspect the class balance of the full set and the training split.
        print(multi_text.groupby('lang').count())
        print(multi_text_train.groupby('lang').count())

        _c2i, _i2c = vocab.build_vocab(multi_text.sentence.values)
        _l2i, _i2l = vocab.build_label_vocab(multi_text.lang.values)

        multi_class = lang_id.LangID(input_vocab_n=len(_c2i),
                                     embedding_dims=20,
                                     hidden_dims=40,
                                     lstm_layers=1,
                                     output_class_n=5)

        lang_id.train_model(model=multi_class,
                            n_epochs=1,
                            training_data=multi_text_train,
                            c2i=_c2i,
                            i2c=_i2c,
                            l2i=_l2i,
                            i2l=_i2l)
        print("done")

        acc_multi, y_hat_multi = lang_id.eval_acc(multi_class, multi_text_test,
                                                  _c2i, _i2c, _l2i, _i2l)

        # Jupyter reported Accuracy: 0.6954
        # Run 1: Accuracy: 0.6954
        print(f"Accuracy: {acc_multi}")

        from sklearn.metrics import classification_report, confusion_matrix
        y_multi = multi_text_test.lang.values
        print(classification_report(y_multi, y_hat_multi))

        cm = confusion_matrix(y_multi, y_hat_multi)

        lang_id.pretty_conf_matrix(cm, ['deu', 'eng', 'fra', 'ita', 'spa'])

        assert_greater(acc_multi, 0.60)
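
lang_id.pretty_conf_matrix isn't shown either; below is one way such a helper is commonly written with matplotlib (an assumption, not the module's actual code):

# Hypothetical sketch of lang_id.pretty_conf_matrix.
import matplotlib.pyplot as plt

def pretty_conf_matrix(cm, labels):
    """Plot a confusion-matrix heatmap with per-cell counts and class labels."""
    fig, ax = plt.subplots()
    ax.imshow(cm, cmap="Blues")
    ax.set_xticks(range(len(labels)))
    ax.set_xticklabels(labels)
    ax.set_yticks(range(len(labels)))
    ax.set_yticklabels(labels)
    ax.set_xlabel("predicted")
    ax.set_ylabel("actual")
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, cm[i, j], ha="center", va="center")
    plt.show()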
Example #8
    def test_train_model(self):

        # Prefix locals with _ to avoid clobbering the module-level globals
        # when the whole suite runs; scores vary a little from run to run.
        _bi_text = pd.read_csv("../../data/sentences_bilingual.csv")
        _bi_text_train, _bi_text_test = train_test_split(_bi_text,
                                                         test_size=0.2)
        _c2i, _i2c = vocab.build_vocab(_bi_text.sentence.values)
        _l2i, _i2l = vocab.build_label_vocab(_bi_text.lang.values)

        _li = lang_id.LangID(input_vocab_n=len(_c2i),
                             embedding_dims=10,
                             hidden_dims=20,
                             lstm_layers=1,
                             output_class_n=2)

        _trained_model = lang_id.train_model(model=_li,
                                             n_epochs=1,
                                             training_data=_bi_text_train,
                                             c2i=_c2i,
                                             i2c=_i2c,
                                             l2i=_l2i,
                                             i2l=_i2l)

        # Smoke-test the forward pass on raw sentences before scoring accuracy.
        _trained_model(vocab.sentence_to_tensor("this is a sentence", _c2i))
        _trained_model(vocab.sentence_to_tensor("quien estas", _c2i))

        _acc, _y_hat = lang_id.eval_acc(_trained_model, _bi_text_test, _c2i,
                                        _i2c, _l2i, _i2l)
        print(f"Trained Accuracy: {_acc}")

        _untrained = lang_id.LangID(input_vocab_n=len(_c2i),
                                    embedding_dims=10,
                                    hidden_dims=20,
                                    lstm_layers=1,
                                    output_class_n=2)

        _acc_untrained, _y_hat_untrained = lang_id.eval_acc(
            _untrained, _bi_text_test, _c2i, _i2c, _l2i, _i2l)
        print(f"Untrained Accuracy: {_acc_untrained}")

        assert_greater(_acc, 0.89)
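
lang_id.train_model is called with keyword arguments but never shown. A minimal sketch of a loop matching that signature, assuming one sentence per update and a model that emits log-probabilities (NLLLoss pairs with log_softmax outputs; the optimizer choice is a guess):

# Hypothetical sketch of lang_id.train_model (signature from the call above).
import torch
import torch.nn as nn
import torch.optim as optim

def train_model(model, n_epochs, training_data, c2i, i2c, l2i, i2l):
    """Train on one sentence at a time; return the trained model."""
    loss_fn = nn.NLLLoss()                 # expects log-probabilities from the model
    opt = optim.Adam(model.parameters())
    for _ in range(n_epochs):
        for sentence, label in zip(training_data.sentence.values,
                                   training_data.lang.values):
            opt.zero_grad()
            scores = model(vocab.sentence_to_tensor(sentence, c2i))
            target = torch.tensor([l2i[label]], dtype=torch.long)
            loss_fn(scores, target).backward()
            opt.step()
    return model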
Example #9
    def test_train_model_embed2_hidden2(self):

        # Prefix locals with _ to avoid clobbering the module-level globals
        # when the whole suite runs; scores vary a little from run to run.
        _bi_text = pd.read_csv("../../data/sentences_bilingual.csv")
        _bi_text_train, _bi_text_test = train_test_split(_bi_text,
                                                         test_size=0.2)
        _c2i, _i2c = vocab.build_vocab(_bi_text.sentence.values)
        _l2i, _i2l = vocab.build_label_vocab(_bi_text.lang.values)

        _li = lang_id.LangID(input_vocab_n=len(_c2i),
                             embedding_dims=2,
                             hidden_dims=2,
                             lstm_layers=1,
                             output_class_n=2)

        _trained_model = lang_id.train_model(model=_li,
                                             n_epochs=1,
                                             training_data=_bi_text_train,
                                             c2i=_c2i,
                                             i2c=_i2c,
                                             l2i=_l2i,
                                             i2l=_i2l)

        # Smoke-test the forward pass on raw sentences before scoring accuracy.
        _trained_model(vocab.sentence_to_tensor("this is a sentence", _c2i))
        _trained_model(vocab.sentence_to_tensor("quien estas", _c2i))

        _acc, _y_hat = lang_id.eval_acc(_trained_model, _bi_text_test, _c2i,
                                        _i2c, _l2i, _i2l)
        print(f"Trained Accuracy: {_acc}")

        pred_label_list = np.asarray(_y_hat)
        pd.options.mode.chained_assignment = None
        _bi_text_test["predicted"] = pred_label_list
        _bi_text_test = _bi_text_test[["sentence", "lang", "predicted"]]
        _bi_text_test.to_csv("../../data/deliverable_2.4.csv", index=False)

        assert_greater(_acc, 0.89)
Example #10
def setup_module():
    global corpus, c2i, i2c
    with open('data/towns_clean.txt') as f:
        corpus = [l.strip() for l in f]
    c2i, i2c = vocab.build_vocab(corpus)
Example #11
    def setUp(self):
        global corpus, c2i, i2c
        with open('../../data/towns_clean.txt') as f:
            corpus = [l.strip() for l in f]
        c2i, i2c = vocab.build_vocab(corpus)
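
Finally, the constructor arguments used throughout (input_vocab_n, embedding_dims, hidden_dims, lstm_layers, output_class_n) suggest an architecture along these lines: character embeddings fed to an LSTM, with a linear classifier over the final hidden state. A sketch consistent with those parameters, not the actual lang_id.LangID:

# Hypothetical sketch of lang_id.LangID.
import torch.nn as nn
import torch.nn.functional as F

class LangID(nn.Module):
    """Char embedding -> LSTM -> linear head over the last hidden state."""
    def __init__(self, input_vocab_n, embedding_dims, hidden_dims,
                 lstm_layers, output_class_n):
        super().__init__()
        self.embed = nn.Embedding(input_vocab_n, embedding_dims)
        self.lstm = nn.LSTM(embedding_dims, hidden_dims,
                            num_layers=lstm_layers, batch_first=True)
        self.out = nn.Linear(hidden_dims, output_class_n)

    def forward(self, x):                      # x: (1, seq_len) of char indices
        emb = self.embed(x)                    # (1, seq_len, embedding_dims)
        _, (h_n, _) = self.lstm(emb)           # h_n: (lstm_layers, 1, hidden_dims)
        return F.log_softmax(self.out(h_n[-1]), dim=-1)  # (1, output_class_n)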