Exemplo n.º 1
0
    def test_train_model_untrained_multi(self):
        """Check the accuracy of an untrained five-class LangID model.

        Loads the multilingual sentence CSV, holds out 20% of the rows,
        builds the character and label vocabularies from the full data set,
        and evaluates a freshly constructed (never trained) model on the
        held-out rows. A classification report and a confusion matrix are
        printed for inspection, and the accuracy is asserted to be greater
        than 0.4 and less than 0.6.
        """
        from sklearn.metrics import classification_report, confusion_matrix

        multi_text = pd.read_csv("../../data/sentences_multilingual.csv")
        # Only the held-out part is used: this test never trains the model.
        # NOTE(review): no random_state is passed, so the split may differ
        # between runs unless a global seed is fixed elsewhere -- confirm.
        _, multi_text_test = train_test_split(multi_text, test_size=0.2)

        # Vocabularies come from the full data set (train and test rows).
        _c2i, _i2c = vocab.build_vocab(multi_text.sentence.values)
        _l2i, _i2l = vocab.build_label_vocab(multi_text.lang.values)

        _untrained_multi_class = lang_id.LangID(input_vocab_n=len(_c2i),
                                                embedding_dims=10,
                                                hidden_dims=20,
                                                lstm_layers=1,
                                                output_class_n=5)

        acc_untrained_multi, y_hat_untrained_multi = lang_id.eval_acc(
            _untrained_multi_class, multi_text_test, _c2i, _i2c, _l2i, _i2l)
        print(f"Untrained Multi Accuracy: {acc_untrained_multi}")

        y_multi = multi_text_test.lang.values
        print(classification_report(y_multi, y_hat_untrained_multi))

        cm = confusion_matrix(y_multi, y_hat_untrained_multi)
        lang_id.pretty_conf_matrix(cm, ['deu', 'eng', 'fra', 'ita', 'spa'])

        # NOTE(review): with five output classes, balanced data would put
        # chance accuracy near 0.2 -- confirm these bounds are intended.
        assert_greater(acc_untrained_multi, 0.4)
        assert_less(acc_untrained_multi, 0.6)
Exemplo n.º 2
0
    def test_train_model_multi_embed20_hidden40(self):
        """Train a five-class LangID model for one epoch and check accuracy.

        Loads the multilingual sentence CSV, splits it 80/20, builds the
        character and label vocabularies from the full data set, and trains
        a model with 20 embedding dimensions and 40 hidden dimensions for a
        single epoch on the training rows. The model is then evaluated on
        the held-out rows; a classification report and a confusion matrix
        are printed, and the accuracy is asserted to be greater than 0.60.
        """
        from sklearn.metrics import classification_report, confusion_matrix

        multi_text = pd.read_csv("../../data/sentences_multilingual.csv")
        # NOTE(review): no random_state is passed, so the split may differ
        # between runs unless a global seed is fixed elsewhere -- confirm.
        multi_text_train, multi_text_test = train_test_split(multi_text,
                                                             test_size=0.2)

        # Vocabularies come from the full data set (train and test rows).
        _c2i, _i2c = vocab.build_vocab(multi_text.sentence.values)
        _l2i, _i2l = vocab.build_label_vocab(multi_text.lang.values)

        multi_class = lang_id.LangID(input_vocab_n=len(_c2i),
                                     embedding_dims=20,
                                     hidden_dims=40,
                                     lstm_layers=1,
                                     output_class_n=5)

        lang_id.train_model(model=multi_class,
                            n_epochs=1,
                            training_data=multi_text_train,
                            c2i=_c2i,
                            i2c=_i2c,
                            l2i=_l2i,
                            i2l=_i2l)
        print("done")

        acc_multi, y_hat_multi = lang_id.eval_acc(multi_class, multi_text_test,
                                                  _c2i, _i2c, _l2i, _i2l)

        # Jupyter reported Accuracy: 0.6954
        # Run 1: Accuracy: 0.6954
        print(f"Accuracy: {acc_multi}")

        y_multi = multi_text_test.lang.values
        print(classification_report(y_multi, y_hat_multi))

        cm = confusion_matrix(y_multi, y_hat_multi)
        lang_id.pretty_conf_matrix(cm, ['deu', 'eng', 'fra', 'ita', 'spa'])

        assert_greater(acc_multi, 0.60)