예제 #1
0
    def test(self):
        """Train an SVC text classifier, reload it from disk and predict once.

        The model is written to a temporary folder that is removed again even
        when training, loading or prediction raises, so a failing run does not
        leave model files behind.
        """
        # To run on the full dataset, load NLPData.AIVIVN2019_SA instead of
        # the sample corpus.
        corpus: CategorizedCorpus = DataFetcher.load_corpus(
            NLPData.AIVIVN2019_SA_SAMPLE)
        params = {
            "vectorizer": CountVectorizer(ngram_range=(1, 2),
                                          max_features=4000),
            "svc": SVC(kernel='linear', C=0.3)
        }
        classifier = TextClassifier(estimator=TEXT_CLASSIFIER_ESTIMATOR.SVC,
                                    **params)
        model_trainer = ModelTrainer(classifier, corpus)

        def negative_f1_score(y_true, y_pred):
            # average=None gives one F1 value per class.  The two-name unpack
            # fails loudly unless exactly two classes are present; the second
            # one is reported (presumably the "negative" label - TODO confirm
            # the label order).
            score_class_0, score_class_1 = f1_score(y_true,
                                                    y_pred,
                                                    average=None)
            return score_class_1

        tmp_model_folder = mkdtemp()
        try:
            score = model_trainer.train(tmp_model_folder,
                                        scoring=negative_f1_score)
            print(score)

            # Round-trip through disk to check that the saved model is usable.
            classifier = TextClassifier.load(tmp_model_folder)
            sentence = Sentence('tuyệt vời')
            classifier.predict(sentence)
        finally:
            # mkdtemp() never deletes the directory itself.
            shutil.rmtree(tmp_model_folder)
        print(sentence)
예제 #2
0
def my_run(estimator__C, features__lower_pipe__tfidf__ngram_range,
           features__with_tone_char__ngram_range,
           features__remove_tone__tfidf__ngram_range):
    """Train one pipeline configuration on VLSP2016_SA and return its dev score.

    The argument names follow scikit-learn's ``<step>__<param>`` convention so
    they can be forwarded verbatim to ``pipeline.set_params``.

    Args:
        estimator__C: value for ``C`` of the ``estimator`` (SVC) step.
        features__lower_pipe__tfidf__ngram_range: ``ngram_range`` of the
            lower-cased TF-IDF branch.
        features__with_tone_char__ngram_range: ``ngram_range`` of the
            character-level TF-IDF branch.
        features__remove_tone__tfidf__ngram_range: ``ngram_range`` of the
            tone-stripped TF-IDF branch.

    Returns:
        ``score['dev_score']`` as reported by ``ModelTrainer.train``.
    """
    # Must run before any other local is bound: at this point locals() holds
    # exactly the four arguments.
    params = locals().copy()
    # Imported here (after the snapshot) so the name does not leak into params.
    import shutil
    start = time.time()
    print(params)
    corpus: CategorizedCorpus = DataFetcher.load_corpus(NLPData.VLSP2016_SA)

    # The ngram_range / C values below are only placeholders; set_params()
    # overwrites them with the arguments of this call.
    lower_pipe = Pipeline([
        ('lower', Lowercase()),
        ('tfidf', TfidfVectorizer(ngram_range=(1, 4), norm='l2', min_df=2)),
    ])
    with_tone_char = TfidfVectorizer(ngram_range=(1, 6),
                                     norm='l2',
                                     min_df=2,
                                     analyzer='char')
    remove_tone = Pipeline([
        ('remove_tone', RemoveTone()),
        ('lower', Lowercase()),
        ('tfidf', TfidfVectorizer(ngram_range=(1, 4), norm='l2', min_df=2)),
    ])
    feature_union = FeatureUnion([
        ('lower_pipe', lower_pipe),
        ('with_tone_char', with_tone_char),
        ('remove_tone', remove_tone),
        ('emoticons', CountEmoticons()),
    ])
    pipeline = Pipeline(steps=[
        ('features', feature_union),
        ('estimator',
         SVC(kernel='linear', C=0.2175, class_weight=None, verbose=True)),
    ])
    pipeline.set_params(**params)
    classifier = TextClassifier(estimator=TEXT_CLASSIFIER_ESTIMATOR.PIPELINE,
                                pipeline=pipeline)
    model_trainer = ModelTrainer(classifier, corpus)

    def negative_f1_score(y_true, y_pred):
        # average=None gives one F1 value per class; the unpack requires
        # exactly three classes and the middle one is reported (presumably
        # the "negative" label - TODO confirm the label order).
        score_class_0, score_class_1, score_class_2 = f1_score(y_true,
                                                               y_pred,
                                                               average=None)
        return score_class_1

    tmp_model_folder = mkdtemp()
    try:
        score = model_trainer.train(tmp_model_folder,
                                    scoring=negative_f1_score)
    finally:
        # Only the scores are used; drop the saved model so repeated runs do
        # not pile up temporary directories.  Best effort: a cleanup failure
        # must not hide the training result or error.
        shutil.rmtree(tmp_model_folder, ignore_errors=True)
    ex.log_scalar('dev_score', score['dev_score'])
    ex.log_scalar('test_score', score['test_score'])
    print(time.time() - start)
    return score['dev_score']
예제 #3
0
def run(estimator, features):
    """Train the fixed TF-IDF + linear SVC pipeline on AIVIVN2019_SA.

    Args:
        estimator: not read by this function; kept so existing callers keep
            working.
        features: not read by this function; kept so existing callers keep
            working.

    Returns:
        ``score['test_score']`` as reported by ``ModelTrainer.train``.
    """
    import shutil

    corpus: CategorizedCorpus = DataFetcher.load_corpus(NLPData.AIVIVN2019_SA)

    lower_tfidf = Pipeline([
        ('lower', Lowercase()),
        ('tfidf', TfidfVectorizer(ngram_range=(1, 4), norm='l2', min_df=2)),
    ])
    with_tone_char = TfidfVectorizer(ngram_range=(1, 6),
                                     norm='l2',
                                     min_df=2,
                                     analyzer='char')
    remove_tone = Pipeline([
        ('remove_tone', RemoveTone()),
        ('lower', Lowercase()),
        ('tfidf', TfidfVectorizer(ngram_range=(1, 4), norm='l2', min_df=2)),
    ])
    feature_union = FeatureUnion([
        ('lower_tfidf', lower_tfidf),
        ('with_tone_char', with_tone_char),
        ('remove_tone', remove_tone),
    ])
    pipeline = Pipeline(steps=[
        ('features', feature_union),
        ('estimator',
         SVC(kernel='linear', C=0.2175, class_weight=None, verbose=True)),
    ])
    classifier = TextClassifier(estimator=TEXT_CLASSIFIER_ESTIMATOR.PIPELINE,
                                pipeline=pipeline)
    model_trainer = ModelTrainer(classifier, corpus)

    def negative_f1_score(y_true, y_pred):
        # average=None gives one F1 value per class; the unpack requires
        # exactly two classes and the second one is reported (presumably the
        # "negative" label - TODO confirm the label order).
        score_class_0, score_class_1 = f1_score(y_true, y_pred, average=None)
        return score_class_1

    tmp_model_folder = mkdtemp()
    try:
        score = model_trainer.train(tmp_model_folder,
                                    scoring=negative_f1_score)
    finally:
        # Only the scores are used; remove the saved model so runs do not
        # leave temporary directories behind.  Best effort on purpose.
        shutil.rmtree(tmp_model_folder, ignore_errors=True)
    ex.log_scalar('dev_score', score['dev_score'])
    ex.log_scalar('test_score', score['test_score'])
    return score['test_score']
def my_run(estimator__C, features__lower_pipe__tfidf__max_features,
           features__lower_pipe__tfidf__ngram_range,
           features__with_tone_char__ngram_range,
           features__remove_tone__tfidf__ngram_range):
    """Train one pipeline configuration on UTS2017_BANK_TC; return the dev score.

    Every argument is named ``<step>__<param>`` and is forwarded unchanged to
    ``pipeline.set_params``.  The model is written to ``tmp/tmp_model`` and
    the files produced there are deleted afterwards, except entries whose
    name contains ``gitignore``.

    Returns:
        ``score['dev_score']`` as reported by ``ModelTrainer.train``.
    """
    # Snapshot the arguments before any other local name exists.
    params = locals().copy()
    start = time.time()
    print(params)
    corpus: CategorizedCorpus = DataFetcher.load_corpus(
        NLPData.UTS2017_BANK_TC)

    # Feature branches, built in the order they appear in the union.
    lower_pipe = Pipeline([
        ('lower', Lowercase()),
        ('tfidf', TfidfVectorizer(norm='l2', min_df=2)),
    ])
    with_tone_char = TfidfVectorizer(norm='l2', min_df=2, analyzer='char')
    remove_tone = Pipeline([
        ('remove_tone', RemoveTone()),
        ('lower', Lowercase()),
        ('tfidf', TfidfVectorizer(norm='l2', min_df=2)),
    ])
    feature_union = FeatureUnion([
        ('lower_pipe', lower_pipe),
        ('with_tone_char', with_tone_char),
        ('remove_tone', remove_tone),
        ('emoticons', CountEmoticons()),
    ])
    pipeline = Pipeline(steps=[
        ('features', feature_union),
        ('estimator', SVC(kernel='linear', class_weight=None, verbose=True)),
    ])
    pipeline.set_params(**params)

    classifier = TextClassifier(
        estimator=TEXT_CLASSIFIER_ESTIMATOR.PIPELINE,
        pipeline=pipeline,
    )
    model_trainer = ModelTrainer(classifier, corpus)
    tmp_model_folder = "tmp/tmp_model"

    def micro_f1_score(y_true, y_pred):
        return f1_score(y_true, y_pred, average='micro')

    score = model_trainer.train(tmp_model_folder, scoring=micro_f1_score)

    # Empty the scratch folder but keep its gitignore placeholder.
    for entry in listdir(tmp_model_folder):
        if "gitignore" not in entry:
            os.remove(f"{tmp_model_folder}/{entry}")

    for key in ('dev_score', 'test_score'):
        ex.log_scalar(key, score[key])
    elapsed = round(time.time() - start, 2)
    print(f"Time: {elapsed} s")
    return score['dev_score']
예제 #5
0
    def test_fasttext(self):
        """Train a fastText text classifier, reload it from disk and predict once.

        The model is written to a temporary folder that is removed again even
        when training, loading or prediction raises.
        """
        corpus: CategorizedCorpus = DataFetcher.load_corpus(
            NLPData.AIVIVN2019_SA_SAMPLE)
        params = {"lr": 0.01, "epoch": 20, "wordNgrams": 3, "dim": 20}
        classifier = TextClassifier(
            estimator=TEXT_CLASSIFIER_ESTIMATOR.FAST_TEXT, **params)
        model_trainer = ModelTrainer(classifier, corpus)

        def macro_f1_score(y_true, y_pred):
            return f1_score(y_true, y_pred, average='macro')

        tmp_model_folder = mkdtemp()
        try:
            score = model_trainer.train(tmp_model_folder,
                                        scoring=macro_f1_score)
            print(score)

            # Round-trip through disk to check that the saved model is usable.
            classifier = TextClassifier.load(tmp_model_folder)
            sentence = Sentence('tuyệt vời')
            classifier.predict(sentence)
        finally:
            # mkdtemp() never deletes the directory itself.
            shutil.rmtree(tmp_model_folder)
        print(sentence)
예제 #6
0
def my_run(features__max_df, features__ngram_range):
    """Train a TF-IDF + LinearSVC pipeline on VNTC and return its dev score.

    Args:
        features__max_df: forwarded to the ``features`` (TfidfVectorizer)
            step as ``max_df``.
        features__ngram_range: forwarded to the ``features`` step as
            ``ngram_range``.

    Returns:
        ``score['dev_score']`` as reported by ``ModelTrainer.train``.
    """
    # Must run before any other local is bound: at this point locals() holds
    # exactly the two arguments, which set_params() expects.
    params = locals().copy()
    # Imported after the snapshot so the names do not leak into params.
    import shutil
    from languageflow.data_fetcher import DataFetcher
    start = time.time()
    print(params)
    corpus: CategorizedCorpus = DataFetcher.load_corpus(NLPData.VNTC)
    pipeline = Pipeline(steps=[
        ('features', TfidfVectorizer()),
        ('estimator', LinearSVC()),
    ])
    pipeline.set_params(**params)
    classifier = TextClassifier(estimator=TEXT_CLASSIFIER_ESTIMATOR.PIPELINE,
                                pipeline=pipeline)
    model_trainer = ModelTrainer(classifier, corpus)

    def micro_f1_score(y_true, y_pred):
        return f1_score(y_true, y_pred, average='micro')

    tmp_model_folder = mkdtemp()
    try:
        score = model_trainer.train(tmp_model_folder, scoring=micro_f1_score)
    finally:
        # Only the scores are used; drop the saved model so repeated runs do
        # not pile up temporary directories.  Best effort on purpose.
        shutil.rmtree(tmp_model_folder, ignore_errors=True)
    ex.log_scalar('dev_score', score['dev_score'])
    ex.log_scalar('test_score', score['test_score'])
    print(time.time() - start)
    return score['dev_score']
예제 #7
0
    def test(self):
        """Train a multilabel pipeline classifier, reload it and predict once.

        The model is written to a temporary folder that is removed again even
        when training, loading or prediction raises.
        """
        # To run on a different dataset, e.g. NLPData.AIVIVN2019_SA, change
        # the corpus loaded here.
        corpus: CategorizedCorpus = DataFetcher.load_corpus(
            NLPData.UTS2017_BANK_SA_SAMPLE)
        pipeline = Pipeline(steps=[
            ('features',
             CountVectorizer(ngram_range=(1, 2), max_features=4000)),
            ('estimator', OneVsRestClassifier(SVC(kernel='linear', C=0.3))),
        ])
        classifier = TextClassifier(
            estimator=TEXT_CLASSIFIER_ESTIMATOR.PIPELINE,
            pipeline=pipeline,
            multilabel=True)
        model_trainer = ModelTrainer(classifier, corpus)

        def macro_f1_score(y_true, y_pred):
            return f1_score(y_true, y_pred, average='macro')

        tmp_model_folder = mkdtemp()
        try:
            score = model_trainer.train(tmp_model_folder,
                                        scoring=macro_f1_score)
            print(score)

            # Round-trip through disk to check that the saved model is usable.
            classifier = TextClassifier.load(tmp_model_folder)

            sentence = Sentence('Dịch vụ tiện dụng quá')
            classifier.predict(sentence)
            print(sentence)
        finally:
            # mkdtemp() never deletes the directory itself.
            shutil.rmtree(tmp_model_folder)
예제 #8
0
from tempfile import mkdtemp

from languageflow.data_fetcher import DataFetcher, NLPData
from languageflow.models.text_classifier import TextClassifier, TEXT_CLASSIFIER_ESTIMATOR
from languageflow.trainers.model_trainer import ModelTrainer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# The annotation is quoted because module-level annotations on plain names are
# evaluated at runtime and ``CategorizedCorpus`` is not among this script's
# imports; an unquoted name would raise NameError on this line.
corpus: "CategorizedCorpus" = DataFetcher.load_corpus(NLPData.AIVIVN2019_SA)

# Count features over unigrams and bigrams (at most 4000) feeding a linear SVC.
pipeline = Pipeline(steps=[
    ('features', CountVectorizer(ngram_range=(1, 2), max_features=4000)),
    ('estimator', SVC(kernel='linear', C=0.3)),
])
classifier = TextClassifier(estimator=TEXT_CLASSIFIER_ESTIMATOR.PIPELINE,
                            pipeline=pipeline)
model_trainer = ModelTrainer(classifier, corpus)
# NOTE: mkdtemp() creates the directory but never removes it; the caller of
# this script is responsible for deleting it when the model is not needed.
tmp_model_folder = mkdtemp()


def negative_f1_score(y_true, y_pred):
    """Return the F1 score of the second class in a two-class problem.

    ``f1_score(..., average=None)`` yields one value per class; the result is
    unpacked into exactly two values, so any other number of classes raises.
    """
    per_class_f1 = f1_score(y_true, y_pred, average=None)
    _first_class_f1, second_class_f1 = per_class_f1
    return second_class_f1


def macro_f1_score(y_true, y_pred):
    """Return the macro-averaged F1 score of ``y_pred`` against ``y_true``."""
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    return macro_f1


# Train into the temporary folder, scoring with the second-class F1, and
# print whatever the trainer returns.
score = model_trainer.train(tmp_model_folder, scoring=negative_f1_score)
print(score)
예제 #9
0
                                    norm='l2',
                                    min_df=2,
                                    analyzer='char')),
                   ('remove_tone',
                    Pipeline([('remove_tone',
                               RemoveTone()), ('lower', Lowercase()),
                              ('tfidf',
                               TfidfVectorizer(
                                   ngram_range=remove_tone__tfidf__ngram_range,
                                   norm='l2',
                                   min_df=2))])
                    ), ('emoticons',
                        CountEmoticons())])),
    ('estimator',
     SVC(kernel='linear', C=estimator_C, class_weight=None, verbose=True))
])

# Announce the training phase, then wrap the pipeline in a TextClassifier and
# pair it with the corpus in a ModelTrainer.  `pipeline` and `corpus` are
# defined earlier in the script.
print("\n\n>>> Start training")
classifier = TextClassifier(estimator=TEXT_CLASSIFIER_ESTIMATOR.PIPELINE,
                            pipeline=pipeline)
model_trainer = ModelTrainer(classifier, corpus)


def macro_f1_score(y_true, y_pred):
    """Return the macro-averaged F1 score of ``y_pred`` against ``y_true``."""
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    return macro_f1


# Train with macro-averaged F1 as the scoring function; the return value of
# train() is not used here.  `model_folder` is defined earlier in the script.
model_trainer.train(model_folder, scoring=macro_f1_score)
print("\n\n>>> Finish training")
print(f"Your model is saved in {model_folder}")
# NOTE(review): assumes `start` holds a time() value captured earlier in the
# script, so the difference is elapsed wall-clock seconds - confirm.
print(f"Running in {time() - start}")