def test(self):
    """Train an SVC TextClassifier on the AIVIVN2019 sample corpus, then
    reload the persisted model and smoke-test prediction on one sentence.
    """
    corpus: CategorizedCorpus = DataFetcher.load_corpus(
        NLPData.AIVIVN2019_SA_SAMPLE)
    params = {
        "vectorizer": CountVectorizer(ngram_range=(1, 2), max_features=4000),
        "svc": SVC(kernel='linear', C=0.3)
    }
    classifier = TextClassifier(estimator=TEXT_CLASSIFIER_ESTIMATOR.SVC,
                                **params)
    model_trainer = ModelTrainer(classifier, corpus)
    tmp_model_folder = mkdtemp()

    def negative_f1_score(y_true, y_pred):
        # average=None yields one F1 per class; keep the second class only
        # (assumes a binary sentiment corpus -- TODO confirm label order).
        score_class_0, score_class_1 = f1_score(y_true, y_pred, average=None)
        return score_class_1

    score = model_trainer.train(tmp_model_folder, scoring=negative_f1_score)
    print(score)

    # Round-trip: reload from disk and make sure prediction works end to end.
    classifier = TextClassifier.load(tmp_model_folder)
    sentence = Sentence('tuyệt vời')
    classifier.predict(sentence)
    shutil.rmtree(tmp_model_folder)
    print(sentence)
def my_run(estimator__C, features__lower_pipe__tfidf__ngram_range,
           features__with_tone_char__ngram_range,
           features__remove_tone__tfidf__ngram_range):
    """Train a word/char/tone-removed TF-IDF + SVC pipeline on VLSP2016_SA.

    Parameter names follow sklearn's ``set_params`` path convention so the
    whole signature can be forwarded as hyperparameters. Logs dev/test scores
    to the sacred experiment ``ex`` and returns the dev score.
    """
    # Must stay the first statement: locals() here is exactly the incoming
    # hyperparameters, whose keys match the pipeline's set_params paths.
    params = locals().copy()
    start = time.time()
    print(params)
    corpus: CategorizedCorpus = DataFetcher.load_corpus(NLPData.VLSP2016_SA)
    pipeline = Pipeline(steps=[
        ('features', FeatureUnion([
            ('lower_pipe', Pipeline([
                ('lower', Lowercase()),
                ('tfidf', TfidfVectorizer(ngram_range=(1, 4), norm='l2',
                                          min_df=2))])),
            ('with_tone_char',
             TfidfVectorizer(ngram_range=(1, 6), norm='l2', min_df=2,
                             analyzer='char')),
            ('remove_tone', Pipeline([
                ('remove_tone', RemoveTone()),
                ('lower', Lowercase()),
                ('tfidf', TfidfVectorizer(ngram_range=(1, 4), norm='l2',
                                          min_df=2))])),
            ('emoticons', CountEmoticons())])),
        ('estimator', SVC(kernel='linear', C=0.2175, class_weight=None,
                          verbose=True))])
    # Defaults above are overridden by the caller-supplied hyperparameters.
    pipeline.set_params(**params)
    classifier = TextClassifier(estimator=TEXT_CLASSIFIER_ESTIMATOR.PIPELINE,
                                pipeline=pipeline)
    model_trainer = ModelTrainer(classifier, corpus)
    tmp_model_folder = mkdtemp()

    def negative_f1_score(y_true, y_pred):
        # Per-class F1; return the middle class (assumes 3 sentiment labels
        # -- TODO confirm label order in the corpus).
        score_class_0, score_class_1, score_class_2 = f1_score(
            y_true, y_pred, average=None)
        return score_class_1

    score = model_trainer.train(tmp_model_folder, scoring=negative_f1_score)
    ex.log_scalar('dev_score', score['dev_score'])
    ex.log_scalar('test_score', score['test_score'])
    print(time.time() - start)
    return score['dev_score']
def run(estimator, features):
    """Train a word/char/tone-removed TF-IDF + SVC pipeline on AIVIVN2019_SA.

    ``estimator`` and ``features`` are injected by the experiment framework
    but the pipeline below is fully hard-coded -- the arguments only surface
    the configuration in the run record (TODO confirm against the config).
    Logs dev/test scores to ``ex`` and returns the test score.
    """
    corpus: CategorizedCorpus = DataFetcher.load_corpus(NLPData.AIVIVN2019_SA)
    pipeline = Pipeline(steps=[
        ('features', FeatureUnion([
            ('lower_tfidf', Pipeline([
                ('lower', Lowercase()),
                ('tfidf', TfidfVectorizer(ngram_range=(1, 4), norm='l2',
                                          min_df=2))])),
            ('with_tone_char',
             TfidfVectorizer(ngram_range=(1, 6), norm='l2', min_df=2,
                             analyzer='char')),
            ('remove_tone', Pipeline([
                ('remove_tone', RemoveTone()),
                ('lower', Lowercase()),
                ('tfidf', TfidfVectorizer(ngram_range=(1, 4), norm='l2',
                                          min_df=2))]))])),
        ('estimator', SVC(kernel='linear', C=0.2175, class_weight=None,
                          verbose=True))])
    classifier = TextClassifier(estimator=TEXT_CLASSIFIER_ESTIMATOR.PIPELINE,
                                pipeline=pipeline)
    model_trainer = ModelTrainer(classifier, corpus)
    tmp_model_folder = mkdtemp()

    def negative_f1_score(y_true, y_pred):
        # Per-class F1; keep the second class only (assumes a binary corpus
        # -- TODO confirm label order).
        score_class_0, score_class_1 = f1_score(y_true, y_pred, average=None)
        return score_class_1

    score = model_trainer.train(tmp_model_folder, scoring=negative_f1_score)
    ex.log_scalar('dev_score', score['dev_score'])
    ex.log_scalar('test_score', score['test_score'])
    return score['test_score']
def my_run(estimator__C, features__lower_pipe__tfidf__max_features,
           features__lower_pipe__tfidf__ngram_range,
           features__with_tone_char__ngram_range,
           features__remove_tone__tfidf__ngram_range):
    """Train a TF-IDF feature-union + SVC pipeline on UTS2017_BANK_TC.

    Argument names mirror sklearn ``set_params`` paths so the signature can
    be forwarded wholesale as hyperparameters. Logs dev/test scores to the
    sacred experiment ``ex`` and returns the dev score.
    """
    # First statement on purpose: capture exactly the incoming arguments.
    hyperparams = locals().copy()
    t_begin = time.time()
    print(hyperparams)
    corpus: CategorizedCorpus = DataFetcher.load_corpus(
        NLPData.UTS2017_BANK_TC)

    # Word-level, raw-char-level and tone-stripped TF-IDF views + emoticons.
    feature_union = FeatureUnion([
        ('lower_pipe', Pipeline([
            ('lower', Lowercase()),
            ('tfidf', TfidfVectorizer(norm='l2', min_df=2))])),
        ('with_tone_char',
         TfidfVectorizer(norm='l2', min_df=2, analyzer='char')),
        ('remove_tone', Pipeline([
            ('remove_tone', RemoveTone()),
            ('lower', Lowercase()),
            ('tfidf', TfidfVectorizer(norm='l2', min_df=2))])),
        ('emoticons', CountEmoticons())])
    pipeline = Pipeline(steps=[
        ('features', feature_union),
        ('estimator', SVC(kernel='linear', class_weight=None, verbose=True))])
    pipeline.set_params(**hyperparams)

    classifier = TextClassifier(estimator=TEXT_CLASSIFIER_ESTIMATOR.PIPELINE,
                                pipeline=pipeline)
    model_trainer = ModelTrainer(classifier, corpus)
    tmp_model_folder = "tmp/tmp_model"

    def micro_f1_score(y_true, y_pred):
        return f1_score(y_true, y_pred, average='micro')

    score = model_trainer.train(tmp_model_folder, scoring=micro_f1_score)

    # Clear out the trained artifacts, keeping any .gitignore placeholder.
    for filename in listdir(tmp_model_folder):
        if "gitignore" not in filename:
            os.remove(f"{tmp_model_folder}/{filename}")

    ex.log_scalar('dev_score', score['dev_score'])
    ex.log_scalar('test_score', score['test_score'])
    print(f"Time: {round(time.time() - t_begin, 2)} s")
    return score['dev_score']
def test_fasttext(self):
    """Train a fastText TextClassifier on the AIVIVN2019 sample corpus,
    then reload it from disk and smoke-test prediction on one sentence.
    """
    corpus: CategorizedCorpus = DataFetcher.load_corpus(
        NLPData.AIVIVN2019_SA_SAMPLE)
    fasttext_params = {"lr": 0.01, "epoch": 20, "wordNgrams": 3, "dim": 20}
    classifier = TextClassifier(
        estimator=TEXT_CLASSIFIER_ESTIMATOR.FAST_TEXT, **fasttext_params)
    trainer = ModelTrainer(classifier, corpus)
    model_dir = mkdtemp()

    def macro_f1_score(y_true, y_pred):
        return f1_score(y_true, y_pred, average='macro')

    print(trainer.train(model_dir, scoring=macro_f1_score))

    # Round-trip: load the persisted model and run a single prediction.
    classifier = TextClassifier.load(model_dir)
    sentence = Sentence('tuyệt vời')
    classifier.predict(sentence)
    shutil.rmtree(model_dir)
    print(sentence)
def my_run(features__max_df, features__ngram_range):
    """Train a TF-IDF + LinearSVC pipeline on the VNTC corpus.

    Argument names follow sklearn ``set_params`` paths. Logs dev/test scores
    to the sacred experiment ``ex`` and returns the dev score.
    """
    # First statement on purpose: capture exactly the incoming arguments.
    hyperparams = locals().copy()
    t_begin = time.time()
    print(hyperparams)
    from languageflow.data_fetcher import DataFetcher
    corpus: CategorizedCorpus = DataFetcher.load_corpus(NLPData.VNTC)

    model_pipeline = Pipeline(steps=[('features', TfidfVectorizer()),
                                     ('estimator', LinearSVC())])
    model_pipeline.set_params(**hyperparams)
    classifier = TextClassifier(estimator=TEXT_CLASSIFIER_ESTIMATOR.PIPELINE,
                                pipeline=model_pipeline)
    trainer = ModelTrainer(classifier, corpus)
    model_dir = mkdtemp()

    def micro_f1_score(y_true, y_pred):
        return f1_score(y_true, y_pred, average='micro')

    score = trainer.train(model_dir, scoring=micro_f1_score)
    ex.log_scalar('dev_score', score['dev_score'])
    ex.log_scalar('test_score', score['test_score'])
    print(time.time() - t_begin)
    return score['dev_score']
def test(self):
    """Train a multilabel one-vs-rest SVC classifier on the UTS2017 bank
    sample corpus, reload it from disk, and smoke-test prediction.
    """
    corpus: CategorizedCorpus = DataFetcher.load_corpus(
        NLPData.UTS2017_BANK_SA_SAMPLE)

    vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=4000)
    estimator = OneVsRestClassifier(SVC(kernel='linear', C=0.3))
    pipeline = Pipeline(steps=[('features', vectorizer),
                               ('estimator', estimator)])
    classifier = TextClassifier(estimator=TEXT_CLASSIFIER_ESTIMATOR.PIPELINE,
                                pipeline=pipeline, multilabel=True)
    trainer = ModelTrainer(classifier, corpus)
    model_dir = mkdtemp()

    def macro_f1_score(y_true, y_pred):
        return f1_score(y_true, y_pred, average='macro')

    print(trainer.train(model_dir, scoring=macro_f1_score))

    # Round-trip: load the persisted model and run a single prediction.
    classifier = TextClassifier.load(model_dir)
    sentence = Sentence('Dịch vụ tiện dụng quá')
    classifier.predict(sentence)
    print(sentence)
    shutil.rmtree(model_dir)
"""Train a CountVectorizer + SVC pipeline on the AIVIVN2019 sentiment corpus
and report the negative-class F1 score."""
from tempfile import mkdtemp

# NOTE(review): CategorizedCorpus and mkdtemp were used below without being
# imported, which raises NameError at module level -- confirm the module path
# of CategorizedCorpus inside languageflow.
from languageflow.data import CategorizedCorpus
from languageflow.data_fetcher import DataFetcher, NLPData
from languageflow.models.text_classifier import (TEXT_CLASSIFIER_ESTIMATOR,
                                                 TextClassifier)
from languageflow.trainers.model_trainer import ModelTrainer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

corpus: CategorizedCorpus = DataFetcher.load_corpus(NLPData.AIVIVN2019_SA)
pipeline = Pipeline(steps=[
    ('features', CountVectorizer(ngram_range=(1, 2), max_features=4000)),
    ('estimator', SVC(kernel='linear', C=0.3))])
classifier = TextClassifier(estimator=TEXT_CLASSIFIER_ESTIMATOR.PIPELINE,
                            pipeline=pipeline)
model_trainer = ModelTrainer(classifier, corpus)
tmp_model_folder = mkdtemp()


def negative_f1_score(y_true, y_pred):
    # average=None yields one F1 per class; keep the second class only
    # (assumes a binary sentiment corpus -- TODO confirm label order).
    score_class_0, score_class_1 = f1_score(y_true, y_pred, average=None)
    return score_class_1


score = model_trainer.train(tmp_model_folder, scoring=negative_f1_score)
print(score)
norm='l2', min_df=2, analyzer='char')), ('remove_tone', Pipeline([('remove_tone', RemoveTone()), ('lower', Lowercase()), ('tfidf', TfidfVectorizer( ngram_range=remove_tone__tfidf__ngram_range, norm='l2', min_df=2))]) ), ('emoticons', CountEmoticons())])), ('estimator', SVC(kernel='linear', C=estimator_C, class_weight=None, verbose=True)) ]) print("\n\n>>> Start training") classifier = TextClassifier(estimator=TEXT_CLASSIFIER_ESTIMATOR.PIPELINE, pipeline=pipeline) model_trainer = ModelTrainer(classifier, corpus) def macro_f1_score(y_true, y_pred): return f1_score(y_true, y_pred, average='macro') model_trainer.train(model_folder, scoring=macro_f1_score) print("\n\n>>> Finish training") print(f"Your model is saved in {model_folder}") print(f"Running in {time() - start}")