def test(self):
    """Train an SVC text classifier on the sample corpus, reload it and predict.

    The model is written to a temporary folder. The folder is removed in a
    ``finally`` clause so it is cleaned up even when training, loading or
    prediction raises.
    """
    corpus: CategorizedCorpus = DataFetcher.load_corpus(
        NLPData.AIVIVN2019_SA_SAMPLE)
    params = {
        "vectorizer": CountVectorizer(ngram_range=(1, 2), max_features=4000),
        "svc": SVC(kernel='linear', C=0.3)
    }
    classifier = TextClassifier(estimator=TEXT_CLASSIFIER_ESTIMATOR.SVC, **params)
    model_trainer = ModelTrainer(classifier, corpus)

    def negative_f1_score(y_true, y_pred):
        # average=None yields one score per class; the unpacking requires
        # exactly two classes and the second one (class 1) is returned.
        _, score_class_1 = f1_score(y_true, y_pred, average=None)
        return score_class_1

    tmp_model_folder = mkdtemp()
    try:
        score = model_trainer.train(tmp_model_folder, scoring=negative_f1_score)
        print(score)

        # Reload from disk to exercise the save/load round trip.
        classifier = TextClassifier.load(tmp_model_folder)
        sentence = Sentence('tuyệt vời')
        classifier.predict(sentence)
    finally:
        # mkdtemp() never deletes the directory itself.
        shutil.rmtree(tmp_model_folder)
    print(sentence)
def predict(text):
    """Print ``text`` and the labels the shared classifier assigns to it.

    Wraps ``text`` in a ``Sentence``, runs the ``classifier`` found in the
    enclosing scope on it, and prints the resulting ``labels`` attribute.
    """
    print(f"\nText: {text}")
    doc = Sentence(text)
    # predict() stores its result on the sentence object passed in.
    classifier.predict(doc)
    print(f"Labels: {doc.labels}")
def read_text_classification_file(path_to_file) -> List[Sentence]:
    """Parse a ``__label__``-annotated text file into ``Sentence`` objects.

    Each line of the file becomes one ``Sentence``. Every ``__label__<name>``
    token on the line (``<name>`` made of word characters and ``#``) becomes
    a ``Label``; the tokens are then removed from the line and the remainder
    is used as the sentence text.

    Args:
        path_to_file: Path of the file to read. It is decoded as UTF-8.

    Returns:
        One ``Sentence`` per line, in file order.
    """
    # Compiled once instead of being re-parsed for every line.
    label_pattern = re.compile(r"__label__(?P<label>[\w#]+)")

    # Explicit encoding: the platform default is not UTF-8 everywhere, and
    # non-ASCII corpora would otherwise be mis-decoded or fail to load.
    with open(path_to_file, encoding="utf-8") as f:
        lines = f.read().splitlines()

    sentences = []
    for line in lines:
        labels = [Label(value) for value in label_pattern.findall(line)]
        # Only the label tokens are removed; surrounding whitespace is kept.
        text = label_pattern.sub("", line)
        sentences.append(Sentence(text, labels))
    return sentences
def predict(self, sentence: Sentence):
    """Predict labels for ``sentence`` and attach them via ``add_labels``.

    The backend is chosen by ``self.estimator``:

    * ``FAST_TEXT``: ``self.ft.predict`` returns values and scores; the
      ``"__label__"`` marker is removed from each value and a scored
      ``Label`` is built from each pair.
    * ``SVC``: the text is transformed with ``self.x_transformer``,
      classified with ``self.svc`` and mapped back with
      ``self.y_transformer.inverse_transform``.
    * ``PIPELINE``: ``self.pipeline.predict`` is called on the raw text; when
      ``self.multilabel`` is set the output is decoded with
      ``self.y_encoder`` and the first row is used.

    Nothing is returned; the sentence is updated in place.
    """
    kind = self.estimator

    if kind == TEXT_CLASSIFIER_ESTIMATOR.FAST_TEXT:
        values, scores = self.ft.predict(sentence.text)
        sentence.add_labels([
            Label(raw.replace("__label__", ""), score)
            for raw, score in zip(values, scores)
        ])
    elif kind == TEXT_CLASSIFIER_ESTIMATOR.SVC:
        features = self.x_transformer.transform([sentence.text])
        encoded = self.svc.predict(features)
        sentence.add_labels(self.y_transformer.inverse_transform(encoded))
    elif kind == TEXT_CLASSIFIER_ESTIMATOR.PIPELINE:
        predicted = self.pipeline.predict([sentence.text])
        if self.multilabel:
            # Only one text was submitted, so only the first row is used.
            decoded = self.y_encoder.inverse_transform(predicted)
            predicted = list(decoded[0])
        else:
            predicted = list(predicted)
        sentence.add_labels(predicted)
def sentiment(text):
    """Return the label values predicted for ``text``.

    The module-level ``classifier`` is loaded from ``model_path`` on first
    use. If ``model_path`` does not exist, an error is logged and the process
    exits with status 1.

    Returns:
        A list holding the ``value`` of each label attached to the sentence.
    """
    global classifier
    if not classifier:
        if not os.path.exists(model_path):
            logger.error(
                f"Could not load model at {model_path}.\n"
                f"Download model with \"underthesea download {UTSModel.sa_bank.value}\"."
            )
            sys.exit(1)
        classifier = TextClassifier.load(model_path)

    doc = Sentence(text)
    classifier.predict(doc)
    return [item.value for item in doc.labels]
def classify(X):
    """Return the labels predicted for the text ``X``.

    The module-level ``classifier`` is loaded from ``model_path`` on first
    use. If ``model_path`` does not exist, an error is logged and the process
    exits with status 1.

    Returns:
        The ``labels`` attribute of the ``Sentence`` built from ``X`` after
        prediction.
    """
    global classifier
    if not classifier:
        if not os.path.exists(model_path):
            logger.error(
                f"Could not load model at {model_path}.\n"
                f"Download model with \"underthesea download {UTSModel.tc_general.value}\".")
            sys.exit(1)
        classifier = TextClassifier.load(model_path)

    doc = Sentence(X)
    classifier.predict(doc)
    return doc.labels
def sentiment(text):
    """Return the sentiment predicted for ``text``.

    The module-level ``classifier`` is loaded from ``model_path`` on first
    use. If ``model_path`` does not exist, an error is logged and the process
    exits with status 1.

    Returns:
        ``"negative"`` when the first predicted label equals ``"1"``,
        ``"positive"`` when it equals ``"0"``, otherwise the first label
        unchanged.
    """
    global classifier
    if not classifier:
        if not os.path.exists(model_path):
            logger.error(
                f"Could not load model at {model_path}.\n"
                f"Download model with \"underthesea download {UTSModel.sa_general.value}\"."
            )
            sys.exit(1)
        classifier = TextClassifier.load(model_path)

    doc = Sentence(text)
    classifier.predict(doc)
    # NOTE(review): the first label is compared against plain strings. Whether
    # that can match depends on what Sentence stores in `labels` and on its
    # equality semantics -- confirm against Sentence/Label.
    top = doc.labels[0]
    if top == "1":
        top = "negative"
    elif top == "0":
        top = "positive"
    return top
def test_fasttext(self):
    """Train a fastText classifier on the sample corpus, reload it and predict.

    The model is written to a temporary folder. The folder is removed in a
    ``finally`` clause so it is cleaned up even when training, loading or
    prediction raises.
    """
    corpus: CategorizedCorpus = DataFetcher.load_corpus(
        NLPData.AIVIVN2019_SA_SAMPLE)
    params = {"lr": 0.01, "epoch": 20, "wordNgrams": 3, "dim": 20}
    classifier = TextClassifier(
        estimator=TEXT_CLASSIFIER_ESTIMATOR.FAST_TEXT, **params)
    model_trainer = ModelTrainer(classifier, corpus)

    def macro_f1_score(y_true, y_pred):
        return f1_score(y_true, y_pred, average='macro')

    tmp_model_folder = mkdtemp()
    try:
        score = model_trainer.train(tmp_model_folder, scoring=macro_f1_score)
        print(score)

        # Reload from disk to exercise the save/load round trip.
        classifier = TextClassifier.load(tmp_model_folder)
        sentence = Sentence('tuyệt vời')
        classifier.predict(sentence)
    finally:
        # mkdtemp() never deletes the directory itself.
        shutil.rmtree(tmp_model_folder)
    print(sentence)
def test(self):
    """Train a multilabel pipeline classifier, reload it and predict.

    The pipeline is a ``CountVectorizer`` followed by a one-vs-rest linear
    ``SVC``. The model is written to a temporary folder, which is removed in
    a ``finally`` clause so it is cleaned up even when training, loading or
    prediction raises.
    """
    corpus: CategorizedCorpus = DataFetcher.load_corpus(NLPData.UTS2017_BANK_SA_SAMPLE)
    pipeline = Pipeline(
        steps=[('features', CountVectorizer(ngram_range=(1, 2), max_features=4000)),
               ('estimator', OneVsRestClassifier(SVC(kernel='linear', C=0.3)))]
    )
    classifier = TextClassifier(estimator=TEXT_CLASSIFIER_ESTIMATOR.PIPELINE,
                                pipeline=pipeline, multilabel=True)
    model_trainer = ModelTrainer(classifier, corpus)

    def macro_f1_score(y_true, y_pred):
        return f1_score(y_true, y_pred, average='macro')

    tmp_model_folder = mkdtemp()
    try:
        score = model_trainer.train(tmp_model_folder, scoring=macro_f1_score)
        print(score)

        # Reload from disk to exercise the save/load round trip.
        classifier = TextClassifier.load(tmp_model_folder)
        sentence = Sentence('Dịch vụ tiện dụng quá')
        classifier.predict(sentence)
        print(sentence)
    finally:
        # mkdtemp() never deletes the directory itself.
        shutil.rmtree(tmp_model_folder)