def test():
    from tokenization.crf_tokenizer import CrfTokenizer
    from word_embedding.word2vec_gensim import Word2Vec
    # Import needed when running this snippet outside short_text_classifiers.py
    from text_classification.short_text_classifiers import BiDirectionalLSTMClassifier, load_synonym_dict

    # Note: set the correct paths for your environment
    word2vec_model = Word2Vec.load('../models/pretrained_word2vec.bin')
    tokenizer = CrfTokenizer(
        config_root_path='/Users/admin/Desktop/Projects/python/NLP/hactcore/hactcore/nlp/tokenization/',
        model_path='../models/pretrained_tokenizer.crfsuite')
    sym_dict = load_synonym_dict('../data/sentiment/synonym.txt')
    # keras_text_classifier = KerasTextClassifier(tokenizer=tokenizer, word2vec=word2vec_model.wv,
    keras_text_classifier = BiDirectionalLSTMClassifier(
        tokenizer=tokenizer,
        word2vec=word2vec_model.wv,
        model_path='../models/sentiment_model.h5',
        max_length=10,
        n_epochs=10,
        sym_dict=sym_dict)

    X, y = keras_text_classifier.load_data(
        ['../data/sentiment/samples/positive.txt',
         '../data/sentiment/samples/negative.txt'],
        load_method=keras_text_classifier.load_data_from_file)
    keras_text_classifier.train(X, y)

    label_dict = {0: 'tích cực', 1: 'tiêu cực'}  # 0: positive, 1: negative
    # Test sentences: "So bad", "So good", "the movie is really boring", "so trashy"
    test_sentences = ['Dở thế', 'Hay thế', 'phim chán thật', 'nhảm quá']
    labels = keras_text_classifier.classify(test_sentences, label_dict=label_dict)
    print(labels)
    # Output: ['tiêu cực', 'tích cực', 'tiêu cực', 'tiêu cực'] (negative, positive, negative, negative)
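# Side note, a minimal sketch: since model_path points at an .h5 file, the trained
# network can presumably be reloaded with Keras directly for a quick sanity check of
# its architecture. This bypasses the classifier wrapper and assumes a standard
# Keras HDF5 save, which the project is not guaranteed to use.
def inspect_saved_model():
    from keras.models import load_model
    sentiment_model = load_model('../models/sentiment_model.h5')
    sentiment_model.summary()  # prints the BiLSTM layer stack and output shapes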
def test_clean_files_in_dir():
    input_dir = '../data/word_embedding/real/html'
    output_dir = '../data/word_embedding/real/training'
    from tokenization.crf_tokenizer import CrfTokenizer
    crf_config_root_path = "../tokenization/"
    crf_model_path = "../models/pretrained_tokenizer.crfsuite"
    tokenizer = CrfTokenizer(config_root_path=crf_config_root_path,
                             model_path=crf_model_path)
    clean_files_from_dir(input_dir, output_dir,
                         should_tokenize=True, tokenizer=tokenizer)
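# A hedged sketch of the next step: once clean_files_from_dir has written tokenized
# text into output_dir, those files can feed a gensim word2vec model. This uses
# gensim's own API rather than the project's Word2Vec wrapper, whose exact signature
# may differ; 'size' is the gensim 3.x parameter name (renamed 'vector_size' in 4.x).
def train_word2vec_sketch():
    from gensim.models import word2vec
    sentences = word2vec.PathLineSentences('../data/word_embedding/real/training')
    model = word2vec.Word2Vec(sentences, size=100, window=5, min_count=5)
    model.save('../models/pretrained_word2vec.bin')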
from word_embedding.word2vec_gensim import Word2Vec
from text_classification.short_text_classifiers import BiDirectionalLSTMClassifier
from tokenization.crf_tokenizer import CrfTokenizer
import flask
import pandas as pd

word2vec_model = Word2Vec.load('models/pretrained_word2vec.bin')
tokenizer = CrfTokenizer(config_root_path='tokenization/',
                         model_path='models/pretrained_tokenizer.crfsuite')
model = BiDirectionalLSTMClassifier(tokenizer=tokenizer, word2vec=word2vec_model.wv,
                                    model_path='models/app.h5', n_class=3)
label_dict = {0: 'mo_vnexpress', 1: 'mo_dantri', 2: 'mo_truyenfull'}

app = flask.Flask(__name__)


@app.route("/predict", methods=["GET", "POST"])
def predict():
    data = {"success": False}
    params = flask.request.json
    if params is None:
        params = flask.request.args
    # if parameters are found, return a prediction
    if params is not None:
        x = pd.DataFrame.from_dict(params, orient='index').to_numpy(dtype=str).tolist()
        # Assumed completion of the truncated handler: classify the received
        # text(s) and report the predicted label(s)
        data["prediction"] = model.classify(x[0], label_dict=label_dict)
        data["success"] = True
    return flask.jsonify(data)


if __name__ == '__main__':
    app.run()  # serves on http://127.0.0.1:5000 by default
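# A hedged usage sketch for the endpoint above: with the app running
# (http://127.0.0.1:5000 by default), it can be queried with the requests library.
# The JSON key 'text' is an illustrative assumption; the handler stringifies every
# value of the posted dict, so any single key works.
def query_predict_endpoint():
    import requests
    resp = requests.post('http://127.0.0.1:5000/predict',
                         json={'text': 'mở vnexpress'})  # "open vnexpress"
    print(resp.json())  # e.g. {"prediction": ["mo_vnexpress"], "success": true}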
from tokenization.crf_tokenizer import CrfTokenizer
from tokenization.base_tokenizer import BaseTokenizer

crf_tokenizer_obj = CrfTokenizer()
# crf_tokenizer_obj.train('data/tokenized/samples/training')
# Note: if you trained your own model, set the correct model path and do not train again!
crf_tokenizer_obj = CrfTokenizer(model_path='models/pretrained_tokenizer.crfsuite')

import xlrd
import time
import re

# Keep only digits, Latin letters, Vietnamese characters and whitespace
pattern = re.compile(
    "[0-9a-zA-ZÀÁÂÃÈÉÊÌÍÒÓÔÕÙÚĂĐĨŨƠàáâãèéêìíòóôõùúăđĩũơƯĂẠẢẤẦẨẪẬẮẰẲẴẶẸẺẼỀỀỂẾưăạảấầẩẫậắằẳẵặẹẻẽềềểếỄỆỈỊỌỎỐỒỔỖỘỚỜỞỠỢỤỦỨỪễệỉịọỏốồổỗộớờởỡợụủứừỬỮỰỲỴÝỶỸửữựỳỵỷỹ\s]+"
)

wordFreqs = []
totalLength = 0
start = time.time()
words = {0: {}, 1: {}, 2: {}, 3: {}}

f = open("tokenization/stopwords.txt", "r")
stop_words = f.read().split("\n")
f.close()
f = open("tokenization/cus_stopwords.txt", "r")
stop_words = stop_words + f.read().split("\n")
f.close()

# # social
file_name = 'social.xls'
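# A hedged sketch of the counting loop this setup leads into: read sentences from
# the spreadsheet, strip non-word characters with the regex above, tokenize, and
# tally non-stop-word frequencies. The sheet index, text column, and the use of the
# words[0] bucket for 'social.xls' are illustrative assumptions, as is
# get_tokenized (the tokenizer method the project's README uses).
workbook = xlrd.open_workbook(file_name)
sheet = workbook.sheet_by_index(0)
for row in range(sheet.nrows):
    text = ' '.join(pattern.findall(str(sheet.cell_value(row, 0))))
    tokens = crf_tokenizer_obj.get_tokenized(text)
    if isinstance(tokens, str):
        tokens = tokens.split()
    totalLength += len(tokens)
    for token in tokens:
        token = token.lower()
        if token not in stop_words:
            words[0][token] = words[0].get(token, 0) + 1
print('%d tokens counted in %.1fs' % (totalLength, time.time() - start))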