Example #1
def test():
    from tokenization.crf_tokenizer import CrfTokenizer
    from word_embedding.word2vec_gensim import Word2Vec
    # Assumption: BiDirectionalLSTMClassifier and load_synonym_dict live in
    # text_classification.short_text_classifiers (the classifier import is
    # confirmed by Example #4 below; load_synonym_dict's module is inferred).
    from text_classification.short_text_classifiers import \
        BiDirectionalLSTMClassifier, load_synonym_dict
    word2vec_model = Word2Vec.load('../models/pretrained_word2vec.bin')
    # Adjust these paths to match your environment
    tokenizer = CrfTokenizer(
        config_root_path=
        '/Users/admin/Desktop/Projects/python/NLP/hactcore/hactcore/nlp/tokenization/',
        model_path='../models/pretrained_tokenizer.crfsuite')
    sym_dict = load_synonym_dict('../data/sentiment/synonym.txt')
    # A plain KerasTextClassifier can be constructed with the same arguments;
    # this example uses the bidirectional LSTM variant instead.
    keras_text_classifier = BiDirectionalLSTMClassifier(
        tokenizer=tokenizer,
        word2vec=word2vec_model.wv,
        model_path='../models/sentiment_model.h5',
        max_length=10,
        n_epochs=10,
        sym_dict=sym_dict)
    X, y = keras_text_classifier.load_data(
        [
            '../data/sentiment/samples/positive.txt',
            '../data/sentiment/samples/negative.txt'
        ],
        load_method=keras_text_classifier.load_data_from_file)

    keras_text_classifier.train(X, y)
    # Vietnamese labels: 'tích cực' = positive, 'tiêu cực' = negative
    label_dict = {0: 'tích cực', 1: 'tiêu cực'}
    # 'Dở thế' (so bad), 'Hay thế' (so good),
    # 'phim chán thật' (really boring film), 'nhảm quá' (so silly)
    test_sentences = ['Dở thế', 'Hay thế', 'phim chán thật', 'nhảm quá']
    labels = keras_text_classifier.classify(test_sentences,
                                            label_dict=label_dict)
    # Output: ['tiêu cực', 'tích cực', 'tiêu cực', 'tiêu cực']
    # i.e. [negative, positive, negative, negative]
    print(labels)
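After train() has written ../models/sentiment_model.h5, later runs can skip training. Example #4 below constructs the classifier and uses it without ever calling train(), which suggests an existing file at model_path is loaded; a minimal inference-only sketch under that assumption:

def classify_only():
    from tokenization.crf_tokenizer import CrfTokenizer
    from word_embedding.word2vec_gensim import Word2Vec
    from text_classification.short_text_classifiers import BiDirectionalLSTMClassifier

    word2vec_model = Word2Vec.load('../models/pretrained_word2vec.bin')
    tokenizer = CrfTokenizer(model_path='../models/pretrained_tokenizer.crfsuite')
    # Assumption: an existing model file at model_path is loaded rather than retrained.
    classifier = BiDirectionalLSTMClassifier(tokenizer=tokenizer,
                                             word2vec=word2vec_model.wv,
                                             model_path='../models/sentiment_model.h5',
                                             max_length=10)
    print(classifier.classify(['Hay thế'],
                              label_dict={0: 'tích cực', 1: 'tiêu cực'}))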
Example #2
def test_clean_files_in_dir():
    from tokenization.crf_tokenizer import CrfTokenizer
    input_dir = '../data/word_embedding/real/html'
    output_dir = '../data/word_embedding/real/training'
    crf_config_root_path = "../tokenization/"
    crf_model_path = "../models/pretrained_tokenizer.crfsuite"
    tokenizer = CrfTokenizer(config_root_path=crf_config_root_path,
                             model_path=crf_model_path)
    # clean_files_from_dir is defined in the enclosing module (not shown in
    # this snippet); it presumably cleans the HTML files in input_dir and
    # writes tokenized text into output_dir.
    clean_files_from_dir(input_dir, output_dir, should_tokenize=True, tokenizer=tokenizer)
# The same test with absolute Windows paths instead of relative ones;
# renamed so it does not shadow the definition above.
def test_clean_files_in_dir_windows():
    input_dir = 'C:/Users/anlan/OneDrive/Desktop/core_nlp-master/data/word_embedding/real/html'
    output_dir = 'C:/Users/anlan/OneDrive/Desktop/core_nlp-master/data/word_embedding/real/training'
    from tokenization.crf_tokenizer import CrfTokenizer
    crf_config_root_path = "C:/Users/anlan/OneDrive/Desktop/core_nlp-master/word_embedding/tokenization/"
    crf_model_path = "C:/Users/anlan/OneDrive/Desktop/core_nlp-master/models/pretrained_tokenizer.crfsuite"
    tokenizer = CrfTokenizer(config_root_path=crf_config_root_path,
                             model_path=crf_model_path)
    clean_files_from_dir(input_dir,
                         output_dir,
                         should_tokenize=True,
                         tokenizer=tokenizer)
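The two variants above differ only in how the paths are spelled. A small sketch using pathlib from the standard library to derive all four paths from a single project root, so the same test runs regardless of where the repository is checked out (PROJECT_ROOT is an assumption about the repo layout, not part of the original examples):

from pathlib import Path

# Hypothetical portable variant: resolve everything against one project root.
PROJECT_ROOT = Path(__file__).resolve().parent.parent  # adjust to your layout

input_dir = str(PROJECT_ROOT / 'data' / 'word_embedding' / 'real' / 'html')
output_dir = str(PROJECT_ROOT / 'data' / 'word_embedding' / 'real' / 'training')
crf_config_root_path = str(PROJECT_ROOT / 'tokenization') + '/'
crf_model_path = str(PROJECT_ROOT / 'models' / 'pretrained_tokenizer.crfsuite')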
Example #4
from word_embedding.word2vec_gensim import Word2Vec
from text_classification.short_text_classifiers import BiDirectionalLSTMClassifier
from tokenization.crf_tokenizer import CrfTokenizer
import flask
import pandas as pd

word2vec_model = Word2Vec.load('models/pretrained_word2vec.bin')

tokenizer = CrfTokenizer(config_root_path='tokenization/',
                         model_path='models/pretrained_tokenizer.crfsuite')
model = BiDirectionalLSTMClassifier(tokenizer=tokenizer,
                                    word2vec=word2vec_model.wv,
                                    model_path='models/app.h5',
                                    n_class=3)
label_dict = {0: 'mo_vnexpress', 1: 'mo_dantri', 2: 'mo_truyenfull'}

app = flask.Flask(__name__)


@app.route("/predict", methods=["GET", "POST"])
def predict():
    data = {"success": False}

    params = flask.request.json
    if params is None:
        params = flask.request.args

    # if parameters are found, return a prediction
    if params is not None:
        x = pd.DataFrame.from_dict(
            params, orient='index').to_numpy(dtype=str).tolist()
        # The source snippet is truncated here; a plausible completion,
        # assuming classify() takes the same arguments as in Example #1:
        data["prediction"] = model.classify(x, label_dict=label_dict)
        data["success"] = True

    # return the result dictionary as a JSON response
    return flask.jsonify(data)


if __name__ == "__main__":
    app.run()
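A quick, hypothetical smoke test for the endpoint above, assuming the Flask development server is running on its default port; the parameter name "text" and the sample sentence are illustrative, not part of the original example:

import requests

# Hypothetical client call: POST a sentence and print the JSON response.
resp = requests.post("http://127.0.0.1:5000/predict",
                     json={"text": "mở vnexpress cho tôi"})  # "open vnexpress for me"
print(resp.json())  # e.g. {"success": True, "prediction": ["mo_vnexpress"]}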
Example #5
from tokenization.crf_tokenizer import CrfTokenizer
from tokenization.base_tokenizer import BaseTokenizer

# To train a tokenizer model from scratch, uncomment the two lines below.
# Note: if you trained your own model, point model_path at it and do not train again!
# crf_tokenizer_obj = CrfTokenizer()
# crf_tokenizer_obj.train('data/tokenized/samples/training')
crf_tokenizer_obj = CrfTokenizer(
    model_path='models/pretrained_tokenizer.crfsuite')

import xlrd
import time
import re

# Matches runs of Vietnamese letters, digits, and whitespace.
pattern = re.compile(
    "[0-9a-zA-ZÀÁÂÃÈÉÊÌÍÒÓÔÕÙÚĂĐĨŨƠàáâãèéêìíòóôõùúăđĩũơƯĂẠẢẤẦẨẪẬẮẰẲẴẶẸẺẼỀỀỂẾưăạảấầẩẫậắằẳẵặẹẻẽềềểếỄỆỈỊỌỎỐỒỔỖỘỚỜỞỠỢỤỦỨỪễệỉịọỏốồổỗộớờởỡợụủứừỬỮỰỲỴÝỶỸửữựỳỵỷỹ\s]+"
)

wordFreqs = []
totalLength = 0
start = time.time()
words = {0: {}, 1: {}, 2: {}, 3: {}}

f = open("tokenization/stopwords.txt", "r")
stop_words = f.read().split("\n")
f.close()
f = open("tokenization/cus_stopwords.txt", "r")
stop_words = stop_words + f.read().split("\n")
f.close()

# social
file_name = 'social.xls'
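The snippet is cut off after selecting the workbook. Given the pattern and stop_words prepared above, a hypothetical helper sketching how word frequencies might be tallied once text has been extracted from the spreadsheet; the function name, the whitespace tokenization, and the sample call are illustrative, not the original continuation:

import collections

def count_word_freqs(text):
    # Keep only runs of Vietnamese letters, digits, and whitespace,
    # then tally lower-cased tokens that are not stop words.
    freqs = collections.Counter()
    for chunk in pattern.findall(text):
        for word in chunk.lower().split():
            if word and word not in stop_words:
                freqs[word] += 1
    return freqs

print(count_word_freqs('Phim hay quá'))  # e.g. Counter({'phim': 1, ...})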