Пример #1
0
import pandas as pd
from gensim.models import KeyedVectors
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, Dense, concatenate, BatchNormalization, \
    Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

from nlp.utils.basic_log import Log
from nlp.utils.clean_text import clean_to_list
from nlp.utils.plot_model_history import plot
from nlp.utils.set_stopwords import set_en_stopwords

# `logging` supplies the level constant passed to Log below. It is not among
# the imports visible above, so without this line the module fails with
# NameError as soon as it is imported. A duplicate import is harmless if the
# file already imports it elsewhere.
import logging

# Module-level log object built from the project's Log helper at INFO level.
log = Log(logging.INFO)


class SiameseSimilarity:
    def __init__(self,
                 model_path,
                 config_path,
                 data_path=None,
                 embedding_file=None,
                 n_hidden=128,
                 batch_size=64,
                 epochs=10,
                 embedding_dim=300,
                 train=False):
        """
        初始化
Пример #2
0
# coding:utf-8
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn import svm
import pickle
import pandas as pd
from nlp.utils.clean_text import clean_en_text
from nlp.utils.basic_log import Log

# Module-level log object created from the project's Log helper.
# NOTE(review): the string 'info' is presumably interpreted by Log as the
# INFO level — confirm against nlp.utils.basic_log.
log = Log('info')


class SVMClassifier(object):
    """
    这个类,是用svm对文本进行分类.
    1. 用TF-IDF计算权重值
    2. 用卡方检验获取特征
    3. 用SVM进行分类训练
    """
    def __init__(self, model_file, train_path=None, train=False):
        """
        初始化参数
        :param train_path: 训练路径:数据以“x##y”的格式分隔,x为分好词的数据,y为数据标签
        :param model_file: 模型保存路径
        """
        self.model_file = model_file
        # 先读取训练好的models,如果读取不到,则重新训练
        if not train:
            self.tf_idf_model, self.chi_model, self.clf_model = self.read_model(
            )