import pandas as pd from gensim.models import KeyedVectors from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard from tensorflow.keras.models import load_model from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, Dense, concatenate, BatchNormalization, \ Dropout from tensorflow.keras.models import Model from tensorflow.keras.preprocessing.sequence import pad_sequences from sklearn.model_selection import train_test_split from nlp.utils.basic_log import Log from nlp.utils.clean_text import clean_to_list from nlp.utils.plot_model_history import plot from nlp.utils.set_stopwords import set_en_stopwords log = Log(logging.INFO) class SiameseSimilarity: def __init__(self, model_path, config_path, data_path=None, embedding_file=None, n_hidden=128, batch_size=64, epochs=10, embedding_dim=300, train=False): """ 初始化
# coding:utf-8 from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.feature_selection import SelectKBest, chi2 from sklearn.model_selection import train_test_split from sklearn import svm import pickle import pandas as pd from nlp.utils.clean_text import clean_en_text from nlp.utils.basic_log import Log log = Log('info') class SVMClassifier(object): """ 这个类,是用svm对文本进行分类. 1. 用TF-IDF计算权重值 2. 用卡方检验获取特征 3. 用SVM进行分类训练 """ def __init__(self, model_file, train_path=None, train=False): """ 初始化参数 :param train_path: 训练路径:数据以“x##y”的格式分隔,x为分好词的数据,y为数据标签 :param model_file: 模型保存路径 """ self.model_file = model_file # 先读取训练好的models,如果读取不到,则重新训练 if not train: self.tf_idf_model, self.chi_model, self.clf_model = self.read_model( )