def data_process(self, data, model_data_file):
    '''
    @description: Check whether each customer utterance contains a business
                  keyword; label it 1 if so, otherwise 0, and write it in the
                  format fastText expects:
                  "__label__" + label + "\t" + content + "\n"
    @param {type} data: DataFrame holding the customer utterances ('custom' column)
                  model_data_file: path where the model training data is saved
    @return: None
    '''
    logging.info('Processing data: %s.' % model_data_file)
    data['custom'] = data['custom'].fillna('')
    examples = []
    for sentence in data['custom'].values:
        if sentence != '':
            # Label 1 if any business keyword occurs in the raw sentence.
            label = 1 if any(kw in sentence for kw in self.keywords) else 0
            examples.append('\t'.join(['__label__%s' % label, clean(sentence)]))
    with open(model_data_file, 'w', encoding='utf-8') as f:
        for text in examples:
            f.write(text + '\n')
    logging.info('Processing data, finished!')
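# A minimal, self-contained sketch of the "__label__<label>\t<text>" format the
# method above emits for fastText's supervised trainer. All names and data here
# are illustrative, not from the source.
keywords = ['refund', 'invoice']
sentences = ['I need an invoice please', 'hello there']
with open('fast_train.txt', 'w', encoding='utf-8') as f:
    for s in sentences:
        label = 1 if any(kw in s for kw in keywords) else 0
        f.write('__label__%s\t%s\n' % (label, s))
# fast_train.txt now contains:
#   __label__1	I need an invoice please
#   __label__0	hello there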
def read_data(file_path):
    '''
    @description: Read the raw dialogue data and clean it.
    @param {type} file_path: path to the data file
    @return: DataFrame of training samples with a cleaned 'clean' column.
    '''
    data = read_file(file_path, is_train=True)
    data = pd.DataFrame(data, columns=['session_id', 'role', 'content'])
    data['clean'] = data['content'].apply(clean)
    return data
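# Illustrative sketch of the frame read_data produces (column names come from
# the source; the rows and the session id are invented for illustration).
import pandas as pd

rows = [('session_1', 'custom', 'How do I get a refund?'),
        ('session_1', 'assistance', 'You can apply for one in the app.')]
example = pd.DataFrame(rows, columns=['session_id', 'role', 'content'])
# read_data then adds example['clean'] = example['content'].apply(clean)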
def predict(self, text):
    '''
    @description: Predict the label of a piece of text.
    @param {type} text: input text
    @return: label, score
    '''
    logging.info('Predicting.')
    # Filter conversation noise first, then normalise with clean.
    clean_text = clean(filter_content(text))
    logging.info('text: %s' % text)
    logging.info('clean text: %s' % clean_text)
    start_time = time.time()
    label, score = self.fast.predict(clean_text)
    logging.info('used time: {:.4f}s'.format(time.time() - start_time))
    return label, score
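# Sketch of how self.fast.predict's output can be unpacked, assuming self.fast
# is a fastText model (fastText's predict returns a tuple of label strings and
# a numpy array of probabilities). 'model.bin' is a hypothetical path.
import fasttext

model = fasttext.load_model('model.bin')
labels, scores = model.predict('some cleaned text')
label = int(labels[0].replace('__label__', ''))  # e.g. '__label__1' -> 1
score = float(scores[0])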
def search(self, text, k=5):
    '''
    @description: Retrieve the nearest neighbours of a sentence via HNSW.
    @param {type} text: query sentence
                  k: number of results to return
    @return: DataFrame containing the customer input, assistance response
             and the distance to the query.
    '''
    test_vec = wam(clean(text), self.w2v_model)
    # faiss expects a 2-D float32 array of shape (n_queries, dim).
    test_vec = test_vec.reshape(1, -1).astype('float32')
    D, I = self.index.search(test_vec, k)  # D: distances, I: row indices
    logging.info('D: {}'.format(D))
    logging.info('I: {}'.format(I))
    return pd.concat(
        (self.data.iloc[I[0]]['custom'].reset_index(),
         self.data.iloc[I[0]]['assistance'].reset_index(drop=True),
         pd.DataFrame(D.reshape(-1, 1), columns=['q_distance'])),
        axis=1)
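# A minimal sketch of building the HNSW index that search() queries. The faiss
# calls are the library's real API; the dimensionality and vectors are made up.
import faiss
import numpy as np

dim = 300                                   # assumed w2v vector size
vectors = np.random.rand(1000, dim).astype('float32')
index = faiss.IndexHNSWFlat(dim, 64)        # 64 = HNSW neighbour parameter M
index.add(vectors)                          # faiss only accepts float32
D, I = index.search(vectors[:1], 5)         # 5 nearest: distances and indices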
def load_data(self, data_path):
    '''
    @description: Read the data and build sentence vectors.
    @param {type} data_path: path to the question-answer pair data
    @return: DataFrame containing the sentence vectors.
    '''
    pkl_path = data_path.replace('.csv', '_for_hnsw.pkl')
    if os.path.exists(pkl_path):
        # Reuse the cached DataFrame with precomputed sentence vectors.
        logging.info('Reading data from %s' % pkl_path)
        data = pd.read_pickle(pkl_path)
        logging.info('data: %s' % data.head(5))
    else:
        logging.info('Reading data from %s' % data_path)
        data = pd.read_csv(data_path, header=0)
        # Encode each cleaned customer sentence as a word-averaged vector.
        data['custom_vec'] = data['custom'].progress_apply(
            lambda s: wam(clean(s), self.w2v_model))
        data = data.dropna()
        logging.info('data: %s' % data.head(5))
        data.to_pickle(pkl_path)
    return data
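# A sketch of what wam ("word average model") presumably computes: the mean of
# the word2vec vectors of a sentence's tokens. This reimplementation is an
# assumption inferred from how the vectors are used above, not the source code.
import numpy as np

def wam_sketch(sentence, w2v_model):
    tokens = sentence.split()
    vecs = [w2v_model.wv[t] for t in tokens if t in w2v_model.wv]
    if not vecs:
        return None  # rows with missing vectors are dropped later by dropna()
    return np.mean(vecs, axis=0)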
# %% [markdown]
# ## Final Preprocessed Data

# %%
train_x[0]

# %%
X_train[0]

# %% [markdown]
# ### Clean the data

# %%
%load_ext autoreload
%autoreload 2
from utils.preprocessing import clean

clean_train_x = clean(train_x)
clean_valid_x = clean(valid_x)
clean_test_x = clean(test_x)

# %% [markdown]
# ### Stopwords

# %%
%load_ext autoreload
%autoreload 2
from utils.preprocessing import filter_stop_words

filtered_train_x = filter_stop_words(clean_train_x)
filtered_valid_x = filter_stop_words(clean_valid_x)
filtered_test_x = filter_stop_words(clean_test_x)

# %%
tokenizer = Tokenizer(num_words=max_words)
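# %% [markdown]
# A sketch of the step that usually follows: fitting the tokenizer and padding
# the sequences. It assumes `Tokenizer` is Keras's and that `filtered_train_x`
# is a list of strings; `maxlen=100` is illustrative, not from the source.

# %%
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer.fit_on_texts(filtered_train_x)
train_seqs = tokenizer.texts_to_sequences(filtered_train_x)
X_train_padded = pad_sequences(train_seqs, maxlen=100)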