def _load_data(self, test_size=0.2):
    log.info('Preprocessing data...')
    # word -> index and index -> word mappings; index 0 is reserved for <unk>
    word_index = dict()
    index_word = ['<unk>']
    questions_cols = ['question1', 'question2']

    log.info('Loading datasets...')
    train_data = os.path.join(self.data_path, 'train.csv')
    test_data = os.path.join(self.data_path, 'test.csv')
    train_df = pd.read_csv(train_data)
    test_df = pd.read_csv(test_data)

    # Find the maximum sentence length across both datasets and both columns
    sentences = [
        df[col].str.split(' ')
        for df in [train_df, test_df]
        for col in questions_cols
    ]
    max_length = max(
        [len(s) for ss in sentences for s in ss if isinstance(s, list)])

    # Preprocess: build the vocabulary and convert each question string
    # into a list of word indexes (stop words are skipped)
    for dataset in [train_df, test_df]:
        for index, row in dataset.iterrows():
            for question_col in questions_cols:
                question_indexes = []
                for word in clean_to_list(row[question_col]):
                    if word in self.stops:
                        continue
                    if word not in word_index:
                        word_index[word] = len(index_word)
                        question_indexes.append(len(index_word))
                        index_word.append(word)
                    else:
                        question_indexes.append(word_index[word])
                # DataFrame.set_value was removed in recent pandas; use .at instead
                dataset.at[index, question_col] = question_indexes

    x = train_df[questions_cols]
    y = train_df['is_duplicate']
    x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=test_size)

    x_train = {'left': x_train.question1, 'right': x_train.question2}
    x_val = {'left': x_val.question1, 'right': x_val.question2}
    y_train = y_train.values
    y_val = y_val.values

    # Pad both sides of each question pair to the same length
    for dataset, side in itertools.product([x_train, x_val], ['left', 'right']):
        dataset[side] = pad_sequences(dataset[side], maxlen=max_length)

    # Sanity check: both sides of the pairs and the labels must line up
    assert x_train['left'].shape == x_train['right'].shape
    assert len(x_train['left']) == len(y_train)

    return x_train, y_train, x_val, y_val, word_index, max_length
def predict(self, text1, text2):
    if isinstance(text1, list) or isinstance(text2, list):
        # Batch prediction: convert each text into a list of word indexes
        # (words not in the vocabulary map to index 0, i.e. <unk>)
        x1 = [[
            self.word_index.get(word, 0) for word in clean_to_list(text)
        ] for text in text1]
        x2 = [[
            self.word_index.get(word, 0) for word in clean_to_list(text)
        ] for text in text2]
        x1 = pad_sequences(x1, maxlen=self.max_length)
        x2 = pad_sequences(x2, maxlen=self.max_length)
    else:
        # Single pair: convert both texts into index sequences
        x1 = [
            self.word_index.get(word, 0) for word in clean_to_list(text1)
        ]
        x2 = [
            self.word_index.get(word, 0) for word in clean_to_list(text2)
        ]
        x1 = pad_sequences([x1], maxlen=self.max_length)
        x2 = pad_sequences([x2], maxlen=self.max_length)
    # Word embeddings are looked up inside the model; feed both index sequences
    return self.model.predict([x1, x2])
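# A minimal usage sketch of predict(). The class name `QuestionMatcher` and its
# constructor argument `data_path` are assumptions for illustration only; the
# source does not show the enclosing class definition.
#
#   matcher = QuestionMatcher(data_path='data/')          # hypothetical class/ctor
#
#   # Single question pair -> one similarity score
#   score = matcher.predict('How can I learn Python?',
#                           'What is the best way to learn Python?')
#
#   # Lists of questions -> one score per pair
#   scores = matcher.predict(['How can I learn Python?'],
#                            ['What is the best way to learn Python?'])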