예제 #1
0
    def _load_data(self, test_size=0.2):
        """Load the question-pair CSVs and encode them as padded index sequences.

        Reads ``train.csv`` and ``test.csv`` from ``self.data_path``, builds a
        vocabulary over the ``question1``/``question2`` columns of both files
        (words found in ``self.stops`` are skipped), replaces every question
        with its list of word indexes, splits the training frame into a
        training and a validation part, and pads all sequences to one length.

        Args:
            test_size: Forwarded to ``train_test_split`` as ``test_size``.

        Returns:
            A tuple ``(x_train, y_train, x_val, y_val, word_index,
            max_length)``. ``x_train`` and ``x_val`` are dicts with the keys
            ``'left'`` (question1) and ``'right'`` (question2) holding the
            padded sequences; ``y_train`` and ``y_val`` are the
            ``is_duplicate`` values; ``word_index`` maps each word to its
            index (index 0 is reserved for ``'<unk>'``); ``max_length`` is
            the length every sequence is padded to.

        Raises:
            AssertionError: If the left/right training inputs differ in shape
                or their count does not match the number of labels.
        """
        log.info('数据预处理...')
        # word -> index and index -> word; slot 0 is reserved for '<unk>'.
        word_index = dict()
        index_word = ['<unk>']
        questions_cols = ['question1', 'question2']

        log.info('加载数据集...')
        train_df = pd.read_csv(os.path.join(self.data_path, 'train.csv'))
        test_df = pd.read_csv(os.path.join(self.data_path, 'test.csv'))
        frames = [train_df, test_df]

        # Longest question, measured by splitting the raw text on spaces.
        # Missing values do not split into a list and are ignored.
        # NOTE(review): this is measured before clean_to_list and stop-word
        # removal, so it may differ from the encoded lengths - confirm.
        max_length = max(
            len(tokens)
            for df in frames
            for col in questions_cols
            for tokens in df[col].str.split(' ')
            if isinstance(tokens, list))

        # Build the vocabulary and convert every question to word indexes.
        for dataset in frames:
            encoded = {col: [] for col in questions_cols}
            # Walk row by row (question1, then question2) so indexes are
            # handed out in the same first-seen order as a row-wise scan.
            for row in zip(*(dataset[col] for col in questions_cols)):
                for question_col, text in zip(questions_cols, row):
                    question_indexes = []
                    for word in clean_to_list(text):
                        if word in self.stops:
                            continue
                        if word not in word_index:
                            word_index[word] = len(index_word)
                            index_word.append(word)
                        question_indexes.append(word_index[word])
                    encoded[question_col].append(question_indexes)
            # Assign each column once. DataFrame.set_value no longer exists
            # in pandas >= 1.0, and per-cell writes are slow. dtype=object
            # keeps each cell a plain Python list.
            for question_col, sequences in encoded.items():
                dataset[question_col] = pd.Series(sequences,
                                                  index=dataset.index,
                                                  dtype=object)

        x = train_df[questions_cols]
        y = train_df['is_duplicate']
        x_train, x_val, y_train, y_val = train_test_split(x,
                                                          y,
                                                          test_size=test_size)

        x_train = {'left': x_train.question1, 'right': x_train.question2}
        x_val = {'left': x_val.question1, 'right': x_val.question2}

        y_train = y_train.values
        y_val = y_val.values

        # Pad every side of both splits to the common length.
        for dataset, side in itertools.product([x_train, x_val],
                                               ['left', 'right']):
            dataset[side] = pad_sequences(dataset[side], maxlen=max_length)

        # Sanity check: both sides of each pair and the labels line up.
        assert x_train['left'].shape == x_train['right'].shape
        assert len(x_train['left']) == len(y_train)
        return x_train, y_train, x_val, y_val, word_index, max_length
예제 #2
0
 def predict(self, text1, text2):
     """Run the model on one or more pairs of texts.

     Each argument is either a single text or a list of texts. When both
     are lists they are paired element by element. When exactly one is a
     list, the single text is paired with every element of that list.

     Args:
         text1: Left-hand text, or a list of them.
         text2: Right-hand text, or a list of them.

     Returns:
         The result of ``self.model.predict`` on the two padded index
         batches.
     """

     def encode(text):
         # Words missing from the vocabulary fall back to index 0.
         # NOTE(review): if the vocabulary was built with stop words
         # removed, they map to 0 here instead of being dropped - confirm.
         return [self.word_index.get(word, 0) for word in clean_to_list(text)]

     first_is_list = isinstance(text1, list)
     second_is_list = isinstance(text2, list)
     if first_is_list and not second_is_list:
         # Repeat the single text rather than iterating its characters.
         text2 = [text2] * len(text1)
     elif second_is_list and not first_is_list:
         text1 = [text1] * len(text2)
     elif not first_is_list:
         # Two single texts form a batch of one pair.
         text1, text2 = [text1], [text2]

     x1 = pad_sequences([encode(text) for text in text1],
                        maxlen=self.max_length)
     x2 = pad_sequences([encode(text) for text in text2],
                        maxlen=self.max_length)
     # The padded index batches are handed to the model as [left, right].
     return self.model.predict([x1, x2])