Пример #1
0
 def read_instance(self, seg_lists, labels, mode='train'):
     """Turn segmented sentences and their labels into text and id records.

     :param seg_lists: iterable of sentences, each a sequence of words.
     :param labels: iterable of labels, paired with ``seg_lists`` via ``zip``.
     :param mode: forwarded as the second argument of
         ``self.label_alphabet.get_index``.
     :return: ``(texts, ids)`` where each ``texts`` entry is
         ``[seg_list, char_list, label]`` and each ``ids`` entry is
         ``[word_id_list, char_id_list, label_id]``.
     """
     texts = []
     ids = []
     for seg_list, label in zip(seg_lists, labels):
         # The label is resolved first, then each word followed by its
         # characters; the order of alphabet lookups is kept as-is.
         label_id = self.label_alphabet.get_index(label, mode)
         sentence_word_ids = []
         sentence_chars = []
         sentence_char_ids = []
         for word in seg_list:
             sentence_word_ids.append(
                 self.word_alphabet.get_index(normalize_word(word)))
             # A word flagged by specific_word() is treated as one
             # indivisible "character"; any other word is split up.
             units = [word] if self.specific_word(word) else list(word)
             unit_ids = [
                 self.char_alphabet.get_index(normalize_word(unit))
                 for unit in units
             ]
             sentence_chars.append(units)
             sentence_char_ids.append(unit_ids)
         texts.append([seg_list, sentence_chars, label])
         ids.append([sentence_word_ids, sentence_char_ids, label_id])
     return texts, ids
Пример #2
0
    def inference(self, text):
        """Find the stored training text most similar to ``text``.

        The text is segmented, passed through synonym replacement, converted
        to word and character ids, encoded by ``self.model`` and
        L2-normalised; the resulting vector is handed to ``self.search``.

        :param text: raw input text.
        :return: ``(score, matched_text, label)``. ``(1, None, None)`` is
            returned when segmentation yields nothing, and ``(0, None, None)``
            when the search result fails the ``D > 0 and I > 0`` check.
            A label equal to the string ``'None'`` is returned as ``None``.
        """
        seg_list = self.data.segment([text])[0]
        seg_list = self.synonyms_replace(seg_list)  # synonym replacement
        if len(seg_list) == 0:
            return 1, None, None

        char_list, char_id_list, word_id_list = [], [], []
        for word in seg_list:
            word_id_list.append(
                self.data.word_alphabet.get_index(normalize_word(word)))
            # A word flagged by specific_word() is kept whole as a single
            # "character"; any other word is split into its characters.
            chars = [word] if self.data.specific_word(word) else list(word)
            char_ids = [
                self.data.char_alphabet.get_index(normalize_word(char))
                for char in chars
            ]
            char_list.append(chars)
            char_id_list.append(char_ids)

        ids = [[word_id_list, char_id_list]]
        (batch_word, batch_wordlen, batch_wordrecover, batch_char,
         batch_charlen, batch_charrecover, mask) = \
            predict_batchfy_classification_with_label(
                ids, self.model.configs['gpu'], if_train=False)
        pred_represent = self.model(batch_word, batch_wordlen, batch_char,
                                    batch_charlen, batch_charrecover, mask)
        pred_represent = pred_represent.data.numpy()

        # numpy replacement for faiss.normalize_L2: scale each vector to unit
        # L2 length. The norm is taken along the last axis explicitly; without
        # an axis, ord=2 on a 2-D array is the spectral norm, which matches
        # the row norm only when there is exactly one row.
        norms = np.linalg.norm(pred_represent, ord=2, axis=-1, keepdims=True)
        norms[norms == 0] = 1  # leave all-zero vectors unchanged, avoid NaN
        pred_represent = pred_represent / norms
        pred_represent = pred_represent.tolist()[0]

        faiss_start = datetime.datetime.now()
        D, I = self.search(self.stub, pred_represent)
        logger.info('Faiss search costs: %s',
                    (datetime.datetime.now() - faiss_start).total_seconds())

        # NOTE(review): I is indexed as I[0][0] below while D is returned
        # as-is, so this check depends on how self.search shapes its results
        # -- confirm against self.search.
        if D > 0 and I > 0:
            max_id = I[0][0]
            max_score = D
            max_similar_text = self.train_texts[max_id]
            pred_text = ''.join(max_similar_text[0])
            pred_label = max_similar_text[-1]
            if pred_label == 'None':
                pred_label = None
            return max_score, pred_text, pred_label
        else:
            # The faiss call failed: return the default score and label.
            return 0, None, None
Пример #3
0
 def inference(self, text, text_list, label_list):
     """Match ``text`` against a list of scene texts and return the best hit.

     The scene texts are read through ``self.data.read_scene_text_list``,
     stored on ``self.data`` and encoded with ``get_represents``; the input
     text is then segmented, converted to ids, encoded by ``self.model`` and
     compared via ``self.cal_similarity``.

     :param text: raw input text.
     :param text_list: scene texts, forwarded to ``read_scene_text_list``.
     :param label_list: scene labels, forwarded to ``read_scene_text_list``.
     :return: ``(max_score, pred_text, pred_label)``, i.e. the confidence, the
         closest text and the closest label; ``(None, None, None)`` when
         segmentation yields nothing. A label equal to the string ``'None'``
         is returned as ``None``.
     """
     scene_texts, scene_ids = self.data.read_scene_text_list(
         text_list, label_list)
     self.data.scene_texts = scene_texts
     self.data.scene_ids = scene_ids
     self.scene_texts, scene_represents, _ = get_represents(
         self.data, self.model, 'scene', self.model.configs)

     # Process the user input text passed in for this call.
     seg_list = self.data.segment([text])[0]
     if len(seg_list) == 0:
         return None, None, None

     word_ids = []
     char_ids_per_word = []
     for word in seg_list:
         word_ids.append(
             self.data.word_alphabet.get_index(normalize_word(word)))
         # A word flagged by specific_word() is kept whole; any other word is
         # split into its characters.
         units = [word] if self.data.specific_word(word) else list(word)
         char_ids_per_word.append([
             self.data.char_alphabet.get_index(normalize_word(unit))
             for unit in units
         ])

     ids = [[word_ids, char_ids_per_word]]
     (batch_word, batch_wordlen, _, batch_char, batch_charlen,
      batch_charrecover, mask) = predict_batchfy_classification_with_label(
          ids, self.model.configs['gpu'], if_train=False)
     pred_represent = self.model(batch_word, batch_wordlen, batch_char,
                                 batch_charlen, batch_charrecover, mask)
     max_score, max_similar_text = self.cal_similarity(
         pred_represent, scene_represents)

     pred_text = ''.join(max_similar_text[0])
     pred_label = max_similar_text[-1]
     # Confidence, closest text, closest label.
     return max_score, pred_text, (None if pred_label == 'None'
                                   else pred_label)
Пример #4
0
 def read_scene_text_list(self, text_list):
     """Split every sentence into characters and look up their ids.

     :param text_list: iterable of sentences; each sentence is iterated
         element by element.
     :return: ``(chars, ids)`` -- two parallel lists with one entry per
         sentence: the list of its characters and the list of the ids
         returned by ``self.char_alphabet.get_index`` for the normalized
         characters.
     """
     all_chars = []
     all_ids = []
     for sentence in text_list:
         all_chars.append(list(sentence))
         sentence_ids = []
         for symbol in sentence:
             sentence_ids.append(
                 self.char_alphabet.get_index(normalize_word(symbol)))
         all_ids.append(sentence_ids)
     return all_chars, all_ids
Пример #5
0
 def build_alphabet(self):
     """Populate the word, character and label alphabets from the corpus.

     Iterates ``self.seg_lists`` paired with ``self.labels``, adds every
     normalized word, every normalized character (or whole "specific" word)
     and every label to the matching alphabet, records the three alphabet
     sizes on ``self`` and finally calls ``self.fix_alphabet()``.

     :return: one list per sentence holding its un-normalized characters,
         with "specific" words kept whole.
     """
     char_lists = []
     for seg_list, label in zip(self.seg_lists, self.labels):
         char_list = []
         for word in seg_list:
             self.word_alphabet.add(normalize_word(word))
             if self.specific_word(word):
                 # Store the normalized form: id lookups (e.g. read_instance)
                 # query char_alphabet with normalize_word(word), so an
                 # un-normalized entry could never be found whenever
                 # normalization changes the word.
                 self.char_alphabet.add(normalize_word(word))
                 char_list.append(word)
             else:
                 for char in word:
                     char_list.append(char)
                     self.char_alphabet.add(normalize_word(char))
         char_lists.append(char_list)
         self.label_alphabet.add(label)
     self.char_alphabet_size = self.char_alphabet.size()
     self.word_alphabet_size = self.word_alphabet.size()
     self.label_alphabet_size = self.label_alphabet.size()
     self.fix_alphabet()
     return char_lists
Пример #6
0
 def inference_for_scene_with_glove(self, text, text_list, label_list):
     """Match ``text`` against scene texts using weighted char representations.

     :param text: input text, processed character by character.
     :param text_list: scene texts to compare against.
     :param label_list: scene labels, forwarded to ``self.cal_similarity``.
     :return: ``(max_score, pred_text, pred_label)``, i.e. the confidence, the
         closest text and the closest label; ``(1, None, None)`` when ``text``
         has no characters. A label equal to the string ``'None'`` is
         returned as ``None``.
     """
     # Pre-process the scene texts, compute their weights, then build the
     # sentence representations under those weights.
     scene_chars, scene_ids = self.data.read_scene_text_list(text_list)
     scene_weights = self.cal_char_weight(scene_chars, scene_ids)
     scene_represents = self.cal_scene_represents(scene_ids, scene_weights)

     # Process the current input text.
     input_chars = list(text)
     input_ids = [
         self.data.char_alphabet.get_index(normalize_word(symbol))
         for symbol in input_chars
     ]
     if not input_chars:
         return 1, None, None

     input_weights = self.cal_char_weight([input_chars], [input_ids])
     pred_represent = self.cal_scene_represents([input_ids], input_weights)
     max_score, pred_text, pred_label = self.cal_similarity(
         pred_represent, scene_represents, text_list, label_list)
     # Confidence, closest text, closest label.
     return max_score, pred_text, (None if pred_label == 'None'
                                   else pred_label)