Exemplo n.º 1
0
        def _create_feed_dict_from_features(self, features, mode='train'):
            """Build a feed dict from a batch of already-encoded features.

            Every row of ``features['s1']`` has its trailing ``<pad>`` ids
            removed, is cut to ``bert_max_length - 2`` ids and is wrapped as
            ``[CLS] + ids + [SEP]``. A list of ones (input mask) and a list of
            zeros (segment ids) of the same length is built per row, and all
            three are padded again with ``utils.pad_sequences``.

            Args:
                features: mapping providing ``'s1'`` (an object with
                    ``.tolist()`` yielding one id list per example) and
                    ``'target'``.
                mode: ``'train'`` feeds ``conf.dropout_rate`` and adds the
                    ``'num'`` entry; any other value feeds a dropout of 0.

            Returns:
                dict mapping ``self.s1``, ``self.s1_input_mask``,
                ``self.s1_segment_ids``, ``self.target`` and
                ``self.dropout_rate`` to their batch values, plus the string
                key ``'num'`` (batch size) in train mode.
            """
            # Loop-invariant vocabulary lookups.
            pad_id = word2id['<pad>']
            cls_id = word2id['[CLS]']
            sep_id = word2id['[SEP]']
            body_len = bert_max_length - 2  # room left once [CLS]/[SEP] are added

            token_rows = []
            mask_rows = []
            segment_rows = []
            for row in features['s1'].tolist():  # iterate over the batch
                # Strip trailing padding. The `end > 0` bound keeps an
                # all-pad (or empty) row from walking off the front of the
                # list, which previously raised IndexError.
                end = len(row)
                while end > 0 and row[end - 1] == pad_id:
                    end -= 1

                tokens = [cls_id] + row[:end][:body_len] + [sep_id]
                token_rows.append(tokens)
                mask_rows.append([1] * len(tokens))
                segment_rows.append([0] * len(tokens))

            feed_dict = {
                self.s1: utils.pad_sequences(token_rows, padding='post'),
                self.s1_input_mask: utils.pad_sequences(mask_rows, padding='post'),
                self.s1_segment_ids: utils.pad_sequences(segment_rows, padding='post'),
                self.target: features['target'],
            }
            if mode == 'train':
                # NOTE(review): 'num' is a plain string key, not a placeholder;
                # presumably the caller removes it before running the graph.
                feed_dict['num'] = len(features['s1'])
            feed_dict[self.dropout_rate] = conf.dropout_rate if mode == 'train' else 0.
            return feed_dict
Exemplo n.º 2
0
        def _create_feed_dict_from_raw(self, batch_s1, batch_y, token2id_dct, mode='infer'):
            """Build a feed dict from raw sentences (and labels).

            Each sentence is encoded with ``self.sent2ids``, cut to
            ``bert_max_length - 2`` ids and wrapped as ``[CLS] + ids + [SEP]``;
            a matching all-ones mask and all-zeros segment list are created,
            and everything is padded with ``utils.pad_sequences``.

            Args:
                batch_s1: iterable of raw sentences.
                batch_y: iterable of raw labels; only read for 'train'/'dev'.
                token2id_dct: mapping holding ``'word2id'`` and ``'label2id'``.
                mode: ``'infer'``, ``'train'`` or ``'dev'``.

            Returns:
                The feed dict; in 'train'/'dev' it also carries ``self.target``.

            Raises:
                ValueError: if ``mode`` is not one of the supported values.
            """
            word2id = token2id_dct['word2id']
            label2id = token2id_dct['label2id']

            # Encode every sentence first, then add the BERT special tokens.
            token_rows = []
            for ids in [self.sent2ids(sent, word2id) for sent in batch_s1]:
                token_rows.append(
                    [word2id['[CLS]']] + ids[:bert_max_length - 2] + [word2id['[SEP]']])
            mask_rows = [[1] * len(row) for row in token_rows]
            segment_rows = [[0] * len(row) for row in token_rows]

            feed_dict = {
                self.s1: utils.pad_sequences(token_rows, padding='post'),
                self.s1_input_mask: utils.pad_sequences(mask_rows, padding='post'),
                self.s1_segment_ids: utils.pad_sequences(segment_rows, padding='post'),
            }
            # Dropout is only active while training.
            if mode == 'train':
                feed_dict[self.dropout_rate] = conf.dropout_rate
            else:
                feed_dict[self.dropout_rate] = 0.

            if mode == 'infer':
                return feed_dict
            if mode not in ('train', 'dev'):
                raise ValueError(f'mode type {mode} not support')

            feed_dict[self.target] = [self.label2id(y, label2id) for y in batch_y]
            return feed_dict
Exemplo n.º 3
0
    def create_feed_dict_from_raw(self,
                                  batch_s1,
                                  batch_ner_label,
                                  token2id_dct,
                                  mode='infer'):
        """Build a feed dict from raw sentences and, optionally, NER labels.

        Args:
            batch_s1: iterable of raw sentences, encoded with ``self.sent2ids``.
            batch_ner_label: iterable of raw label sequences, encoded with
                ``self.bmeo2ids``; must be truthy for 'train'/'dev'.
            token2id_dct: mapping holding ``'char2id'`` and ``'bmeo2id'``.
            mode: ``'infer'``, ``'train'`` or ``'dev'``.

        Returns:
            dict with padded ``self.s1`` and ``self.dropout_rate``; in
            'train'/'dev' also padded ``self.ner_label``.

        Raises:
            ValueError: if ``mode`` is not one of the supported values.
        """
        char2id = token2id_dct['char2id']
        bmeo2id = token2id_dct['bmeo2id']

        encoded_sents = [self.sent2ids(sent, char2id) for sent in batch_s1]
        feed_dict = {self.s1: utils.pad_sequences(encoded_sents, padding='post')}

        # Dropout is only active while training.
        if mode == 'train':
            feed_dict[self.dropout_rate] = conf.dropout_rate
        else:
            feed_dict[self.dropout_rate] = 0.

        if mode == 'infer':
            return feed_dict
        if mode not in ('train', 'dev'):
            raise ValueError(f'mode type {mode} not support')

        assert batch_ner_label, 'batch_ner_label should not be None when mode is train or dev'
        encoded_labels = [self.bmeo2ids(tags, bmeo2id) for tags in batch_ner_label]
        feed_dict[self.ner_label] = utils.pad_sequences(encoded_labels, padding='post')
        return feed_dict
Exemplo n.º 4
0
 def create_feed_dict_from_data(self, data, ids, mode='train'):
     """Assemble a feed dict for the examples selected by ``ids``.

     ``data`` already holds id-encoded values; each field stores the full
     dataset for that field, and ``ids`` picks the rows of this batch.

     Args:
         data: mapping with ``'s1'`` and ``'s2'`` indexable collections.
         ids: indices of the examples that form the batch.
         mode: ``'train'`` feeds ``conf.dropout_rate`` and adds ``'num'``;
             any other value feeds a dropout of 0.

     Returns:
         dict with padded ``self.s1`` / ``self.s2``, ``self.dropout_rate``
         and, in train mode, the string key ``'num'`` (batch size).
     """
     s1_rows = [data['s1'][idx] for idx in ids]
     s2_rows = [data['s2'][idx] for idx in ids]

     feed_dict = {}
     feed_dict[self.s1] = utils.pad_sequences(s1_rows, padding='post')
     feed_dict[self.s2] = utils.pad_sequences(s2_rows, padding='post')

     if mode == 'train':
         feed_dict['num'] = len(s1_rows)
         feed_dict[self.dropout_rate] = conf.dropout_rate
     else:
         feed_dict[self.dropout_rate] = 0.
     return feed_dict
Exemplo n.º 5
0
 def create_feed_dict_from_data(self, data, ids, mode='train'):
     """Assemble a feed dict (inputs and target) for the rows in ``ids``.

     ``data`` already holds id-encoded values; each field stores the full
     dataset for that field. A field is passed through
     ``utils.pad_sequences`` only when its rows do not all share one length.

     Args:
         data: mapping with ``'s1'``, ``'s2'`` and ``'target'`` collections.
         ids: indices of the examples that form the batch.
         mode: ``'train'`` feeds ``conf.dropout_rate`` and adds ``'num'``;
             any other value feeds a dropout of 0.

     Returns:
         dict with ``self.s1``, ``self.s2``, ``self.target``,
         ``self.dropout_rate`` and, in train mode, the string key ``'num'``.
     """
     s1_rows = [data['s1'][idx] for idx in ids]
     s2_rows = [data['s2'][idx] for idx in ids]

     # Pad only when the row lengths differ.
     if len({len(row) for row in s1_rows}) != 1:
         s1_rows = utils.pad_sequences(s1_rows, padding='post')
     if len({len(row) for row in s2_rows}) != 1:
         s2_rows = utils.pad_sequences(s2_rows, padding='post')

     feed_dict = {}
     feed_dict[self.s1] = s1_rows
     feed_dict[self.s2] = s2_rows
     feed_dict[self.target] = [data['target'][idx] for idx in ids]

     if mode == 'train':
         feed_dict['num'] = len(s1_rows)
         feed_dict[self.dropout_rate] = conf.dropout_rate
     else:
         feed_dict[self.dropout_rate] = 0.
     return feed_dict
Exemplo n.º 6
0
        def _create_feed_dict_from_raw(self,
                                       batch_s1,
                                       batch_ner_label,
                                       token2id_dct,
                                       mode='infer'):
            """Build a BERT-style feed dict from raw sentences and NER labels.

            Sentences are encoded with ``self.sent2ids``, cut to
            ``bert_max_length - 2`` ids and wrapped as ``[CLS] + ids + [SEP]``.
            In 'train'/'dev' the labels are encoded with ``self.bmeo2ids``,
            cut the same way and wrapped in ``'O'`` ids so they line up with
            the special tokens.

            Args:
                batch_s1: iterable of raw sentences.
                batch_ner_label: iterable of raw label sequences; only read
                    for 'train'/'dev'.
                token2id_dct: mapping holding ``'char2id'`` and ``'bmeo2id'``.
                mode: ``'infer'``, ``'train'`` or ``'dev'``.

            Returns:
                dict with padded ``self.s1``, ``self.s1_input_mask``,
                ``self.s1_segment_ids`` and ``self.dropout_rate``; in
                'train'/'dev' also padded ``self.ner_label``.

            Raises:
                ValueError: if ``mode`` is not one of the supported values.
            """
            char2id = token2id_dct['char2id']
            bmeo2id = token2id_dct['bmeo2id']

            # Encode every sentence first, then add the BERT special tokens.
            token_rows = []
            for ids in [self.sent2ids(sent, char2id) for sent in batch_s1]:
                token_rows.append(
                    [char2id['[CLS]']] + ids[:bert_max_length - 2] + [char2id['[SEP]']])
            mask_rows = [[1] * len(row) for row in token_rows]
            segment_rows = [[0] * len(row) for row in token_rows]

            feed_dict = {
                self.s1: utils.pad_sequences(token_rows, padding='post'),
                self.s1_input_mask: utils.pad_sequences(mask_rows, padding='post'),
                self.s1_segment_ids: utils.pad_sequences(segment_rows, padding='post'),
            }
            # Dropout is only active while training.
            if mode == 'train':
                feed_dict[self.dropout_rate] = conf.dropout_rate
            else:
                feed_dict[self.dropout_rate] = 0.

            if mode == 'infer':
                return feed_dict
            if mode not in ('train', 'dev'):
                raise ValueError(f'mode type {mode} not support')

            encoded_labels = [self.bmeo2ids(tags, bmeo2id) for tags in batch_ner_label]
            # 'O' on both ends mirrors the [CLS] / [SEP] positions.
            wrapped_labels = []
            for tags in encoded_labels:
                wrapped_labels.append(
                    [bmeo2id['O']] + tags[:bert_max_length - 2] + [bmeo2id['O']])
            feed_dict[self.ner_label] = utils.pad_sequences(wrapped_labels, padding='post')
            return feed_dict
Exemplo n.º 7
0
        def _create_feed_dict_from_features(self, features, mode='train'):
            """Build a BERT-style NER feed dict from already-encoded features.

            For every example the trailing ``<pad>`` ids of ``s1`` are removed
            and the label row is cut to the same length. Both are then cut to
            ``bert_max_length - 2`` entries and wrapped: tokens as
            ``[CLS] + ids + [SEP]``, labels as ``O + labels + O``. A list of
            ones (input mask) and zeros (segment ids) is built per row, and
            all four are padded again with ``utils.pad_sequences``.

            Args:
                features: mapping providing ``'s1'`` and ``'ner_label'``
                    (objects with ``.tolist()`` yielding one list per example).
                mode: ``'train'`` feeds ``conf.dropout_rate`` and adds the
                    ``'num'`` entry; any other value feeds a dropout of 0.

            Returns:
                dict mapping ``self.s1``, ``self.s1_input_mask``,
                ``self.s1_segment_ids``, ``self.ner_label`` and
                ``self.dropout_rate`` to their batch values, plus the string
                key ``'num'`` (batch size) in train mode.
            """
            # Loop-invariant vocabulary lookups.
            pad_id = char2id['<pad>']
            cls_id = char2id['[CLS]']
            sep_id = char2id['[SEP]']
            outside_id = bmeo2id['O']
            body_len = bert_max_length - 2  # room left once [CLS]/[SEP] are added

            token_rows = []
            label_rows = []
            mask_rows = []
            segment_rows = []
            for tokens, labels in zip(features['s1'].tolist(),
                                      features['ner_label'].tolist()):  # iterate over the batch
                # Strip trailing padding. The `end > 0` bound keeps an
                # all-pad (or empty) row from walking off the front of the
                # list, which previously raised IndexError.
                end = len(tokens)
                while end > 0 and tokens[end - 1] == pad_id:
                    end -= 1

                tokens = [cls_id] + tokens[:end][:body_len] + [sep_id]
                # Labels are trimmed with the token length so both stay aligned.
                labels = [outside_id] + labels[:end][:body_len] + [outside_id]

                token_rows.append(tokens)
                label_rows.append(labels)
                mask_rows.append([1] * len(tokens))
                segment_rows.append([0] * len(tokens))

            feed_dict = {
                self.s1: utils.pad_sequences(token_rows, padding='post'),
                self.s1_input_mask: utils.pad_sequences(mask_rows, padding='post'),
                self.s1_segment_ids: utils.pad_sequences(segment_rows, padding='post'),
                self.ner_label: utils.pad_sequences(label_rows, padding='post'),
            }
            if mode == 'train':
                # NOTE(review): 'num' is a plain string key, not a placeholder;
                # presumably the caller removes it before running the graph.
                feed_dict['num'] = len(features['s1'])
            feed_dict[self.dropout_rate] = conf.dropout_rate if mode == 'train' else 0.
            return feed_dict
Exemplo n.º 8
0
    def create_feed_dict_from_raw(self, batch_multi_s1, batch_s2, token2id_dct, mode='infer'):
        """Build a feed dict from raw multi-turn contexts and replies.

        Each entry of ``batch_multi_s1`` is a single string whose turns are
        joined by ``'$$$'``; it is split and encoded with
        ``self.multi_sent2ids``.

        Args:
            batch_multi_s1: iterable of ``'$$$'``-joined multi-turn strings.
            batch_s2: iterable of raw sentences, encoded with
                ``self.sent2ids``; must be truthy for 'train'/'dev'.
            token2id_dct: mapping holding ``'word2id'``.
            mode: ``'infer'``, ``'train'`` or ``'dev'``.

        Returns:
            dict with padded ``self.multi_s1`` and ``self.dropout_rate``; in
            'train'/'dev' also padded ``self.s2``.

        Raises:
            ValueError: if ``mode`` is not one of the supported values.
        """
        word2id = token2id_dct['word2id']

        encoded_contexts = []
        for joined_turns in batch_multi_s1:
            turns = joined_turns.split('$$$')
            encoded_contexts.append(self.multi_sent2ids(turns, word2id))

        feed_dict = {self.multi_s1: utils.pad_sequences(encoded_contexts, padding='post')}

        # Dropout is only active while training.
        if mode == 'train':
            feed_dict[self.dropout_rate] = conf.dropout_rate
        else:
            feed_dict[self.dropout_rate] = 0.

        if mode == 'infer':
            return feed_dict
        if mode not in ('train', 'dev'):
            raise ValueError(f'mode type {mode} not support')

        assert batch_s2, 'batch_s2 should not be None when mode is train or dev'
        encoded_replies = [self.sent2ids(reply, word2id) for reply in batch_s2]
        feed_dict[self.s2] = utils.pad_sequences(encoded_replies, padding='post')
        return feed_dict
Exemplo n.º 9
0
 def multi_sent2ids(cls, multi_sent, word2id, max_word_len=None, max_turn=None):
     """Encode a multi-turn dialogue into a fixed-turn matrix of ids.

     Rationale carried over from the original author's notes: turn count
     and sentence length cannot both be padded lazily at batch time,
     because tfrecord storage flattens values to one dimension and a
     variable ``[turn, len]`` layout could not be restored afterwards. At
     least one of the two must be fixed and the data padded to a
     ``turn * len`` matrix so a later reshape can recover it. Here the
     number of turns is the fixed one.

     Args:
         multi_sent: list of sentences; each sentence is already tokenized
             and space-separated.
         word2id: vocabulary passed through to ``cls.sent2ids``.
         max_word_len: forwarded to ``cls.sent2ids``.
         max_turn: number of turns to keep; falls back to ``conf.max_turn``
             when ``None``.

     Returns:
         Result of ``utils.pad_sequences`` over the per-turn id lists,
         i.e. ``[turn, len]``.
     """
     encoded_turns = [
         cls.sent2ids(sent, word2id, max_word_len=max_word_len)
         for sent in multi_sent
     ]

     # The turn count has to be fixed so sentence lengths can be padded.
     turn_limit = conf.max_turn if max_turn is None else max_turn

     # Keep only the last `turn_limit` turns when there are too many.
     # NOTE(review): a limit of 0 makes this slice `[-0:]`, which keeps
     # every turn rather than none.
     kept_turns = encoded_turns[-turn_limit:]

     # Too few turns: append empty turns at the end.
     missing = turn_limit - len(kept_turns)
     kept_turns = kept_turns + [[]] * missing

     # Pad every turn to a common sentence length.
     return utils.pad_sequences(kept_turns, padding='post', value=0)
Exemplo n.º 10
0
    def create_feed_dict_from_raw(self, batch_s1, batch_y, token2id_dct, mode='infer'):
        """Build a feed dict from raw sentences and, optionally, labels.

        Args:
            batch_s1: iterable of raw sentences, encoded with ``self.sent2ids``.
            batch_y: iterable of raw labels, encoded with ``self.label2id``;
                must be truthy for 'train'/'dev'.
            token2id_dct: mapping holding ``'word2id'`` and ``'label2id'``.
            mode: ``'infer'``, ``'train'`` or ``'dev'``.

        Returns:
            dict with padded ``self.s1`` and ``self.dropout_rate``; in
            'train'/'dev' also ``self.target``.

        Raises:
            ValueError: if ``mode`` is not one of the supported values.
        """
        word2id = token2id_dct['word2id']
        label2id = token2id_dct['label2id']

        encoded_sents = [self.sent2ids(sent, word2id) for sent in batch_s1]
        feed_dict = {self.s1: utils.pad_sequences(encoded_sents, padding='post')}

        # Dropout is only active while training.
        if mode == 'train':
            feed_dict[self.dropout_rate] = conf.dropout_rate
        else:
            feed_dict[self.dropout_rate] = 0.

        if mode == 'infer':
            return feed_dict
        if mode not in ('train', 'dev'):
            raise ValueError(f'mode type {mode} not support')

        assert batch_y, 'batch_y should not be None when mode is train or dev'
        feed_dict[self.target] = [self.label2id(label, label2id) for label in batch_y]
        return feed_dict