def _create_feed_dict_from_features(self, features, mode='train'):
    """Build a BERT-style feed dict from an id-encoded feature batch.

    For every row of ``features['s1']`` the trailing ``<pad>`` ids are
    removed, the row is truncated to ``bert_max_length - 2`` ids and wrapped
    as ``[CLS] ... [SEP]``. A mask of ones and segment ids of zeros with the
    same length are built per row, and all three lists are padded again with
    ``utils.pad_sequences(..., padding='post')``.

    ``word2id`` and ``bert_max_length`` are not parameters; they are resolved
    from the surrounding scope.

    Args:
        features: Mapping with ``'s1'`` (must expose ``.tolist()`` yielding
            one id list per example) and ``'target'``.
        mode: ``'train'`` uses ``conf.dropout_rate`` and stores the batch size
            under the string key ``'num'``; any other value sets dropout to 0.

    Returns:
        dict mapping ``self.s1``, ``self.s1_input_mask``,
        ``self.s1_segment_ids``, ``self.target`` and ``self.dropout_rate`` to
        their values, plus ``'num'`` in train mode.
    """
    s1 = features['s1'].tolist()
    s1_input_mask = []
    s1_segment_ids = []
    for batch_idx, s1_ in enumerate(s1):  # iterate over the batch
        # Strip trailing padding. The ``i >= 0`` bound stops the scan at the
        # start of the row, so an empty or all-padding row becomes [] instead
        # of running past the front of the list and raising IndexError.
        pad_id = word2id['<pad>']
        i = len(s1_) - 1
        while i >= 0 and s1_[i] == pad_id:
            i -= 1
        s1_ = s1_[:i + 1]
        # [CLS] + tokens + [SEP]; two slots are reserved for the markers.
        s1[batch_idx] = [word2id['[CLS]']] + s1_[:bert_max_length - 2] + [word2id['[SEP]']]
        input_mask_ = [1] * len(s1[batch_idx])
        segment_ids_ = [0] * len(s1[batch_idx])
        s1_input_mask.append(input_mask_)
        s1_segment_ids.append(segment_ids_)
    feed_dict = {
        self.s1: utils.pad_sequences(s1, padding='post'),
        self.s1_input_mask: utils.pad_sequences(s1_input_mask, padding='post'),
        self.s1_segment_ids: utils.pad_sequences(s1_segment_ids, padding='post'),
        self.target: features['target'],
    }
    if mode == 'train':
        # Batch size, stored under a plain string key (not a placeholder).
        feed_dict['num'] = len(features['s1'])
    feed_dict[self.dropout_rate] = conf.dropout_rate if mode == 'train' else 0.
    return feed_dict
def _create_feed_dict_from_raw(self, batch_s1, batch_y, token2id_dct, mode='infer'):
    """Build a BERT-style feed dict from raw sentences and raw labels.

    Sentences are converted with ``self.sent2ids``, truncated to
    ``bert_max_length - 2`` ids, wrapped as ``[CLS] ... [SEP]`` and padded
    with ``utils.pad_sequences(..., padding='post')`` together with a mask of
    ones and segment ids of zeros.

    Args:
        batch_s1: Iterable of raw sentences.
        batch_y: Iterable of raw labels; only read in 'train' / 'dev' mode.
        token2id_dct: Mapping providing ``'word2id'`` and ``'label2id'``.
        mode: 'infer', 'train' or 'dev'.

    Returns:
        The feed dict; in 'train' / 'dev' mode it also holds ``self.target``.

    Raises:
        ValueError: If ``mode`` is none of 'infer', 'train', 'dev'.
    """
    vocab = token2id_dct['word2id']
    label_vocab = token2id_dct['label2id']

    raw_rows = [self.sent2ids(sentence, vocab) for sentence in batch_s1]
    # [CLS] + tokens + [SEP]; two slots are reserved for the markers.
    token_rows = [
        [vocab['[CLS]']] + row[:bert_max_length - 2] + [vocab['[SEP]']]
        for row in raw_rows
    ]
    mask_rows = [[1] * len(row) for row in token_rows]
    segment_rows = [[0] * len(row) for row in token_rows]

    if mode == 'train':
        dropout = conf.dropout_rate
    else:
        dropout = 0.

    feed_dict = {
        self.s1: utils.pad_sequences(token_rows, padding='post'),
        self.s1_input_mask: utils.pad_sequences(mask_rows, padding='post'),
        self.s1_segment_ids: utils.pad_sequences(segment_rows, padding='post'),
        self.dropout_rate: dropout,
    }

    if mode == 'infer':
        return feed_dict
    if mode not in ('train', 'dev'):
        raise ValueError(f'mode type {mode} not support')

    feed_dict[self.target] = [self.label2id(label, label_vocab) for label in batch_y]
    return feed_dict
def create_feed_dict_from_raw(self, batch_s1, batch_ner_label, token2id_dct, mode='infer'):
    """Build a feed dict for sequence labelling from raw sentences and tags.

    Args:
        batch_s1: Iterable of raw sentences, converted with ``self.sent2ids``.
        batch_ner_label: Iterable of raw tag sequences, converted with
            ``self.bmeo2ids``; must be truthy in 'train' / 'dev' mode.
        token2id_dct: Mapping providing ``'char2id'`` and ``'bmeo2id'``.
        mode: 'infer', 'train' or 'dev'.

    Returns:
        The feed dict with post-padded ``self.s1`` and ``self.dropout_rate``;
        in 'train' / 'dev' mode also post-padded ``self.ner_label``.

    Raises:
        ValueError: If ``mode`` is none of 'infer', 'train', 'dev'.
    """
    char_vocab = token2id_dct['char2id']
    tag_vocab = token2id_dct['bmeo2id']

    sentence_ids = [self.sent2ids(sentence, char_vocab) for sentence in batch_s1]

    if mode == 'train':
        dropout = conf.dropout_rate
    else:
        dropout = 0.

    feed_dict = {
        self.s1: utils.pad_sequences(sentence_ids, padding='post'),
        self.dropout_rate: dropout,
    }

    if mode == 'infer':
        return feed_dict
    if mode not in ('train', 'dev'):
        raise ValueError(f'mode type {mode} not support')

    assert batch_ner_label, 'batch_ner_label should not be None when mode is train or dev'
    tag_ids = [self.bmeo2ids(tags, tag_vocab) for tags in batch_ner_label]
    feed_dict[self.ner_label] = utils.pad_sequences(tag_ids, padding='post')
    return feed_dict
def create_feed_dict_from_data(self, data, ids, mode='train'):
    """Select the examples at ``ids`` from ``data`` and build a feed dict.

    Original author's note: ``data`` is already converted to ids, and each
    field of ``data`` stores the full dataset for that field.

    Args:
        data: Mapping with ``'s1'`` and ``'s2'`` indexable collections.
        ids: Indices of the examples that form this batch.
        mode: ``'train'`` uses ``conf.dropout_rate`` and stores the batch size
            under the string key ``'num'``; any other value sets dropout to 0.

    Returns:
        dict with post-padded ``self.s1`` / ``self.s2``, ``self.dropout_rate``
        and, in train mode, ``'num'``.
    """
    first_sentences = [data['s1'][idx] for idx in ids]
    second_sentences = [data['s2'][idx] for idx in ids]

    feed_dict = {
        self.s1: utils.pad_sequences(first_sentences, padding='post'),
        self.s2: utils.pad_sequences(second_sentences, padding='post'),
    }

    if mode == 'train':
        feed_dict['num'] = len(first_sentences)
        feed_dict[self.dropout_rate] = conf.dropout_rate
    else:
        feed_dict[self.dropout_rate] = 0.
    return feed_dict
def create_feed_dict_from_data(self, data, ids, mode='train'):
    """Select the examples at ``ids`` from ``data`` and build a feed dict.

    Original author's note: ``data`` is already converted to ids, and each
    field of ``data`` stores the full dataset for that field.

    ``s1`` and ``s2`` are only passed through ``utils.pad_sequences`` when the
    selected rows do not all share one length.

    Args:
        data: Mapping with ``'s1'``, ``'s2'`` and ``'target'`` collections.
        ids: Indices of the examples that form this batch.
        mode: ``'train'`` uses ``conf.dropout_rate`` and stores the batch size
            under the string key ``'num'``; any other value sets dropout to 0.

    Returns:
        dict with ``self.s1``, ``self.s2``, ``self.target``,
        ``self.dropout_rate`` and, in train mode, ``'num'``.
    """

    def _pad_if_ragged(rows):
        # Pad only when the rows have differing lengths.
        if len({len(row) for row in rows}) != 1:
            return utils.pad_sequences(rows, padding='post')
        return rows

    first_sentences = [data['s1'][idx] for idx in ids]
    second_sentences = [data['s2'][idx] for idx in ids]
    first_sentences = _pad_if_ragged(first_sentences)
    second_sentences = _pad_if_ragged(second_sentences)

    feed_dict = {
        self.s1: first_sentences,
        self.s2: second_sentences,
        self.target: [data['target'][idx] for idx in ids],
    }

    if mode == 'train':
        feed_dict['num'] = len(first_sentences)
        feed_dict[self.dropout_rate] = conf.dropout_rate
    else:
        feed_dict[self.dropout_rate] = 0.
    return feed_dict
def _create_feed_dict_from_raw(self, batch_s1, batch_ner_label, token2id_dct, mode='infer'):
    """Build a BERT-style sequence-labelling feed dict from raw input.

    Sentences are converted with ``self.sent2ids``, truncated to
    ``bert_max_length - 2`` ids and wrapped as ``[CLS] ... [SEP]``. In
    'train' / 'dev' mode the tag sequences get the same truncation and an
    ``'O'`` tag id on both ends so they line up with the wrapped tokens.

    Args:
        batch_s1: Iterable of raw sentences.
        batch_ner_label: Iterable of raw tag sequences, converted with
            ``self.bmeo2ids``; only read in 'train' / 'dev' mode.
        token2id_dct: Mapping providing ``'char2id'`` and ``'bmeo2id'``.
        mode: 'infer', 'train' or 'dev'.

    Returns:
        The feed dict; in 'train' / 'dev' mode it also holds post-padded
        ``self.ner_label``.

    Raises:
        ValueError: If ``mode`` is none of 'infer', 'train', 'dev'.
    """
    char_vocab = token2id_dct['char2id']
    tag_vocab = token2id_dct['bmeo2id']

    raw_rows = [self.sent2ids(sentence, char_vocab) for sentence in batch_s1]
    # [CLS] + tokens + [SEP]; two slots are reserved for the markers.
    token_rows = [
        [char_vocab['[CLS]']] + row[:bert_max_length - 2] + [char_vocab['[SEP]']]
        for row in raw_rows
    ]
    mask_rows = [[1] * len(row) for row in token_rows]
    segment_rows = [[0] * len(row) for row in token_rows]

    if mode == 'train':
        dropout = conf.dropout_rate
    else:
        dropout = 0.

    feed_dict = {
        self.s1: utils.pad_sequences(token_rows, padding='post'),
        self.s1_input_mask: utils.pad_sequences(mask_rows, padding='post'),
        self.s1_segment_ids: utils.pad_sequences(segment_rows, padding='post'),
        self.dropout_rate: dropout,
    }

    if mode == 'infer':
        return feed_dict
    if mode not in ('train', 'dev'):
        raise ValueError(f'mode type {mode} not support')

    raw_tags = [self.bmeo2ids(tags, tag_vocab) for tags in batch_ner_label]
    # Tag the [CLS] / [SEP] positions as 'O'.
    tag_rows = [
        [tag_vocab['O']] + tags[:bert_max_length - 2] + [tag_vocab['O']]
        for tags in raw_tags
    ]
    feed_dict[self.ner_label] = utils.pad_sequences(tag_rows, padding='post')
    return feed_dict
def _create_feed_dict_from_features(self, features, mode='train'):
    """Build a BERT-style sequence-labelling feed dict from encoded features.

    For every row of ``features['s1']`` the trailing ``<pad>`` ids are
    removed and the matching ``features['ner_label']`` row is cut to the same
    length. Both are truncated to ``bert_max_length - 2`` items; tokens are
    wrapped as ``[CLS] ... [SEP]`` and tags get an ``'O'`` id on both ends.
    A mask of ones and segment ids of zeros are built per row, and everything
    is padded again with ``utils.pad_sequences(..., padding='post')``.

    ``char2id``, ``bmeo2id`` and ``bert_max_length`` are not parameters; they
    are resolved from the surrounding scope.

    Args:
        features: Mapping with ``'s1'`` and ``'ner_label'``; both must expose
            ``.tolist()`` yielding one list per example.
        mode: ``'train'`` uses ``conf.dropout_rate`` and stores the batch size
            under the string key ``'num'``; any other value sets dropout to 0.

    Returns:
        dict mapping ``self.s1``, ``self.s1_input_mask``,
        ``self.s1_segment_ids``, ``self.ner_label`` and ``self.dropout_rate``
        to their values, plus ``'num'`` in train mode.
    """
    s1 = features['s1'].tolist()
    ner_label = features['ner_label'].tolist()
    new_s1 = []
    new_ner_label = []
    s1_input_mask = []
    s1_segment_ids = []
    for s1_, ner_label_ in zip(s1, ner_label):  # iterate over the batch
        # Strip trailing padding. The ``i >= 0`` bound stops the scan at the
        # start of the row, so an empty or all-padding row becomes [] instead
        # of running past the front of the list and raising IndexError.
        pad_id = char2id['<pad>']
        i = len(s1_) - 1
        while i >= 0 and s1_[i] == pad_id:
            i -= 1
        s1_ = s1_[:i + 1]
        ner_label_ = ner_label_[:i + 1]
        # [CLS] + tokens + [SEP]; two slots are reserved for the markers.
        s1_ = [char2id['[CLS]']] + s1_[:bert_max_length - 2] + [char2id['[SEP]']]
        # [O] + tags + [O], aligned with the wrapped tokens.
        ner_label_ = [bmeo2id['O']] + ner_label_[:bert_max_length - 2] + [bmeo2id['O']]
        input_mask_ = [1] * len(s1_)
        segment_ids_ = [0] * len(s1_)
        new_s1.append(s1_)
        new_ner_label.append(ner_label_)
        s1_input_mask.append(input_mask_)
        s1_segment_ids.append(segment_ids_)
    feed_dict = {
        self.s1: utils.pad_sequences(new_s1, padding='post'),
        self.s1_input_mask: utils.pad_sequences(s1_input_mask, padding='post'),
        self.s1_segment_ids: utils.pad_sequences(s1_segment_ids, padding='post'),
        self.ner_label: utils.pad_sequences(new_ner_label, padding='post'),
    }
    if mode == 'train':
        # Batch size, stored under a plain string key (not a placeholder).
        feed_dict['num'] = len(features['s1'])
    feed_dict[self.dropout_rate] = conf.dropout_rate if mode == 'train' else 0.
    return feed_dict
def create_feed_dict_from_raw(self, batch_multi_s1, batch_s2, token2id_dct, mode='infer'):
    """Build a feed dict from raw multi-turn contexts and raw responses.

    Args:
        batch_multi_s1: Iterable of strings; each one is split on ``'$$$'``
            into turns and converted with ``self.multi_sent2ids``.
        batch_s2: Iterable of raw sentences, converted with ``self.sent2ids``;
            must be truthy in 'train' / 'dev' mode.
        token2id_dct: Mapping providing ``'word2id'``.
        mode: 'infer', 'train' or 'dev'.

    Returns:
        The feed dict with post-padded ``self.multi_s1`` and
        ``self.dropout_rate``; in 'train' / 'dev' mode also post-padded
        ``self.s2``.

    Raises:
        ValueError: If ``mode`` is none of 'infer', 'train', 'dev'.
    """
    vocab = token2id_dct['word2id']

    context_ids = [
        self.multi_sent2ids(context.split('$$$'), vocab)
        for context in batch_multi_s1
    ]

    if mode == 'train':
        dropout = conf.dropout_rate
    else:
        dropout = 0.

    feed_dict = {
        self.multi_s1: utils.pad_sequences(context_ids, padding='post'),
        self.dropout_rate: dropout,
    }

    if mode == 'infer':
        return feed_dict
    if mode not in ('train', 'dev'):
        raise ValueError(f'mode type {mode} not support')

    assert batch_s2, 'batch_s2 should not be None when mode is train or dev'
    response_ids = [self.sent2ids(sentence, vocab) for sentence in batch_s2]
    feed_dict[self.s2] = utils.pad_sequences(response_ids, padding='post')
    return feed_dict
def multi_sent2ids(cls, multi_sent, word2id, max_word_len=None, max_turn=None):
    """Convert a list of sentences (turns) into a padded block of token ids.

    Original author's notes: each sentence is already tokenised, with tokens
    separated by ' '. Length and turn count cannot be padded dynamically at
    batch time, because tfrecord storage flattens the data to one dimension
    and there is not enough information to restore a variable [turn, len]
    shape. At least one of the two has to be fixed and the data padded to a
    turn * len matrix so it can be restored by reshape; the turn count is the
    one fixed here.

    Args:
        multi_sent: List of sentences, each converted with ``cls.sent2ids``.
        word2id: Vocabulary passed through to ``cls.sent2ids``.
        max_word_len: Forwarded to ``cls.sent2ids`` as ``max_word_len``.
        max_turn: Number of turns to keep; ``conf.max_turn`` when ``None``.

    Returns:
        The result of ``utils.pad_sequences`` over the turns (post-padded
        with 0); the original author notes the shape as [turn, len].
    """
    turns = [
        cls.sent2ids(sentence, word2id, max_word_len=max_word_len)
        for sentence in multi_sent
    ]

    # The turn count must be fixed so sentence lengths can be padded.
    turn_limit = conf.max_turn if max_turn is None else max_turn

    # Keep only the last ``turn_limit`` turns when there are too many.
    turns = turns[-turn_limit:]
    # Append empty turns when there are too few.
    missing_turns = turn_limit - len(turns)
    turns = turns + [[]] * missing_turns

    # Pad every turn to a common length.
    return utils.pad_sequences(turns, padding='post', value=0)
def create_feed_dict_from_raw(self, batch_s1, batch_y, token2id_dct, mode='infer'):
    """Build a classification feed dict from raw sentences and raw labels.

    Args:
        batch_s1: Iterable of raw sentences, converted with ``self.sent2ids``.
        batch_y: Iterable of raw labels, converted with ``self.label2id``;
            must be truthy in 'train' / 'dev' mode.
        token2id_dct: Mapping providing ``'word2id'`` and ``'label2id'``.
        mode: 'infer', 'train' or 'dev'.

    Returns:
        The feed dict with post-padded ``self.s1`` and ``self.dropout_rate``;
        in 'train' / 'dev' mode also ``self.target``.

    Raises:
        ValueError: If ``mode`` is none of 'infer', 'train', 'dev'.
    """
    vocab = token2id_dct['word2id']
    label_vocab = token2id_dct['label2id']

    sentence_ids = [self.sent2ids(sentence, vocab) for sentence in batch_s1]

    if mode == 'train':
        dropout = conf.dropout_rate
    else:
        dropout = 0.

    feed_dict = {
        self.s1: utils.pad_sequences(sentence_ids, padding='post'),
        self.dropout_rate: dropout,
    }

    if mode == 'infer':
        return feed_dict
    if mode not in ('train', 'dev'):
        raise ValueError(f'mode type {mode} not support')

    assert batch_y, 'batch_y should not be None when mode is train or dev'
    feed_dict[self.target] = [self.label2id(label, label_vocab) for label in batch_y]
    return feed_dict