def get_batch(self, bucket_dbs, bucket_id, data):
    """Convert (ask, answer) sentence pairs into time-major model inputs.

    Args:
        bucket_dbs: unused; kept only for backward compatibility with callers.
        bucket_id: index into ``self.buckets`` selecting the
            ``(encoder_size, decoder_size)`` pair for this batch.
        data: iterable of ``(ask_sentence, answer_sentence)`` string pairs;
            expected to hold ``self.batch_size`` pairs (the transpose below
            indexes ``range(self.batch_size)``).

    Returns:
        Tuple ``(batch_encoder_inputs, batch_decoder_inputs, batch_weights)``:
        lists of length ``encoder_size`` / ``decoder_size`` / ``decoder_size``
        whose elements are per-time-step ``np.ndarray``s over the batch
        (int32 ids; float32 weights with 0.0 on PAD targets and the final
        step, 1.0 elsewhere).
    """
    encoder_size, decoder_size = self.buckets[bucket_id]
    encoder_inputs, decoder_inputs = [], []
    for encoder_input, decoder_input in data:
        encoder_input = data_utils.sentence_indice(encoder_input)
        decoder_input = data_utils.sentence_indice(decoder_input)
        # BUG FIX: a sentence longer than its bucket previously made the
        # pad count negative; `[PAD_ID] * negative` is empty, so the padded
        # sequence exceeded the bucket length, silently misaligning the
        # batch (and dropping EOS outside the decoder window). Truncate to
        # the bucket before padding so every sequence has a fixed length.
        encoder_input = encoder_input[:encoder_size]
        decoder_input = decoder_input[:decoder_size - 2]
        # Encoder: right-pad to encoder_size, then reverse the sequence so
        # padding comes first and real tokens sit closer to the decoder.
        encoder_pad = [data_utils.PAD_ID] * (encoder_size - len(encoder_input))
        encoder_inputs.append(list(reversed(encoder_input + encoder_pad)))
        # Decoder: GO + ids + EOS + PAD...; GO/EOS occupy the 2 reserved slots.
        decoder_pad_size = decoder_size - len(decoder_input) - 2
        decoder_inputs.append(
            [data_utils.GO_ID] + decoder_input + [data_utils.EOS_ID]
            + [data_utils.PAD_ID] * decoder_pad_size)
    batch_encoder_inputs, batch_decoder_inputs, batch_weights = [], [], []
    # Transpose batch-major [sample][step] into time-major [step][sample].
    for i in range(encoder_size):
        batch_encoder_inputs.append(np.array(
            [encoder_inputs[j][i] for j in range(self.batch_size)],
            dtype=np.int32))
    for i in range(decoder_size):
        batch_decoder_inputs.append(np.array(
            [decoder_inputs[j][i] for j in range(self.batch_size)],
            dtype=np.int32))
        # The target of step i is the decoder symbol at i + 1; the last
        # step has no target.  Mask both PAD targets and the last step.
        batch_weight = np.ones(self.batch_size, dtype=np.float32)
        for j in range(self.batch_size):
            if (i == decoder_size - 1
                    or decoder_inputs[j][i + 1] == data_utils.PAD_ID):
                batch_weight[j] = 0.0
        batch_weights.append(batch_weight)
    return batch_encoder_inputs, batch_decoder_inputs, batch_weights
def get_batch(self, bucket_dbs, bucket_id, data):
    """Pack (ask, answer) sentence pairs into time-major training arrays.

    Produces padded/reversed encoder id sequences, GO/EOS-framed decoder id
    sequences, and per-step loss weights (0.0 wherever the step's target is
    PAD, and on the final step which has no target).
    """
    encoder_size, decoder_size = self.buckets[bucket_id]
    # bucket_db = bucket_dbs[bucket_id]
    encoder_inputs, decoder_inputs = [], []
    for ask, answer in data:
        ask_ids = data_utils.sentence_indice(ask)
        answer_ids = data_utils.sentence_indice(answer)
        # Encoder: right-pad to the bucket width, then feed it reversed so
        # padding leads the sequence and real tokens arrive last (closer to
        # the decoder; attention would be the stronger alternative).
        padding = [data_utils.PAD_ID] * (encoder_size - len(ask_ids))
        encoder_inputs.append(list(reversed(ask_ids + padding)))
        # Decoder: GO ... EOS frame the answer, so only decoder_size - 2
        # slots remain for tokens; the tail is PAD.
        tail = decoder_size - len(answer_ids) - 2
        decoder_inputs.append(
            [data_utils.GO_ID]
            + answer_ids
            + [data_utils.EOS_ID]
            + [data_utils.PAD_ID] * tail
        )
    size = self.batch_size
    # Transpose the batch-major lists into time-major numpy arrays.
    batch_encoder_inputs = [
        np.array([encoder_inputs[b][t] for b in range(size)], dtype=np.int32)
        for t in range(encoder_size)
    ]
    batch_decoder_inputs = [
        np.array([decoder_inputs[b][t] for b in range(size)], dtype=np.int32)
        for t in range(decoder_size)
    ]
    # batch_weights mirrors batch_decoder_inputs' shape.
    batch_weights = []
    for t in range(decoder_size):
        weight = np.ones(size, dtype=np.float32)
        for b in range(size):
            # Step t predicts the symbol at t + 1; mask PAD targets and
            # the last step (which has nothing to predict).
            if (t == decoder_size - 1
                    or decoder_inputs[b][t + 1] == data_utils.PAD_ID):
                weight[b] = 0.0
        batch_weights.append(weight)
    return batch_encoder_inputs, batch_decoder_inputs, batch_weights
def get_batch(self, bucket_dbs, bucket_id, data):
    """Turn a list of (ask, answer) strings into numeric training batches.

    Sentences are id-encoded with ``data_utils.sentence_indice``, padded to
    this bucket's ``(encoder_size, decoder_size)``, and returned time-major
    along with loss weights that mask PAD targets.
    """
    encoder_size, decoder_size = self.buckets[bucket_id]
    # bucket_db = bucket_dbs[bucket_id]

    def encode_ask(sentence):
        # Id-encode, right-pad to encoder_size, then reverse the whole
        # sequence so the PAD prefix comes first and real tokens come last.
        ids = data_utils.sentence_indice(sentence)
        ids = ids + [data_utils.PAD_ID] * (encoder_size - len(ids))
        return ids[::-1]

    def encode_answer(sentence):
        # GO must lead (start-of-decoding marker), EOS closes the answer;
        # those two markers are why only decoder_size - 2 token slots exist.
        ids = data_utils.sentence_indice(sentence)
        pad = [data_utils.PAD_ID] * (decoder_size - len(ids) - 2)
        return [data_utils.GO_ID] + ids + [data_utils.EOS_ID] + pad

    encoder_inputs, decoder_inputs = [], []
    for ask, answer in data:
        encoder_inputs.append(encode_ask(ask))
        decoder_inputs.append(encode_answer(answer))

    n = self.batch_size
    batch_encoder_inputs = []
    # Re-index [sample][step] -> [step][sample] for the encoder side.
    for t in range(encoder_size):
        column = [encoder_inputs[b][t] for b in range(n)]
        batch_encoder_inputs.append(np.array(column, dtype=np.int32))

    batch_decoder_inputs, batch_weights = [], []
    last = decoder_size - 1
    for t in range(decoder_size):
        column = [decoder_inputs[b][t] for b in range(n)]
        batch_decoder_inputs.append(np.array(column, dtype=np.int32))
        # PAD positions get weight 0 so they contribute nothing to the
        # loss; the last step has no next-symbol target, so it is 0 too.
        mask = np.ones(n, dtype=np.float32)
        for b in range(n):
            if t == last or decoder_inputs[b][t + 1] == data_utils.PAD_ID:
                mask[b] = 0.0
        batch_weights.append(mask)
    return batch_encoder_inputs, batch_decoder_inputs, batch_weights
def get_batch(self, bucket_dbs, bucket_id, data):
    """Assemble one bucket-sized batch of encoder/decoder ids and weights.

    Returns three parallel lists (time-major): int32 encoder columns,
    int32 decoder columns, and float32 weight columns masking PAD targets.
    """
    encoder_size, decoder_size = self.buckets[bucket_id]
    # bucket_db = bucket_dbs[bucket_id]
    encoder_inputs = []
    decoder_inputs = []
    for question, answer in data:
        # Map each sentence to its vocabulary-id list.
        q_ids = data_utils.sentence_indice(question)
        a_ids = data_utils.sentence_indice(answer)
        # Encoder: pad on the right, then reverse the full sequence.
        q_pad = [data_utils.PAD_ID] * (encoder_size - len(q_ids))
        encoder_inputs.append(list(reversed(q_ids + q_pad)))
        # Decoder: GO + answer + EOS, then PAD out to decoder_size
        # (GO and EOS account for the reserved 2 slots).
        framed = [data_utils.GO_ID]
        framed.extend(a_ids)
        framed.append(data_utils.EOS_ID)
        framed.extend([data_utils.PAD_ID] * (decoder_size - len(a_ids) - 2))
        decoder_inputs.append(framed)

    batch_encoder_inputs = []
    batch_decoder_inputs = []
    batch_weights = []
    # Encoder columns: one array per time step over the batch.
    for step in range(encoder_size):
        batch_encoder_inputs.append(np.array(
            [encoder_inputs[k][step] for k in range(self.batch_size)],
            dtype=np.int32))
    # Decoder columns plus matching weight columns.
    for step in range(decoder_size):
        batch_decoder_inputs.append(np.array(
            [decoder_inputs[k][step] for k in range(self.batch_size)],
            dtype=np.int32))
        # Weight is 0.0 when the step's target (next symbol) is PAD, or on
        # the final step, which has no target; otherwise 1.0.
        batch_weights.append(np.array(
            [0.0 if (step == decoder_size - 1
                     or decoder_inputs[k][step + 1] == data_utils.PAD_ID)
             else 1.0
             for k in range(self.batch_size)],
            dtype=np.float32))
    return batch_encoder_inputs, batch_decoder_inputs, batch_weights
def get_batch(self, bucket_dbs, bucket_id, data):
    """Build time-major encoder/decoder id arrays and loss weights for one
    bucket's batch of (ask, answer) sentence pairs.
    """
    encoder_size, decoder_size = self.buckets[bucket_id]
    # bucket_db = bucket_dbs[bucket_id]
    encoder_inputs, decoder_inputs = [], []
    for src_text, dst_text in data:
        src = data_utils.sentence_indice(src_text)
        dst = data_utils.sentence_indice(dst_text)
        # Encoder side: pad to the bucket width and reverse, so padding
        # leads the fed sequence.
        fill = encoder_size - len(src)
        reversed_src = list(reversed(src + [data_utils.PAD_ID] * fill))
        encoder_inputs.append(reversed_src)
        # Decoder side: two slots are reserved for GO and EOS, the rest
        # of the unused width is PAD.
        room = decoder_size - len(dst) - 2
        framed_dst = (
            [data_utils.GO_ID] + dst + [data_utils.EOS_ID]
            + [data_utils.PAD_ID] * room
        )
        decoder_inputs.append(framed_dst)

    count = self.batch_size
    # Encoder: gather column t across the batch for each time step.
    batch_encoder_inputs = [
        np.array([row[t] for row in [encoder_inputs[k] for k in range(count)]],
                 dtype=np.int32)
        for t in range(encoder_size)
    ]
    # Decoder columns, built the same way.
    batch_decoder_inputs = [
        np.array([decoder_inputs[k][t] for k in range(count)], dtype=np.int32)
        for t in range(decoder_size)
    ]
    # Weights: start at zero and mark only the positions that carry a real
    # (non-PAD) target; the final step never has one.
    batch_weights = []
    for t in range(decoder_size):
        column = np.zeros(count, dtype=np.float32)
        if t < decoder_size - 1:
            for k in range(count):
                if decoder_inputs[k][t + 1] != data_utils.PAD_ID:
                    column[k] = 1.0
        batch_weights.append(column)
    return batch_encoder_inputs, batch_decoder_inputs, batch_weights
def get_batch(self, bucket_id, data):
    """Convert (ask, answer) sentence pairs into time-major model inputs.

    Args:
        bucket_id: index into ``self.buckets`` giving this bucket's
            ``(encoder_size, decoder_size)`` length limits.
        data: iterable of ``(ask_sentence, answer_sentence)`` string pairs;
            expected to contain ``self.batch_size`` pairs.

    Returns:
        Tuple ``(batch_encoder_inputs, batch_decoder_inputs, batch_weights)``
        — time-major lists of per-step ``np.ndarray``s: int32 id columns and
        float32 weight columns (0.0 on PAD targets and the last step).
    """
    encoder_size, decoder_size = self.buckets[bucket_id]
    encoder_inputs, decoder_inputs = [], []
    for encoder_input, decoder_input in data:
        # ids化 (id-encode both sentences)
        encoder_input = data_utils.sentence_indice(encoder_input)
        decoder_input = data_utils.sentence_indice(decoder_input)
        # BUG FIX: a sentence longer than its bucket made the pad count
        # negative; `[PAD_ID] * negative` is empty, so the sequence exceeded
        # the bucket length and misaligned the batch (EOS fell outside the
        # decoder window). Truncate to the bucket before padding.
        encoder_input = encoder_input[:encoder_size]
        decoder_input = decoder_input[:decoder_size - 2]
        # Encoder Padding: right-pad, then reverse so padding leads.
        encoder_pad = [data_utils.PAD_ID
                       ] * (encoder_size - len(encoder_input))
        encoder_inputs.append(list(reversed(encoder_input + encoder_pad)))
        # Decoder Padding: GO/EOS take the 2 reserved slots.
        decoder_pad_size = decoder_size - len(decoder_input) - 2
        decoder_inputs.append([data_utils.GO_ID] + decoder_input +
                              [data_utils.EOS_ID] +
                              [data_utils.PAD_ID] * decoder_pad_size)
    batch_encoder_inputs, batch_decoder_inputs, batch_weights = [], [], []
    # batch encoder: transpose [sample][step] -> [step][sample]
    for i in range(encoder_size):
        batch_encoder_inputs.append(
            np.array(
                [encoder_inputs[j][i] for j in range(self.batch_size)],
                dtype=np.int32))
    # batch decoder, with matching per-step loss weights
    for i in range(decoder_size):
        batch_decoder_inputs.append(
            np.array(
                [decoder_inputs[j][i] for j in range(self.batch_size)],
                dtype=np.int32))
        # Step i's target is the symbol at i + 1; mask PAD targets and the
        # final step (which has no target).
        batch_weight = np.ones(self.batch_size, dtype=np.float32)
        for j in range(self.batch_size):
            if (i == decoder_size - 1
                    or decoder_inputs[j][i + 1] == data_utils.PAD_ID):
                batch_weight[j] = 0.0
        batch_weights.append(batch_weight)
    return batch_encoder_inputs, batch_decoder_inputs, batch_weights