def encode(self, sequence, emb=None):
    """Embed a batch of id sequences and run the encoder RNN over them.

    Args:
        sequence: token ids handed to ``melt.pad``; presumably a zero padded
            ``[batch_size, num_steps]`` id tensor -- confirm with the caller.
        emb: embedding table used for the lookup; ``self.emb`` is used when
            this is ``None``.

    Returns:
        The ``(encode_feature, state)`` pair produced by ``melt.rnn.encode``.
    """
    embedding = self.emb if emb is None else emb

    # Start / end marks are only added when the matching flag is switched on.
    start_id = None
    if FLAGS.encode_start_mark:
        start_id = vocabulary.vocab.start_id()
    end_id = None
    if FLAGS.encode_end_mark:
        end_id = self.end_id
    padded, lengths = melt.pad(sequence, start_id=start_id, end_id=end_id)

    # Author's note: the ids are deliberately NOT sliced down to the longest
    # sequence at predict time, because with attention the float32 numeric
    # accuracy may then give slightly different results.
    embedded = tf.nn.embedding_lookup(embedding, padded)
    if self.is_training and FLAGS.keep_prob < 1:
        embedded = tf.nn.dropout(embedded, FLAGS.keep_prob)

    features, final_state = melt.rnn.encode(
        self.cell,
        embedded,
        lengths,
        cell_bw=self.bwcell,
        encode_method=FLAGS.rnn_method,
        output_method=FLAGS.rnn_output_method)
    return features, final_state
def _decode(example, parse, dynamic_batch_length):
    """Parse one serialized example into decoder text and encoder input text.

    Args:
        example: serialized record(s) passed straight to ``parse``.
        parse: parsing callable invoked as ``parse(example, features=...)``.
        dynamic_batch_length: when truthy, ``0`` is passed as the max length
            to ``melt.sparse_tensor_to_dense``; otherwise the fixed
            ``TEXT_MAX_WORDS`` / ``INPUT_TEXT_MAX_WORDS`` limits are used.

    Returns:
        Tuple ``(image_name, text, text_str, input_text, input_text_str)``
        where ``text`` is the decoder target built from the ``'text'``
        feature and ``input_text`` is the encoder input built from the
        ``'real_title'`` feature.
    """
    features = parse(
        example,
        features={
            'image_name': tf.FixedLenFeature([], tf.string),
            'url': tf.FixedLenFeature([], tf.string),
            'text_str': tf.FixedLenFeature([], tf.string),
            'ct0_str': tf.FixedLenFeature([], tf.string),
            'title_str': tf.FixedLenFeature([], tf.string),
            'real_title_str': tf.FixedLenFeature([], tf.string),
            'text': tf.VarLenFeature(tf.int64),
            'ct0': tf.VarLenFeature(tf.int64),
            'title': tf.VarLenFeature(tf.int64),
            'real_title': tf.VarLenFeature(tf.int64),
        })

    image_name = features['image_name']
    text = features['text']
    # Which parsed feature feeds the encoder.
    input_type = 'real_title'
    input_text = features[input_type]

    # Decoder side: densify, then add the decoder start / end marks.
    maxlen = 0 if dynamic_batch_length else TEXT_MAX_WORDS
    text = melt.sparse_tensor_to_dense(text, maxlen)
    text, _ = melt.pad(text,
                       start_id=get_decoder_start_id(),
                       end_id=get_decoder_end_id())

    # Encoder side: use the encoder's own length limit. The previous code
    # computed ``input_maxlen`` but then densified with the decoder ``maxlen``.
    input_maxlen = 0 if dynamic_batch_length else INPUT_TEXT_MAX_WORDS
    input_text = melt.sparse_tensor_to_dense(input_text, input_maxlen)
    input_text, _ = melt.pad(
        input_text,
        start_id=(vocabulary.vocab.start_id()
                  if FLAGS.encode_start_mark else None),
        end_id=(encoder_end_id if FLAGS.encode_end_mark else None))

    text_str = features['text_str']
    input_text_str = features['{}_str'.format(input_type)]
    return image_name, text, text_str, input_text, input_text_str
def adjust(features, subset):
    """Post-process a parsed feature dict in place and return it.

    Steps, in order: fill in ``hist_len`` if missing, clip or pad every
    history-like feature to ``FLAGS.max_history``, keep the original ids under
    ``ori_*`` keys, mask / augment ids, and finally remap ids through
    ``get_id`` using the sizes stored under ``gezi.get('vocab_sizes')``.

    Args:
        features: dict of feature tensors; mutated in place. ``'did'``,
            ``'did_in_train'`` and ``'uid'`` are required, ``'history'`` and
            ``'impressions'`` are optional.
        subset: dataset split name; ``subset == 'train'`` switches on the
            train-only masking / augmentation helpers.

    Returns:
        The same ``features`` dict.
    """
    is_train = subset == 'train'

    if 'hist_len' not in features:
        try:
            features['hist_len'] = mt.length(features['history'])
        except Exception:
            # Best-effort fallback kept from the original: any failure to
            # measure the history (including a missing 'history' key) yields
            # a length of one per example.
            features['hist_len'] = tf.ones_like(features['did'])

    if FLAGS.max_history:
        # Only existing keys are reassigned, so iterating the dict is safe.
        for key in features:
            if 'history' not in key:
                continue
            max_history = FLAGS.max_history
            if 'enti' in key:
                # Keys containing 'enti' get twice the budget.
                max_history *= 2
            if not FLAGS.fixed_pad:
                features[key] = features[key][:, :max_history]
            else:
                features[key] = mt.pad(features[key], max_history)

    # Note: news-side information is looked up by nid; did is only used as an
    # id feature and may be masked below, so keep the unmasked originals.
    features['ori_did'] = features['did']
    # 'history' is optional everywhere else in this function; guard here too
    # instead of raising KeyError when it is absent.
    if 'history' in features:
        features['ori_history'] = features['history']
    if 'impressions' in features:
        features['ori_impressions'] = features['impressions']

    features['did'] = mask_dids(features['did'], features['did_in_train'],
                                subset, FLAGS.test_all_mask)
    features['uid'] = mask_uids(features['uid'], is_train)
    if 'history' in features:
        features['history'] = unk_aug(features['history'], is_train)
    mask_negative_weights(features, is_train)

    vs = gezi.get('vocab_sizes')
    if FLAGS.min_count_unk and FLAGS.min_count:
        features['uid'] = get_id(features['uid'], vs['uid'][1])
        features['did'] = get_id(features['did'], vs['did'][1])
        if FLAGS.mask_history:
            # NOTE(review): indexes 'history' unconditionally, as before.
            features['history'] = get_id(features['history'], vs['did'][1])
            if 'impressions' in features:
                features['impressions'] = get_id(features['impressions'],
                                                 vs['did'][1])

    # presumably vs[name] is (full_size, reduced_size) -- TODO confirm
    if vs['uid'][1] < vs['uid'][0]:
        features['uid'] = get_id(features['uid'], vs['uid'][1])
    return features
def build_predict_text_graph(self,
                             input_text,
                             decode_method=0,
                             beam_size=5,
                             convert_unk=True):
    """Build the graph that generates text from ``input_text``.

    Args:
        input_text: encoder input ids, passed to ``self.encoder.encode``.
        decode_method: one of ``SeqDecodeMethod.greedy``,
            ``SeqDecodeMethod.beam`` or ``SeqDecodeMethod.beam_search``.
        beam_size: beam width, only used by the beam decoders.
        convert_unk: forwarded unchanged to the decoder.

    Returns:
        Whatever the selected ``self.decoder.generate_sequence_*`` returns.

    Raises:
        ValueError: if ``decode_method`` is not one of the supported values.
    """
    with tf.variable_scope("encode"):
        encoder_output, state = self.encoder.encode(input_text)
        if not FLAGS.use_attention:
            # Without attention the decoder only gets the final state.
            encoder_output = None

    with tf.variable_scope("decode"):
        # Try to use the static batch size if possible.
        batch_size = melt.get_batch_size(input_text)
        decoder_input = self.decoder.get_start_embedding_input(batch_size)
        max_words = FLAGS.decode_max_words or TEXT_MAX_WORDS

        if decode_method == SeqDecodeMethod.greedy:
            # presumably the padded ids stored by encoder.encode -- TODO confirm
            input_text = self.encoder.sequence
            return self.decoder.generate_sequence_greedy(
                decoder_input,
                max_words=max_words,
                initial_state=state,
                attention_states=encoder_output,
                convert_unk=convert_unk,
                input_text=input_text)

        if decode_method == SeqDecodeMethod.beam:
            decode_func = self.decoder.generate_sequence_beam
        elif decode_method == SeqDecodeMethod.beam_search:
            decode_func = self.decoder.generate_sequence_beam_search
        else:
            # %s, not %d: a non-integer decode_method (e.g. a string name or
            # None) must still surface as ValueError, not as a TypeError
            # raised by the message formatting itself.
            raise ValueError('not supported decode_method: %s' % decode_method)

        input_text, input_text_length = melt.pad(
            input_text, end_id=self.encoder.end_id)
        return decode_func(
            decoder_input,
            max_words=max_words,
            initial_state=state,
            attention_states=encoder_output,
            beam_size=beam_size,
            convert_unk=convert_unk,
            length_normalization_factor=FLAGS.length_normalization_factor,
            input_text=input_text,
            input_text_length=input_text_length)
def build_predict_text_graph(self,
                             image_feature,
                             input_text,
                             decode_method='greedy',
                             beam_size=5,
                             convert_unk=True):
    """Build the graph that generates text from an image plus input text.

    Args:
        image_feature: passed to ``self.build_image_embeddings``; the result
            is given to the encoder as its ``input``.
        input_text: encoder input ids.
        decode_method: ``SeqDecodeMethod.greedy``,
            ``SeqDecodeMethod.ingraph_beam`` or
            ``SeqDecodeMethod.outgraph_beam``.
        beam_size: beam width, only used by the beam decoders.
        convert_unk: forwarded unchanged to the decoder.

    Returns:
        Whatever the selected ``self.decoder.generate_sequence_*`` returns.

    Raises:
        ValueError: if ``decode_method`` is not supported.
    """
    with tf.variable_scope("encode"):
        image_embedding = self.build_image_embeddings(image_feature)
        attention_states, encoder_state = self.encoder.encode(
            input_text, input=image_embedding)
        if not FLAGS.use_attention:
            attention_states = None

    with tf.variable_scope("decode"):
        start_input = self.decoder.get_start_embedding_input(
            tf.shape(input_text)[0])
        # Arguments every decode function receives.
        shared_kwargs = dict(
            max_words=FLAGS.decode_max_words or TEXT_MAX_WORDS,
            initial_state=encoder_state,
            attention_states=attention_states,
            convert_unk=convert_unk)

        if decode_method == SeqDecodeMethod.greedy:
            return self.decoder.generate_sequence_greedy(
                start_input, **shared_kwargs)

        if decode_method == SeqDecodeMethod.ingraph_beam:
            beam_decode = self.decoder.generate_sequence_ingraph_beam
        elif decode_method == SeqDecodeMethod.outgraph_beam:
            beam_decode = self.decoder.generate_sequence_outgraph_beam
        else:
            raise ValueError('not supported decode_method: %s' % decode_method)

        # Beam decoders additionally get the end-marked input and its length.
        padded_text, padded_length = melt.pad(
            input_text, end_id=self.encoder.end_id)
        return beam_decode(
            start_input,
            beam_size=beam_size,
            length_normalization_factor=FLAGS.length_normalization_factor,
            input_text=padded_text,
            input_text_length=padded_length,
            **shared_kwargs)
def pad(self, sequence):
    """Pad ``sequence`` with the encoder start / end marks enabled by flags.

    The start id comes from ``vocabulary.vocab.start_id()`` when
    ``FLAGS.encode_start_mark`` is set and the end id is ``self.end_id`` when
    ``FLAGS.encode_end_mark`` is set; a disabled mark is passed as ``None``.

    Returns:
        The result of ``melt.pad`` unchanged.
    """
    start_id = None
    if FLAGS.encode_start_mark:
        start_id = vocabulary.vocab.start_id()

    end_id = None
    if FLAGS.encode_end_mark:
        end_id = self.end_id

    return melt.pad(sequence, start_id=start_id, end_id=end_id)
def sequence_loss(self,
                  sequence,
                  initial_state=None,
                  attention_states=None,
                  input=None,
                  input_text=None,
                  exact_prob=False,
                  exact_loss=False,
                  emb=None):
    """Build the teacher-forced decoding loss for a batch of sequences.

    For general seq2seq ``input`` is None, ``sequence`` gets a <GO> mark
    padded in front and ``initial_state`` is the last state from the encoder.
    For img2text / showandtell ``input`` is the image embedding and
    ``initial_state`` is None (a zero state is used).

    NOTICE: ``sequence`` is assumed to be padded with zeros and at least one
    instance must be full length (no zero).

    TODO: exact_prob and exact_loss give the same value, exact_prob may be
    removed.

    Side effects: sets ``self.final_state`` and appends to the graph
    collections ``'sequence'``, ``'sequence_length'`` and ``'outputs'``.

    Args:
        sequence: target token ids.
        initial_state: decoder start state; zero state when None.
        attention_states: encoder outputs to attend over, or None to decode
            with a plain ``tf.nn.dynamic_rnn``.
        input: optional extra first-step input (e.g. an image embedding).
        input_text: unused here (only the unfinished copy mode read it).
        exact_prob: at predict time, score with ``exact_predict_loss``.
        exact_loss: at predict time, use the full (non sampled) softmax loss.
        emb: embedding table; ``self.emb`` when None.

    Returns:
        Loss tensor reshaped to ``[-1, 1]``; at predict time it is also
        passed through ``self.normalize_length``.

    Raises:
        NotImplementedError: if ``FLAGS.copy_only`` is set.
    """
    if FLAGS.copy_only:
        # The copy-only branch was never finished ("now not work!"): it bound
        # neither the logits nor the loss function, so the function always
        # died later with UnboundLocalError. Fail fast and explicitly before
        # any graph ops are created.
        raise NotImplementedError(
            'FLAGS.copy_only is not supported by this sequence_loss')

    if emb is None:
        emb = self.emb

    is_training = self.is_training
    batch_size = tf.shape(sequence)[0]

    sequence, sequence_length = melt.pad(sequence,
                                         start_id=self.get_start_id(),
                                         end_id=self.get_end_id())

    # TODO: different init state as shown in ptb_word_lm
    state = self.cell.zero_state(
        batch_size, tf.float32) if initial_state is None else initial_state

    # [batch_size, num_steps - 1, emb_dim], remove last col
    inputs = tf.nn.embedding_lookup(emb, sequence[:, :-1])

    if is_training and FLAGS.keep_prob < 1:
        inputs = tf.nn.dropout(inputs, FLAGS.keep_prob)

    # inputs [batch_size, num_steps, emb_dim];
    # input [batch_size, emb_dim] -> [batch_size, 1, emb_dim] before concat
    if input is not None:
        # Used like showandtell where image_emb is an input in addition to
        # the sequence.
        inputs = tf.concat([tf.expand_dims(input, 1), inputs], 1)
    else:
        # Common usage: sequence itself is the input. <GO> was already padded
        # by melt.pad, so drop it from the targets.
        sequence_length -= 1
        sequence = sequence[:, 1:]

    if self.is_predict:
        # Only needed when predicting, since train input already has dynamic
        # length. NOTICE: this improves speed a lot.
        num_steps = tf.to_int32(tf.reduce_max(sequence_length))
        sequence = sequence[:, :num_steps]
        inputs = inputs[:, :num_steps, :]

    tf.add_to_collection('sequence', sequence)
    tf.add_to_collection('sequence_length', sequence_length)

    if attention_states is None:
        outputs, state = tf.nn.dynamic_rnn(self.cell,
                                           inputs,
                                           initial_state=state,
                                           sequence_length=sequence_length,
                                           scope=self.scope)
        self.final_state = state
    else:
        attention_keys, attention_values, attention_score_fn, attention_construct_fn = \
            self.prepare_attention(attention_states)
        decoder_fn_train = melt.seq2seq.attention_decoder_fn_train(
            encoder_state=state,
            attention_keys=attention_keys,
            attention_values=attention_values,
            attention_score_fn=attention_score_fn,
            attention_construct_fn=attention_construct_fn)
        decoder_outputs_train, decoder_state_train, _ = \
            melt.seq2seq.dynamic_rnn_decoder(
                cell=self.cell,
                decoder_fn=decoder_fn_train,
                inputs=inputs,
                sequence_length=tf.to_int32(sequence_length),
                scope=self.scope)
        outputs = decoder_outputs_train
        self.final_state = decoder_state_train

    tf.add_to_collection('outputs', outputs)

    # [batch_size, num_steps]
    targets = sequence

    # TODO: hack here, exact_prob / exact_loss exist just for
    # Seq2seqPredictor exact_predict.
    softmax_loss_function = self.softmax_loss_function
    if self.is_predict and (exact_prob or exact_loss):
        softmax_loss_function = None

    if softmax_loss_function is None:
        # [batch_size, num_steps, num_units] * [num_units, vocab_size]
        # -> logits [batch_size, num_steps, vocab_size] (exact predict loss)
        # or [batch_size * num_steps, vocab_size] when flattened (default).
        keep_dims = exact_prob or exact_loss
        logits = melt.batch_matmul_embedding(
            outputs, self.w, keep_dims=keep_dims) + self.v
        if not keep_dims:
            targets = tf.reshape(targets, [-1])
    else:
        # The sampled loss function does the projection itself.
        logits = outputs

    # Padding id 0 gets weight 0.
    mask = tf.cast(tf.sign(targets), dtype=tf.float32)

    if self.is_predict and exact_prob:
        # Generate the real probability of the sequence.
        loss = melt.seq2seq.exact_predict_loss(logits, targets, mask,
                                               num_steps, batch_size)
    elif self.is_predict and exact_loss:
        # Force the non sampled softmax loss. The difference with exact_prob
        # is that the cross entropy error is the result, not the real
        # probability of the sequence.
        # TODO: check more with softmax loss and sampled softmax loss, check
        # length normalization.
        loss = melt.seq2seq.sequence_loss_by_example(logits,
                                                     targets,
                                                     weights=mask)
    else:
        # loss [batch_size,]
        loss = melt.seq2seq.sequence_loss_by_example(
            logits,
            targets,
            weights=mask,
            softmax_loss_function=softmax_loss_function)

    # Mainly for compatibility with [batch_size, num_losses].
    loss = tf.reshape(loss, [-1, 1])

    if self.is_predict:
        loss = self.normalize_length(loss, sequence_length, exact_prob)
        # TODO: tf.squeeze(loss) once all models are rerun.

    return loss
def sequence_loss(self,
                  sequence,
                  initial_state=None,
                  attention_states=None,
                  input=None,
                  input_text=None,
                  exact_prob=False,
                  exact_loss=False,
                  emb=None):
    """Build the teacher-forced decoding loss for a batch of sequences.

    For general seq2seq ``input`` is None, ``sequence`` gets a <GO> mark
    padded in front and ``initial_state`` is the last state from the encoder.
    For img2text / showandtell ``input`` is the image embedding and
    ``initial_state`` is None (a zero state is used).

    NOTICE: ``sequence`` is assumed to be padded with zeros and at least one
    instance must be full length (no zero).

    TODO: exact_prob and exact_loss give the same value, exact_prob may be
    removed.

    Side effects: sets ``self.final_state`` and appends to the graph
    collections ``'sequence'``, ``'sequence_length'`` and ``'outputs'``.

    Args:
        sequence: target token ids.
        initial_state: decoder start state; zero state when None.
        attention_states: encoder outputs; when given, the cell returned by
            ``self.prepare_attention`` is used instead of ``self.cell``.
        input: optional extra first-step input (e.g. an image embedding).
        input_text: unused here (only the unfinished copy mode read it).
        exact_prob: at predict time, score with ``exact_predict_loss``.
        exact_loss: at predict time, use the full (non sampled) softmax loss.
        emb: embedding table; ``self.emb`` when None.

    Returns:
        Loss tensor reshaped to ``[-1, 1]``; at predict time it is also
        passed through ``self.normalize_length``.

    Raises:
        NotImplementedError: if ``FLAGS.copy_only`` is set.
    """
    if FLAGS.copy_only:
        # The copy-only branch was never finished ("now not work!"): it bound
        # neither the logits nor the loss function, so the function always
        # died later with UnboundLocalError. Fail fast and explicitly before
        # any graph ops are created.
        raise NotImplementedError(
            'FLAGS.copy_only is not supported by this sequence_loss')

    if emb is None:
        emb = self.emb

    is_training = self.is_training
    batch_size = tf.shape(sequence)[0]

    sequence, sequence_length = melt.pad(sequence,
                                         start_id=self.get_start_id(),
                                         end_id=self.get_end_id())

    # [batch_size, num_steps - 1, emb_dim], remove last col
    inputs = tf.nn.embedding_lookup(emb, sequence[:, :-1])

    if is_training and FLAGS.keep_prob < 1:
        inputs = tf.nn.dropout(inputs, FLAGS.keep_prob)

    # inputs [batch_size, num_steps, emb_dim];
    # input [batch_size, emb_dim] -> [batch_size, 1, emb_dim] before concat
    if input is not None:
        # Used like showandtell where image_emb is an input in addition to
        # the sequence.
        inputs = tf.concat([tf.expand_dims(input, 1), inputs], 1)
    else:
        # Common usage: sequence itself is the input. <GO> was already padded
        # by melt.pad, so drop it from the targets.
        sequence_length -= 1
        sequence = sequence[:, 1:]

    if self.is_predict:
        # Only needed when predicting, since train input already has dynamic
        # length. NOTICE: this improves speed a lot.
        num_steps = tf.to_int32(tf.reduce_max(sequence_length))
        sequence = sequence[:, :num_steps]
        inputs = inputs[:, :num_steps, :]

    tf.add_to_collection('sequence', sequence)
    tf.add_to_collection('sequence_length', sequence_length)

    if attention_states is None:
        cell = self.cell
    else:
        cell = self.prepare_attention(attention_states,
                                      initial_state=initial_state)
        # The caller's state was handed to prepare_attention; the RNN itself
        # starts from the attention cell's own zero state.
        initial_state = cell.zero_state(batch_size, tf.float32)

    state = cell.zero_state(
        batch_size, tf.float32) if initial_state is None else initial_state

    # TODO: the attention wrapper works with dynamic_rnn but is still slower
    # than the old function based attention. A TrainingHelper +
    # BasicTrainingDecoder + dynamic_decode variant is also ok but was
    # measured slower (16+ vs 13-14 batch/s).
    outputs, state = tf.nn.dynamic_rnn(cell,
                                       inputs,
                                       initial_state=state,
                                       sequence_length=sequence_length,
                                       dtype=tf.float32,
                                       scope=self.scope)

    self.final_state = state

    tf.add_to_collection('outputs', outputs)

    # [batch_size, num_steps]
    targets = sequence

    # TODO: hack here, exact_prob / exact_loss exist just for
    # Seq2seqPredictor exact_predict.
    softmax_loss_function = self.softmax_loss_function
    if self.is_predict and (exact_prob or exact_loss):
        softmax_loss_function = None

    if softmax_loss_function is None:
        # [batch_size, num_steps, num_units] * [num_units, vocab_size]
        # -> logits [batch_size, num_steps, vocab_size] (exact predict loss)
        # or [batch_size * num_steps, vocab_size] when flattened (default).
        keep_dims = exact_prob or exact_loss
        logits = melt.batch_matmul_embedding(
            outputs, self.w, keep_dims=keep_dims) + self.v
        if not keep_dims:
            targets = tf.reshape(targets, [-1])
    else:
        # The sampled loss function does the projection itself.
        logits = outputs

    # Padding id 0 gets weight 0.
    mask = tf.cast(tf.sign(targets), dtype=tf.float32)

    if self.is_predict and exact_prob:
        # Generate the real probability of the sequence.
        loss = melt.seq2seq.exact_predict_loss(logits, targets, mask,
                                               num_steps, batch_size)
    elif self.is_predict and exact_loss:
        # Force the non sampled softmax loss. The difference with exact_prob
        # is that the cross entropy error is the result, not the real
        # probability of the sequence.
        # TODO: check more with softmax loss and sampled softmax loss, check
        # length normalization.
        loss = melt.seq2seq.sequence_loss_by_example(logits,
                                                     targets,
                                                     weights=mask)
    else:
        # loss [batch_size,]
        loss = melt.seq2seq.sequence_loss_by_example(
            logits,
            targets,
            weights=mask,
            softmax_loss_function=softmax_loss_function)

    # Mainly for compatibility with [batch_size, num_losses].
    loss = tf.reshape(loss, [-1, 1])

    if self.is_predict:
        loss = self.normalize_length(loss, sequence_length, exact_prob)
        # TODO: tf.squeeze(loss) once all models are rerun.

    return loss
def sequence_loss(self, sequence, initial_state=None, attention_states=None, input=None, input_text=None, exact_prob=False, exact_loss=False, emb=None):
    """
    Build the teacher-forced decoding loss, with optional copy / gen-copy modes.

    For general seq2seq input is None, sequence gets a <GO> mark padded in
    front and initial_state is the last state from the encoder.
    For img2text/showandtell input is the image embedding and initial_state
    is None (a zero state is used).

    TODO since exact_prob and exact_loss give the same value, exact_prob may
    be removed.
    NOTICE! sequence is assumed to be padded with zeros and at least one
    instance must be full length (no zero!).

    Modes, selected by flags: FLAGS.gen_only decodes with tf.nn.dynamic_rnn;
    otherwise a BasicTrainingDecoder with a copy (FLAGS.copy_only) or gen-copy
    output function is run through dynamic_decode.

    Side effects: appends to the graph collections 'sequence',
    'sequence_length', 'outputs' and 'logits'.

    Returns the loss reshaped to [-1, 1]; at predict time it is also passed
    through self.normalize_length.
    """
    if emb is None:
        emb = self.emb

    is_training = self.is_training
    batch_size = melt.get_batch_size(sequence)

    sequence, sequence_length = melt.pad(sequence, start_id=self.get_start_id(), end_id=self.get_end_id())

    # [batch_size, num_steps - 1, emb_dim], remove last col
    inputs = tf.nn.embedding_lookup(emb, sequence[:, :-1])

    if is_training and FLAGS.keep_prob < 1:
        inputs = tf.nn.dropout(inputs, FLAGS.keep_prob)

    # inputs [batch_size, num_steps, emb_dim];
    # input [batch_size, emb_dim] -> [batch_size, 1, emb_dim] before concat
    if input is not None:
        # used like showandtell where image_emb is an input in addition to the sequence
        inputs = tf.concat([tf.expand_dims(input, 1), inputs], 1)
    else:
        # common usage: input is None and sequence is the input; <GO> was
        # already padded by melt.pad, so drop it from the targets
        sequence_length -= 1
        sequence = sequence[:, 1:]

    if self.is_predict:
        # only needed when predicting, since train input already has dynamic
        # length. NOTICE this improves speed a lot
        num_steps = tf.to_int32(tf.reduce_max(sequence_length))
        sequence = sequence[:, :num_steps]
        inputs = inputs[:, :num_steps, :]

    tf.add_to_collection('sequence', sequence)
    tf.add_to_collection('sequence_length', sequence_length)

    # [batch_size, num_steps]
    targets = sequence

    if attention_states is None:
        cell = self.cell
    else:
        cell = self.prepare_attention(attention_states, initial_state=initial_state, score_as_alignment=self.score_as_alignment)
        # the caller's state was handed to prepare_attention; clearing it
        # makes the line below fall back to the attention cell's zero state
        initial_state = None
    state = cell.zero_state(batch_size, tf.float32) if initial_state is None else initial_state

    if FLAGS.gen_only:
        # gen only mode
        # TODO: the attention wrapper supposedly can not use dynamic_rnn if
        # alignments_history=True, yet pointer_network in application seems ok.. why
        outputs, state = tf.nn.dynamic_rnn(cell, inputs, initial_state=state, sequence_length=sequence_length, dtype=tf.float32, scope=self.scope)
        # A TrainingHelper + BasicTrainingDecoder + dynamic_decode variant is
        # also ok here but slower than dynamic_rnn (3.4 -> 3.1 batch/s).
    else:
        # copy only or gen copy
        helper = melt.seq2seq.TrainingHelper(inputs, tf.to_int32(sequence_length))
        indices = melt.batch_values_to_indices(tf.to_int32(input_text))
        if FLAGS.copy_only:
            output_fn = lambda cell_output, cell_state: self.copy_output_fn(indices, batch_size, cell_output, cell_state)
        else:
            # gen_copy right now, switch not used
            sampled_values = None
            if self.softmax_loss_function is not None:
                sampled_values = tf.nn.log_uniform_candidate_sampler(true_classes=tf.reshape(targets, [-1, 1]), num_true=1, num_sampled=self.num_sampled, unique=True, range_max=self.vocab_size)
            # TODO: the perf of the sampled version is ok so it is not modified
            # now; in addition to sampled_values, sampled_w and sampled_b could
            # also be looked up beforehand, though it may not improve much
            # NOTE(review): this lambda takes a leading `time` argument while
            # the copy_only one above does not -- confirm against the decoder.
            output_fn = lambda time, cell_output, cell_state: self.gen_copy_output_train_fn(time, indices, targets, sampled_values, batch_size, cell_output, cell_state)

        my_decoder = melt.seq2seq.BasicTrainingDecoder(cell=cell, helper=helper, initial_state=state, vocab_size=self.vocab_size, output_fn=output_fn)
        outputs, state, _ = tf.contrib.seq2seq.dynamic_decode(my_decoder, scope=self.scope)
        # melt.seq2seq.dynamic_decode(my_decoder, scope=self.scope) is the alternative

    tf.add_to_collection('outputs', outputs)

    # TODO: hack here, exact_prob / exact_loss exist just for Seq2seqPredictor exact_predict
    softmax_loss_function = self.softmax_loss_function
    if self.is_predict and (exact_prob or exact_loss):
        softmax_loss_function = None

    if not FLAGS.gen_only:
        # in copy / gen-copy mode the decoder outputs are used directly as logits
        logits = outputs
        softmax_loss_function = None
    elif softmax_loss_function is not None:
        logits = outputs
    else:
        # [batch_size, num_steps, num_units] * [num_units, vocab_size]
        # -> logits [batch_size, num_steps, vocab_size] (if exact predict loss is used)
        # or [batch_size * num_steps, vocab_size] when flattened (default)
        keep_dims = exact_prob or exact_loss
        logits = melt.batch_matmul_embedding(outputs, self.w, keep_dims=keep_dims) + self.v
        if not keep_dims:
            targets = tf.reshape(targets, [-1])

    tf.add_to_collection('logits', logits)

    # padding id 0 gets weight 0
    mask = tf.cast(tf.sign(targets), dtype=tf.float32)

    if FLAGS.gen_copy_switch:
        # TODO why does this need more gpu memory? do not save logits? just
        # calc the loss in output_fn? With batch size 256 the author hit an OOM
        # allocating a tensor of shape [256,21,33470] at
        # `step_logits = logits[:, i, :]` in melt/seq2seq/loss.py.
        num_steps = tf.shape(targets)[1]
        loss = melt.seq2seq.exact_predict_loss(logits, targets, mask, num_steps, need_softmax=False, need_average=True, batch_size=batch_size)
    elif self.is_predict and exact_prob:
        # generate the real probability of the sequence
        loss = melt.seq2seq.exact_predict_loss(logits, targets, mask, num_steps, batch_size=batch_size)
    elif self.is_predict and exact_loss:
        # force the non sampled softmax loss; the difference with exact_prob is
        # that the cross entropy error is the result, not the real probability
        # of the sequence
        loss = melt.seq2seq.sequence_loss_by_example(logits, targets, weights=mask)
    else:
        # loss [batch_size,]
        loss = melt.seq2seq.sequence_loss_by_example(logits, targets, weights=mask, softmax_loss_function=softmax_loss_function)

    # mainly for compatibility with [batch_size, num_losses]
    loss = tf.reshape(loss, [-1, 1])

    if self.is_predict:
        loss = self.normalize_length(loss, sequence_length, exact_prob)
        # TODO: tf.squeeze(loss) once all models are rerun

    return loss
def sequence_loss(self, sequence, initial_state=None, attention_states=None, input=None, input_text=None, exact_prob=False, exact_loss=False, emb=None):
    """
    Build the teacher-forced decoding loss, with copy modes and scheduled sampling.

    For general seq2seq input is None, sequence gets a <GO> mark padded in
    front and initial_state is the last state from the encoder.
    For showandtell input is the image embedding and initial_state is None
    (a zero state is used). If the im2txt mode is used
    (image_as_init_state=True) it behaves like general seq2seq and <GO> must
    be padded!

    TODO since exact_prob and exact_loss give the same value, exact_prob may
    be removed.
    NOTICE! sequence is assumed to be padded with zeros and at least one
    instance must be full length (no zero!).

    Side effects: sets self.ori_loss and appends to the graph collections
    'sequence', 'sequence_length', 'outputs' and 'logits'.

    Returns the loss reshaped to [-1, 1]; at predict time the value returned
    is that loss passed through self.normalize_length.
    """
    if emb is None:
        emb = self.emb

    is_training = self.is_training
    batch_size = melt.get_batch_size(sequence)

    sequence, sequence_length = melt.pad(sequence, start_id=self.get_start_id(), end_id=self.get_end_id())

    # [batch_size, num_steps - 1, emb_dim], remove last col
    inputs = tf.nn.embedding_lookup(emb, sequence[:, :-1])

    if is_training and FLAGS.keep_prob < 1:
        inputs = tf.nn.dropout(inputs, FLAGS.keep_prob)

    # inputs [batch_size, num_steps, emb_dim];
    # input [batch_size, emb_dim] -> [batch_size, 1, emb_dim] before concat
    if input is not None:
        # used like showandtell where image_emb is an input in addition to the sequence
        inputs = tf.concat([tf.expand_dims(input, 1), inputs], 1)
    else:
        # common usage: input is None and sequence is the input; <GO> was
        # already padded by melt.pad, so drop it from the targets
        sequence_length -= 1
        sequence = sequence[:, 1:]

    if self.is_predict:
        # only needed when predicting, since train input already has dynamic
        # length. NOTICE this improves speed a lot
        num_steps = tf.to_int32(tf.reduce_max(sequence_length))
        sequence = sequence[:, :num_steps]
        inputs = inputs[:, :num_steps, :]

    tf.add_to_collection('sequence', sequence)
    tf.add_to_collection('sequence_length', sequence_length)

    # [batch_size, num_steps]
    targets = sequence

    if attention_states is None:
        cell = self.cell
    else:
        cell = self.prepare_attention(attention_states, initial_state=initial_state, score_as_alignment=self.score_as_alignment)
        # the caller's state was handed to prepare_attention; clearing it
        # makes the line below fall back to the attention cell's zero state
        initial_state = None
    state = cell.zero_state(batch_size, tf.float32) if initial_state is None else initial_state

    # TODO: hack here, exact_prob / exact_loss exist just for Seq2seqPredictor exact_predict
    softmax_loss_function = self.softmax_loss_function
    if self.is_predict and (exact_prob or exact_loss):
        softmax_loss_function = None

    # scheduled sampling is a training-only feature
    scheduled_sampling_probability = FLAGS.scheduled_sampling_probability if self.is_training else 0.
    if FLAGS.gen_only:
        # gen only mode
        # TODO: the attention wrapper supposedly can not use dynamic_rnn if
        # alignments_history=True, yet pointer_network in application seems ok.. why
        if scheduled_sampling_probability > 0.:
            helper = melt.seq2seq.ScheduledEmbeddingTrainingHelper(inputs, tf.to_int32(sequence_length), emb, tf.constant(FLAGS.scheduled_sampling_probability))
            my_decoder = melt.seq2seq.BasicDecoder(cell=cell, helper=helper, initial_state=state)
            outputs, state, _ = tf.contrib.seq2seq.dynamic_decode(my_decoder, scope=self.scope)
            # only the rnn_output field of the decoder output is used
            outputs = outputs.rnn_output
        else:
            outputs, state = tf.nn.dynamic_rnn(cell, inputs, initial_state=state, sequence_length=sequence_length, dtype=tf.float32, scope=self.scope)
            # A TrainingHelper + BasicTrainingDecoder + dynamic_decode variant
            # is also ok here but slower than dynamic_rnn (3.4 -> 3.1 batch/s).
    else:
        # copy only or gen copy
        if scheduled_sampling_probability > 0.:
            # not tested yet TODO
            helper = melt.seq2seq.ScheduledEmbeddingTrainingHelper(inputs, tf.to_int32(sequence_length), emb, tf.constant(FLAGS.scheduled_sampling_probability))
            Decoder_ = melt.seq2seq.BasicDecoder
        else:
            # as before
            helper = melt.seq2seq.TrainingHelper(inputs, tf.to_int32(sequence_length))
            Decoder_ = melt.seq2seq.BasicTrainingDecoder

        indices = melt.batch_values_to_indices(tf.to_int32(input_text))
        if FLAGS.copy_only:
            output_fn = lambda cell_output, cell_state: self.copy_output_fn(indices, batch_size, cell_output, cell_state)
        else:
            # gen_copy right now, switch not used? gen_copy and switch?
            sampled_values = None
            # TODO CHECK is this ok? why are train and predict not equal while
            # score / exact score are the same? FIXME
            # first debug why score and exact score are the same; score should
            # be the same as train! TODO
            # run `sh ./inference/infrence-score.sh` to reproduce
            # for now just set num_sampled = 0 to be safe; train may also not
            # be correct here FIXME
            if softmax_loss_function is not None:
                sampled_values = tf.nn.log_uniform_candidate_sampler(true_classes=tf.reshape(targets, [-1, 1]), num_true=1, num_sampled=self.num_sampled, unique=True, range_max=self.vocab_size)
            # TODO: the perf of the sampled version is ok so it is not modified
            # now; in addition to sampled_values, sampled_w and sampled_b could
            # also be looked up beforehand, though it may not improve much
            # NOTE(review): this lambda takes a leading `time` argument while
            # the copy_only one above does not -- confirm against the decoder.
            output_fn = lambda time, cell_output, cell_state: self.gen_copy_output_train_fn(time, indices, targets, sampled_values, batch_size, cell_output, cell_state)

        my_decoder = Decoder_(cell=cell, helper=helper, initial_state=state, vocab_size=self.vocab_size, output_fn=output_fn)
        outputs, state, _ = tf.contrib.seq2seq.dynamic_decode(my_decoder, scope=self.scope)
        # melt.seq2seq.dynamic_decode(my_decoder, scope=self.scope) is the alternative
        # unwrap when the decoder output exposes an rnn_output field
        if hasattr(outputs, 'rnn_output'):
            outputs = outputs.rnn_output

    tf.add_to_collection('outputs', outputs)

    if not FLAGS.gen_only:
        # in copy / gen-copy mode the decoder outputs are used directly as logits
        logits = outputs
        softmax_loss_function = None
    elif softmax_loss_function is not None:
        logits = outputs
    else:
        # softmax_loss_function is None means num_sampled = 0 or exact_loss or exact_prob
        # [batch_size, num_steps, num_units] * [num_units, vocab_size]
        # -> logits [batch_size, num_steps, vocab_size] (if exact predict loss is used)
        # or [batch_size * num_steps, vocab_size] when flattened (default).
        # Flattening is fine for train but not good for eval, where the score
        # of each instance is wanted, and also not good for predict.
        # Only training mode does not keep dims, but this is dangerous: code
        # calling rnn_decoder needs to manually set rnn_decoder.is_training=False!
        # TODO otherwise incorrect scores are shown in eval mode, although the
        # final model is not affected.
        keep_dims = exact_prob or exact_loss or (not self.is_training)
        logits = melt.batch_matmul_embedding(outputs, self.w, keep_dims=keep_dims) + self.v
        if not keep_dims:
            targets = tf.reshape(targets, [-1])

    tf.add_to_collection('logits', logits)

    # padding id 0 gets weight 0
    mask = tf.cast(tf.sign(targets), dtype=tf.float32)

    if FLAGS.gen_copy_switch and FLAGS.switch_after_softmax:
        # TODO why does this need more gpu memory? do not save logits? just
        # calc the loss in output_fn? With batch size 256 the author hit an OOM
        # allocating a tensor of shape [256,21,33470] at
        # `step_logits = logits[:, i, :]` in melt/seq2seq/loss.py.
        num_steps = tf.shape(targets)[1]
        loss = melt.seq2seq.exact_predict_loss(logits, targets, mask, num_steps, need_softmax=False, average_across_timesteps=not self.is_predict, batch_size=batch_size)
    elif self.is_predict and exact_prob:
        # generate the real probability of the sequence
        loss = melt.seq2seq.exact_predict_loss(logits, targets, mask, num_steps, batch_size=batch_size, average_across_timesteps=False)
    elif self.is_predict and exact_loss:
        # force the non sampled softmax loss; the difference with exact_prob is
        # that the cross entropy error is the result, not the real probability
        # of the sequence
        loss = melt.seq2seq.sequence_loss_by_example(logits, targets, weights=mask, average_across_timesteps=False)
    else:
        # loss [batch_size,]
        loss = melt.seq2seq.sequence_loss_by_example(logits, targets, weights=mask, average_across_timesteps=not self.is_predict,  # train must average, otherwise long sentences get a big loss
                                                     softmax_loss_function=softmax_loss_function)

    # mainly for compatibility with [batch_size, num_losses]; here it may be
    # [batch_size * num_steps,] if is_training and not exact loss/prob
    loss = tf.reshape(loss, [-1, 1])

    self.ori_loss = loss

    if self.is_predict:
        # note: avg_loss is used so that the `loss` name keeps pointing at the
        # un-normalized value stored in self.ori_loss
        avg_loss = self.normalize_length(loss, sequence_length)
        return avg_loss

    # if not is_predict the loss is already averaged per time step
    return loss
def sequence_loss(self, input, sequence, initial_state=None, emb=None):
    """
    Build the teacher-forced decoding loss with a plain dynamic_rnn decoder.

    input: optional extra first-step input (e.g. an image embedding as in
      showandtell); when None the <GO>-padded sequence itself is the input.
    sequence: target token ids, padded with <GO>/<END> via melt.pad.
    initial_state: decoder start state; the cell's zero state when None.
    emb: embedding table; self.emb when None.

    Side effect: sets self.final_state.

    Returns the loss reshaped to [-1, 1]; at predict time it is also passed
    through self.normalize_length.

    NOTE(review): tf.concat(1, [...]) below uses the pre-1.0 TensorFlow
    argument order (axis first) -- confirm the TF version before touching it.
    """
    if emb is None:
        emb = self.emb

    is_training = self.is_training
    batch_size = tf.shape(sequence)[0]

    sequence, sequence_length = melt.pad(sequence, start_id=self.get_start_id(), end_id=self.get_end_id())

    # TODO different init state as shown in ptb_word_lm
    state = self.cell.zero_state(batch_size, tf.float32) if initial_state is None else initial_state

    # [batch_size, num_steps - 1, emb_dim], remove last col
    inputs = tf.nn.embedding_lookup(emb, melt.dynamic_exclude_last_col(sequence))

    if is_training and FLAGS.keep_prob < 1:
        inputs = tf.nn.dropout(inputs, FLAGS.keep_prob)

    # inputs [batch_size, num_steps, emb_dim];
    # input [batch_size, emb_dim] -> [batch_size, 1, emb_dim] before concat
    if input is not None:
        # used like showandtell where image_emb is an input in addition to the sequence
        inputs = tf.concat(1, [tf.expand_dims(input, 1), inputs])
    else:
        # common usage: input is None and sequence is the input; <GO> was
        # already padded by melt.pad, so drop it from the targets
        sequence_length -= 1
        sequence = sequence[:, 1:]

    if self.is_predict:
        # only needed when predicting, since train input already has dynamic
        # length. NOTICE this improves speed a lot
        # NOTE(review): only `sequence` is sliced to num_steps here, `inputs`
        # is not -- confirm the RNN outputs and targets still line up.
        num_steps = tf.cast(tf.reduce_max(sequence_length), dtype=tf.int32)
        sequence = tf.slice(sequence, [0,0], [-1, num_steps])

    outputs, state = tf.nn.dynamic_rnn(self.cell, inputs, initial_state=state, sequence_length=sequence_length)
    self.final_state = state

    if self.softmax_loss_function is None:
        # [batch_size, num_steps, num_units] * [num_units, vocab_size]
        # -> logits [batch_size, num_steps, vocab_size]
        logits = melt.batch_matmul_embedding(outputs, self.w) + self.v
    else:
        # the custom loss function receives the raw RNN outputs
        logits = outputs

    # [batch_size, num_steps]
    targets = sequence
    # padding id 0 gets weight 0
    mask = tf.cast(tf.sign(sequence), dtype=tf.float32)

    if self.is_predict and FLAGS.predict_no_sample:
        # NOTE(review): when softmax_loss_function is set, `logits` here are
        # the unprojected RNN outputs -- confirm exact_predict_loss expects that.
        loss = melt.seq2seq.exact_predict_loss(logits, batch_size, num_steps)
    else:
        # loss [batch_size,]
        loss = melt.seq2seq.sequence_loss_by_example(logits, targets, weights=mask, softmax_loss_function=self.softmax_loss_function)

    # mainly for compatibility with [batch_size, num_losses]
    loss = tf.reshape(loss, [-1, 1])

    if self.is_predict:
        loss = self.normalize_length(loss, sequence_length)

    return loss