def read_data_test(source_path, _beam_buckets):
    """Read a source-only test file and distribute its lines over buckets.

    Each line of ``source_path`` is split on whitespace, converted to
    integers and stored in reversed order. A line is assigned to the first
    bucket whose size is at least the line's length; lines that fit no
    bucket are logged and not stored.

    Args:
        source_path: path of the file holding one id sequence per line.
        _beam_buckets: iterable of maximum source lengths, one per bucket.

    Returns:
        ``(data_set, order, counter)`` where ``data_set[b]`` lists the
        sequences stored in bucket ``b``, ``order`` holds one
        ``(bucket_id, index_in_bucket, line_number)`` tuple per stored line
        in file order, and ``counter`` is the number of lines read.
    """
    data_set = [[] for _ in _beam_buckets]
    order = []
    counter = 0
    with tf.gfile.GFile(source_path, mode="r") as source_file:
        line = source_file.readline()
        while line:
            # Progress message every 100000 lines (including line 0).
            if counter % 100000 == 0:
                print(" reading data line %d" % counter)
                sys.stdout.flush()

            token_ids = [int(token) for token in line.split()]
            token_ids.reverse()
            n_tokens = len(token_ids)

            # First bucket that is large enough, or None when none is.
            chosen = next(
                (idx for idx, limit in enumerate(_beam_buckets)
                 if n_tokens <= limit),
                None)
            if chosen is None:
                mylog("failed source length {}".format(n_tokens))
            else:
                bucket = data_set[chosen]
                order.append((chosen, len(bucket), counter))
                bucket.append(token_ids)

            counter += 1
            line = source_file.readline()
    return data_set, order, counter
def init_parameters_from_scratch(self, sess):
    """Initialise the model from scratch inside ``sess``.

    Runs ``self.var_init_op`` followed by ``self.broadcast_ops`` and then
    logs the output bias of every model via ``check_output_bias``.
    """
    mylog("Created model with fresh parameters.")
    # Attributes are looked up one at a time, right before they are run.
    for op_attr in ("var_init_op", "broadcast_ops"):
        sess.run(getattr(self, op_attr))
    # Logged so the per-model parameters can be compared after the broadcast.
    self.check_output_bias(sess)
def beam_with_buckets(self, sources, sources_raw, inputs, source_buckets, encoder_cell, decoder_cell, dtype, devices = None, attention = False):
    """Build the single-step decoding graph used by beam search.

    Runs the beam seq2seq network (the attention variant when
    ``self.with_attention`` is set), projects the hidden state onto the
    output embedding, applies a softmax and stores the results on ``self``:

    * ``self.topk_value`` / ``self.topk_index`` -- ``tf.nn.top_k`` of the
      softmax with ``k = self.topk_n``.
    * ``self.eos_value`` -- the softmax column at ``self.EOS_ID``.

    ``source_buckets`` and ``attention`` are accepted but not read here; the
    attention choice comes from ``self.with_attention``.
    """
    self.topk_values = []
    self.eos_values = []
    self.topk_indexes = []

    current_scope = variable_scope.get_variable_scope()
    with variable_scope.variable_scope(current_scope, reuse=None):
        # seq2seq step
        if self.with_attention:
            _ht, _ = self.beam_attention_seq2seq(
                encoder_cell, decoder_cell, sources, sources_raw, inputs,
                dtype, devices)
        else:
            _ht, _ = self.beam_basic_seq2seq(
                encoder_cell, decoder_cell, sources, inputs, dtype, devices)

        # Flatten to two dimensions with self.size columns.
        hidden = tf.reshape(_ht, [-1, self.size])

        radius = self.normalize_ht_radius
        if radius != 0.0:
            mylog("Normalize_ht: radius: {}".format(radius))
            hidden = tf.nn.l2_normalize(hidden, 1) * radius

        # Output projection followed by softmax.
        projected = tf.matmul(hidden, self.output_embedding,
                              transpose_b=True)
        probabilities = tf.nn.softmax(tf.add(projected, self.output_bias))

        if self.with_fsa:
            probabilities = self.mask_target(
                probabilities, self.fsa_target_mask, self.mask_values)

        # top-k candidates and the EOS column
        self.topk_value, self.topk_index = tf.nn.top_k(
            probabilities, self.topk_n, sorted=True)
        self.eos_value = tf.slice(probabilities, [0, self.EOS_ID], [-1, 1])
def load_parameters(self, sess, path):
    """Restore saved parameters from ``path`` into ``sess``.

    After ``self.saver`` has restored the checkpoint, ``self.broadcast_ops``
    is run and the output bias of every model is logged.
    """
    mylog("Reading model parameters from %s" % path)

    checkpoint_saver = self.saver
    checkpoint_saver.restore(sess, path)

    post_restore_ops = self.broadcast_ops
    sess.run(post_restore_ops)

    # Logged so the per-model parameters can be compared after the broadcast.
    self.check_output_bias(sess)
def model_with_buckets(self, sources, sources_raw, inputs, targets, weights, buckets, encoder_cell, decoder_cell, dtype, softmax_loss_function, per_example_loss=False, name=None, devices = None, attention = False, rare_weights = None):
    """Build the training graph: seq2seq, output projection and losses.

    Results are stored on ``self``:

    * ``self.hts`` -- flattened (and optionally radius-normalised) hidden
      states.
    * ``self.logits`` -- projected logits, or the hidden states themselves
      when ``self.with_sampled_softmax`` is set.
    * ``self.losses_by_words`` -- output of ``softmax_loss_function``.
    * ``self.losses`` -- weighted loss sum divided by
      ``self.global_batch_size``. With ``self.rare_weight`` this uses
      ``rare_weights`` and the ordinary loss goes to ``self.normal_losses``.
    * ``self.mrt_loss`` -- only when ``self.mrt`` is set.

    ``buckets``, ``per_example_loss`` and ``name`` are accepted but not read.
    """
    seq2seq_f = self.attention_seq2seq if attention else self.basic_seq2seq

    with variable_scope.variable_scope(variable_scope.get_variable_scope()):
        _hts, decoder_state = seq2seq_f(
            encoder_cell, decoder_cell, sources, sources_raw, inputs, dtype,
            devices)

        # Flatten hidden states to two dimensions with self.size columns.
        _hts = tf.reshape(_hts, [-1, self.size])

        radius = self.normalize_ht_radius
        if radius != 0.0:
            mylog("Normalize_ht: radius: {}".format(radius))
            _hts = tf.nn.l2_normalize(_hts, 1) * radius

        flat_targets = tf.reshape(targets, [-1])
        flat_weights = tf.reshape(weights, [-1])

        # logits / loss
        with tf.device(devices[-1]):
            if self.with_sampled_softmax:
                # The sampled loss does the projection itself.
                logits = _hts
            else:
                projected = tf.matmul(_hts, self.output_embedding,
                                      transpose_b=True)
                logits = tf.add(projected, self.output_bias)

            crossent = softmax_loss_function(logits, flat_targets)

            cost = math_ops.reduce_sum(crossent * flat_weights)
            cost = cost / math_ops.cast(self.global_batch_size, cost.dtype)

            if self.rare_weight:
                flat_rare_weights = tf.reshape(rare_weights, [-1])
                rare_cost = math_ops.reduce_sum(crossent * flat_rare_weights)
                rare_cost = rare_cost / math_ops.cast(
                    self.global_batch_size, cost.dtype)

            if self.mrt:
                n_groups = self.num_sentences_per_batch_in_mrt
                crossent_batch_length = tf.reshape(
                    crossent * flat_weights, [self.batch_size, -1])
                # alpha_log_p = alpha * log p(sentence), one value per row
                alpha_log_p = - math_ops.reduce_sum(
                    crossent_batch_length, axis=1) * self.mrt_alpha
                alpha_log_p = tf.reshape(alpha_log_p, [n_groups, -1])
                q = tf.nn.softmax(alpha_log_p)
                negative_bleu_scores = - tf.reshape(
                    self.bleu_scores, [n_groups, -1])
                expected_risk = math_ops.reduce_sum(q * negative_bleu_scores)
                mrt_loss = expected_risk / math_ops.cast(n_groups, q.dtype)
                self.mrt_loss = mrt_loss

        self.logits = logits
        self.losses_by_words = crossent  # one-dimensional, one entry per word
        if self.rare_weight:
            self.losses = rare_cost
            self.normal_losses = cost
        else:
            self.losses = cost
        self.hts = _hts
def print_current_beam(self, j, bc, finished=False):
    """Log one beam cell; finished hypotheses are prefixed with ``*``.

    Args:
        j: rank of the cell in the current candidate list.
        bc: beam cell exposing ``beam_index``, ``word_index``, ``score`` and,
            when ``self.with_fsa`` is set, ``fsa_state``.
        finished: whether the cell completes a sentence.
    """
    if self.with_fsa:
        template = "Beam:{} Father:{} word:{} state:{} score:{}"
        fields = (j, bc.beam_index, bc.word_index, bc.fsa_state, bc.score)
    else:
        template = "Beam:{} Father:{} word:{} score:{}"
        fields = (j, bc.beam_index, bc.word_index, bc.score)

    line = template.format(*fields)
    mylog("*" + line if finished else line)
def report_statics(self):
    """Log summary statistics of the FSA.

    Reports the number of states, the number of links, and the names of the
    start and end states.
    """
    mylog_section("FSA")
    mylog_subsection("FSA Info")

    n_states = len(self.states)
    mylog("Number of States: {}".format(n_states))

    n_links = self.num_links
    mylog("Number of Links: {}".format(n_links))

    start_name = self.start_state.name
    mylog("Start state: {}".format(start_name))

    end_name = self.end_state.name
    mylog("End state: {}".format(end_name))
def read_data_test_parallel(source_path, target_path, _buckets):
    """Read aligned source/target test files and bucket the sentence pairs.

    Source ids are stored reversed; target ids get ``data_utils.EOS_ID``
    appended. A pair goes to the first bucket whose source and target limits
    both accommodate it; pairs that fit no bucket are logged and not stored.
    Reading stops when the source file is exhausted.

    Args:
        source_path: file with one whitespace-separated id sequence per line.
        target_path: file with the corresponding target sequences.
        _buckets: iterable of ``(source_size, target_size)`` limits.

    Returns:
        ``(data_set, order, counter)`` where ``data_set[b]`` lists the
        ``[source_ids, target_ids]`` pairs of bucket ``b``, ``order`` holds
        one ``(bucket_id, index_in_bucket, line_number)`` tuple per stored
        pair, and ``counter`` is the number of source lines read.
    """

    def _ids_from_line(text):
        # Blank lines produce an empty id list.
        if text.strip() == '':
            return []
        return np.fromstring(text, dtype=int, sep=' ').tolist()

    data_set = [[] for _ in _buckets]
    order = []
    counter = 0
    with tf.gfile.GFile(source_path, mode="r") as source_file, \
            tf.gfile.GFile(target_path, mode="r") as target_file:
        source = source_file.readline()
        target = target_file.readline()
        while source:
            if counter % 100000 == 0:
                print(" reading data line %d" % counter)
                sys.stdout.flush()

            source_ids = _ids_from_line(source)[::-1]
            target_ids = _ids_from_line(target)
            target_ids.append(data_utils.EOS_ID)

            n_source = len(source_ids)
            n_target = len(target_ids)
            # First bucket that fits both sides, or None when none does.
            chosen = next(
                (idx for idx, (source_size, target_size) in enumerate(_buckets)
                 if n_source <= source_size and n_target <= target_size),
                None)
            if chosen is None:
                mylog("failed source length {}".format(n_source))
            else:
                bucket = data_set[chosen]
                order.append((chosen, len(bucket), counter))
                bucket.append([source_ids, target_ids])

            source = source_file.readline()
            target = target_file.readline()
            counter += 1
    return data_set, order, counter
def decode(self):
    """Run beam search and return the best finished hypothesis.

    For up to ``self.max_target_length`` steps the decoder runs one RNN
    step, selects the top beam cells and grows the hypotheses. It stops
    early once no valid beam is left. Finished hypotheses in
    ``self.results`` are then scored with ``get_normalized_score`` and
    sorted by descending ``normalized_score``.

    Returns:
        ``(best_sentence, best_score, attention_history)``. When no
        hypothesis finished, returns ``([], 0.0, None)`` and logs a message.
    """
    for i in xrange(self.max_target_length):
        # rnn_step returns an extra attention score when attention is checked
        if self.check_attention:
            top_value, top_index, eos_value, attention_score = self.rnn_step(
                i)
        else:
            top_value, top_index, eos_value = self.rnn_step(i)
            attention_score = None

        # top_beam_cells = [BeamCell]
        top_beam_cells = self.get_top_beam_cells(i, top_value, top_index,
                                                 eos_value)

        # grow sentence
        self.grow_sentence(i, top_beam_cells,
                           attention_score=attention_score)

        if self.valid_beam_size_last_step <= 0:
            break

    # add the length penalty
    for entry in self.results:
        entry.get_normalized_score(self.length_alpha, self.coverage_beta)

    # best hypothesis first
    self.results = sorted(self.results, key=lambda x: -x.normalized_score)

    # Check for emptiness BEFORE touching results[0]: indexing an empty list
    # raised IndexError and made this fallback unreachable.
    if not self.results:
        mylog("No decoding results.")
        return [], 0.0, None

    best = self.results[0]
    print(best)
    return best.finished_sentence, best.normalized_score, best.attention_history
def step(self, session, sources_per_model, inputs_per_model, targets_per_model, target_weights_per_model, bucket_id, forward_only=False):
    """Run one step across all model replicas.

    With ``forward_only`` the call is delegated unchanged to
    ``self.models[0].step``, so the ``*_per_model`` arguments must then have
    whatever shape that method expects. Otherwise every replica's
    placeholders are fed from the matching entry of the ``*_per_model``
    lists and the combined losses, updates, gradient norms and logits are
    evaluated in one ``session.run``.

    Args:
        session: TensorFlow session used to run the graph.
        sources_per_model, inputs_per_model, targets_per_model,
        target_weights_per_model: per-replica feed values, indexed by model.
        bucket_id: only forwarded on the ``forward_only`` path.
        forward_only: evaluate without applying updates.

    Returns:
        On the training path, ``(outputs[0], outputs[2][0])``: the fetched
        losses and the first entry of the fetched gradient norms. If either
        is NaN/Inf, the targets, sources and logits are dumped with
        ``np.savetxt`` and ``None`` is returned, so a caller unpacking two
        values fails loudly. On the ``forward_only`` path, whatever
        ``self.models[0].step`` returns.
    """
    if forward_only:
        # Evaluation (usually of the dev set) only needs the first replica.
        return self.models[0].step(session,
                                   sources_per_model,
                                   inputs_per_model,
                                   targets_per_model,
                                   target_weights_per_model,
                                   bucket_id,
                                   forward_only=forward_only)

    # From here on forward_only is always False, so the former
    # `if forward_only` / `if not forward_only` branches were dead code.

    # input feed: one set of placeholders per replica
    input_feed = {}
    for m, sources in enumerate(sources_per_model):
        model = self.models[m]
        input_feed[model.sources.name] = sources
        input_feed[model.inputs.name] = inputs_per_model[m]
        input_feed[model.targets.name] = targets_per_model[m]
        input_feed[model.target_weights.name] = target_weights_per_model[m]

    # Debug switch: also fetch the logits so they can be dumped on NaN/Inf.
    dump_logits_when_error = True

    # output feed: index 0 losses, 1 updates, 2 gradient norms, 3 logits
    output_feed = [self.losses, self.updates, self.gradient_norms]
    if dump_logits_when_error:
        output_feed.append(self.logits)

    outputs = session.run(output_feed,
                          input_feed,
                          options=self.run_options,
                          run_metadata=self.run_metadata)

    loss = outputs[0]
    norm = outputs[2][0]  # only the first model's norm is reported

    if dump_logits_when_error and (np.isnan(loss) or np.isinf(loss)
                                   or np.isnan(norm) or np.isinf(norm)):
        mylog("L/norm is Nan/Inf! {} {}".format(loss, norm))
        for i, target in enumerate(targets_per_model):
            source = sources_per_model[i]
            logits = outputs[3][i]
            # NOTE: files are plain text despite the ".npz" suffix.
            np.savetxt("targets{}.npz".format(i), target)
            np.savetxt("source_inputs{}.npz".format(i), source)
            np.savetxt("logits{}.npz".format(i), logits)
        return  # deliberately None: the caller's tuple unpacking will raise

    return loss, norm
def __init__(self, buckets, size, from_vocab_size, target_vocab_size, num_layers, max_gradient_norm, batch_size, learning_rate, learning_rate_decay_factor, optimizer = "adam", forward_only=False, dropoutRate = 1.0, devices = "", run_options = None, run_metadata = None, topk_n = 30, dtype=tf.float32, with_attention = False, beam_search = False, beam_buckets = None, n_samples = 500, with_sampled_softmax = False, attention_style = "additive", attention_scale = True, standalone = True, swap_memory = True, n_distributed_models = 1, with_fsa = False, dump_lstm = False, check_attention = False, tie_input_output_embedding = False, variational_dropout = False, mrt = False, num_sentences_per_batch_in_mrt = 1, mrt_alpha = 0.005, normalize_ht_radius = 0.0, layer_normalization = False, rare_weight = False, null_attention = False ):
    """Create the model graph.

    Builds, in this order: non-trainable bookkeeping variables (dropout
    rate, learning rate, global step), the source/target embeddings and
    placeholders, the stacked encoder/decoder LSTM cells, the output
    projection, the loss function, and then either the training graph or
    the beam-search decoder.

    Args (only what this method visibly does with them):
        buckets: stored and forwarded to ``model_with_buckets``.
        size: LSTM size and embedding width.
        from_vocab_size / target_vocab_size: row counts of the source and
            target embedding matrices.
        num_layers: number of stacked LSTM layers.
        max_gradient_norm: threshold for ``tf.clip_by_global_norm``.
        batch_size: first dimension of every placeholder.
        learning_rate / learning_rate_decay_factor: initial value of the
            learning-rate variable and the factor applied by
            ``learning_rate_decay_op``.
        optimizer: "adagrad" or "adam"; anything else selects plain SGD.
        forward_only: when true, no gradients or updates are built.
        dropoutRate: initial value of the ``dropoutRate`` variable, which is
            passed to the cells as a keep probability.
        devices: indexable device names; ``devices[0]`` hosts the input
            layer, ``devices[i+1]`` hosts LSTM layer ``i`` and
            ``devices[-1]`` the output layer.
        beam_search / beam_buckets: when ``beam_search`` is true only
            ``init_beam_decoder(beam_buckets)`` is built.
        standalone: when true, updates and savers are built here; when
            false, ``global_batch_size`` becomes
            ``batch_size * n_distributed_models``.
        mrt, num_sentences_per_batch_in_mrt, mrt_alpha: minimum risk
            training switches; with ``mrt`` the gradients come from
            ``self.mrt_loss``.
        Remaining arguments are stored on ``self`` for use by other methods.

    NOTE(review): the statement nesting below was reconstructed from a
    whitespace-collapsed copy of this file -- confirm scope extents against
    the original layout.
    """
    mylog("Init SeqModel with dynamic_rnn")

    self.buckets = buckets
    # Reserved vocabulary ids.
    self.PAD_ID = 0
    self.GO_ID = 1
    self.EOS_ID = 2
    self.UNK_ID = 3
    self.batch_size = batch_size
    self.devices = devices
    self.run_options = run_options
    self.run_metadata = run_metadata
    # top-k cannot exceed the vocabulary size
    self.topk_n = min(topk_n,target_vocab_size)
    self.dtype = dtype
    self.from_vocab_size = from_vocab_size
    self.target_vocab_size = target_vocab_size
    self.num_layers = num_layers
    self.size = size
    self.with_attention = with_attention
    self.beam_search = beam_search
    self.with_sampled_softmax = with_sampled_softmax
    self.n_samples = n_samples
    self.attention_style = attention_style
    self.attention_scale = attention_scale
    self.max_gradient_norm = max_gradient_norm
    self.swap_memory = swap_memory
    self.with_fsa = with_fsa
    self.dump_lstm = dump_lstm
    self.check_attention = check_attention
    self.forward_only = forward_only
    self.tie_input_output_embedding = tie_input_output_embedding
    self.variational_dropout = variational_dropout
    self.mrt = mrt # minimum risk training
    self.num_sentences_per_batch_in_mrt = num_sentences_per_batch_in_mrt
    self.mrt_alpha = mrt_alpha
    self.normalize_ht_radius = normalize_ht_radius
    self.layer_normalization = layer_normalization
    self.rare_weight = rare_weight
    self.null_attention = null_attention

    # The loss is divided by the global batch size, which covers all
    # replicas when this model is part of a distributed setup.
    self.global_batch_size = batch_size
    if not standalone:
        self.global_batch_size = batch_size * n_distributed_models

    self.first_batch = True

    # some parameters
    with tf.device(devices[0]):
        self.dropoutRate = tf.get_variable('dropoutRate',shape = (), initializer = tf.constant_initializer(float(dropoutRate),dtype = dtype), trainable=False, dtype=dtype)
        # Ops to switch dropout back on (configured rate) or off (1.0).
        self.dropoutAssign_op = self.dropoutRate.assign(dropoutRate)
        self.dropout10_op = self.dropoutRate.assign(1.0)
        self.learning_rate = tf.get_variable("learning_rate", shape = (), initializer = tf.constant_initializer(float(learning_rate), dtype = dtype), trainable=False, dtype=dtype)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
        self.global_step = tf.get_variable("global_step", initializer = 0, trainable=False, dtype = tf.int32)

    # Input Layer
    with tf.device(devices[0]):
        # for encoder
        self.source_input_embedding = tf.get_variable("source_input_embedding",[from_vocab_size, size], dtype = dtype)
        source_input_plhd = tf.placeholder(tf.int32, shape = [self.batch_size, None], name = "source")
        source_input_embed = tf.nn.embedding_lookup(self.source_input_embedding, source_input_plhd)
        self.sources = source_input_plhd
        self.sources_embed = source_input_embed

        # for decoder
        self.inputs = []
        self.inputs_embed = []
        self.input_embedding = tf.get_variable("input_embedding",[target_vocab_size, size], dtype = dtype)
        input_plhd = tf.placeholder(tf.int32, shape = [self.batch_size, None], name = "input")
        input_embed = tf.nn.embedding_lookup(self.input_embedding, input_plhd)
        # The two empty lists above are replaced by the tensors here.
        self.inputs = input_plhd
        self.inputs_embed = input_embed

        if self.mrt:
            # only for sampling
            self.dummy_inputs = tf.cast(tf.reshape(input_plhd, [self.batch_size,-1,1]),self.dtype)
            # for mrt training
            self.bleu_scores = tf.placeholder(self.dtype, shape = [self.batch_size], name = "bleu_scores")

    def lstm_cell(device,input_keep_prob = 1.0, output_keep_prob = 1.0, state_keep_prob=1.0, variational_recurrent=False, input_size = None, seed = None):
        # Build one LSTM layer: (layer-norm) LSTM -> dropout -> device pin.
        if not self.layer_normalization:
            cell = tf.contrib.rnn.LSTMCell(size, state_is_tuple=True)
        else:
            cell = tf.contrib.rnn.LayerNormBasicLSTMCell(size)
        # NOTE(review): the `variational_recurrent` and `seed` arguments are
        # not forwarded -- the wrapper always receives
        # variational_recurrent=False and seed=None. Confirm whether that is
        # intentional before relying on `variational_dropout`.
        cell = tf.contrib.rnn.DropoutWrapper(cell,input_keep_prob = input_keep_prob, output_keep_prob = output_keep_prob, state_keep_prob = state_keep_prob, variational_recurrent=False, dtype=self.dtype, input_size = input_size, seed = None)
        cell = DeviceCellWrapper(cell, device)
        return cell

    # LSTM
    encoder_cells = []
    decoder_cells = []
    for i in xrange(num_layers):
        input_keep_prob = self.dropoutRate
        output_keep_prob = 1.0
        state_keep_prob = 1.0
        input_size = size
        seed = None
        if self.variational_dropout:
            seed = random.randint(1,10000)
            state_keep_prob = self.dropoutRate
        # Only the top layer applies dropout to its output.
        if i == num_layers - 1:
            output_keep_prob = self.dropoutRate
        # devices[0] is used by the input layer, so layer i lives on i+1.
        device = devices[i+1]
        encoder_cells.append(lstm_cell(device,input_keep_prob, 1.0, state_keep_prob = state_keep_prob, variational_recurrent = self.variational_dropout, input_size = input_size, seed = seed)) # encoder's top layer doesn't need dropout
        decoder_cells.append(lstm_cell(device,input_keep_prob, output_keep_prob, state_keep_prob = state_keep_prob, variational_recurrent = self.variational_dropout, input_size = input_size, seed = seed)) # decoder uses output_keep_prob (dropout on the top layer only)

    self.encoder_cell = tf.contrib.rnn.MultiRNNCell(encoder_cells, state_is_tuple=True)
    self.decoder_cell = tf.contrib.rnn.MultiRNNCell(decoder_cells, state_is_tuple=True)

    # Output Layer
    with tf.device(devices[-1]):
        if self.tie_input_output_embedding:
            # Share the decoder input embedding with the output projection.
            self.output_embedding = self.input_embedding
        else:
            self.output_embedding = tf.get_variable("output_embedding",[target_vocab_size, size], dtype = dtype)
        self.output_bias = tf.get_variable("output_bias",[target_vocab_size], dtype = dtype)

        # target: 1 2 3 4
        # inputs: go 1 2 3
        # weights:1 1 1 1
        self.targets = tf.placeholder(tf.int32, shape=[self.batch_size, None ], name = "target")
        self.target_weights = tf.placeholder(dtype, shape = [self.batch_size, None ], name="target_weight")
        if self.rare_weight:
            self.rare_weights = tf.placeholder(dtype, shape = [self.batch_size, None ], name="rare_weight")

    # Attention
    if self.with_attention:
        self.attention = Attention(self)

    # softmax + cross_entropy_loss
    # Both variants are called as f(logits, targets) by model_with_buckets.
    if not self.with_sampled_softmax:
        self.softmax_loss_function = lambda x,y: tf.nn.sparse_softmax_cross_entropy_with_logits(logits=x, labels= y)
    else:
        def sampled_loss(labels, logits):
            labels = tf.reshape(labels, [-1, 1])
            # We need to compute the sampled_softmax_loss using 32bit floats to
            # avoid numerical instabilities.
            return tf.cast(
                tf.nn.sampled_softmax_loss(
                    weights=self.output_embedding,
                    biases=self.output_bias,
                    labels=labels,
                    inputs=logits,
                    num_sampled=self.n_samples,
                    num_classes=target_vocab_size),
                dtype)
        # Swap the argument order to match the (logits, targets) call.
        self.softmax_loss_function = lambda y,x: sampled_loss(x,y)

    if not beam_search:
        # Model with buckets
        self.model_with_buckets(self.sources_embed, self.sources, self.inputs_embed, self.targets, self.target_weights, self.buckets, self.encoder_cell, self.decoder_cell, dtype, self.softmax_loss_function, devices = devices, attention = with_attention, rare_weights = self.rare_weights if self.rare_weight else None)

        # for minimum risk training, draw the sample decoder
        if self.mrt:
            self.sample_network(self.sources_embed, self.sources, self.dummy_inputs, self.encoder_cell, self.decoder_cell, dtype, devices = devices, attention = with_attention)

        # train
        if not forward_only:
            params = tf.contrib.framework.get_trainable_variables(scope=variable_scope.get_variable_scope())
            self.params = params

            # unclipped gradients
            if not self.mrt:
                self.gradients = tf.gradients(self.losses, params, colocate_gradients_with_ops=True)
            else:
                self.gradients = tf.gradients(self.mrt_loss, params, colocate_gradients_with_ops=True)

            # optimizer
            if optimizer == "adagrad":
                opt = tf.train.AdagradOptimizer(self.learning_rate)
            elif optimizer == 'adam':
                opt = tf.train.AdamOptimizer(self.learning_rate)
            else:
                opt = tf.train.GradientDescentOptimizer(learning_rate = self.learning_rate)
            self.opt = opt

            # updates
            # Only built here in standalone mode; otherwise self.updates and
            # self.gradient_norms are not set by this method.
            if standalone:
                clipped_gradients, norm = tf.clip_by_global_norm(self.gradients, max_gradient_norm)
                self.gradient_norms = norm
                self.updates = opt.apply_gradients(zip(clipped_gradients, params), global_step=self.global_step)
    else: # for beam search;
        self.init_beam_decoder(beam_buckets)

    if standalone:
        # Split variables so the savers skip the beam-search ones.
        all_vars = tf.global_variables()
        self.train_vars = []
        self.beam_search_vars = []
        for var in all_vars:
            if not var.name.startswith("v0/beam_search"):
                self.train_vars.append(var)
            else:
                self.beam_search_vars.append(var)

        self.saver = tf.train.Saver(self.train_vars)
        self.best_saver = tf.train.Saver(self.train_vars)
def grow_sentence(self, index, top_beam_cells, attention_score=None):
    """Extend the live hypotheses by one word and collect finished ones.

    Walks ``top_beam_cells`` in order. A cell whose word is
    ``data_utils.EOS_ID`` finishes a sentence: it is appended to
    ``self.results`` as a ``FinishedEntry``, unless the sentence would be
    shorter than ``self.min_target_length``, in which case it is dropped.
    Every other cell becomes a live hypothesis for the next step, until
    ``self.beam_size`` of them have been collected.

    If fewer than ``beam_size`` hypotheses survive (and this is not the last
    step), the last survivor is repeated to fill the beam.
    ``self.valid_beam_size_last_step`` records the number of real survivors.

    Args:
        index: current decoding step.
        top_beam_cells: candidate cells exposing ``beam_index``,
            ``word_index``, ``score`` (and ``fsa_state`` with FSA).
        attention_score: per-beam attention for this step, indexed by
            ``beam_index``; only read when ``self.check_attention`` is set.
    """
    if self.print_beam:
        mylog("--------- Step {} --------".format(index))

    # the variables for next step
    target_inputs = []
    beam_parent = []
    scores = []
    sentences = []
    if self.check_attention:
        attention_scores = []
        attention_history = []
    if self.with_fsa:
        fsa_states = []

    # process the top beam_size cells
    for j, bc in enumerate(top_beam_cells):
        parent = bc.beam_index

        if bc.word_index == data_utils.EOS_ID:
            # finish one sentence, unless it would be too short
            if len(self.sentences[parent]) + 1 < self.min_target_length:
                continue

            finished_sentence = self.sentences[parent] + [bc.word_index]
            finished_score = bc.score
            coverage_score = 0.0
            finished_attention_history = None
            if self.check_attention:
                # accumulated attention, clipped at 1.0 and summed in log
                # space
                coverage_score = (self.attention_scores[parent] +
                                  attention_score[parent])
                coverage_score = np.sum(
                    np.log(np.minimum(coverage_score, 1.0)))
                finished_attention_history = (
                    self.attention_history[parent] +
                    [attention_score[parent]])

            f = FinishedEntry(finished_sentence,
                              finished_score,
                              coverage_score=coverage_score,
                              attention_history=finished_attention_history)
            self.results.append(f)

            if self.print_beam:
                self.print_current_beam(j, bc, finished=True)
            continue

        if self.print_beam:
            self.print_current_beam(j, bc)

        beam_parent.append(parent)
        target_inputs.append(bc.word_index)
        scores.append(bc.score)
        sentences.append(self.sentences[parent] + [bc.word_index])
        if self.check_attention:
            attention_scores.append(self.attention_scores[parent] +
                                    attention_score[parent])
            attention_history.append(self.attention_history[parent] +
                                     [attention_score[parent]])
        if self.with_fsa:
            fsa_states.append(bc.fsa_state)

        if len(scores) >= self.beam_size:
            break

    # can not fill beam_size, just repeat the last one
    self.valid_beam_size_last_step = len(scores)
    while (len(scores) > 0 and len(scores) < self.beam_size
           and index < self.max_target_length - 1):
        beam_parent.append(beam_parent[-1])
        target_inputs.append(target_inputs[-1])
        scores.append(scores[-1])
        sentences.append(sentences[-1])
        if self.with_fsa:
            fsa_states.append(fsa_states[-1])
        if self.check_attention:
            # Repeat the attention of the hypothesis being duplicated.
            # Previously this combined the *previous* step's last beam with
            # the last attention row, which did not match the repeated
            # sentence and score.
            attention_scores.append(attention_scores[-1])
            attention_history.append(attention_history[-1])

    # update for next step
    self.beam_parent = beam_parent
    self.target_inputs = target_inputs
    self.scores = scores
    self.sentences = sentences
    if self.with_fsa:
        self.fsa_states = fsa_states
        self.prepare_fsa_target_mask()
    if self.check_attention:
        self.attention_scores = attention_scores
        self.attention_history = attention_history
def __init__(self, buckets, size, from_vocab_size, target_vocab_size, num_layers, max_gradient_norm, batch_size, learning_rate, learning_rate_decay_factor, optimizer="adam", forward_only=False, dropoutRate=1.0, run_options=None, run_metadata=None, devices_per_model=None, topk_n=30, dtype=tf.float32, with_attention=False, beam_search=False, beam_buckets=None, n_samples=500, with_sampled_softmax=False, attention_style="additive", attention_scale=True, num_models=4, tie_input_output_embedding=False):
    '''Build several SeqModel replicas and wire their gradients together.

    LocalReplica: Model1[GPU0,GPU1] Model2[GPU3,GPU4],...
    Each model has its own variables. After one step the gradients are
    summed across the models, and every model applies the aggregated
    gradients locally on its own devices.

    devices_per_model: [["/gpu:0",..],...]
        devices_per_model[m][l]  m: model, l: layer

    One ``SeqModel`` is created per entry of ``devices_per_model`` (with
    ``standalone=False``); the remaining arguments are forwarded to it.
    ``num_models`` is passed on as ``n_distributed_models``.

    NOTE(review): the statement nesting below was reconstructed from a
    whitespace-collapsed copy of this file -- confirm loop extents against
    the original layout.
    '''
    self.models = []
    self.devices_per_model = devices_per_model
    self.variable_mgr = VariableMgrLocalReplicated()
    self.num_models = num_models
    self.buckets = buckets
    self.run_options = run_options
    self.run_metadata = run_metadata

    # Generate models
    for d, devices_each_model in enumerate(self.devices_per_model):
        with tf.device(devices_each_model[0]):
            with self.variable_mgr.create_outer_variable_scope(
                    d), tf.name_scope("tower_{}".format(d)) as name_scope:
                mylog("creating model #{} at devices: {}".format(
                    d, devices_each_model))
                seqModel = SeqModel(
                    buckets,
                    size,
                    from_vocab_size,
                    target_vocab_size,
                    num_layers,
                    max_gradient_norm,
                    batch_size,
                    learning_rate,
                    learning_rate_decay_factor,
                    optimizer=optimizer,
                    forward_only=forward_only,
                    dropoutRate=dropoutRate,
                    devices=devices_each_model,
                    run_options=run_options,
                    run_metadata=run_metadata,
                    topk_n=topk_n,
                    dtype=dtype,
                    with_attention=with_attention,
                    beam_search=beam_search,
                    beam_buckets=beam_buckets,
                    n_samples=n_samples,
                    with_sampled_softmax=with_sampled_softmax,
                    attention_style=attention_style,
                    attention_scale=attention_scale,
                    standalone=False,  # ! do not init the optimizer now
                    n_distributed_models=self.num_models,
                    tie_input_output_embedding=tie_input_output_embedding)
                self.models.append(seqModel)

    # collect the learning_rate_decay_op
    # (the attribute name keeps its historical spelling "dacay")
    self.learning_rate_dacay_ops = []
    self.dropout10_ops = []
    self.dropoutAssign_ops = []
    for model in self.models:
        self.learning_rate_dacay_ops.append(model.learning_rate_decay_op)
        self.dropout10_ops.append(model.dropout10_op)
        self.dropoutAssign_ops.append(model.dropoutAssign_op)

    # Aggregate the gradients
    section = "Aggregate Gradients "
    mylog_section(section)

    agg_grads = []

    for b in xrange(len(buckets)):
        mylog_subsection("Bucket {}".format(b))
        # for each bucket
        gradients = []  # [[grad * n_variable] * n_model]
        params = []  # [[param * n_variable] * n_model]
        for model in self.models:
            gradients.append(model.gradients[b])
            params.append(model.params)

        agg_grad_per_gpu = {
        }  # device -> params aggregated there; its length is the device's load

        agg_grads_per_bucket = []

        for param_id in xrange(len(params[0])):
            # Gather the same parameter (and its gradient) from every model.
            grads_per_model = []
            params_per_model = []
            for model_id in xrange(len(params)):
                params_per_model.append(params[model_id][param_id])
                grads_per_model.append(gradients[model_id][param_id])

            # choose one device to do aggregation: the candidate device
            # that has been assigned the fewest aggregations so far
            device_for_agg = None
            min_n_agg = 1000000
            for param in params_per_model:
                dev = param.device
                if not dev in agg_grad_per_gpu:
                    agg_grad_per_gpu[dev] = []
                n_agg = len(agg_grad_per_gpu[dev])
                if min_n_agg > n_agg:
                    min_n_agg = n_agg
                    device_for_agg = dev

            agg_grad_per_gpu[device_for_agg].append(params[0][param_id])

            with tf.device(device_for_agg):
                if type(grads_per_model[0]) == tf.IndexedSlices:
                    # Sparse gradients are concatenated rather than added.
                    values = tf.concat([x.values for x in grads_per_model],
                                       0)
                    indices = tf.concat(
                        [x.indices for x in grads_per_model], 0)
                    agg_grad = tf.IndexedSlices(values, indices)
                else:
                    agg_grad = tf.add_n(grads_per_model)

            agg_grads_per_bucket.append(agg_grad)

        # show aggregation device placement
        for device in agg_grad_per_gpu:
            mylog("Aggregated On {}:".format(device))
            for param in agg_grad_per_gpu[device]:
                mylog("\t" + param.name)
        agg_grads.append(agg_grads_per_bucket)

    # send the aggregated grads to each model on different gpus
    for d, devices_each_model in enumerate(self.devices_per_model):
        self.models[d].init_agg_updates(agg_grads)

    # combine losses, updates and gradients norm
    self.losses = []  # per bucket
    self.updates = []
    self.gradient_norms = []

    for b in xrange(len(buckets)):
        losses = []
        updates = []
        gradient_norms = []
        for i, model in enumerate(self.models):
            losses.append(model.losses[b])
            updates.append(model.updates[b])
            gradient_norms.append(model.gradient_norms[b])
        # Losses are summed; updates and norms stay as per-model lists.
        loss = tf.add_n(losses)
        self.losses.append(loss)
        self.updates.append(updates)
        self.gradient_norms.append(gradient_norms)

    # get init ops group
    self.var_init_op = tf.global_variables_initializer()
    self.broadcast_ops = self.variable_mgr.get_post_init_ops()

    # for saver
    # Only variables whose name starts with "v0" are saved.
    all_vars = tf.global_variables()
    self.train_vars = []
    for var in all_vars:
        if var.name.startswith("v0"):
            self.train_vars.append(var)

    self.saver = tf.train.Saver(self.train_vars)
    self.best_saver = tf.train.Saver(self.train_vars)
def train():
    """Train the model, checkpointing and evaluating on the dev set.

    Reads and buckets the train/dev data, builds the model inside a
    TensorFlow session and runs ``steps_per_epoch * n_epoch`` steps. Every
    ``steps_per_report`` steps a throughput line is logged. Every
    ``steps_per_checkpoint`` steps the train/dev perplexities are reported,
    the model is optionally saved, and the best model so far (lowest dev
    perplexity) is saved separately. Training ends early when the patience
    counter reaches zero.

    All configuration is read from the module-level ``FLAGS`` and
    ``_buckets``.

    NOTE(review): the statement nesting below was reconstructed from a
    whitespace-collapsed copy of this file. In particular the placement of
    the learning-rate decay under the "no improvement" branch should be
    confirmed against the original layout.
    """
    # Read Data
    mylog_section("READ DATA")

    from_train = None
    to_train = None
    from_dev = None
    to_dev = None

    from_train, to_train, from_dev, to_dev, _, _ = data_utils.prepare_data(
        FLAGS.data_cache_dir,
        FLAGS.train_path_from,
        FLAGS.train_path_to,
        FLAGS.dev_path_from,
        FLAGS.dev_path_to,
        FLAGS.from_vocab_size,
        FLAGS.to_vocab_size,
        preprocess_data = FLAGS.preprocess_data
    )

    train_data_bucket = read_data(from_train,to_train,_buckets)
    dev_data_bucket = read_data(from_dev,to_dev, _buckets)
    _,_,real_vocab_size_from,real_vocab_size_to = data_utils.get_vocab_info(FLAGS.data_cache_dir)

    FLAGS._buckets = _buckets
    FLAGS.real_vocab_size_from = real_vocab_size_from
    FLAGS.real_vocab_size_to = real_vocab_size_to

    # Token counts: items[0] is the source side, items[1] the target side.
    # train_n_targets is not read again in this function.
    train_n_targets = np.sum([np.sum([len(items[1]) for items in x]) for x in train_data_bucket])
    train_n_tokens = np.sum([np.sum([len(items[1])+len(items[0]) for items in x]) for x in train_data_bucket])

    train_bucket_sizes = [len(train_data_bucket[b]) for b in xrange(len(_buckets))]
    train_total_size = float(sum(train_bucket_sizes))
    # Cumulative fraction of the training data up to and including bucket i.
    train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size for i in xrange(len(train_bucket_sizes))]

    dev_bucket_sizes = [len(dev_data_bucket[b]) for b in xrange(len(_buckets))]
    dev_total_size = int(sum(dev_bucket_sizes))

    mylog_section("REPORT")

    # steps
    batch_size = FLAGS.batch_size
    n_epoch = FLAGS.n_epoch
    # One step consumes batch_size examples on each of num_models replicas.
    steps_per_epoch = int(train_total_size / batch_size / FLAGS.num_models)
    # steps_per_dev is not read again in this function.
    steps_per_dev = int(dev_total_size / batch_size)
    # checkpoint_steps == 0 means "derive it from checkpoint_frequency".
    if FLAGS.checkpoint_steps == 0:
        steps_per_checkpoint = int(steps_per_epoch / FLAGS.checkpoint_frequency)
    else:
        steps_per_checkpoint = FLAGS.checkpoint_steps
    total_steps = steps_per_epoch * n_epoch

    # reports
    mylog("from_vocab_size: {}".format(FLAGS.real_vocab_size_from))
    mylog("to_vocab_size: {}".format(FLAGS.real_vocab_size_to))
    mylog("_buckets: {}".format(FLAGS._buckets))
    mylog("Train:")
    mylog("total: {}".format(train_total_size))
    mylog("bucket sizes: {}".format(train_bucket_sizes))
    mylog("Dev:")
    mylog("total: {}".format(dev_total_size))
    mylog("bucket sizes: {}".format(dev_bucket_sizes))
    mylog("Steps_per_epoch: {}".format(steps_per_epoch))
    mylog("Total_steps:{}".format(total_steps))
    mylog("Steps_per_checkpoint: {}".format(steps_per_checkpoint))

    mylog_section("IN TENSORFLOW")

    config = tf.ConfigProto(allow_soft_placement=True, log_device_placement = False)
    config.gpu_options.allow_growth = FLAGS.allow_growth

    with tf.Session(config=config) as sess:

        # runtime profile
        if FLAGS.profile:
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()
        else:
            run_options = None
            run_metadata = None

        mylog_section("MODEL/SUMMARY/WRITER")

        mylog("Creating Model.. (this can take a few minutes)")
        model = create_model(sess, run_options, run_metadata)

        if FLAGS.with_summary:
            mylog("Creating ModelSummary")
            modelSummary = ModelSummary()

            mylog("Creating tf.summary.FileWriter")
            summaryWriter = tf.summary.FileWriter(os.path.join(FLAGS.summary_dir , "train.summary"), sess.graph)

        mylog_section("All Variables")
        show_all_variables()

        # Data Iterators
        mylog_section("Data Iterators")

        dite = DataIterator(model, train_data_bucket, len(train_buckets_scale), batch_size, train_buckets_scale)

        # Hard-coded iterator choice: 0 = random batches, 1 = sequential.
        iteType = 0
        if iteType == 0:
            mylog("Itetype: withRandom")
            ite = dite.next_random()
        elif iteType == 1:
            mylog("Itetype: withSequence")
            ite = dite.next_sequence()

        # statistics during training
        step_time, loss = 0.0, 0.0
        get_batch_time = 0.0
        current_step = 0
        previous_losses = []
        low_ppx = float("inf")
        low_ppx_step = 0
        steps_per_report = 30
        n_targets_report = 0
        n_sources_report = 0
        report_time = 0
        n_valid_sents = 0
        n_valid_words = 0
        patience = FLAGS.patience

        mylog_section("TRAIN")

        while current_step < total_steps:

            # start
            start_time = time.time()

            # data and train
            source_inputs, target_inputs, target_outputs, target_weights, bucket_id = ite.next()

            # Times are accumulated as per-checkpoint averages.
            get_batch_time += (time.time() - start_time) / steps_per_checkpoint

            L, norm = model.step(sess, source_inputs, target_inputs, target_outputs, target_weights, bucket_id)

            # loss and time
            step_time += (time.time() - start_time) / steps_per_checkpoint

            loss += L
            current_step += 1
            n_valid_sents += np.sum(np.sign(target_weights[0]))
            # double sum because different model's target_weights has different shape
            n_valid_words += np.sum(np.sum(target_weights))

            # for report
            report_time += (time.time() - start_time)
            n_targets_report += np.sum(np.sum(target_weights))
            n_sources_report += np.sum(np.sum(np.sign(source_inputs)))

            if current_step % steps_per_report == 1:
                sect_name = "STEP {}".format(current_step)
                msg = "StepTime: {:.4f} sec Speed: {:.4f} words/s Total_words: {} get_batch_time_ratio: {:.4f}".format(report_time/steps_per_report, (n_sources_report+n_targets_report)*1.0 / report_time, train_n_tokens, get_batch_time / step_time)
                mylog_line(sect_name,msg)

                report_time = 0
                n_targets_report = 0
                n_sources_report = 0

                # Create the Timeline object, and write it to a json
                if FLAGS.profile:
                    tl = timeline.Timeline(run_metadata.step_stats)
                    ctf = tl.generate_chrome_trace_format()
                    with open('timeline.json', 'w') as f:
                        f.write(ctf)
                    # Profiling runs stop after the first trace is written.
                    exit()

            if current_step % steps_per_checkpoint == 1:

                i_checkpoint = int(current_step / steps_per_checkpoint)

                # train_ppx
                # Rescale the accumulated loss to a per-word value.
                loss = loss * FLAGS.batch_size * FLAGS.num_models
                loss = loss / n_valid_words
                # Guard against overflow in exp for very large losses.
                train_ppx = math.exp(float(loss)) if loss < 300 else float("inf")
                learning_rate = model.get_learning_rate(sess)

                # dev_ppx
                dev_loss, dev_ppx = evaluate(sess, model, dev_data_bucket)

                # report
                sect_name = "CHECKPOINT {} STEP {}".format(i_checkpoint, current_step)
                msg = "Learning_rate: {:.4f} Dev_ppx: {:.4f} Train_ppx: {:.4f} Norm: {:.4f}".format(learning_rate, dev_ppx, train_ppx, norm)
                mylog_line(sect_name, msg)

                if FLAGS.with_summary:
                    # save summary
                    _summaries = modelSummary.step_record(sess, train_ppx, dev_ppx)
                    for _summary in _summaries:
                        summaryWriter.add_summary(_summary, i_checkpoint)

                # save model per checkpoint
                if FLAGS.saveCheckpoint:
                    checkpoint_path = os.path.join(FLAGS.saved_model_dir, "model")
                    s = time.time()
                    model.saver.save(sess, checkpoint_path, global_step=i_checkpoint, write_meta_graph = False)
                    msg = "Model saved using {:.4f} sec at {}".format(time.time()-s, checkpoint_path)
                    mylog_line(sect_name, msg)

                # save best model
                if dev_ppx < low_ppx:
                    # Improvement: reset patience and save as "best".
                    patience = FLAGS.patience
                    low_ppx = dev_ppx
                    low_ppx_step = current_step
                    checkpoint_path = os.path.join(FLAGS.saved_model_dir, "best")
                    s = time.time()
                    model.best_saver.save(sess, checkpoint_path, global_step=0, write_meta_graph = False)
                    msg = "Model saved using {:.4f} sec at {}".format(time.time()-s, checkpoint_path)
                    mylog_line(sect_name, msg)
                else:
                    patience -= 1

                    # decay the learning rate
                    if FLAGS.decay_learning_rate:
                        sess.run(model.learning_rate_dacay_ops)
                        msg = "New learning_rate: {:.4f} Dev_ppx: {:.4f} Lowest_dev_ppx: {:.4f}".format(model.get_learning_rate(sess), dev_ppx, low_ppx)
                        mylog_line(sect_name, msg)

                if patience <= 0:
                    mylog("Training finished. Running out of patience.")
                    break

                # Save checkpoint and zero timer and loss.
                step_time, loss, n_valid_sents, n_valid_words = 0.0, 0.0, 0, 0
                get_batch_time = 0
def check_output_bias(self, sess):
    """Log the first ``output_bias`` entry of every model.

    Used as a spot check that the models hold the same parameters: the
    logged values can be compared by eye.
    """
    for model_index, replica in enumerate(self.models):
        bias_values = sess.run(replica.output_bias)
        first_entry = bias_values[0]
        mylog("Model{} output_bias: {}".format(model_index, first_entry))
def log_flags(_FLAGS):
    """Log every flag stored in ``_FLAGS`` as a ``name=value`` line.

    Flag names are taken from the ``'__flags'`` entry of the object's
    ``__dict__``; each value is read with ``getattr``.
    """
    # Names are collected before anything is logged or read.
    flag_names = _FLAGS.__dict__['__flags'].keys()
    mylog_section("FLAGS")
    for flag_name in flag_names:
        flag_value = getattr(_FLAGS, flag_name)
        mylog("{}={}".format(flag_name, flag_value))
def show_all_tensors():
    """Log the name of every node in the default graph's GraphDef."""
    graph_def = tf.get_default_graph().as_graph_def()
    for node in graph_def.node:
        mylog(node.name)
def show_all_variables():
    """Log each global variable as ``<name> @ <device>``."""
    for variable in tf.global_variables():
        description = variable.name + " @ " + variable.device
        mylog(description)