def report_statics(self):
    mylog_section("FSA")
    mylog_subsection("FSA Info")
    mylog("Number of States: {}".format(len(self.states)))
    mylog("Number of Links: {}".format(self.num_links))
    mylog("Start state: {}".format(self.start_state.name))
    mylog("End state: {}".format(self.end_state.name))
def __init__(self, buckets, size, from_vocab_size, target_vocab_size, num_layers,
             max_gradient_norm, batch_size, learning_rate, learning_rate_decay_factor,
             optimizer="adam", forward_only=False, dropoutRate=1.0,
             run_options=None, run_metadata=None, devices_per_model=None,
             topk_n=30, dtype=tf.float32, with_attention=False,
             beam_search=False, beam_buckets=None, n_samples=500,
             with_sampled_softmax=False, attention_style="additive",
             attention_scale=True, num_models=4, tie_input_output_embedding=False):
    '''
    LocalReplica: Model1[GPU0,GPU1] Model2[GPU3,GPU4], ...
    Each model has its own variables; after one step, gradients are summed
    across the multiple GPUs and the update is applied locally on each
    model's own GPU.

    devices_per_model: [["/gpu:0", ...], ...] where devices_per_model[m][l]
    is the device for model m, layer l.
    '''
    self.models = []
    self.devices_per_model = devices_per_model
    self.variable_mgr = VariableMgrLocalReplicated()
    self.num_models = num_models
    self.buckets = buckets
    self.run_options = run_options
    self.run_metadata = run_metadata

    # Generate models
    for d, devices_each_model in enumerate(self.devices_per_model):
        with tf.device(devices_each_model[0]):
            with self.variable_mgr.create_outer_variable_scope(d), \
                 tf.name_scope("tower_{}".format(d)) as name_scope:
                mylog("creating model #{} at devices: {}".format(d, devices_each_model))
                seqModel = SeqModel(buckets, size, from_vocab_size, target_vocab_size,
                                    num_layers, max_gradient_norm, batch_size,
                                    learning_rate, learning_rate_decay_factor,
                                    optimizer=optimizer,
                                    forward_only=forward_only,
                                    dropoutRate=dropoutRate,
                                    devices=devices_each_model,
                                    run_options=run_options,
                                    run_metadata=run_metadata,
                                    topk_n=topk_n,
                                    dtype=dtype,
                                    with_attention=with_attention,
                                    beam_search=beam_search,
                                    beam_buckets=beam_buckets,
                                    n_samples=n_samples,
                                    with_sampled_softmax=with_sampled_softmax,
                                    attention_style=attention_style,
                                    attention_scale=attention_scale,
                                    standalone=False,  # ! do not init the optimizer now
                                    n_distributed_models=self.num_models,
                                    tie_input_output_embedding=tie_input_output_embedding)
                self.models.append(seqModel)

    # collect the learning_rate_decay_op
    self.learning_rate_dacay_ops = []
    self.dropout10_ops = []
    self.dropoutAssign_ops = []
    for model in self.models:
        self.learning_rate_dacay_ops.append(model.learning_rate_decay_op)
        self.dropout10_ops.append(model.dropout10_op)
        self.dropoutAssign_ops.append(model.dropoutAssign_op)

    # Aggregate the gradients
    section = "Aggregate Gradients "
    mylog_section(section)

    agg_grads = []

    for b in xrange(len(buckets)):
        mylog_subsection("Bucket {}".format(b))

        # for each bucket
        gradients = []  # [[grad * n_variable] * n_model]
        params = []     # [[param * n_variable] * n_model]
        for model in self.models:
            gradients.append(model.gradients[b])
            params.append(model.params)

        agg_grad_per_gpu = {}  # record how many gradient aggregations happen on each gpu
        agg_grads_per_bucket = []

        for param_id in xrange(len(params[0])):
            grads_per_model = []
            params_per_model = []
            for model_id in xrange(len(params)):
                params_per_model.append(params[model_id][param_id])
                grads_per_model.append(gradients[model_id][param_id])

            # choose one device to do the aggregation (the least-loaded one so far)
            device_for_agg = None
            min_n_agg = 1000000
            for param in params_per_model:
                dev = param.device
                if dev not in agg_grad_per_gpu:
                    agg_grad_per_gpu[dev] = []
                n_agg = len(agg_grad_per_gpu[dev])
                if min_n_agg > n_agg:
                    min_n_agg = n_agg
                    device_for_agg = dev

            agg_grad_per_gpu[device_for_agg].append(params[0][param_id])

            with tf.device(device_for_agg):
                if type(grads_per_model[0]) == tf.IndexedSlices:
                    values = tf.concat([x.values for x in grads_per_model], 0)
                    indices = tf.concat([x.indices for x in grads_per_model], 0)
                    agg_grad = tf.IndexedSlices(values, indices)
                else:
                    agg_grad = tf.add_n(grads_per_model)

            agg_grads_per_bucket.append(agg_grad)

        # show aggregation device placement
        for device in agg_grad_per_gpu:
            mylog("Aggregated On {}:".format(device))
            for param in agg_grad_per_gpu[device]:
                mylog("\t" + param.name)

        agg_grads.append(agg_grads_per_bucket)

    # send the aggregated grads to each model on different gpus
    for d, devices_each_model in enumerate(self.devices_per_model):
        self.models[d].init_agg_updates(agg_grads)

    # combine losses, updates and gradient norms
    self.losses = []  # per bucket
    self.updates = []
    self.gradient_norms = []

    for b in xrange(len(buckets)):
        losses = []
        updates = []
        gradient_norms = []
        for i, model in enumerate(self.models):
            losses.append(model.losses[b])
            updates.append(model.updates[b])
            gradient_norms.append(model.gradient_norms[b])

        loss = tf.add_n(losses)
        self.losses.append(loss)
        self.updates.append(updates)
        self.gradient_norms.append(gradient_norms)

    # get init ops group
    self.var_init_op = tf.global_variables_initializer()
    self.broadcast_ops = self.variable_mgr.get_post_init_ops()

    # for saver: only keep the "v0"-scoped variables
    all_vars = tf.global_variables()
    self.train_vars = []
    for var in all_vars:
        if var.name.startswith("v0"):
            self.train_vars.append(var)
    self.saver = tf.train.Saver(self.train_vars)
    self.best_saver = tf.train.Saver(self.train_vars)
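# The helper below is an illustrative sketch only (it is not part of the original
# model class and is never called): it restates the per-variable aggregation rule
# used in __init__ above, assuming TensorFlow 1.x is imported as `tf` in this module.
# Dense per-replica gradients are summed with tf.add_n; sparse tf.IndexedSlices
# gradients are merged by concatenating their values and indices, leaving any
# duplicate indices for the optimizer to accumulate.
def _sketch_aggregate_grads(grads_per_model):
    """Aggregate one variable's gradients collected from several model replicas."""
    if isinstance(grads_per_model[0], tf.IndexedSlices):
        values = tf.concat([g.values for g in grads_per_model], 0)
        indices = tf.concat([g.indices for g in grads_per_model], 0)
        return tf.IndexedSlices(values, indices)
    return tf.add_n(grads_per_model)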
def log_flags(_FLAGS):
    members = _FLAGS.__dict__['__flags'].keys()
    mylog_section("FLAGS")
    for attr in members:
        mylog("{}={}".format(attr, getattr(_FLAGS, attr)))
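# Hedged usage note (not in the original file): log_flags expects a TF 1.x
# tf.app.flags.FLAGS object, which keeps its parsed flag names under
# _FLAGS.__dict__['__flags'], e.g.
#
#   FLAGS = tf.app.flags.FLAGS
#   log_flags(FLAGS)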
def train():

    # Read Data
    mylog_section("READ DATA")

    from_train = None
    to_train = None
    from_dev = None
    to_dev = None

    from_train, to_train, from_dev, to_dev, _, _ = data_utils.prepare_data(
        FLAGS.data_cache_dir,
        FLAGS.train_path_from,
        FLAGS.train_path_to,
        FLAGS.dev_path_from,
        FLAGS.dev_path_to,
        FLAGS.from_vocab_size,
        FLAGS.to_vocab_size,
        preprocess_data=FLAGS.preprocess_data)

    train_data_bucket = read_data(from_train, to_train, _buckets)
    dev_data_bucket = read_data(from_dev, to_dev, _buckets)
    _, _, real_vocab_size_from, real_vocab_size_to = data_utils.get_vocab_info(FLAGS.data_cache_dir)

    FLAGS._buckets = _buckets
    FLAGS.real_vocab_size_from = real_vocab_size_from
    FLAGS.real_vocab_size_to = real_vocab_size_to

    train_n_targets = np.sum([np.sum([len(items[1]) for items in x]) for x in train_data_bucket])
    train_n_tokens = np.sum([np.sum([len(items[1]) + len(items[0]) for items in x]) for x in train_data_bucket])

    train_bucket_sizes = [len(train_data_bucket[b]) for b in xrange(len(_buckets))]
    train_total_size = float(sum(train_bucket_sizes))
    train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                           for i in xrange(len(train_bucket_sizes))]
    dev_bucket_sizes = [len(dev_data_bucket[b]) for b in xrange(len(_buckets))]
    dev_total_size = int(sum(dev_bucket_sizes))

    mylog_section("REPORT")

    # steps
    batch_size = FLAGS.batch_size
    n_epoch = FLAGS.n_epoch
    steps_per_epoch = int(train_total_size / batch_size / FLAGS.num_models)
    steps_per_dev = int(dev_total_size / batch_size)

    if FLAGS.checkpoint_steps == 0:
        steps_per_checkpoint = int(steps_per_epoch / FLAGS.checkpoint_frequency)
    else:
        steps_per_checkpoint = FLAGS.checkpoint_steps

    total_steps = steps_per_epoch * n_epoch

    # reports
    mylog("from_vocab_size: {}".format(FLAGS.real_vocab_size_from))
    mylog("to_vocab_size: {}".format(FLAGS.real_vocab_size_to))
    mylog("_buckets: {}".format(FLAGS._buckets))
    mylog("Train:")
    mylog("total: {}".format(train_total_size))
    mylog("bucket sizes: {}".format(train_bucket_sizes))
    mylog("Dev:")
    mylog("total: {}".format(dev_total_size))
    mylog("bucket sizes: {}".format(dev_bucket_sizes))
    mylog("Steps_per_epoch: {}".format(steps_per_epoch))
    mylog("Total_steps:{}".format(total_steps))
    mylog("Steps_per_checkpoint: {}".format(steps_per_checkpoint))

    mylog_section("IN TENSORFLOW")

    config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    config.gpu_options.allow_growth = FLAGS.allow_growth

    with tf.Session(config=config) as sess:

        # runtime profile
        if FLAGS.profile:
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()
        else:
            run_options = None
            run_metadata = None

        mylog_section("MODEL/SUMMARY/WRITER")

        mylog("Creating Model.. (this can take a few minutes)")
        model = create_model(sess, run_options, run_metadata)

        if FLAGS.with_summary:
            mylog("Creating ModelSummary")
            modelSummary = ModelSummary()

            mylog("Creating tf.summary.FileWriter")
            summaryWriter = tf.summary.FileWriter(
                os.path.join(FLAGS.summary_dir, "train.summary"), sess.graph)

        mylog_section("All Variables")
        show_all_variables()

        # Data Iterators
        mylog_section("Data Iterators")

        dite = DataIterator(model, train_data_bucket, len(train_buckets_scale),
                            batch_size, train_buckets_scale)

        iteType = 0
        if iteType == 0:
            mylog("Itetype: withRandom")
            ite = dite.next_random()
        elif iteType == 1:
            mylog("Itetype: withSequence")
            ite = dite.next_sequence()

        # statistics during training
        step_time, loss = 0.0, 0.0
        get_batch_time = 0.0
        current_step = 0
        previous_losses = []
        low_ppx = float("inf")
        low_ppx_step = 0
        steps_per_report = 30
        n_targets_report = 0
        n_sources_report = 0
        report_time = 0
        n_valid_sents = 0
        n_valid_words = 0
        patience = FLAGS.patience

        mylog_section("TRAIN")

        while current_step < total_steps:

            # start
            start_time = time.time()

            # data and train
            source_inputs, target_inputs, target_outputs, target_weights, bucket_id = ite.next()

            get_batch_time += (time.time() - start_time) / steps_per_checkpoint

            L, norm = model.step(sess, source_inputs, target_inputs,
                                 target_outputs, target_weights, bucket_id)

            # loss and time
            step_time += (time.time() - start_time) / steps_per_checkpoint

            loss += L
            current_step += 1
            n_valid_sents += np.sum(np.sign(target_weights[0]))
            # double sum because different models' target_weights have different shapes
            n_valid_words += np.sum(np.sum(target_weights))

            # for report
            report_time += (time.time() - start_time)
            n_targets_report += np.sum(np.sum(target_weights))
            n_sources_report += np.sum(np.sum(np.sign(source_inputs)))

            if current_step % steps_per_report == 1:
                sect_name = "STEP {}".format(current_step)
                msg = "StepTime: {:.4f} sec Speed: {:.4f} words/s Total_words: {} get_batch_time_ratio: {:.4f}".format(
                    report_time / steps_per_report,
                    (n_sources_report + n_targets_report) * 1.0 / report_time,
                    train_n_tokens,
                    get_batch_time / step_time)
                mylog_line(sect_name, msg)

                report_time = 0
                n_targets_report = 0
                n_sources_report = 0

                # Create the Timeline object, and write it to a json
                if FLAGS.profile:
                    tl = timeline.Timeline(run_metadata.step_stats)
                    ctf = tl.generate_chrome_trace_format()
                    with open('timeline.json', 'w') as f:
                        f.write(ctf)
                    exit()

            if current_step % steps_per_checkpoint == 1:

                i_checkpoint = int(current_step / steps_per_checkpoint)

                # train_ppx
                loss = loss * FLAGS.batch_size * FLAGS.num_models
                loss = loss / n_valid_words
                train_ppx = math.exp(float(loss)) if loss < 300 else float("inf")
                learning_rate = model.get_learning_rate(sess)

                # dev_ppx
                dev_loss, dev_ppx = evaluate(sess, model, dev_data_bucket)

                # report
                sect_name = "CHECKPOINT {} STEP {}".format(i_checkpoint, current_step)
                msg = "Learning_rate: {:.4f} Dev_ppx: {:.4f} Train_ppx: {:.4f} Norm: {:.4f}".format(
                    learning_rate, dev_ppx, train_ppx, norm)
                mylog_line(sect_name, msg)

                if FLAGS.with_summary:
                    # save summary
                    _summaries = modelSummary.step_record(sess, train_ppx, dev_ppx)
                    for _summary in _summaries:
                        summaryWriter.add_summary(_summary, i_checkpoint)

                # save model per checkpoint
                if FLAGS.saveCheckpoint:
                    checkpoint_path = os.path.join(FLAGS.saved_model_dir, "model")
                    s = time.time()
                    model.saver.save(sess, checkpoint_path,
                                     global_step=i_checkpoint, write_meta_graph=False)
                    msg = "Model saved using {:.4f} sec at {}".format(time.time() - s, checkpoint_path)
                    mylog_line(sect_name, msg)

                # save best model
                if dev_ppx < low_ppx:
                    patience = FLAGS.patience
                    low_ppx = dev_ppx
                    low_ppx_step = current_step
                    checkpoint_path = os.path.join(FLAGS.saved_model_dir, "best")
                    s = time.time()
                    model.best_saver.save(sess, checkpoint_path,
                                          global_step=0, write_meta_graph=False)
                    msg = "Model saved using {:.4f} sec at {}".format(time.time() - s, checkpoint_path)
                    mylog_line(sect_name, msg)
                else:
                    patience -= 1

                    # decay the learning rate
                    if FLAGS.decay_learning_rate:
                        sess.run(model.learning_rate_dacay_ops)
                        msg = "New learning_rate: {:.4f} Dev_ppx: {:.4f} Lowest_dev_ppx: {:.4f}".format(
                            model.get_learning_rate(sess), dev_ppx, low_ppx)
                        mylog_line(sect_name, msg)

                if patience <= 0:
                    mylog("Training finished. Running out of patience.")
                    break

                # Save checkpoint and zero timer and loss.
                step_time, loss, n_valid_sents, n_valid_words = 0.0, 0.0, 0, 0
                get_batch_time = 0
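# Hedged sketch (not part of the original train()): the train/dev perplexity logged
# at each checkpoint above is exp of the average per-target-token cross-entropy.
# `total_loss` and `n_valid_words` below are illustrative stand-ins for the
# accumulators maintained in the training loop, and `math` is assumed to be the
# module already imported for train() above.
def _sketch_perplexity(total_loss, n_valid_words):
    """Return exp(mean token loss), reported as inf when the loss is very large."""
    mean_loss = total_loss / max(float(n_valid_words), 1.0)
    return math.exp(mean_loss) if mean_loss < 300 else float("inf")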