def finalize_evaluation(self, results_per_batch, training_step=None):
    """Aggregate per-batch (Levenshtein distance, word count) pairs into a
    corpus-level word error rate, print it, and return it for logging.
    """
    lev_sum = sum(lev for lev, _ in results_per_batch)
    count_sum = sum(cnt for _, cnt in results_per_batch)
    # 1.0 * keeps the division in float even if the counts are ints.
    total_wer = 1.0 * lev_sum / count_sum
    deco_print("Validation WER: {:.4f}".format(total_wer), offset=4)
    return {"Eval WER": total_wer}
def finalize_evaluation(self, results_per_batch, training_step=None):
    """Concatenate per-batch predictions/targets and report corpus BLEU.

    Returns ``{'Eval_BLEU_Score': ...}`` when BLEU evaluation is enabled
    (the ``eval_using_bleu`` param, default True), otherwise ``{}``.
    """
    # The flag is loop-invariant: look it up once and bail out early
    # instead of re-checking it on every batch as the original did.
    if not self.params.get('eval_using_bleu', True):
        return {}
    preds, targets = [], []
    for preds_cur, targets_cur in results_per_batch:
        preds.extend(preds_cur)
        targets.extend(targets_cur)
    eval_bleu = calculate_bleu(preds, targets)
    # Fixed typo in the printed message: "BLUE" -> "BLEU".
    deco_print("Eval BLEU score: {}".format(eval_bleu), offset=4)
    return {'Eval_BLEU_Score': eval_bleu}
def __init__(self, params, model, name="ctc_loss"):
    """Construct the CTC loss object.

    All constructor arguments are described in the parent class.

    Config parameters:

    * **mask_nan** (bool) --- whether nans in the loss output should be
      masked out. Defaults to True.
    """
    super(CTCLoss, self).__init__(params, model, name)
    self._mask_nan = self.params.get("mask_nan", True)
    # This loss can only operate in full precision, so override any other
    # requested dtype and warn the user about it.
    dtype_is_wrong = self.params['dtype'] != tf.float32
    if dtype_is_wrong:
        deco_print("Warning: defaulting CTC loss to work in float32")
        self.params['dtype'] = tf.float32
def finalize_evaluation(self, results_per_batch, training_step=None):
    """Turn per-batch (total, top1, top5) counts into top-1/top-5
    accuracies over the whole validation set, print and return them.
    """
    correct1, correct5, num_examples = 0.0, 0.0, 0.0
    for batch_total, batch_top1, batch_top5 in results_per_batch:
        correct1 += batch_top1
        correct5 += batch_top5
        num_examples += batch_total
    top1 = 1.0 * correct1 / num_examples
    top5 = 1.0 * correct5 / num_examples
    deco_print("Validation top-1: {:.4f}".format(top1), offset=4)
    deco_print("Validation top-5: {:.4f}".format(top5), offset=4)
    return {"Eval top-1": top1, "Eval top-5": top5}
def maybe_print_logs(self, input_values, output_values, training_step):
    """Compute and print top-1/top-5 accuracy for the current train batch.

    Assumes targets are one-hot encoded and logits are (batch, classes)
    -- matches how the values are indexed below.
    """
    one_hot_labels = input_values['target_tensors'][0]
    logits = output_values[0]
    # Recover integer class ids from the one-hot target rows.
    class_ids = np.where(one_hot_labels == 1)[1]
    batch_size = logits.shape[0]
    num_top1 = np.sum(np.argmax(logits, axis=1) == class_ids)
    # argpartition places the 5 largest logits in the last 5 columns.
    top5_candidates = np.argpartition(logits, -5)[:, -5:]
    num_top5 = np.sum(class_ids[:, np.newaxis] == top5_candidates)
    top1 = 1.0 * num_top1 / batch_size
    top5 = 1.0 * num_top5 / batch_size
    deco_print("Train batch top-1: {:.4f}".format(top1), offset=4)
    deco_print("Train batch top-5: {:.4f}".format(top5), offset=4)
    return {
        "Train batch top-1": top1,
        "Train batch top-5": top5,
    }
def maybe_print_logs(self, input_values, output_values, training_step):
    """Print the first source, target and predicted sequence of the
    current training batch, decoded through the data layer's vocabularies.
    Always returns an empty dict (nothing is logged to tensorboard).
    """
    src, src_len = input_values['source_tensors']
    tgt, tgt_len = input_values['target_tensors']
    predictions = output_values[0]
    dl_params = self.get_data_layer().params
    delim = dl_params["delimiter"]
    deco_print(
        "Train Source[0]: " + array_to_string(
            src[0][:src_len[0]],
            vocab=dl_params['source_idx2seq'],
            delim=delim,
        ),
        offset=4,
    )
    deco_print(
        "Train Target[0]: " + array_to_string(
            tgt[0][:tgt_len[0]],
            vocab=dl_params['target_idx2seq'],
            delim=delim,
        ),
        offset=4,
    )
    deco_print(
        "Train Prediction[0]: " + array_to_string(
            predictions[0, :],
            vocab=dl_params['target_idx2seq'],
            delim=delim,
        ),
        offset=4,
    )
    return {}
def after_run(self, run_context, run_values):
    """Print training loss (optionally ppl/bpc) and average time per step.

    Triggered periodically by the hook's timer; under Horovod only rank 0
    prints the loss/time lines.
    """
    results, step = run_values.results
    self._iter_count = step

    # Timer did not trigger on this step -> nothing was fetched; skip.
    if not results:
        return
    self._timer.update_last_triggered_step(self._iter_count - 1)

    if self._model.steps_in_epoch is None:
        deco_print("Global step {}:".format(step), end=" ")
    else:
        deco_print(
            "Epoch {}, global step {}:".format(
                step // self._model.steps_in_epoch, step),
            end=" ",
        )

    loss = results[0]
    if not self._model.on_horovod or self._model.hvd.rank() == 0:
        if self._print_ppl:
            # ppl = e^loss; bpc here is printed as loss / ln(2)
            # (loss in bits).
            deco_print("Train loss: {:.4f} | ppl = {:.4f} | bpc = {:.4f}"
                       .format(loss, math.exp(loss), loss/math.log(2)),
                       start="", end=", ")
        else:
            deco_print(
                "Train loss: {:.4f} ".format(loss), offset=4)

        # Average wall-clock time per step since the previous trigger.
        tm = (time.time() - self._last_time) / self._every_steps
        m, s = divmod(tm, 60)
        h, m = divmod(m, 60)
        deco_print(
            "time per step = {}:{:02}:{:.3f}".format(int(h), int(m), s),
            start="",
        )
        self._last_time = time.time()
def evaluate(self, input_values, output_values):
    """Print the first eval source/target/prediction and collect BLEU inputs.

    Returns a (preds, targets) pair of token-string lists suitable for
    ``calculate_bleu``; both are empty when ``eval_using_bleu`` is False.
    Note targets is a list of single-element lists (one reference per
    sample), matching the BLEU API used downstream.
    """
    ex, elen_x = input_values['source_tensors']
    ey, elen_y = input_values['target_tensors']
    x_sample = ex[0]
    len_x_sample = elen_x[0]
    y_sample = ey[0]
    len_y_sample = elen_y[0]

    deco_print(
        "*****EVAL Source[0]: " + array_to_string(
            x_sample[:len_x_sample],
            vocab=self.get_data_layer().params['source_idx2seq'],
            delim=self.get_data_layer().params["delimiter"],
        ),
        offset=4,
    )
    deco_print(
        "*****EVAL Target[0]: " + array_to_string(
            y_sample[:len_y_sample],
            vocab=self.get_data_layer().params['target_idx2seq'],
            delim=self.get_data_layer().params["delimiter"],
        ),
        offset=4,
    )
    samples = output_values[0]
    deco_print(
        "*****EVAL Prediction[0]: " + array_to_string(
            samples[0, :],
            vocab=self.get_data_layer().params['target_idx2seq'],
            delim=self.get_data_layer().params["delimiter"],
        ),
        offset=4,
    )

    preds, targets = [], []
    if self.params.get('eval_using_bleu', True):
        # Detokenize every predicted sample for BLEU computation.
        preds.extend([
            transform_for_bleu(
                sample,
                vocab=self.get_data_layer().params['target_idx2seq'],
                ignore_special=True,
                delim=self.get_data_layer().params["delimiter"],
                bpe_used=self.params.get('bpe_used', False),
            ) for sample in samples
        ])
        # Each target is wrapped in its own list: one reference per sample.
        targets.extend([[
            transform_for_bleu(
                yi,
                vocab=self.get_data_layer().params['target_idx2seq'],
                ignore_special=True,
                delim=self.get_data_layer().params["delimiter"],
                bpe_used=self.params.get('bpe_used', False),
            )
        ] for yi in ey])
    return preds, targets
def infer(self, input_values, output_values):
    """Process one inference batch.

    In the LM phase, prints the generated continuation for each seed
    token and returns an empty list. Otherwise returns a list of
    ``(source_string, predicted_class, target_class_or_None)`` tuples.

    Bug fix: the original assigned a misspelled ``curret_y`` variable,
    so ``current_y`` was unbound (NameError) whenever no
    'target_tensors' were provided.
    """
    if self._lm_phase:
        vocab = self.get_data_layer().corp.dictionary.idx2word
        seed_tokens = self.params['encoder_params']['seed_tokens']
        for i in range(len(seed_tokens)):
            print('Seed:', vocab[seed_tokens[i]] + '\n')
            deco_print(
                "Output: " + array_to_string(
                    output_values[0][i],
                    vocab=self.get_data_layer().corp.dictionary.idx2word,
                    delim=self.delimiter,
                ),
                offset=4,
            )
        return []

    ex, elen_x = input_values['source_tensors']
    ey, elen_y = None, None
    if 'target_tensors' in input_values:
        ey, elen_y = input_values['target_tensors']
    results = []
    for i in range(len(ex)):
        # array_to_string returns the decoded source string directly; the
        # original wrapped it into a 1-tuple via a stray trailing comma
        # and then indexed [0] to undo that.
        current_x = array_to_string(
            ex[i][:elen_x[i]],
            vocab=self.get_data_layer().corp.dictionary.idx2word,
            delim=self.delimiter,
        )
        current_pred = np.argmax(output_values[0][i])
        # Was "curret_y = None" (typo), leaving current_y unbound below.
        current_y = None
        if ey is not None:
            current_y = np.argmax(ey[i])
        results.append((current_x, current_pred, current_y))
    return results
def after_run(self, run_context, run_values):
    """Run validation, print/log eval metrics, and save the best model.

    Triggered by the hook's timer (and forced on the last step). The
    epoch over the validation set is collective, so every Horovod worker
    participates in ``get_results_for_epoch``; only rank 0 prints, logs
    summaries and writes the best-model checkpoint.

    Bug fix: ``dict_to_log['bpc']`` was ``math.exp(total_loss/math.log(2))``
    while the printed bpc (here and in the train-loss hook) is
    ``total_loss/math.log(2)`` -- the logged value is now consistent.
    """
    results, step = run_values.results
    self._iter_count = step

    # Only run when the timer triggered, or when this is the final step.
    if not self._triggered and step != self._last_step - 1:
        return
    self._timer.update_last_triggered_step(self._iter_count - 1)

    if not self._model.on_horovod or self._model.hvd.rank() == 0:
        deco_print("Running evaluation on a validation set:")

    # Collective across workers: everyone evaluates its shard.
    results_per_batch, total_loss = get_results_for_epoch(
        self._model, run_context.session, mode="eval", compute_loss=True,
    )

    if not self._model.on_horovod or self._model.hvd.rank() == 0:
        if self._print_ppl:
            deco_print(
                "Validation loss: {:.4f} | ppl = {:.4f} | bpc = {:.4f}"
                .format(total_loss, math.exp(total_loss),
                        total_loss/math.log(2)),
                offset=4)
        else:
            deco_print(
                "Validation loss: {:.4f} ".format(total_loss), offset=4)

        dict_to_log = self._model.finalize_evaluation(results_per_batch, step)
        dict_to_log['eval_loss'] = total_loss

        if self._print_ppl:
            # Add bpc and ppl metrics to tensorboard.
            dict_to_log['ppl'] = math.exp(total_loss)
            # bpc is the loss in bits (nats / ln 2); the original
            # exponentiated it, disagreeing with the printed value above.
            dict_to_log['bpc'] = total_loss/math.log(2)

        # saving the best validation model
        if self._model.params['save_checkpoint_steps'] and \
                total_loss < self._best_eval_loss:
            self._best_eval_loss = total_loss
            self._eval_saver.save(
                run_context.session,
                os.path.join(self._model.params['logdir'], 'best_models',
                             'val_loss={:.4f}-step'.format(total_loss)),
                global_step=step + 1,
            )

        # optionally logging to tensorboard any values
        # returned from maybe_print_logs
        if self._model.params['save_summaries_steps']:
            log_summaries_from_dict(
                dict_to_log,
                self._model.params['logdir'],
                step,
            )
def finalize_inference(self, results_per_batch, output_file):
    """Write all predicted sequences to ``output_file`` (UTF-8, one per
    line), echoing every 200th input/output pair to the console.
    """
    with codecs.open(output_file, 'w', 'utf-8') as out_fp:
        sample_idx = 0
        for batch_inputs, batch_outputs in results_per_batch:
            for in_seq, out_seq in zip(batch_inputs, batch_outputs):
                out_fp.write(out_seq + "\n")
                # Periodic console sample (includes the very first one).
                if sample_idx % 200 == 0:
                    deco_print("Input sequence: {}".format(in_seq))
                    deco_print("Output sequence: {}".format(out_seq))
                    deco_print("")
                sample_idx += 1
def maybe_print_logs(self, input_values, output_values, training_step):
    """Decode the first sample of the batch, print its WER and texts.

    Handles both BPE (sentencepiece) and plain character vocabularies.
    Returns ``{'Sample WER': ...}``, plus an attention summary when
    attention plotting is enabled (currently hard-disabled below).
    """
    y, len_y = input_values['target_tensors']
    decoded_sequence = output_values
    y_one_sample = y[0]
    len_y_one_sample = len_y[0]
    decoded_sequence_one_batch = decoded_sequence[0]

    if self.is_bpe:
        # Decode BPE ids back to text via the sentencepiece processor.
        dec_list = sparse_tensor_to_chars_bpe(decoded_sequence_one_batch)[0]
        true_text = self.get_data_layer().sp.DecodeIds(
            y_one_sample[:len_y_one_sample].tolist())
        pred_text = self.get_data_layer().sp.DecodeIds(dec_list)
    else:
        # we also clip the sample by the correct length
        true_text = "".join(map(
            self.get_data_layer().params['idx2char'].get,
            y_one_sample[:len_y_one_sample],
        ))
        pred_text = "".join(self.tensor_to_chars(
            decoded_sequence_one_batch,
            self.get_data_layer().params['idx2char'],
            **self.tensor_to_char_params
        )[0])

    # Word-level edit distance normalized by target length.
    sample_wer = levenshtein(true_text.split(), pred_text.split()) / \
        len(true_text.split())

    # NOTE(review): these two writes mutate model state from a logging
    # method; plot_attention is hard-disabled (see commented condition).
    self.autoregressive = self.get_data_layer().params.get('autoregressive', False)
    self.plot_attention = False  # (output_values[1] != None).all()

    if self.plot_attention:
        attention_summary = plot_attention(
            output_values[1][0], pred_text, output_values[2][0], training_step)
    deco_print("Sample WER: {:.4f}".format(sample_wer), offset=4)
    deco_print("Sample target: " + true_text, offset=4)
    deco_print("Sample prediction: " + pred_text, offset=4)

    if self.plot_attention:
        return {
            'Sample WER': sample_wer,
            'Attention Summary': attention_summary,
        }
    else:
        return {
            'Sample WER': sample_wer,
        }
def evaluate(self, input_values, output_values):
    """Print eval samples and, in the classification phase, compute metrics.

    In the LM phase only ~10% of calls print (randomly sampled) and an
    empty dict is returned. In the classification phase returns accuracy
    and, when ``_print_f1`` is set, raw true/pred/actual positive counts.
    """
    ex, elen_x = input_values['source_tensors']
    ey, elen_y = input_values['target_tensors']
    x_sample = ex[0]
    len_x_sample = elen_x[0]
    y_sample = y_sample if False else ey[0]  # noqa -- see NOTE below
    len_y_sample = elen_y[0]
    return_values = {}

    if self._lm_phase:
        # Randomly skip ~90% of batches to limit console spam.
        flip = random.random()
        if flip <= 0.9:
            return return_values
        deco_print(
            "*****EVAL Source[0]: " + array_to_string(
                x_sample[:len_x_sample],
                vocab=self.get_data_layer().corp.dictionary.idx2word,
                delim=self.delimiter,
            ),
            offset=4,
        )
        samples = np.argmax(output_values[0][0], axis=-1)
        deco_print(
            "*****EVAL Target[0]: " + array_to_string(
                y_sample[:len_y_sample],
                vocab=self.get_data_layer().corp.dictionary.idx2word,
                delim=self.delimiter,
            ),
            offset=4,
        )
        deco_print(
            "*****EVAL Prediction[0]: " + array_to_string(
                samples,
                vocab=self.get_data_layer().corp.dictionary.idx2word,
                delim=self.delimiter,
            ),
            offset=4,
        )
    else:
        deco_print(
            "*****EVAL Source[0]: " + array_to_string(
                x_sample[:len_x_sample],
                vocab=self.get_data_layer().corp.dictionary.idx2word,
                delim=self.delimiter,
            ),
            offset=4,
        )
        samples = output_values[0][0]
        deco_print(
            "EVAL Target[0]: " + str(np.argmax(y_sample)),
            offset=4,
        )
        deco_print(
            "EVAL Prediction[0]: " + str(samples),
            offset=4,
        )
        # Class ids from one-hot targets vs argmax over logits.
        labels = np.argmax(ey, 1)
        preds = np.argmax(output_values[0], axis=-1)
        print('Labels', labels)
        print('Preds', preds)
        return_values['accuracy'] = metrics.accuracy(labels, preds)
        if self._print_f1:
            # Raw counts; F1 is presumably assembled downstream -- TODO confirm.
            return_values['true_pos'] = metrics.true_positives(
                labels, preds)
            return_values['pred_pos'] = np.sum(preds)
            return_values['actual_pos'] = np.sum(labels)
    return return_values
def maybe_print_logs(self, input_values, output_values, training_step):
    """Print the first train sample; in the classification phase also
    print batch accuracy (and precision/recall/F1 when enabled).
    Always returns an empty dict.
    """
    x, len_x = input_values['source_tensors']
    y, len_y = input_values['target_tensors']
    x_sample = x[0]
    len_x_sample = len_x[0]
    y_sample = y[0]
    len_y_sample = len_y[0]

    deco_print(
        "Train Source[0]: " + array_to_string(
            x_sample[:len_x_sample],
            vocab=self.get_data_layer().corp.dictionary.idx2word,
            delim=self.delimiter,
        ),
        offset=4,
    )
    if self._lm_phase:
        # LM phase: target is a token sequence, decoded via the vocab.
        deco_print(
            "Train Target[0]: " + array_to_string(
                y_sample[:len_y_sample],
                vocab=self.get_data_layer().corp.dictionary.idx2word,
                delim=self.delimiter,
            ),
            offset=4,
        )
    else:
        # Classification phase: target is one-hot; print its class id.
        deco_print(
            "TRAIN Target[0]: " + str(np.argmax(y_sample)),
            offset=4,
        )
        samples = output_values[0][0]
        deco_print(
            "TRAIN Prediction[0]: " + str(samples),
            offset=4,
        )
        labels = np.argmax(y, 1)
        preds = np.argmax(output_values[0], axis=-1)
        print('Labels', labels)
        print('Preds', preds)
        deco_print(
            "Accuracy: {:.4f}".format(metrics.accuracy(labels, preds)),
            offset=4,
        )
        if self._print_f1:
            deco_print(
                "Precision: {:.4f} | Recall: {:.4f} | F1: {:.4f}".format(
                    metrics.precision(labels, preds),
                    metrics.recall(labels, preds),
                    metrics.f1(labels, preds)),
                offset=4,
            )
    return {}
def build_layer(inputs, layer, layer_params, data_format,
                regularizer, training, verbose=True):
    """This function builds a layer from the layer function and it's parameters.

    It will automatically add regularizer parameter to the layer_params if the
    layer supports regularization. To check this, it will look for the
    "regularizer", "kernel_regularizer" and "gamma_regularizer" names in this
    order in the ``layer`` call signature. If one of this parameters is supported
    it will pass regularizer object as a value for that parameter. Based on the
    same "checking signature" technique "data_format" and "training" parameters
    will try to be added. Finally, "axis" parameter will try to be specified with
    axis = ``1 if data_format == 'channels_first' else 3``. This is required for
    automatic building batch normalization layer.

    Args:
      inputs: input Tensor that will be passed to the layer. Note that layer has
          to accept input as the first parameter.
      layer: layer function or class with ``__call__`` method defined.
      layer_params (dict): parameters passed to the ``layer``.
      data_format (string): data format ("channels_first" or "channels_last")
          that will be tried to be passed as an additional argument.
      regularizer: regularizer instance that will be tried to be passed as an
          additional argument.
      training (bool): whether layer is built in training mode. Will be tried
          to be passed as an additional argument.
      verbose (bool): whether to print information about built layers.

    Returns:
      Tensor with layer output.
    """
    layer_params_cp = copy.deepcopy(layer_params)
    # Inspect the layer's call signature once instead of re-computing it
    # for every optional parameter check below (it was built up to 4 times).
    layer_sig_params = signature(layer).parameters

    for reg_name in ['regularizer', 'kernel_regularizer', 'gamma_regularizer']:
        if reg_name not in layer_params_cp and reg_name in layer_sig_params:
            layer_params_cp.update({reg_name: regularizer})

    if 'data_format' not in layer_params_cp and \
            'data_format' in layer_sig_params:
        layer_params_cp.update({'data_format': data_format})

    # necessary to check axis for correct batch normalization processing
    if 'axis' not in layer_params_cp and 'axis' in layer_sig_params:
        layer_params_cp.update(
            {'axis': 1 if data_format == 'channels_first' else 3})

    if 'training' not in layer_params_cp and 'training' in layer_sig_params:
        layer_params_cp.update({'training': training})

    outputs = layer(inputs, **layer_params_cp)

    if verbose:
        # TF-exported symbols carry their public name in _tf_api_names.
        if hasattr(layer, '_tf_api_names'):
            layer_name = layer._tf_api_names[0]
        else:
            layer_name = layer
        deco_print("Building layer: {}(inputs, {})".format(
            layer_name,
            ", ".join("{}={}".format(key, value)
                      for key, value in layer_params_cp.items())
        ))
    return outputs
def compile(self, force_var_reuse=False, checkpoint=None, use_trt=False,
            precision='FP32'):
    """TensorFlow graph is built here.

    Builds the data layer(s) and forward pass -- replicated per GPU
    (tower-style data parallelism) or once per process under Horovod --
    and, in train mode, the learning-rate policy and the train op.
    """
    if 'initializer' not in self.params:
        initializer = None
    else:
        init_dict = self.params.get('initializer_params', {})
        initializer = self.params['initializer'](**init_dict)

    if not self.on_horovod:  # not using Horovod
        # below we follow data parallelism for multi-GPU training
        losses = []
        for gpu_cnt, gpu_id in enumerate(self._gpu_ids):
            with tf.device("/gpu:{}".format(gpu_id)), tf.variable_scope(
                name_or_scope=tf.get_variable_scope(),
                # re-using variables across GPUs.
                reuse=force_var_reuse or (gpu_cnt > 0),
                initializer=initializer,
                dtype=self.get_tf_dtype(),
            ):
                deco_print("Building graph on GPU:{}".format(gpu_id))

                if self._interactive:
                    self.get_data_layer(gpu_cnt).create_interactive_placeholders()
                else:
                    self.get_data_layer(gpu_cnt).build_graph()
                input_tensors = self.get_data_layer(gpu_cnt).input_tensors

                loss, self._outputs[gpu_cnt] = self.build_forward_pass_graph(
                    input_tensors,
                    gpu_id=gpu_cnt,
                    checkpoint=checkpoint,
                    use_trt=use_trt,
                    precision=precision)
                if self._outputs[gpu_cnt] is not None and \
                        not isinstance(self._outputs[gpu_cnt], list):
                    raise ValueError(
                        'Decoder outputs have to be either None or list')
                if self._mode == "train" or self._mode == "eval":
                    losses.append(loss)
        # end of for gpu_cnt loop
        if self._mode == "train":
            self.loss = tf.reduce_mean(losses)
        if self._mode == "eval":
            self.eval_losses = losses
    else:  # is using Horovod
        # gpu_id should always be zero, since Horovod takes care of isolating
        # different processes to 1 GPU only
        with tf.device("/gpu:0"), tf.variable_scope(
            name_or_scope=tf.get_variable_scope(),
            reuse=force_var_reuse,
            initializer=initializer,
            dtype=self.get_tf_dtype(),
        ):
            deco_print("Building graph in Horovod rank: {}".format(
                self._hvd.rank()))
            self.get_data_layer().build_graph()
            input_tensors = self.get_data_layer().input_tensors

            all_loss, self._output = self._build_forward_pass_graph(
                input_tensors, gpu_id=0)
            # The forward pass may return a dict of losses; use the main one.
            if isinstance(all_loss, (dict, )):
                loss = all_loss['loss']
            else:
                loss = all_loss

            if self._output is not None and not isinstance(
                    self._output, list):
                raise ValueError(
                    'Decoder outputs have to be either None or list')

            if self._mode == "train":
                self.loss = loss
            if self._mode == "eval":
                self.eval_losses = [loss]

    try:
        self._num_objects_per_step = [
            self._get_num_objects_per_step(worker_id)
            for worker_id in range(self.num_gpus)
        ]
    except NotImplementedError:
        # Models that can't count processed objects simply skip this.
        pass

    if self._mode == "train":
        if 'lr_policy' not in self.params:
            lr_policy = None
        else:
            lr_params = self.params.get('lr_policy_params', {})
            # adding default decay_steps = max_steps if lr_policy supports it and
            # different value is not provided
            func_params = signature(self.params['lr_policy']).parameters
            if 'decay_steps' in func_params and 'decay_steps' not in lr_params:
                lr_params['decay_steps'] = self._last_step
            if 'steps_per_epoch' in func_params and \
                    'steps_per_epoch' not in lr_params and \
                    'num_epochs' in self.params:
                lr_params['steps_per_epoch'] = self.steps_in_epoch
            lr_policy = lambda gs: self.params['lr_policy'](global_step=gs,
                                                            **lr_params)

        # iter_size > 1 means gradient accumulation: a placeholder controls
        # which steps actually apply an update.
        if self.params.get('iter_size', 1) > 1:
            self.skip_update_ph = tf.placeholder(tf.bool)

        var_list = tf.trainable_variables()
        freeze_variables_regex = self.params.get('freeze_variables_regex', None)
        if freeze_variables_regex is not None:
            # Frozen variables are excluded from the optimizer's var_list.
            pattern = re.compile(freeze_variables_regex)
            var_list = [
                var for var in tf.trainable_variables()
                if not pattern.match(var.name)
            ]

        self.train_op = optimize_loss(
            loss=tf.cast(self.loss, tf.float32) + get_regularization_loss(),
            dtype=self.params['dtype'],
            optimizer=self.params['optimizer'],
            optimizer_params=self.params.get('optimizer_params', {}),
            var_list=var_list,
            clip_gradients=self.params.get('max_grad_norm', None),
            learning_rate_decay_fn=lr_policy,
            summaries=self.params.get('summaries', None),
            larc_params=self.params.get('larc_params', None),
            loss_scaling=self.params.get('loss_scaling', 1.0),
            loss_scaling_params=self.params.get('loss_scaling_params', None),
            on_horovod=self.on_horovod,
            iter_size=self.params.get('iter_size', 1),
            skip_update_ph=self.skip_update_ph,
            model=self)
        tf.summary.scalar(name="train_loss", tensor=self.loss)
        if self.steps_in_epoch:
            tf.summary.scalar(
                name="epoch",
                tensor=tf.floor(tf.train.get_global_step() /
                                tf.constant(self.steps_in_epoch, dtype=tf.int64)),
            )

        if not self.on_horovod or self._hvd.rank() == 0:
            if freeze_variables_regex is not None:
                deco_print('Complete list of variables:')
                for var in tf.trainable_variables():
                    deco_print('{}'.format(var.name), offset=2)
            deco_print("Trainable variables:")
            total_params = 0
            unknown_shape = False
            for var in var_list:
                var_params = 1
                deco_print('{}'.format(var.name), offset=2)
                deco_print('shape: {}, {}'.format(var.get_shape(), var.dtype),
                           offset=4)
                if var.get_shape():
                    for dim in var.get_shape():
                        var_params *= dim.value
                    total_params += var_params
                else:
                    unknown_shape = True
            if unknown_shape:
                deco_print(
                    "Encountered unknown variable shape, can't compute total "
                    "number of parameters.")
            else:
                deco_print(
                    'Total trainable parameters: {}'.format(total_params))
def _build_forward_pass_graph(self, input_tensors, gpu_id=0):
    """TensorFlow graph for encoder-decoder-loss model is created here.

    This function connects encoder, decoder and loss together. As an input for
    encoder it will specify source tensors (as returned from the data layer).
    As an input for decoder it will specify target tensors as well as all
    output returned from encoder. For loss it will also specify target tensors
    and all output returned from decoder. Note that loss will only be built
    for mode == "train" or "eval".

    Args:
      input_tensors (dict): ``input_tensors`` dictionary that has to contain
          ``source_tensors`` key with the list of all source tensors, and
          ``target_tensors`` with the list of all target tensors. Note that
          ``target_tensors`` only need to be provided if mode is
          "train" or "eval".
      gpu_id (int, optional): id of the GPU where the current copy of the
          model is constructed. For Horovod this is always zero.

    Returns:
      tuple: tuple containing loss tensor as returned from
      ``loss.compute_loss()`` and list of outputs tensors, which is taken
      from ``decoder.decode()['outputs']``. When ``mode == 'infer'``, loss
      will be None.
    """
    # Validate the input dict shape before wiring anything up.
    if not isinstance(input_tensors, dict) or \
            'source_tensors' not in input_tensors:
        raise ValueError('Input tensors should be a dict containing '
                         '"source_tensors" key')

    if not isinstance(input_tensors['source_tensors'], list):
        raise ValueError('source_tensors should be a list')

    source_tensors = input_tensors['source_tensors']
    if self.mode == "train" or self.mode == "eval":
        if 'target_tensors' not in input_tensors:
            # NOTE(review): adjacent string literals concatenate without a
            # space here, producing '...keywhen mode != "infer"'.
            raise ValueError(
                'Input tensors should contain "target_tensors" key'
                'when mode != "infer"')
        if not isinstance(input_tensors['target_tensors'], list):
            raise ValueError('target_tensors should be a list')
        target_tensors = input_tensors['target_tensors']

    with tf.variable_scope("ForwardPass"):
        encoder_input = {"source_tensors": source_tensors}
        encoder_output = self.encoder.encode(input_dict=encoder_input)

        decoder_input = {"encoder_output": encoder_output}
        if self.mode == "train" or self.mode == "eval":
            # Decoder needs targets for teacher forcing / loss alignment.
            decoder_input['target_tensors'] = target_tensors
        decoder_output = self.decoder.decode(input_dict=decoder_input)
        model_outputs = decoder_output.get("outputs", None)

        if self.mode == "train" or self.mode == "eval":
            with tf.variable_scope("Loss"):
                loss_input_dict = {
                    "decoder_output": decoder_output,
                    "target_tensors": target_tensors,
                }
                loss = self.loss_computator.compute_loss(loss_input_dict)
        else:
            deco_print("Inference Mode. Loss part of graph isn't built.")
            loss = None
    return loss, model_outputs
def train(train_model, eval_model=None, debug_port=None):
    """Run the training loop for ``train_model``.

    Sets up the session config, all train/eval/logging hooks, the
    (possibly transfer-learning) monitored session, then iterates until
    the data is exhausted or the StopAtStepHook fires. Also accumulates
    simple throughput benchmarks after ``bench_start`` steps.

    Args:
      train_model: model to train (provides params, data layers, train_op).
      eval_model (optional): model evaluated periodically via
          RunEvaluationHook; must define 'eval_steps' in its params.
      debug_port (optional): if set, attaches a TensorBoard debugger hook
          on localhost at that port.
    """
    if eval_model is not None and 'eval_steps' not in eval_model.params:
        raise ValueError("eval_steps parameter has to be specified "
                         "if eval_model is provided")
    hvd = train_model.hvd
    if hvd:
        master_worker = hvd.rank() == 0
    else:
        master_worker = True

    # initializing session parameters
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    # pylint: disable=no-member
    sess_config.gpu_options.allow_growth = True
    if hvd is not None:
        # Each Horovod process only sees its own local GPU.
        # pylint: disable=no-member
        sess_config.gpu_options.visible_device_list = str(hvd.local_rank())

    # defining necessary hooks
    hooks = [tf.train.StopAtStepHook(last_step=train_model.last_step)]
    if hvd is not None:
        hooks.append(BroadcastGlobalVariablesHook(0))

    # Only the master worker writes checkpoints / loads models.
    if master_worker:
        checkpoint_dir = train_model.params['logdir']
        load_model_dir = train_model.params['load_model']
    else:
        checkpoint_dir = None
        load_model_dir = None

    if eval_model is not None:
        # noinspection PyTypeChecker
        hooks.append(
            RunEvaluationHook(
                every_steps=eval_model.params['eval_steps'],
                model=eval_model,
                last_step=train_model.last_step,
                print_ppl=isinstance(eval_model.get_data_layer(), WKTDataLayer),
            ),
        )

    if master_worker:
        if train_model.params['save_checkpoint_steps'] is not None:
            # noinspection PyTypeChecker
            saver = tf.train.Saver(
                save_relative_paths=True,
                max_to_keep=train_model.params['num_checkpoints'])
            hooks.append(
                tf.train.CheckpointSaverHook(
                    checkpoint_dir,
                    saver=saver,
                    save_steps=train_model.params['save_checkpoint_steps'],
                ))
        if train_model.params['print_loss_steps'] is not None:
            # noinspection PyTypeChecker
            hooks.append(
                PrintLossAndTimeHook(
                    every_steps=train_model.params['print_loss_steps'],
                    model=train_model,
                    print_ppl=isinstance(train_model.get_data_layer(),
                                         WKTDataLayer),
                ))
        if train_model.params['print_samples_steps'] is not None:
            # noinspection PyTypeChecker
            hooks.append(
                PrintSamplesHook(
                    every_steps=train_model.params['print_samples_steps'],
                    model=train_model,
                ))

    total_time = 0.0
    # Steps before bench_start are treated as warm-up and not benchmarked.
    bench_start = train_model.params.get('bench_start', 10)

    if debug_port:
        hooks.append(
            tf_debug.TensorBoardDebugHook("localhost:{}".format(debug_port)))

    if train_model.on_horovod:
        init_data_layer = train_model.get_data_layer().iterator.initializer
    else:
        init_data_layer = tf.group([
            train_model.get_data_layer(i).iterator.initializer
            for i in range(train_model.num_gpus)
        ])

    # Use the transfer-learning scaffold/session only when loading from a
    # base model and no checkpoint exists in logdir yet.
    if (not load_model_dir) or tf.train.latest_checkpoint(checkpoint_dir):
        scaffold = tf.train.Scaffold(local_init_op=tf.group(
            tf.local_variables_initializer(), init_data_layer))
    else:
        scaffold = TransferScaffold(local_init_op=tf.group(
            tf.local_variables_initializer(), init_data_layer))
    fetches = [train_model.train_op]
    try:
        total_objects = 0.0
        # on horovod num_gpus is 1
        for worker_id in range(train_model.num_gpus):
            fetches.append(train_model.get_num_objects_per_step(worker_id))
    except NotImplementedError:
        deco_print(
            "WARNING: Can't compute number of objects per step, since "
            "train model does not define get_num_objects_per_step method.")

    # starting training
    if load_model_dir and not tf.train.latest_checkpoint(checkpoint_dir):
        sess = TransferMonitoredTrainingSession(
            scaffold=scaffold,
            checkpoint_dir=checkpoint_dir,
            save_summaries_steps=train_model.params['save_summaries_steps'],
            config=sess_config,
            save_checkpoint_secs=None,
            log_step_count_steps=train_model.params['save_summaries_steps'],
            stop_grace_period_secs=300,
            hooks=hooks,
            load_model_dir=load_model_dir,
            load_fc=train_model.params['load_fc'])
    else:
        sess = tf.train.MonitoredTrainingSession(
            scaffold=scaffold,
            checkpoint_dir=checkpoint_dir,
            save_summaries_steps=train_model.params['save_summaries_steps'],
            config=sess_config,
            save_checkpoint_secs=None,
            log_step_count_steps=train_model.params['save_summaries_steps'],
            stop_grace_period_secs=300,
            hooks=hooks)
    step = 0
    num_bench_updates = 0
    while True:
        if sess.should_stop():
            break
        tm = time.time()
        try:
            feed_dict = {}
            iter_size = train_model.params.get('iter_size', 1)
            if iter_size > 1:
                # Only every iter_size-th step applies the accumulated update.
                feed_dict[train_model.skip_update_ph] = step % iter_size != 0
            if step % iter_size == 0:
                if step >= bench_start:
                    num_bench_updates += 1
                fetches_vals = sess.run(fetches, feed_dict)
            else:
                # necessary to skip "no-update" steps when iter_size > 1
                def run_with_no_hooks(step_context):
                    return step_context.session.run(fetches, feed_dict)
                fetches_vals = sess.run_step_fn(run_with_no_hooks)
        except tf.errors.OutOfRangeError:
            # Dataset exhausted: leave the training loop.
            break
        if step >= bench_start:
            total_time += time.time() - tm
            if len(fetches) > 1:
                for i in range(train_model.num_gpus):
                    total_objects += np.sum(fetches_vals[i + 1])
                if train_model.params['print_bench_info_steps'] is not None:
                    if step % train_model.params['print_bench_info_steps'] == 0:
                        total_objects_cur = collect_if_horovod(
                            total_objects, hvd, mode="sum")
                        if master_worker:
                            avg_objects = 1.0 * total_objects_cur / total_time
                            deco_print("Avg objects per second: {:.3f}".format(
                                avg_objects))
        step += 1
    sess.close()

    if len(fetches) > 1:
        total_objects = collect_if_horovod(total_objects, hvd, mode="sum")

    if master_worker:
        deco_print("Finished training")
        if step > bench_start:
            avg_time = 1.0 * total_time / num_bench_updates
            deco_print("Avg time per step: {:.3f}s".format(avg_time))
            if len(fetches) > 1:
                avg_objects = 1.0 * total_objects / total_time
                deco_print(
                    "Avg objects per second: {:.3f}".format(avg_objects))
        else:
            deco_print("Not enough steps for benchmarking")