def initialize_model(tf_manager: TensorFlowManager,
                     initial_variables: Optional[List[str]],
                     executables: List[GraphExecutor]):
    """Initialize the model variables, either freshly or from a checkpoint.

    Args:
        tf_manager: Manager whose ``initialize_model_parts`` or ``restore``
            method is called.
        initial_variables: Checkpoint specification handed to
            ``tf_manager.restore``. When ``None``, the model parts are
            initialized (and saved) instead.
        executables: Graph executors passed to
            ``tf_manager.initialize_model_parts``.
    """
    if initial_variables is None:
        # Assume we don't look at coder checkpoints when global
        # initial variables are supplied
        tf_manager.initialize_model_parts(executables, save=True)
        return

    try:
        tf_manager.restore(initial_variables)
    except tf.errors.NotFoundError:
        # Deliberately best-effort: a partially matching checkpoint only
        # produces a warning instead of aborting.
        warn("Some variables were not found in checkpoint.")
def run_on_dataset(tf_manager: TensorFlowManager,
                   runners: List[BaseRunner],
                   dataset: Dataset,
                   postprocess: Callable,
                   write_out: bool=False,
                   batch_size: Optional[int]=None) \
        -> Tuple[List[ExecutionResult], Dict[str, List[Any]]]:
    """Apply the model on a dataset and optionally write outputs to files.

    Args:
        tf_manager: TensorFlow manager with initialized sessions.
        runners: Runners executed on the dataset; the ``output_series`` of
            each runner names the series its outputs are stored under.
        dataset: The dataset on which the model will be executed.
        postprocess: Callable invoked as ``postprocess(dataset, raw_outputs)``
            whose return value replaces the raw outputs, or ``None`` to keep
            the raw outputs.
        write_out: Flag whether the outputs should be written to the files
            defined in ``dataset.series_outputs``.
        batch_size: Batch size passed on to ``tf_manager.execute``.

    Returns:
        Tuple of the execution results (one per runner) and a dictionary
        mapping series names to the (postprocessed) outputs.
    """
    contains_targets = all(dataset.has_series(runner.output_series)
                           for runner in runners)

    all_results = tf_manager.execute(dataset, runners,
                                     train=contains_targets,
                                     batch_size=batch_size)

    result_data_raw = {runner.output_series: result.outputs
                       for runner, result in zip(runners, all_results)}

    if postprocess is not None:
        result_data = postprocess(dataset, result_data_raw)
    else:
        result_data = result_data_raw

    if write_out:
        for series_id, data in result_data.items():
            if series_id not in dataset.series_outputs:
                log("There is no output file for dataset: {}"
                    .format(dataset.name), color='red')
                continue

            path = dataset.series_outputs[series_id]
            if isinstance(data, np.ndarray):
                np.save(path, data)
                log('Result saved as numpy array to "{}"'.format(path))
            else:
                # Explicit encoding so that the output does not depend on
                # the locale of the machine the model runs on.
                with open(path, 'w', encoding='utf-8') as f_out:
                    f_out.writelines(
                        [" ".join(sent) + "\n" for sent in data])
                log("Result saved as plain text \"{}\"".format(path))

    return all_results, result_data
def training_loop(tf_manager: TensorFlowManager,
                  epochs: int,
                  trainer: BaseRunner,  # TODO better annotate
                  batch_size: int,
                  train_dataset: Dataset,
                  val_dataset: Dataset,
                  log_directory: str,
                  evaluators: EvalConfiguration,
                  runners: List[BaseRunner],
                  test_datasets: Optional[List[Dataset]]=None,
                  save_n_best_vars: int=1,
                  link_best_vars="/tmp/variables.data.best",
                  vars_prefix="/tmp/variables.data",
                  logging_period: int=20,
                  validation_period: int=500,
                  runners_batch_size: Optional[int]=None,
                  postprocess: Callable=None,
                  minimize_metric: bool=False):
    """Perform the training loop for the given graph and data.

    Args:
        tf_manager: TensorFlowManager with initialized sessions.
        epochs: Number of epochs for which the algorithm will learn.
        trainer: The trainer object containing the TensorFlow code for
            computing the loss and optimization operation.
        batch_size: Number of examples in one training mini-batch.
        train_dataset: Dataset used for training; it is shuffled at the
            beginning of every epoch.
        val_dataset: Dataset used for validation.
        log_directory: Directory where the TensorBoard log will be generated.
            If None, no summary writer is created.
        evaluators: List of evaluators. The last evaluator is used as the
            main. An evaluator is a tuple of the name of the generated series,
            the name of the dataset series the generated one is evaluated with
            and the evaluation function. If only one series names is provided,
            it means the generated and dataset series have the same name.
        runners: Runners used for the logging and evaluation runs.
        test_datasets: Datasets evaluated (and written out) after training.
        save_n_best_vars: How many best-scoring variable files are kept.
        link_best_vars: Path of the symlink pointing to the best variables.
        vars_prefix: Path prefix of the saved variable files.
        logging_period: Training evaluation is logged once per this many
            steps.
        validation_period: Validation is run once per this many steps.
        runners_batch_size: Batch size used for validation and test runs.
        postprocess: Function that takes the output sentence as produced by
            the decoder and transforms into tokenized sentence.
        minimize_metric: Whether a lower value of the main metric is better.

    Raises:
        Exception: If ``save_n_best_vars`` is smaller than one.
    """
    def is_better(score1, score2, minimize):
        """Tell whether ``score1`` beats ``score2`` in the given direction."""
        if minimize:
            return score1 < score2
        return score1 > score2

    def argworst(scores, minimize):
        """Return the index of the worst score in the given direction."""
        if minimize:
            return np.argmax(scores)
        return np.argmin(scores)

    evaluators = [(e[0], e[0], e[1]) if len(e) == 2 else e
                  for e in evaluators]

    main_metric = "{}/{}".format(evaluators[-1][0],
                                 evaluators[-1][-1].name)
    step = 0
    seen_instances = 0

    if save_n_best_vars < 1:
        raise Exception('save_n_best_vars must be greater than zero')

    if save_n_best_vars == 1:
        variables_files = [vars_prefix]
    else:
        variables_files = ['{}.{}'.format(vars_prefix, i)
                           for i in range(save_n_best_vars)]

    if minimize_metric:
        saved_scores = [np.inf for _ in range(save_n_best_vars)]
        best_score = np.inf
    else:
        saved_scores = [-np.inf for _ in range(save_n_best_vars)]
        best_score = -np.inf

    tf_manager.save(variables_files[0])

    if os.path.islink(link_best_vars):
        # if overwriting output dir
        os.unlink(link_best_vars)
    os.symlink(os.path.basename(variables_files[0]), link_best_vars)

    # Keep the name bound even when no log directory is given, so that the
    # logging calls below do not fail with an unbound local variable.
    # NOTE(review): assumes _log_continuons_evaluation accepts a None
    # writer -- confirm against its definition.
    tb_writer = None
    if log_directory:
        log("Initializing TensorBoard summary writer.")
        tb_writer = tf.train.SummaryWriter(log_directory,
                                           tf_manager.sessions[0].graph)
        log("TensorBoard writer initialized.")

    best_score_epoch = 0
    best_score_batch_no = 0

    log("Starting training")
    try:
        for i in range(epochs):
            log_print("")
            log("Epoch {} starts".format(i + 1), color='red')

            train_dataset.shuffle()
            train_batched_datasets = train_dataset.batch_dataset(batch_size)

            for batch_n, batch_dataset in enumerate(train_batched_datasets):
                step += 1
                seen_instances += len(batch_dataset)

                if step % logging_period == logging_period - 1:
                    trainer_result = tf_manager.execute(
                        batch_dataset, [trainer], train=True,
                        summaries=True)
                    train_results, train_outputs = run_on_dataset(
                        tf_manager, runners, batch_dataset,
                        postprocess, write_out=False)
                    train_evaluation = evaluation(
                        evaluators, batch_dataset, runners,
                        train_results, train_outputs)

                    _log_continuons_evaluation(tb_writer, main_metric,
                                               train_evaluation,
                                               seen_instances,
                                               trainer_result,
                                               train=True)
                else:
                    tf_manager.execute(batch_dataset, [trainer],
                                       train=True, summaries=False)

                if step % validation_period == validation_period - 1:
                    val_results, val_outputs = run_on_dataset(
                        tf_manager, runners, val_dataset,
                        postprocess, write_out=False,
                        batch_size=runners_batch_size)
                    val_evaluation = evaluation(
                        evaluators, val_dataset, runners, val_results,
                        val_outputs)

                    this_score = val_evaluation[main_metric]

                    if is_better(this_score, best_score, minimize_metric):
                        best_score = this_score
                        best_score_epoch = i + 1
                        best_score_batch_no = batch_n

                    worst_index = argworst(saved_scores, minimize_metric)
                    worst_score = saved_scores[worst_index]

                    if is_better(this_score, worst_score, minimize_metric):
                        # we need to save this score instead the worst score
                        worst_var_file = variables_files[worst_index]
                        tf_manager.save(worst_var_file)
                        saved_scores[worst_index] = this_score
                        log("Variable file saved in {}".format(
                            worst_var_file))

                        # update symlink
                        if best_score == this_score:
                            os.unlink(link_best_vars)
                            os.symlink(os.path.basename(worst_var_file),
                                       link_best_vars)

                        log("Best scores saved so far: {}".format(
                            saved_scores))

                    log("Validation (epoch {}, batch number {}):"
                        .format(i + 1, batch_n), color='blue')

                    _log_continuons_evaluation(tb_writer, main_metric,
                                               val_evaluation,
                                               seen_instances, val_results,
                                               train=False)

                    if this_score == best_score:
                        best_score_str = colored(
                            "{:.2f}".format(best_score), attrs=['bold'])
                    else:
                        best_score_str = "{:.2f}".format(best_score)

                    log("best {} on validation: {} (in epoch {}, "
                        "after batch number {})"
                        .format(main_metric, best_score_str,
                                best_score_epoch, best_score_batch_no),
                        color='blue')

                    log_print("")
                    _print_examples(val_dataset, val_outputs)

    except KeyboardInterrupt:
        # A user interrupt ends training early but still runs the final
        # evaluation below.
        log("Training interrupted by user.")

    log("Training finished. Maximum {} on validation data: {:.2f}, epoch {}"
        .format(main_metric, best_score, best_score_epoch))

    # Guarding on test_datasets keeps the default (None) from being iterated.
    if test_datasets:
        if os.path.islink(link_best_vars):
            tf_manager.restore(link_best_vars)

        for dataset in test_datasets:
            test_results, test_outputs = run_on_dataset(
                tf_manager, runners, dataset, postprocess,
                write_out=True, batch_size=runners_batch_size)
            eval_result = evaluation(evaluators, dataset, runners,
                                     test_results, test_outputs)
            print_final_evaluation(dataset.name, eval_result)

    log("Finished.")
def run_on_dataset(
        tf_manager: TensorFlowManager,
        runners: List[BaseRunner],
        dataset: Dataset,
        postprocess: Postprocess,
        write_out: bool = False,
        batch_size: Optional[int] = None,
        log_progress: int = 0
) -> Tuple[List[ExecutionResult], Dict[str, List[Any]]]:
    """Apply the model on a dataset and optionally write outputs to files.

    Args:
        tf_manager: TensorFlow manager with initialized sessions.
        runners: Runners executed on the dataset; the ``output_series`` of
            each runner names the series its outputs are stored under.
        dataset: The dataset on which the model will be executed.
        postprocess: Iterable of ``(series_name, postprocessor)`` pairs; each
            postprocessor is called as ``postprocessor(dataset, result_data)``
            and its result is stored under ``series_name``. May be ``None``.
        write_out: Flag whether the outputs should be written to the files
            defined in ``dataset.series_outputs``.
        batch_size: size of the minibatch
        log_progress: log progress every X seconds

    Returns:
        Tuple of the execution results (one per runner) and a dictionary
        mapping series names to the produced outputs.
    """
    # ``collections.Iterable`` was removed in Python 3.10; the ABC lives in
    # ``collections.abc``. Imported locally to keep the block self-contained.
    from collections.abc import Iterable as _Iterable

    contains_targets = all(
        dataset.has_series(runner.decoder_data_id)
        for runner in runners
        if runner.decoder_data_id is not None)

    all_results = tf_manager.execute(dataset, runners,
                                     compute_losses=contains_targets,
                                     batch_size=batch_size,
                                     log_progress=log_progress)

    result_data = {
        runner.output_series: result.outputs
        for runner, result in zip(runners, all_results)
    }

    if postprocess is not None:
        for series_name, postprocessor in postprocess:
            postprocessed = postprocessor(dataset, result_data)
            # Materialize lazy results so that their length can be checked.
            if not hasattr(postprocessed, "__len__"):
                postprocessed = list(postprocessed)

            result_data[series_name] = postprocessed

    # check output series lengths
    for series_id, data in result_data.items():
        if len(data) != len(dataset):
            warn("Output '{}' for dataset '{}' has length {}, but "
                 "len(dataset) == {}".format(series_id, dataset.name,
                                             len(data), len(dataset)))

    def _check_savable_dict(data):
        """Check if the data is of savable type."""
        if not (data and data[0]):
            return False

        supported_type = Union[List[Dict[str, np.ndarray]],
                               List[List[Dict[str, np.ndarray]]]]

        try:
            check_type("data", data, supported_type, None)
        except TypeError:
            return False
        return True

    if write_out:
        for series_id, data in result_data.items():
            if series_id not in dataset.series_outputs:
                log("There is no output file for dataset: {}".format(
                    dataset.name), color="red")
                continue

            path = dataset.series_outputs[series_id]
            if isinstance(data, np.ndarray):
                np.save(path, data)
                log("Result saved as numpy array to '{}'".format(path))
            elif _check_savable_dict(data):
                # Turn the list of per-item dicts into one dict keyed by
                # the keys of the first item.
                unbatched = dict(
                    zip(data[0], zip(*[d.values() for d in data])))

                np.savez(path, **unbatched)
                log("Result saved as numpy data to '{}.npz'".format(path))
            else:
                with open(path, "w", encoding="utf-8") as f_out:
                    f_out.writelines([
                        " ".join(sent) + "\n"
                        if isinstance(sent, _Iterable)
                        else str(sent) + "\n"
                        for sent in data
                    ])
                log("Result saved as plain text '{}'".format(path))

    return all_results, result_data
def training_loop(tf_manager: TensorFlowManager,
                  epochs: int,
                  trainer: GenericTrainer,  # TODO better annotate
                  batch_size: int,
                  log_directory: str,
                  evaluators: EvalConfiguration,
                  runners: List[BaseRunner],
                  train_dataset: Dataset,
                  val_dataset: Union[Dataset, List[Dataset]],
                  test_datasets: Optional[List[Dataset]] = None,
                  logging_period: Union[str, int] = 20,
                  validation_period: Union[str, int] = 500,
                  val_preview_input_series: Optional[List[str]] = None,
                  val_preview_output_series: Optional[List[str]] = None,
                  val_preview_num_examples: int = 15,
                  train_start_offset: int = 0,
                  runners_batch_size: Optional[int] = None,
                  initial_variables: Optional[Union[str, List[str]]] = None,
                  postprocess: Postprocess = None) -> None:
    """Execute the training loop for given graph and data.

    Args:
        tf_manager: TensorFlowManager with initialized sessions.
        epochs: Number of epochs for which the algorithm will learn.
        trainer: The trainer object containing the TensorFlow code for
            computing the loss and optimization operation.
        batch_size: number of examples in one mini-batch
        log_directory: Directory where the TensorBoard log will be generated.
            If empty, no summary writer is created.
        evaluators: List of evaluators. The last evaluator is used as the
            main. An evaluator is a tuple of the name of the generated
            series, the name of the dataset series the generated one is
            evaluated with and the evaluation function. If only one
            series names is provided, it means the generated and
            dataset series have the same name.
        runners: List of runners for logging and evaluation runs
        train_dataset: Dataset used for training
        val_dataset: used for validation. Can be Dataset or a list of
            datasets. The last dataset is used as the main one for storing
            best results. When using multiple datasets. It is recommended
            to name them for better Tensorboard visualization.
        test_datasets: List of datasets used for testing
        logging_period: after how many batches should the logging happen. It
            can also be defined as a time period in format like: 3s; 4m; 6h;
            1d; 3m15s; 3seconds; 4minutes; 6hours; 1days
        validation_period: after how many batches should the validation
            happen. It can also be defined as a time period in same format as
            logging
        val_preview_input_series: which input series to preview in validation
        val_preview_output_series: which output series to preview in
            validation
        val_preview_num_examples: how many examples should be printed during
            validation
        train_start_offset: how many lines from the training dataset should be
            skipped. The training starts from the next batch.
        runners_batch_size: batch size of runners. It is the same as
            batch_size if not specified
        initial_variables: variables used for initialization, for example for
            continuation of training. Provide it with a path to your model
            directory and its checkpoint file group common prefix, e.g.
            "variables.data", or "variables.data.3" in case of multiple
            checkpoints per experiment.
        postprocess: A function which takes the dataset with its output
            series and generates additional series from them.

    Raises:
        ValueError: If no evaluators are given (the loss is the main metric)
            and ``tf_manager.minimize_metric`` is not set.
        KeyboardInterrupt: Re-raised after the final evaluation when the
            training was interrupted by the user.
    """
    check_argument_types()

    if isinstance(val_dataset, Dataset):
        val_datasets = [val_dataset]
    else:
        val_datasets = val_dataset

    log_period_batch, log_period_time = _resolve_period(logging_period)
    val_period_batch, val_period_time = _resolve_period(validation_period)

    _check_series_collisions(runners, postprocess)

    _log_model_variables(var_list=trainer.var_list)

    if runners_batch_size is None:
        runners_batch_size = batch_size

    evaluators = [(e[0], e[0], e[1]) if len(e) == 2 else e
                  for e in evaluators]

    if evaluators:
        main_metric = "{}/{}".format(evaluators[-1][0],
                                     evaluators[-1][-1].name)
    else:
        # Without evaluators, the first loss of the last runner is the main
        # metric, which only makes sense when the metric is minimized.
        main_metric = "{}/{}".format(runners[-1].decoder_data_id,
                                     runners[-1].loss_names[0])

        if not tf_manager.minimize_metric:
            raise ValueError("minimize_metric must be set to True in "
                             "TensorFlowManager when using loss as "
                             "the main metric")

    step = 0
    seen_instances = 0
    last_seen_instances = 0

    if initial_variables is None:
        # Assume we don't look at coder checkpoints when global
        # initial variables are supplied
        tf_manager.initialize_model_parts(
            runners + [trainer], save=True)  # type: ignore
    else:
        try:
            tf_manager.restore(initial_variables)
        except tf.errors.NotFoundError:
            warn("Some variables were not found in checkpoint.")

    # Keep the name bound even when no log directory is given, so that the
    # logging calls below do not fail with an unbound local variable.
    # NOTE(review): assumes _log_continuous_evaluation accepts a None
    # writer -- confirm against its definition.
    tb_writer = None
    if log_directory:
        log("Initializing TensorBoard summary writer.")
        tb_writer = tf.summary.FileWriter(log_directory,
                                          tf_manager.sessions[0].graph)
        log("TensorBoard writer initialized.")

    log("Starting training")
    last_log_time = time.process_time()
    last_val_time = time.process_time()
    interrupt = None
    try:
        for epoch_n in range(1, epochs + 1):
            log_print("")
            log("Epoch {} begins".format(epoch_n), color="red")

            train_dataset.shuffle()
            train_batched_datasets = train_dataset.batch_dataset(batch_size)

            if epoch_n == 1 and train_start_offset:
                if not isinstance(train_dataset, LazyDataset):
                    warn("Not skipping training instances with "
                         "shuffled in-memory dataset")
                else:
                    _skip_lines(train_start_offset, train_batched_datasets)

            for batch_n, batch_dataset in enumerate(train_batched_datasets):
                step += 1
                seen_instances += len(batch_dataset)

                if _is_logging_time(step, log_period_batch,
                                    last_log_time, log_period_time):
                    trainer_result = tf_manager.execute(
                        batch_dataset, [trainer], train=True,
                        summaries=True)
                    train_results, train_outputs = run_on_dataset(
                        tf_manager, runners, batch_dataset,
                        postprocess, write_out=False,
                        batch_size=runners_batch_size)
                    # ensure train outputs are iterable more than once
                    train_outputs = {
                        k: list(v) for k, v in train_outputs.items()
                    }
                    train_evaluation = evaluation(
                        evaluators, batch_dataset, runners,
                        train_results, train_outputs)

                    _log_continuous_evaluation(
                        tb_writer, main_metric, train_evaluation,
                        seen_instances, epoch_n, epochs, trainer_result,
                        train=True)
                    last_log_time = time.process_time()
                else:
                    tf_manager.execute(batch_dataset, [trainer],
                                       train=True, summaries=False)

                if _is_logging_time(step, val_period_batch,
                                    last_val_time, val_period_time):
                    log_print("")
                    val_duration_start = time.process_time()
                    val_examples = 0
                    for val_id, valset in enumerate(val_datasets):
                        val_examples += len(valset)

                        val_results, val_outputs = run_on_dataset(
                            tf_manager, runners, valset,
                            postprocess, write_out=False,
                            batch_size=runners_batch_size)
                        # ensure val outputs are iterable more than once
                        val_outputs = {
                            k: list(v) for k, v in val_outputs.items()
                        }
                        val_evaluation = evaluation(
                            evaluators, valset, runners, val_results,
                            val_outputs)

                        valheader = (
                            "Validation (epoch {}, batch number {}):".format(
                                epoch_n, batch_n))
                        log(valheader, color="blue")
                        _print_examples(
                            valset, val_outputs, val_preview_input_series,
                            val_preview_output_series,
                            val_preview_num_examples)
                        log_print("")
                        log(valheader, color="blue")

                        # The last validation set is selected to be the main
                        if val_id == len(val_datasets) - 1:
                            this_score = val_evaluation[main_metric]
                            tf_manager.validation_hook(this_score, epoch_n,
                                                       batch_n)

                            if this_score == tf_manager.best_score:
                                best_score_str = colored(
                                    "{:.4g}".format(tf_manager.best_score),
                                    attrs=["bold"])

                                # store also graph parts
                                all_coders = set.union(*[
                                    rnr.all_coders
                                    for rnr in runners + [trainer]
                                ])  # type: ignore
                                for coder in all_coders:
                                    for session in tf_manager.sessions:
                                        coder.save(session)
                            else:
                                best_score_str = "{:.4g}".format(
                                    tf_manager.best_score)

                            log("best {} on validation: {} (in epoch {}, "
                                "after batch number {})".format(
                                    main_metric, best_score_str,
                                    tf_manager.best_score_epoch,
                                    tf_manager.best_score_batch),
                                color="blue")

                        v_name = (valset.name
                                  if len(val_datasets) > 1 else None)
                        _log_continuous_evaluation(
                            tb_writer, main_metric, val_evaluation,
                            seen_instances, epoch_n, epochs, val_results,
                            train=False, dataset_name=v_name)

                    # how long was the training between validations
                    training_duration = val_duration_start - last_val_time
                    val_duration = time.process_time() - val_duration_start

                    # the training should take at least twice the time of val.
                    steptime = (training_duration
                                / (seen_instances - last_seen_instances))
                    # Empty validation data must not abort the training with
                    # a division by zero; the per-instance time is undefined.
                    if val_examples:
                        valtime = val_duration / val_examples
                    else:
                        valtime = float("nan")
                    last_seen_instances = seen_instances
                    log("Validation time: {:.2f}s, inter-validation: {:.2f}s, "
                        "per-instance (train): {:.2f}s, per-instance (val): "
                        "{:.2f}s".format(val_duration, training_duration,
                                         steptime, valtime),
                        color="blue")
                    if training_duration < 2 * val_duration:
                        notice("Validation period setting is inefficient.")

                    log_print("")
                    last_val_time = time.process_time()

    except KeyboardInterrupt as ex:
        # Remember the interrupt so that the final evaluation still runs;
        # it is re-raised at the very end.
        interrupt = ex

    log("Training finished. Maximum {} on validation data: {:.4g}, epoch {}".
        format(main_metric, tf_manager.best_score,
               tf_manager.best_score_epoch))

    if test_datasets:
        tf_manager.restore_best_vars()

        for dataset in test_datasets:
            test_results, test_outputs = run_on_dataset(
                tf_manager, runners, dataset, postprocess,
                write_out=True, batch_size=runners_batch_size)
            # ensure test outputs are iterable more than once
            test_outputs = {k: list(v) for k, v in test_outputs.items()}
            eval_result = evaluation(evaluators, dataset, runners,
                                     test_results, test_outputs)
            print_final_evaluation(dataset.name, eval_result)

    log("Finished.")

    if interrupt is not None:
        raise interrupt  # pylint: disable=raising-bad-type
def run_on_dataset(tf_manager: TensorFlowManager,
                   runners: List[BaseRunner],
                   dataset: Dataset,
                   postprocess: Postprocess,
                   batching_scheme: BatchingScheme,
                   write_out: bool = False,
                   log_progress: int = 0) -> Tuple[
                       List[ExecutionResult], Dict[str, List[Any]]]:
    """Run the runners over a dataset batch by batch and collect outputs.

    The dataset is split into batches by ``dataset.batches(batching_scheme)``.
    The per-batch results of every runner are merged with
    ``reduce_execution_results`` and the progress is optionally logged.

    Args:
        tf_manager: TensorFlow manager with initialized sessions.
        runners: Runners to execute; the ``output_series`` of each runner
            names the series its outputs are stored under.
        dataset: The dataset on which the model will be executed.
        postprocess: Dataset-level postprocessors given as
            ``(series_name, postprocessor)`` pairs, or ``None``.
        batching_scheme: Scheme used for batching.
        write_out: Flag whether the outputs should be written using the
            ``(path, writer)`` pairs stored in ``dataset.outputs``.
        log_progress: log progress every X seconds

    Returns:
        Tuple of the merged execution results (one per runner) and a
        dictionary mapping series names to the produced outputs.
    """
    # Losses are computed only if each target series declared by a runner
    # is present in the dataset.
    compute_losses = True
    for runner in runners:
        if runner.decoder_data_id is None:
            continue
        if not dataset.has_series(runner.decoder_data_id):
            compute_losses = False
            break

    progress_stamp = time.process_time()
    per_runner = [[] for _ in runners]  # type: List[List[ExecutionResult]]
    feedables = set.union(*[runner.feedables for runner in runners])

    n_processed = 0
    for batch in dataset.batches(batching_scheme):
        if log_progress > 0 and (
                time.process_time() - progress_stamp > log_progress):
            log("Processed {} examples.".format(n_processed))
            progress_stamp = time.process_time()

        batch_outcomes = tf_manager.execute(
            batch, feedables, runners, compute_losses=compute_losses)
        n_processed += len(batch)

        for collected, outcome in zip(per_runner, batch_outcomes):
            collected.append(outcome)

    # Merge the per-batch results of every runner into a single result.
    all_results = [reduce_execution_results(chunks) for chunks in per_runner]

    # Index the outputs by the output series of the producing runner.
    result_data = {}
    for runner, merged in zip(runners, all_results):
        result_data[runner.output_series] = merged.outputs

    # Dataset-level postprocessing; lazy results are materialized.
    if postprocess is not None:
        for series_name, postprocessor in postprocess:
            produced = postprocessor(dataset, result_data)
            result_data[series_name] = (
                produced if hasattr(produced, "__len__") else list(produced))

    # Every output series should have as many items as the dataset.
    for series_id, data in result_data.items():
        if len(data) == len(dataset):
            continue
        warn("Output '{}' for dataset '{}' has length {}, but "
             "len(dataset) == {}".format(series_id, dataset.name,
                                         len(data), len(dataset)))

    if write_out:
        if dataset.outputs is None:
            log("Dataset does not have any outputs, nothing to write out.",
                color="red")
        else:
            for series_id, data in result_data.items():
                if series_id not in dataset.outputs:
                    log("There is no file for output series '{}' in dataset: '{}'"
                        .format(series_id, dataset.name), color="red")
                    continue
                path, writer = dataset.outputs[series_id]
                writer(path, data)

    return all_results, result_data
def training_loop(tf_manager: TensorFlowManager,
                  epochs: int,
                  trainer: GenericTrainer,  # TODO better annotate
                  batch_size: int,
                  train_dataset: Dataset,
                  val_dataset: Dataset,
                  log_directory: str,
                  evaluators: EvalConfiguration,
                  runners: List[BaseRunner],
                  test_datasets: Optional[List[Dataset]] = None,
                  logging_period: int = 20,
                  validation_period: int = 500,
                  val_preview_input_series: Optional[List[str]] = None,
                  val_preview_output_series: Optional[List[str]] = None,
                  val_preview_num_examples: int = 15,
                  train_start_offset: int = 0,
                  runners_batch_size: Optional[int] = None,
                  initial_variables: Optional[Union[str, List[str]]] = None,
                  postprocess: Postprocess = None) -> None:
    """Perform the training loop for the given graph and data.

    Args:
        tf_manager: TensorFlowManager with initialized sessions.
        epochs: Number of epochs for which the algorithm will learn.
        trainer: The trainer object containing the TensorFlow code for
            computing the loss and optimization operation.
        batch_size: Number of examples in one training mini-batch.
        train_dataset: Dataset used for training; it is shuffled at the
            beginning of every epoch.
        val_dataset: Dataset used for validation.
        log_directory: Directory where the TensorBoard log will be generated.
            If None, no summary writer is created.
        evaluators: List of evaluators. The last evaluator is used as the
            main. An evaluator is a tuple of the name of the generated series,
            the name of the dataset series the generated one is evaluated with
            and the evaluation function. If only one series names is provided,
            it means the generated and dataset series have the same name.
        runners: Runners used for the logging and evaluation runs.
        test_datasets: Datasets evaluated (and written out) after training.
        logging_period: Training evaluation is logged once per this many
            steps.
        validation_period: Validation is run once per this many steps. Must
            not be smaller than ``logging_period``.
        val_preview_input_series: Input series passed to ``_print_examples``.
        val_preview_output_series: Output series passed to
            ``_print_examples``.
        val_preview_num_examples: Number of examples passed to
            ``_print_examples``.
        train_start_offset: Offset passed to ``_skip_lines`` in the first
            epoch; only applied to a ``LazyDataset``.
        runners_batch_size: Batch size used for validation and test runs;
            defaults to ``batch_size``.
        initial_variables: Checkpoint specification passed to
            ``tf_manager.restore``. When ``None``, the model parts are
            initialized instead.
        postprocess: Postprocessing passed on to ``run_on_dataset``.

    Raises:
        AssertionError: If ``validation_period`` is smaller than
            ``logging_period``.
        ValueError: If no evaluators are given (the loss is the main metric)
            and ``tf_manager.minimize_metric`` is not set.
    """
    if validation_period < logging_period:
        raise AssertionError(
            "Validation period can't be smaller than logging period.")
    _check_series_collisions(runners, postprocess)

    _log_model_variables()

    if tf_manager.report_gpu_memory_consumption:
        log("GPU memory usage: {}".format(gpu_memusage()))

    if runners_batch_size is None:
        runners_batch_size = batch_size

    evaluators = [(e[0], e[0], e[1]) if len(e) == 2 else e
                  for e in evaluators]

    if evaluators:
        main_metric = "{}/{}".format(evaluators[-1][0],
                                     evaluators[-1][-1].name)
    else:
        # Without evaluators, the first loss of the last runner is the main
        # metric, which only makes sense when the metric is minimized.
        main_metric = "{}/{}".format(runners[-1].decoder_data_id,
                                     runners[-1].loss_names[0])

        if not tf_manager.minimize_metric:
            raise ValueError("minimize_metric must be set to True in "
                             "TensorFlowManager when using loss as "
                             "the main metric")

    step = 0
    seen_instances = 0

    if initial_variables is None:
        # Assume we don't look at coder checkpoints when global
        # initial variables are supplied
        tf_manager.initialize_model_parts(
            runners + [trainer], save=True)  # type: ignore
    else:
        tf_manager.restore(initial_variables)

    # Keep the name bound even when no log directory is given, so that the
    # logging calls below do not fail with an unbound local variable.
    # NOTE(review): assumes _log_continuous_evaluation accepts a None
    # writer -- confirm against its definition.
    tb_writer = None
    if log_directory:
        log("Initializing TensorBoard summary writer.")
        tb_writer = tf.summary.FileWriter(log_directory,
                                          tf_manager.sessions[0].graph)
        log("TensorBoard writer initialized.")

    log("Starting training")
    try:
        for epoch_n in range(1, epochs + 1):
            log_print("")
            log("Epoch {} starts".format(epoch_n), color='red')

            train_dataset.shuffle()
            train_batched_datasets = train_dataset.batch_dataset(batch_size)

            if epoch_n == 1 and train_start_offset:
                if not isinstance(train_dataset, LazyDataset):
                    warn("Not skipping training instances with "
                         "shuffled in-memory dataset")
                else:
                    _skip_lines(train_start_offset, train_batched_datasets)

            for batch_n, batch_dataset in enumerate(train_batched_datasets):
                step += 1
                seen_instances += len(batch_dataset)

                if step % logging_period == logging_period - 1:
                    trainer_result = tf_manager.execute(
                        batch_dataset, [trainer], train=True,
                        summaries=True)
                    train_results, train_outputs = run_on_dataset(
                        tf_manager, runners, batch_dataset,
                        postprocess, write_out=False)
                    # ensure train outputs are iterable more than once
                    train_outputs = {
                        k: list(v) for k, v in train_outputs.items()
                    }
                    train_evaluation = evaluation(
                        evaluators, batch_dataset, runners,
                        train_results, train_outputs)

                    _log_continuous_evaluation(
                        tb_writer, tf_manager, main_metric,
                        train_evaluation, seen_instances, epoch_n, epochs,
                        trainer_result, train=True)
                else:
                    tf_manager.execute(batch_dataset, [trainer],
                                       train=True, summaries=False)

                if step % validation_period == validation_period - 1:
                    val_results, val_outputs = run_on_dataset(
                        tf_manager, runners, val_dataset,
                        postprocess, write_out=False,
                        batch_size=runners_batch_size)
                    # ensure val outputs are iterable more than once
                    val_outputs = {k: list(v)
                                   for k, v in val_outputs.items()}
                    val_evaluation = evaluation(
                        evaluators, val_dataset, runners, val_results,
                        val_outputs)

                    this_score = val_evaluation[main_metric]
                    tf_manager.validation_hook(this_score, epoch_n, batch_n)

                    log("Validation (epoch {}, batch number {}):".format(
                        epoch_n, batch_n), color='blue')

                    _log_continuous_evaluation(
                        tb_writer, tf_manager, main_metric, val_evaluation,
                        seen_instances, epoch_n, epochs, val_results,
                        train=False)

                    if this_score == tf_manager.best_score:
                        best_score_str = colored(
                            "{:.4g}".format(tf_manager.best_score),
                            attrs=['bold'])
                    else:
                        best_score_str = "{:.4g}".format(
                            tf_manager.best_score)

                    log("best {} on validation: {} (in epoch {}, "
                        "after batch number {})".format(
                            main_metric, best_score_str,
                            tf_manager.best_score_epoch,
                            tf_manager.best_score_batch),
                        color='blue')

                    log_print("")
                    _print_examples(val_dataset, val_outputs,
                                    val_preview_input_series,
                                    val_preview_output_series,
                                    val_preview_num_examples)

    except KeyboardInterrupt:
        # A user interrupt ends training early but still runs the final
        # evaluation below.
        log("Training interrupted by user.")

    log("Training finished. Maximum {} on validation data: {:.4g}, epoch {}".
        format(main_metric, tf_manager.best_score,
               tf_manager.best_score_epoch))

    if test_datasets:
        tf_manager.restore_best_vars()

        for dataset in test_datasets:
            test_results, test_outputs = run_on_dataset(
                tf_manager, runners, dataset, postprocess,
                write_out=True, batch_size=runners_batch_size)
            # ensure test outputs are iterable more than once
            test_outputs = {k: list(v) for k, v in test_outputs.items()}
            eval_result = evaluation(evaluators, dataset, runners,
                                     test_results, test_outputs)
            print_final_evaluation(dataset.name, eval_result)

    log("Finished.")
def run_on_dataset(
        tf_manager: TensorFlowManager,
        runners: List[BaseRunner],
        dataset_runner: DatasetRunner,
        dataset: Dataset,
        postprocess: Postprocess,
        write_out: bool = False,
        log_progress: int = 0
) -> Tuple[List[ExecutionResult], Dict[str, List], Dict[str, List]]:
    """Run the runners over a dataset batch by batch and collect outputs.

    The dataset is processed in the batches yielded by ``dataset.batches()``.
    The dataset runner is executed together with the runners, and the input
    series of every batch are accumulated alongside the outputs.

    Args:
        tf_manager: TensorFlow manager with initialized sessions.
        runners: Runners to execute on every batch.
        dataset_runner: A runner object that fetches the data inputs
        dataset: The dataset on which the model will be executed.
        postprocess: Dataset-level postprocessors given as
            ``(series_name, postprocessor)`` pairs, or ``None``.
        write_out: Flag whether the outputs should be written using the
            ``(path, writer)`` pairs stored in ``dataset.outputs``.
        log_progress: log progress every X seconds

    Returns:
        Triple of the joined execution results (one per runner), a
        dictionary mapping output series names to the produced data, and a
        dictionary mapping input series names to the fetched input data.

    Raises:
        ValueError: If two runners produce the same output series.
    """
    # Losses are computed only if each target series declared by a runner
    # is present in the dataset.
    compute_losses = True
    for runner in runners:
        if runner.decoder_data_id is None:
            continue
        if runner.decoder_data_id not in dataset:
            compute_losses = False
            break

    progress_stamp = time.process_time()

    # One result list per runner plus a trailing one for the dataset runner.
    per_executor = [
        [] for _ in range(len(runners) + 1)
    ]  # type: List[List[ExecutionResult]]

    feedables = set.union(*[runner.feedables for runner in runners])
    feedables |= dataset_runner.feedables

    fetched_input = {s: [] for s in dataset.series}  # type: Dict[str, List]

    n_processed = 0
    for batch in dataset.batches():
        if log_progress > 0 and (
                time.process_time() - progress_stamp > log_progress):
            log("Processed {} examples.".format(n_processed))
            progress_stamp = time.process_time()

        executors = list(runners) + [
            dataset_runner]  # type: List[GraphExecutor]

        batch_outcomes = tf_manager.execute(batch, feedables, executors,
                                            compute_losses=compute_losses)
        n_processed += len(batch)

        for collected, outcome in zip(per_executor, batch_outcomes):
            collected.append(outcome)

        for s_id in batch.series:
            fetched_input[s_id].extend(batch.get_series(s_id))

    # Join the per-batch results of the runners (dataset runner excluded).
    all_results = [join_execution_results(chunks)
                   for chunks in per_executor[:-1]]

    # TODO uncomment this when dataset runner starts outputting the dataset
    # input_transposed = join_execution_results(batch_results[-1]).outputs
    # fetched_input = {
    #     k: [dic[k] for dic in input_transposed] for k in input_transposed[0]}

    fetched_input_lengths = {s: len(fetched_input[s]) for s in dataset.series}

    distinct_lengths = set(fetched_input_lengths.values())
    if len(distinct_lengths) != 1:
        warn("Fetched input dataset series are not of the same length: {}".
             format(str(fetched_input_lengths)))

    dataset_len = fetched_input_lengths[dataset.series[0]]

    # Flatten the outputs of all runners into one dictionary; a series may
    # be produced by one runner only.
    result_data = {}  # type: Dict[str, Union[List, np.ndarray]]
    for joined in all_results:
        for s_id, data in joined.outputs.items():
            if s_id in result_data:
                raise ValueError("Overwriting output series forbidden.")
            result_data[s_id] = data

    # Dataset-level postprocessing; lazy results are materialized.
    if postprocess is not None:
        for series_name, postprocessor in postprocess:
            produced = postprocessor(fetched_input, result_data)
            result_data[series_name] = (
                produced if hasattr(produced, "__len__") else list(produced))

    # Every output series should have as many items as the fetched input.
    for series_id, data in result_data.items():
        if len(data) == dataset_len:
            continue
        warn("Output '{}' for dataset '{}' has length {}, but input "
             "dataset size is {}".format(series_id, dataset.name,
                                         len(data), dataset_len))

    if write_out:
        if dataset.outputs is None:
            log("Dataset does not have any outputs, nothing to write out.",
                color="red")
        else:
            for series_id, data in result_data.items():
                if series_id not in dataset.outputs:
                    log("There is no file for output series '{}' in dataset: '{}'".
                        format(series_id, dataset.name), color="red")
                    continue
                path, writer = dataset.outputs[series_id]
                writer(path, data)

    return all_results, result_data, fetched_input
def training_loop(tf_manager: TensorFlowManager,
                  epochs: int,
                  trainer: GenericTrainer,  # TODO better annotate
                  batch_size: int,
                  train_dataset: Dataset,
                  val_dataset: Dataset,
                  log_directory: str,
                  evaluators: EvalConfiguration,
                  runners: List[BaseRunner],
                  test_datasets: Optional[List[Dataset]] = None,
                  link_best_vars="/tmp/variables.data.best",
                  vars_prefix="/tmp/variables.data",
                  logging_period: int = 20,
                  validation_period: int = 500,
                  val_preview_input_series: Optional[List[str]] = None,
                  val_preview_output_series: Optional[List[str]] = None,
                  val_preview_num_examples: int = 15,
                  train_start_offset: int = 0,
                  runners_batch_size: Optional[int] = None,
                  initial_variables: Optional[Union[str, List[str]]] = None,
                  postprocess: Postprocess = None,
                  minimize_metric: bool = False):
    """Perform the training loop for given graph and data.

    The trainer is executed on every training batch. Periodically the runners
    are also executed on the current training batch (logging) and on the
    validation dataset (validation). The variables of the best-scoring
    validation steps are saved; after training, they are restored and the
    model is evaluated on the test datasets.

    Args:
        tf_manager: TensorFlowManager with initialized sessions. Its
            ``saver_max_to_keep`` attribute determines how many best-scoring
            variable files are kept.
        epochs: Number of epochs for which the algorithm will learn.
        trainer: The trainer object containing the TensorFlow code for
            computing the loss and optimization operation.
        batch_size: Batch size used for splitting the training dataset.
        train_dataset: Dataset used for training; it is shuffled and batched
            at the start of every epoch.
        val_dataset: Dataset on which the runners are executed and evaluated
            every ``validation_period`` steps.
        log_directory: Directory where the TensorBoard log will be generated.
            If None (or empty), no summary writer is created.
        evaluators: List of evaluators. The last evaluator is used as the
            main. An evaluator is a tuple of the name of the generated series,
            the name of the dataset series the generated one is evaluated
            with and the evaluation function. If only one series name is
            provided, it means the generated and dataset series have the same
            name.
        runners: Runners executed to obtain the model outputs for logging,
            validation and testing.
        test_datasets: Datasets evaluated (and written out) after training
            with the best saved variables restored.
        link_best_vars: Path of the symbolic link that points to the variable
            file with the best validation score.
        vars_prefix: Path prefix of the saved variable files. When more than
            one file is kept, the files are named ``<vars_prefix>.<index>``.
        logging_period: Number of steps between evaluations on the training
            batch.
        validation_period: Number of steps between evaluations on the
            validation dataset. Must not be smaller than ``logging_period``.
        val_preview_input_series: Passed to the example preview printed after
            each validation.
        val_preview_output_series: Passed to the example preview printed after
            each validation.
        val_preview_num_examples: Passed to the example preview printed after
            each validation.
        train_start_offset: If non-zero, instances to skip in the first epoch.
            Skipping is only done for ``LazyDataset``; otherwise a warning is
            logged.
        runners_batch_size: Batch size for running the runners on validation
            and test data. Defaults to ``batch_size``.
        initial_variables: Variables to restore instead of initializing the
            model parts.
        postprocess: Postprocessing passed on to ``run_on_dataset``.
        minimize_metric: If True, lower values of the main metric are
            considered better.

    Raises:
        AssertionError: If ``validation_period`` is smaller than
            ``logging_period``.
        Exception: If ``tf_manager.saver_max_to_keep`` is smaller than one.
    """
    if validation_period < logging_period:
        raise AssertionError(
            "Validation period can't be smaller than logging period.")
    _check_series_collisions(runners, postprocess)

    paramstr = "Model has {} trainable parameters".format(trainer.n_parameters)
    if tf_manager.report_gpu_memory_consumption:
        paramstr += ", GPU memory usage: {}".format(gpu_memusage())
    log(paramstr)

    # TODO DOCUMENT_THIS
    if runners_batch_size is None:
        runners_batch_size = batch_size

    # Expand two-element evaluators so that the generated and the reference
    # series share the same name.
    evaluators = [(e[0], e[0], e[1]) if len(e) == 2 else e
                  for e in evaluators]

    main_metric = "{}/{}".format(evaluators[-1][0],
                                 evaluators[-1][-1].name)
    step = 0
    seen_instances = 0

    save_n_best_vars = tf_manager.saver_max_to_keep
    if save_n_best_vars < 1:
        raise Exception('save_n_best_vars must be greater than zero')

    if save_n_best_vars == 1:
        variables_files = [vars_prefix]
    else:
        variables_files = ['{}.{}'.format(vars_prefix, i)
                           for i in range(save_n_best_vars)]

    # Start from the worst possible score so that any real score is better.
    worst_possible_score = np.inf if minimize_metric else -np.inf
    saved_scores = [worst_possible_score for _ in range(save_n_best_vars)]
    best_score = worst_possible_score

    def is_better(score1, score2, minimize):
        """Tell whether score1 is strictly better than score2."""
        if minimize:
            return score1 < score2
        return score1 > score2

    def argworst(scores, minimize):
        """Return the index of the worst score in the list."""
        if minimize:
            return np.argmax(scores)
        return np.argmin(scores)

    if initial_variables is None:
        # Assume we don't look at coder checkpoints when global
        # initial variables are supplied
        tf_manager.initialize_model_parts(
            runners + [trainer])  # type: ignore
        tf_manager.save(variables_files[0])
    else:
        tf_manager.restore(initial_variables)

    if os.path.islink(link_best_vars):
        # if overwriting output dir
        os.unlink(link_best_vars)
    os.symlink(os.path.basename(variables_files[0]), link_best_vars)

    # Always bind the writer: it is handed to the logging helper below even
    # when no log directory was given.
    # NOTE(review): confirm that `_log_continuous_evaluation` skips the
    # TensorBoard output when the writer is None.
    tb_writer = None
    if log_directory:
        log("Initializing TensorBoard summary writer.")
        tb_writer = tf.train.SummaryWriter(log_directory,
                                           tf_manager.sessions[0].graph)
        log("TensorBoard writer initialized.")

    best_score_epoch = 0
    best_score_batch_no = 0

    log("Starting training")
    try:
        for epoch_n in range(1, epochs + 1):
            log_print("")
            log("Epoch {} starts".format(epoch_n), color='red')

            train_dataset.shuffle()
            train_batched_datasets = train_dataset.batch_dataset(batch_size)

            if epoch_n == 1 and train_start_offset:
                if not isinstance(train_dataset, LazyDataset):
                    log("Warning: Not skipping training instances with "
                        "shuffled in-memory dataset", color="red")
                else:
                    _skip_lines(train_start_offset, train_batched_datasets)

            for batch_n, batch_dataset in enumerate(train_batched_datasets):
                step += 1
                seen_instances += len(batch_dataset)

                if step % logging_period == logging_period - 1:
                    trainer_result = tf_manager.execute(
                        batch_dataset, [trainer], train=True, summaries=True)
                    train_results, train_outputs = run_on_dataset(
                        tf_manager, runners, batch_dataset,
                        postprocess, write_out=False)
                    # ensure train outputs are iterable more than once
                    train_outputs = {k: list(v)
                                     for k, v in train_outputs.items()}
                    train_evaluation = evaluation(
                        evaluators, batch_dataset, runners,
                        train_results, train_outputs)

                    _log_continuous_evaluation(
                        tb_writer, tf_manager, main_metric, train_evaluation,
                        seen_instances, epoch_n, epochs, trainer_result,
                        train=True)
                else:
                    tf_manager.execute(batch_dataset, [trainer],
                                       train=True, summaries=False)

                if step % validation_period == validation_period - 1:
                    val_results, val_outputs = run_on_dataset(
                        tf_manager, runners, val_dataset,
                        postprocess, write_out=False,
                        batch_size=runners_batch_size)
                    # ensure val outputs are iterable more than once
                    val_outputs = {k: list(v)
                                   for k, v in val_outputs.items()}
                    val_evaluation = evaluation(
                        evaluators, val_dataset, runners, val_results,
                        val_outputs)

                    this_score = val_evaluation[main_metric]

                    if is_better(this_score, best_score, minimize_metric):
                        best_score = this_score
                        best_score_epoch = epoch_n
                        best_score_batch_no = batch_n

                    worst_index = argworst(saved_scores, minimize_metric)
                    worst_score = saved_scores[worst_index]

                    if is_better(this_score, worst_score, minimize_metric):
                        # we need to save this score instead of the worst
                        # score
                        worst_var_file = variables_files[worst_index]
                        tf_manager.save(worst_var_file)
                        saved_scores[worst_index] = this_score
                        log("Variable file saved in {}".format(
                            worst_var_file))

                        # update symlink
                        if best_score == this_score:
                            os.unlink(link_best_vars)
                            os.symlink(os.path.basename(worst_var_file),
                                       link_best_vars)

                        log("Best scores saved so far: {}".format(
                            saved_scores))

                    log("Validation (epoch {}, batch number {}):".format(
                        epoch_n, batch_n), color='blue')

                    _log_continuous_evaluation(
                        tb_writer, tf_manager, main_metric, val_evaluation,
                        seen_instances, epoch_n, epochs, val_results,
                        train=False)

                    # Highlight the best score when it was reached just now.
                    if this_score == best_score:
                        best_score_str = colored(
                            "{:.4g}".format(best_score), attrs=['bold'])
                    else:
                        best_score_str = "{:.4g}".format(best_score)

                    log("best {} on validation: {} (in epoch {}, "
                        "after batch number {})".format(
                            main_metric, best_score_str,
                            best_score_epoch, best_score_batch_no),
                        color='blue')

                    log_print("")
                    _print_examples(val_dataset, val_outputs,
                                    val_preview_input_series,
                                    val_preview_output_series,
                                    val_preview_num_examples)
    except KeyboardInterrupt:
        log("Training interrupted by user.")

    # The best score is a minimum when the main metric is being minimized.
    extremum = "Minimum" if minimize_metric else "Maximum"
    log("Training finished. {} {} on validation data: {:.4g}, epoch {}".
        format(extremum, main_metric, best_score, best_score_epoch))

    if test_datasets and os.path.islink(link_best_vars):
        tf_manager.restore(link_best_vars)

    for dataset in test_datasets:
        test_results, test_outputs = run_on_dataset(
            tf_manager, runners, dataset, postprocess,
            write_out=True, batch_size=runners_batch_size)
        # ensure test outputs are iterable more than once
        test_outputs = {k: list(v) for k, v in test_outputs.items()}
        eval_result = evaluation(evaluators, dataset, runners,
                                 test_results, test_outputs)
        print_final_evaluation(dataset.name, eval_result)

    log("Finished.")