def run_on_dataset(tf_manager: TensorFlowManager,
                   runners: List[BaseRunner],
                   dataset: Dataset,
                   postprocess: Callable,
                   write_out: bool = False,
                   batch_size: Optional[int] = None) \
        -> Tuple[List[ExecutionResult], Dict[str, List[Any]]]:
    """Apply the model on a dataset and optionally write outputs to files.

    Args:
        tf_manager: TensorFlow manager with initialized sessions.
        runners: Runners to execute; each one contributes the series named
            by its ``output_series`` attribute to the result dictionary.
        dataset: The dataset on which the model will be executed.
        postprocess: Callable invoked as ``postprocess(dataset, raw_outputs)``
            whose return value replaces the raw outputs. When it is ``None``,
            the raw outputs are returned unchanged.
        write_out: Flag whether the outputs should be written to the files
            listed in ``dataset.series_outputs``.
        batch_size: Forwarded to ``tf_manager.execute``.

    Returns:
        Tuple of the execution results returned by ``tf_manager.execute``
        and a dictionary mapping output series names to the (optionally
        postprocessed) outputs.
    """
    # The ``train`` flag is switched on only when the dataset provides the
    # output series of every runner.
    contains_targets = all(dataset.has_series(runner.output_series)
                           for runner in runners)

    all_results = tf_manager.execute(dataset, runners,
                                     train=contains_targets,
                                     batch_size=batch_size)

    result_data_raw = {runner.output_series: result.outputs
                       for runner, result in zip(runners, all_results)}

    if postprocess is not None:
        result_data = postprocess(dataset, result_data_raw)
    else:
        result_data = result_data_raw

    if write_out:
        for series_id, data in result_data.items():
            if series_id not in dataset.series_outputs:
                log("There is no output file for dataset: {}"
                    .format(dataset.name), color='red')
                continue

            path = dataset.series_outputs[series_id]
            if isinstance(data, np.ndarray):
                np.save(path, data)
                log('Result saved as numpy array to "{}"'.format(path))
            else:
                # Explicit encoding so that non-ASCII tokens are written
                # the same way regardless of the platform locale.
                with open(path, 'w', encoding='utf-8') as f_out:
                    f_out.writelines(
                        [" ".join(sent) + "\n" for sent in data])
                log("Result saved as plain text \"{}\"".format(path))

    return all_results, result_data
def check_dataset_and_coders(dataset: Dataset,
                             runners: Iterable[BaseRunner]) -> None:
    """Check that the dataset contains every series the coders refer to.

    The series names are collected from the ``data_id`` (single name) or
    ``data_ids`` (several names) attribute of each coder in
    ``runner.all_coders``. Coders having neither attribute are only logged.

    Args:
        dataset: The dataset whose series are checked.
        runners: Runners whose coders are inspected.

    Raises:
        CheckingException: If the dataset lacks at least one of the
            collected series. The message lists each missing series
            together with the name and class of the coder requesting it.
    """
    # pylint: disable=protected-access
    data_list = []
    for runner in runners:
        for coder in runner.all_coders:
            if hasattr(coder, "data_id"):
                data_list.append((coder.data_id, coder))
            elif hasattr(coder, "data_ids"):
                data_list.extend([(d, coder) for d in coder.data_ids])
            else:
                log(("Coder: {} does not have "
                     "a data attribute.").format(coder))

    debug("Found series: {}".format(str(data_list)), "checking")

    # Collect all missing series first so that a single exception reports
    # every problem at once.
    missing = []
    for serie, coder in data_list:
        if not dataset.has_series(serie):
            log("dataset {} does not have serie {}".format(
                dataset.name, serie))
            missing.append((coder, serie))

    if missing:
        formatted = [
            "{} ({}, {}.{})".format(serie, cod.name,
                                    cod.__class__.__module__,
                                    cod.__class__.__name__)
            for cod, serie in missing
        ]

        raise CheckingException("Dataset '{}' is missing series: {}".format(
            dataset.name, ", ".join(formatted)))
def _print_examples(dataset: Dataset,
                    outputs: Dict[str, List[Any]],
                    num_examples=15) -> None:
    """Log the first examples of the dataset alongside the model outputs.

    For each of the first ``min(len(dataset), num_examples)`` examples, the
    dataset series that are not model outputs are printed first, followed by
    every output series and, when the dataset has a series of the same name,
    the corresponding reference value.
    """
    log_print(colored("Examples:", attrs=['bold']))

    # The series are indexed by example position below, so each of them is
    # materialized as a list up front.
    target_series = {}
    for name in outputs.keys():
        if dataset.has_series(name):
            target_series[name] = list(dataset.get_series(name))

    source_series = {}
    for name in dataset.series_ids:
        if name not in outputs:
            source_series[name] = list(dataset.get_series(name))

    def print_line(prefix, color, content):
        """Log one series value behind its colored label."""
        label = colored(prefix, color=color)
        text = _data_item_to_str(content)
        log_print(" {}: {}".format(label, text))

    shown = min(len(dataset), num_examples)
    for index in range(shown):
        header = " [{}]".format(index + 1)
        log_print(colored(header, color='magenta', attrs=['bold']))

        # Inputs first, in alphabetical order of the series names.
        for name in sorted(source_series):
            print_line(name, 'yellow', source_series[name][index])

        # Then the model outputs, each followed by its reference if known.
        for name in sorted(outputs):
            print_line(name, 'magenta', outputs[name][index])
            if name in target_series:
                print_line(name + " (ref)", "red",
                           target_series[name][index])

        log_print("")
def run_on_dataset(
        tf_manager: TensorFlowManager,
        runners: List[BaseRunner],
        dataset: Dataset,
        postprocess: Postprocess,
        write_out: bool = False,
        batch_size: Optional[int] = None,
        log_progress: int = 0
) -> Tuple[List[ExecutionResult], Dict[str, List[Any]]]:
    """Apply the model on a dataset and optionally write outputs to files.

    Args:
        tf_manager: TensorFlow manager with initialized sessions.
        runners: Runners to execute; each one contributes the series named
            by its ``output_series`` attribute to the result dictionary.
        dataset: The dataset on which the model will be executed.
        postprocess: Iterable of ``(series_name, postprocessor)`` pairs or
            ``None``. Each postprocessor is called as
            ``postprocessor(dataset, result_data)`` and its result is stored
            in the result dictionary under ``series_name``.
        write_out: Flag whether the outputs should be written to the files
            listed in ``dataset.series_outputs``.
        batch_size: Size of the minibatch, forwarded to
            ``tf_manager.execute``.
        log_progress: Progress logging interval, forwarded to
            ``tf_manager.execute``.

    Returns:
        Tuple of the execution results returned by ``tf_manager.execute``
        and a dictionary mapping series names to the (optionally
        postprocessed) outputs.
    """
    # ``collections.Iterable`` was removed in Python 3.10; the abstract base
    # classes live in ``collections.abc``. Imported locally so the function
    # does not rely on the module-level imports.
    from collections import abc as collections_abc

    # Losses are computed only if the dataset provides the target series of
    # every runner that declares one.
    contains_targets = all(
        dataset.has_series(runner.decoder_data_id)
        for runner in runners
        if runner.decoder_data_id is not None)

    all_results = tf_manager.execute(dataset, runners,
                                     compute_losses=contains_targets,
                                     batch_size=batch_size,
                                     log_progress=log_progress)

    result_data = {
        runner.output_series: result.outputs
        for runner, result in zip(runners, all_results)
    }

    if postprocess is not None:
        for series_name, postprocessor in postprocess:
            postprocessed = postprocessor(dataset, result_data)
            # Results without a length (e.g. generators) are materialized
            # so that the length check below can be applied to them.
            if not hasattr(postprocessed, "__len__"):
                postprocessed = list(postprocessed)

            result_data[series_name] = postprocessed

    # check output series lengths
    for series_id, data in result_data.items():
        if len(data) != len(dataset):
            warn("Output '{}' for dataset '{}' has length {}, but "
                 "len(dataset) == {}".format(series_id, dataset.name,
                                             len(data), len(dataset)))

    def _check_savable_dict(data):
        """Check if the data is of savable type."""
        if not (data and data[0]):
            return False

        supported_type = Union[List[Dict[str, np.ndarray]],
                               List[List[Dict[str, np.ndarray]]]]

        try:
            check_type("data", data, supported_type, None)
        except TypeError:
            return False
        return True

    if write_out:
        for series_id, data in result_data.items():
            if series_id not in dataset.series_outputs:
                log("There is no output file for dataset: {}".format(
                    dataset.name), color="red")
                continue

            path = dataset.series_outputs[series_id]
            if isinstance(data, np.ndarray):
                np.save(path, data)
                log("Result saved as numpy array to '{}'".format(path))
            elif _check_savable_dict(data):
                # Turn the list of per-item dictionaries into one
                # dictionary of per-key tuples; keys are taken from the
                # first item.
                # NOTE(review): ``.values()`` is called on every item, which
                # only works for the flat list-of-dicts shape -- confirm how
                # the nested shape accepted by _check_savable_dict is meant
                # to be saved.
                unbatched = dict(
                    zip(data[0], zip(*[d.values() for d in data])))

                np.savez(path, **unbatched)
                log("Result saved as numpy data to '{}.npz'".format(path))
            else:
                # NOTE(review): a plain ``str`` item is iterable as well, so
                # its characters get joined by spaces -- confirm the items
                # are token sequences.
                with open(path, "w", encoding="utf-8") as f_out:
                    f_out.writelines([
                        " ".join(sent) + "\n"
                        if isinstance(sent, collections_abc.Iterable)
                        else str(sent) + "\n"
                        for sent in data
                    ])
                log("Result saved as plain text '{}'".format(path))

    return all_results, result_data
def run_on_dataset(tf_manager: TensorFlowManager,
                   runners: List[BaseRunner],
                   dataset: Dataset,
                   postprocess: Postprocess,
                   batching_scheme: BatchingScheme,
                   write_out: bool = False,
                   log_progress: int = 0) -> Tuple[
                       List[ExecutionResult], Dict[str, List[Any]]]:
    """Apply the model on a dataset and optionally write outputs to files.

    This function processes the dataset in batches and optionally prints out
    the execution progress.

    Args:
        tf_manager: TensorFlow manager with initialized sessions.
        runners: Runners to execute; each one contributes the series named
            by its ``output_series`` attribute to the result dictionary.
        dataset: The dataset on which the model will be executed.
        postprocess: Dataset-level postprocessors given as an iterable of
            ``(series_name, postprocessor)`` pairs, or ``None``. Each
            postprocessor is called as ``postprocessor(dataset, result_data)``
            and its result is stored under ``series_name``.
        batching_scheme: Scheme used for batching, passed to
            ``dataset.batches``.
        write_out: Flag whether the outputs should be written out using the
            ``(path, writer)`` pairs stored in ``dataset.outputs``.
        log_progress: If positive, the number of processed examples is
            logged whenever more than this many seconds, as measured by
            ``time.process_time``, have passed since the last progress
            message. Zero disables the progress logging.

    Returns:
        Tuple of the per-runner execution results (the per-batch results
        combined with ``reduce_execution_results``) and a dictionary mapping
        series names to the (optionally postprocessed) outputs.
    """
    # If the dataset contains the target series, compute also losses.
    contains_targets = all(dataset.has_series(runner.decoder_data_id)
                           for runner in runners
                           if runner.decoder_data_id is not None)

    last_log_time = time.process_time()
    batch_results = [[] for _ in runners]  # type: List[List[ExecutionResult]]

    # Starting from an empty set keeps this working for an empty runner list
    # and for feedables stored in any iterable (e.g. a frozenset), where the
    # unbound ``set.union(*...)`` call raises a TypeError.
    feedables = set().union(*[runner.feedables for runner in runners])

    processed_examples = 0
    for batch in dataset.batches(batching_scheme):
        if 0 < log_progress < time.process_time() - last_log_time:
            log("Processed {} examples.".format(processed_examples))
            last_log_time = time.process_time()

        execution_results = tf_manager.execute(
            batch, feedables, runners, compute_losses=contains_targets)

        processed_examples += len(batch)

        # Store the result of each runner in that runner's own list.
        for script_list, ex_result in zip(batch_results, execution_results):
            script_list.append(ex_result)

    # Transpose runner interim results.
    all_results = [reduce_execution_results(res) for res in batch_results]

    # Convert execution results to dictionary.
    result_data = {runner.output_series: result.outputs
                   for runner, result in zip(runners, all_results)}

    # Run dataset-level postprocessing.
    if postprocess is not None:
        for series_name, postprocessor in postprocess:
            postprocessed = postprocessor(dataset, result_data)
            # Results without a length (e.g. generators) are materialized
            # so that the length check below can be applied to them.
            if not hasattr(postprocessed, "__len__"):
                postprocessed = list(postprocessed)

            result_data[series_name] = postprocessed

    # Check output series lengths.
    for series_id, data in result_data.items():
        if len(data) != len(dataset):
            warn("Output '{}' for dataset '{}' has length {}, but "
                 "len(dataset) == {}".format(series_id, dataset.name,
                                             len(data), len(dataset)))

    if write_out and dataset.outputs is not None:
        for series_id, data in result_data.items():
            if series_id in dataset.outputs:
                path, writer = dataset.outputs[series_id]
                writer(path, data)
            else:
                log("There is no file for output series '{}' in dataset: '{}'"
                    .format(series_id, dataset.name), color="red")
    elif write_out:
        log("Dataset does not have any outputs, nothing to write out.",
            color="red")

    return all_results, result_data