Example #1
def run_on_dataset(tf_manager: TensorFlowManager,
                   runners: List[BaseRunner],
                   dataset: Dataset,
                   postprocess: Callable,
                   write_out: bool=False,
                   batch_size: Optional[int]=None) \
                                                -> Tuple[List[ExecutionResult],
                                                         Dict[str, List[Any]]]:
    """Apply the model on a dataset and optionally write outputs to files.

    Args:
        tf_manager: TensorFlow manager with initialized sessions.
        runners: A list of runners to execute on the dataset.
        dataset: The dataset on which the model will be executed.
        postprocess: A function applied to the raw outputs as dataset-level
            postprocessing, or None to keep the raw outputs.
        write_out: Flag whether the outputs should be written to the files
            defined in the dataset object.
        batch_size: Size of the minibatch, or None to process the whole
            dataset at once.

    Returns:
        A tuple of the list of execution results and a dictionary mapping
        each output series to its resulting sentences or numpy arrays.

    """

    contains_targets = all(dataset.has_series(runner.output_series)
                           for runner in runners)

    all_results = tf_manager.execute(dataset, runners,
                                     train=contains_targets,
                                     batch_size=batch_size)

    result_data_raw = {runner.output_series: result.outputs
                       for runner, result in zip(runners, all_results)}

    if postprocess is not None:
        result_data = postprocess(dataset, result_data_raw)
    else:
        result_data = result_data_raw

    if write_out:
        for series_id, data in result_data.items():
            if series_id in dataset.series_outputs:
                path = dataset.series_outputs[series_id]
                if isinstance(data, np.ndarray):
                    np.save(path, data)
                    log('Result saved as numpy array to "{}"'.format(path))
                else:
                    with open(path, 'w') as f_out:
                        f_out.writelines(
                            [" ".join(sent) + "\n" for sent in data])
                    log("Result saved as plain text \"{}\"".format(path))
            else:
                log("There is no output file for dataset: {}"
                    .format(dataset.name), color='red')

    return all_results, result_data
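
A minimal usage sketch of the function above, assuming tf_manager, runners and dataset have already been built from a Neural Monkey configuration (the variable names and the batch size are illustrative only):

# Run the model on the dataset and write outputs to the paths
# configured in dataset.series_outputs (write_out=True).
execution_results, output_data = run_on_dataset(
    tf_manager, runners, dataset,
    postprocess=None,   # keep the raw runner outputs
    write_out=True,
    batch_size=32)

# output_data maps each runner's output series to its decoded data.
decoded = output_data[runners[0].output_series]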
Example #2
def check_dataset_and_coders(dataset: Dataset,
                             runners: Iterable[BaseRunner]) -> None:
    # pylint: disable=protected-access

    data_list = []
    for runner in runners:
        for c in runner.all_coders:
            if hasattr(c, "data_id"):
                data_list.append((c.data_id, c))
            elif hasattr(c, "data_ids"):
                data_list.extend([(d, c) for d in c.data_ids])
            else:
                log("Coder: {} does not have a data attribute.".format(c))

    debug("Found series: {}".format(str(data_list)), "checking")
    missing = []

    for (serie, coder) in data_list:
        if not dataset.has_series(serie):
            log("Dataset {} does not have the series {}".format(
                dataset.name, serie))
            missing.append((coder, serie))

    if missing:
        formatted = [
            "{} ({}, {}.{})".format(serie, cod.name, cod.__class__.__module__,
                                    cod.__class__.__name__)
            for cod, serie in missing
        ]

        raise CheckingException("Dataset '{}' is missing series: {}".format(
            dataset.name, ", ".join(formatted)))
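
A hedged sketch of how this check might be invoked before decoding; the surrounding names are illustrative, and the only assumption is that CheckingException and log are imported as in the example above:

# Fail early if any coder expects a data series the dataset does not provide.
try:
    check_dataset_and_coders(dataset, runners)
except CheckingException as exc:
    log(str(exc), color="red")
    raise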
Example #3
def _print_examples(dataset: Dataset,
                    outputs: Dict[str, List[Any]],
                    num_examples: int = 15) -> None:
    """Print examples of the model output."""
    log_print(colored("Examples:", attrs=['bold']))

    # for further indexing we need to make sure, all relevant
    # dataset series are lists
    target_series = {
        series_id: list(dataset.get_series(series_id))
        for series_id in outputs.keys() if dataset.has_series(series_id)
    }
    source_series = {
        series_id: list(dataset.get_series(series_id))
        for series_id in dataset.series_ids if series_id not in outputs
    }

    for i in range(min(len(dataset), num_examples)):
        log_print(
            colored("  [{}]".format(i + 1), color='magenta', attrs=['bold']))

        def print_line(prefix, color, content):
            colored_prefix = colored(prefix, color=color)
            formatted = _data_item_to_str(content)
            log_print("  {}: {}".format(colored_prefix, formatted))

        for series_id, data in sorted(source_series.items(),
                                      key=lambda x: x[0]):
            print_line(series_id, 'yellow', data[i])

        for series_id, data in sorted(outputs.items(), key=lambda x: x[0]):
            model_output = data[i]
            print_line(series_id, 'magenta', model_output)

            if series_id in target_series:
                desired_output = target_series[series_id][i]
                print_line(series_id + " (ref)", "red", desired_output)
        log_print("")
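
A short usage sketch, assuming result_data is the output dictionary returned by run_on_dataset in Example #1 (illustrative only):

# Print the first five examples, showing source series, model outputs and
# reference outputs (when the dataset contains the target series).
_print_examples(dataset, result_data, num_examples=5)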
Example #4
def run_on_dataset(
    tf_manager: TensorFlowManager,
    runners: List[BaseRunner],
    dataset: Dataset,
    postprocess: Postprocess,
    write_out: bool = False,
    batch_size: Optional[int] = None,
    log_progress: int = 0
) -> Tuple[List[ExecutionResult], Dict[str, List[Any]]]:
    """Apply the model on a dataset and optionally write outputs to files.

    Args:
        tf_manager: TensorFlow manager with initialized sessions.
        runners: A list of runners to execute on the dataset.
        dataset: The dataset on which the model will be executed.
        postprocess: Dataset-level postprocessors, i.e. pairs of a series
            name and a function applied to the output data, or None.
        write_out: Flag whether the outputs should be written to the files
            defined in the dataset object.
        batch_size: Size of the minibatch.
        log_progress: Log progress every X seconds.

    Returns:
        A tuple of the list of execution results and a dictionary mapping
        each output series to its resulting sentences or numpy arrays.

    """
    contains_targets = all(
        dataset.has_series(runner.decoder_data_id) for runner in runners
        if runner.decoder_data_id is not None)

    all_results = tf_manager.execute(dataset,
                                     runners,
                                     compute_losses=contains_targets,
                                     batch_size=batch_size,
                                     log_progress=log_progress)

    result_data = {
        runner.output_series: result.outputs
        for runner, result in zip(runners, all_results)
    }

    if postprocess is not None:
        for series_name, postprocessor in postprocess:
            postprocessed = postprocessor(dataset, result_data)
            if not hasattr(postprocessed, "__len__"):
                postprocessed = list(postprocessed)

            result_data[series_name] = postprocessed

    # check output series lengths
    for series_id, data in result_data.items():
        if len(data) != len(dataset):
            warn("Output '{}' for dataset '{}' has length {}, but "
                 "len(dataset) == {}".format(series_id, dataset.name,
                                             len(data), len(dataset)))

    def _check_savable_dict(data):
        """Check if the data is of savable type."""
        if not (data and data[0]):
            return False

        supported_type = Union[List[Dict[str, np.ndarray]],
                               List[List[Dict[str, np.ndarray]]]]

        try:
            check_type("data", data, supported_type, None)
        except TypeError:
            return False
        return True

    if write_out:
        for series_id, data in result_data.items():
            if series_id in dataset.series_outputs:
                path = dataset.series_outputs[series_id]
                if isinstance(data, np.ndarray):
                    np.save(path, data)
                    log("Result saved as numpy array to '{}'".format(path))
                elif _check_savable_dict(data):
                    unbatched = dict(
                        zip(data[0], zip(*[d.values() for d in data])))

                    np.savez(path, **unbatched)
                    log("Result saved as numpy data to '{}.npz'".format(path))
                else:
                    with open(path, "w", encoding="utf-8") as f_out:
                        f_out.writelines([
                            " ".join(sent) + "\n"
                            if isinstance(sent, collections.abc.Iterable)
                            else str(sent) + "\n"
                            for sent in data
                        ])
                    log("Result saved as plain text '{}'".format(path))
            else:
                log("There is no output file for dataset: {}".format(
                    dataset.name),
                    color="red")

    return all_results, result_data
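
The np.savez branch above transposes a list of per-example dictionaries into a single dictionary of stacked values. A small worked example of that transposition, using made-up data and a hypothetical output path:

import numpy as np

data = [{"att": np.zeros(3), "logits": np.ones(2)},
        {"att": np.zeros(3), "logits": np.ones(2)}]

# dict(zip(data[0], zip(*[d.values() for d in data]))) pairs each tensor
# name with a tuple holding that tensor from every example:
# {"att": (array, array), "logits": (array, array)}
unbatched = dict(zip(data[0], zip(*[d.values() for d in data])))

np.savez("outputs.npz", **unbatched)  # hypothetical path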
Example #5
def run_on_dataset(tf_manager: TensorFlowManager,
                   runners: List[BaseRunner],
                   dataset: Dataset,
                   postprocess: Postprocess,
                   batching_scheme: BatchingScheme,
                   write_out: bool = False,
                   log_progress: int = 0) -> Tuple[
                       List[ExecutionResult], Dict[str, List[Any]]]:
    """Apply the model on a dataset and optionally write outputs to files.

    This function processes the dataset in batches and optionally prints out
    the execution progress.

    Args:
        tf_manager: TensorFlow manager with initialized sessions.
        runners: A list of runners to execute on the dataset.
        dataset: The dataset on which the model will be executed.
        postprocess: Dataset-level postprocessors, i.e. pairs of a series
            name and a function applied to the output data, or None.
        batching_scheme: Scheme used for batching.
        write_out: Flag whether the outputs should be written to the files
            defined in the dataset object.
        log_progress: Log progress every X seconds.

    Returns:
        A tuple of the list of execution results and a dictionary mapping
        each output series to its resulting sentences or numpy arrays.

    """
    # If the dataset contains the target series, compute also losses.
    contains_targets = all(dataset.has_series(runner.decoder_data_id)
                           for runner in runners
                           if runner.decoder_data_id is not None)

    last_log_time = time.process_time()
    batch_results = [[] for _ in runners]  # type: List[List[ExecutionResult]]

    feedables = set.union(*[runner.feedables for runner in runners])

    processed_examples = 0
    for batch in dataset.batches(batching_scheme):
        if 0 < log_progress < time.process_time() - last_log_time:
            log("Processed {} examples.".format(processed_examples))
            last_log_time = time.process_time()

        execution_results = tf_manager.execute(
            batch, feedables, runners, compute_losses=contains_targets)
        processed_examples += len(batch)

        for script_list, ex_result in zip(batch_results, execution_results):
            script_list.append(ex_result)

    # Transpose runner interim results.
    all_results = [reduce_execution_results(res) for res in batch_results]

    # Convert execution results to dictionary.
    result_data = {runner.output_series: result.outputs
                   for runner, result in zip(runners, all_results)}

    # Run dataset-level postprocessing.
    if postprocess is not None:
        for series_name, postprocessor in postprocess:
            postprocessed = postprocessor(dataset, result_data)
            if not hasattr(postprocessed, "__len__"):
                postprocessed = list(postprocessed)

            result_data[series_name] = postprocessed

    # Check output series lengths.
    for series_id, data in result_data.items():
        if len(data) != len(dataset):
            warn("Output '{}' for dataset '{}' has length {}, but "
                 "len(dataset) == {}".format(series_id, dataset.name,
                                             len(data), len(dataset)))

    if write_out and dataset.outputs is not None:
        for series_id, data in result_data.items():
            if series_id in dataset.outputs:
                path, writer = dataset.outputs[series_id]
                writer(path, data)
            else:
                log("There is no file for output series '{}' in dataset: '{}'"
                    .format(series_id, dataset.name), color="red")
    elif write_out:
        log("Dataset does not have any outputs, nothing to write out.",
            color="red")

    return all_results, result_data
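
A minimal sketch of calling this variant, assuming a Neural Monkey BatchingScheme; the constructor argument shown is illustrative and may differ between versions:

scheme = BatchingScheme(batch_size=32)

execution_results, output_data = run_on_dataset(
    tf_manager, runners, dataset,
    postprocess=None,
    batching_scheme=scheme,
    write_out=True,
    log_progress=20)  # log progress roughly every 20 seconds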