예제 #1
0
def _preprocessed_datasets(
        dataset: Dataset,
        series_config: SeriesConfig) -> None:
    """Attach dataset-level preprocessed series to the dataset.

    Every key of ``series_config`` that matches ``PREPROCESSED_SERIES``
    defines one new series. The series name is extracted from the key and
    the value under that key is a callable which receives the dataset.

    Arguments:
        dataset: The dataset to extend. For a ``LazyDataset`` the callable
            is only registered; for any other ``Dataset`` it is called
            right away and its output is stored as a list.
        series_config: A dictionary containing the dataset keyword argument
            specs.

    Raises:
        ValueError: If the lazy dataset already has a preprocessed series
            registered under the same name.
    """
    matching_keys = list(
        filter(PREPROCESSED_SERIES.match, series_config.keys()))

    for config_key in matching_keys:
        series_name = get_first_match(PREPROCESSED_SERIES, config_key)
        preprocess_fn = cast(DatasetPreprocess, series_config[config_key])

        if isinstance(dataset, LazyDataset):
            if series_name in dataset.preprocess_series:
                raise ValueError(
                    "Series already exists: {}".format(series_name))

            # A dataset-level preprocessor has no single source series, so
            # the source-ID slot of the tuple is left as `None`.
            dataset.preprocess_series[series_name] = (None, preprocess_fn)

            # The lazy dataset hands empty series to its parent class during
            # initialization; this series is added after initialization, so
            # the same has to be done for it here.
            dataset.add_lazy_series(series_name)
            continue

        if isinstance(dataset, Dataset):
            # Non-lazy dataset: materialize the whole series right away.
            dataset.add_series(series_name, list(preprocess_fn(dataset)))
예제 #2
0
def load_dataset_from_files(name: str,
                            lazy: bool = False,
                            preprocessors: List[Tuple[str, str,
                                                      Callable]] = None,
                            **kwargs) -> "Dataset":
    """Load a dataset from the files specified by the provided arguments.

    Deprecated: a warning is emitted on every call and the work is
    delegated to ``load``.

    Paths to the data are provided in the form of keyword arguments.

    Arguments:
        name: The name of the dataset; handed over to ``load`` unchanged.
        lazy: Boolean flag specifying whether to use lazy loading (useful
            for large files). When set, a buffer size of 5000 is passed to
            ``load``; otherwise the buffer size is None. Defaults to False.
        preprocessors: Optional list of series-level preprocessors given as
            ``(source, target, function)`` triples. Each triple adds a
            series named ``target`` whose data source is the pair
            ``(function, source)``.
        kwargs: Dataset keyword argument specs. These parameters should
            begin with the 's_' prefix and may end with the '_out' suffix.
            For example, a data series 'source' which specifies the source
            sentences should be initialized with the 's_source' parameter,
            which specifies the path and optionally the reader of the
            source file. If runners generate data of the 'target' series,
            the output file should be initialized with the 's_target_out'
            parameter. Series identifiers should not contain underscores.
            Dataset-level preprocessors are defined with the 'pre_' prefix
            followed by a new series name; their value is expected to be a
            callable taking the dataset and returning a new series.

    Returns:
        The newly created dataset.

    Raises:
        ValueError: If no input series were found among ``kwargs``.
    """
    warn("Use of deprecated function. Consider using dataset.load instead.")
    check_argument_types()

    input_specs = _get_series_paths_and_readers(kwargs)
    outputs = _get_series_outputs(kwargs)

    if not input_specs:
        raise ValueError("No input files were provided.")

    # Two parallel lists: series names and their data sources.
    series = list(input_specs.keys())
    data = list(input_specs.values())

    # Series-level preprocessors
    for source_id, target_id, function in preprocessors or []:
        series.append(target_id)
        data.append((function, source_id))

    # Dataset-level preprocessors
    dataset_level_keys = list(filter(PREPROCESSED_SERIES.match, kwargs))
    for key in dataset_level_keys:
        series.append(get_first_match(PREPROCESSED_SERIES, key))
        data.append(cast(DatasetPreprocess, kwargs[key]))

    buffer_size = 5000 if lazy else None
    # NOTE(review): the meaning of the trailing positional `False` is defined
    # by `load`, which is not visible here.
    return load(name, series, data, outputs, buffer_size, False)
예제 #3
0
def _get_series_paths_and_readers(
        series_config: SeriesConfig) -> Dict[str, Tuple[List[str], Reader]]:
    """Collect input file paths and readers from the dataset kwargs.

    The input of a series named 'xxx' is given by the parameter 's_xxx'.
    Its value is either a path, a list of paths, or a tuple whose first
    member is a path or a list of paths and whose second member is a reader
    function. When no reader is given, ``UtfPlainTextReader`` is used.

    The paths may contain wildcards. Each pattern is expanded with
    :py:func:`glob.glob` and its matches are sorted; matches of consecutive
    patterns are concatenated in the order of the patterns.

    Arguments:
        series_config: A dictionary containing the dataset keyword argument
            specs.

    Returns:
        A dictionary which maps series names to pairs of the list of input
        file paths and the reader.

    Raises:
        FileNotFoundError: If any pattern does not match a single file.
    """
    # Resolve all series names up front, before touching the file system.
    named_keys = [(get_first_match(SERIES_SOURCE, key), key)
                  for key in series_config.keys()
                  if SERIES_SOURCE.match(key)]

    sources = {}
    for name, key in named_keys:
        spec = cast(ReaderDef, series_config[key])

        patterns, reader = spec, UtfPlainTextReader
        if isinstance(spec, tuple):
            patterns, reader = spec  # type: ignore

        # A bare string is shorthand for a one-element list of patterns.
        if isinstance(patterns, str):
            patterns = [patterns]

        paths = []
        for pattern in patterns:
            matches = glob.glob(pattern)
            matches.sort()
            if len(matches) == 0:
                raise FileNotFoundError(
                    "Pattern did not match any files. Series: {}, Pattern: {}"
                    .format(name, pattern))
            paths += matches

        debug("Series '{}' has the following files: {}".format(name, paths))
        sources[name] = (paths, reader)

    return sources
예제 #4
0
def _preprocessed_datasets(dataset: Dataset,
                           series_config: SeriesConfig) -> None:
    """Apply dataset-level preprocessing.

    Every key of ``series_config`` that matches ``PREPROCESSED_SERIES``
    defines one new series. The series name is extracted from the key and
    the value under that key is a callable which receives the dataset.

    Arguments:
        dataset: The dataset to extend. For a ``LazyDataset`` the callable
            is only registered in ``dataset.preprocess_series``; for any
            other ``Dataset`` it is called right away and its output is
            stored as a list via ``dataset.add_series``.
        series_config: A dictionary containing the dataset keyword argument
            specs.
    """
    keys = [
        key for key in series_config.keys() if PREPROCESSED_SERIES.match(key)
    ]

    for key in keys:
        name = get_first_match(PREPROCESSED_SERIES, key)
        preprocessor = cast(DatasetPreprocess, series_config[key])

        # The more specific type has to be tested first. The parameter is
        # annotated as `Dataset`, so a `LazyDataset` is presumably a subclass
        # of it; testing `Dataset` first would then always win and lazy
        # datasets would be preprocessed eagerly instead of being deferred.
        if isinstance(dataset, LazyDataset):
            # There is no single source series ID for a dataset-level
            # preprocessor, hence the `None` in the tuple.
            dataset.preprocess_series[name] = (None, preprocessor)
        elif isinstance(dataset, Dataset):
            new_series = list(preprocessor(dataset))
            dataset.add_series(name, new_series)
예제 #5
0
def _get_series_paths_and_readers(
        series_config: SeriesConfig) -> Dict[str, Tuple[List[str], Reader]]:
    """Collect input file paths and readers from the dataset kwargs.

    The input of a series named 'xxx' is given by the parameter 's_xxx'.
    Its value is either a path, a list of paths, or a tuple whose first
    member is a path or a list of paths and whose second member is a reader
    function. When no reader is given, ``UtfPlainTextReader`` is used.

    The path patterns are turned into a list of files by
    ``_expand_patterns_flat``.

    Arguments:
        series_config: A dictionary containing the dataset keyword argument
            specs.

    Returns:
        A dictionary which maps series names to pairs of the list of input
        file paths and the reader.
    """
    # Resolve all series names up front, before any pattern is expanded.
    named_keys = [(get_first_match(SERIES_SOURCE, key), key)
                  for key in series_config.keys()
                  if SERIES_SOURCE.match(key)]

    sources = {}
    for name, key in named_keys:
        spec = cast(ReaderDef, series_config[key])

        patterns, reader = spec, UtfPlainTextReader
        if isinstance(spec, tuple):
            patterns, reader = spec  # type: ignore

        # A bare string is shorthand for a one-element list of patterns.
        if isinstance(patterns, str):
            patterns = [patterns]

        paths = _expand_patterns_flat(patterns)
        debug("Series '{}' has the following files: {}".format(name, paths))

        sources[name] = (paths, reader)

    return sources