Example #1
import warnings

import torch
from omegaconf import OmegaConf

# build_datamodule is assumed to be an MMF helper available in the same module
# (mmf.utils.build); the remaining imports are done lazily inside the function.


def build_dataset(
    dataset_key: str, config=None, dataset_type="train"
) -> torch.utils.data.Dataset:
    """Builder function for creating a dataset. If dataset_key is passed
    the dataset is created from default config of the dataset and thus is
    disable config even if it is passed. Otherwise, we use MultiDatasetLoader to
    build and return an instance of dataset based on the config

    Args:
        dataset_key (str): Key of dataset to build.
        config (DictConfig, optional): Configuration that will be used to create
            the dataset. If not passed, dataset's default config will be used.
            Defaults to None.
        dataset_type (str, optional): Type of the dataset to build, train|val|test.
            Defaults to "train".

    Returns:
        (torch.utils.data.Dataset): A dataset instance deriving from torch.utils.data.Dataset
    """
    from mmf.datasets.base_dataset_builder import BaseDatasetBuilder
    from mmf.utils.configuration import load_yaml_with_defaults

    datamodule_instance = build_datamodule(dataset_key)
    # If config is not provided, we take it from default one
    if not config:
        config_path = datamodule_instance.config_path()
        if config_path is None:
            # If config path wasn't defined, send an empty config path
            # but don't force dataset to define a config
            warnings.warn(
                f"Config path not defined for {dataset_key}, "
                + "continuing with empty config"
            )
            config = OmegaConf.create()
        else:
            config = load_yaml_with_defaults(config_path)
            config = OmegaConf.select(config, f"dataset_config.{dataset_key}")
            if config is None:
                config = OmegaConf.create()
            OmegaConf.set_struct(config, True)
    elif dataset_key in config:
        # Handle Global config
        config = config[dataset_key]

    # Two-phase setup: build_dataset downloads/prepares the data if needed,
    # and load_dataset constructs the dataset object for the requested split.
    datamodule_instance.build_dataset(config)
    dataset = datamodule_instance.load_dataset(config, dataset_type)
    if hasattr(datamodule_instance, "update_registry_for_model"):
        datamodule_instance.update_registry_for_model(config)

    return dataset
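
A minimal usage sketch for this version, assuming MMF is installed and its registry and configuration have been initialized; the "vqa2" key and the option name in the explicit config are illustrative, not taken from the function above.

train_dataset = build_dataset("vqa2")                     # default config, train split
val_dataset = build_dataset("vqa2", dataset_type="val")   # default config, val split

# A global config containing an entry for the dataset key is unwrapped by the
# `elif dataset_key in config` branch before the dataset is built.
global_config = OmegaConf.create({"vqa2": {"use_images": False}})  # illustrative option
dataset = build_dataset("vqa2", config=global_config)
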
Example #2
import warnings

from omegaconf import OmegaConf

from mmf.common import typings as mmf_typings  # type aliases assumed by the annotations below
from mmf.common.registry import registry


def build_dataset(dataset_key: str,
                  config=None,
                  dataset_type="train") -> mmf_typings.DatasetType:
    """Builder function for creating a dataset. If dataset_key is passed
    the dataset is created from default config of the dataset and thus is
    disable config even if it is passed. Otherwise, we use MultiDatasetLoader to
    build and return an instance of dataset based on the config

    Args:
        dataset_key (str): Key of dataset to build.
        config (DictConfig, optional): Configuration that will be used to create
            the dataset. If not passed, dataset's default config will be used.
            Defaults to None.
        dataset_type (str, optional): Type of the dataset to build, train|val|test.
            Defaults to "train".

    Returns:
        (DatasetType): A dataset instance of type BaseDataset
    """
    from mmf.utils.configuration import load_yaml_with_defaults

    dataset_builder = registry.get_builder_class(dataset_key)
    assert dataset_builder, (f"Key {dataset_key} doesn't have a registered " +
                             "dataset builder")

    # If config is not provided, we take it from default one
    if not config:
        config_path = dataset_builder.config_path()
        if config_path is None:
            # If config path wasn't defined, send an empty config path
            # but don't force dataset to define a config
            warnings.warn(f"Config path not defined for {dataset_key}, " +
                          "continuing with empty config")
            config = OmegaConf.create()
        else:
            config = load_yaml_with_defaults(config_path)
            config = OmegaConf.select(config, f"dataset_config.{dataset_key}")
            if config is None:
                config = OmegaConf.create()
            OmegaConf.set_struct(config, True)

    # Instantiate the registered builder, prepare the data for the requested
    # split, then construct and return the dataset object.
    builder_instance: mmf_typings.DatasetBuilderType = dataset_builder()
    builder_instance.build_dataset(config, dataset_type)
    dataset = builder_instance.load_dataset(config, dataset_type)
    if hasattr(builder_instance, "update_registry_for_model"):
        builder_instance.update_registry_for_model(config)

    return dataset
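
registry.get_builder_class(dataset_key) only finds builders that were registered beforehand. A minimal registration sketch, assuming MMF's builder decorator API; the "my_dataset" key, the class name, and the config path are hypothetical.

from mmf.common.registry import registry
from mmf.datasets.base_dataset_builder import BaseDatasetBuilder


@registry.register_builder("my_dataset")  # hypothetical key
class MyDatasetBuilder(BaseDatasetBuilder):
    def __init__(self):
        super().__init__("my_dataset")

    @classmethod
    def config_path(cls):
        # Returning None here would trigger the "empty config" warning path above.
        return "configs/datasets/my_dataset/defaults.yaml"  # hypothetical path

    def build(self, config, dataset_type):
        # Download or prepare the raw data for the given split here.
        pass

    def load(self, config, dataset_type, *args, **kwargs):
        # Construct and return a BaseDataset for the requested split.
        ...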