Пример #1
0
    def __init__(self, inputs: List[str], ioctx: IOContext = None):
        """Initialize a JsonReader.

        Args:
            inputs (str|list): Either a glob expression for local files,
                e.g., "/tmp/**/*.json", or a list/tuple of single file paths
                or URIs, e.g.,
                ["s3://bucket/file.json", "s3://bucket/file2.json"].
                A string naming an existing directory is turned into the
                glob pattern "<dir>/*.json".
            ioctx (IOContext): Current IO context object. A new IOContext
                is created when none is given.

        Raises:
            ValueError: If `inputs` is a string carrying a non-local URI
                scheme (globbing is only done on the local filesystem), if
                `inputs` is neither a str nor a list/tuple, or if no input
                files were found.
        """

        self.ioctx = ioctx or IOContext()
        if isinstance(inputs, str):
            # The scheme must be inspected on the raw string: abspath()
            # prefixes the working directory, after which a URI such as
            # "s3://bucket/*.json" no longer parses as one and this check
            # could never fire.
            if urlparse(inputs).scheme not in [""] + WINDOWS_DRIVES:
                raise ValueError(
                    "Don't know how to glob over `{}`, ".format(inputs) +
                    "please specify a list of files to read instead.")
            inputs = os.path.abspath(os.path.expanduser(inputs))
            if os.path.isdir(inputs):
                inputs = os.path.join(inputs, "*.json")
                logger.warning(
                    "Treating input directory as glob pattern: {}".format(
                        inputs))
            self.files = glob.glob(inputs)
        elif isinstance(inputs, (list, tuple)):
            # Explicit file lists are taken as-is (no globbing); copy so the
            # reader owns its own list.
            self.files = list(inputs)
        else:
            raise ValueError(
                "type of inputs must be list or str, not {}".format(
                    type(inputs).__name__))
        if self.files:
            logger.info("Found {} input files.".format(len(self.files)))
        else:
            raise ValueError("No files found matching {}".format(inputs))
        # No file is open yet.
        self.cur_file = None
Пример #2
0
    def __init__(self,
                 path,
                 ioctx=None,
                 max_file_size=64 * 1024 * 1024,
                 compress_columns=frozenset(["obs", "new_obs"])):
        """Initialize a JsonWriter.

        Args:
            path (str): a path/URI of the output directory to save files in.
                Local directories are created if they do not exist yet.
            ioctx (IOContext): current IO context object. A new IOContext
                is created when none is given.
            max_file_size (int): max size of single files before rolling over.
            compress_columns (list): list of sample batch columns to compress.

        Raises:
            AssertionError: If `path` is local and does not exist after the
                attempt to create it.
        """

        self.path = path
        self.ioctx = ioctx or IOContext()
        self.max_file_size = max_file_size
        self.compress_columns = compress_columns
        # urlparse() reports the drive letter of a Windows path such as
        # "C:/out" as a one-character scheme. Treat only longer schemes as
        # URIs so such paths are handled as local directories.
        if len(urlparse(path).scheme) > 1:
            self.path_is_uri = True
        else:
            # Try to create local dirs if they don't exist
            try:
                os.makedirs(path)
            except OSError:
                pass  # already exists
            assert os.path.exists(path), "Failed to create {}".format(path)
            self.path_is_uri = False
        # Rolling-output bookkeeping: nothing has been written yet.
        self.file_index = 0
        self.bytes_written = 0
        self.cur_file = None
Пример #3
0
    def __init__(self,
                 path: str,
                 ioctx: IOContext = None,
                 max_file_size: int = 64 * 1024 * 1024,
                 compress_columns: List[str] = frozenset(["obs", "new_obs"])):
        """Sets up a JsonWriter that saves its files under `path`.

        Args:
            path: Path or URI of the output directory. A local path is
                user-expanded, made absolute, and created if missing.
            ioctx: Current IO context object; a new IOContext is created
                when none is given.
            max_file_size: Max size of a single file before rolling over.
            compress_columns: Sample batch columns to compress.
        """
        self.ioctx = ioctx or IOContext()
        self.max_file_size = max_file_size
        self.compress_columns = compress_columns

        # An empty scheme or one listed in WINDOWS_DRIVES means the target
        # is on the local filesystem; anything else is treated as a URI.
        scheme = urlparse(path).scheme
        is_local = scheme in [""] + WINDOWS_DRIVES
        if is_local:
            path = os.path.abspath(os.path.expanduser(path))
            try:
                os.makedirs(path)
            except OSError:
                # Typically the directory is already there; the assertion
                # below decides whether the path is actually usable.
                pass
            assert os.path.exists(path), "Failed to create {}".format(path)
        self.path_is_uri = not is_local

        self.path = path
        # Rolling-output bookkeeping: nothing has been written yet.
        self.cur_file = None
        self.bytes_written = 0
        self.file_index = 0
Пример #4
0
    def __init__(self,
                 ioctx: IOContext = None,
                 compress_columns: List[str] = frozenset(["obs", "new_obs"])):
        """Initializes a DatasetWriter instance.

        Examples:
        config = {
            "output": "dataset",
            "output_config": {
                "format": "json",
                "path": "/tmp/test_samples/",
                "max_num_samples_per_file": 100000,
            }
        }

        Args:
            ioctx: current IO context object. A new IOContext is created
                when none is given; its `output_config` must still provide
                the "format" and "path" keys.
            compress_columns: list of sample batch columns to compress.

        Raises:
            AssertionError: If "format" or "path" is missing from the IO
                context's `output_config`.
        """
        self.ioctx = ioctx or IOContext()

        # Read the config from self.ioctx rather than the raw argument,
        # which is None when the caller relies on the default.
        output_config: Dict = self.ioctx.output_config
        assert "format" in output_config, (
            "output_config.format must be specified when using Dataset "
            "output.")
        assert "path" in output_config, (
            "output_config.path must be specified when using Dataset output.")

        self.format = output_config["format"]
        self.path = os.path.abspath(os.path.expanduser(output_config["path"]))
        # Fall back to 100000 samples per file when not configured.
        self.max_num_samples_per_file = output_config.get(
            "max_num_samples_per_file", 100000)
        self.compress_columns = compress_columns

        # Starts out empty.
        self.samples = []
Пример #5
0
    def __init__(self,
                 inputs: Union[str, List[str]],
                 ioctx: Optional[IOContext] = None):
        """Initializes a JsonReader instance.

        Args:
            inputs: Either a glob expression for local files, e.g.
                `/tmp/**/*.json`, or a list/tuple of single file paths or
                URIs, e.g.,
                ["s3://bucket/file.json", "s3://bucket/file2.json"].
                A string naming an existing directory is expanded to the
                glob patterns `<dir>/*.json` and `<dir>/*.zip`.
            ioctx: Current IO context object or None, in which case a new
                IOContext is created.

        Raises:
            ValueError: If `inputs` is a string carrying a non-local URI
                scheme (globbing is only done on the local filesystem), if
                `inputs` is not a str, list, or tuple, or if no input files
                were found.
        """
        logger.info("You are using JSONReader. It is recommended to use " +
                    "DatasetReader instead for better sharding support.")

        self.ioctx = ioctx or IOContext()
        # Policies are only available when the IO context has a worker.
        self.default_policy = self.policy_map = None
        if self.ioctx.worker is not None:
            self.policy_map = self.ioctx.worker.policy_map
            self.default_policy = self.policy_map.get(DEFAULT_POLICY_ID)

        if isinstance(inputs, str):
            # The scheme must be inspected on the raw string: abspath()
            # prefixes the working directory, after which a URI such as
            # "s3://bucket/*.json" no longer parses as one and this check
            # could never fire.
            if urlparse(inputs).scheme not in [""] + WINDOWS_DRIVES:
                raise ValueError(
                    "Don't know how to glob over `{}`, ".format(inputs) +
                    "please specify a list of files to read instead.")

            inputs = os.path.abspath(os.path.expanduser(inputs))
            if os.path.isdir(inputs):
                inputs = [
                    os.path.join(inputs, "*.json"),
                    os.path.join(inputs, "*.zip")
                ]
                logger.warning(
                    f"Treating input directory as glob patterns: {inputs}")
            else:
                inputs = [inputs]

            self.files = []
            for pattern in inputs:
                self.files.extend(glob.glob(pattern))
        elif isinstance(inputs, (list, tuple)):
            # Explicit file lists are taken as-is (no globbing).
            self.files = list(inputs)
        else:
            raise ValueError(
                "type of inputs must be list, tuple or str, not {}".format(
                    type(inputs).__name__))
        if self.files:
            logger.info("Found {} input files.".format(len(self.files)))
        else:
            raise ValueError("No files found matching {}".format(inputs))
        # No file is open yet.
        self.cur_file = None
Пример #6
0
    def __init__(self, inputs: List[str], ioctx: IOContext = None):
        """Initialize a JsonReader.

        Args:
            inputs (str|list): Either a glob expression for local files,
                e.g., "/tmp/**/*.json", or a list/tuple of single file paths
                or URIs, e.g.,
                ["s3://bucket/file.json", "s3://bucket/file2.json"].
                A string naming an existing directory is expanded to the
                glob patterns "<dir>/*.json" and "<dir>/*.zip".
            ioctx (IOContext): Current IO context object. A new IOContext
                is created when none is given.

        Raises:
            ValueError: If `inputs` is a string carrying a non-local URI
                scheme (globbing is only done on the local filesystem), if
                `inputs` is neither a str nor a list/tuple, or if no input
                files were found.
        """

        self.ioctx = ioctx or IOContext()
        # The default policy is only available when the IO context has a
        # worker.
        self.default_policy = None
        if self.ioctx.worker is not None:
            self.default_policy = \
                self.ioctx.worker.policy_map.get(DEFAULT_POLICY_ID)
        if isinstance(inputs, str):
            # The scheme must be inspected on the raw string: abspath()
            # prefixes the working directory, after which a URI such as
            # "s3://bucket/*.json" no longer parses as one and this check
            # could never fire.
            if urlparse(inputs).scheme not in [""] + WINDOWS_DRIVES:
                raise ValueError(
                    "Don't know how to glob over `{}`, ".format(inputs) +
                    "please specify a list of files to read instead.")

            inputs = os.path.abspath(os.path.expanduser(inputs))
            if os.path.isdir(inputs):
                inputs = [
                    os.path.join(inputs, "*.json"),
                    os.path.join(inputs, "*.zip")
                ]
                logger.warning(
                    f"Treating input directory as glob patterns: {inputs}")
            else:
                inputs = [inputs]

            self.files = []
            for pattern in inputs:
                self.files.extend(glob.glob(pattern))
        elif isinstance(inputs, (list, tuple)):
            # Explicit file lists are taken as-is (no globbing); copy so the
            # reader owns its own list.
            self.files = list(inputs)
        else:
            raise ValueError(
                "type of inputs must be list or str, not {}".format(
                    type(inputs).__name__))
        if self.files:
            logger.info("Found {} input files.".format(len(self.files)))
        else:
            raise ValueError("No files found matching {}".format(inputs))
        # No file is open yet.
        self.cur_file = None
Пример #7
0
    def __init__(self, dist: Dict[JsonReader, float], ioctx: IOContext):
        """Initialize a MixedInput.

        Args:
            dist (dict): dict mapping JSONReader paths or "sampler" to
                probabilities. The probabilities must sum to 1.0 (within
                floating-point tolerance).
            ioctx (IOContext): current IO context object. It supplies the
                default sampler input and is handed to every JsonReader
                created here.

        Raises:
            ValueError: If the probabilities do not sum to 1.0.
        """
        # Function-scope import keeps this block self-contained.
        import math

        # Compare with a tolerance: sums such as 0.1 + 0.2 + 0.7 are not
        # exactly 1.0 in binary floating point.
        if not math.isclose(sum(dist.values()), 1.0):
            raise ValueError("Values must sum to 1.0: {}".format(dist))
        # self.choices[i] is drawn with probability self.p[i].
        self.choices = []
        self.p = []
        for k, v in dist.items():
            if k == "sampler":
                self.choices.append(ioctx.default_sampler_input())
            else:
                # Any other key is used as the JsonReader `inputs` argument.
                self.choices.append(JsonReader(k, ioctx))
            self.p.append(v)
Пример #8
0
    def __init__(self, dist: Dict[JsonReader, float], ioctx: IOContext):
        """Initialize a MixedInput.

        Args:
            dist (dict): dict mapping input sources to probabilities. A key
                may be the string "sampler", a plain function taking the IO
                context, the name of a registered input, or anything else,
                which is passed to JsonReader as its `inputs`. The
                probabilities must sum to 1.0 (within floating-point
                tolerance).
            ioctx (IOContext): current IO context object, handed to every
                input created here.

        Raises:
            ValueError: If the probabilities do not sum to 1.0.
        """
        # Function-scope import keeps this block self-contained.
        import math

        # Compare with a tolerance: sums such as 0.1 + 0.2 + 0.7 are not
        # exactly 1.0 in binary floating point.
        if not math.isclose(sum(dist.values()), 1.0):
            raise ValueError("Values must sum to 1.0: {}".format(dist))
        # self.choices[i] is drawn with probability self.p[i].
        self.choices = []
        self.p = []
        for k, v in dist.items():
            if k == "sampler":
                self.choices.append(ioctx.default_sampler_input())
            elif isinstance(k, FunctionType):
                # User-supplied factory: called with the IO context.
                self.choices.append(k(ioctx))
            elif isinstance(k, str) and registry_contains_input(k):
                # Registered input name: look up its creator and call it.
                input_creator = registry_get_input(k)
                self.choices.append(input_creator(ioctx))
            else:
                # Fallback: use the key as the JsonReader `inputs` argument.
                self.choices.append(JsonReader(k, ioctx))
            self.p.append(v)