def _generate_tables(self, files):
    for i, file in enumerate(files):
        if self.config.field is not None:
            with open(file, encoding="utf-8") as f:
                dataset = json.load(f)

            # We keep only the field we are interested in
            dataset = dataset[self.config.field]

            # We accept two formats: a list of dicts or a dict of lists
            if isinstance(dataset, (list, tuple)):
                pa_table = paj.read_json(
                    BytesIO("\n".join(json.dumps(row) for row in dataset).encode("utf-8")),
                    read_options=self.config.pa_read_options,
                    parse_options=self.config.pa_parse_options,
                )
            else:
                pa_table = pa.Table.from_pydict(mapping=dataset, schema=self.config.schema)
        else:
            try:
                pa_table = paj.read_json(
                    file,
                    read_options=self.config.pa_read_options,
                    parse_options=self.config.pa_parse_options,
                )
            except pa.ArrowInvalid:
                with open(file, encoding="utf-8") as f:
                    dataset = json.load(f)
                raise ValueError(
                    f"Not able to read records in the JSON file at {file}. "
                    f"You should probably indicate the field of the JSON file containing your records. "
                    f"This JSON file contains the following fields: {str(list(dataset.keys()))}. "
                    f"Select the correct one and provide it as `field='XXX'` to the `load_dataset` method. "
                )
        yield i, pa_table
def stream_json(fn, parquet_fn, schema=None, chunk_size=10000000):
    if isinstance(fn, str):
        fn = [fn]
    if schema is None:
        schema = read_json(fn[0]).schema
    writer = pq.ParquetWriter(parquet_fn, schema)
    for _f in fn:
        check_gz = _f.endswith('.gz')
        if check_gz:
            # text mode so readlines() yields str for gzipped input as well
            f = gzip.open(_f, 'rt')
        else:
            f = open(_f, 'r')
        while True:
            chunk = f.readlines(chunk_size)
            if not chunk:
                break
            tbl = read_json(io.BytesIO(''.join(chunk).encode()))
            # make sure the read table schema is the same as the parsed schema
            assert tbl.schema == schema
            writer.write_table(tbl)
        f.close()
    writer.close()
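# Hedged usage sketch for stream_json above: convert a few JSONL shards into one Parquet
# file, letting the schema be inferred from the first shard. The file names are
# hypothetical; `read_json` is assumed to be pyarrow.json.read_json, matching the
# imports the function relies on.
import gzip
import io

import pyarrow.parquet as pq
from pyarrow.json import read_json

shards = ["events-000.jsonl", "events-001.jsonl"]  # hypothetical newline-delimited JSON files
stream_json(shards, "events.parquet", chunk_size=8 << 20)  # roughly 8 MB of lines per write_table call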
def convert_to_arrow(file_paths, save_path, cache_path_prefix="./data_chunk", no_combine=False):
    converted_tables = []
    if len(file_paths) == 1:
        mmap = pa.memory_map(file_paths[0])
        json_input = json.read_json(mmap)
        writer = nlp.arrow_writer.ArrowWriter(path=save_path)
        writer.write_table(json_input)
    else:
        for idx, file in enumerate(file_paths):
            cache_path = cache_path_prefix + "." + str(idx)
            mmap = pa.memory_map(file)
            json_input = json.read_json(mmap)
            writer = nlp.arrow_writer.ArrowWriter(path=cache_path)
            writer.write_table(json_input)
            mmap = pa.memory_map(cache_path)
            f = pa.ipc.open_stream(mmap)
            pa_table = f.read_all()
            converted_tables.append(pa_table)
        if not no_combine:
            pa_table = pa.concat_tables(converted_tables, promote=False)
            writer = nlp.arrow_writer.ArrowWriter(path=save_path)
            writer.write_table(pa_table)
def convert_ndjsons_to_parquet(
    files: List[Path], file_name: str, out_dir: Union[Path, str], schema: pa.Schema
) -> Path:
    pq_file = Path(f"{out_dir}/{file_name}.parquet")
    if not schema:
        schema = pa_json.read_json(files[0]).schema
    with pq.ParquetWriter(pq_file, schema) as writer:
        parse_options = pa_json.ParseOptions(explicit_schema=schema)
        for f in files:
            logger.debug(f"Processing {f}")
            table = pa_json.read_json(f, parse_options=parse_options)
            writer.write_table(table)
            remove(f)
    return pq_file
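# Hedged usage sketch for convert_ndjsons_to_parquet above. The paths, schema and logger
# are hypothetical; `pa_json` is assumed to be pyarrow.json and `remove` to be os.remove,
# as the snippet suggests. Note that the function deletes each input file after it has
# been written to Parquet.
import logging
from os import remove
from pathlib import Path
from typing import List, Union

import pyarrow as pa
import pyarrow.json as pa_json
import pyarrow.parquet as pq

logger = logging.getLogger(__name__)

schema = pa.schema([("id", pa.int64()), ("text", pa.string())])  # hypothetical schema
files = [Path("part-0.ndjson"), Path("part-1.ndjson")]           # hypothetical input shards
out_path = convert_ndjsons_to_parquet(files, "combined", out_dir="out", schema=schema)
print(out_path)  # out/combined.parquet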
def _generate_tables(self, files):
    for i, file in enumerate(files):
        if self.config.field is not None:
            with open(file, encoding="utf-8") as f:
                dataset = json.load(f)

            # We keep only the field we are interested in
            dataset = dataset[self.config.field]

            # We accept two formats: a list of dicts or a dict of lists
            if isinstance(dataset, (list, tuple)):
                pa_table = paj.read_json(
                    BytesIO("\n".join(json.dumps(row) for row in dataset).encode("utf-8")),
                    read_options=self.config.pa_read_options,
                    parse_options=self.config.pa_parse_options,
                )
            else:
                pa_table = pa.Table.from_pydict(mapping=dataset)
        else:
            try:
                pa_table = paj.read_json(
                    file,
                    read_options=self.config.pa_read_options,
                    parse_options=self.config.pa_parse_options,
                )
            except pa.ArrowInvalid:
                with open(file, encoding="utf-8") as f:
                    dataset = json.load(f)
                raise ValueError(
                    f"Not able to read records in the JSON file at {file}. "
                    f"You should probably indicate the field of the JSON file containing your records. "
                    f"This JSON file contains the following fields: {str(list(dataset.keys()))}. "
                    f"Select the correct one and provide it as `field='XXX'` to the dataset loading method. "
                )
        if self.config.features:
            # Encode column if ClassLabel (use a separate index so the file index `i` is not shadowed)
            for col_idx, col in enumerate(self.config.features.keys()):
                if isinstance(self.config.features[col], datasets.ClassLabel):
                    pa_table = pa_table.set_column(
                        col_idx, self.config.schema.field(col), [self.config.features[col].str2int(pa_table[col])]
                    )
            # Cast allows str <-> int/float, while parse_option explicit_schema does NOT
            # Before casting, rearrange JSON field names to match passed features schema field names order
            pa_table = pa.Table.from_arrays(
                [pa_table[name] for name in self.config.features], schema=self.config.schema
            )
        yield i, pa_table
def arrow_from_json(r, *args, **kwargs):
    """Read the stream from s3 and turn JSON into an Arrow Table."""
    table = json.read_json(r)
    print("Created Dataframe with dimensions: (nrow, ncol) = %s" % str(table.shape), file=sys.stderr)
    return table
def from_jsonl(
    cls,
    json_path: str,
    identifier: Identifier = None,
    dataset_fmt: str = "in_memory",
) -> Dataset:
    """Load a dataset from a .jsonl file on disk, where each line of the
    json file consists of a single example."""
    if dataset_fmt == "in_memory":
        # Load the .jsonl file
        with open(json_path) as f:
            data = [json.loads(line) for line in f]
        return cls(
            data,
            identifier=identifier,
            dataset_fmt=dataset_fmt,
        )
    elif dataset_fmt == "datasets":
        # Use jsonarrow to directly load the json
        return cls(
            jsonarrow.read_json(json_path),
            identifier=identifier,
            dataset_fmt=dataset_fmt,
        )
    else:
        raise NotImplementedError
def pa_read_json(
    input_file: Union[IO, str],
    schema: Union[pa.Schema, Metadata, dict] = None,
    expect_full_schema: bool = True,
    **kwargs,
):
    """Read a jsonlines file into an Arrow table.

    Args:
        input_file (Union[IO, str]): the JSONL you want to read. String, path or
            file-like object.
        schema (pyarrow.Schema): pyarrow Schema with the expected columns. If unset,
            pyarrow will infer datatypes.
        expect_full_schema (bool, optional): if True, the pyarrow reader will expect
            the input schema to have fields for every column in the input file.
            If False, only the columns listed in the schema are cast, leaving all
            other columns to their default type on read.
        **kwargs (optional): Additional kwargs are passed to pyarrow.json.read_json.

    Returns:
        pyarrow.Table: the jsonl file in pyarrow format, cast to the specified schema.
    """
    if schema:
        schema = _get_arrow_schema(schema)

    pa_json_table = json.read_json(input_file, **kwargs)

    if schema:
        pa_json_table = cast_arrow_table_to_schema(
            pa_json_table, schema=schema, expect_full_schema=expect_full_schema
        )

    return pa_json_table
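# Hedged usage sketch for pa_read_json above. The path is hypothetical; with no schema
# pyarrow simply infers column types, and any extra kwargs (e.g. read_options) are
# forwarded to pyarrow.json.read_json. The schema variant relies on the
# cast_arrow_table_to_schema helper used by the function.
import pyarrow as pa
from pyarrow import json

tbl = pa_read_json("records.jsonl", read_options=json.ReadOptions(use_threads=True))
print(tbl.schema)

partial = pa.schema([("id", pa.int64())])  # hypothetical partial schema
tbl = pa_read_json("records.jsonl", schema=partial, expect_full_schema=False)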
def from_json(cls, json_path: str, identifier: Identifier) -> Dataset:
    """Load a dataset from a JSON file on disk, where each line of the
    json file consists of a single example."""
    return cls(
        jsonarrow.read_json(json_path),
        identifier=identifier,
    )
def _generate_tables(self, files):
    for i, file in enumerate(files):
        pa_table = paj.read_json(
            file,
            read_options=self.config.pa_read_options,
            parse_options=self.config.pa_parse_options,
        )
        yield i, pa_table
def _read_file(self, f: "pyarrow.NativeFile", **arrow_reader_args):
    from pyarrow import json

    read_options = arrow_reader_args.pop("read_options", json.ReadOptions(use_threads=False))
    return json.read_json(f, read_options=read_options, **arrow_reader_args)
def _generate_tables(self, files):
    for file_idx, file in enumerate(files):
        # If the file is one json object and if we need to look at the list of items in one specific field
        if self.config.field is not None:
            with open(file, encoding="utf-8") as f:
                dataset = json.load(f)

            # We keep only the field we are interested in
            dataset = dataset[self.config.field]

            # We accept two formats: a list of dicts or a dict of lists
            if isinstance(dataset, (list, tuple)):
                mapping = {col: [dataset[i][col] for i in range(len(dataset))] for col in dataset[0].keys()}
            else:
                mapping = dataset
            pa_table = pa.Table.from_pydict(mapping=mapping)
            yield file_idx, self._cast_classlabels(pa_table)

        # If the file has one json object per line
        else:
            with open(file, "rb") as f:
                batch_idx = 0
                while True:
                    batch = f.read(self.config.chunksize)
                    if not batch:
                        break
                    batch += f.readline()  # finish current line
                    try:
                        pa_table = paj.read_json(BytesIO(batch))
                    except pa.ArrowInvalid as e:
                        logger.error(f"Failed to read file '{file}' with error {type(e)}: {e}")
                        try:
                            with open(file, encoding="utf-8") as f:
                                dataset = json.load(f)
                        except json.JSONDecodeError:
                            raise e
                        raise ValueError(
                            f"Not able to read records in the JSON file at {file}. "
                            f"You should probably indicate the field of the JSON file containing your records. "
                            f"This JSON file contains the following fields: {str(list(dataset.keys()))}. "
                            f"Select the correct one and provide it as `field='XXX'` to the dataset loading method. "
                        )
                    # Uncomment for debugging (will print the Arrow table size and elements)
                    # logger.warning(f"pa_table: {pa_table} num rows: {pa_table.num_rows}")
                    # logger.warning('\n'.join(str(pa_table.slice(i, 1).to_pydict()) for i in range(pa_table.num_rows)))
                    yield (file_idx, batch_idx), self._cast_classlabels(pa_table)
                    batch_idx += 1
def json_read(read_paths: List[str]):
    logger.debug(f"Reading {len(read_paths)} files.")
    tables = []
    for read_path in read_paths:
        with filesystem.open_input_file(read_path) as f:
            tables.append(
                json.read_json(f, read_options=json.ReadOptions(use_threads=False), **arrow_json_args)
            )
    block = ArrowBlock(pa.concat_tables(tables))
    return block, block.get_metadata(input_files=read_paths)
def import_table(source: str):
    if not source:
        return sample_table()
    if source.endswith(".csv"):
        from pyarrow import csv

        return csv.read_csv(source)
    if source.endswith(".json"):
        from pyarrow import json

        return json.read_json(source)
    if source.endswith(".parquet"):
        return pq.read_table(source)
    raise ValueError("source must be csv, json or parquet")
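# Hedged usage sketch for import_table above. The paths are hypothetical; note that
# pyarrow.json.read_json parses newline-delimited JSON, so the ".json" branch is really
# aimed at JSONL-style files.
events = import_table("events.json")       # newline-delimited JSON
metrics = import_table("metrics.csv")      # CSV
history = import_table("history.parquet")  # Parquet
print(events.num_rows, metrics.num_rows, history.num_rows)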
def readZippedFile(file: str, verbose: bool = False) -> pd.DataFrame:
    """Read a zipped file.

    Reads a dataset export file as exported and downloaded from Pega. The export
    file is formatted as a zipped multi-line JSON file or CSV file and the data
    is read into a pandas dataframe.

    Parameters
    ----------
    file : str
        The full path to the file
    verbose : bool, default=False
        Whether to print the names of the files within the unzipped file for debugging purposes

    Returns
    -------
    pd.DataFrame
        A pandas dataframe with the contents.
    """
    with zipfile.ZipFile(file, mode="r") as z:
        files = z.namelist()
        if verbose:
            print(files)  # pragma: no cover
        if "data.json" in files:
            with z.open("data.json") as zippedfile:
                try:
                    from pyarrow import json

                    return json.read_json(zippedfile).to_pandas()  # pragma: no cover
                except ImportError:  # pragma: no cover
                    try:
                        dataset = pd.read_json(zippedfile, lines=True)
                        return dataset
                    except ValueError:
                        dataset = pd.read_json(zippedfile)
                        return dataset
        if "data.csv" in files:  # pragma: no cover
            with z.open("data.csv") as zippedfile:
                try:
                    from pyarrow import csv

                    return csv.read_csv(zippedfile).to_pandas()
                except ImportError:
                    return pd.read_csv(zippedfile)
        else:  # pragma: no cover
            raise FileNotFoundError("Cannot find a 'data' file in the zip folder.")
def _read_table_from_source(source: Union[pd.DataFrame, str]) -> Tuple[pa.Table, List[str]]:
    """
    Infers a data source type (path or Pandas DataFrame) and reads it in as a
    PyArrow Table.

    Args:
        source (Union[pd.DataFrame, str]):
            Either a string path or Pandas DataFrame.

    Returns:
        Tuple[pa.Table, List[str]]:
            Tuple containing PyArrow table of dataset, and column names of PyArrow table.
    """

    # Pandas DataFrame detected
    if isinstance(source, pd.DataFrame):
        table = pa.Table.from_pandas(df=source)

    # Inferring a string path
    elif isinstance(source, str):
        file_path = source
        filename, file_ext = os.path.splitext(file_path)

        if ".csv" in file_ext:
            from pyarrow import csv

            table = csv.read_csv(file_path)
        elif ".json" in file_ext:
            from pyarrow import json

            table = json.read_json(file_path)
        else:
            table = pq.read_table(file_path)
    else:
        raise ValueError(f"Unknown data source provided for ingestion: {source}")

    # Ensure that PyArrow table is initialised
    assert isinstance(table, pa.lib.Table)

    column_names = table.column_names

    return table, column_names
def _init_table_from_path(self):
    if '.jsonl' in self.path.suffixes:
        # Can read ".jsonl" or ".jsonl.gz"
        import pyarrow.json as paj

        self.table = paj.read_json(
            str(self.path),
            read_options=paj.ReadOptions(
                # magic constants:
                # 894 - estimated average number of bytes per JSON item manifest
                # 10000 - how many items we want to have in a chunk (Arrow's "batch")
                block_size=894 * 10000
            ),
        )
    elif '.arrow' == self.path.suffixes[-1]:
        # Can read ".arrow"
        import pyarrow as pa

        mmap = pa.memory_map(str(self.path))
        stream = pa.ipc.open_file(mmap)
        self.table = stream.read_all()
    else:
        raise ValueError(f"Unknown LazyDict file format : '{self.path}'")
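# A minimal, hedged sketch of the block_size idea used above: pyarrow splits JSONL
# input into blocks of roughly block_size bytes and parses each block into a record
# batch, so a larger block_size yields fewer, bigger batches. The file name and the
# chosen size are hypothetical.
import pyarrow.json as paj

opts = paj.ReadOptions(block_size=1 << 20)  # ~1 MiB per block, hypothetical choice
table = paj.read_json("manifest.jsonl", read_options=opts)
print(table.num_rows, [len(batch) for batch in table.to_batches()])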
def _generate_tables(self, files):
    for file_idx, file in enumerate(files):
        # If the file is one json object and if we need to look at the list of items in one specific field
        if self.config.field is not None:
            with open(file, encoding="utf-8") as f:
                dataset = json.load(f)

            # We keep only the field we are interested in
            dataset = dataset[self.config.field]

            # We accept two formats: a list of dicts or a dict of lists
            if isinstance(dataset, (list, tuple)):
                mapping = {col: [dataset[i][col] for i in range(len(dataset))] for col in dataset[0].keys()}
            else:
                mapping = dataset
            pa_table = pa.Table.from_pydict(mapping=mapping)
            yield file_idx, self._cast_classlabels(pa_table)

        # If the file has one json object per line
        else:
            with open(file, "rb") as f:
                batch_idx = 0
                # Use block_size equal to the chunk size divided by 32 to leverage multithreading
                # Set a default minimum value of 16kB if the chunk size is really small
                block_size = max(self.config.chunksize // 32, 16 << 10)
                while True:
                    batch = f.read(self.config.chunksize)
                    if not batch:
                        break
                    # Finish current line
                    try:
                        batch += f.readline()
                    except (AttributeError, io.UnsupportedOperation):
                        batch += readline(f)
                    try:
                        while True:
                            try:
                                pa_table = paj.read_json(
                                    io.BytesIO(batch), read_options=paj.ReadOptions(block_size=block_size)
                                )
                                break
                            except (pa.ArrowInvalid, pa.ArrowNotImplementedError) as e:
                                if (
                                    isinstance(e, pa.ArrowInvalid)
                                    and "straddling" not in str(e)
                                    or block_size > len(batch)
                                ):
                                    raise
                                else:
                                    # Increase the block size in case it was too small.
                                    # The block size will be reset for the next file.
                                    logger.debug(
                                        f"Batch of {len(batch)} bytes couldn't be parsed with block_size={block_size}. Retrying with block_size={block_size * 2}."
                                    )
                                    block_size *= 2
                    except pa.ArrowInvalid as e:
                        logger.error(f"Failed to read file '{file}' with error {type(e)}: {e}")
                        try:
                            with open(file, encoding="utf-8") as f:
                                dataset = json.load(f)
                        except json.JSONDecodeError:
                            raise e
                        raise ValueError(
                            f"Not able to read records in the JSON file at {file}. "
                            f"You should probably indicate the field of the JSON file containing your records. "
                            f"This JSON file contains the following fields: {str(list(dataset.keys()))}. "
                            f"Select the correct one and provide it as `field='XXX'` to the dataset loading method. "
                        ) from None
                    # Uncomment for debugging (will print the Arrow table size and elements)
                    # logger.warning(f"pa_table: {pa_table} num rows: {pa_table.num_rows}")
                    # logger.warning('\n'.join(str(pa_table.slice(i, 1).to_pydict()) for i in range(pa_table.num_rows)))
                    yield (file_idx, batch_idx), self._cast_classlabels(pa_table)
                    batch_idx += 1
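# A standalone, hedged sketch of the retry pattern above: when a single JSON line is
# longer than ReadOptions.block_size, pyarrow raises ArrowInvalid complaining about a
# value "straddling" the block boundary, so the reader retries with a doubled block
# size. The byte payload below is hypothetical.
import io

import pyarrow as pa
import pyarrow.json as paj


def read_jsonl_bytes(batch: bytes, block_size: int = 16 << 10) -> pa.Table:
    while True:
        try:
            return paj.read_json(io.BytesIO(batch), read_options=paj.ReadOptions(block_size=block_size))
        except pa.ArrowInvalid as e:
            if "straddling" not in str(e) or block_size > len(batch):
                raise
            block_size *= 2  # a line was longer than the block: retry with a bigger block


table = read_jsonl_bytes(b'{"a": 1}\n{"a": 2}\n')
print(table.to_pydict())  # {'a': [1, 2]}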
def query_json_file(f, column, val):
    table = pj.read_json(f)
    return [i for i in table.column(column) if i == val]
def readDSExport(
    filename: Union[pd.DataFrame, str],
    path: str = ".",
    verbose: bool = True,
    force_pandas: bool = False,
    **kwargs,
) -> pd.DataFrame:
    """Read a Pega dataset export file.

    Can accept either a Pandas DataFrame or one of the following formats:
    - .csv
    - .json
    - .zip (zipped json or CSV)

    It automatically infers the default file names for both model data as well as predictor data.
    If you supply either 'modelData' or 'predictorData' as the 'filename' argument, it will search for them.
    If you supply the full name of the file in the 'path' directory, it will import that instead.

    Parameters
    ----------
    filename : [pd.DataFrame, str]
        Either a Pandas DataFrame with the source data (for compatibility),
        or a string, in which case it can either be:
        - The name of the file (if a custom name) or
        - Whether we want to look for 'modelData' or 'predictorData' in the path folder.
    path : str, default = '.'
        The location of the file
    verbose : bool, default = True
        Whether to print out which file will be imported

    Keyword arguments:
        Any arguments to plug into the read csv or json function, from either PyArrow or Pandas.

    Returns
    -------
    pd.DataFrame
        The read data from the given file

    Examples:
        >>> df = readDSExport(filename='modelData', path='./datamart')
        >>> df = readDSExport(filename='ModelSnapshot.json', path='data/ADMData')
        >>> df = pd.read_csv('file.csv')
        >>> df = readDSExport(filename=df)
    """
    if isinstance(filename, pd.DataFrame):
        return filename

    is_url = False

    if os.path.isfile(os.path.join(path, filename)):
        file = os.path.join(path, filename)
    else:
        file = get_latest_file(path, filename)

    if file == "Target not found":
        import requests

        try:
            response = requests.get(f"{path}/{filename}")
            is_url = True if response.status_code == 200 else False
        except:
            is_url = False
        if is_url:
            file = f"{path}/{filename}"
            if file.split(".")[-1] == "zip":
                file = urllib.request.urlopen(f"{path}/{filename}")
            if verbose:
                print("File found through URL")

    if file in [None, "Target not found"]:
        if verbose:
            print(f"File {filename} not found in dir {path}")
        return None

    if isinstance(file, str):
        extension = file.split(".")[-1]
    elif isinstance(file, http.client.HTTPResponse):
        extension = "zipped"

    if verbose:
        print(f"Importing: {os.path.join(path, filename)}") if is_url else print(f"Importing: {file}")

    if extension == "parquet":  # pragma: no cover
        try:
            import pyarrow.parquet as pq

            return pq.read_table(file).to_pandas()
        except ImportError:
            print("You need to import pyarrow to read parquet files.")

    if extension == "csv":
        try:
            if force_pandas or is_url:
                raise ImportError("Forcing pandas.")
            from pyarrow import csv, ArrowInvalid

            try:  # pragma: no cover
                return csv.read_csv(
                    file, parse_options=csv.ParseOptions(delimiter=kwargs.get("sep", ","))
                ).to_pandas()
            except ArrowInvalid:  # pragma: no cover
                raise ImportError()
        except ImportError:
            if not is_url:
                if verbose:
                    print(
                        "Can't import pyarrow, so defaulting to pandas. For faster imports, please install pyarrow."
                    )
            return pd.read_csv(file, **kwargs)
        except OSError:  # pragma: no cover
            raise FileNotFoundError(f"File {file} is not found.")
    elif extension == "json":
        try:  # pragma: no cover
            if force_pandas:
                raise ImportError("Forcing pandas.")
            from pyarrow import json, ArrowInvalid

            try:
                return json.read_json(file, **kwargs).to_pandas()
            except ArrowInvalid:
                raise ImportError()
        except ImportError:  # pragma: no cover
            if verbose:
                print(
                    "Can't import pyarrow, so defaulting to pandas. For faster imports, please install pyarrow."
                )
            try:
                return pd.read_json(file, lines=True, **kwargs)
            except ValueError:
                return pd.read_json(file, **kwargs)
        except OSError:  # pragma: no cover
            raise FileNotFoundError(f"File {file} is not found.")
    else:
        try:
            if is_url and extension == "zipped":
                return readZippedFile(file=BytesIO(file.read()))
            elif extension == "zip":
                return readZippedFile(file=file)
            else:
                return FileNotFoundError(f"File {file} is not found.")  # pragma: no cover
        except OSError:  # pragma: no cover
            raise FileNotFoundError(f"File {file} is not found.")
def read_json(self, *args, **kwargs):
    read_options = kwargs.setdefault('read_options', ReadOptions())
    read_options.use_threads = True
    table = read_json(*args, **kwargs)
    table.validate()
    return table
def read_json(self, *args, **kwargs):
    read_options = kwargs.setdefault('read_options', ReadOptions())
    read_options.use_threads = True
    table = read_json(*args, **kwargs)
    table._validate()
    return table
def _read_table_from_source(
    source: Union[pd.DataFrame, str], chunk_size: int, max_workers: int
) -> Tuple[str, str]:
    """
    Infers a data source type (path or Pandas DataFrame) and reads it in as a
    PyArrow Table.

    The PyArrow Table that is read will be written to a parquet file with row
    group size determined by the minimum of:
        * (table.num_rows / max_workers)
        * chunk_size

    The parquet file that is created will be passed as file path to the
    multiprocessing pool workers.

    Args:
        source (Union[pd.DataFrame, str]):
            Either a string path or Pandas DataFrame.
        chunk_size (int):
            Amount of rows to load and ingest at a time.
        max_workers (int):
            Number of worker processes to use to encode values.

    Returns:
        Tuple[str, str]:
            Tuple containing parent directory path and destination path to parquet file.
    """

    # Pandas DataFrame detected
    if isinstance(source, pd.DataFrame):
        table = pa.Table.from_pandas(df=source)

    # Inferring a string path
    elif isinstance(source, str):
        file_path = source
        filename, file_ext = os.path.splitext(file_path)

        if ".csv" in file_ext:
            from pyarrow import csv

            table = csv.read_csv(file_path)
        elif ".json" in file_ext:
            from pyarrow import json

            table = json.read_json(file_path)
        else:
            table = pq.read_table(file_path)
    else:
        raise ValueError(f"Unknown data source provided for ingestion: {source}")

    # Ensure that PyArrow table is initialised
    assert isinstance(table, pa.lib.Table)

    # Write table as parquet file with a specified row_group_size
    dir_path = tempfile.mkdtemp()
    tmp_table_name = f"{int(time.time())}.parquet"
    dest_path = f"{dir_path}/{tmp_table_name}"
    row_group_size = min(ceil(table.num_rows / max_workers), chunk_size)
    pq.write_table(table=table, where=dest_path, row_group_size=row_group_size)

    # Remove table from memory
    del table

    return dir_path, dest_path
def _split_generators(self, dl_manager):
    if not self.config.data_files:
        raise ValueError(f"At least one data file must be specified, but got data_files={self.config.data_files}")

    # Do an early pass if:
    # * `features` are not specified, to infer the class labels
    # * `drop_metadata` is False, to find the metadata files
    do_analyze = (self.config.features is None and not self.config.drop_labels) or not self.config.drop_metadata
    if do_analyze:
        labels = set()
        metadata_files = collections.defaultdict(list)

        def analyze(files_or_archives, downloaded_files_or_dirs, split):
            if len(downloaded_files_or_dirs) == 0:
                return
            # The files are separated from the archives at this point, so check the first sample
            # to see if it's a file or a directory and iterate accordingly
            if os.path.isfile(downloaded_files_or_dirs[0]):
                original_files, downloaded_files = files_or_archives, downloaded_files_or_dirs
                for original_file, downloaded_file in zip(original_files, downloaded_files):
                    original_file, downloaded_file = str(original_file), str(downloaded_file)
                    _, original_file_ext = os.path.splitext(original_file)
                    if original_file_ext.lower() in self.IMAGE_EXTENSIONS:
                        labels.add(os.path.basename(os.path.dirname(original_file)))
                    elif os.path.basename(original_file) == self.METADATA_FILENAME:
                        metadata_files[split].append((original_file, downloaded_file))
                    else:
                        original_file_name = os.path.basename(original_file)
                        logger.debug(
                            f"The file '{original_file_name}' was ignored: it is not an image, and is not {self.METADATA_FILENAME} either."
                        )
            else:
                archives, downloaded_dirs = files_or_archives, downloaded_files_or_dirs
                for archive, downloaded_dir in zip(archives, downloaded_dirs):
                    archive, downloaded_dir = str(archive), str(downloaded_dir)
                    for downloaded_dir_file in dl_manager.iter_files(downloaded_dir):
                        _, downloaded_dir_file_ext = os.path.splitext(downloaded_dir_file)
                        if downloaded_dir_file_ext in self.IMAGE_EXTENSIONS:
                            labels.add(os.path.basename(os.path.dirname(downloaded_dir_file)))
                        elif os.path.basename(downloaded_dir_file) == self.METADATA_FILENAME:
                            metadata_files[split].append((None, downloaded_dir_file))
                        else:
                            archive_file_name = os.path.basename(archive)
                            original_file_name = os.path.basename(downloaded_dir_file)
                            logger.debug(
                                f"The file '{original_file_name}' from the archive '{archive_file_name}' was ignored: it is not an image, and is not {self.METADATA_FILENAME} either."
                            )

        if not self.config.drop_labels:
            logger.info("Inferring labels from data files...")
        if not self.config.drop_metadata:
            logger.info("Analyzing metadata files...")

    data_files = self.config.data_files
    splits = []
    for split_name, files in data_files.items():
        if isinstance(files, str):
            files = [files]
        files, archives = self._split_files_and_archives(files)
        downloaded_files = dl_manager.download(files)
        downloaded_dirs = dl_manager.download_and_extract(archives)
        if do_analyze:
            analyze(files, downloaded_files, split_name)
            analyze(archives, downloaded_dirs, split_name)
        splits.append(
            datasets.SplitGenerator(
                name=split_name,
                gen_kwargs={
                    "files": [(file, downloaded_file) for file, downloaded_file in zip(files, downloaded_files)]
                    + [(None, dl_manager.iter_files(downloaded_dir)) for downloaded_dir in downloaded_dirs],
                    "metadata_files": metadata_files if not self.config.drop_metadata else None,
                    "split_name": split_name,
                },
            )
        )

    if not self.config.drop_metadata and metadata_files:
        # Verify that:
        # * all metadata files have the same set of features
        # * the `file_name` key is one of the metadata keys and is of type string
        features_per_metadata_file: List[Tuple[str, datasets.Features]] = []
        for _, downloaded_metadata_file in itertools.chain.from_iterable(metadata_files.values()):
            with open(downloaded_metadata_file, "rb") as f:
                pa_metadata_table = paj.read_json(f)
            features_per_metadata_file.append(
                (downloaded_metadata_file, datasets.Features.from_arrow_schema(pa_metadata_table.schema))
            )
        for downloaded_metadata_file, metadata_features in features_per_metadata_file:
            if metadata_features != features_per_metadata_file[0][1]:
                raise ValueError(
                    f"Metadata files {downloaded_metadata_file} and {features_per_metadata_file[0][0]} have different features: {features_per_metadata_file[0]} != {metadata_features}"
                )
        metadata_features = features_per_metadata_file[0][1]
        if "file_name" not in metadata_features:
            raise ValueError("`file_name` must be present as dictionary key in metadata files")
        if metadata_features["file_name"] != datasets.Value("string"):
            raise ValueError("`file_name` key must be a string")
        del metadata_features["file_name"]
    else:
        metadata_features = None

    # Normally, we would do this in _info, but we need to know the labels and/or metadata
    # before building the features
    if self.config.features is None:
        if not self.config.drop_labels and not metadata_files:
            self.info.features = datasets.Features(
                {"image": datasets.Image(), "label": datasets.ClassLabel(names=sorted(labels))}
            )
            task_template = ImageClassification(image_column="image", label_column="label")
            task_template = task_template.align_with_features(self.info.features)
            self.info.task_templates = [task_template]
        else:
            self.info.features = datasets.Features({"image": datasets.Image()})

        if not self.config.drop_metadata and metadata_files:
            # Verify that there are no duplicated keys when compared to the existing features ("image", optionally "label")
            duplicated_keys = set(self.info.features) & set(metadata_features)
            if duplicated_keys:
                raise ValueError(
                    f"Metadata feature keys {list(duplicated_keys)} are already present as the image features"
                )
            self.info.features.update(metadata_features)

    return splits
def _generate_examples(self, files, metadata_files, split_name):
    if not self.config.drop_metadata and metadata_files:
        split_metadata_files = metadata_files.get(split_name, [])
        image_empty_metadata = {k: None for k in self.info.features if k != "image"}

        last_checked_dir = None
        metadata_dir = None
        metadata_dict = None
        downloaded_metadata_file = None

        file_idx = 0
        for original_file, downloaded_file_or_dir in files:
            if original_file is not None:
                _, original_file_ext = os.path.splitext(original_file)
                if original_file_ext.lower() in self.IMAGE_EXTENSIONS:
                    # If the file is an image, and we've just entered a new directory,
                    # find the nearest metadata file (by counting path segments) for the directory
                    current_dir = os.path.dirname(original_file)
                    if last_checked_dir is None or last_checked_dir != current_dir:
                        last_checked_dir = current_dir
                        metadata_file_candidates = [
                            (
                                os.path.relpath(original_file, os.path.dirname(metadata_file_candidate)),
                                metadata_file_candidate,
                                downloaded_metadata_file,
                            )
                            for metadata_file_candidate, downloaded_metadata_file in split_metadata_files
                            if metadata_file_candidate is not None  # ignore metadata_files that are inside archives
                            and not os.path.relpath(
                                original_file, os.path.dirname(metadata_file_candidate)
                            ).startswith("..")
                        ]
                        if metadata_file_candidates:
                            _, metadata_file, downloaded_metadata_file = min(
                                metadata_file_candidates, key=lambda x: count_path_segments(x[0])
                            )
                            with open(downloaded_metadata_file, "rb") as f:
                                pa_metadata_table = paj.read_json(f)
                            pa_file_name_array = pa_metadata_table["file_name"]
                            pa_file_name_array = pc.replace_substring(
                                pa_file_name_array, pattern="\\", replacement="/"
                            )
                            pa_metadata_table = pa_metadata_table.drop(["file_name"])
                            metadata_dir = os.path.dirname(metadata_file)
                            metadata_dict = {
                                file_name: image_metadata
                                for file_name, image_metadata in zip(
                                    pa_file_name_array.to_pylist(), pa_table_to_pylist(pa_metadata_table)
                                )
                            }
                        else:
                            raise ValueError(
                                f"One or several metadata.jsonl were found, but not in the same directory or in a parent directory of {downloaded_file_or_dir}."
                            )
                    if metadata_dir is not None and downloaded_metadata_file is not None:
                        file_relpath = os.path.relpath(original_file, metadata_dir)
                        file_relpath = file_relpath.replace("\\", "/")
                        if file_relpath not in metadata_dict:
                            raise ValueError(
                                f"Image at {file_relpath} doesn't have metadata in {downloaded_metadata_file}."
                            )
                        image_metadata = metadata_dict[file_relpath]
                    else:
                        raise ValueError(
                            f"One or several metadata.jsonl were found, but not in the same directory or in a parent directory of {downloaded_file_or_dir}."
                        )
                    yield file_idx, {
                        **image_empty_metadata,
                        "image": downloaded_file_or_dir,
                        **image_metadata,
                    }
                    file_idx += 1
            else:
                for downloaded_dir_file in downloaded_file_or_dir:
                    _, downloaded_dir_file_ext = os.path.splitext(downloaded_dir_file)
                    if downloaded_dir_file_ext.lower() in self.IMAGE_EXTENSIONS:
                        current_dir = os.path.dirname(downloaded_dir_file)
                        if last_checked_dir is None or last_checked_dir != current_dir:
                            last_checked_dir = current_dir
                            metadata_file_candidates = [
                                (
                                    os.path.relpath(
                                        downloaded_dir_file, os.path.dirname(downloaded_metadata_file)
                                    ),
                                    metadata_file_candidate,
                                    downloaded_metadata_file,
                                )
                                for metadata_file_candidate, downloaded_metadata_file in split_metadata_files
                                if metadata_file_candidate is None  # ignore metadata_files that are not inside archives
                                and not os.path.relpath(
                                    downloaded_dir_file, os.path.dirname(downloaded_metadata_file)
                                ).startswith("..")
                            ]
                            if metadata_file_candidates:
                                _, metadata_file, downloaded_metadata_file = min(
                                    metadata_file_candidates, key=lambda x: count_path_segments(x[0])
                                )
                                with open(downloaded_metadata_file, "rb") as f:
                                    pa_metadata_table = paj.read_json(f)
                                pa_file_name_array = pa_metadata_table["file_name"]
                                pa_file_name_array = pc.replace_substring(
                                    pa_file_name_array, pattern="\\", replacement="/"
                                )
                                pa_metadata_table = pa_metadata_table.drop(["file_name"])
                                metadata_dir = os.path.dirname(downloaded_metadata_file)
                                metadata_dict = {
                                    file_name: image_metadata
                                    for file_name, image_metadata in zip(
                                        pa_file_name_array.to_pylist(), pa_table_to_pylist(pa_metadata_table)
                                    )
                                }
                            else:
                                raise ValueError(
                                    f"One or several metadata.jsonl were found, but not in the same directory or in a parent directory of {downloaded_dir_file}."
                                )
                        if metadata_dir is not None and downloaded_metadata_file is not None:
                            downloaded_dir_file_relpath = os.path.relpath(downloaded_dir_file, metadata_dir)
                            downloaded_dir_file_relpath = downloaded_dir_file_relpath.replace("\\", "/")
                            if downloaded_dir_file_relpath not in metadata_dict:
                                raise ValueError(
                                    f"Image at {downloaded_dir_file_relpath} doesn't have metadata in {downloaded_metadata_file}."
                                )
                            image_metadata = metadata_dict[downloaded_dir_file_relpath]
                        else:
                            raise ValueError(
                                f"One or several metadata.jsonl were found, but not in the same directory or in a parent directory of {downloaded_dir_file}."
                            )
                        yield file_idx, {
                            **image_empty_metadata,
                            "image": downloaded_dir_file,
                            **image_metadata,
                        }
                        file_idx += 1
    else:
        file_idx = 0
        for original_file, downloaded_file_or_dir in files:
            if original_file is not None:
                _, original_file_ext = os.path.splitext(original_file)
                if original_file_ext.lower() in self.IMAGE_EXTENSIONS:
                    if self.config.drop_labels or metadata_files:
                        yield file_idx, {
                            "image": downloaded_file_or_dir,
                        }
                    else:
                        yield file_idx, {
                            "image": downloaded_file_or_dir,
                            "label": os.path.basename(os.path.dirname(original_file)),
                        }
                    file_idx += 1
            else:
                for downloaded_dir_file in downloaded_file_or_dir:
                    _, downloaded_dir_file_ext = os.path.splitext(downloaded_dir_file)
                    if downloaded_dir_file_ext.lower() in self.IMAGE_EXTENSIONS:
                        if self.config.drop_labels or metadata_files:
                            yield file_idx, {
                                "image": downloaded_dir_file,
                            }
                        else:
                            yield file_idx, {
                                "image": downloaded_dir_file,
                                "label": os.path.basename(os.path.dirname(downloaded_dir_file)),
                            }
                        file_idx += 1
from pyarrow import json
import pyarrow.parquet as pq
import pyarrow as pa
from glob import glob
import time

t = time.time_ns()

fn = "tests/data/formats/tweets"
table = json.read_json(fn + ".jsonl")
pq.write_table(table, "parc.parquet", compression="ZSTD")

# table = pq.read_table(fn + '.parquet')
# print(table.schema)
# print(table.column_names)
# print(table.select(['username']))
# print(table.take([0]).to_pydict())
print(table.take([0, 500, 9000])["username"])
# print(table.filter())


def iter_arrow(tbl):
    for batch in tbl.to_batches():
        dict_batch = batch.to_pydict()
        for index in range(len(batch)):
            yield {k: v[index] for k, v in dict_batch.items()}


# , filter=([('user_verified', '=', True)]))
# table = pq.read_table(fn + '.parquet')
def read_to_object(self, f: IOBase):
    at = pa_json.read_json(f.name)
    return at
#! /usr/bin/env nix-shell
#! nix-shell -i python3 -p "python3.withPackages(ps: [ ps.pandas ps.numpy ps.pyarrow ])"
# nix-shell -p "python3.withPackages(ps: [ ps.pandas ps.numpy ps.pyarrow ps.yapf ])"

from pyarrow import json as paj
import json as js
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

print("Start ....")

x = np.random.randn(10000)
df = pd.DataFrame({'one': x, 'two': x + 5, 'three': x > 0.1})

table = pa.Table.from_pandas(df)
pq.write_table(table, 'example.parquet')

with open("exemplary.json", "w") as f:
    js.dump(dict(x=list(x), y=[dict(a=1, b=5, c=False) for i in list(x)]), f)

table = paj.read_json("exemplary.json")
pq.write_table(table, 'example_nested.parquet')

print("Stop ....")
def from_jsonl(cls, path: Pathlike) -> 'LazyDict':
    _check_arrow()
    import pyarrow.json as paj

    table = paj.read_json(str(path))
    return cls(table)
def test_json_file_to_arrow(f):
    return pa_json.read_json(f)