import os


def _register_arrow_json_readoptions_serializer():
    import ray

    if (
        os.environ.get(
            "RAY_DISABLE_CUSTOM_ARROW_JSON_OPTIONS_SERIALIZATION",
            "0",
        )
        == "1"
    ):
        import logging

        logger = logging.getLogger(__name__)
        logger.info("Disabling custom Arrow JSON ReadOptions serialization.")
        return

    try:
        import pyarrow.json as pajson
    except ModuleNotFoundError:
        return

    # Register a custom (de)serializer for ReadOptions, which is not reliably
    # picklable across pyarrow versions: reduce it to its two configurable
    # fields and rebuild it from them.
    ray.util.register_serializer(
        pajson.ReadOptions,
        serializer=lambda opts: (opts.use_threads, opts.block_size),
        deserializer=lambda args: pajson.ReadOptions(*args),
    )
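# Sketch (not from the source): what the serializer pair above does, shown
# without Ray. ReadOptions is reduced to a plain, picklable tuple and rebuilt
# on deserialization. Assumes only that pyarrow is installed.
import pyarrow.json as pajson

opts = pajson.ReadOptions(use_threads=True, block_size=1 << 20)
state = (opts.use_threads, opts.block_size)  # serializer output
restored = pajson.ReadOptions(*state)        # deserializer
assert restored.use_threads == opts.use_threads
assert restored.block_size == opts.block_size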
def _read_file(self, f: "pyarrow.NativeFile", **arrow_reader_args):
    from pyarrow import json

    read_options = arrow_reader_args.pop(
        "read_options", json.ReadOptions(use_threads=False)
    )
    return json.read_json(f, read_options=read_options, **arrow_reader_args)
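# Sketch (not from the source): the same call pattern on an in-memory buffer,
# using the single-threaded default that _read_file falls back to.
import io
from pyarrow import json

buf = io.BytesIO(b'{"a": 1}\n{"a": 2}\n')
table = json.read_json(buf, read_options=json.ReadOptions(use_threads=False))
print(table.to_pydict())  # {'a': [1, 2]}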
def json_read(read_paths: List[str]):
    logger.debug(f"Reading {len(read_paths)} files.")
    tables = []
    for read_path in read_paths:
        with filesystem.open_input_file(read_path) as f:
            tables.append(
                json.read_json(
                    f,
                    read_options=json.ReadOptions(use_threads=False),
                    **arrow_json_args,
                )
            )
    block = ArrowBlock(pa.concat_tables(tables))
    return block, block.get_metadata(input_files=read_paths)
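# Sketch (not from the source): pa.concat_tables stitches the per-file tables
# into a single table, as json_read does above; the schemas must be compatible.
import pyarrow as pa

t1 = pa.table({"a": [1, 2]})
t2 = pa.table({"a": [3]})
combined = pa.concat_tables([t1, t2])
print(combined["a"].to_pylist())  # [1, 2, 3]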
class JsonConfig(nlp.BuilderConfig):
    """BuilderConfig for JSON."""

    read_options: paj.ReadOptions = paj.ReadOptions()
    parse_options: paj.ParseOptions = paj.ParseOptions()

    @property
    def pa_read_options(self):
        return self.read_options

    @property
    def pa_parse_options(self):
        return self.parse_options
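# Sketch (not from the source): overriding the config defaults with an
# explicit schema so Arrow skips type inference. The field names ("id",
# "text") are hypothetical; pass these as JsonConfig(**config_overrides).
import pyarrow as pa
import pyarrow.json as paj

config_overrides = dict(
    read_options=paj.ReadOptions(use_threads=True, block_size=1 << 20),
    parse_options=paj.ParseOptions(
        explicit_schema=pa.schema([("id", pa.int64()), ("text", pa.string())])
    ),
)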
def _init_table_from_path(self):
    if '.jsonl' in self.path.suffixes:
        # Can read ".jsonl" or ".jsonl.gz"
        import pyarrow.json as paj

        self.table = paj.read_json(
            str(self.path),
            read_options=paj.ReadOptions(
                # Magic constants:
                # 894 - estimated average number of bytes per JSON item manifest
                # 10000 - how many items we want to have in a chunk (Arrow's "batch")
                block_size=894 * 10000,
            ),
        )
    elif '.arrow' == self.path.suffixes[-1]:
        # Can read ".arrow"
        import pyarrow as pa

        mmap = pa.memory_map(str(self.path))
        stream = pa.ipc.open_file(mmap)
        self.table = stream.read_all()
    else:
        raise ValueError(f"Unknown LazyDict file format: '{self.path}'")
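# Sketch (not from the source): a round trip for the ".arrow" branch above.
# Write a small IPC file, then memory-map and read it back the same way.
# The /tmp path is for illustration only.
import pyarrow as pa

table = pa.table({"x": [1, 2, 3]})
with pa.OSFile("/tmp/demo.arrow", "wb") as sink:
    with pa.ipc.new_file(sink, table.schema) as writer:
        writer.write_table(table)

mmap = pa.memory_map("/tmp/demo.arrow")
assert pa.ipc.open_file(mmap).read_all().equals(table)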
def pa_read_options(self):
    return paj.ReadOptions(use_threads=self.use_threads, block_size=self.block_size)
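# Sketch (not from the source): block_size controls how the input is split
# for parsing, so smaller blocks generally yield more (smaller) record
# batches in the resulting table.
import io
import pyarrow.json as paj

data = b"\n".join(b'{"v": %d}' % i for i in range(10000))
small = paj.read_json(io.BytesIO(data), read_options=paj.ReadOptions(block_size=4 << 10))
large = paj.read_json(io.BytesIO(data), read_options=paj.ReadOptions(block_size=1 << 20))
print(len(small.to_batches()), len(large.to_batches()))  # e.g. many vs. 1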
def _generate_tables(self, files):
    for file_idx, file in enumerate(files):
        # If the file is one json object and if we need to look at the list of items in one specific field
        if self.config.field is not None:
            with open(file, encoding="utf-8") as f:
                dataset = json.load(f)

            # We keep only the field we are interested in
            dataset = dataset[self.config.field]

            # We accept two formats: a list of dicts or a dict of lists
            if isinstance(dataset, (list, tuple)):
                mapping = {
                    col: [dataset[i][col] for i in range(len(dataset))]
                    for col in dataset[0].keys()
                }
            else:
                mapping = dataset
            pa_table = pa.Table.from_pydict(mapping=mapping)
            yield file_idx, self._cast_classlabels(pa_table)

        # If the file has one json object per line
        else:
            with open(file, "rb") as f:
                batch_idx = 0
                # Use block_size equal to the chunk size divided by 32 to leverage multithreading
                # Set a default minimum value of 16kB if the chunk size is really small
                block_size = max(self.config.chunksize // 32, 16 << 10)
                while True:
                    batch = f.read(self.config.chunksize)
                    if not batch:
                        break
                    # Finish current line
                    try:
                        batch += f.readline()
                    except (AttributeError, io.UnsupportedOperation):
                        batch += readline(f)
                    try:
                        while True:
                            try:
                                pa_table = paj.read_json(
                                    io.BytesIO(batch),
                                    read_options=paj.ReadOptions(block_size=block_size),
                                )
                                break
                            except (pa.ArrowInvalid, pa.ArrowNotImplementedError) as e:
                                if (
                                    isinstance(e, pa.ArrowInvalid)
                                    and "straddling" not in str(e)
                                    or block_size > len(batch)
                                ):
                                    raise
                                else:
                                    # Increase the block size in case it was too small.
                                    # The block size will be reset for the next file.
                                    logger.debug(
                                        f"Batch of {len(batch)} bytes couldn't be parsed with block_size={block_size}. Retrying with block_size={block_size * 2}."
                                    )
                                    block_size *= 2
                    except pa.ArrowInvalid as e:
                        logger.error(
                            f"Failed to read file '{file}' with error {type(e)}: {e}"
                        )
                        try:
                            with open(file, encoding="utf-8") as f:
                                dataset = json.load(f)
                        except json.JSONDecodeError:
                            raise e
                        raise ValueError(
                            f"Not able to read records in the JSON file at {file}. "
                            f"You should probably indicate the field of the JSON file containing your records. "
                            f"This JSON file contains the following fields: {list(dataset.keys())}. "
                            f"Select the correct one and provide it as `field='XXX'` to the dataset loading method. "
                        ) from None
                    # Uncomment for debugging (will print the Arrow table size and elements)
                    # logger.warning(f"pa_table: {pa_table} num rows: {pa_table.num_rows}")
                    # logger.warning('\n'.join(str(pa_table.slice(i, 1).to_pydict()) for i in range(pa_table.num_rows)))
                    yield (file_idx, batch_idx), self._cast_classlabels(pa_table)
                    batch_idx += 1
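# Sketch (not from the source): the straddling-retry loop above in isolation.
# A JSON object larger than block_size makes Arrow raise ArrowInvalid with
# "straddling" in the message; doubling block_size until it covers the object
# lets the parse succeed, exactly as the inner while loop does.
import io
import pyarrow as pa
import pyarrow.json as paj


def read_json_with_retry(data: bytes, block_size: int = 16 << 10) -> pa.Table:
    while True:
        try:
            return paj.read_json(
                io.BytesIO(data),
                read_options=paj.ReadOptions(block_size=block_size),
            )
        except pa.ArrowInvalid as e:
            if "straddling" not in str(e) or block_size > len(data):
                raise
            block_size *= 2


# A single ~32kB object exceeds the initial 16kB block_size and triggers
# the retry path before parsing cleanly.
row = b'{"text": "' + b"x" * (32 << 10) + b'"}\n'
print(read_json_with_retry(row).num_rows)  # 1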