def _read_jsonlines_lazy(path: Union[str, Path]):
    """
    Lazily yield the parsed contents of a jsonlines file, one record per line.

    The file is read in binary mode so that the parser receives the raw
    bytes of each line instead of text decoded with the platform's default
    encoding. Lines that contain only whitespace (for example a trailing
    empty line) are skipped rather than handed to the parser.

    Args:
        path: Location of the jsonlines file.

    Yields:
        The result of ``parser.parse(line, recursive=True)`` for every
        non-blank line, in file order.
    """
    parser = simdjson.Parser()
    with open(path, "rb") as f:
        for line in f:
            if not line.strip():
                # Nothing to parse on a blank line.
                continue
            yield parser.parse(line, recursive=True)
def _read_jsonlines_list(path: Union[str, Path]):
    """
    Read a jsonlines file into memory all at once.

    The file is read in binary mode so that the parser receives the raw
    bytes of each line instead of text decoded with the platform's default
    encoding. Lines that contain only whitespace (for example a trailing
    empty line) are skipped rather than handed to the parser.

    Args:
        path: Location of the jsonlines file.

    Returns:
        A list holding ``parser.parse(line, recursive=True)`` for every
        non-blank line, in file order.
    """
    parser = simdjson.Parser()
    with open(path, "rb") as f:
        return [
            parser.parse(line, recursive=True)
            for line in f
            if line.strip()
        ]
def __init__(self, index: bytes):
    """
    Parse the serialized index and keep the result on ``self._index``.

    ``index`` is either the serialized JSON itself or any object exposing
    a ``read()`` method, in which case its contents are read first.
    simdjson is used when it can be imported; otherwise orjson does the
    parsing.
    """
    raw = index.read() if hasattr(index, "read") else index  # type:ignore
    try:
        import simdjson
        parsed = simdjson.Parser().parse(raw)
    except ImportError:
        parsed = orjson.loads(raw)
    self._index = parsed
def json(ds):
    """
    Parse the JSON document ``ds`` and return the parsed value.

    simdjson is tried first. If it rejects the input with a ``ValueError``
    the standard-library ``json`` module is used instead.

    Args:
        ds: The serialized JSON document to parse.

    Returns:
        Whatever ``simdjson.Parser().parse(ds)`` returns, or the result of
        the standard library's ``json.loads(ds)`` when falling back.
    """
    import simdjson

    try:
        return simdjson.Parser().parse(ds)
    except ValueError:
        # fall back; imported under an alias so the stdlib module does not
        # shadow this function's own name inside its body
        import json as _stdlib_json
        return _stdlib_json.loads(ds)
def perftest_pysimdjson_parser(jsonfile, number):
    """
    Benchmark pysimdjson parsing of the raw bytes stored in ``jsonfile``.

    The file is read once up front and a single parser is shared by every
    timed call. ``number`` is forwarded to ``benchmark`` unchanged.
    """
    import simdjson

    with open(jsonfile, 'rb') as handle:
        payload = handle.read()

    shared_parser = simdjson.Parser()

    def parse_once():
        return shared_parser.parse(payload)

    return benchmark("pysimdjson parse", parse_once, number=number)
def test_implementation():
    """Ensure we can set the implementation."""
    json_parser = simdjson.Parser()

    # A rubbish implementation name must be rejected - simdjson does not do
    # a safety check, but pysimdjson does. A break in this check will cause
    # a segfault.
    with pytest.raises(ValueError):
        json_parser.implementation = 'rubbish'

    # The generic, always-available implementation.
    json_parser.implementation = 'fallback'
    json_parser.parse('{"hello": "world"}')
def get_bbox(self, label):
    """
    Given a label, compute an enclosing bounding box for it.

    When ``self.sql_db`` is set, the database is asked which index files
    mention the label and only those are fetched; otherwise every index
    file is scanned. The per-file boxes recorded for the label are merged
    with ``Bbox.expand``.

    Returns: Bbox in physical coordinates, or None if no index file
      contains the label.
    """
    parser = simdjson.Parser()
    label = str(label)
    bbox = None

    if self.sql_db:
        conn = connect(self.sql_db)
        try:
            cur = conn.cursor()
            cur.execute(
                """ select index_files.filename from file_lookup, index_files where file_lookup.fid = index_files.id and file_lookup.label = ? """,
                (label, ))
            iterator = [
                self.fetch_index_files((row[0] for row in cur.fetchall()))
            ]
        finally:
            # Release the connection even if the query or the fetch fails.
            conn.close()
    else:
        iterator = self.fetch_all_index_files()

    for index_files in iterator:
        for _filename, content in index_files.items():
            segid_bbox_dict = parser.parse(content)

            if label not in segid_bbox_dict:
                continue

            current_bbox = Bbox.from_list(
                np.frombuffer(
                    segid_bbox_dict[label].as_buffer(of_type="i"),
                    dtype=np.int64))

            if bbox is None:
                bbox = current_bbox
            else:
                bbox = Bbox.expand(bbox, current_bbox)

    return bbox
def test_writer_substitutions():
    """
    Check that StreamWriter.append reports the expected combinations when
    it is handed simdjson objects rather than plain dicts.
    """
    writer = StreamWriter(
        dataset="TEST/{key}/{value}",
        inner_writer=NullWriter,
    )

    for record in DATA_SET:
        # convert to a simd object to test behavior
        serialized = orjson.dumps(record)
        simd_record = simdjson.Parser().parse(serialized)

        combinations = writer.append(simd_record)

        assert combinations == simd_record["combinations"], combinations
def __init__(self,
             input_files,
             tokenizer,
             max_seq_len,
             target_field='text',
             seed=1,
             shuffle_files=True,
             **kwargs):
    """
    Collect the input files, optionally shuffle them, build the pipeline
    and store the tokenization settings.

    When ``shuffle_files`` is true the module-level ``random`` generator is
    seeded with ``seed`` before ``self.files`` is shuffled in place.
    Extra keyword arguments are accepted but not used here.
    """
    super().__init__()

    # File discovery has to happen before shuffling and pipeline creation.
    self.files = []
    self.setup_files(input_files)

    if shuffle_files:
        random.seed(seed)
        random.shuffle(self.files)

    self.create_pipeline()

    self.tokenizer, self.max_seq_len = tokenizer, max_seq_len
    self.target_field = target_field
    # NOTE(review): `json` is presumably simdjson imported under that alias
    # (it must provide `Parser`) - confirm against the file's imports.
    self.parser = json.Parser()
    self.idx = 0
def file_locations_per_label_json(self, labels, allow_missing=False):
    """
    Map labels to the index files (by base name) that mention them.

    If ``labels`` is None every label found in the index files is
    reported; otherwise only the requested labels are. ``allow_missing``
    is accepted but not consulted in this method.

    Returns: defaultdict(list) of { int(label): [ filename, ... ] }
    """
    wanted = None if labels is None else set(toiter(labels))
    json_parser = simdjson.Parser()
    locations = defaultdict(list)

    for index_files in self.fetch_all_index_files():
        for path, content in index_files.items():
            index_labels = set(json_parser.parse(content).keys())
            basename = os.path.basename(path)

            if wanted is None:
                matches = index_labels
            elif len(wanted) > len(index_labels):
                # Fewer labels in the file than requested: walk the file's
                # labels and test membership in the request.
                matches = [
                    label for label in index_labels if int(label) in wanted
                ]
            else:
                # Otherwise walk the request and test against the file.
                matches = [
                    label for label in wanted if str(label) in index_labels
                ]

            for label in matches:
                locations[int(label)].append(basename)

    return locations
def json(ds):
    """Parse ``ds`` with a fresh simdjson parser and return the result."""
    return simdjson.Parser().parse(ds)
start = time.perf_counter_ns() for r in reader: orjson.loads(r) return (time.perf_counter_ns() - start) / 1e9 import json import ujson import orjson # import cysimdjson import simdjson import os import sys simparser = simdjson.Parser() def sparser(o): return simparser.parse(o) def simd_dump(o): o.mini # sys.path.insert(1, os.path.join(sys.path[0], "../..")) # print("cysimd parse:", time_it(test_parser, parser.parse)) print("pysimd parse:", time_it(test_parser, test_simd_serializer)) # print("json parse :", time_it(test_parser, json.loads))
def readl_simdjson(filepath: str):
    """
    Read a jsonlines file and return one parsed Python object per line.

    A single parser is created and reused for every line; previously a
    parser was created and then ignored while ``simdjson.loads`` was called
    per line. The file is read as bytes so the parser receives the raw
    data rather than text decoded with the platform's default encoding.

    Args:
        filepath: Location of the jsonlines file.

    Returns:
        A list with the result of ``parser.parse(line, recursive=True)``
        for each line, in file order.
    """
    parser = simdjson.Parser()
    with open(filepath, "rb") as fp:
        return [parser.parse(line, recursive=True) for line in fp]
def query(self, bbox, allow_missing=False):
    """
    For the specified bounding box (or equivalent representation),
    list all segment ids enclosed within it.

    If allow_missing is set, then don't raise an error if an index
    file is missing.

    Raises: SpatialIndexGapError if an index file has no content and
      allow_missing is not set.

    Returns: set(labels). Note that a subvoxel bounding box returns an
      empty list instead.
    """
    bbox = Bbox.create(bbox, context=self.physical_bounds, autocrop=True)
    original_bbox = bbox.clone()
    bbox = bbox.expand_to_chunk_size(
        self.chunk_size.astype(self.physical_bounds.dtype),
        offset=self.physical_bounds.minpt)

    if bbox.subvoxel():
        return []

    labels = set()
    fast_path = bbox.contains_bbox(self.physical_bounds)

    if self.sql_db and fast_path:
        conn = connect(self.sql_db)
        try:
            cur = conn.cursor()
            try:
                cur.execute("select label from file_lookup")
                while True:
                    rows = cur.fetchmany(size=2**20)
                    if not rows:
                        break
                    # Sqlite only stores signed integers, so we need to
                    # coerce negative integers back into unsigned with a
                    # bitwise and.
                    labels.update(
                        (int(row[0]) & 0xffffffffffffffff for row in rows))
            finally:
                cur.close()
        finally:
            # Release the connection even if the query or a fetch fails.
            conn.close()
        return labels

    index_files = self.index_file_paths_for_bbox(bbox)

    num_blocks = int(np.ceil(len(index_files) / 10000))
    for index_files_subset in tqdm(
            sip(index_files, 10000),
            total=num_blocks,
            desc="Block",
            disable=((not self.config.progress) or (num_blocks == 1))):
        results = self.fetch_index_files(index_files_subset)

        parser = simdjson.Parser()
        for filename, content in tqdm(results.items(),
                                      desc="Decoding Labels",
                                      disable=(not self.config.progress)):
            if content is None:
                if allow_missing:
                    continue
                else:
                    raise SpatialIndexGapError(filename + " was not found.")

            # The bbox test saps performance a lot
            # but we can skip it if we know 100% that
            # the labels are going to be inside. This
            # optimization is important for querying
            # entire datasets, which is contemplated
            # for shard generation.
            if fast_path:
                res = parser.parse(content).keys()
                labels.update(
                    (int(label) for label in res))  # fast path: 16% CPU
            else:
                res = simdjson.loads(content)
                for label, label_bbx in res.items():
                    label = int(label)
                    label_bbx = Bbox.from_list(label_bbx)

                    if Bbox.intersects(label_bbx, original_bbox):
                        labels.add(label)

    return labels
ds = tfds.load(name="ThePile", try_gcs=True) # Have not tested below ds.map(simple_tokenization, num_parallel_calls=tf.data.experimental.AUTOTUNE) # or ds.map(lambda item: simple_tokenization(item), num_parallel_calls=tf.data.experimental.AUTOTUNE) """ try: import simdjson as json except ImportError: print('Installing simdjson library') os.system('pip install -q pysimdjson') import simdjson as json parser = json.Parser() _DESCRIPTION = """ The Pile is a large, diverse, open source language modelling data set that consists of many smaller datasets combined together. The objective is to obtain text from as many modalities as possible to ensure that models trained using The Pile will have much broader generalization abilities. We are currently developing Version 1, with an ultimate goal of 1 TiB of English text. After the completion of Version 1, our next goal is a fully-multilingual, 10TiB text dataset. """ _CITATION = """ """ _DATASET_MODES = ["lm"] _PILE_URL = 'http://eaidata.bmk.sh/data/pile/train/{}.jsonl.zst'
def without_buffer(content):
    """
    Parse ``content`` and build a numpy array from the document's
    ``as_list()`` result, checking that it holds 10001 elements.
    """
    import numpy

    document = simdjson.Parser().parse(content)
    as_array = numpy.array(document.as_list())
    assert len(as_array) == 10001
def with_buffer(content):
    """
    Parse ``content`` and build a numpy array from the document's
    ``as_buffer(of_type='d')`` result, checking that it holds 10001
    elements.
    """
    import numpy

    document = simdjson.Parser().parse(content)
    raw = document.as_buffer(of_type='d')
    as_array = numpy.frombuffer(raw)
    assert len(as_array) == 10001
async def go():
    """
    Start one fit per payload, poll until every operation is done while
    printing a percentage line, download the outputs and build the Fit.

    Names that are not assigned here (``self``, ``payloads``,
    ``num_chains``, ``num_warmup``, ``num_samples``, ``num_thin``,
    ``save_warmup``) are taken from the enclosing scope.

    Raises:
        ValueError: a fit request was answered with status 422.
        RuntimeError: any other unexpected response status, or an
            operation finished with an error result.

    Returns:
        The ``stan.fit.Fit`` instance after every plugin's
        ``on_post_sample`` hook has been applied to it.
    """
    io = ConsoleIO()
    sampling_output = io.section().error_output
    percent_complete = 0
    sampling_output.write_line(
        f"<comment>Sampling:</comment> {percent_complete:3.0f}%")

    # Captures the two integers of a progress message: current and maximum.
    current_and_max_iterations_re = re.compile(
        r"Iteration:\s+(\d+)\s+/\s+(\d+)")
    async with stan.common.HttpstanClient() as client:
        operations = []
        for payload in payloads:
            resp = await client.post(f"/{self.model_name}/fits",
                                     json=payload)
            if resp.status == 422:
                raise ValueError(str(resp.json()))
            elif resp.status != 201:
                raise RuntimeError(resp.json()["message"])
            assert resp.status == 201
            operations.append(resp.json())

        # poll to get progress for each chain until all chains finished
        current_iterations = {}
        while not all(operation["done"] for operation in operations):
            for operation in operations:
                if operation["done"]:
                    continue
                resp = await client.get(f"/{operation['name']}")
                assert resp.status != 404
                operation.update(resp.json())
                progress_message = operation["metadata"].get(
                    "progress")
                if not progress_message:
                    continue
                iteration, iteration_max = map(
                    int,
                    current_and_max_iterations_re.findall(
                        progress_message).pop(0))
                current_iterations[operation["name"]] = iteration
                iterations_count = sum(current_iterations.values())
                total_iterations = iteration_max * num_chains
                percent_complete = 100 * iterations_count / total_iterations
                # Overwrite the previous progress line when the terminal
                # supports it; otherwise start a new line.
                sampling_output.clear() if io.supports_ansi(
                ) else sampling_output.write("\n")
                sampling_output.write_line(
                    f"<comment>Sampling:</comment> {round(percent_complete):3.0f}% ({iterations_count}/{total_iterations})"
                )
            await asyncio.sleep(0.01)

        # True when fewer operations reported progress than there are
        # chains; in that case the final line simply reports 100%.
        fit_in_cache = len(current_iterations) < num_chains

        stan_outputs = []
        for operation in operations:
            fit_name = operation["result"].get("name")
            if fit_name is None:  # operation["result"] is an error
                assert not str(operation["result"]["code"]).startswith(
                    "2"), operation
                message = operation["result"]["message"]
                if """ValueError('Initialization failed.')""" in message:
                    sampling_output.clear()
                    sampling_output.write_line(
                        "<info>Sampling:</info> <error>Initialization failed.</error>"
                    )
                    raise RuntimeError("Initialization failed.")
                raise RuntimeError(message)

            resp = await client.get(f"/{fit_name}")
            if resp.status != 200:
                raise RuntimeError((resp.json())["message"])
            stan_outputs.append(resp.content)

            # clean up after ourselves when fit is uncacheable (no random seed)
            if self.random_seed is None:
                resp = await client.delete(f"/{fit_name}")
                if resp.status not in {200, 202, 204}:
                    raise RuntimeError((resp.json())["message"])

        sampling_output.clear() if io.supports_ansi(
        ) else sampling_output.write("\n")
        sampling_output.write_line(
            "<info>Sampling:</info> 100%, done." if fit_in_cache else
            f"<info>Sampling:</info> {percent_complete:3.0f}% ({iterations_count}/{total_iterations}), done."
        )
        if not io.supports_ansi():
            sampling_output.write("\n")

    stan_outputs = tuple(
        stan_outputs)  # Fit constructor expects a tuple.

    def is_nonempty_logger_message(msg: simdjson.Object):
        # A logger message whose first value is more than the bare prefix.
        return msg["topic"] == "logger" and msg["values"][0] != "info:"

    def is_iteration_or_elapsed_time_logger_message(
            msg: simdjson.Object):
        # Assumes `msg` is a message with topic `logger`.
        text = msg["values"][0]
        return (
            text.startswith("info:Iteration:")
            or text.startswith("info: Elapsed Time:")
            # this detects lines following "Elapsed Time:", part of a multi-line Stan message
            or text.startswith("info:" + " " * 15))

    parser = simdjson.Parser()
    nonstandard_logger_messages = []
    for stan_output in stan_outputs:
        for line in stan_output.splitlines():
            # Do not attempt to parse non-logger messages. Draws could contain nan or inf values.
            # simdjson cannot parse lines containing such values.
            if b'"logger"' not in line:
                continue
            msg = parser.parse(line)
            if is_nonempty_logger_message(
                    msg
            ) and not is_iteration_or_elapsed_time_logger_message(msg):
                # Stored as a plain dict, so the kept message does not
                # depend on the parser deleted below.
                nonstandard_logger_messages.append(msg.as_dict())
    del parser  # simdjson.Parser is no longer used at this point.

    if nonstandard_logger_messages:
        io.error_line(
            "<comment>Messages received during sampling:</comment>")
        for msg in nonstandard_logger_messages:
            # Blank out the severity prefixes before printing.
            text = msg["values"][0].replace("info:", " ").replace(
                "error:", " ")
            if text.strip():
                io.error_line(f"{text}")

    fit = stan.fit.Fit(
        stan_outputs,
        num_chains,
        self.param_names,
        self.constrained_param_names,
        self.dims,
        num_warmup,
        num_samples,
        num_thin,
        save_warmup,
    )

    # Each plugin may replace the fit with its own return value.
    for entry_point in stan.plugins.get_plugins():
        Plugin = entry_point.load()
        fit = Plugin().on_post_sample(fit)
    return fit
def to_sqlite(self,
              database_name="spatial_index.db",
              create_indices=True,
              progress=None):
    """
    Create a sqlite database of labels and filenames
    from the JSON spatial_index for faster performance.

    Depending on the dataset size, this could take a while.
    With a dataset with ~140k index files, the DB took over
    an hour to build and was 42 GB.

    database_name: path of the sqlite database file to write
    create_indices: if set, build the label and fid indices on
      file_lookup once all rows have been inserted
    progress: if None, fall back to self.config.progress
    """
    progress = nvl(progress, self.config.progress)

    conn = sqlite3.connect(database_name)
    try:
        cur = conn.cursor()

        cur.execute(
            """ CREATE TABLE index_files ( id INTEGER PRIMARY KEY AUTOINCREMENT, filename TEXT NOT NULL ) """
        )
        cur.execute("CREATE INDEX idxfname ON index_files (filename)")

        cur.execute(
            """ CREATE TABLE file_lookup ( label INTEGER NOT NULL, fid INTEGER NOT NULL REFERENCES index_files(id), PRIMARY KEY(label,fid) ) """
        )

        # Relax durability settings for the bulk load; they are switched
        # back below once all rows are in.
        cur.execute("PRAGMA journal_mode = MEMORY")
        cur.execute("PRAGMA synchronous = OFF")

        parser = simdjson.Parser()

        for index_files in self.fetch_all_index_files(progress=progress):
            for filename, content in index_files.items():
                index_labels = parser.parse(content).keys()
                filename = os.path.basename(filename)
                cur.execute("INSERT INTO index_files(filename) VALUES (?)",
                            (filename, ))
                cur.execute("SELECT id from index_files where filename = ?",
                            (filename, ))
                fid = cur.fetchone()[0]
                values = ((int(label), fid) for label in index_labels)
                cur.executemany(
                    "INSERT INTO file_lookup(label, fid) VALUES (?,?)",
                    values)
            conn.commit()

        cur.execute("PRAGMA journal_mode = DELETE")
        cur.execute("PRAGMA synchronous = FULL")

        if create_indices:
            if progress:
                print("Creating labels index...")
            cur.execute("CREATE INDEX file_lbl ON file_lookup (label)")

            if progress:
                print("Creating filename index...")
            cur.execute("CREATE INDEX fname ON file_lookup (fid)")
    finally:
        # Release the database handle even if the build fails part-way.
        conn.close()
def __init__(
    self,
    stan_outputs: Tuple[bytes, ...],
    num_chains: int,
    param_names: Tuple[str, ...],
    constrained_param_names: Tuple[str, ...],
    dims: Tuple[Tuple[int, ...]],
    num_warmup: int,
    num_samples: int,
    num_thin: int,
    save_warmup: bool,
) -> None:
    """
    Store the sampling configuration and decode every draw found in
    ``stan_outputs`` into the read-only array ``self._draws``.

    Each element of ``stan_outputs`` holds the newline-separated JSON
    messages of one chain. Messages whose topic is ``"sample"`` and whose
    values form a mapping are treated as draws.

    Raises:
        RuntimeError: if program parameters exist but the last value name
            of the first draw ends with ``__``.
    """
    self.stan_outputs = stan_outputs
    self.num_chains = num_chains
    assert self.num_chains == len(self.stan_outputs)
    self.param_names, self.dims, self.constrained_param_names = (
        param_names,
        dims,
        constrained_param_names,
    )
    self.num_warmup, self.num_samples = num_warmup, num_samples
    self.num_thin, self.save_warmup = num_thin, save_warmup

    # `self.sample_and_sampler_param_names` collects the sample and sampler param names.
    # - "sample params" include `lp__`, `accept_stat__`
    # - "sampler params" include `stepsize__`, `treedepth__`, ...
    # These names are gathered later in this function by inspecting the output from Stan.
    self.sample_and_sampler_param_names: Tuple[str, ...]

    # `np.prod` is used because the `np.product` alias was removed in NumPy 2.0.
    num_flat_params = sum(np.prod(dims_ or 1)
                          for dims_ in dims)  # if dims == [] then it is a scalar
    assert num_flat_params == len(constrained_param_names)
    num_samples_saved = (self.num_samples +
                         self.num_warmup * self.save_warmup) // self.num_thin

    # self._draws holds all the draws. We cannot allocate it before looking at the draws
    # because we do not know how many sampler-specific parameters are present. Later in this
    # function we count them and only then allocate the array for `self._draws`.
    #
    # _draws is an ndarray with shape (num_sample_and_sampler_params + num_flat_params, num_draws, num_chains)
    self._draws: np.ndarray

    parser = simdjson.Parser()
    for chain_index, stan_output in zip(range(self.num_chains),
                                        self.stan_outputs):
        draw_index = 0
        for line in stan_output.splitlines():
            try:
                msg = parser.parse(line)
            except ValueError:
                # Occurs when draws contain an nan or infinity. simdjson cannot parse such values.
                msg = json.loads(line)
            if msg["topic"] == "sample":
                # Ignore sample message which is mixed together with proper draws.
                if not isinstance(msg["values"], (simdjson.Object, dict)):
                    continue

                # for the first draw: collect sample and sampler parameter names.
                if not hasattr(self, "_draws"):
                    feature_names = cast(Tuple[str, ...],
                                         tuple(msg["values"].keys()))
                    self.sample_and_sampler_param_names = tuple(
                        name for name in feature_names
                        if name.endswith("__"))
                    num_rows = len(self.sample_and_sampler_param_names
                                   ) + num_flat_params
                    # column-major order ("F") aligns with how the draws are stored (in cols).
                    self._draws = np.empty(
                        (num_rows, num_samples_saved, num_chains),
                        order="F")
                    # rudimentary check of parameter order (sample & sampler params must be first)
                    if num_flat_params and feature_names[-1].endswith(
                            "__"):
                        raise RuntimeError(
                            f"Expected last parameter name to be one declared in program code, found `{feature_names[-1]}`"
                        )

                draw_row = tuple(msg["values"].values(
                ))  # a "row" of values from a single draw from Stan C++
                self._draws[:, draw_index, chain_index] = draw_row
                draw_index += 1
        assert draw_index == num_samples_saved
    assert self.sample_and_sampler_param_names and self._draws.size
    # Freeze the array so the stored draws cannot be modified afterwards.
    self._draws.flags["WRITEABLE"] = False
async def go():
    """
    Start one fit per payload, poll until every operation is done while
    driving a progress bar, download the outputs and build the Fit.

    Names that are not assigned here (``self``, ``payloads``,
    ``num_chains``, ``num_warmup``, ``num_samples``, ``num_thin``,
    ``save_warmup``) are taken from the enclosing scope.

    Raises:
        ValueError: a fit request was answered with status 422.
        RuntimeError: any other unexpected response status, or an
            operation finished with an error result.

    Returns:
        The ``stan.fit.Fit`` instance after every plugin's
        ``on_post_fit`` hook has been applied to it.
    """
    io = ConsoleIO()
    io.error_line("<info>Sampling...</info>")
    progress_bar = ProgressBar(io)
    progress_bar.set_format("very_verbose")

    # Captures the two integers of a progress message: current and maximum.
    current_and_max_iterations_re = re.compile(
        r"Iteration:\s+(\d+)\s+/\s+(\d+)")
    async with stan.common.HttpstanClient() as client:
        operations = []
        for payload in payloads:
            resp = await client.post(f"/{self.model_name}/fits",
                                     json=payload)
            if resp.status == 422:
                raise ValueError(str(resp.json()))
            elif resp.status != 201:
                raise RuntimeError(resp.json()["message"])
            assert resp.status == 201
            operations.append(resp.json())

        # poll to get progress for each chain until all chains finished
        current_iterations = {}
        while not all(operation["done"] for operation in operations):
            for operation in operations:
                if operation["done"]:
                    continue
                resp = await client.get(f"/{operation['name']}")
                assert resp.status != 404
                operation.update(resp.json())
                progress_message = operation["metadata"].get(
                    "progress")
                if not progress_message:
                    continue
                iteration, iteration_max = map(
                    int,
                    current_and_max_iterations_re.findall(
                        progress_message).pop(0))
                if not progress_bar.get_max_steps(
                ):  # i.e., has not started
                    progress_bar.start(max=iteration_max * num_chains)
                current_iterations[operation["name"]] = iteration
                progress_bar.set_progress(
                    sum(current_iterations.values()))
            await asyncio.sleep(0.01)
        # Sampling has finished. But we do not call `progress_bar.finish()` right
        # now. First we write informational messages to the screen, then we
        # redraw the (complete) progress bar. Only after that do we call `finish`.

        stan_outputs = []
        for operation in operations:
            fit_name = operation["result"].get("name")
            if fit_name is None:  # operation["result"] is an error
                assert not str(operation["result"]["code"]).startswith(
                    "2"), operation
                raise RuntimeError(operation["result"]["message"])

            resp = await client.get(f"/{fit_name}")
            if resp.status != 200:
                raise RuntimeError((resp.json())["message"])
            stan_outputs.append(resp.content)

            # clean up after ourselves when fit is uncacheable (no random seed)
            if self.random_seed is None:
                resp = await client.delete(f"/{fit_name}")
                if resp.status not in {200, 202, 204}:
                    raise RuntimeError((resp.json())["message"])

    stan_outputs = tuple(
        stan_outputs)  # Fit constructor expects a tuple.

    def is_nonempty_logger_message(msg: simdjson.Object):
        # A logger message whose first value is more than the bare prefix.
        return msg["topic"] == "logger" and msg["values"][0] != "info:"

    def is_iteration_or_elapsed_time_logger_message(
            msg: simdjson.Object):
        # Assumes `msg` is a message with topic `logger`.
        text = msg["values"][0]
        return (
            text.startswith("info:Iteration:")
            or text.startswith("info: Elapsed Time:")
            # this detects lines following "Elapsed Time:", part of a multi-line Stan message
            or text.startswith("info:" + " " * 15))

    parser = simdjson.Parser()
    nonstandard_logger_messages = []
    for stan_output in stan_outputs:
        for line in stan_output.splitlines():
            # Do not attempt to parse non-logger messages. Draws could contain nan or inf values.
            # simdjson cannot parse lines containing such values.
            if b'"logger"' not in line:
                continue
            msg = parser.parse(line)
            if is_nonempty_logger_message(
                    msg
            ) and not is_iteration_or_elapsed_time_logger_message(msg):
                # Stored as a plain dict, so the kept message does not
                # depend on the parser deleted below.
                nonstandard_logger_messages.append(msg.as_dict())
    del parser  # simdjson.Parser is no longer used at this point.

    progress_bar.clear()
    io.error("\x08" * progress_bar._last_messages_length
             )  # move left to start of line
    if nonstandard_logger_messages:
        io.error_line(
            "<comment>Messages received during sampling:</comment>")
        for msg in nonstandard_logger_messages:
            # Blank out the severity prefixes before printing.
            text = msg["values"][0].replace("info:", " ").replace(
                "error:", " ")
            if text.strip():
                io.error_line(f"{text}")
    progress_bar.display()  # re-draw the (complete) progress bar
    progress_bar.finish()
    io.error_line("\n<info>Done.</info>")

    fit = stan.fit.Fit(
        stan_outputs,
        num_chains,
        self.param_names,
        self.constrained_param_names,
        self.dims,
        num_warmup,
        num_samples,
        num_thin,
        save_warmup,
    )

    # Each plugin may replace the fit with its own return value.
    for entry_point in stan.plugins.get_plugins():
        Plugin = entry_point.load()
        fit = Plugin().on_post_fit(fit)
    return fit
def _to_sql_common(self,
                   conn,
                   cur,
                   create_indices,
                   progress,
                   mysql_syntax=False):
    """
    Rebuild the index_files and file_lookup tables from the JSON spatial
    index using an already opened connection and cursor.

    Existing tables of those names are dropped first. One commit is issued
    per batch of index files. If create_indices is set, the label and fid
    indices on file_lookup are created at the end. If progress is None,
    self.config.progress is used instead.
    """
    # handle SQLite vs MySQL syntax quirks
    if mysql_syntax:
        bind, autoinc, integer = '%s', "AUTO_INCREMENT", "BIGINT UNSIGNED"
    else:
        bind, autoinc, integer = '?', "AUTOINCREMENT", "INTEGER"

    progress = nvl(progress, self.config.progress)

    for table in ("index_files", "file_lookup"):
        cur.execute(f"DROP TABLE IF EXISTS {table}")

    cur.execute(
        f""" CREATE TABLE index_files ( id {integer} PRIMARY KEY {autoinc}, filename VARCHAR(100) NOT NULL ) """
    )
    cur.execute("CREATE INDEX idxfname ON index_files (filename)")

    cur.execute(
        f""" CREATE TABLE file_lookup ( label {integer} NOT NULL, fid {integer} NOT NULL REFERENCES index_files(id), PRIMARY KEY(label,fid) ) """
    )

    insert_file_sql = f"INSERT INTO index_files(filename) VALUES ({bind})"
    select_id_sql = f"SELECT id from index_files where filename = {bind}"
    insert_lookup_sql = (
        f"INSERT INTO file_lookup(label, fid) VALUES ({bind},{bind})")

    json_parser = simdjson.Parser()

    for batch in self.fetch_all_index_files(progress=progress):
        for path, content in batch.items():
            index_labels = json_parser.parse(content).keys()
            basename = os.path.basename(path)

            cur.execute(insert_file_sql, (basename, ))
            cur.execute(select_id_sql, (basename, ))
            file_id = cur.fetchone()[0]

            rows = ((int(label), file_id) for label in index_labels)
            if mysql_syntax:
                rows = list(rows)  # doesn't support generators in v8.0.26
            cur.executemany(insert_lookup_sql, rows)
        conn.commit()

    if not create_indices:
        return

    if progress:
        print("Creating labels index...")
    cur.execute("CREATE INDEX file_lbl ON file_lookup (label)")

    if progress:
        print("Creating filename index...")
    cur.execute("CREATE INDEX fname ON file_lookup (fid)")