def test_sketch_from_sequence(sketches_json):
    """Check that a sketch survives a sequence round-trip.

    Each sample sketch is converted to a construction sequence, rebuilt from
    that sequence, and converted to a sequence again.  The two sequences must
    have the same length and be element-wise equal.
    """
    for sketch_json in sketches_json:
        sketch = Sketch.from_fs_json(sketch_json)
        seq = sketch_to_sequence(sketch)

        sketch2 = sketch_from_sequence(seq)
        # The second sequence must come from the *rebuilt* sketch; deriving it
        # from the original sketch would compare `seq` with itself and make
        # the test pass regardless of what `sketch_from_sequence` does.
        seq2 = sketch_to_sequence(sketch2)

        assert len(seq) == len(seq2)
        for op1, op2 in zip(seq, seq2):
            assert op1 == op2
def test_sketch_from_sequence(sketches_json):
    """Round-trip each sample sketch through its sequence representation.

    Sketches are loaded with ``include_external_constraints=False``.  Any
    sketch whose sequence contains an operation labelled with one of
    ``_UNSUPPORTED_CONSTRAINTS`` is skipped; for the rest, the sequence
    obtained from the rebuilt sketch must match the original one.
    """
    for raw_sketch in sketches_json:
        original = Sketch.from_fs_json(
            raw_sketch, include_external_constraints=False)
        expected_ops = sketch_to_sequence(original)

        has_unsupported = any(
            op.label in _UNSUPPORTED_CONSTRAINTS for op in expected_ops)
        if not has_unsupported:
            rebuilt = sketch_from_sequence(expected_ops)
            actual_ops = sketch_to_sequence(rebuilt)

            assert len(expected_ops) == len(actual_ops)
            for expected_op, actual_op in zip(expected_ops, actual_ops):
                assert expected_op == actual_op
        # else: not-yet-supported constraints, nothing to check for now.
def _worker(paths_queue, processed_sketches, max_sketches, sketch_counter):
    """Consume batches of file paths and emit flattened sketch batches.

    Parameters
    ----------
    paths_queue
        Queue-like object whose ``get()`` returns an iterable of paths, or
        ``None`` as a sentinel telling the worker to stop.
    processed_sketches
        Queue-like object receiving one ``(offsets, data)`` tuple per batch,
        followed by a final statistics dictionary.
    max_sketches : int or None
        Stop picking up new batches once ``sketch_counter.value`` reaches this
        number.  ``None`` disables the limit.
    sketch_counter
        Shared counter exposing ``value`` and ``get_lock()``
        (presumably a ``multiprocessing.Value`` -- confirm against caller).

    Notes
    -----
    The last item put on ``processed_sketches`` is a dictionary with the keys
    ``'num_filtered'`` and ``'num_invalid'``.
    """
    num_filtered = 0
    num_invalid = 0

    while max_sketches is None or sketch_counter.value < max_sketches:
        paths = paths_queue.get()

        if paths is None:
            # Sentinel: no more work for this worker.
            break

        sketches = []

        for path in paths:
            sketch_list = _load_json(path)

            for sketch_json in sketch_list:
                try:
                    sketch = Sketch.from_fs_json(sketch_json)
                except Exception as err:
                    # Best-effort processing: report the failure and move on.
                    num_invalid += 1
                    print('Error processing sketch in file {0}'.format(path))
                    traceback.print_exception(type(err), err, err.__traceback__)
                    # Without this, `sketch` would be unbound (NameError) or
                    # still refer to the previous, already handled sketch.
                    continue

                if filter_sketch(sketch):
                    num_filtered += 1
                    continue

                sketches.append(sketch)

        offsets, data = flat_array.raw_list_flat(sketches)
        processed_sketches.put((offsets, data))

        with sketch_counter.get_lock():
            sketch_counter.value += len(sketches)

    # Report the per-worker statistics once all batches are done.
    processed_sketches.put({
        'num_filtered': num_filtered,
        'num_invalid': num_invalid
    })
def load_json_tarball(path):
    """Loads a json tarball as an iterable of sketches.

    The file at `path` is read as a zstd-compressed tar stream.  Every
    regular, non-empty member is decoded as a JSON list of sketches.

    Parameters
    ----------
    path : str
        A path to the location of a single shard

    Yields
    ------
    tuple
        Pairs ``((document_id, part_id, index), sketch)`` where
        ``document_id`` and ``part_id`` are obtained from the member name via
        ``parse_sketch_id``, ``index`` is the position of the sketch within
        that member's JSON list, and ``sketch`` is the `Sketch` built from
        the corresponding JSON entry.

    Raises
    ------
    ValueError
        If the content of a member cannot be decoded as JSON.  The original
        ``json.JSONDecodeError`` is attached as the cause.
    """
    with open(path, 'rb') as base_file:
        dctx = zstd.ZstdDecompressor()
        with dctx.stream_reader(base_file) as tarball:
            # 'r|' opens the archive as a forward-only stream, so members
            # must be consumed in order as they are encountered.
            with tarfile.open(fileobj=tarball, mode='r|') as directory:
                for json_file in directory:
                    if not json_file.isfile():
                        continue

                    document_id, part_id = parse_sketch_id(json_file.name)
                    data = directory.extractfile(json_file).read()
                    if not data:
                        # skip empty files
                        continue

                    try:
                        sketches_json = json.loads(data)
                    except json.JSONDecodeError as exc:
                        raise ValueError(
                            'Error decoding JSON for document {0} part {1}.'.
                            format(document_id, part_id)) from exc

                    for i, sketch_json in enumerate(sketches_json):
                        yield (document_id, part_id, i), Sketch.from_fs_json(sketch_json)
def sketches(sketches_json):
    """Build the list of sample `Sketch` objects from their JSON form."""
    parsed = []
    for raw_sketch in sketches_json:
        parsed.append(Sketch.from_fs_json(raw_sketch))
    return parsed
def test_sketch_from_json(sketches_json):
    """Smoke test: every sample JSON entry can be turned into a `Sketch`.

    Nothing is asserted about the result; the test only fails if
    ``Sketch.from_fs_json`` raises.
    """
    for raw_sketch in sketches_json:
        _ = Sketch.from_fs_json(raw_sketch)
def test_plot_sketch(sketches_json):
    """Render up to the first ten sample sketches.

    ``render_sketch`` must return something other than ``None`` for each.
    """
    for raw_sketch in sketches_json[:10]:
        sketch = Sketch.from_fs_json(raw_sketch)
        figure = render_sketch(sketch)
        assert figure is not None
def test_sequence_from_sketch(sketches_json):
    """Smoke test: every sample sketch can be converted to a sequence.

    The resulting sequence is not inspected; the test only fails if loading
    the sketch or calling ``sketch_to_sequence`` raises.
    """
    for raw_sketch in sketches_json:
        sketch_to_sequence(Sketch.from_fs_json(raw_sketch))
def sketches(sketches_json):
    """Convert each JSON entry of `sketches_json` into a `Sketch`, in order."""
    result = []
    for entry in sketches_json:
        sketch = Sketch.from_fs_json(entry)
        result.append(sketch)
    return result