def merge_objects(self, objects):
    r = JSONValueRewriter(self.prev_post_sales_map)
    for k in list(objects.keys()):
        data = objects[k]
        updated = r.rewrite(data)
        ident = updated['id']
        if k != ident:
            if ident in objects:
                # the rewritten identifier collides with an existing record;
                # merge the two and store the result under the new identifier
                read = reader.Reader()
                m = read.read(json.dumps(objects[ident]))
                n = read.read(json.dumps(updated))
                merger = CromObjectMerger()
                m = merger.merge(m, n)
                objects[ident] = json.loads(factory.toString(m, False))
            else:
                objects[ident] = updated
            del objects[k]
def merge(self, model_object, fn):
    r = reader.Reader()
    merger = self.merger
    with open(fn, 'r') as fh:
        content = fh.read()
        try:
            m = r.read(content)
            if m == model_object:
                return None
            else:
                merger.merge(m, model_object)
                return m
        except model.DataError as e:
            print(f'Exception caught while merging data from {fn} ({str(e)}):')
            print(factory.toString(model_object, False))
            print(content)
            raise
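# A minimal sketch (not part of the pipeline) of the merge pattern the methods
# above rely on: parse both serializations with cromulent's reader, combine
# them with CromObjectMerger, and re-serialize with factory.toString. The
# record contents below are illustrative only.
import json

from cromulent import reader
from cromulent.model import factory
from pipeline.util import CromObjectMerger

a_json = '{"id": "urn:example:1", "type": "Person", "label": "John Smith"}'
b_json = '{"id": "urn:example:1", "type": "Person", "referred_to_by": [{"type": "LinguisticObject", "content": "a note"}]}'

read = reader.Reader()
merger = CromObjectMerger()
merged = merger.merge(read.read(a_json), read.read(b_json))
print(factory.toString(merged, False))  # one record with both the label and the note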
def __call__(self, data: dict, *args, **kwargs):
    d = data['_OUTPUT']
    dd = json.loads(d)
    dr = data['_ARCHES_MODEL']
    if dr not in self.output:
        self.output[dr] = {}
    uu = data.get('uuid')
    if 'id' in dd:
        uu = hashlib.sha256(dd['id'].encode('utf-8')).hexdigest()
    elif not uu and 'uri' in data:
        # no UUID in the top-level resource; use a hash of the top-level URI
        uu = hashlib.sha256(data['uri'].encode('utf-8')).hexdigest()
    if not uu:
        # no UUID or URI available; fall back to an assigned UUID for the filename
        uu = str(uuid.uuid4())
    fn = '%s.json' % uu
    data = json.loads(d)
    if fn in self.output[dr]:
        # a record with this filename already exists; merge the two
        r = reader.Reader()
        model_object = r.read(d)
        merger = self.merger
        content = self.output[dr][fn]
        try:
            m = r.read(content)
            if m == model_object:
                self.output[dr][fn] = data
                return
            else:
                merger.merge(m, model_object)
                self.output[dr][fn] = json.loads(factory.toString(m, False))
                return
        except model.DataError:
            print(f'Exception caught while merging data from {fn}:')
            print(d)
            print(content)
            raise
    else:
        self.output[dr][fn] = data
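# Hedged illustration of the filename scheme used above: when a top-level 'id'
# is present, the output filename is a SHA-256 digest of it, so every record
# describing the same resource maps to the same file and gets merged. The
# identifier below is hypothetical.
import hashlib

ident = 'urn:example:object/42'  # hypothetical top-level 'id' value
fn = '%s.json' % hashlib.sha256(ident.encode('utf-8')).hexdigest()
print(fn)  # deterministic: the same id always yields the same filename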
def test_read(self):
    self.assertRaises(DataError, self.reader.read, "")
    self.assertRaises(DataError, self.reader.read, "This is not JSON")
    self.assertRaises(DataError, self.reader.read, "{}")
    whostr = '{"type": "Person", "label": "me"}'
    self.assertTrue(isinstance(self.reader.read(whostr), Person))
    whostr = '{"@context": "fishbat", "type": "Person", "label": "me"}'
    self.assertTrue(isinstance(self.reader.read(whostr), Person))
    levelstr = '{"type": "Person", "parent_of": {"type": "Person", "label": "child"}}'
    self.assertTrue(isinstance(self.reader.read(levelstr).parent_of, Person))
    basestr = '{"label": "base"}'
    self.assertTrue(isinstance(self.reader.read(basestr), BaseResource))
    unknown = '{"type": "FishBat"}'
    self.assertRaises(DataError, self.reader.read, unknown)
    unknown2 = '{"type": "Person", "fishbat": "bob"}'
    self.assertRaises(DataError, self.reader.read, unknown2)
    # somewhere else, rdf_value may have been added; remove it if present
    try:
        del Dimension._properties['value']
    except KeyError:
        pass
    value = '{"type": "Dimension", "value": 100}'
    self.assertRaises(DataError, self.reader.read, value)
    r2 = reader.Reader(rdf_value=True)
    d = r2.read(value)
    self.assertEqual(d.value, 100)
def setUp(self):
    self.reader = reader.Reader()
    # ensure we can use parent_of
    override_okay(Person, 'parent_of')
def _rewrite_output_files(files, r, update_filename, worker_id, total_workers, kwargs):
    if not files:
        return
    print(f'rewrite worker partition {worker_id} called with {len(files)} files [{files[0]} .. {files[-1]}]')
    start = time.time()
    rewritten_count = 0
    processed_count = 0
    ignore_errors = kwargs.get('ignore_errors', False)
    for f in files:
        processed_count += 1
        with open(f) as data_file:
            try:
                content = data_file.read()
                if 'content_filter_re' in kwargs:
                    filter_re = kwargs['content_filter_re']
                    if not re.search(filter_re, content):
                        # skip files whose content does not match the filter
                        continue
                data = json.loads(content)
            except json.decoder.JSONDecodeError:
                sys.stderr.write(f'Failed to load JSON during rewriting of {f}\n')
                if ignore_errors:
                    continue
                else:
                    raise
        d = r.rewrite(data, file=f)
        if update_filename:
            newfile = filename_for(d, original_filename=f, **kwargs)
        else:
            newfile = f
        if d == data and f == newfile:
            # nothing changed; do not rewrite the file
            continue
        if newfile != f and os.path.exists(newfile):
            # the rewritten data moves to a filename that already exists;
            # merge the two records before writing
            read = reader.Reader()
            merger = CromObjectMerger()
            with open(newfile, 'r') as fh:
                existing = fh.read()
                try:
                    m = read.read(existing)
                    n = read.read(d)
                    merger.merge(m, n)
                # catch any error (not just model.DataError) so that
                # ignore_errors can skip unmergeable records
                except Exception as e:
                    print(f'Exception caught while merging data from {newfile} ({str(e)}):')
                    print(d)
                    print(existing)
                    if ignore_errors:
                        continue
                    else:
                        raise
            d = json.loads(factory.toString(m, False))
        with open(newfile, 'w') as data_file:
            rewritten_count += 1
            json.dump(d, data_file, indent=2, ensure_ascii=False)
        if newfile != f:
            os.remove(f)
    elapsed = time.time() - start
    if rewritten_count:
        print(f'worker partition {worker_id}/{total_workers} finished with {rewritten_count}/{processed_count} files rewritten in {elapsed:.1f}s')
    else:
        print(f'worker partition {worker_id}/{total_workers} finished in {elapsed:.1f}s')
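# A minimal sketch of driving the worker above in a single process, assuming
# only the interface visible in the loop: a rewriter exposing
# rewrite(data, file=...). IdentityRewriter and the 'output' directory are
# illustrative; with update_filename=False, filename_for() is never called.
from pathlib import Path

class IdentityRewriter:
    def rewrite(self, data, file=None):
        return data  # no-op: leaves every record (and filename) unchanged

files = sorted(str(p) for p in Path('output').rglob('*.json'))
_rewrite_output_files(files, IdentityRewriter(), update_filename=False,
                      worker_id=0, total_workers=1, kwargs={})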
import sys
from collections import defaultdict, Counter
from pathlib import Path

from settings import output_file_path
from pipeline.util import CromObjectMerger
from cromulent.model import factory
from cromulent import model, vocab, reader

vocab.conceptual_only_parts()
vocab.add_linked_art_boundary_check()
vocab.add_attribute_assignment_check()

path = sys.argv[1] if len(sys.argv) > 1 else output_file_path
files = sorted(Path(path).rglob('*.json'))
seen = {}
read = reader.Reader()
coalesce_count = 0

print(f'Coalescing JSON files in {path} ...')
counter = Counter()
files_by_id = defaultdict(list)
for filename in files:
    p = Path(filename)
    id = p.name
    counter[id] += 1
    files_by_id[id].append(p)

# handle each filename that appears in more than one place in the output tree
for id in sorted(counter):
    count = counter[id]
    if count > 1:
        files = files_by_id[id]
        for filename in files:
def setUp(self):
    self.reader = reader.Reader()
    # ensure we can use parent_of
    Person._properties['parent_of']['okayToUse'] = 1