예제 #1
0
 def merge_objects(self, objects):
     """Re-key entries of ``objects`` whose rewritten 'id' differs from their key.

     Each value is passed through a ``JSONValueRewriter`` built from
     ``self.prev_post_sales_map``. When the rewritten value's 'id' is not the
     key it is stored under, the value moves to the new key; if that key is
     already occupied, the two records are merged with ``CromObjectMerger``.
     Entries whose key already matches are left untouched (the rewritten
     value is not stored back). ``objects`` is modified in place.
     """
     rewriter = JSONValueRewriter(self.prev_post_sales_map)
     # Iterate over a snapshot of the keys because the dict is mutated below.
     for old_key in list(objects):
         rewritten = rewriter.rewrite(objects[old_key])
         new_key = rewritten['id']
         if old_key == new_key:
             continue
         if new_key not in objects:
             objects[new_key] = rewritten
         else:
             # Collision: round-trip both records through the reader so they
             # can be merged as model objects, then serialize back to JSON.
             parser = reader.Reader()
             existing = parser.read(json.dumps(objects[new_key]))
             incoming = parser.read(json.dumps(rewritten))
             merged = CromObjectMerger().merge(existing, incoming)
             objects[new_key] = json.loads(factory.toString(merged, False))
         del objects[old_key]
예제 #2
0
파일: file.py 프로젝트: kasei/pipeline
	def merge(self, model_object, fn):
		"""Merge ``model_object`` into the object stored in file ``fn``.

		Returns ``None`` when the stored object already equals
		``model_object``; otherwise returns the stored object after
		``self.merger`` has merged ``model_object`` into it. A
		``model.DataError`` raised while reading or merging is reported on
		stdout (together with both payloads) and then re-raised.
		"""
		parser = reader.Reader()
		merger = self.merger
		with open(fn, 'r') as source:
			content = source.read()
			try:
				existing = parser.read(content)
				if existing == model_object:
					# Nothing new to merge.
					return None
				merger.merge(existing, model_object)
				return existing
			except model.DataError as e:
				print(f'Exception caught while merging data from {fn} ({str(e)}):')
				print(factory.toString(model_object, False))
				print(content)
				raise
예제 #3
0
    def __call__(self, data: dict, *args, **kwargs):
        """Store the JSON document in ``data['_OUTPUT']`` in ``self.output``.

        Documents are grouped by ``data['_ARCHES_MODEL']`` and keyed by a
        ``<name>.json`` file name. The name is, in order of preference: the
        SHA-256 hex digest of the document's 'id', the 'uuid' entry of
        ``data``, the SHA-256 hex digest of ``data['uri']``, or a freshly
        generated UUID4. If an entry with the same file name already exists,
        the two are merged with ``self.merger``; a ``model.DataError`` raised
        during that merge is reported on stdout and re-raised.
        """
        serialized = data['_OUTPUT']
        document = json.loads(serialized)
        model_name = data['_ARCHES_MODEL']
        bucket = self.output.setdefault(model_name, {})

        # Pick the identifier used for the output file name.
        if 'id' in document:
            identifier = hashlib.sha256(
                document['id'].encode('utf-8')).hexdigest()
        else:
            identifier = data.get('uuid')
            if not identifier and 'uri' in data:
                identifier = hashlib.sha256(
                    data['uri'].encode('utf-8')).hexdigest()
        if not identifier:
            identifier = str(uuid.uuid4())
        fn = '%s.json' % identifier

        if fn not in bucket:
            bucket[fn] = document
            return

        # An entry with this name already exists: merge the new one into it.
        parser = reader.Reader()
        incoming = parser.read(serialized)
        merger = self.merger
        content = bucket[fn]
        try:
            existing = parser.read(content)
            if existing == incoming:
                bucket[fn] = document
            else:
                merger.merge(existing, incoming)
                bucket[fn] = json.loads(factory.toString(existing, False))
        except model.DataError:
            print(f'Exception caught while merging data from {fn}:')
            print(serialized)
            print(content)
            raise
예제 #4
0
    def test_read(self):
        """Check Reader.read against malformed, valid, nested and unknown input."""
        # Empty input, non-JSON text and an empty object are all rejected.
        self.assertRaises(DataError, self.reader.read, "")
        self.assertRaises(DataError, self.reader.read, "This is not JSON")
        self.assertRaises(DataError, self.reader.read, "{}")

        whostr = '{"type": "Person", "label": "me"}'
        self.assertIsInstance(self.reader.read(whostr), Person)

        # An "@context" entry does not prevent the object from being read.
        whostr = '{"@context": "fishbat", "type": "Person", "label": "me"}'
        self.assertIsInstance(self.reader.read(whostr), Person)

        # Nested objects are constructed as model instances too.
        levelstr = '{"type": "Person", "parent_of": {"type": "Person", "label": "child"}}'
        self.assertIsInstance(self.reader.read(levelstr).parent_of, Person)

        # Without a "type" the reader produces a BaseResource.
        basestr = '{"label": "base"}'
        self.assertIsInstance(self.reader.read(basestr), BaseResource)

        # Unknown class name.
        unknown = '{"type":"FishBat"}'
        self.assertRaises(DataError, self.reader.read, unknown)

        # Unknown property on a known class (previously this re-tested
        # `unknown`, so this case was never actually exercised).
        unknown2 = '{"type":"Person", "fishbat": "bob"}'
        self.assertRaises(DataError, self.reader.read, unknown2)

        # somewhere else, rdf_value has been added; drop it if present so the
        # default reader rejects "value" below.
        Dimension._properties.pop('value', None)

        value = '{"type": "Dimension", "value": 100}'
        self.assertRaises(DataError, self.reader.read, value)

        # A reader created with rdf_value=True accepts the same document.
        r2 = reader.Reader(rdf_value=True)
        d = r2.read(value)
        self.assertEqual(d.value, 100)
예제 #5
0
	def setUp(self):
		"""Give each test a fresh ``reader.Reader`` as ``self.reader``."""
		self.reader = reader.Reader()
		# ensure we can use parent_of
		override_okay(Person, 'parent_of')
예제 #6
0
def _rewrite_output_files(files, r, update_filename, worker_id, total_workers,
                          kwargs):
    """Rewrite a partition of JSON files in place and report progress.

    Each file is parsed as JSON and passed to ``r.rewrite(data, file=f)``.
    If the result differs from the input (or maps to a different file name),
    it is written back as indented JSON. When the target name differs from
    the original, an already-existing target is merged with the rewritten
    data via ``CromObjectMerger`` and the original file is removed.

    Parameters:
        files: sequence of file paths to process; nothing happens if empty.
        r: object providing ``rewrite(data, file=...)``.
        update_filename: when true, the target name is computed with
            ``filename_for(d, original_filename=f, **kwargs)``.
        worker_id, total_workers: only used in progress messages.
        kwargs: options dict. ``'ignore_errors'`` (default ``False``) makes
            JSON decoding and merge failures skip the file instead of
            raising; ``'content_filter_re'``, if present, skips files whose
            text does not match the pattern. The dict is also forwarded to
            ``filename_for``.

    Raises:
        json.JSONDecodeError: for unparsable files unless errors are ignored.
        Exception: whatever a failed merge raises, unless errors are ignored.
    """
    if not files:
        return
    print(
        f'rewrite worker partition {worker_id} called with {len(files)} files [{files[0]} .. {files[-1]}]'
    )
    start = time.time()
    rewritten_count = 0
    processed_count = 0
    ignore_errors = kwargs.get('ignore_errors', False)
    has_filter = 'content_filter_re' in kwargs
    for f in files:
        processed_count += 1
        # JSON is read and written explicitly as UTF-8: output uses
        # ensure_ascii=False, so the locale default encoding is not safe.
        with open(f, encoding='utf-8') as data_file:
            text = data_file.read()
        if has_filter and not re.search(kwargs['content_filter_re'], text):
            continue
        try:
            data = json.loads(text)
        except json.JSONDecodeError:
            sys.stderr.write(
                f'Failed to load JSON during rewriting of {f}\n')
            if ignore_errors:
                continue
            raise

        d = r.rewrite(data, file=f)
        if update_filename:
            newfile = filename_for(d, original_filename=f, **kwargs)
        else:
            newfile = f
        if d == data and f == newfile:
            # nothing changed; do not rewrite the file
            continue

        if newfile != f and os.path.exists(newfile):
            # The target already exists: merge instead of overwriting it.
            read = reader.Reader()
            merger = CromObjectMerger()
            with open(newfile, 'r', encoding='utf-8') as fh:
                content = fh.read()
            try:
                m = read.read(content)
                n = read.read(d)
                merger.merge(m, n)
            # Deliberately broad: any failure while merging is reported with
            # both payloads before being skipped or re-raised.
            except Exception as e:
                print(
                    f'Exception caught while merging data from {newfile} ({str(e)}):'
                )
                print(d)
                print(content)
                if ignore_errors:
                    continue
                raise
            d = json.loads(factory.toString(m, False))

        with open(newfile, 'w', encoding='utf-8') as data_file:
            json.dump(d, data_file, indent=2, ensure_ascii=False)
        rewritten_count += 1
        if newfile != f:
            os.remove(f)

    elapsed = time.time() - start
    # Format the duration inside the f-string; applying % to an already
    # interpolated string breaks if an interpolated value contains '%'.
    if rewritten_count:
        print(
            f'worker partition {worker_id}/{total_workers} finished with {rewritten_count}/{processed_count} files rewritten in {elapsed:.1f}s'
        )
    else:
        print(
            f'worker partition {worker_id}/{total_workers} finished in {elapsed:.1f}s'
        )
예제 #7
0
from collections import defaultdict, Counter

from settings import output_file_path
from pipeline.util import CromObjectMerger
from cromulent.model import factory
from cromulent import model, vocab, reader

# Call cromulent vocab setup hooks before any objects are read.
# NOTE(review): the exact effect of each call is defined in cromulent.vocab — confirm there.
vocab.conceptual_only_parts()
vocab.add_linked_art_boundary_check()
vocab.add_attribute_assignment_check()

# Directory to scan: the first command-line argument if given, otherwise the
# configured output path. All *.json files below it are collected, sorted.
path = sys.argv[1] if len(sys.argv) > 1 else output_file_path
files = sorted(Path(path).rglob('*.json'))
# NOTE(review): `seen` is not used in the visible part of this script.
seen = {}

read = reader.Reader()
coalesce_count = 0
print(f'Coalescing JSON files in {path} ...')
# Group the files by base name and count how many files share each name.
counter = Counter()
files_by_id = defaultdict(list)
for filename in files:
    p = Path(filename)
    # NOTE(review): `id` shadows the builtin of the same name.
    id = p.name
    counter[id] += 1
    files_by_id[id].append(p)

for id in sorted(counter):
    count = counter[id]
    if count > 1:
        files = files_by_id[id]
        for filename in files:
예제 #8
0
 def setUp(self):
     """Create a fresh reader and mark ``Person.parent_of`` as okay to use."""
     self.reader = reader.Reader()
     # The tests rely on parent_of, so set its okayToUse entry directly.
     parent_of_spec = Person._properties['parent_of']
     parent_of_spec['okayToUse'] = 1