def test_mosdef_only(testresourcepath, expected_modout1):
    '''
    Fingerprint-only pipeline: copy the MusicAlbum resource only if its
    byArtist chain reaches the Mos Def record, and copy every Person.
    '''
    # NOTE(review): the original set up and parsed `modin` twice in a row;
    # the first model was immediately discarded, so the duplicate is removed.
    modin = newmodel()
    literate.parse(INPUT_GRAPH_1, modin)

    FINGERPRINT_RULES = {
        # Only materialize the album whose byArtist points at DOC_NS('md')
        SCH_NS('MusicAlbum'): (
            if_(contains(follow(SCH_NS('byArtist')), DOC_NS('md')),
                materialize(COPY()))
        ),

        SCH_NS('Person'): (
            materialize(COPY())
        ),
    }

    # No transform or labelize rules: fingerprinting only
    ppl = generic_pipeline(FINGERPRINT_RULES, {}, {})

    modout = ppl.run(input_model=modin)
    # Use -s to see this
    print('=' * 10, 'test_mosdef_only', '=' * 10)
    literate.write(modout)
    # import pprint; pprint.pprint(list(iter(modout)))

    assert len(modout) == 17
    assert len(list(util.all_origins(modout, only_types={SCH_NS('MusicAlbum')}))) == 1
    assert len(list(util.all_origins(modout, only_types={SCH_NS('Person')}))) == 3
def main(source):
    'Transform CSV SOURCE file to BF Lite in Versa'
    ppl = csv_bibframe_pipeline()

    # Accumulate every non-empty per-row model into a single input model
    input_model = newmodel()
    with open(source) as csvfp:
        for row_model in csv.parse_iter(csvfp, VLITERATE_TEMPLATE):
            if row_model:
                input_model.update(row_model)

    # Debug print of input model
    # literate.write([input_model], out=sys.stdout)
    output_model = ppl.run(input_model=input_model)

    print('Low level JSON dump of output data model: ')
    util.jsondump(output_model, sys.stdout)
    print('\n')  # 2 CRs
    print('Versa literate form of output: ')
    literate.write(output_model, out=sys.stdout)

    print('Diagram from extracted a sample: ')
    # Flatten the fingerprinted resource lists into one candidate pool
    out_resources = []
    for vs in ppl.fingerprints.values():
        out_resources.extend(vs)
    ITYPE = BF_NS('Instance')
    # Keep only the resources typed as BF Instances
    instances = [r for r in out_resources
                 if ITYPE in util.resourcetypes(output_model, r)]
    # Zoom in on one randomly chosen instance, two hops deep
    zoomed, _ = util.zoom_in(output_model, random.choice(instances), depth=2)
    mermaid.write(zoomed)
def test_basics_1(testresourcepath, expected_modout1):
    '''
    Schema.org Book -> BF Instance/Person pipeline, with an
    @new-entity-hook that tags every materialized resource.
    '''
    modin = newmodel()
    modin_fpath = 'schemaorg/catcherintherye-ugly.md'
    literate.parse(
        open(os.path.join(testresourcepath, modin_fpath)).read(), modin)

    FINGERPRINT_RULES = {
        SCH_NS('Book'): (materialize(
            BF_NS('Instance'),
            fprint=[
                (BF_NS('isbn'), follow(SCH_NS('isbn'))),
            ],
        ))
    }

    TRANSFORM_RULES = {
        SCH_NS('name'): link(rel=BF_NS('name')),

        SCH_NS('author'):
        materialize(BF_NS('Person'),
                    BF_NS('creator'),
                    vars={
                        'birthDate': follow(SCH_NS('authorBirthDate'),
                                            origin=var('input-resource'))
                    },
                    fprint=[
                        (BF_NS('name'), target()),
                        (BF_NS('birthDate'), var('birthDate')),
                    ],
                    links=[
                        (BF_NS('name'), target()),
                        (BF_NS('birthDate'), var('birthDate')),
                    ]),
    }

    modout = newmodel()

    def new_entity_hook(eid):
        # Add a triple to each materialized resource
        modout.add(eid, 'http://example.org/materializedBy', 'py.test')
        return

    # Thread the hook through the pipeline context extras
    ctxextras = {'@new-entity-hook': new_entity_hook}
    root_ctx = DUMMY_CONTEXT.copy(output_model=modout, extras=ctxextras)
    ppl = generic_pipeline(FINGERPRINT_RULES, TRANSFORM_RULES,
                           LABELIZE_RULES, root_ctx=root_ctx)

    ppl.run(input_model=modin, output_model=modout)
    # Use -s to see this
    print('=' * 10, 'test_basics_1', '=' * 10)
    literate.write(modout)

    # Two resources are materialized (Instance + Person), so two hook triples
    assert len(
        list(modout.match(None, 'http://example.org/materializedBy',
                          None))) == 2
def main(source):
    'Transform CSV SOURCE file to Schema.org in Versa'
    ppl = csv_schema_pipeline()

    # Parse the whole CSV straight into one input model
    input_model = newmodel()
    with open(source) as csvfp:
        csv.parse(csvfp, VLITERATE_TEMPLATE, input_model)

    # Debug print of input model
    # literate.write([input_model], out=sys.stdout)
    output_model = ppl.run(input_model=input_model)

    print('Resulting record Fingerprints:', ppl.fingerprints)
    print('Low level JSON dump of output data model: ')
    util.jsondump(output_model, sys.stdout)
    print('Versa literate form of output: ')
    literate.write(output_model, out=sys.stdout)
def test_basics_1(testresourcepath, expected_modout1):
    '''
    Basic Schema.org Book -> BF Instance/Person conversion,
    checking output size, resource counts and the birthDate link.
    '''
    modin = newmodel()
    modin_fpath = 'schemaorg/catcherintherye-ugly.md'
    literate.parse(
        open(os.path.join(testresourcepath, modin_fpath)).read(), modin)

    FINGERPRINT_RULES = {
        SCH_NS('Book'): (
            materialize(BF_NS('Instance'),
                        fprint=[
                            (BF_NS('isbn'), follow(SCH_NS('isbn'))),
                        ],
                        )
        )
    }

    TRANSFORM_RULES = {
        SCH_NS('name'): link(rel=BF_NS('name')),

        SCH_NS('author'):
        materialize(BF_NS('Person'),
                    BF_NS('creator'),
                    vars={
                        'birthDate': follow(SCH_NS('authorBirthDate'),
                                            origin=var('input-resource'))
                    },
                    fprint=[
                        (BF_NS('name'), target()),
                        (BF_NS('birthDate'), var('birthDate')),
                    ],
                    links=[
                        (BF_NS('name'), target()),
                        (BF_NS('birthDate'), var('birthDate')),
                    ]),
    }

    ppl = generic_pipeline(FINGERPRINT_RULES, TRANSFORM_RULES, LABELIZE_RULES)

    modout = ppl.run(input_model=modin)
    # Use -s to see this
    print('=' * 10, 'test_basics_1', '=' * 10)
    literate.write(modout)

    assert len(modout) == 8
    assert len(list(util.all_origins(modout, only_types={BF_NS('Instance')}))) == 1
    assert len(list(util.all_origins(modout, only_types={BF_NS('Person')}))) == 1
    assert len(list(modout.match(None, BF_NS('birthDate'), '1919-01-01'))) == 1
def test_basics_2(testresourcepath):
    '''
    Nested materialization: the Book fingerprint also materializes a
    linked Work, and transform rules are keyed by output resource type.
    '''
    modin = newmodel()
    modin_fpath = 'schemaorg/catcherintherye-ugly.md'
    literate.parse(
        open(os.path.join(testresourcepath, modin_fpath)).read(), modin)

    FINGERPRINT_RULES = {
        SCH_NS('Book'): (
            materialize(var('itype'),
                        fprint=[
                            (BF_NS('isbn'), follow(SCH_NS('isbn'))),
                        ],
                        links=[
                            (BF_NS('instantiates'),
                             materialize(BF_NS('Work'),
                                         fprint=[
                                             (BF_NS('name'), follow(SCH_NS('title'))),
                                             (BF_NS('creator'), follow(SCH_NS('author'))),
                                             (BF_NS('language'), var('lang')),
                                         ],
                                         links=[('http://instantiated-by', var('@stem'))],
                                         attach=False  # Can remove when we have smart sessions to avoid duplicate instantiates links
                                         ),
                             )
                        ],
                        # Not really necessary; just testing vars in this scenario
                        vars={
                            'lang': follow(SCH_NS('inLanguage')),
                            'itype': BF_NS('Instance')
                        }
                        )
        )
    }

    TRANSFORM_RULES = {
        # Rule for output resource type of Work or Instance
        (SCH_NS('name'), WT, IT): link(rel=BF_NS('name')),

        # Rule only for output resource type of Work
        (SCH_NS('author'), WT):
        materialize(BF_NS('Person'),
                    BF_NS('creator'),
                    vars={
                        'birthDate': follow(SCH_NS('authorBirthDate'),
                                            origin=var('input-resource'))
                    },
                    fprint=[
                        # Supplementary type
                        (VTYPE_REL, SCH_NS('Novelist')),
                        (BF_NS('name'), target()),
                        (BF_NS('birthDate'), var('birthDate')),
                    ],
                    links=[
                        # Supplementary type
                        (VTYPE_REL, SCH_NS('Novelist')),
                        (BF_NS('name'), target()),
                        (BF_NS('birthDate'), var('birthDate')),
                    ],
                    preserve_fprint=True,
                    ),
    }

    ppl = generic_pipeline(FINGERPRINT_RULES, TRANSFORM_RULES, LABELIZE_RULES)

    modout = ppl.run(input_model=modin)
    # Use -s to see this
    print('=' * 10, 'test_basics_2', '=' * 10)
    literate.write(modout)
    # import pprint; pprint.pprint(list(iter(modout)))

    assert len(modout) == 15
    assert len(list(util.all_origins(modout, only_types={BF_NS('Instance')}))) == 1
    assert len(list(util.all_origins(modout, only_types={BF_NS('Work')}))) == 1
    assert len(list(util.all_origins(modout, only_types={BF_NS('Person')}))) == 1
    assert len(list(modout.match(None, BF_NS('birthDate'), '1919-01-01'))) == 1
def test_basics_4(testresourcepath):
    '''
    Convert from schema.org to [MusicBrainz scheme](https://musicbrainz.org/doc/MusicBrainz_Database/Schema)
    '''
    import sys  # Uncomment to debug
    MB_NS = I('https://musicbrainz.org/doc/MusicBrainz_Database/Schema/')
    R_TYP = MB_NS('Release')
    RG_TYP = MB_NS('ReleaseGroup')
    A_TYP = MB_NS('Artist')
    DOC_NS = I('http://example.org/records/')

    modin = newmodel()
    modin_fpath = 'schemaorg/blackstar.md'
    literate.parse(
        open(os.path.join(testresourcepath, modin_fpath)).read(), modin)

    # Hand-add a comment property to the Mos Def resource to test that this value doesn't bleed e.g. to Kweli's output
    modin.add(DOC_NS('md'), SCH_NS('comment'), 'test')

    FINGERPRINT_RULES = {
        SCH_NS('MusicAlbum'): (
            materialize(MB_NS('ReleaseGroup'),
                        fprint=[
                            (MB_NS('title'), follow(SCH_NS('name'))),
                            (MB_NS('artist'), follow(SCH_NS('byArtist'), SCH_NS('name'))),
                        ],
                        links=[
                            (MB_NS('contains'),
                             materialize(MB_NS('Release'),
                                         fprint=[
                                             (MB_NS('catalogue-number'), var('catnum')),
                                         ],
                                         links=[
                                             (MB_NS('catalogue-number'), var('catnum')),
                                         ]
                                         ))
                        ],
                        vars={'catnum': follow(SCH_NS('catalogNumber'))},
                        # debug=sys.stderr,  # Uncomment to debug
                        )
        ),

        SCH_NS('Person'): (
            materialize(MB_NS('Artist'),
                        fprint=[
                            (MB_NS('name'), var('aname')),
                        ],
                        links=[
                            (MB_NS('name'), var('aname')),
                            (MB_NS('remark'), var('comment')),
                        ],
                        vars={'aname': follow(SCH_NS('name')),
                              'comment': follow(SCH_NS('comment'))},
                        )
        )
    }

    TRANSFORM_RULES = {
        (SCH_NS('name'), R_TYP, RG_TYP): link(rel=MB_NS('title')),
        (SCH_NS('byArtist'), R_TYP): link(rel=MB_NS('by'), target=lookup('@resource')),
    }

    # Intentionally shadows the global LABELIZE_RULES
    LABELIZE_RULES = {
        MB_NS('ReleaseGroup'): follow(MB_NS('title')),
        MB_NS('Release'): follow(MB_NS('title')),
        MB_NS('Artist'): follow(MB_NS('name'))
    }

    ppl = generic_pipeline(FINGERPRINT_RULES, TRANSFORM_RULES, LABELIZE_RULES)

    modout = ppl.run(input_model=modin)
    # Use -s to see this
    print('=' * 10, 'test_basics_4', '=' * 10)
    literate.write(modout)
    # import pprint; pprint.pprint(list(iter(modout)))

    assert len(modout) == 16
    assert len(list(util.all_origins(modout, only_types={MB_NS('ReleaseGroup')}))) == 1
    # FIX(review): this assertion duplicated the ReleaseGroup check above;
    # the fingerprint rules also materialize exactly one Release, which is
    # what the duplicate was evidently meant to verify.
    assert len(list(util.all_origins(modout, only_types={MB_NS('Release')}))) == 1
    assert len(list(util.all_origins(modout, only_types={MB_NS('Artist')}))) == 2
    # assert len(list(modout.match(None, BF_NS('birthDate'), '1919-01-01'))) == 1

    # DOC_NS('md') -> I('i5GvPVm7ClA') in the transform
    assert [l[0] for l in modout.match(None, MB_NS('remark'), 'test')] == [I('i5GvPVm7ClA')]
def labelize(self): ''' Executes a utility rule to create labels in output model for new resources ''' # XXX Check if there's already a label? # Apply a common transform strategy using rules defined above def missed_label(origin, type): ''' Callback to handle cases where a transform wasn't found to match a link (by relationship) in the input model ''' warnings.warn(f'No label generated for: {origin}') labels = self.labelize_helper(LABELIZE_RULES, handle_misses=missed_label) return True if __name__ == '__main__': for rec in INPUT_RECORDS: ppl = dc_schema_pipeline() input_model = newmodel() literate.parse(rec, input_model) output_model = ppl.run(input_model=input_model) print('Resulting record Fingerprints:', ppl.fingerprints) print('Low level JSON dump of output data model: ') util.jsondump(output_model, sys.stdout) print('Versa literate form of output: ') literate.write(output_model, out=sys.stdout) # from versa.serial import mermaid # print('Mermaid diagram form of output: ') # mermaid.write(output_model, out=sys.stdout)