def _toiri(ctx):
    _arg = arg(ctx) if is_pipeline_action(arg) else arg
    _arg = [_arg] if not isinstance(_arg, list) else _arg
    ret = []
    for u in _arg:
        iu = u
        if not (ignore_refs and not iri.is_absolute(iu)):
            # coerce into an IRI ref, but fall back to untyped text otherwise
            try:
                iu = I(iu)
            except ValueError:
                # attempt to recover by percent encoding
                try:
                    iu = I(iri.percent_encode(iu))
                except ValueError as e:
                    ctx.extras['logger'].warn(
                        'Unable to convert "{}" to IRI reference:\n{}'.format(iu, e))

            if base is not None and isinstance(iu, I):
                iu = I(iri.absolutize(iu, base))

        ret.append(iu)

    return ret
def _link(ctx):
    (origin, _, t, a) = ctx.current_link
    if derive_origin:
        # Have enough info to derive the origin from context. Ignore origin in current link
        origin = derive_origin(ctx)

    # If need be call the Versa action function to determine the relationship to the materialized resource
    rels = rel(ctx) if callable(rel) else rel
    if not isinstance(rels, list):
        rels = [rels]

    _value = value(ctx) if callable(value) else (t if value is None else value)
    # Just work with the first provided statement, for now
    if res and not (ignore_refs and not iri.is_absolute(_value)):
        try:
            _value = I(_value)
        except ValueError:
            ctx.extras['logger'].warn(
                'Requirement to convert link target to IRI failed for invalid input, '
                'causing the corresponding output link to be omitted entirely: {0}'.format(
                    repr((I(origin), I(iri.absolutize(rel, ctx.base)), _value))))
            # XXX How do we really want to handle this error?
            return []

    for r in rels:
        ctx.output_model.add(I(origin), I(iri.absolutize(r, ctx.base)), _value, {})
    return
def expand_iri(iri_in, base):
    if iri_in.startswith('@'):
        return I(iri.absolutize(iri_in[1:], VERSA_BASEIRI))
    iri_match = URI_EXPLICIT_PAT.match(iri_in)
    if iri_match:
        return I(iri.absolutize(iri_match.group(1), base))
    iri_match = URI_ABBR_PAT.match(iri_in)
    if iri_match:
        uri = iris[iri_match.group(1)]
        fulliri = URI_ABBR_PAT.sub(uri + '\\2\\3', iri_in)
    else:
        fulliri = I(iri.absolutize(iri_in, base))
    return fulliri
def abbreviate(rel, bases):
    for base in bases:
        abbr = iri.relativize(rel, base, subPathOnly=True)
        if abbr:
            if base is VERSA_BASEIRI:
                abbr = '@' + abbr
            return abbr
    return I(rel)
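# Quick illustration (added here for clarity; not in the original module), assuming
# amara3's iri.relativize behaves as used above: an IRI under the Versa base
# vocabulary would be expected to come back with the '@' shorthand, e.g. roughly
#   abbreviate(VERSA_BASEIRI + 'type', [VERSA_BASEIRI])  ->  '@type'
# while an IRI under none of the supplied bases is returned unchanged, wrapped as I.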
def handle_resourcelist(ltext, **kwargs):
    '''
    A helper that converts lists of resources from a textual format such as Markdown, including absolutizing relative IRIs
    '''
    base = kwargs.get('base', VERSA_BASEIRI)
    model = kwargs.get('model')
    iris = ltext.strip().split()
    newlist = model.generate_resource()
    for i in iris:
        model.add(newlist, VERSA_BASEIRI + 'item', I(iri.absolutize(i, base)))
    return newlist
def handle_resourceset(ltext, **kwargs):
    '''
    A helper that converts sets of resources from a textual format such as Markdown, including absolutizing relative IRIs
    '''
    fullprop = kwargs.get('fullprop')
    rid = kwargs.get('rid')
    base = kwargs.get('base', VERSA_BASEIRI)
    model = kwargs.get('model')
    iris = ltext.strip().split()
    for i in iris:
        model.add(rid, fullprop, I(iri.absolutize(i, base)))
    return None
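# Illustrative sketch (not part of the original module): handle_resourceset() is
# normally wired in via PREP_METHODS during Markdown parsing, but it can be called
# directly. The IRIs and model setup below are hypothetical example data.
def _example_handle_resourceset():
    from versa.driver.memory import newmodel
    model = newmodel()
    handle_resourceset('poem1 poem2',
                       rid=I('http://example.org/collection'),
                       fullprop=I('http://example.org/vocab/contains'),
                       base='http://example.org/', model=model)
    # One 'contains' link is added per whitespace-separated token, each absolutized
    # against base, e.g. http://example.org/poem1 and http://example.org/poem2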
def materialize_entity(etype, ctx_params=None, model_to_update=None, data=None,
                       addtype=True, loop=None, logger=logging):
    '''
    Routine for creating a BIBFRAME resource. Takes the entity (resource) type and
    a data mapping according to the resource type. Implements the Libhub Resource
    Hash Convention. As a convenience, if a vocabulary base is provided, concatenate
    it to etype and the data keys

    data - list of key/value pairs used to compute the hash. If empty the hash will
    be a default for the entity type

    WARNING: THIS FUNCTION MANGLES THE data ARG
    '''
    ctx_params = ctx_params or {}
    vocabbase = ctx_params.get('vocabbase', BL)
    entbase = ctx_params.get('entbase')
    existing_ids = ctx_params.get('existing_ids', set())
    plugins = ctx_params.get('plugins')
    logger = ctx_params.get('logger', logging)
    output_model = ctx_params.get('output_model')
    ids = ctx_params.get('ids', default_idgen(entbase))
    if vocabbase and not iri.is_absolute(etype):
        etype = vocabbase + etype
    params = {'logger': logger}

    data = data or []
    if addtype:
        data.insert(0, [TYPE_REL, etype])
    data_full = [ ((vocabbase + k if not iri.is_absolute(k) else k), v) for (k, v) in data ]
    plaintext = json.dumps(data_full, separators=(',', ':'), cls=OrderedJsonEncoder)

    eid = ids.send(plaintext)

    if model_to_update:
        model_to_update.add(I(eid), TYPE_REL, I(etype))

    params['materialized_id'] = eid
    params['first_seen'] = eid in existing_ids
    params['plaintext'] = plaintext
    for plugin in plugins or ():
        # Not using yield from
        if BF_MATRES_TASK in plugin:
            for p in plugin[BF_MATRES_TASK](loop, output_model, params):
                pass
        # logger.debug("Pending tasks: %s" % asyncio.Task.all_tasks(loop))
    return eid
def duplicate_statements(model, oldorigin, neworigin, rfilter=None):
    '''
    Take links with a given origin, and create duplicate links with the same information but a new origin

    :param model: Versa model to be updated
    :param oldorigin: origin resource IRI whose links are to be duplicated
    :param neworigin: origin resource IRI for the duplicated links
    :param rfilter: optional callable taking (origin, rel, target, attrs); a link is copied only if it returns true
    :return: None
    '''
    for o, r, t, a in model.match(oldorigin):
        if rfilter is None or rfilter(o, r, t, a):
            model.add(I(neworigin), r, t, a)
    return
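# Illustrative usage sketch (not from the original module), using the memory driver
# seen elsewhere in this codebase; all IRIs and values are hypothetical:
def _example_duplicate_statements():
    from versa.driver.memory import newmodel
    model = newmodel()
    book = I('http://example.org/book/1')
    model.add(book, I('http://example.org/vocab/title'), 'Things Fall Apart')
    model.add(book, I('http://example.org/vocab/draft'), 'yes')
    # Copy every link except the 'draft' marker onto a new origin
    keep = lambda o, r, t, a: r != I('http://example.org/vocab/draft')
    duplicate_statements(model, book, I('http://example.org/book/1-copy'), rfilter=keep)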
def process(source, target, rdfsonly, base=None, logger=logging):
    '''
    Prepare statements from a Versa model as triples and add them to an rdflib graph
    '''
    for link in source.match():
        s, p, o = link[:3]
        # Skip docheader statements
        if s == (base or '') + '@docheader':
            continue
        if p in RESOURCE_MAPPING:
            p = RESOURCE_MAPPING[p]
        if o in RESOURCE_MAPPING:
            o = RESOURCE_MAPPING[o]
        if p == VERSA_BASEIRI + 'refines':
            tlinks = list(source.match(s, TYPE_REL))
            if tlinks:
                if tlinks[0][TARGET] == VERSA_BASEIRI + 'Resource':
                    p = I(RDFS_NAMESPACE + 'subClassOf')
                elif tlinks[0][TARGET] == VERSA_BASEIRI + 'Property':
                    p = I(RDFS_NAMESPACE + 'subPropertyOf')
        if p == VERSA_BASEIRI + 'properties':
            suri = I(iri.absolutize(s, base)) if base else s
            target.add((URIRef(o), URIRef(RDFS_NAMESPACE + 'domain'), URIRef(suri)))
            continue
        if p == VERSA_BASEIRI + 'value':
            if o not in ['Literal', 'IRI']:
                ouri = I(iri.absolutize(o, base)) if base else o
                target.add((URIRef(s), URIRef(RDFS_NAMESPACE + 'range'), URIRef(ouri)))
                continue
        s = URIRef(s)
        # Translate v:type to rdf:type
        p = RDF.type if p == TYPE_REL else URIRef(p)
        o = URIRef(o) if isinstance(o, I) else Literal(o)
        if not rdfsonly or p.startswith(RDF_NAMESPACE) or p.startswith(RDFS_NAMESPACE):
            target.add((s, p, o))
    return
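# Illustrative sketch (not in the original file) of driving process(): load a Versa
# model, then pour its statements into an rdflib Graph. Assumes this module's
# imports (memory driver, from_markdown); the file name and base IRI are hypothetical.
def _example_process():
    from rdflib import Graph
    model = memory.connection()
    from_markdown(open('schema.md').read(), model)
    g = Graph()
    process(model, g, rdfsonly=False, base='http://example.org/')
    g.serialize('schema.ttl', format='turtle')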
def materialize_entity(ctx, etype, fprint=None):
    '''
    Low-level routine for creating a resource. Takes the entity (resource) type
    and a data mapping according to the resource type. As a convenience, if a
    vocabulary base is provided in the context, concatenate it to etype and data keys

    ctx - context information governing creation of the new entity
    etype - type IRI for the new entity
    fprint - list of key/value tuples of data to use in generating unique ID, or None in which case one is randomly generated
    '''
    fprint_processed = []
    for k, v in fprint or []:
        fprint_processed.append((k, v(ctx) if is_pipeline_action(v) else v))
    return I(resource_id(etype, fprint=fprint_processed, idgen=ctx.idgen, vocabbase=ctx.base))
def jsonload(model, fp):
    '''
    Load Versa model dumped into JSON form, either raw or canonical
    '''
    dumped_list = json.load(fp)
    for link in dumped_list:
        if len(link) == 2:
            sid, (s, p, o, a) = link
        elif len(link) == 4:
            # canonical
            (s, p, o, a) = link
            tt = a.get('@target-type')
            if tt == '@iri-ref':
                o = I(o)
            a.pop('@target-type', None)
        else:
            continue
        model.add(s, p, o, a)
    return
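# Illustrative sketch (not from the original module) of the JSON shape jsonload()
# accepts: 4-item "canonical" links, with '@target-type': '@iri-ref' marking IRI
# targets. The IRIs and literals here are hypothetical example data.
def _example_jsonload():
    import io
    from versa.driver.memory import newmodel
    dumped = '''[
    ["http://example.org/poem/1", "http://example.org/vocab/title", "Dream Deferred", {}],
    ["http://example.org/poem/1", "http://example.org/vocab/author",
     "http://example.org/person/hughes", {"@target-type": "@iri-ref"}]
    ]'''
    model = newmodel()
    jsonload(model, io.StringIO(dumped))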
def create_resource(output_model, rtypes, fprint, links, existing_ids=None, id_helper=None, preserve_fprint=False):
    '''
    General-purpose routine to create a new resource in the output model, based on
    provided resource types and fingerprinting info

    output_model    - Versa connection to model to be updated
    rtypes          - type IRI or list of IRIs for the new resource, used to give the object a Versa type relationship
    fprint          - list of key/value pairs for determining a unique hash for the new resource
    links           - list of key/value pairs for setting properties on the new resource
    id_helper       - if a string, a base URL for the generated ID. If a generator, one used to produce the ID. If None, set a default good enough for testing.
    existing_ids    - set of existing IDs to not recreate, or None, in which case a new resource will always be created
    '''
    rtypes = rtypes if isinstance(rtypes, list) else [rtypes]
    rtype, *moretypes = rtypes
    for t in moretypes:
        links.append([VTYPE_REL, t])

    if isinstance(id_helper, str):
        idg = idgen(id_helper)
    elif isinstance(id_helper, GeneratorType):
        idg = id_helper
    elif id_helper is None:
        idg = default_idgen(None)
    else:
        # FIXME: G11N
        raise ValueError('id_helper must be a string (URL), a generator or None')
    ctx = context(None, None, output_model, base=None, idgen=idg,
                  existing_ids=existing_ids, extras=None)
    rid = I(materialize_entity(ctx, rtype, fprint=fprint))
    if existing_ids is not None:
        if rid in existing_ids:
            return (False, rid)
        existing_ids.add(rid)
    output_model.add(rid, VTYPE_REL, rtype)

    if preserve_fprint:
        attrs = { k: v for (k, v) in fprint }
        attrs[VTYPE_REL] = rtypes
        output_model.add(rid, VFPRINT_REL, rtype, attrs)

    for r, t in links:
        output_model.add(rid, r, t)
    return (True, rid)
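# Illustrative usage sketch (not part of the original module). The schema.org IRIs
# and values are example data; a fresh memory model stands in for the output model.
def _example_create_resource():
    from versa.driver.memory import newmodel
    SCH = I('https://schema.org/')
    out = newmodel()
    seen = set()
    created, rid = create_resource(
        out,
        SCH('Person'),                                   # a single type IRI; a list also works
        fprint=[(SCH('name'), 'Octavia E. Butler')],     # determines the hashed ID
        links=[(SCH('birthDate'), '1947-06-22')],
        existing_ids=seen)
    # created is True the first time; calling again with the same fingerprint and the
    # same existing_ids set returns (False, rid) without adding new links.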
def resource_id(etype, fprint=None, idgen=default_idgen(None), vocabbase=None):
    '''
    Lowest-level routine for generating an ID value using the Versa convention

    The Versa convention originated as the hash algorithm outlined by the Libhub
    initiative for BIBFRAME Lite, and is now codified in the document
    [Computing Versa Resource Hashes](https://github.com/uogbuji/versa/wiki/Computing-Versa-Resource-Hashes).

    etype - type IRI for the new entity (if the entity has multiple types, this is
        the primary, and additional types can be provided in the fingerprint set)
    fprint - fingerprint set. List of key/value tuples of data to use in generating
        its unique ID, or None, in which case one is just randomly generated
    vocabbase - for convenience, if provided, used to resolve relative etype & fingerprint keys

    >>> from versa.pipeline import resource_id
    >>> resource_id("http://schema.org/Person", [("http://schema.org/name", "Jonathan Bruce Postel"), ("http://schema.org/birthDate", "1943-08-06")])
    '-7hP9d_Xo8M'
    >>> resource_id("http://schema.org/Person", [("http://schema.org/name", "Augusta Ada King")])
    'xjgOrUFiw_o'
    '''
    params = {}
    if vocabbase and not iri.is_absolute(etype):
        etype = vocabbase(etype)

    fprint_processed = []
    for k, v in fprint or []:
        if vocabbase and not iri.is_absolute(k):
            k = vocabbase(k)
        fprint_processed.append((k, v))

    if fprint_processed:
        fprint_processed.append((VTYPE_REL, etype))
        fprint_processed.sort()
        plaintext = json.dumps(fprint_processed, separators=(',', ':'), cls=OrderedJsonEncoder)
        eid = idgen.send(plaintext)
    else:
        # We only have a type; no other distinguishing data. Generate a random hash
        eid = next(idgen)

    return I(eid)
def instance_postprocess(params, skip_relationships=None):
    skip_relationships = list(skip_relationships or [])
    instanceids = params['instanceids']
    model = params['output_model']
    vocabbase = params['vocabbase']
    skip_relationships.extend([ISBN_REL, ISBN_VTYPE_REL,
                               I(iri.absolutize('instantiates', vocabbase))])

    def dupe_filter(o, r, t, a):
        # Filter out ISBN relationships
        return (r, t) != (VTYPE_REL, I(iri.absolutize('Instance', vocabbase))) \
            and r not in skip_relationships

    if len(instanceids) > 1:
        base_instance_id = instanceids[0]
        for instanceid in instanceids[1:]:
            duplicate_statements(model, base_instance_id, instanceid, rfilter=dupe_filter)
    return
import warnings
# from pathlib import Path

# import plac # Cmdline processing tool

from amara3 import iri

from versa import ORIGIN, RELATIONSHIP, TARGET
from versa import I, VERSA_BASEIRI, VTYPE_REL, VLABEL_REL
from versa import util
from versa.driver.memory import newmodel
from versa.serial import literate
from versa.pipeline import *
from versa.contrib.datachefids import idgen as default_idgen

BOOK_NS = I('https://example.org/')
DC_NS = I('http://purl.org/dc/terms/')
SCH_NS = I('https://schema.org/')

# Input data (e.g. as if parsed from DC XML)
# see e.g. the MODS https://library.britishcouncil.co.zw/cgi-bin/koha/opac-export.pl?op=export&bib=59705&format=mods

# Abstractly, Versa pipelines operate by mapping a set of input entities
# to an output entity, but in practice the input entities are often bundled
# into some sort of record format. We'll use such terminology interchangeably.

INPUT_RECORDS = []

INPUT_RECORDS.append('''\
# @docheader

* @iri:
import time
from itertools import islice
import logging

import rdflib
from rdflib import URIRef, Literal, RDF

from amara3 import iri

from versa import I, VERSA_BASEIRI, ORIGIN, RELATIONSHIP, TARGET
from versa.driver import memory
from versa.reader.md import from_markdown

TYPE_REL = I(iri.absolutize('type', VERSA_BASEIRI))
VNS = rdflib.Namespace(VERSA_BASEIRI)

RDF_NAMESPACE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
RDFS_NAMESPACE = 'http://www.w3.org/2000/01/rdf-schema#'

RESOURCE_MAPPING = {
    I(VERSA_BASEIRI + 'Resource'): I(RDFS_NAMESPACE + 'Class'),
    I(VERSA_BASEIRI + 'Property'): I(RDF_NAMESPACE + 'Property'),
    I(VERSA_BASEIRI + 'description'): I(RDFS_NAMESPACE + 'comment'),
    I(VERSA_BASEIRI + 'label'): I(RDFS_NAMESPACE + 'label'),
}


def prep(link):
    '''
* alternateName: Hi-Tek
* name: Tony Cottrell
* birthDate: 1976-05-05
'''


@pytest.fixture
def expected_modout1():
    modout = newmodel()
    #literate.parse('''
    #''', modout)
    return modout


SCH_NS = I('https://schema.org/')
DOC_NS = I('http://example.org/records/')


def test_mosdef_only(testresourcepath, expected_modout1):
    modin = newmodel()
    literate.parse(INPUT_GRAPH_1, modin)
    modin = newmodel()
    literate.parse(INPUT_GRAPH_1, modin)

    FINGERPRINT_RULES = {
        SCH_NS('MusicAlbum'): (
            if_(contains(follow(SCH_NS('byArtist')), DOC_NS('md')),
                materialize(COPY()))
        ),

        SCH_NS('Person'): (
            materialize(COPY()),
# bibframe

from versa import I

# use BFZ namespace to scope MARC tags that don't match transformation recipes
BFZ = I('http://bibfra.me/vocab/marcext/')
BFLC = I('http://bibframe.org/vocab/')

# A way to register services to specialize bibframe.py processing
# Maps URL to callable
g_services = {}

BF_INIT_TASK = 'http://bibfra.me/tool/pybibframe#task.init'
BF_INPUT_TASK = 'http://bibfra.me/tool/pybibframe#task.input-model'
BF_INPUT_XREF_TASK = 'http://bibfra.me/tool/pybibframe#task.input-xref-model'
BF_MARCREC_TASK = 'http://bibfra.me/tool/pybibframe#task.marcrec'
BF_MATRES_TASK = 'http://bibfra.me/tool/pybibframe#task.materialize-resource'
BF_FINAL_TASK = 'http://bibfra.me/tool/pybibframe#task.final'

BL = I('http://bibfra.me/vocab/lite/')
BA = I('http://bibfra.me/vocab/annotation/')
REL = I('http://bibfra.me/vocab/relation/')
MARC = I('http://bibfra.me/vocab/marc/')
RBMS = I('http://bibfra.me/vocab/rbms/')
AV = I('http://bibfra.me/vocab/audiovisual/')
ARCHIVE = I('http://bibfra.me/vocab/archive/')
MARCEXT = I('http://bibfra.me/vocab/marcext/')

POSTPROCESS_AS_INSTANCE = 'http://bibfra.me/tool/pybibframe#marc.postprocess.instance'

#def register_service(coro, iri=None):
import warnings
from pathlib import Path

import click  # Cmdline processing tool. pip install click

from amara3 import iri

from versa import ORIGIN, RELATIONSHIP, TARGET
from versa import I, VERSA_BASEIRI, VTYPE_REL, VLABEL_REL
from versa import util
from versa.driver.memory import newmodel
from versa.serial import csv, literate, mermaid
from versa.pipeline import *
from versa.contrib.datachefids import idgen as default_idgen

BOOK_NS = I('https://example.org/')
IMPLICIT_NS = I('http://example.org/vocab/')
SCH_NS = I('https://schema.org/')

from versa.pipeline import *

FINGERPRINT_RULES = {
    # Fingerprint DC book by ISBN & output resource will be a SCH Book
    IMPLICIT_NS('Book'): materialize(SCH_NS('Book'),
                            fprint=[
                                (SCH_NS('isbn'), follow(IMPLICIT_NS('identifier'))),
                            ]
                         )
}
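# Illustrative sketch (not in the original script): these fingerprint rules would
# typically be combined with transform and labelize rules and run through
# generic_pipeline(), as the test code elsewhere in this repo does. The rule
# variables below are placeholders.
#
#   ppl = generic_pipeline(FINGERPRINT_RULES, TRANSFORM_RULES, LABELIZE_RULES)
#   modout = ppl.run(input_model=modin)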
def test_basics_4(testresourcepath):
    '''
    Convert from schema.org to [MusicBrainz scheme](https://musicbrainz.org/doc/MusicBrainz_Database/Schema)
    '''
    import sys  # Uncomment to debug
    MB_NS = I('https://musicbrainz.org/doc/MusicBrainz_Database/Schema/')
    R_TYP = MB_NS('Release')
    RG_TYP = MB_NS('ReleaseGroup')
    A_TYP = MB_NS('Artist')
    DOC_NS = I('http://example.org/records/')

    modin = newmodel()
    modin_fpath = 'schemaorg/blackstar.md'
    literate.parse(open(os.path.join(testresourcepath, modin_fpath)).read(), modin)
    # Hand-add a comment property to the Mos Def resource to test that this value doesn't bleed e.g. to Kweli's output
    modin.add(DOC_NS('md'), SCH_NS('comment'), 'test')

    FINGERPRINT_RULES = {
        SCH_NS('MusicAlbum'): (
            materialize(MB_NS('ReleaseGroup'),
                fprint=[
                    (MB_NS('title'), follow(SCH_NS('name'))),
                    (MB_NS('artist'), follow(SCH_NS('byArtist'), SCH_NS('name'))),
                ],
                links=[
                    (MB_NS('contains'), materialize(MB_NS('Release'),
                        fprint=[
                            (MB_NS('catalogue-number'), var('catnum')),
                        ],
                        links=[
                            (MB_NS('catalogue-number'), var('catnum')),
                        ]
                    ))
                ],
                vars={'catnum': follow(SCH_NS('catalogNumber'))},
                # debug=sys.stderr,  # Uncomment to debug
            )
        ),

        SCH_NS('Person'): (
            materialize(MB_NS('Artist'),
                fprint=[
                    (MB_NS('name'), var('aname')),
                ],
                links=[
                    (MB_NS('name'), var('aname')),
                    (MB_NS('remark'), var('comment')),
                ],
                vars={'aname': follow(SCH_NS('name')),
                      'comment': follow(SCH_NS('comment'))},
            )
        )
    }

    TRANSFORM_RULES = {
        (SCH_NS('name'), R_TYP, RG_TYP): link(rel=MB_NS('title')),
        (SCH_NS('byArtist'), R_TYP): link(rel=MB_NS('by'), target=lookup('@resource')),
    }

    # Intentionally shadows the global LABELIZE_RULES
    LABELIZE_RULES = {
        MB_NS('ReleaseGroup'): follow(MB_NS('title')),
        MB_NS('Release'): follow(MB_NS('title')),
        MB_NS('Artist'): follow(MB_NS('name'))
    }

    ppl = generic_pipeline(FINGERPRINT_RULES, TRANSFORM_RULES, LABELIZE_RULES)

    modout = ppl.run(input_model=modin)
    # Use -s to see this
    print('='*10, 'test_basics_4', '='*10)
    literate.write(modout)
    # import pprint; pprint.pprint(list(iter(modout)))

    assert len(modout) == 16
    assert len(list(util.all_origins(modout, only_types={MB_NS('ReleaseGroup')}))) == 1
    assert len(list(util.all_origins(modout, only_types={MB_NS('ReleaseGroup')}))) == 1
    assert len(list(util.all_origins(modout, only_types={MB_NS('Artist')}))) == 2
    # assert len(list(modout.match(None, BF_NS('birthDate'), '1919-01-01'))) == 1
    # DOC_NS('md') -> I('i5GvPVm7ClA') in the transform
    assert [ l[0] for l in modout.match(None, MB_NS('remark'), 'test') ] == [I('i5GvPVm7ClA')]
py.test -s test/py/test_pipeline.py
'''

import os

# Requires pytest-mock
import pytest

from versa import I, VERSA_BASEIRI, VTYPE_REL, VLABEL_REL, ORIGIN, RELATIONSHIP, TARGET
from versa import util
from versa.driver.memory import newmodel
from versa.serial import csv, literate, mermaid
from versa.pipeline import *

SCH_NS = I('https://schema.org/')
BF_NS = I('http://bibfra.me/')


@pytest.fixture
def expected_modout1():
    modout = newmodel()
    #literate.parse('''
    #''', modout)
    return modout


WT = BF_NS('Work')
IT = BF_NS('Instance')
import re
import os
import logging
import itertools

#from rdflib import Graph, BNode, Namespace
from rdflib import URIRef, Literal, RDF, RDFS

from amara3 import iri

from versa import I, VERSA_BASEIRI, ORIGIN, RELATIONSHIP, TARGET

from bibframe import BFZ, BFLC

VTYPE_REL = I(iri.absolutize('type', VERSA_BASEIRI))
VLABEL_REL = I(iri.absolutize('label', VERSA_BASEIRI))

WORKCLASS = iri.absolutize('Work', BFZ)
INSTANCECLASS = iri.absolutize('Instance', BFZ)
INSTANCEREL = iri.absolutize('hasInstance', BFZ)

PROP_MAP = {
    VTYPE_REL: RDF.type,
    VLABEL_REL: RDFS.label,
}


def prep(stmt):
    '''
    Prepare a statement into a triple ready for rdflib
import itertools
import asyncio
from itertools import tee, zip_longest

from versa import I, VERSA_BASEIRI, ORIGIN, RELATIONSHIP, TARGET
from versa.util import simple_lookup

from amara3 import iri

from bibframe import BFZ, BFLC, g_services, BF_INIT_TASK, BF_MARCREC_TASK, BF_MATRES_TASK, BF_FINAL_TASK

RDF_NAMESPACE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
RDFS_NAMESPACE = 'http://www.w3.org/2000/01/rdf-schema#'

VTYPE_REL = I(iri.absolutize('type', VERSA_BASEIRI))
RDFS_LABEL = RDFS_NAMESPACE + 'label'


def pairwise(iterable):
    a, b = tee(iterable)
    next(b, None)
    return zip_longest(a, b)

# A plug-in is a series of callables, each of which handles a phase of processing
# The only phase predefined for all plug-ins is BF_INIT_TASK
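# Quick illustration of pairwise() above (added for clarity; not in the original module):
#   list(pairwise('abc'))  ->  [('a', 'b'), ('b', 'c'), ('c', None)]
# i.e. each item is paired with its successor, and the final pair is padded with None.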
def _materialize(ctx):
    '''
    Inserts at least two main links in the context's output_model, one or more for
    the relationship from the origin to the materialized resource, one for the
    type of the materialized resource, and links according to the links parameter

    :param ctx: Runtime Versa context used in processing (e.g. includes the prototype link)
    :return: None

    This function is intricate in its use and shifting of Versa context, but the
    intricacies are all designed to make the marcpatterns mini language more natural.
    '''
    # FIXME: Part of the datachef sorting out
    if not ctx.idgen: ctx.idgen = idgen
    if debug is None:
        def log_debug(msg): return
    elif not hasattr(debug, 'write'):
        raise TypeError('debug argument to materialize must be file-like object or None')
    else:
        def log_debug(msg): print(msg, file=debug)

    # Set up variables to be made available in any derived contexts
    vars_items = list((vars or {}).items())
    if vars_items:
        # First make sure we're not tainting the passed-in context
        ctx = ctx.copy(variables=ctx.variables.copy())
        for k, v in vars_items:
            if None in (k, v): continue
            #v = v if isinstance(v, list) else [v]
            v = v(ctx) if is_pipeline_action(v) else v
            if v:
                v = v[0] if isinstance(v, list) else v
                ctx.variables[k] = v

    (o, r, t, a) = ctx.current_link
    if isinstance(typ, COPY):
        object_copy = typ
        object_copy.id = o
        _typ = next(util.resourcetypes(ctx.input_model, o), None)
        object_copy.links = []
        for stmt in ctx.input_model.match(o):
            if object_copy.rels is None or stmt[RELATIONSHIP] in typ.rels:
                # FIXME: Attributes?
                object_copy.links.append((stmt[RELATIONSHIP], stmt[TARGET]))
    else:
        _typ = typ(ctx) if is_pipeline_action(typ) else typ
        object_copy = None
    _fprint = fprint(ctx) if is_pipeline_action(fprint) else fprint
    # FIXME: On redesign implement split using function composition instead
    targets = [ sub_t.strip() for sub_t in t.split(split) if sub_t.strip() ] if split else [t]

    # If the rel in the incoming context is null and there is no rel passed in, nothing to attach
    # Especially useful signal in a pipeline's fingerprinting stage
    attach_ = False if rel is None and r is None else attach

    if '@added-links' not in ctx.extras: ctx.extras['@added-links'] = set()

    # Make sure we end up with a list or None
    rels = rel if isinstance(rel, list) else ([rel] if rel else [r])
    log_debug(f'materialize action. Type: {_typ}. Anchoring rels: {rels} Initial context current link: {ctx.current_link}')
    log_debug(f'Variables (including from vars= arg): {ctx.variables}')
    objids = []

    # Botanical analogy: stem context is from the caller (e.g. connection point of newly materialized resource)
    # vein contexts derive from the stem
    for target in targets:
        ctx_stem = ctx.copy(current_link=(ctx.current_link[ORIGIN], ctx.current_link[RELATIONSHIP], target, ctx.current_link[ATTRIBUTES]))
        if origin:
            # Have been given enough info to derive the origin from context. Ignore origin in current link
            o = origin(ctx_stem)
        if not o: continue  # Defensive coding

        computed_fprint = [] if _fprint else None
        rtypes = set([_typ])
        if _fprint:
            # strip None values from computed unique list, including pairs where v is None
            for k, v in _fprint:
                if None in (k, v): continue
                for subitem in (v if isinstance(v, list) else [v]):
                    subval = subitem(ctx_stem) if is_pipeline_action(subitem) else subitem
                    if subval:
                        subval = subval if isinstance(subval, list) else [subval]
                        if k == VTYPE_REL: rtypes.update(set(subval))
                        computed_fprint.extend([(k, s) for s in subval])

        log_debug(f'Provided fingerprinting info: {computed_fprint}')

        if object_copy:
            objid = object_copy.id
        else:
            objid = materialize_entity(ctx_stem, _typ, fprint=computed_fprint)
        objids.append(objid)
        log_debug(f'Newly materialized object: {objid}')
        # rels = [ ('_' + curr_rel if curr_rel.isdigit() else curr_rel) for curr_rel in rels if curr_rel ]
        computed_rels = []
        for curr_relobj in rels:
            # e.g. scenario if passed in rel=ifexists(...)
            curr_rels = curr_relobj(ctx_stem) if is_pipeline_action(curr_relobj) else curr_relobj
            curr_rels = curr_rels if isinstance(curr_rels, list) else [curr_rels]
            for curr_rel in curr_rels:
                if not curr_rel: continue
                # FIXME: Fix properly, by slugifying & making sure slugify handles all-numeric case (prepend '_')
                curr_rel = '_' + curr_rel if curr_rel.isdigit() else curr_rel
                if attach_:
                    _smart_add(ctx_stem.output_model, I(o), I(iri.absolutize(curr_rel, ctx_stem.base)), I(objid), (), ctx.extras['@added-links'])
                computed_rels.append(curr_rel)

        # print((objid, ctx_.existing_ids))
        # XXX: Means links are only processed on new objects! This needs some thought
        if objid not in ctx_stem.existing_ids:
            if _typ:
                _smart_add(ctx_stem.output_model, I(objid), VTYPE_REL, I(iri.absolutize(_typ, ctx_stem.base)), (), ctx.extras['@added-links'])
            if preserve_fprint:
                # Consolidate types
                computed_fprint = [ (k, v) for (k, v) in computed_fprint if k != VTYPE_REL ]
                # computed_fprint +=
                attrs = tuple(computed_fprint + [(VTYPE_REL, r) for r in rtypes])
                _smart_add(ctx_stem.output_model, I(objid), VFPRINT_REL, _typ, attrs, ctx.extras['@added-links'])

            # XXX: Use Nones to mark blanks, or should Versa define some sort of null resource?
            all_links = object_copy.links + links if object_copy else links
            for l in all_links:
                if len(l) == 2:
                    lo = I(objid)
                    lr, lt = l
                elif len(l) == 3:
                    lo, lr, lt = l
                # This context is in effect

                # First of all, hold on to the inbound origin so that it can be accessed in embedded actions
                vein_vars = ctx_stem.variables.copy()
                vein_vars['@stem'] = ctx_stem.current_link[ORIGIN]

                # Newly materialized resource is the origin. The overall context target for embedded actions
                ctx_vein = ctx_stem.copy(current_link=(objid, ctx_stem.current_link[RELATIONSHIP], ctx_stem.current_link[TARGET], ctx_stem.current_link[ATTRIBUTES]), variables=vein_vars)

                lo = lo or ctx_vein.current_link[ORIGIN]
                lr = lr or ctx_vein.current_link[RELATIONSHIP]
                lt = lt or ctx_vein.current_link[TARGET]

                lo = lo(ctx_vein) if is_pipeline_action(lo) else lo
                lo = lo if isinstance(lo, list) else [lo]
                lr = lr(ctx_vein) if is_pipeline_action(lr) else lr

                # Update lr
                # XXX This needs cleaning up
                ctx_vein = ctx_stem.copy(current_link=(ctx_vein.current_link[ORIGIN], lr, ctx_vein.current_link[TARGET], ctx_stem.current_link[ATTRIBUTES]), variables=vein_vars)

                # If k is a list of contexts use it to dynamically execute functions
                if isinstance(lr, list):
                    if lr and isinstance(lr[0], context):
                        for newctx in lr:
                            # The function in question will generate any needed links in the output model
                            lt(newctx)
                        continue

                # import traceback; traceback.print_stack()  # For looking up the call stack e.g. to debug nested materialize
                # Check that the links key is not None, which is a signal not to
                # generate the item. For example if the key is an ifexists and the
                # test expression result is False, it will come back as None,
                # and we don't want to run the v function
                if lr:
                    lt = lt(ctx_vein) if is_pipeline_action(lt) else lt
                    # If k or v come from pipeline functions as None it signals to skip generating anything else for this link item
                    if lt is not None:
                        # FIXME: Fix properly, by slugifying & making sure slugify handles all-numeric case
                        if lr.isdigit(): lr = '_' + lr
                        _lr = I(iri.absolutize(lr, ctx_vein.base))
                        log_debug(f'Generated link: {lo, _lr, lt}')
                        if isinstance(lt, list):
                            for valitems in lt:
                                if valitems:
                                    for loi in lo:
                                        _smart_add(ctx_vein.output_model, loi, _lr, valitems, (), ctx.extras['@added-links'])
                        else:
                            for loi in lo:
                                _smart_add(ctx_vein.output_model, loi, _lr, lt, (), ctx.extras['@added-links'])

            ctx_stem.existing_ids.add(objid)
            for func in ctx.extras.get('@new-entity-hook', []):
                func(objid)

    log_debug(f'End materialize')
    return objids
def parse(md, model, encoding='utf-8', config=None):
    """
    Translate the Versa Markdown syntax into Versa model relationships

    md -- markdown source text
    model -- Versa model to take the output relationship
    encoding -- character encoding (defaults to UTF-8)

    Returns: The overall base URI (`@base`) specified in the Markdown file, or None

    >>> from versa.driver.memory import newmodel
    >>> from versa.serial.literate import parse
    >>> m = newmodel()
    >>> parse(open('test/resource/poetry.md').read(), m)
    'http://uche.ogbuji.net/poems/'
    >>> m.size()
    40
    >>> next(m.match(None, 'http://uche.ogbuji.net/poems/updated', '2013-10-15'))
    (I(http://uche.ogbuji.net/poems/1), I(http://uche.ogbuji.net/poems/updated), '2013-10-15', {})
    """
    # Set up configuration to interpret the conventions for the Markdown
    config = config or {}
    # This mapping takes syntactical elements such as the various header levels in Markdown and associates a resource type with the specified resources
    syntaxtypemap = {}
    if config.get('autotype-h1'): syntaxtypemap['h1'] = config.get('autotype-h1')
    if config.get('autotype-h2'): syntaxtypemap['h2'] = config.get('autotype-h2')
    if config.get('autotype-h3'): syntaxtypemap['h3'] = config.get('autotype-h3')
    interp_stanza = config.get('interpretations', {})
    interpretations = {}

    def setup_interpretations(interp):
        # Map the interpretation IRIs to functions to do the data prep
        for prop, interp_key in interp.items():
            if interp_key.startswith('@'):
                interp_key = iri.absolutize(interp_key[1:], VERSA_BASEIRI)
            if interp_key in PREP_METHODS:
                interpretations[prop] = PREP_METHODS[interp_key]
            else:
                # just use the identity, i.e. no-op
                interpretations[prop] = lambda x, **kwargs: x

    setup_interpretations(interp_stanza)

    # Prep ID generator, in case needed
    idg = idgen(None)

    # Preprocess the Markdown to deal with IRI-valued property values
    def iri_ref_tool(m):
        body = m.group(1)
        lchar = '<' if iri.matches_uri_ref_syntax(body) else '<'
        return lchar + m.group(1) + '>'

    md = IRIREF_CAND_PAT.sub(iri_ref_tool, md)

    # Parse the Markdown
    # Alternately:
    # from xml.sax.saxutils import escape, unescape
    # h = markdown.markdown(escape(md.decode(encoding)), output_format='html5')
    # Note: even using safe_mode this should not be presumed safe from tainted input
    # h = markdown.markdown(md.decode(encoding), safe_mode='escape', output_format='html5')
    comments = mkdcomments.CommentsExtension()
    h = markdown.markdown(md, safe_mode='escape', output_format='html5', extensions=[comments])

    # doc = html.markup_fragment(inputsource.text(h.encode('utf-8')))
    tb = treebuilder()
    h = '<html>' + h + '</html>'
    root = html5.parse(h)
    # root = tb.parse(h)

    # Each section contains one resource description, but the special one named @docheader contains info to help interpret the rest
    first_h1 = next(select_name(descendants(root), 'h1'))
    # top_section_fields = itertools.takewhile(lambda x: x.xml_name != 'h1', select_name(following_siblings(first_h1), 'h2'))

    # Extract header elements. Notice I use an empty element with an empty parent as the default result
    docheader = next(select_value(select_name(descendants(root), 'h1'), '@docheader'),
                     element('empty', parent=root))  # //h1[.="@docheader"]
    sections = filter(lambda x: x.xml_value != '@docheader',
                      select_name_pattern(descendants(root), HEADER_PAT))
                      # //h1[not(.="@docheader")]|h2[not(.="@docheader")]|h3[not(.="@docheader")]

    def fields(sect):
        '''
        Each section represents a resource and contains a list with its properties
        This generator parses the list and yields the key value pairs representing the properties
        Some properties have attributes, expressed in markdown as a nested list. If present these
        attributes are yielded as well, else None is yielded
        '''
        # import logging; logging.debug(repr(sect))
        # Pull all the list elements until the next header. This accommodates multiple lists in a section
        try:
            sect_body_items = itertools.takewhile(
                lambda x: HEADER_PAT.match(x.xml_name) is None,
                select_elements(following_siblings(sect)))
        except StopIteration:
            return
        # results_until(sect.xml_select('following-sibling::*'), 'self::h1|self::h2|self::h3')
        # field_list = [ U(li) for ul in sect.xml_select('following-sibling::ul') for li in ul.xml_select('./li') ]
        field_list = [ li for elem in select_name(sect_body_items, 'ul') for li in select_name(elem, 'li') ]

        def parse_li(pair):
            '''
            Parse each list item into a property pair
            '''
            if pair.strip():
                matched = REL_PAT.match(pair)
                if not matched:
                    raise ValueError(_('Syntax error in relationship expression: {0}'.format(pair)))
                if matched.group(3): prop = matched.group(3).strip()
                if matched.group(4): prop = matched.group(4).strip()
                if matched.group(7):
                    val = matched.group(7).strip()
                    typeindic = RES_VAL
                elif matched.group(9):
                    val = matched.group(9).strip()
                    typeindic = TEXT_VAL
                elif matched.group(11):
                    val = matched.group(11).strip()
                    typeindic = TEXT_VAL
                elif matched.group(12):
                    val = matched.group(12).strip()
                    typeindic = UNKNOWN_VAL
                else:
                    val = ''
                    typeindic = UNKNOWN_VAL
                # prop, val = [ part.strip() for part in U(li.xml_select('string(.)')).split(':', 1) ]
                # import logging; logging.debug(repr((prop, val)))
                return prop, val, typeindic
            return None, None, None

        def prep_li(li):
            '''
            Take care of Markdown parsing minutiae. Also, exclude child uls

            * a/href embedded in the li means it was specified as <link_text>.
              Restore the angle brackets as expected by the li parser
            * Similar for cases where e.g. prop: <abc> gets turned into prop: <abc></abc>
            '''
            prepped = ''
            for ch in itertools.takewhile(
                    lambda x: not (isinstance(x, element) and x.xml_name == 'ul'),
                    li.xml_children):
                if isinstance(ch, text):
                    prepped += ch
                elif isinstance(ch, element):
                    if ch.xml_name == 'a':
                        prepped += '<' + ch.xml_value + '>'
                    else:
                        prepped += '<' + ch.xml_name + '>'
            return prepped

        # Go through each list item
        for li in field_list:
            # Is there a nested list, which expresses attributes on a property
            if list(select_name(li, 'ul')):
                # main = ''.join([ node.xml_value
                #         for node in itertools.takewhile(
                #             lambda x: x.xml_name != 'ul', select_elements(li)
                #             )
                #     ])
                main = prep_li(li)
                prop, val, typeindic = parse_li(main)
                subfield_list = [ parse_li(prep_li(sli)) for e in select_name(li, 'ul') for sli in (select_name(e, 'li')) ]
                subfield_list = [ (p, v, t) for (p, v, t) in subfield_list if p is not None ]
                # Support a special case for syntax such as in the @iri and @interpretations: stanza of @docheader
                if val is None: val = ''
                yield prop, val, typeindic, subfield_list
            # Just a regular, unadorned property
            else:
                prop, val, typeindic = parse_li(prep_li(li))
                if prop: yield prop, val, typeindic, None

    iris = {}

    # Gather the document-level metadata from the @docheader section
    base = schemabase = rtbase = document_iri = default_lang = None
    for prop, val, typeindic, subfield_list in fields(docheader):
        # The @iri section is where key IRI prefixes can be set
        if prop == '@iri':
            for (k, uri, typeindic) in subfield_list:
                if k == '@base':
                    base = schemabase = rtbase = uri
                # @property is legacy
                elif k == '@schema' or k == '@property':
                    schemabase = uri
                elif k == '@resource-type':
                    rtbase = uri
                else:
                    iris[k] = uri
        # The @interpretations section is where defaults can be set as to the primitive types of values from the Markdown, based on the relevant property/relationship
        elif prop == '@interpretations':
            # Iterate over items from the @docheader/@interpretations section to set up for further parsing
            interp = {}
            for k, v, x in subfield_list:
                interp[I(iri.absolutize(k, schemabase))] = v
            setup_interpretations(interp)
        # Setting an IRI for this very document being parsed
        elif prop == '@document':
            document_iri = val
        elif prop == '@language':
            default_lang = val
        # If we have a resource to which to attach them, just attach all other properties
        elif document_iri or base:
            rid = document_iri or base
            fullprop = I(iri.absolutize(prop, schemabase or base))
            if fullprop in interpretations:
                val = interpretations[fullprop](val, rid=rid, fullprop=fullprop, base=base, model=model)
                if val is not None: model.add(rid, fullprop, val)
            else:
                model.add(rid, fullprop, val)

    # Default IRI prefixes if @iri/@base is set
    if not schemabase: schemabase = base
    if not rtbase: rtbase = base
    if not document_iri: document_iri = base

    # Go through the resources expressed in remaining sections
    for sect in sections:
        # if U(sect) == '@docheader': continue  # Not needed because excluded by ss
        # The header can take one of 4 forms: "ResourceID" "ResourceID [ResourceType]" "[ResourceType]" or "[]"
        # The 3rd form is for an anonymous resource with specified type and the 4th an anonymous resource with unspecified type
        matched = RESOURCE_PAT.match(sect.xml_value)
        if not matched:
            raise ValueError(_('Syntax error in resource header: {0}'.format(sect.xml_value)))
        rid = matched.group(1)
        rtype = matched.group(3)
        if rtype: rtype = I(iri.absolutize(rtype, schemabase))
        if rid: rid = I(iri.absolutize(rid, base))
        if not rid: rid = next(idg)

        # Resource type might be set by syntax config
        if not rtype: rtype = syntaxtypemap.get(sect.xml_name)
        if rtype: model.add(rid, TYPE_REL, rtype)

        def expand_iri(iri_in, base):
            if iri_in.startswith('@'):
                return I(iri.absolutize(iri_in[1:], VERSA_BASEIRI))
            iri_match = URI_EXPLICIT_PAT.match(iri_in)
            if iri_match:
                return I(iri.absolutize(iri_match.group(1), base))
            iri_match = URI_ABBR_PAT.match(iri_in)
            if iri_match:
                uri = iris[iri_match.group(1)]
                fulliri = URI_ABBR_PAT.sub(uri + '\\2\\3', iri_in)
            else:
                fulliri = I(iri.absolutize(iri_in, base))
            return fulliri

        # Add the property
        for prop, val, typeindic, subfield_list in fields(sect):
            attrs = {}
            for (aprop, aval, atype) in subfield_list or ():
                fullaprop = expand_iri(aprop, schemabase)
                if atype == RES_VAL:
                    val = expand_iri(aval, rtbase)
                    valmatch = URI_ABBR_PAT.match(aval)
                    if valmatch:
                        uri = iris[valmatch.group(1)]
                        attrs[fullaprop] = URI_ABBR_PAT.sub(uri + '\\2\\3', aval)
                    else:
                        attrs[fullaprop] = I(iri.absolutize(aval, rtbase))
                elif atype == TEXT_VAL:
                    attrs[fullaprop] = aval
                elif atype == UNKNOWN_VAL:
                    val_iri_match = URI_EXPLICIT_PAT.match(aval)
                    if val_iri_match:
                        aval = expand_iri(aval, rtbase)
                    elif fullaprop in interpretations:
                        aval = interpretations[fullaprop](aval, rid=rid, fullprop=fullaprop, base=base, model=model)
                    if aval is not None:
                        attrs[fullaprop] = aval

            fullprop = expand_iri(prop, schemabase)
            if typeindic == RES_VAL:
                val = expand_iri(val, rtbase)
                model.add(rid, fullprop, val, attrs)
            elif typeindic == TEXT_VAL:
                if '@lang' not in attrs: attrs['@lang'] = default_lang
                model.add(rid, fullprop, val, attrs)
            elif typeindic == UNKNOWN_VAL:
                val_iri_match = URI_EXPLICIT_PAT.match(val)
                if val_iri_match:
                    val = expand_iri(val, rtbase)
                elif fullprop in interpretations:
                    val = interpretations[fullprop](val, rid=rid, fullprop=fullprop, base=base, model=model)
                if val is not None:
                    model.add(rid, fullprop, val, attrs)

            # resinfo = AB_RESOURCE_PAT.match(val)
            # if resinfo:
            #     val = resinfo.group(1)
            #     valtype = resinfo.group(3)
            #     if not val: val = model.generate_resource()
            #     if valtype: attrs[TYPE_REL] = valtype

    return document_iri
    '''
    fullprop = kwargs.get('fullprop')
    rid = kwargs.get('rid')
    base = kwargs.get('base', VERSA_BASEIRI)
    model = kwargs.get('model')
    iris = ltext.strip().split()
    for i in iris:
        model.add(rid, fullprop, I(iri.absolutize(i, base)))
    return None


PREP_METHODS = {
    VERSA_BASEIRI + 'text': lambda x, **kwargs: x,
    VERSA_BASEIRI + 'resource': lambda x, base=VERSA_BASEIRI, **kwargs: I(iri.absolutize(x, base)),
    VERSA_BASEIRI + 'resourceset': handle_resourceset,
}


def parse(md, model, encoding='utf-8', config=None):
    """
    Translate the Versa Markdown syntax into Versa model relationships

    md -- markdown source text
    model -- Versa model to take the output relationship
    encoding -- character encoding (defaults to UTF-8)

    Returns: The overall base URI (`@base`) specified in the Markdown file, or None
'''

import os
import json
import itertools
import asyncio

from versa import I, ORIGIN, RELATIONSHIP, TARGET
from versa.util import simple_lookup

from amara3 import iri

from bibframe import BFZ, BFLC, g_services, BF_INIT_TASK, BF_MARCREC_TASK, BF_FINAL_TASK

ISBN_REL = I(iri.absolutize('isbn', BFZ))
TITLE_REL = I(iri.absolutize('title', BFZ))

BFHOST = 'bibfra.me'

# A plug-in is a series of callables, each of which handles a phase of processing
# The only phase predefined for all plug-ins is BF_INIT_TASK

# One convenient way to organize the plug-in is as a class
# In this case we want to create a separate instance for each full processing event loop
class linkreport(object):
    PLUGIN_ID = 'http://bibfra.me/tool/pybibframe#linkreport'
Test NTriples serializer
'''

import logging
import functools

# Requires pytest-mock
import pytest

from amara3 import iri

from versa import I
from versa.driver.memory import newmodel
from versa.serial.ntriples import *
# from versa.util import jsondump, jsonload

NT_SPEC = I('http://www.w3.org/2001/sw/RDFCore/ntriples/')
DC_CREATOR = I('http://purl.org/dc/elements/1.1/creator')
DC_PUBLISHER = I('http://purl.org/dc/elements/1.1/publisher')
W3C = I('http://www.w3.org/')


@pytest.fixture
def ntrips_1():
    return '''\
<http://www.w3.org/2001/sw/RDFCore/ntriples/> <http://purl.org/dc/elements/1.1/creator> "Dave Beckett" .
<http://www.w3.org/2001/sw/RDFCore/ntriples/> <http://purl.org/dc/elements/1.1/creator> "Art Barstow" .
<http://www.w3.org/2001/sw/RDFCore/ntriples/> <http://purl.org/dc/elements/1.1/publisher> <http://www.w3.org/> .
'''


def test_parse1(ntrips_1):
def handle_record_links(self, loop, model, params):
    '''
    Task coroutine of the main event loop for MARC conversion. In this case,
    update a report of links encountered in the MARC/XML

    model -- raw Versa model with converted resource information from the MARC details,
             from each MARC/XML record processed
    params -- parameters passed in from processing:
        params['workid']: ID of the work constructed from the MARC record
        params['instanceid']: list of IDs of instances constructed from the MARC record
    '''
    # print('BF_MARCREC_TASK', linkreport.PLUGIN_ID)
    # Get the configured default vocabulary base IRI
    vocabbase = params['vocabbase']
    for obj, _r, typ, _a in model.match(None, VTYPE_REL, None):
        # build labels based on model order, iterating over every property of
        # every resource, and building the label if that property is consulted
        rule = self._config['lookup'].get(typ)
        if rule is None:
            continue
        rules = rule if isinstance(rule, list) else [rule]

        label = ''
        for rule in rules:
            def chunk_eval(s):
                # used when configuration is stored in JSON and one of these labelizer
                # instructions is an eval-able string
                # a known Python injection attack vector, so mentioned in README
                if isinstance(s, str) and len(s) > 5:
                    s = eval(s, {'I': I}, locals())
                return s

            marc_order = rule.get('marcOrder', False)
            separator = chunk_eval(rule.get('separator', ' '))
            wrapper = chunk_eval(rule.get('wrapper', None))
            multivalsep = chunk_eval(rule.get('multivalSeparator', ' | '))
            props = rule.get('properties', [])

            if marc_order:
                link_stream = pairwise((l for l in model.match(obj, None, None) if l[1] in props))
            else:
                link_stream = pairwise((l for p in props for l in model.match(obj, p, None)))

            # print("LABELIZING {} of type {}".format(obj, typ))
            for (link1, link2) in link_stream:
                _o1, rel1, target1, _a1 = link1
                _o2, rel2, target2, _a2 = link2 if link2 is not None else (None, None, None, None)

                ctx = {
                    'currentProperty': rel1,
                    'currentValue': target1,
                    'nextProperty': rel2,
                    'nextValue': target2,
                }

                _wrapper = wrapper(ctx) if callable(wrapper) else wrapper
                if _wrapper:
                    target1 = _wrapper[0] + target1 + _wrapper[1]
                label += target1

                if rel2 == rel1:
                    _multivalsep = multivalsep(ctx) if callable(multivalsep) else multivalsep
                    label += _multivalsep
                elif rel2 is not None:
                    _separator = separator(ctx) if callable(separator) else separator
                    label += _separator

                # print("current label", label)

            if label:
                model.add(obj, I(RDFS_LABEL), label)
                break  # we've found a rule that produces a label, so skip other rules
            label = ''

        if not label and 'default-label' in self._config:
            # if we've gone through all rules and not produced a label, yield specified default
            model.add(obj, I(RDFS_LABEL), self._config['default-label'])
    return
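# Illustrative sketch (not from the original plugin) of the kind of configuration
# this labelizing logic reads. Every IRI and value below is a hypothetical example;
# the separators are short enough that chunk_eval() leaves them untouched.
EXAMPLE_LABELIZER_CONFIG = {
    'lookup': {
        'http://bibfra.me/vocab/lite/Work': {
            'properties': ['http://bibfra.me/vocab/lite/title'],
            'separator': ' ',
            'multivalSeparator': ' | ',
        },
    },
    'default-label': '(unlabeled resource)',
}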
def parse(nt, model, encoding='utf-8', disjoint=None, only_rel=None, exclude_rel=None):
    '''
    nt - string or file-like object with NTriples to parse
    model - Versa model into which to parse the data
    encoding - character encoding for NTriples (default UTF-8)
    disjoint - if not None a list or set of link tuples against which parsed links
        should be compared, and omitted if matching.
    only_rel - if not None a collection of link relations limiting the parsed NTriples
        statements to only be added to the model if the predicate matches one in only_rel
    exclude_rel - if not None a collection of link relations limiting the parsed NTriples
        statements to be skipped if the predicate matches one in exclude_rel

    >>>
    '''
    exclude_rel = exclude_rel or set()
    only_rel = only_rel or set()
    disjoint = disjoint or set()
    added_links = set()
    new_origins = set()

    # Make sure typing is not accidentally omitted
    if only_rel:
        only_rel.add(VTYPE_REL)

    def _add(o, r, t, a=None):
        '''
        Conditionally add a statement to model, if not a duplicate
        '''
        a = a or {}
        parts = (o, r, t, tuple(a.items()))
        if (parts in added_links) or (parts in disjoint):
            return False
        model.add(o, r, t, a)
        added_links.add((o, r, t, tuple(a.items())))
        return True

    nt_gen = nt
    if isinstance(nt, str):
        nt_gen = nt.splitlines()
    for line in nt_gen:
        m = NT_LINE_PAT.match(line.strip())
        if m:
            # print(list(enumerate(m.groups())))
            _, s, s_iri, s_blank, p_iri, o, _, o_iri, o_str, o_blank = tuple(m.groups())
            # print((s, s_iri, s_blank, p_iri, o, o_iri, o_str, o_blank))
            if p_iri == RDF_TYPE_REL:
                p_iri = VTYPE_REL
            if o_blank or s_blank:
                raise NotImplementedError('Blank nodes not yet implemented')
            p_iri = I(p_iri)
            if only_rel:
                if p_iri not in only_rel:
                    print('skipped', line)
                    continue
            else:
                if p_iri in exclude_rel:
                    continue
            if _add(I(s_iri), p_iri, I(o_iri) if o_iri else o_str):
                new_origins.add(I(s_iri))
    return
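# Illustrative sketch (not in the original module): parse a small NTriples string
# into a fresh Versa model, keeping only dc:creator statements. Assumes the memory
# driver used elsewhere in this codebase; the data is hypothetical.
def _example_ntriples_parse():
    from versa.driver.memory import newmodel
    nt_text = '<http://example.org/spam> <http://purl.org/dc/elements/1.1/creator> "Eggs" .'
    model = newmodel()
    parse(nt_text, model, only_rel={I('http://purl.org/dc/elements/1.1/creator')})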