def process(source, target, rdfsonly, base=None, logger=logging):
    '''
    Prepare a statement into a triple ready for rdflib graph
    '''
    for link in source.match():
        s, p, o = link[:3]
        # Skip docheader statements
        if s == (base or '') + '@docheader':
            continue
        if p in RESOURCE_MAPPING:
            p = RESOURCE_MAPPING[p]
        if o in RESOURCE_MAPPING:
            o = RESOURCE_MAPPING[o]
        if p == VERSA_BASEIRI + 'refines':
            tlinks = list(source.match(s, TYPE_REL))
            if tlinks:
                if tlinks[0][TARGET] == VERSA_BASEIRI + 'Resource':
                    p = I(RDFS_NAMESPACE + 'subClassOf')
                elif tlinks[0][TARGET] == VERSA_BASEIRI + 'Property':
                    p = I(RDFS_NAMESPACE + 'subPropertyOf')
        if p == VERSA_BASEIRI + 'properties':
            suri = I(iri.absolutize(s, base)) if base else s
            target.add((URIRef(o), URIRef(RDFS_NAMESPACE + 'domain'), URIRef(suri)))
            continue
        if p == VERSA_BASEIRI + 'value':
            if o not in ['Literal', 'IRI']:
                ouri = I(iri.absolutize(o, base)) if base else o
                target.add((URIRef(s), URIRef(RDFS_NAMESPACE + 'range'), URIRef(ouri)))
                continue
        s = URIRef(s)
        # Translate v:type to rdf:type
        p = RDF.type if p == TYPE_REL else URIRef(p)
        o = URIRef(o) if isinstance(o, I) else Literal(o)
        if not rdfsonly or p.startswith(RDF_NAMESPACE) or p.startswith(RDFS_NAMESPACE):
            target.add((s, p, o))
    return
def _link(ctx):
    (origin, _, t, a) = ctx.current_link
    if derive_origin:
        # Have enough info to derive the origin from context. Ignore origin in current link
        origin = derive_origin(ctx)

    # If need be, call the Versa action function to determine the relationship to the materialized resource
    rels = rel(ctx) if callable(rel) else rel
    if not isinstance(rels, list):
        rels = [rels]

    _value = value(ctx) if callable(value) else (t if value is None else value)
    # Just work with the first provided statement, for now
    if res and not (ignore_refs and not iri.is_absolute(_value)):
        try:
            _value = I(_value)
        except ValueError:
            ctx.extras['logger'].warn('Requirement to convert link target to IRI failed for invalid input, causing the corresponding output link to be omitted entirely: {0}'.format(repr((I(origin), I(iri.absolutize(rel, ctx.base)), _value))))
            # XXX How do we really want to handle this error?
            return []

    for r in rels:
        ctx.output_model.add(I(origin), I(iri.absolutize(r, ctx.base)), _value, {})
    return
def handle_docheader(self, docheader_elem):
    # Special node to hold document header info for processing
    # FIXME: reconsider ID & type
    docheader_node = node(ONYA('docheader'), ONYA('docheader'))

    iris = {}
    # Gather document-level metadata from the @docheader section
    fields(docheader_elem, docheader_node, None)
    for prop in docheader_node.properties:
        # @iri section is where key IRI prefixes can be set
        if prop == '@iri':
            for (k, uri, typeindic) in subfield_list:
                if k == '@base':
                    self.base = self.schemabase = self.rtbase = uri
                # @property is legacy
                elif k == '@schema' or k == '@property':
                    self.schemabase = uri
                elif k == '@resource-type':
                    self.rtbase = uri
                else:
                    iris[k] = uri
        # @interpretations section is where defaults can be set as to the primitive types
        # of values from the Markdown, based on the relevant property/relationship
        elif prop == '@interpretations':
            # Iterate over items from the @docheader/@interpretations section to set up for further parsing
            interp = {}
            for k, v, x in subfield_list:
                interp[I(iri.absolutize(k, schemabase))] = v
            self.setup_interpretations(interp)
        # Setting an IRI for this very document being parsed
        elif prop == '@document':
            document_iri = val
        elif prop == '@language':
            default_lang = val
        # If we have a resource to which to attach them, just attach all other properties
        elif document_iri or base:
            rid = document_iri or base
            fullprop = I(iri.absolutize(prop, schemabase or base))
            if fullprop in self.interpretations:
                val = self.interpretations[fullprop](val, rid=rid, fullprop=fullprop, base=base, model=model)
                if val is not None:
                    model.add(rid, fullprop, val)
            else:
                model.add(rid, fullprop, val)

    # Default IRI prefixes if @iri/@base is set
    if not self.schemabase:
        self.schemabase = base
    if not self.rtbase:
        self.rtbase = base
    if not self.document_iri:
        self.document_iri = base

    schema = (base, schemabase, rtbase, document_iri, default_lang)
def isbn_instancegen(params, loop, model):
    '''
    Default handling of the idea of splitting a MARC record with FRBR Work info
    as well as instances signalled by ISBNs

    According to Vicki, instances can be signalled by 007, 020 or 3XX, but we stick to 020 for now
    '''
    # Handle ISBNs re: https://foundry.zepheira.com/issues/1976
    entbase = params['entbase']
    output_model = params['output_model']
    input_model = params['input_model']
    vocabbase = params['vocabbase']
    logger = params['logger']
    materialize_entity = params['materialize_entity']
    existing_ids = params['existing_ids']
    workid = params['workid']
    ids = params['ids']
    plugins = params['plugins']

    INSTANTIATES_REL = I(iri.absolutize('instantiates', vocabbase))

    isbns = list((val for code, val in marc_lookup(input_model, '020$a')))
    logger.debug('Raw ISBNS:\t{0}'.format(isbns))

    # Sorted to remove non-determinism which interferes with canonicalization
    normalized_isbns = sorted(list(isbn_list(isbns, logger=logger)))

    subscript = ord('a')
    instance_ids = []
    logger.debug('Normalized ISBN:\t{0}'.format(normalized_isbns))
    if normalized_isbns:
        for inum, itype in normalized_isbns:
            ean13 = compute_ean13_check(inum)
            data = [['instantiates', workid], [ISBNNS + 'isbn', ean13]]
            instanceid = materialize_entity('Instance', ctx_params=params, loop=loop, model_to_update=params['output_model'], data=data)
            if entbase:
                instanceid = I(iri.absolutize(instanceid, entbase))
            output_model.add(I(instanceid), ISBN_REL, ean13)
            output_model.add(I(instanceid), INSTANTIATES_REL, I(workid))
            if itype:
                output_model.add(I(instanceid), ISBN_TYPE_REL, itype)
            existing_ids.add(instanceid)
            instance_ids.append(instanceid)
    else:
        # If there are no ISBNs, we'll generate a default Instance
        data = [['instantiates', workid]]
        instanceid = materialize_entity('Instance', ctx_params=params, loop=loop, model_to_update=params['output_model'], data=data)
        instanceid = I(iri.absolutize(instanceid, entbase)) if entbase else I(instanceid)
        output_model.add(I(instanceid), INSTANTIATES_REL, I(workid))
        existing_ids.add(instanceid)
        instance_ids.append(instanceid)

    #output_model.add(instance_ids[0], I(iri.absolutize('instantiates', vocabbase)), I(workid))
    #output_model.add(I(instance_ids[0]), TYPE_REL, I(iri.absolutize('Instance', vocabbase)))
    return instance_ids
def expand_iri(iri_in, base):
    if iri_in.startswith('@'):
        return I(iri.absolutize(iri_in[1:], VERSA_BASEIRI))
    iri_match = URI_EXPLICIT_PAT.match(iri_in)
    if iri_match:
        return I(iri.absolutize(iri_match.group(1), base))
    iri_match = URI_ABBR_PAT.match(iri_in)
    if iri_match:
        uri = iris[iri_match.group(1)]
        fulliri = URI_ABBR_PAT.sub(uri + '\\2\\3', iri_in)
    else:
        fulliri = I(iri.absolutize(iri_in, base))
    return fulliri
def _toiri(ctx):
    _arg = arg(ctx) if is_pipeline_action(arg) else arg
    _arg = [_arg] if not isinstance(_arg, list) else _arg
    ret = []
    for u in _arg:
        iu = u
        if not (ignore_refs and not iri.is_absolute(iu)):
            # Coerce into an IRIref, but fall out as untyped text otherwise
            try:
                iu = I(iu)
            except ValueError as e:
                # Attempt to recover by percent encoding
                try:
                    iu = I(iri.percent_encode(iu))
                except ValueError as e:
                    ctx.extras['logger'].warn('Unable to convert "{}" to IRI reference:\n{}'.format(iu, e))

            if base is not None and isinstance(iu, I):
                iu = I(iri.absolutize(iu, base))

        ret.append(iu)
    return ret
def handle_record_links(self, loop, model, params):
    '''
    Task coroutine of the main event loop for MARC conversion. In this case,
    update a report of links encountered in the MARC/XML.

    model -- raw Versa model with converted resource information from the MARC details
             from each MARC/XML record processed
    params -- parameters passed in from processing:
        params['workid']: ID of the work constructed from the MARC record
        params['instanceid']: list of IDs of instances constructed from the MARC record
    '''
    #print('BF_MARCREC_TASK', linkreport.PLUGIN_ID)
    # Get the configured default vocabulary base IRI
    vocabbase = params['vocabbase']
    for cls, prop in self._config['lookup'].items():
        for link in model.match(None, VTYPE_REL, I(iri.absolutize(cls, vocabbase))):
            # simple_lookup() is a little helper for getting a property from a resource
            props = prop if isinstance(prop, list) else ['', prop]
            label = ''
            sep = props[0]

            def label_segments(props):
                for p in props[1:]:
                    links = model.match(link[ORIGIN], I(iri.absolutize(p, vocabbase)))
                    s = [link[TARGET] for link in links]
                    if len(s) > 0:
                        yield ' | '.join(s)

            segments = list(label_segments(props))
            model.add(link[ORIGIN], I(RDFS_LABEL), sep.join(segments))
    return
def test_relativize():
    for targetUri, againstUri, relativeUri, subPathUri in relativize_test_cases:
        res = iri.relativize(targetUri, againstUri)
        assert relativeUri == res, 'target=%r against=%r (subPathOnly=False)' % (targetUri, againstUri)
        if res is not None:
            res = iri.absolutize(res, againstUri)
            assert res == targetUri, 'target=%r against=%r (subPathOnly=False, Absolutize)' % (targetUri, againstUri)
        res = iri.relativize(targetUri, againstUri, True)
        assert subPathUri == res, 'target=%r against=%r (subPathOnly=True)' % (targetUri, againstUri)
        if res is not None:
            res = iri.absolutize(res, againstUri)
            assert res == targetUri, 'target=%r against=%r (subPathOnly=True, Absolutize)' % (targetUri, againstUri)
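# Illustrative sketch (not part of the test suite above): iri.relativize() is the inverse of
# iri.absolutize() where a relative form exists, returning None otherwise. The URLs below are
# made up for this note, not entries from relativize_test_cases.
def _relativize_demo():
    rel = iri.relativize('http://example.org/dir/page', 'http://example.org/dir/other')
    # rel should be 'page'; absolutizing it against the same base round-trips:
    assert iri.absolutize(rel, 'http://example.org/dir/other') == 'http://example.org/dir/page'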
def idgen(idbase, tint=None, bits=64):
    '''
    Generate an IRI as a hash of given information, or just make one up if None given

    idbase -- Base URI for generating links
    tint -- String that affects the sequence of IDs generated if sent None

    >>> from bibframe.contrib.datachefids import idgen
    >>> g = idgen(None)
    >>> next(g) #Or g.send(None)
    'gKNG1b7eySo'
    >>> next(g)
    'cXx7iv67-3E'
    >>> g.send('spam')
    'OZxOEos8e-k'
    >>> next(g)
    'mCFhsaWQ1_0'
    >>> g.send('spam')
    'OZxOEos8e-k'
    >>> g.send('eggs')
    'xQAd4Guk040'
    >>> g.send('')
    'AAAAAAAAAAA'
    '''
    counter = -1
    to_hash = None
    while True:
        if to_hash is None:
            to_hash = str(counter)
            if tint:
                to_hash += tint
        to_hash = simple_hashstring(to_hash, bits=bits)
        to_hash = yield iri.absolutize(to_hash, idbase) if idbase else to_hash
        counter += 1
def _link(ctx):
    (origin, _, t, a) = ctx.current_link
    if derive_origin:
        # Have enough info to derive the origin from context. Ignore origin in current link
        origin = derive_origin(ctx)

    # If need be, call the Versa action function to determine the relationship to the materialized resource
    rels = rel(ctx) if callable(rel) else rel
    if not isinstance(rels, list):
        rels = [rels]

    values = value(ctx) if callable(value) else (t if value is None else value)
    if not isinstance(values, list):
        values = [values]

    def recurse_values(vs):
        for v in vs:
            if callable(v):
                yield from recurse_values(v(ctx))
            else:
                yield v

    for _value in recurse_values(values):
        # If asked to convert value to resource, do so as long as it is absolute and ignore_refs is false
        if res and not (ignore_refs and not iri.is_absolute(_value)):
            try:
                _value = I(_value)
            except ValueError:
                ctx.extras['logger'].warn('Requirement to convert link target to IRI failed for invalid input, causing the corresponding output link to be omitted entirely: {0}'.format(repr((I(origin), I(iri.absolutize(rel, ctx.base)), _value))))
                # XXX How do we really want to handle this error?
                #return []
                continue

        for r in rels:
            ctx.output_model.add(I(origin), I(iri.absolutize(r, ctx.base)), _value, {})
    return
def materialize_entity(ctx, etype, unique=None):
    '''
    Routine for creating a BIBFRAME resource. Takes the entity (resource) type and a data
    mapping according to the resource type. Implements the Libhub Resource Hash Convention

    As a convenience, if a vocabulary base is provided in the context, concatenate it to
    etype and the data keys

    ctx - context information governing creation of the new entity
    etype - type IRI for the new entity
    unique - scalar or ordered dict of data to use in generating its unique ID, or None,
             in which case one is just randomly generated
    '''
    params = {}
    if ctx.base:
        etype = ctx.base + etype
    unique_full = unique
    if isinstance(unique, OrderedDict):
        unique_full = OrderedDict()
        for (k, v) in unique.items():
            unique_full[k if iri.is_absolute(k) else iri.absolutize(k, ctx.base)] = v

    if unique_full:
        plaintext = json.dumps([etype, unique_full], cls=OrderedJsonEncoder)
        eid = ctx.idgen.send(plaintext)
    else:
        # We only have a type; no other distinguishing data. Generate a random hash
        eid = next(ctx.idgen)

    return eid
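# Hedged sketch (not from the source) of the ID-derivation step in materialize_entity():
# the absolutized entity type plus its distinguishing key/value data are serialized and fed
# to the hashing ID generator. The real code uses OrderedJsonEncoder and ctx.idgen from the
# surrounding codebase; the vocabulary IRI and data below are illustrative only.
from collections import OrderedDict
import json

def _materialization_plaintext_demo(base='http://bibfra.me/vocab/lite/'):
    etype = base + 'Person'
    unique_full = OrderedDict([(base + 'name', 'Augusta Ada King')])
    # Roughly the plaintext that materialize_entity() would pass to ctx.idgen.send()
    return json.dumps([etype, list(unique_full.items())])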
def idgen(idbase, tint=None):
    '''
    Generate an IRI as a hash of given information, or just make one up if None given

    idbase -- Base URI for generating links
    tint -- String that affects the sequence of IDs generated if sent None

    >>> from datachef.ids import idgen
    >>> g = idgen(None)
    >>> next(g) #Or g.send(None)
    'RtW-3skq'
    >>> next(g)
    'e4r-u_tx'
    >>> g.send('spam')
    'ThKLPHvp'
    >>> next(g)
    'YbGlkNf9'
    >>> g.send('spam')
    'ThKLPHvp'
    >>> g.send('eggs')
    'HeBrpNON'
    >>> g.send('')
    'AAAAAAAA'
    '''
    counter = -1
    to_hash = None
    while True:
        if to_hash is None:
            to_hash = str(counter)
            if tint:
                to_hash += tint
        to_hash = simple_hashstring(to_hash)
        to_hash = yield iri.absolutize(to_hash, idbase) if idbase else to_hash
        counter += 1
def resource_id(etype, unique=None, idgen=default_idgen(None), vocabbase=None):
    '''
    Very low level routine for generating an ID value using the hash algorithm outlined by
    the Libhub initiative for BIBFRAME Lite (Libhub Resource Hash Convention).
    Takes the entity (resource) type and an ordered data mapping.

    etype - type IRI for the new entity
    unique - list of key/value tuples of data to use in generating its unique ID, or None,
             in which case one is just randomly generated
    vocabbase - for convenience, if provided, used to resolve relative etype & data keys
    '''
    params = {}
    # XXX: Use proper URI normalization? Have a philosophical discussion with Mark about this :)
    if vocabbase:
        etype = vocabbase + etype

    unique_computed = []
    for k, v in unique:
        if vocabbase:
            # XXX OK, absolutize used here. Go figure
            k = k if iri.is_absolute(k) else iri.absolutize(k, vocabbase)
        unique_computed.append((k, v))

    if unique_computed:
        # XXX Is OrderedJsonEncoder needed now that we're using a list of tuples rather than an ordered dict?
        plaintext = json.dumps([etype, unique_computed], cls=OrderedJsonEncoder)
        eid = idgen.send(plaintext)
    else:
        # We only have a type; no other distinguishing data. Generate a random hash
        eid = next(idgen)
    return eid
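# Possible usage of the resource_id() above, assuming default_idgen is available as in the
# signature; the vocabulary IRI and data values are illustrative, and the resulting hash
# string depends on the idgen implementation, so it is not shown here.
def _resource_id_demo():
    return resource_id('Person',
                       unique=[('name', 'Jonathan Bruce Postel'), ('birthDate', '1943-08-06')],
                       idgen=default_idgen(None),
                       vocabbase='http://schema.org/')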
def test_absolutize():
    for uriRef, baseUri, expectedUri in absolutize_test_cases:
        res = iri.absolutize(uriRef, baseUri)
        # In a couple of cases, there's more than one correct result
        if isinstance(expectedUri, tuple):
            assert res in expectedUri, 'base=%r ref=%r' % (baseUri, uriRef)
        else:
            assert expectedUri == res, 'base=%r ref=%r' % (baseUri, uriRef)
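# Quick illustrative checks (separate from absolutize_test_cases) of the RFC 3986 resolution
# behavior the test above exercises; the example URLs are made up for this note.
def _absolutize_demo():
    assert iri.absolutize('spam', 'http://example.org/dir/') == 'http://example.org/dir/spam'
    assert iri.absolutize('/spam', 'http://example.org/dir/') == 'http://example.org/spam'
    assert iri.absolutize('http://other.example/x', 'http://example.org/dir/') == 'http://other.example/x'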
def _rename(ctx):
    (o, r, t, a) = ctx.current_link
    if res:
        try:
            t = I(t)
        except ValueError:
            return []
    out_attrs = {}
    for k, v in attributes.items():
        k = k(ctx) if callable(k) else k
        # If k is a list of contexts use it to dynamically execute functions
        if isinstance(k, list):
            if k and isinstance(k[0], context):
                for newctx in k:
                    # Presumably the function in question will generate any needed links in the output model
                    v(newctx)
                continue

        #import traceback; traceback.print_stack()  # For looking up the call stack e.g. to debug nested materialize
        # Check that the attributes key is not None, which is a signal not to
        # generate the item. For example if the key is an ifexists and the
        # test expression result is False, it will come back as None,
        # and we don't want to run the v function
        if k:
            new_current_link = (o, k, ctx.current_link[TARGET], ctx.current_link[ATTRIBUTES])
            newctx = ctx.copy(current_link=new_current_link)
            v = v(newctx) if callable(v) else v

            # If k or v come from pipeline functions as None it signals to skip generating anything else for this link item
            if v is not None:
                v = v(newctx) if callable(v) else v
                # FIXME: Fix properly, by slugifying & making sure slugify handles all-numeric case
                if k.isdigit():
                    k = '_' + k
                if isinstance(v, list):
                    for valitems in v:
                        if valitems:
                            #out_attrs[k] = valitems
                            out_attrs[I(iri.absolutize(k, newctx.base))] = valitems
                else:
                    #out_attrs[I(iri.absolutize(k, newctx.base))] = v
                    out_attrs[k] = v

    ctx.output_model.add(I(o), I(iri.absolutize(rel, ctx.base)), t, out_attrs)
    return
def idgen(idbase):
    '''
    Generate an IRI
    '''
    # Simple tumbler for now; possibly switch to a random number, with some sort of sequence override for unit testing
    ix = 0
    while True:
        yield iri.absolutize(str(ix), idbase) if idbase else str(ix)
        ix += 1
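# Possible usage of the tumbler-style idgen above; the base IRI is illustrative. Each next()
# yields the base with an incrementing suffix, or the bare counter when no base is given.
def _idgen_tumbler_demo():
    g = idgen('http://example.org/entity/')
    assert next(g) == 'http://example.org/entity/0'
    assert next(g) == 'http://example.org/entity/1'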
def relabel(ctx, new_rel=None, res=False):
    '''
    Update the label of the relationship to be added to the link space
    '''
    # Just work with the first provided statement, for now
    (o, r, t) = ctx.current_link
    if res:
        t = I(t)
    ctx.output_model.add(I(o), I(iri.absolutize(new_rel, ctx.base)), t, {})
    return None
def _rename(ctx):
    (o, r, t, a) = ctx.current_link
    if res:
        try:
            t = I(t)
        except ValueError:
            return []
    ctx.output_model.add(I(o), I(iri.absolutize(rel, ctx.base)), t, {})
    return
def setup_interpretations(interp):
    # Map the interpretation IRIs to functions to do the data prep
    for prop, interp_key in interp.items():
        if interp_key.startswith('@'):
            interp_key = iri.absolutize(interp_key[1:], VERSA_BASEIRI)
        if interp_key in PREP_METHODS:
            interpretations[prop] = PREP_METHODS[interp_key]
        else:
            # Just use the identity, i.e. no-op
            interpretations[prop] = lambda x, **kwargs: x
def process_annotation(anntype, subfields, extra_annotation_props):
    # Separate annotation subfields from object subfields
    object_subfields = subfields.copy()
    annotation_subfields = {}
    for k, v in subfields.items():
        if code + k in ANNOTATIONS_FIELDS:
            annotation_subfields[k] = v
            del object_subfields[k]
        params['transforms'].append((code + k, code + k))

    #objectid = next(idg)
    #object_props.update(object_subfields)

    annotationid = next(ids)
    relsink.add(I(annotationid), TYPE_REL, I(iri.absolutize(anntype, BFZ)))
    for k, v in itertools.chain(annotation_subfields.items(), extra_annotation_props.items()):
        relsink.add(I(annotationid), I(iri.absolutize(k, BFZ)), v)

    # Return enough info to generate the main subject/object relationship. The annotation is taken care of at this point
    return annotationid, object_subfields
def __init__(self, baseurl=None):
    if baseurl:
        model, _ = load_rdfa_page(baseurl)
        if not model:
            raise RuntimeError(baseurl, 'doesn\'t appear to be a Library.Link site')
        #<dd property="dcterms:modified">2018-04-17T04:17:32Z</dd>
        self.lastmod = next(versautil.lookup(model, None, 'http://purl.org/dc/terms/modified'), None)
        self.sitemap = iri.absolutize('/harvest/sitemap.xml', baseurl)
        self.url = baseurl
        protocol, self.host, path, query, fragment = iri.split_uri_ref(baseurl)
def handle_record_links(self, loop, model, params):
    '''
    Task coroutine of the main event loop for MARC conversion. In this case,
    update a report of links encountered in the MARC/XML.

    model -- raw Versa model with converted resource information from the MARC details
             from each MARC/XML record processed
    params -- parameters passed in from processing:
        params['workid']: ID of the work constructed from the MARC record
        params['instanceid']: list of IDs of instances constructed from the MARC record
    '''
    #print('BF_MARCREC_TASK', linkreport.PLUGIN_ID)
    # Get the configured vocabulary base IRI
    vocabbase = params['vocabbase']
    for cls, prop in self._config['lookup'].items():
        for link in model.match(None, TYPE_REL, I(iri.absolutize(cls, vocabbase))):
            # simple_lookup() is a little helper for getting a property from a resource
            val = simple_lookup(model, link[ORIGIN], I(iri.absolutize(prop, vocabbase)))
            if val:
                model.add(link[ORIGIN], I(iri.absolutize('label', vocabbase)), val)
    return
def inverse_materialize(ctx, hashidgen=None, existing_ids=None, unique=None, typ=None, new_rel=None, properties=None):
    '''
    Create a new resource related to the origin
    '''
    properties = properties or {}
    # Just work with the first provided statement, for now
    (o, r, t) = ctx.current_link
    if unique:
        objid = hashidgen.send(unique(ctx))
    else:
        objid = next(hashidgen)
    if objid != I(iri.absolutize(FROM_EMPTY_HASH, ctx.base)):
        ctx.output_model.add(I(objid), I(iri.absolutize(new_rel, ctx.base)), I(o), {})
        if objid not in existing_ids:
            if typ:
                ctx.output_model.add(I(objid), VTYPE_REL, I(iri.absolutize(typ, ctx.base)), {})
            for k, v in properties.items():
                if callable(v):
                    v = v(ctx)
                ctx.output_model.add(I(objid), I(iri.absolutize(k, ctx.base)), v, {})
    return objid
def handle_resourcelist(ltext, **kwargs):
    '''
    A helper that converts lists of resources from a textual format such as Markdown, including absolutizing relative IRIs
    '''
    base = kwargs.get('base', VERSA_BASEIRI)
    model = kwargs.get('model')
    iris = ltext.strip().split()
    newlist = model.generate_resource()
    for i in iris:
        model.add(newlist, VERSA_BASEIRI + 'item', I(iri.absolutize(i, base)))
    return newlist
def handle_resourceset(ltext, **kwargs):
    '''
    A helper that converts sets of resources from a textual format such as Markdown, including absolutizing relative IRIs
    '''
    fullprop = kwargs.get('fullprop')
    rid = kwargs.get('rid')
    base = kwargs.get('base', VERSA_BASEIRI)
    model = kwargs.get('model')
    iris = ltext.strip().split()
    for i in iris:
        model.add(rid, fullprop, I(iri.absolutize(i, base)))
    return None
def _rename(ctx):
    workid, iid = ctx.extras[WORKID], ctx.extras[IID]
    new_o = {origin_class.work: workid, origin_class.instance: iid}[self._use_origin]
    # Just work with the first provided statement, for now
    (o, r, t, a) = ctx.current_link
    if res:
        try:
            t = I(t)
        except ValueError:
            return []
    ctx.output_model.add(I(new_o), I(iri.absolutize(rel, ctx.base)), t, {})
    return
def handle_resourceset(ltext, **kwargs):
    '''
    Helper that converts lists of resources from text (e.g. Markdown), including absolutizing relative IRIs
    '''
    fullprop = kwargs.get('fullprop')
    rid = kwargs.get('rid')
    base = kwargs.get('base', ONYA)
    model = kwargs.get('model')
    iris = ltext.strip().split()
    for i in iris:
        model.add(rid, fullprop, I(iri.absolutize(i, base)))
    return None
def links_from_html(root, baseurl, look_for=HTML_LINKS):
    '''
    Yield absolutized (and defragmented) links found in link-bearing attributes of an HTML element tree
    '''
    for e in select_elements(descendants(root)):
        if e.xml_name in HTML_LINKS:
            for k, v in e.xml_attributes.items():
                if k in HTML_LINKS[e.xml_name]:
                    try:
                        link = iri.absolutize(v, baseurl, limit_schemes=('http', 'https'))
                    except ValueError:
                        # Ignore scheme
                        continue
                    link, frag = iri.split_fragment(link)
                    yield link
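# A minimal usage sketch for links_from_html, assuming amara3's html5 parser and an
# HTML_LINKS mapping along the lines of {'a': ('href',), 'img': ('src',)} are available;
# the markup and base URL below are illustrative only.
from amara3.uxml import html5

def _link_extraction_demo():
    doc = '<html><body><a href="page2">next</a> <a href="http://other.example/x#frag">x</a></body></html>'
    root = html5.parse(doc)
    return list(links_from_html(root, 'http://example.org/dir/'))
    # Under the assumptions above, roughly:
    # ['http://example.org/dir/page2', 'http://other.example/x']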
def process_materialization(lookup, subfields, code=None):
    materializedid = hashid(idbase, tuple(subfields.items()))
    # The extra_props are parameters inherent to a particular MARC field/subfield for purposes of linked data representation
    if code is None:
        code = lookup
    (subst, extra_props) = MATERIALIZE[lookup]
    if RESOURCE_TYPE in extra_props:
        relsink.add(I(materializedid), TYPE_REL, I(iri.absolutize(extra_props[RESOURCE_TYPE], BFZ)))
    #logger.debug((lookup, subfields, extra_props))

    if materializedid not in T_prior_materializedids:
        # Just bundle in the subfields as they are, to avoid throwing out data. They can be otherwise used or just stripped later on
        #for k, v in itertools.chain((('marccode', code),), subfields.items(), extra_props.items()):
        for k, v in itertools.chain(subfields.items(), extra_props.items()):
            if k == RESOURCE_TYPE:
                continue
            fieldname = 'subfield-' + k
            if code + k in FIELD_RENAMINGS:
                fieldname = FIELD_RENAMINGS[code + k]
            if len(k) == 1:
                params['transforms'].append((code + k, fieldname))  # Only if proper MARC subfield
                #params['transforms'].append((code + k, FIELD_RENAMINGS.get(sflookup, sflookup)))
            relsink.add(I(materializedid), iri.absolutize(fieldname, BFZ), v)
        T_prior_materializedids.add(materializedid)

    return materializedid, subst
def instance_postprocess(params, skip_relationships=None):
    # Guard against None before converting to a list
    skip_relationships = list(skip_relationships or [])
    instanceids = params['instanceids']
    model = params['output_model']
    vocabbase = params['vocabbase']
    skip_relationships.extend([ISBN_REL, ISBN_TYPE_REL, I(iri.absolutize('instantiates', vocabbase))])

    def dupe_filter(o, r, t, a):
        # Filter out ISBN relationships
        return (r, t) != (TYPE_REL, I(iri.absolutize('Instance', vocabbase))) \
            and r not in skip_relationships

    if len(instanceids) > 1:
        base_instance_id = instanceids[0]
        for instanceid in instanceids[1:]:
            duplicate_statements(model, base_instance_id, instanceid, rfilter=dupe_filter)
    return
def _link(ctx):
    (o, r, t, a) = ctx.current_link
    _value = value(ctx) if callable(value) else (t if value is None else value)
    workid, iid = ctx.extras[WORKID], ctx.extras[IID]
    new_o = {origin_class.work: workid, origin_class.instance: iid}[self._use_origin]
    # Just work with the first provided statement, for now
    if res:
        try:
            _value = I(_value)
        except ValueError:
            ctx.extras['logger'].warn('Requirement to convert link target to IRI failed for invalid input, causing the corresponding output link to be omitted entirely: {0}'.format(repr((I(new_o), I(iri.absolutize(rel, ctx.base)), _value))))
            # XXX How do we really want to handle this error?
            return []
    ctx.output_model.add(I(new_o), I(iri.absolutize(rel, ctx.base)), _value, {})
    return
def isbn_instancegen(params):
    '''
    Default handling of the idea of splitting a MARC record with FRBR Work info
    as well as instances signalled by ISBNs
    '''
    # Handle ISBNs re: https://foundry.zepheira.com/issues/1976
    entbase = params['entbase']
    model = params['model']
    vocabbase = params['vocabbase']
    logger = params['logger']
    ids = params['ids']
    rec = params['rec']
    existing_ids = params['existing_ids']
    workid = params['workid']
    isbns = marc_lookup(rec, ['020$a'])
    logger.debug('Raw ISBNS:\t{0}'.format(isbns))
    normalized_isbns = list(isbn_list(isbns))

    subscript = ord('a')
    instance_ids = []
    logger.debug('Normalized ISBN:\t{0}'.format(normalized_isbns))
    if normalized_isbns:
        for subix, (inum, itype) in enumerate(normalized_isbns):
            instanceid = ids.send(['Instance', workid, inum])
            if entbase:
                instanceid = I(iri.absolutize(instanceid, entbase))
            model.add(I(instanceid), I(iri.absolutize('isbn', vocabbase)), inum)
            #subitem['id'] = instanceid + (unichr(subscript + subix) if subix else '')
            if itype:
                model.add(I(instanceid), I(iri.absolutize('isbnType', vocabbase)), itype)
            instance_ids.append(instanceid)
    else:
        instanceid = ids.send(['Instance', workid])
        if entbase:
            instanceid = I(iri.absolutize(instanceid, entbase))
        model.add(I(instanceid), TYPE_REL, I(iri.absolutize('Instance', vocabbase)))
        existing_ids.add(instanceid)
        instance_ids.append(instanceid)

    for instanceid in instance_ids:
        model.add(I(workid), I(iri.absolutize('hasInstance', vocabbase)), instanceid)
        model.add(I(instanceid), TYPE_REL, I(iri.absolutize('Instance', vocabbase)))

    return instance_ids
def _res(ctx):
    _arg = arg(ctx) if callable(arg) else arg
    _arg = [_arg] if not isinstance(_arg, list) else _arg
    ret = []
    for u in _arg:
        iu = None
        try:
            iu = I(u)
        except ValueError:
            # Attempt to recover by percent encoding
            try:
                iu = I(iri.percent_encode(u))
            except ValueError as e:
                ctx.logger('Unable to convert "{}" to IRI reference:\n{}'.format(u, e))
                continue
        if iu and not iri.is_absolute(iu) and base is not None:
            iu = I(iri.absolutize(iu, base))
        ret.append(iu)
    return ret
def instance_postprocess(params, skip_relationships=None):
    # Guard against None before converting to a list
    skip_relationships = list(skip_relationships or [])
    instanceids = params['instanceids']
    model = params['output_model']
    vocabbase = params['vocabbase']
    skip_relationships.extend([ISBN_REL, ISBN_VTYPE_REL, I(iri.absolutize('instantiates', vocabbase))])

    def dupe_filter(o, r, t, a):
        # Filter out ISBN relationships
        return (r, t) != (VTYPE_REL, I(iri.absolutize('Instance', vocabbase))) \
            and r not in skip_relationships

    if len(instanceids) > 1:
        base_instance_id = instanceids[0]
        for instanceid in instanceids[1:]:
            duplicate_statements(model, base_instance_id, instanceid, rfilter=dupe_filter)
    return
def _res(ctx):
    _arg = arg(ctx) if callable(arg) else arg
    _arg = [_arg] if not isinstance(_arg, list) else _arg
    ret = []
    for u in _arg:
        iu = u
        if not (ignore_refs and not iri.is_absolute(iu)):
            # Coerce into an IRIref, but fall out as untyped text otherwise
            try:
                iu = I(iu)
            except ValueError as e:
                # Attempt to recover by percent encoding
                try:
                    iu = I(iri.percent_encode(iu))
                except ValueError as e:
                    ctx.extras['logger'].warn('Unable to convert "{}" to IRI reference:\n{}'.format(iu, e))

            if base is not None and isinstance(iu, I):
                iu = I(iri.absolutize(iu, base))

        ret.append(iu)
    return ret
def resource_id(etype, unique=None, idgen=default_idgen(None), vocabbase=None):
    '''
    Very low level routine for generating an ID value using the hash algorithm outlined by
    the Libhub initiative for BIBFRAME Lite (Libhub Resource Hash Convention).
    https://github.com/zepheira/pybibframe/wiki/From-Records-to-Resources:-the-Library.Link-resource-ID-generation-algorithm
    Takes the entity (resource) type and an ordered data mapping.

    etype - type IRI for the new entity
    unique - list of key/value tuples of data to use in generating its unique ID, or None,
             in which case one is just randomly generated
    vocabbase - for convenience, if provided, used to resolve relative etype & data keys

    >>> from bibframe.util import resource_id
    >>> resource_id("http://schema.org/Person", [("http://schema.org/name", "Jonathan Bruce Postel"), ("http://schema.org/birthDate", "1943-08-06")])
    '-7hP9d_Xo8M'
    >>> resource_id("http://schema.org/Person", [("http://schema.org/name", "Augusta Ada King")])
    'xjgOrUFiw_o'
    '''
    params = {}
    # XXX: Use proper URI normalization? Have a philosophical discussion with Mark about this :)
    if vocabbase:
        etype = vocabbase + etype

    unique_computed = []
    for k, v in unique:
        if vocabbase:
            # XXX OK, absolutize used here. Go figure
            k = k if iri.is_absolute(k) else iri.absolutize(k, vocabbase)
        unique_computed.append((k, v))

    if unique_computed:
        unique_computed.insert(0, [VTYPE_REL, etype])
        plaintext = json.dumps(unique_computed, separators=(',', ':'))
        eid = idgen.send(plaintext)
    else:
        # We only have a type; no other distinguishing data. Generate a random hash
        eid = next(idgen)

    return eid
def do_parse(elem, resource, vocab=None, prop=None, prefixes=None):
    prefixes = prefixes or DEFAULT_PREFIXES.copy()
    vocab = elem.xml_attributes.get('vocab', vocab)
    #element_satisfied = False
    if vocab:
        prefix = elem.xml_attributes.get('prefix')
        if prefix:
            #logging.debug('{}'.format(prefix))
            prefix_bits = prefix.split()
            # a, b = tee(prefix.split())
            # next(b, None)
            # for p, ns in zip(a, b):
            #     p = p.strip().strip(':')
            #     ns = ns.strip()
            #     print((p, ns))
            #     #print(p, ns)
            #     prefixes[p] = ns
            for i, j in zip(range(0, len(prefix_bits), 2), range(1, len(prefix_bits), 2)):
                p = prefix_bits[i].strip().strip(':')
                ns = prefix_bits[j].strip()
                #print(p, ns)
                prefixes[p] = ns

        new_resource = elem.xml_attributes.get('resource')
        if new_resource:
            try:
                resource = new_resource = I(iri.absolutize(new_resource, source_uri))
            except ValueError:
                warnings.warn('Invalid URL or anchor {} found in {}. Ignored.'.format(new_resource, source_uri))
                new_resource = None

        typeof_list = elem.xml_attributes.get('typeof')
        if typeof_list:
            if not new_resource:
                new_resource = mock_bnode('')
            for typeof in typeof_list.split():
                try:
                    typeof = I(iri.absolutize(typeof, vocab))
                except ValueError:
                    warnings.warn('Invalid URL or anchor {} found in {}. Ignored'.format(typeof, source_uri))
                statement_sink.send((new_resource or resource, RDF_NS + 'type', typeof))

        new_prop_list = elem.xml_attributes.get('property')
        new_value = None
        if new_prop_list:
            if new_resource:
                new_value = new_resource
            for new_prop in new_prop_list.split():
                if new_prop == 'about':
                    continue
                elif ':' in new_prop:
                    p, local = new_prop.split(':', 1)
                    if not p in prefixes:
                        #FIXME: Silent error for now
                        continue
                    try:
                        prop = I(iri.absolutize(local, prefixes[p]))
                    except ValueError:
                        warnings.warn('Invalid URL or anchor {} found in {}. Ignored'.format(local, source_uri))
                        continue
                else:
                    try:
                        prop = I(iri.absolutize(new_prop, vocab))
                    except ValueError:
                        warnings.warn('Invalid URL or anchor {} found in {}. Ignored'.format(new_prop, source_uri))
                        continue
                href_res = elem.xml_attributes.get('href')
                if href_res:
                    try:
                        href_res = I(href_res)
                    except ValueError:
                        warnings.warn('Invalid URL or anchor {} found in {}. Ignored'.format(href_res, source_uri))
                        continue
                href_src = elem.xml_attributes.get('src')
                if href_src:
                    try:
                        href_src = I(href_src)
                    except ValueError:
                        warnings.warn('Invalid URL or anchor {} found in {}. Ignored'.format(href_src, source_uri))
                        continue
                value = new_value or elem.xml_attributes.get('content') or href_res or href_src or elem.xml_value
                statement_sink.send((resource, prop, value))
                #logging.debug('{}'.format((resource, prop, value)))
                #element_satisfied = True
        if new_value:
            resource = new_value
    for child in elem.xml_children:
        if isinstance(child, element):
            do_parse(child, resource, vocab=vocab, prop=prop, prefixes=prefixes)
#LINKROLES = {0: link.origin, 1: link.relationship, 2: link.target, 3: link.attributes}

def init_localization():
    '''prepare l10n'''
    locale.setlocale(locale.LC_ALL, '')  # User's preferred locale, according to environment
    # Use first two characters of country code, defaulting to 'en' in the absence of a preference
    loc = locale.getlocale()
    lang = loc[0][0:2] if loc[0] else 'en'
    filename = "res/messages_%s.mo" % lang
    try:
        logging.debug("Opening message file %s for locale %s", filename, loc[0])
        trans = gettext.GNUTranslations(open(filename, "rb"))
    except IOError:
        logging.debug("Locale not found. Using default messages")
        trans = gettext.NullTranslations()
    trans.install()

# Intentionally after the localization setup
from versa.iriref import iriref as I

VERSA_BASEIRI = I('http://bibfra.me/purl/versa/')

# Very common Versa-specific types. Analogous to rdf:type & rdfs:label
VTYPE_REL = I(iri.absolutize('type', VERSA_BASEIRI))
VLABEL_REL = I(iri.absolutize('label', VERSA_BASEIRI))
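# Illustrative check (not from the original module): because VERSA_BASEIRI ends with '/',
# absolutizing the short names simply appends them, so the constants above resolve as shown.
def _versa_rel_demo():
    assert VTYPE_REL == I('http://bibfra.me/purl/versa/type')
    assert VLABEL_REL == I('http://bibfra.me/purl/versa/label')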
def _materialize(ctx):
    '''
    Inserts at least two main links in the context's output_model, one or more for the
    relationship from the origin to the materialized resource, one for the type of the
    materialized resource, and links according to the links parameter

    :param ctx: Runtime Versa context used in processing (e.g. includes the prototype link)
    :return: None

    This function is intricate in its use and shifting of Versa context, but the intricacies
    are all designed to make the marcpatterns mini language more natural.
    '''
    # FIXME: Part of the datachef sorting out
    if not ctx.idgen:
        ctx.idgen = idgen
    if debug is None:
        def log_debug(msg):
            return
    elif not hasattr(debug, 'write'):
        raise TypeError('debug argument to materialize must be file-like object or None')
    else:
        def log_debug(msg):
            print(msg, file=debug)

    # Set up variables to be made available in any derived contexts
    vars_items = list((vars or {}).items())
    if vars_items:
        # First make sure we're not tainting the passed-in context
        ctx = ctx.copy(variables=ctx.variables.copy())
        for k, v in vars_items:
            if None in (k, v):
                continue
            #v = v if isinstance(v, list) else [v]
            v = v(ctx) if is_pipeline_action(v) else v
            if v:
                v = v[0] if isinstance(v, list) else v
                ctx.variables[k] = v

    (o, r, t, a) = ctx.current_link
    if isinstance(typ, COPY):
        object_copy = typ
        object_copy.id = o
        _typ = next(util.resourcetypes(ctx.input_model, o), None)
        object_copy.links = []
        for stmt in ctx.input_model.match(o):
            if object_copy.rels is None or stmt[RELATIONSHIP] in typ.rels:
                # FIXME: Attributes?
                object_copy.links.append((stmt[RELATIONSHIP], stmt[TARGET]))
    else:
        _typ = typ(ctx) if is_pipeline_action(typ) else typ
        object_copy = None
    _fprint = fprint(ctx) if is_pipeline_action(fprint) else fprint
    # FIXME: On redesign implement split using function composition instead
    targets = [sub_t.strip() for sub_t in t.split(split) if sub_t.strip()] if split else [t]

    # If the rel in the incoming context is null and there is no rel passed in, nothing to attach
    # Especially useful signal in a pipeline's fingerprinting stage
    attach_ = False if rel is None and r is None else attach

    if '@added-links' not in ctx.extras:
        ctx.extras['@added-links'] = set()

    # Make sure we end up with a list or None
    rels = rel if isinstance(rel, list) else ([rel] if rel else [r])
    log_debug(f'materialize action. Type: {_typ}. Anchoring rels: {rels} Initial context current link: {ctx.current_link}')
    log_debug(f'Variables (including from vars= arg): {ctx.variables}')

    objids = []

    # Botanical analogy: stem context is from the caller (e.g. connection point of newly materialized resource)
    # vein contexts derive from the stem
    for target in targets:
        ctx_stem = ctx.copy(current_link=(ctx.current_link[ORIGIN], ctx.current_link[RELATIONSHIP], target, ctx.current_link[ATTRIBUTES]))
        if origin:
            # Have been given enough info to derive the origin from context. Ignore origin in current link
            o = origin(ctx_stem)
        if not o:
            # Defensive coding
            continue

        computed_fprint = [] if _fprint else None
        rtypes = set([_typ])
        if _fprint:
            # Strip None values from computed unique list, including pairs where v is None
            for k, v in _fprint:
                if None in (k, v):
                    continue
                for subitem in (v if isinstance(v, list) else [v]):
                    subval = subitem(ctx_stem) if is_pipeline_action(subitem) else subitem
                    if subval:
                        subval = subval if isinstance(subval, list) else [subval]
                        if k == VTYPE_REL:
                            rtypes.update(set(subval))
                        computed_fprint.extend([(k, s) for s in subval])

        log_debug(f'Provided fingerprinting info: {computed_fprint}')

        if object_copy:
            objid = object_copy.id
        else:
            objid = materialize_entity(ctx_stem, _typ, fprint=computed_fprint)
        objids.append(objid)
        log_debug(f'Newly materialized object: {objid}')
        # rels = [ ('_' + curr_rel if curr_rel.isdigit() else curr_rel) for curr_rel in rels if curr_rel ]
        computed_rels = []
        for curr_relobj in rels:
            # e.g. scenario if passed in rel=ifexists(...)
            curr_rels = curr_relobj(ctx_stem) if is_pipeline_action(curr_relobj) else curr_relobj
            curr_rels = curr_rels if isinstance(curr_rels, list) else [curr_rels]
            for curr_rel in curr_rels:
                if not curr_rel:
                    continue
                # FIXME: Fix properly, by slugifying & making sure slugify handles all numeric case (prepend '_')
                curr_rel = '_' + curr_rel if curr_rel.isdigit() else curr_rel
                if attach_:
                    _smart_add(ctx_stem.output_model, I(o), I(iri.absolutize(curr_rel, ctx_stem.base)), I(objid), (), ctx.extras['@added-links'])
                computed_rels.append(curr_rel)

        # print((objid, ctx_.existing_ids))
        # XXX: Means links are only processed on new objects! This needs some thought
        if objid not in ctx_stem.existing_ids:
            if _typ:
                _smart_add(ctx_stem.output_model, I(objid), VTYPE_REL, I(iri.absolutize(_typ, ctx_stem.base)), (), ctx.extras['@added-links'])
            if preserve_fprint:
                # Consolidate types
                computed_fprint = [(k, v) for (k, v) in computed_fprint if k != VTYPE_REL]
                # computed_fprint +=
                attrs = tuple(computed_fprint + [(VTYPE_REL, r) for r in rtypes])
                _smart_add(ctx_stem.output_model, I(objid), VFPRINT_REL, _typ, attrs, ctx.extras['@added-links'])

            # XXX: Use Nones to mark blanks, or should Versa define some sort of null resource?
            all_links = object_copy.links + links if object_copy else links
            for l in all_links:
                if len(l) == 2:
                    lo = I(objid)
                    lr, lt = l
                elif len(l) == 3:
                    lo, lr, lt = l
                # This context is in effect

                # First of all, hold on to the inbound origin so that it can be accessed in embedded actions
                vein_vars = ctx_stem.variables.copy()
                vein_vars['@stem'] = ctx_stem.current_link[ORIGIN]

                # Newly materialized resource is the origin. The overall context target for embedded actions
                ctx_vein = ctx_stem.copy(current_link=(objid, ctx_stem.current_link[RELATIONSHIP], ctx_stem.current_link[TARGET], ctx_stem.current_link[ATTRIBUTES]), variables=vein_vars)

                lo = lo or ctx_vein.current_link[ORIGIN]
                lr = lr or ctx_vein.current_link[RELATIONSHIP]
                lt = lt or ctx_vein.current_link[TARGET]

                lo = lo(ctx_vein) if is_pipeline_action(lo) else lo
                lo = lo if isinstance(lo, list) else [lo]
                lr = lr(ctx_vein) if is_pipeline_action(lr) else lr

                # Update lr
                # XXX This needs cleaning up
                ctx_vein = ctx_stem.copy(current_link=(ctx_vein.current_link[ORIGIN], lr, ctx_vein.current_link[TARGET], ctx_stem.current_link[ATTRIBUTES]), variables=vein_vars)

                # If k is a list of contexts use it to dynamically execute functions
                if isinstance(lr, list):
                    if lr and isinstance(lr[0], context):
                        for newctx in lr:
                            # The function in question will generate any needed links in the output model
                            lt(newctx)
                        continue

                # import traceback; traceback.print_stack()  # For looking up the call stack e.g. to debug nested materialize
                # Check that the links key is not None, which is a signal not to
                # generate the item. For example if the key is an ifexists and the
                # test expression result is False, it will come back as None,
                # and we don't want to run the v function
                if lr:
                    lt = lt(ctx_vein) if is_pipeline_action(lt) else lt
                    # If k or v come from pipeline functions as None it signals to skip generating anything else for this link item
                    if lt is not None:
                        # FIXME: Fix properly, by slugifying & making sure slugify handles all-numeric case
                        if lr.isdigit():
                            lr = '_' + lr
                        _lr = I(iri.absolutize(lr, ctx_vein.base))
                        log_debug(f'Generated link: {lo, _lr, lt}')
                        if isinstance(lt, list):
                            for valitems in lt:
                                if valitems:
                                    for loi in lo:
                                        _smart_add(ctx_vein.output_model, loi, _lr, valitems, (), ctx.extras['@added-links'])
                        else:
                            for loi in lo:
                                _smart_add(ctx_vein.output_model, loi, _lr, lt, (), ctx.extras['@added-links'])

            ctx_stem.existing_ids.add(objid)
            for func in ctx.extras.get('@new-entity-hook', []):
                func(objid)

    log_debug(f'End materialize')
    return objids
'''
import os
import json
import itertools
import asyncio

from versa import I, ORIGIN, RELATIONSHIP, TARGET
from versa.util import simple_lookup

from amara3 import iri

from bibframe import BFZ, BFLC, g_services, BF_INIT_TASK, BF_MARCREC_TASK, BF_FINAL_TASK

ISBN_REL = I(iri.absolutize('isbn', BFZ))
TITLE_REL = I(iri.absolutize('title', BFZ))

BFHOST = 'bibfra.me'

# A plug-in is a series of callables, each of which handles a phase of processing
#
# The only phase predefined for all plug-ins is BF_INIT_TASK
#
# One convenient way to organize the plug-in is as a class
# In this case we want to create a separate instance for each full processing event loop
class linkreport(object):
    PLUGIN_ID = 'http://bibfra.me/tool/pybibframe#linkreport'
    '''
    fullprop = kwargs.get('fullprop')
    rid = kwargs.get('rid')
    base = kwargs.get('base', VERSA_BASEIRI)
    model = kwargs.get('model')
    iris = ltext.strip().split()
    for i in iris:
        model.add(rid, fullprop, I(iri.absolutize(i, base)))
    return None


PREP_METHODS = {
    VERSA_BASEIRI + 'text': lambda x, **kwargs: x,
    VERSA_BASEIRI + 'resource': lambda x, base=VERSA_BASEIRI, **kwargs: I(iri.absolutize(x, base)),
    VERSA_BASEIRI + 'resourceset': handle_resourceset,
}


def parse(md, model, encoding='utf-8', config=None):
    """
    Translate the Versa Markdown syntax into Versa model relationships

    md -- markdown source text
    model -- Versa model to take the output relationship
    encoding -- character encoding (defaults to UTF-8)

    Returns: The overall base URI (`@base`) specified in the Markdown file, or None
def parse(md, model, encoding='utf-8', config=None):
    """
    Translate the Versa Markdown syntax into Versa model relationships

    md -- markdown source text
    model -- Versa model to take the output relationship
    encoding -- character encoding (defaults to UTF-8)

    Returns: The overall base URI (`@base`) specified in the Markdown file, or None

    >>> from versa.driver.memory import newmodel
    >>> from versa.serial.literate import parse
    >>> m = newmodel()
    >>> parse(open('test/resource/poetry.md').read(), m)
    'http://uche.ogbuji.net/poems/'
    >>> m.size()
    40
    >>> next(m.match(None, 'http://uche.ogbuji.net/poems/updated', '2013-10-15'))
    (I(http://uche.ogbuji.net/poems/1), I(http://uche.ogbuji.net/poems/updated), '2013-10-15', {})
    """
    # Set up configuration to interpret the conventions for the Markdown
    config = config or {}
    # This mapping takes syntactical elements such as the various header levels in Markdown
    # and associates a resource type with the specified resources
    syntaxtypemap = {}
    if config.get('autotype-h1'):
        syntaxtypemap['h1'] = config.get('autotype-h1')
    if config.get('autotype-h2'):
        syntaxtypemap['h2'] = config.get('autotype-h2')
    if config.get('autotype-h3'):
        syntaxtypemap['h3'] = config.get('autotype-h3')
    interp_stanza = config.get('interpretations', {})
    interpretations = {}

    def setup_interpretations(interp):
        # Map the interpretation IRIs to functions to do the data prep
        for prop, interp_key in interp.items():
            if interp_key.startswith('@'):
                interp_key = iri.absolutize(interp_key[1:], VERSA_BASEIRI)
            if interp_key in PREP_METHODS:
                interpretations[prop] = PREP_METHODS[interp_key]
            else:
                # Just use the identity, i.e. no-op
                interpretations[prop] = lambda x, **kwargs: x

    setup_interpretations(interp_stanza)

    # Prep ID generator, in case needed
    idg = idgen(None)

    # Preprocess the Markdown to deal with IRI-valued property values
    def iri_ref_tool(m):
        body = m.group(1)
        lchar = '<' if iri.matches_uri_ref_syntax(body) else '<'
        return lchar + m.group(1) + '>'

    md = IRIREF_CAND_PAT.sub(iri_ref_tool, md)

    # Parse the Markdown
    # Alternately:
    #from xml.sax.saxutils import escape, unescape
    #h = markdown.markdown(escape(md.decode(encoding)), output_format='html5')
    # Note: even using safe_mode this should not be presumed safe from tainted input
    #h = markdown.markdown(md.decode(encoding), safe_mode='escape', output_format='html5')
    comments = mkdcomments.CommentsExtension()
    h = markdown.markdown(md, safe_mode='escape', output_format='html5', extensions=[comments])

    #doc = html.markup_fragment(inputsource.text(h.encode('utf-8')))
    tb = treebuilder()
    h = '<html>' + h + '</html>'
    root = html5.parse(h)
    #root = tb.parse(h)

    # Each section contains one resource description, but the special one named @docheader
    # contains info to help interpret the rest
    first_h1 = next(select_name(descendants(root), 'h1'))
    #top_section_fields = itertools.takewhile(lambda x: x.xml_name != 'h1', select_name(following_siblings(first_h1), 'h2'))

    # Extract header elements. Notice I use an empty element with an empty parent as the default result
    docheader = next(select_value(select_name(descendants(root), 'h1'), '@docheader'), element('empty', parent=root))  # //h1[.="@docheader"]
    sections = filter(lambda x: x.xml_value != '@docheader', select_name_pattern(descendants(root), HEADER_PAT))  # //h1[not(.="@docheader")]|h2[not(.="@docheader")]|h3[not(.="@docheader")]

    def fields(sect):
        '''
        Each section represents a resource and contains a list with its properties
        This generator parses the list and yields the key value pairs representing the properties
        Some properties have attributes, expressed in markdown as a nested list. If present these
        attributes are yielded as well, else None is yielded
        '''
        #import logging; logging.debug(repr(sect))
        # Pull all the list elements until the next header. This accommodates multiple lists in a section
        try:
            sect_body_items = itertools.takewhile(lambda x: HEADER_PAT.match(x.xml_name) is None, select_elements(following_siblings(sect)))
        except StopIteration:
            return
        #results_until(sect.xml_select('following-sibling::*'), 'self::h1|self::h2|self::h3')
        #field_list = [ U(li) for ul in sect.xml_select('following-sibling::ul') for li in ul.xml_select('./li') ]
        field_list = [li for elem in select_name(sect_body_items, 'ul') for li in select_name(elem, 'li')]

        def parse_li(pair):
            '''
            Parse each list item into a property pair
            '''
            if pair.strip():
                matched = REL_PAT.match(pair)
                if not matched:
                    raise ValueError(_('Syntax error in relationship expression: {0}'.format(pair)))
                if matched.group(3):
                    prop = matched.group(3).strip()
                if matched.group(4):
                    prop = matched.group(4).strip()
                if matched.group(7):
                    val = matched.group(7).strip()
                    typeindic = RES_VAL
                elif matched.group(9):
                    val = matched.group(9).strip()
                    typeindic = TEXT_VAL
                elif matched.group(11):
                    val = matched.group(11).strip()
                    typeindic = TEXT_VAL
                elif matched.group(12):
                    val = matched.group(12).strip()
                    typeindic = UNKNOWN_VAL
                else:
                    val = ''
                    typeindic = UNKNOWN_VAL
                #prop, val = [ part.strip() for part in U(li.xml_select('string(.)')).split(':', 1) ]
                #import logging; logging.debug(repr((prop, val)))
                return prop, val, typeindic
            return None, None, None

        def prep_li(li):
            '''
            Take care of Markdown parsing minutiae. Also, exclude child uls

            * a/href embedded in the li means it was specified as <link_text>.
              Restore the angle brackets as expected by the li parser
            * Similar for cases where e.g. prop: <abc> gets turned into prop: <abc></abc>
            '''
            prepped = ''
            for ch in itertools.takewhile(lambda x: not (isinstance(x, element) and x.xml_name == 'ul'), li.xml_children):
                if isinstance(ch, text):
                    prepped += ch
                elif isinstance(ch, element):
                    if ch.xml_name == 'a':
                        prepped += '<' + ch.xml_value + '>'
                    else:
                        prepped += '<' + ch.xml_name + '>'
            return prepped

        # Go through each list item
        for li in field_list:
            # Is there a nested list, which expresses attributes on a property
            if list(select_name(li, 'ul')):
                #main = ''.join([ node.xml_value
                #        for node in itertools.takewhile(
                #            lambda x: x.xml_name != 'ul', select_elements(li)
                #            )
                #    ])
                main = prep_li(li)
                prop, val, typeindic = parse_li(main)
                subfield_list = [parse_li(prep_li(sli)) for e in select_name(li, 'ul') for sli in (select_name(e, 'li'))]
                subfield_list = [(p, v, t) for (p, v, t) in subfield_list if p is not None]
                # Support a special case for syntax such as in the @iri and @interpretations: stanza of @docheader
                if val is None:
                    val = ''
                yield prop, val, typeindic, subfield_list
            # Just a regular, unadorned property
            else:
                prop, val, typeindic = parse_li(prep_li(li))
                if prop:
                    yield prop, val, typeindic, None

    iris = {}

    # Gather the document-level metadata from the @docheader section
    base = schemabase = rtbase = document_iri = default_lang = None
    for prop, val, typeindic, subfield_list in fields(docheader):
        # The @iri section is where key IRI prefixes can be set
        if prop == '@iri':
            for (k, uri, typeindic) in subfield_list:
                if k == '@base':
                    base = schemabase = rtbase = uri
                # @property is legacy
                elif k == '@schema' or k == '@property':
                    schemabase = uri
                elif k == '@resource-type':
                    rtbase = uri
                else:
                    iris[k] = uri
        # The @interpretations section is where defaults can be set as to the primitive types
        # of values from the Markdown, based on the relevant property/relationship
        elif prop == '@interpretations':
            # Iterate over items from the @docheader/@interpretations section to set up for further parsing
            interp = {}
            for k, v, x in subfield_list:
                interp[I(iri.absolutize(k, schemabase))] = v
            setup_interpretations(interp)
        # Setting an IRI for this very document being parsed
        elif prop == '@document':
            document_iri = val
        elif prop == '@language':
            default_lang = val
        # If we have a resource to which to attach them, just attach all other properties
        elif document_iri or base:
            rid = document_iri or base
            fullprop = I(iri.absolutize(prop, schemabase or base))
            if fullprop in interpretations:
                val = interpretations[fullprop](val, rid=rid, fullprop=fullprop, base=base, model=model)
                if val is not None:
                    model.add(rid, fullprop, val)
            else:
                model.add(rid, fullprop, val)

    # Default IRI prefixes if @iri/@base is set
    if not schemabase:
        schemabase = base
    if not rtbase:
        rtbase = base
    if not document_iri:
        document_iri = base

    # Go through the resources expressed in remaining sections
    for sect in sections:
        #if U(sect) == '@docheader': continue  # Not needed because excluded by ss
        # The header can take one of 4 forms: "ResourceID" "ResourceID [ResourceType]" "[ResourceType]" or "[]"
        # The 3rd form is for an anonymous resource with specified type and the 4th an anonymous resource with unspecified type
        matched = RESOURCE_PAT.match(sect.xml_value)
        if not matched:
            raise ValueError(_('Syntax error in resource header: {0}'.format(sect.xml_value)))
        rid = matched.group(1)
        rtype = matched.group(3)
        if rtype:
            rtype = I(iri.absolutize(rtype, schemabase))
        if rid:
            rid = I(iri.absolutize(rid, base))
        if not rid:
            rid = next(idg)
        # Resource type might be set by syntax config
        if not rtype:
            rtype = syntaxtypemap.get(sect.xml_name)
        if rtype:
            model.add(rid, TYPE_REL, rtype)

        def expand_iri(iri_in, base):
            if iri_in.startswith('@'):
                return I(iri.absolutize(iri_in[1:], VERSA_BASEIRI))
            iri_match = URI_EXPLICIT_PAT.match(iri_in)
            if iri_match:
                return I(iri.absolutize(iri_match.group(1), base))
            iri_match = URI_ABBR_PAT.match(iri_in)
            if iri_match:
                uri = iris[iri_match.group(1)]
                fulliri = URI_ABBR_PAT.sub(uri + '\\2\\3', iri_in)
            else:
                fulliri = I(iri.absolutize(iri_in, base))
            return fulliri

        # Add the property
        for prop, val, typeindic, subfield_list in fields(sect):
            attrs = {}
            for (aprop, aval, atype) in subfield_list or ():
                fullaprop = expand_iri(aprop, schemabase)
                if atype == RES_VAL:
                    val = expand_iri(aval, rtbase)
                    valmatch = URI_ABBR_PAT.match(aval)
                    if valmatch:
                        uri = iris[valmatch.group(1)]
                        attrs[fullaprop] = URI_ABBR_PAT.sub(uri + '\\2\\3', aval)
                    else:
                        attrs[fullaprop] = I(iri.absolutize(aval, rtbase))
                elif atype == TEXT_VAL:
                    attrs[fullaprop] = aval
                elif atype == UNKNOWN_VAL:
                    val_iri_match = URI_EXPLICIT_PAT.match(aval)
                    if val_iri_match:
                        aval = expand_iri(aval, rtbase)
                    elif fullaprop in interpretations:
                        aval = interpretations[fullaprop](aval, rid=rid, fullprop=fullaprop, base=base, model=model)
                    if aval is not None:
                        attrs[fullaprop] = aval

            fullprop = expand_iri(prop, schemabase)
            if typeindic == RES_VAL:
                val = expand_iri(val, rtbase)
                model.add(rid, fullprop, val, attrs)
            elif typeindic == TEXT_VAL:
                if '@lang' not in attrs:
                    attrs['@lang'] = default_lang
                model.add(rid, fullprop, val, attrs)
            elif typeindic == UNKNOWN_VAL:
                val_iri_match = URI_EXPLICIT_PAT.match(val)
                if val_iri_match:
                    val = expand_iri(val, rtbase)
                elif fullprop in interpretations:
                    val = interpretations[fullprop](val, rid=rid, fullprop=fullprop, base=base, model=model)
                if val is not None:
                    model.add(rid, fullprop, val, attrs)

            #resinfo = AB_RESOURCE_PAT.match(val)
            #if resinfo:
            #    val = resinfo.group(1)
            #    valtype = resinfo.group(3)
            #    if not val: val = model.generate_resource()
            #    if valtype: attrs[TYPE_REL] = valtype

    return document_iri
def bind(models, context=None, ignore_oftypes=None, logger=logging):
    if not isinstance(models, list):
        models = [models]
    vocab = context.get('@vocab')
    non_top_ids = set()
    obj_pool = {}  # Mapping from resource id to object and list of referring ids
    used_objects = set()  # Track multiple instances of docs to prevent data structure recursion
    #typed_origins = set()
    for m in models:
        # Everything with a type
        for origin in all_origins(m):
            typ = next(lookup(m, origin, RDF_TYPE), None)
            #if p == VERSA_TYPE: p = RDF_TYPE
            obj, referents = obj_pool.setdefault(origin, ({}, []))
            if vocab and typ:
                typ_rel = iri.relativize(typ, vocab)
                if typ_rel:
                    typ = typ_rel
            if typ:
                obj['@type'] = typ
            if not origin.startswith('__VERSABLANKNODE__'):
                obj['@id'] = origin
            for o, r, t, a in m.match(origin):
                if r == RDF_TYPE:
                    continue
                if isinstance(t, I) and o != t:
                    if vocab:
                        t_rel = iri.relativize(t, vocab)
                        if t_rel:
                            t = t_rel
                    valobj, referents = obj_pool.setdefault(t, ({}, []))
                    if t in used_objects:
                        val = t
                    else:
                        val = valobj
                        if not t.startswith('__VERSABLANKNODE__') and '@id' not in val:
                            val['@id'] = t
                        used_objects.add(t)
                        non_top_ids.add(t)  # If something has an object as a value it does not appear at the top
                    referents.append(o)
                else:
                    val = t
                if vocab:
                    r_rel = iri.relativize(r, vocab)
                    if r_rel:
                        r = r_rel
                if r in obj and isinstance(obj[r], list):
                    obj[r].append(val)
                elif r in obj:
                    obj[r] = [obj[r], val]
                else:
                    obj[r] = val

    # Eliminate objects of types to be ignored
    to_remove = []
    for (oid, (obj, referents)) in obj_pool.items():
        typ = obj.get('@type')
        if vocab and typ:
            typ = iri.absolutize(typ, vocab)
        if typ in ignore_oftypes:
            to_remove.append(oid)
            for ref in referents:
                refobj, _ = obj_pool[ref]
                for k in list(refobj.keys()):
                    v = refobj[k]
                    if isinstance(v, list) and obj in v:
                        v.remove(obj)
                        if len(v) == 1:
                            refobj[k] = v[0]
                    elif v == obj:
                        del refobj[k]
    for k in to_remove:
        del obj_pool[k]

    # Handle @id only
    for (oid, (obj, referents)) in obj_pool.items():
        for k, v in obj.items():
            if len(v) == 1 and '@id' in v:
                obj[k] = v['@id']

    top_objs = [obj for (k, (obj, refs)) in obj_pool.items() if k not in non_top_ids]

    # Eliminate stranded top-level objects with no more than type
    to_remove = []
    #for ix, obj in enumerate(top_objs):
    for obj in top_objs:
        if len(obj) == 1 and '@type' in obj:
            to_remove.append(obj)
    for obj in to_remove:
        top_objs.remove(obj)
    #import pprint;pprint.pprint(top_objs)
    if context and context.get('@output', True):
        return {'@context': context, '@graph': top_objs}
    else:
        return top_objs
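# A minimal usage sketch for bind(), not from the original source: build a small Versa model
# and serialize it to a JSON-LD-style structure. Assumes the versa memory driver (as used in
# the parse() doctest elsewhere in this collection) and that the full rdf:type IRI below
# matches the RDF_TYPE constant bind() checks against; resource IRIs and the vocabulary are
# illustrative only.
from versa.driver.memory import newmodel
from versa import I

def _bind_demo():
    m = newmodel()
    m.add(I('http://example.org/book/1'), I('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), I('http://schema.org/Book'))
    m.add(I('http://example.org/book/1'), I('http://schema.org/name'), 'The Catcher in the Rye')
    ctx = {'@vocab': 'http://schema.org/'}
    return bind(m, context=ctx, ignore_oftypes=[])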
SIMPLE_BOOK = {
    'id': 'http://example.org/book/catcher-in-the-rye',
    'title': 'The Catcher in the Rye',
    'type': 'http://ogp.me/ns/books#books.book',
    'link': 'https://en.wikipedia.org/wiki/The_Catcher_in_the_Rye',
    'author': 'J.D. Salinger',
    'cover': 'http://example.org/book/catcher-in-the-rye-book-cover.jpg',
}

BOOK_TYPE = 'http://schema.org/Book'
SCH = SCHEMA_ORG = 'http://schema.org/'
EXAMPLE_ORG = 'http://example.org/'
BOOK_ID = 'http://example.org/book/catcher-in-the-rye'
SCHEMA_NAME = I(iri.absolutize('name', SCHEMA_ORG))
SCHEMA_AUTHOR = I(iri.absolutize('author', SCHEMA_ORG))
XXX_WROTE = 'http://example.org/wrote'

BOOK_CASES = []

transforms = {
    'id': ignore(),
    'title': link(rel=SCH + 'name'),
    'author': materialize(SCH + 'Person',
                          rel=SCH + 'author',
                          unique=[(SCH + 'name', target())],
                          links=[(SCH + 'name', target())]),
import re
import os
import logging
import itertools

#from rdflib import Graph, BNode, Namespace
from rdflib import URIRef, Literal, RDF, RDFS

from amara3 import iri

from versa import I, VERSA_BASEIRI, ORIGIN, RELATIONSHIP, TARGET
from bibframe import BFZ, BFLC

VTYPE_REL = I(iri.absolutize('type', VERSA_BASEIRI))
VLABEL_REL = I(iri.absolutize('label', VERSA_BASEIRI))

WORKCLASS = iri.absolutize('Work', BFZ)
INSTANCECLASS = iri.absolutize('Instance', BFZ)
INSTANCEREL = iri.absolutize('hasInstance', BFZ)

PROP_MAP = {
    VTYPE_REL: RDF.type,
    VLABEL_REL: RDFS.label,
}

def prep(stmt):
    '''
    Prepare a statement into a triple ready for rdflib
def process_marcpatterns(params, transforms, input_model, phase_target):
    output_model = params['output_model']
    if phase_target == BOOTSTRAP_PHASE:
        input_model_iter = params['input_model']
    else:
        # Need to sort our way through the input model so that the materializations occur
        # at the same place each time, otherwise canonicalization fails due to the
        # addition of the subfield context (at the end of materialize())
        # XXX Is the int() cast necessary? If not we could do key=operator.itemgetter(0)
        input_model_iter = sorted(list(params['input_model']), key=lambda x: int(x[0]))
    params['to_postprocess'] = []
    for lid, marc_link in input_model_iter:
        origin, taglink, val, attribs = marc_link
        origin = params.get('default-origin', origin)
        #params['logger'].debug('PHASE {} ORIGIN: {}\n'.format(phase_target, origin))
        if taglink == MARCXML_NS + '/leader':
            params['leader'] = leader = val
            continue
        #Sort out attributes
        params['indicators'] = indicators = {k: v for k, v in attribs.items() if k.startswith('ind')}
        params['subfields'] = curr_subfields = subfields(attribs)
        curr_subfields_keys = [tup[0] for tup in curr_subfields]
        if taglink.startswith(MARCXML_NS + '/extra/') or 'tag' not in attribs:
            continue
        params['code'] = tag = attribs['tag']
        if taglink.startswith(MARCXML_NS + '/control'):
            #No indicators on control fields. Turn them off, in effect
            indicator_list = ('#', '#')
            key = 'tag-' + tag
            if tag == '006':
                params['fields006'].append(val)
            if tag == '007':
                params['fields007'].append(val)
            if tag == '008':
                params['field008'] = val
            if phase_target != BOOTSTRAP_PHASE:
                params['transform_log'].append((tag, key))
                params['fields_used'].append((tag,))
        elif taglink.startswith(MARCXML_NS + '/data'):
            indicator_list = ((attribs.get('ind1') or ' ')[0].replace(' ', '#'),
                              (attribs.get('ind2') or ' ')[0].replace(' ', '#'))
            key = 'tag-' + tag
            #logger.debug('indicators: ', repr(indicators))
            #indicator_list = (indicators['ind1'], indicators['ind2'])
            if phase_target != BOOTSTRAP_PHASE:
                params['fields_used'].append(tuple([tag] + curr_subfields_keys))

        #This is where we check each incoming MARC link to see if it matches a transform
        #into an output link (e.g. renaming 001 to 'controlCode')
        to_process = []
        #Start with most specific matches, then to most general
        # "?" syntax in lookups is a single char wildcard
        #First with subfields, with & without indicators:
        for k, v in curr_subfields:
            #if indicator_list == ('#', '#'):
            lookups = [
                '{0}-{1}{2}${3}'.format(tag, indicator_list[0], indicator_list[1], k),
                '{0}-?{2}${3}'.format(tag, indicator_list[0], indicator_list[1], k),
                '{0}-{1}?${3}'.format(tag, indicator_list[0], indicator_list[1], k),
                '{0}${1}'.format(tag, k),
            ]
            for lookup in lookups:
                if lookup in transforms:
                    to_process.append((transforms[lookup], v, lookup))
                else:
                    # don't report on subfields for which a code-transform exists,
                    # disregard wildcards
                    if phase_target != BOOTSTRAP_PHASE and not tag in transforms and '?' not in lookup:
                        params['dropped_codes'].setdefault(lookup, 0)
                        params['dropped_codes'][lookup] += 1

        #Now just the tag, with & without indicators
        lookups = [
            '{0}-{1}{2}'.format(tag, indicator_list[0], indicator_list[1]),
            '{0}-?{2}'.format(tag, indicator_list[0], indicator_list[1]),
            '{0}-{1}?'.format(tag, indicator_list[0], indicator_list[1]),
            tag,
        ]

        #Remember how many lookups were successful based on subfields
        subfields_results_len = len(to_process)
        for lookup in lookups:
            if lookup in transforms:
                to_process.append((transforms[lookup], val, lookup))
        if phase_target != BOOTSTRAP_PHASE and subfields_results_len == len(to_process) and not curr_subfields:
            # Count as dropped if subfields were not processed and there were no matches on non-subfield lookups
            params['dropped_codes'].setdefault(tag, 0)
            params['dropped_codes'][tag] += 1

        mat_ent = functools.partial(materialize_entity, ctx_params=params, loop=params['loop'])

        #Apply all the handlers that were found
        for funcinfo, val, lookup in to_process:
            #Support multiple actions per lookup
            funcs = funcinfo if isinstance(funcinfo, tuple) else (funcinfo,)
            for func in funcs:
                extras = {
                    'origins': params['origins'],
                    'match-spec': lookup,
                    'indicators': indicators,
                    'logger': params['logger'],
                    'lookups': params['lookups'],
                    'postprocessing': [],
                    'inputns': MARC,
                    'abort-signal': False,
                }
                #Build Versa processing context
                #Should we include indicators?
                #Should we be passing in taglink rather than tag?
                ctx = bfcontext((origin, tag, val, attribs), input_model, output_model,
                                extras=extras, base=params['vocabbase'], idgen=mat_ent,
                                existing_ids=params['existing_ids'])
                func(ctx)
                params['to_postprocess'].extend(ctx.extras['postprocessing'])
                if ctx.extras['abort-signal']:
                    return False

        if phase_target != BOOTSTRAP_PHASE and not to_process:
            #Nothing else has handled this data field; go to the fallback
            fallback_rel_base = '../marcext/tag-' + tag
            if not curr_subfields:
                #Fallback for control field: Captures MARC tag & value
                output_model.add(I(origin),
                                 I(iri.absolutize(fallback_rel_base, params['vocabbase'])),
                                 val)
            for k, v in curr_subfields:
                #Fallback for data field: Captures MARC tag, indicators, subfields & value
                fallback_rel = '../marcext/{0}-{1}{2}-{3}'.format(
                    fallback_rel_base, indicator_list[0].replace('#', 'X'),
                    indicator_list[1].replace('#', 'X'), k)
                #params['transform_log'].append((code, fallback_rel))
                try:
                    output_model.add(I(origin),
                                     I(iri.absolutize(fallback_rel, params['vocabbase'])),
                                     v)
                except ValueError as e:
                    control_code = list(marc_lookup(input_model, '001')) or ['NO 001 CONTROL CODE']
                    dumb_title = list(marc_lookup(input_model, '245$a')) or ['NO 245$a TITLE']
                    params['logger'].warning('{}\nSkipping statement for {}: "{}"'.format(
                        e, control_code[0], dumb_title[0]))

    #For now do not run special transforms if in a custom phase
    #XXX: Needs discussion
    if phase_target in (BOOTSTRAP_PHASE, DEFAULT_MAIN_PHASE):
        #params['logger'].debug('PHASE {}\n'.format(phase_target))
        extra_stmts = set()  # prevent duplicate statements
        special_transforms = params['transforms'].specials
        for origin, k, v in itertools.chain(
                special_transforms.process_leader(params),
                special_transforms.process_006(params['fields006'], params),
                special_transforms.process_007(params['fields007'], params),
                special_transforms.process_008(params['field008'], params)):
            v = v if isinstance(v, tuple) else (v,)
            for item in v:
                o = origin or I(params['default-origin'])
                if o and (o, k, item) not in extra_stmts:
                    output_model.add(o, k, item)
                    extra_stmts.add((o, k, item))
    return True
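For reference, the candidate lookup keys are built from tag, indicators, and subfield code, from most to least specific. The snippet below simply evaluates the same format strings for one hypothetical data field (MARC 245 with indicators '1' and '0', subfield $a) to show what ends up being probed against the `transforms` mapping.

```python
# Illustrative only: reproduces the candidate-key construction from the format strings above.
tag, ind1, ind2, code = '245', '1', '0', 'a'

subfield_lookups = [
    '{0}-{1}{2}${3}'.format(tag, ind1, ind2, code),  # '245-10$a'  (tag + both indicators + subfield)
    '{0}-?{2}${3}'.format(tag, ind1, ind2, code),    # '245-?0$a'  (wildcard first indicator)
    '{0}-{1}?${3}'.format(tag, ind1, ind2, code),    # '245-1?$a'  (wildcard second indicator)
    '{0}${1}'.format(tag, code),                     # '245$a'     (tag + subfield only)
]

tag_lookups = [
    '{0}-{1}{2}'.format(tag, ind1, ind2),            # '245-10'
    '{0}-?{2}'.format(tag, ind1, ind2),              # '245-?0'
    '{0}-{1}?'.format(tag, ind1, ind2),              # '245-1?'
    tag,                                             # '245'
]

print(subfield_lookups + tag_lookups)
```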
def isbn_instancegen(params, loop, model):
    '''
    Default handling of the idea of splitting a MARC record with FRBR Work info
    as well as instances signalled by ISBNs

    According to Vicki, Instances can be signalled by 007, 020 or 3XX, but we stick to 020 for now
    '''
    #Handle ISBNs re: https://foundry.zepheira.com/issues/1976
    entbase = params['entbase']
    output_model = params['output_model']
    input_model = params['input_model']
    vocabbase = params['vocabbase']
    logger = params['logger']
    materialize_entity = params['materialize_entity']
    existing_ids = params['existing_ids']
    workid = params['default-origin']
    ids = params['ids']
    plugins = params['plugins']

    INSTANTIATES_REL = I(iri.absolutize('instantiates', vocabbase))
    isbns = list((val for code, val in marc_lookup(input_model, '020$a')))
    logger.debug('Raw ISBNS:\t{0}'.format(isbns))

    # sorted to remove non-determinism which interferes with canonicalization
    normalized_isbns = sorted(list(isbn_list(isbns, logger=logger)))

    subscript = ord('a')
    instance_ids = []
    logger.debug('Normalized ISBN:\t{0}'.format(normalized_isbns))
    if normalized_isbns:
        for inum, itype in normalized_isbns:
            ean13 = compute_ean13_check(inum)
            data = [['instantiates', workid], [ISBNNS + 'isbn', ean13]]
            instanceid = materialize_entity('Instance', ctx_params=params,
                                            model_to_update=output_model, data=data, loop=loop)
            if entbase:
                instanceid = I(iri.absolutize(instanceid, entbase))
            output_model.add(I(instanceid), ISBN_REL, ean13)
            output_model.add(I(instanceid), INSTANTIATES_REL, I(workid))
            if itype:
                output_model.add(I(instanceid), ISBN_VTYPE_REL, itype)
            existing_ids.add(instanceid)
            instance_ids.append(instanceid)
    else:
        #If there are no ISBNs, we'll generate a default Instance
        data = [['instantiates', workid]]
        instanceid = materialize_entity('Instance', ctx_params=params,
                                        model_to_update=output_model, data=data, loop=loop)
        instanceid = I(iri.absolutize(instanceid, entbase)) if entbase else I(instanceid)
        output_model.add(I(instanceid), INSTANTIATES_REL, I(workid))
        existing_ids.add(instanceid)
        instance_ids.append(instanceid)

    #output_model.add(instance_ids[0], I(iri.absolutize('instantiates', vocabbase)), I(workid))
    #output_model.add(I(instance_ids[0]), VTYPE_REL, I(iri.absolutize('Instance', vocabbase)))

    return instance_ids
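`compute_ean13_check` is referenced above but not defined in this excerpt. The sketch below shows the standard EAN-13 check-digit calculation (alternating weights of 1 and 3 over the first twelve digits) as an illustration of what such a helper typically does; the real helper's name, signature, and handling of 10-digit ISBN input may differ.

```python
def ean13_check(twelve_digits):
    '''Given the first 12 digits of an EAN-13 as a string, return the full 13-digit code.'''
    digits = [int(c) for c in twelve_digits]
    assert len(digits) == 12
    # Weights alternate 1, 3, 1, 3, ... across the 12 digits
    total = sum(d * (3 if i % 2 else 1) for i, d in enumerate(digits))
    check = (10 - total % 10) % 10
    return twelve_digits + str(check)

# '9780316769488' is the ISBN-13 for The Catcher in the Rye; the check digit works out to 8
assert ean13_check('978031676948') == '9780316769488'
```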
import itertools
import asyncio
from itertools import tee, zip_longest

from versa import I, VERSA_BASEIRI, ORIGIN, RELATIONSHIP, TARGET
from versa.util import simple_lookup

from amara3 import iri

from bibframe import BFZ, BFLC, g_services, BF_INIT_TASK, BF_MARCREC_TASK, BF_MATRES_TASK, BF_FINAL_TASK

RDF_NAMESPACE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
RDFS_NAMESPACE = 'http://www.w3.org/2000/01/rdf-schema#'

VTYPE_REL = I(iri.absolutize('type', VERSA_BASEIRI))
RDFS_LABEL = RDFS_NAMESPACE + 'label'

def pairwise(iterable):
    a, b = tee(iterable)
    next(b, None)
    return zip_longest(a, b)

#A plug-in is a series of callables, each of which handles a phase of processing
#The only phase predefined for all plug-ins is BF_INIT_TASK
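A quick check of what `pairwise` yields: because it uses `zip_longest` rather than `zip`, the final item is paired with `None` instead of being dropped.

```python
print(list(pairwise(['100', '245', '260', '300'])))
# [('100', '245'), ('245', '260'), ('260', '300'), ('300', None)]
```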
def record_handler(loop, model, entbase=None, vocabbase=BL, limiting=None, plugins=None,
                   ids=None, postprocess=None, out=None, logger=logging, transforms=TRANSFORMS,
                   special_transforms=unused_flag, canonical=False,
                   model_factory=memory.connection, lookups=None, **kwargs):
    '''
    loop - asyncio event loop
    model - the Versa model for the record
    entbase - base IRI used for IDs of generated entity resources
    limiting - mutable pair of [count, limit] used to control the number of records processed
    '''
    #Deprecated legacy API support
    if isinstance(transforms, dict) or special_transforms is not unused_flag:
        warnings.warn('Please switch to using bibframe.transforms_set', PendingDeprecationWarning)
        special_transforms = special_transforms or default_special_transforms()
        transforms = transform_set(transforms)
        transforms.specials = special_transforms

    _final_tasks = set()  #Tasks for the event loop contributing to the MARC processing

    plugins = plugins or []
    if ids is None:
        ids = idgen(entbase)

    #FIXME: For now always generate instances from ISBNs, but consider working this through the plugins system
    instancegen = isbn_instancegen

    existing_ids = set()

    #Start the process of writing out the JSON representation of the resulting Versa
    if out and not canonical:
        out.write('[')
    first_record = True

    try:
        while True:
            input_model = yield
            leader = None
            #Add work item record, with actual hash resource IDs based on default or plugged-in algo
            #FIXME: No plug-in support yet
            params = {
                'input_model': input_model,
                'output_model': model,
                'logger': logger,
                'entbase': entbase,
                'vocabbase': vocabbase,
                'ids': ids,
                'existing_ids': existing_ids,
                'plugins': plugins,
                'transforms': transforms,
                'materialize_entity': materialize_entity,
                'leader': leader,
                'lookups': lookups or {},
                'loop': loop,
            }

            # Earliest plugin stage, with an unadulterated input model
            for plugin in plugins:
                if BF_INPUT_TASK in plugin:
                    yield from plugin[BF_INPUT_TASK](loop, input_model, params)

            #Prepare cross-references (i.e. 880s)
            #See the "$6 - Linkage" section of https://www.loc.gov/marc/bibliographic/ecbdcntf.html
            #XXX: Figure out a way to declare in TRANSFORMS? We might have to deal with non-standard relationship designators: https://github.com/lcnetdev/marc2bibframe/issues/83
            xrefs = {}
            remove_links = set()
            add_links = []
            xref_link_tag_workaround = {}
            for lid, marc_link in input_model:
                origin, taglink, val, attribs = marc_link
                if taglink == MARCXML_NS + '/leader' or taglink.startswith(MARCXML_NS + '/data/9'):
                    #900 fields are local and might not follow the general xref rules
                    params['leader'] = leader = val
                    continue
                #XXX Do other fields with a 9 digit (not just 9XX) also need to be skipped?
                if taglink.startswith(MARCXML_NS + '/extra/') or 'tag' not in attribs:
                    continue
                this_tag = attribs['tag']
                #if this_tag == '100': import pdb; pdb.set_trace()
                for xref in attribs.get('6', []):
                    matched = LINKAGE_PAT.match(xref)
                    this_taglink, this_occ, this_scriptid, this_rtl = matched.groups() if matched else (None, None, None, None)
                    if not this_taglink and this_occ:
                        control_code = list(marc_lookup(input_model, '001')) or ['NO 001 CONTROL CODE']
                        dumb_title = list(marc_lookup(input_model, '245$a')) or ['NO 245$a TITLE']
                        logger.warning('Skipping invalid $6: "{}" for {}: "{}"'.format(
                            xref, control_code[0], dumb_title[0]))
                        continue
                    if this_tag == this_taglink:
                        #Pretty sure this is an erroneous self-link, but we've seen this in the wild (e.g. QNL).
                        #Issue warning & do the best we can linking via occurrence
                        #Note: the resulting workaround (lookup table from occurrence code to the correct tag)
                        #will not work in cases of linking from any tag higher in ordinal value than 880
                        #(if such a situation is even possible)
                        logger.warning('Invalid input: erroneous self-link $6: "{}" from "{}". Trying to work around.'.format(xref, this_tag))
                        if this_tag != '880':
                            xref_link_tag_workaround[this_occ] = this_tag
                    #FIXME: Remove this debugging if statement at some point
                    if this_scriptid or this_rtl:
                        logger.debug('Language info specified in subfield 6, {}'.format(xref))
                    #Locate the matching taglink
                    if this_tag == '880' and this_occ == '00':
                        #Special case, no actual xref, used to separate scripts in a record (re Multiscript Records)
                        #FIXME: Not really handled right now. Presume some sort of merge dynamics will need to be implemented
                        attribs['tag'] = this_taglink
                        add_links.append((origin, MARCXML_NS + '/data/' + this_taglink, val, attribs))
                    if xref_link_tag_workaround:
                        if this_tag == '880':
                            this_taglink = xref_link_tag_workaround.get(this_occ)
                    links = input_model.match(None, MARCXML_NS + '/data/' + this_taglink)
                    for that_link in links:
                        #6 is the cross-reference subfield
                        for that_ref in that_link[ATTRIBUTES].get('6', []):
                            matched = LINKAGE_PAT.match(that_ref)
                            that_taglink, that_occ, that_scriptid, that_rtl = matched.groups() if matched else (None, None, None, None)
                            #if not that_tag and that_occ:
                            #    control_code = list(marc_lookup(input_model, '001')) or ['NO 001 CONTROL CODE']
                            #    dumb_title = list(marc_lookup(input_model, '245$a')) or ['NO 245$a TITLE']
                            #    logger.warning('Skipping invalid $6: "{}" for {}: "{}"'.format(to_ref, control_code[0], dumb_title[0]))
                            #    continue
                            if ([that_taglink, that_occ] == [this_tag, this_occ]) or (xref_link_tag_workaround and that_occ == this_occ):
                                if this_tag == '880':
                                    #This is an 880, which we'll handle by integrating back into the input model
                                    #using the correct tag, flagged to show the relationship
                                    remove_links.add(lid)
                                if that_taglink == '880':
                                    #Rule for 880s: duplicate but link more robustly
                                    copied_attribs = attribs.copy()
                                    for k, v in that_link[ATTRIBUTES].items():
                                        if k[:3] not in ('tag', 'ind'):
                                            copied_attribs.setdefault(k, []).extend(v)
                                    add_links.append((origin, MARCXML_NS + '/data/' + this_tag, val, copied_attribs))

            input_model.remove(remove_links)
            input_model.add_many(add_links)

            # hook for plugins interested in the xref-resolved input model
            for plugin in plugins:
                if BF_INPUT_XREF_TASK in plugin:
                    yield from plugin[BF_INPUT_XREF_TASK](loop, input_model, params)

            #Do one pass to establish work hash
            #XXX Should crossrefs precede this?
            bootstrap_dummy_id = next(params['input_model'].match())[ORIGIN]
            logger.debug('Entering bootstrap phase. Dummy ID: {}'.format(bootstrap_dummy_id))
            params['default-origin'] = bootstrap_dummy_id
            params['instanceids'] = [bootstrap_dummy_id + '-instance']
            params['output_model'] = model_factory()
            params['field008'] = leader = None
            params['fields006'] = fields006 = []
            params['fields007'] = fields007 = []
            params['to_postprocess'] = []
            params['origins'] = {WORK_TYPE: bootstrap_dummy_id, INSTANCE_TYPE: params['instanceids'][0]}

            #First apply special patterns for determining the main target resources
            curr_transforms = transforms.compiled[BOOTSTRAP_PHASE]

            ok = process_marcpatterns(params, curr_transforms, input_model, BOOTSTRAP_PHASE)
            if not ok:
                continue  #Abort current record if signalled

            bootstrap_output = params['output_model']

            temp_main_target = main_type = None
            for o, r, t, a in bootstrap_output.match(None, PYBF_BOOTSTRAP_TARGET_REL):
                #FIXME: We need a better designed way of determining fallback to bib
                if t is not None:
                    temp_main_target, main_type = o, t

            #Switch to the main output model for processing
            params['output_model'] = model

            if temp_main_target is None:
                #If no target was set explicitly fall back to the transforms registered for the biblio phase
                #params['logger'].debug('WORK HASH ORIGIN {}\n'.format(bootstrap_dummy_id))
                #params['logger'].debug('WORK HASH MODEL {}\n'.format(repr(bootstrap_output)))
                workid_data = gather_workid_data(bootstrap_output, bootstrap_dummy_id)
                workid = materialize_entity('Work', ctx_params=params, data=workid_data, loop=loop)
                logger.debug('Entering default main phase, Work ID: {0}'.format(workid))

                is_folded = workid in existing_ids
                existing_ids.add(workid)

                control_code = list(marc_lookup(input_model, '001')) or ['NO 001 CONTROL CODE']
                dumb_title = list(marc_lookup(input_model, '245$a')) or ['NO 245$a TITLE']
                logger.debug('Work hash data: {0}'.format(repr(workid_data)))
                logger.debug('Control code: {0}'.format(control_code[0]))
                logger.debug('Uniform title: {0}'.format(dumb_title[0]))
                logger.debug('Work ID: {0}'.format(workid))

                workid = I(iri.absolutize(workid, entbase)) if entbase else I(workid)
                folded = [workid] if is_folded else []

                model.add(workid, VTYPE_REL, I(iri.absolutize('Work', vocabbase)))

                params['default-origin'] = workid
                params['folded'] = folded

                #Figure out instances
                instanceids = instancegen(params, loop, model)
                params['instanceids'] = instanceids or [None]

                main_transforms = transforms.compiled[DEFAULT_MAIN_PHASE]
                params['origins'] = {WORK_TYPE: workid, INSTANCE_TYPE: params['instanceids'][0]}
                phase_target = DEFAULT_MAIN_PHASE
            else:
                targetid_data = gather_targetid_data(bootstrap_output, temp_main_target,
                                                     transforms.orderings[main_type])
                #params['logger'].debug('Data for resource: {}\n'.format([main_type] + targetid_data))
                targetid = materialize_entity(main_type, ctx_params=params, data=targetid_data, loop=loop)
                logger.debug('Entering specialized phase, Target resource ID: {}, type: {}'.format(targetid, main_type))

                is_folded = targetid in existing_ids
                existing_ids.add(targetid)

                #Determine next transform phase
                main_transforms = transforms.compiled[main_type]
                params['origins'] = {main_type: targetid}
                params['default-origin'] = targetid
                phase_target = main_type
                model.add(I(targetid), VTYPE_REL, I(main_type))

            params['transform_log'] = []  # set()
            params['fields_used'] = []
            params['dropped_codes'] = {}
            #Defensive coding against missing leader or 008
            params['field008'] = leader = None
            params['fields006'] = fields006 = []
            params['fields007'] = fields007 = []
            params['to_postprocess'] = []

            ok = process_marcpatterns(params, main_transforms, input_model, phase_target)
            if not ok:
                continue  #Abort current record if signalled

            skipped_rels = set()
            for op, rels, rid in params['to_postprocess']:
                for rel in rels:
                    skipped_rels.add(rel)
                if op == POSTPROCESS_AS_INSTANCE:
                    if params['instanceids'] == [None]:
                        params['instanceids'] = [rid]
                    else:
                        params['instanceids'].append(rid)
            instance_postprocess(params, skip_relationships=skipped_rels)

            logger.debug('+')

            #XXX At this point there must be at least one record with a Versa type
            for plugin in plugins:
                #Each plug-in is a task
                #task = asyncio.Task(plugin[BF_MARCREC_TASK](loop, relsink, params), loop=loop)
                if BF_MARCREC_TASK in plugin:
                    yield from plugin[BF_MARCREC_TASK](loop, model, params)
                logger.debug("Pending tasks: %s" % asyncio.Task.all_tasks(loop))
                #FIXME: This blocks and thus serializes the plugin operation, rather than the desired coop scheduling approach
                #For some reason setting to async task then immediately deferring to next task via yield from sleep
                #leads to the "yield from wasn't used with future" error (Not much clue at: https://codereview.appspot.com/7396044/)
                #yield from asyncio.Task(asyncio.sleep(0.01), loop=loop)
                #yield from asyncio.async(asyncio.sleep(0.01))
                #yield from asyncio.sleep(0.01) #Basically yield to next task

            #Can we somehow move this to passed-in postprocessing?
            if out and not canonical and not first_record:
                out.write(',\n')
            if out:
                if not canonical:
                    first_record = False
                    last_chunk = None
                    #Using iterencode avoids building a big JSON string in memory, or having to resort to file pointer seeking
                    #Then again builds a big list in memory, so still working on opt here
                    for chunk in json.JSONEncoder().iterencode([link for link in model]):
                        if last_chunk is None:
                            last_chunk = chunk[1:]
                        else:
                            out.write(last_chunk)
                            last_chunk = chunk
                    if last_chunk:
                        out.write(last_chunk[:-1])

            #FIXME: Postprocessing should probably be a task too
            if postprocess:
                postprocess()

            #limiting--running count of records processed versus the max number, if any
            limiting[0] += 1
            if limiting[1] is not None and limiting[0] >= limiting[1]:
                break
    except GeneratorExit:
        logger.debug('Completed processing {0} record{1}.'.format(
            limiting[0], '' if limiting[0] == 1 else 's'))
        if out and not canonical:
            out.write(']')

        #if not plugins: loop.stop()
        for plugin in plugins:
            #Each plug-in is a task
            func = plugin.get(BF_FINAL_TASK)
            if not func:
                continue
            task = asyncio.Task(func(loop), loop=loop)
            _final_tasks.add(task)

            def task_done(task):
                #print('Task done: ', task)
                _final_tasks.remove(task)
                #logger.debug((plugins))
                #if plugins and len(_final_tasks) == 0:
                #    print("_final_tasks is empty, stopping loop.")
                #    loop = asyncio.get_event_loop()
                #    loop.stop()

            #Once all the plug-in tasks are done, all the work is done
            task.add_done_callback(task_done)

    #print('DONE')
    #raise
    return
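The cross-reference handling above hinges on `LINKAGE_PAT`, which is not shown in this excerpt. As a rough illustration, MARC $6 values take the shape `<linking tag>-<occurrence>[/<script id>[/r]]` (e.g. `880-01` or `245-01/$1/r`), and the sketch below uses a hypothetical pattern to pull out the same four groups consumed above; the library's actual regex may be stricter or more permissive.

```python
import re

# Hypothetical stand-in for LINKAGE_PAT (not shown in this excerpt)
DEMO_LINKAGE_PAT = re.compile(r'([0-9]{3})-([0-9]{2})(?:/([^/]+))?(?:/(r))?$')

def parse_linkage(subfield6):
    '''Return (linking tag, occurrence, script id, right-to-left flag) or all Nones.'''
    m = DEMO_LINKAGE_PAT.match(subfield6)
    return m.groups() if m else (None, None, None, None)

print(parse_linkage('880-01'))       # ('880', '01', None, None)
print(parse_linkage('245-01/$1/r'))  # ('245', '01', '$1', 'r')
print(parse_linkage('garbage'))      # (None, None, None, None)
```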
    including absolutizing relative IRIs
    '''
    fullprop = kwargs.get('fullprop')
    rid = kwargs.get('rid')
    base = kwargs.get('base', ONYA)
    model = kwargs.get('model')
    iris = ltext.strip().split()
    for i in iris:
        model.add(rid, fullprop, I(iri.absolutize(i, base)))
    return None


PREP_METHODS = {
    ONYA('text'): lambda x, **kwargs: x,
    ONYA('resource'): lambda x, base=ONYA, **kwargs: I(iri.absolutize(x, base)),
    ONYA('resourceset'): handle_resourceset,
}


def get_block_text(block):
    '''
    Get simplified contents of a block

    a/href embedded in the block comes from Markdown such as `<link_text>`.
    Restore the angle brackets as expected by the li parser

    Also exclude child uls (to be processed separately)
    '''
    return ''.join([(ch if isinstance(ch, text) else ('<' + ch.xml_value +