def main(source):
    '''
    Transform CSV SOURCE file to BF Lite in Versa

    Parses the CSV rows into Versa models, merges them into a single input
    model, runs the pipeline built by csv_bibframe_pipeline() over it, then
    prints the output model to stdout as a JSON dump, in Versa literate form,
    and (via mermaid.write) as a diagram zoomed in on one randomly chosen
    Instance resource.

    source - path of the CSV file to read
    '''
    ppl = csv_bibframe_pipeline()
    input_model = newmodel()
    with open(source) as csvfp:
        for row_model in csv.parse_iter(csvfp, VLITERATE_TEMPLATE):
            # Skip rows that did not yield a model
            if row_model:
                input_model.update(row_model)

    # Debug print of input model
    # literate.write([input_model], out=sys.stdout)
    output_model = ppl.run(input_model=input_model)
    print('Low level JSON dump of output data model: ')
    util.jsondump(output_model, sys.stdout)
    print('\n')  # 2 CRs
    print('Versa literate form of output: ')
    literate.write(output_model, out=sys.stdout)

    print('Diagram from extracted a sample: ')
    # NOTE(review): assumes each value in ppl.fingerprints is an iterable of
    # output resource IDs. If the values are (mains, others) pairs of sets,
    # this collects the sets themselves rather than the IDs -- confirm.
    out_resources = []
    for vs in ppl.fingerprints.values():
        out_resources.extend(vs)
    ITYPE = BF_NS('Instance')
    instances = [
        r for r in out_resources
        if ITYPE in util.resourcetypes(output_model, r)
    ]
    if not instances:
        # random.choice() raises IndexError on an empty sequence, so bail out
        # cleanly when the pipeline produced nothing of the Instance type
        print('No Instance resources found in output; skipping diagram', file=sys.stderr)
        return
    zoomed, _ = util.zoom_in(output_model, random.choice(instances), depth=2)
    mermaid.write(zoomed)
def labelize_helper(self, rules, label_rel=VLABEL_REL, origins=None, handle_misses=None, root_context=DUMMY_CONTEXT):
    '''
    Implements a common label making strategy where output
    resources are put through pattern/action according to type in order
    to determine the output label

    rules - mapping from resource type to an action. The action is called with
        a context whose current link is (resource, VTYPE_REL, type, {}) and
        returns an iterable of candidate labels (or a false value for none)
    label_rel - relationship used for the label links added to the output model
    origins - accepted for interface compatibility; not used here
    handle_misses - optional callable, invoked as handle_misses(out_rid, typ)
        for each candidate label that is empty or only whitespace
    root_context - context copied to build the context passed to each rule

    Returns a dict mapping each labeled output resource ID to the last label
    added for it. Blank candidate labels are reported to handle_misses and
    skipped; they are never added to the model.
    '''
    new_labels = {}
    # Anything with a Versa type is an output resource
    # FIXME weird, redundant logic
    # Origins & types are snapshotted before use because labels get added to
    # the very model being scanned
    for out_rid in list(util.all_origins(self.output_model, of_types='*')):
        for typ in list(util.resourcetypes(self.output_model, out_rid)):
            if typ not in rules:
                continue
            rule = rules[typ]
            link = (out_rid, VTYPE_REL, typ, {})
            # Notice that it reads from the output model and also updates same
            ctx = root_context.copy(current_link=link, input_model=self.output_model, output_model=self.output_model)
            out_labels = rule(ctx)
            if not out_labels:
                continue
            for label in out_labels:
                if not label or not str(label).strip():
                    if handle_misses:
                        handle_misses(out_rid, typ)
                    # A miss must not fall through, otherwise an empty label
                    # (or the string 'None') would be written to the model
                    continue
                # Stripped because labels are for human reading so conventional not to differentiate by whitespace
                # FIXME: fully normalize
                label = str(label).strip()
                new_labels[out_rid] = label
                self.output_model.add(out_rid, label_rel, label)
    return new_labels
def write(model, out=sys.stdout, base=None, schema=None, shorteners=None):
    '''
    Write a Versa model out in Versa literate form

    model - input Versa model from which output is generated
    out - file-like object to which the text is written
    base - optional base IRI, emitted as an @base item in the document header
    schema - optional schema IRI, emitted as an @schema item in the header and
        also used (ahead of VERSA_BASEIRI) when abbreviating IRIs
    shorteners - optional mapping; currently only affects whether the
        document header is emitted

    Each origin in the model gets its own section headed by its ID and, if it
    has any, the first type found. Every other link of that origin becomes a
    list item, with the link's attributes as sub-items. Origins are visited in
    set iteration order, so section order is not guaranteed.
    '''
    shorteners = shorteners or {}

    all_schema = [schema] if schema else []
    all_schema.append(VERSA_BASEIRI)

    if any((base, schema, shorteners)):
        out.write('# @docheader\n\n* @iri:\n')
    # Collect header items so that each lands on its own line when both
    # base and schema are given
    header_items = []
    if base:
        header_items.append(' * @base: {0}'.format(base))
    if schema:
        header_items.append(' * @schema: {0}'.format(schema))
    #for k, v in shorteners:
    #    out.write(' * @base: {0}'.format(base))
    if header_items:
        out.write('\n'.join(header_items))

    out.write('\n\n')

    origin_space = set(all_origins(model))

    for o in origin_space:
        # First type found
        # XXX: Maybe there could be a standard primary-type attribute
        # to flag the property with the type to highlight
        first_type = next(resourcetypes(model, o), None)
        if first_type:
            first_type_str = abbreviate(first_type, all_schema)
            out.write(f'# {o} [{first_type_str}]\n\n')
        else:
            out.write(f'# {o}\n\n')
        for _, r, t, a in model.match(o):
            # The highlighted type is already in the section heading
            if (r, t) == (VTYPE_REL, first_type):
                continue
            rendered_r = abbreviate(r, all_schema)
            if isinstance(rendered_r, I):
                rendered_r = f'<{rendered_r}>'
            out.write(f'* {rendered_r}: {value_format(t)}\n')
            for k, v in a.items():
                rendered_k = abbreviate(k, all_schema)
                if isinstance(rendered_k, I):
                    # Bracket the attribute key itself (not the link's relationship)
                    rendered_k = f'<{rendered_k}>'
                # Attribute sub-items carry the attribute's own value
                out.write(f' * {rendered_k}: {value_format(v)}\n')

        out.write('\n')
def fingerprint_helper(self, rules, root_context=DUMMY_CONTEXT):
    '''
    Common fingerprinting strategy: scan the input model for resources and
    match each one, by type, against the passed-in rules. Every matching rule
    (a single action, or a list/tuple of actions) is run to determine the new
    resource ID(s) & type.

    Results are recorded in self.fingerprints, keyed by input resource ID, as
    a (mains, others) pair of sets: mains holds what the rules returned,
    others holds entities reported through the new-entity hook that are not
    also mains.

    Returns the set of all entity IDs reported through the new-entity hook
    '''
    # All output resources, whether or not from a direct fingerprint of an input resource
    all_new = set()

    for rid in list(util.all_origins(self.input_model)):
        for typ in util.resourcetypes(self.input_model, rid):
            if typ not in rules:
                continue
            actions = rules[typ]
            if not isinstance(actions, (list, tuple)):
                actions = (actions,)

            for rule in actions:
                out_rids = set()

                def new_entity(eid):
                    '''
                    Called on Versa pipeline materialization of new entity

                    Ensures we capture additional entities created by
                    pipeline actions during this fingerprint phase
                    '''
                    out_rids.add(eid)

                # None relationship here acts as a signal to actions
                # such as materialize to not try to attach the newly created
                # resource anywhere in the output, since this is just the
                # fingerprinting stage
                ctx = root_context.copy(
                    current_link=(rid, None, typ, {}),
                    input_model=self.input_model,
                    output_model=self.output_model,
                )
                prior_hooks = ctx.extras.setdefault('@new-entity-hook', [])
                ctx.extras['@new-entity-hook'] = make_list(prior_hooks, new_entity)

                main_ridouts = rule(ctx)
                # A list result is a collection of IDs; anything else is one ID
                if isinstance(main_ridouts, list):
                    main_ridouts = set(main_ridouts)
                else:
                    main_ridouts = {main_ridouts}

                mains, others = self.fingerprints.setdefault(rid, (set(), set()))
                mains.update(main_ridouts)
                others.update(out_rids)
                # Anything that is a main is not also tracked as an other
                others.difference_update(mains)
                all_new.update(out_rids)
    return all_new
def _materialize(ctx):
    '''
    Inserts at least two main links in the context's output_model, one or more for
    the relationship from the origin to the materialized resource, one for the
    type of the materialized resource, and links according to the links parameter

    :param ctx: Runtime Versa context used in processing (e.g. includes the prototype link)
    :return: List of the IDs of the resources materialized (or copied), one
        for each target processed

    This function is intricate in its use and shifting of Versa context, but the
    intricacies are all designed to make the marcpatterns mini language more natural.

    NOTE(review): typ, rel, origin, fprint, links, split, attach,
    preserve_fprint, vars, debug and idgen are not defined in this function;
    presumably they come from an enclosing scope -- confirm against the caller.
    '''
    # FIXME: Part of the datachef sorting out
    if not ctx.idgen:
        ctx.idgen = idgen
    if debug is None:
        def log_debug(msg):
            return
    elif not hasattr(debug, 'write'):
        raise TypeError('debug argument to materialize must be file-like object or None')
    else:
        def log_debug(msg):
            print(msg, file=debug)

    # Set up variables to be made available in any derived contexts
    vars_items = list((vars or {}).items())
    if vars_items:
        # First make sure we're not tainting the passed-in context
        ctx = ctx.copy(variables=ctx.variables.copy())
        for k, v in vars_items:
            if None in (k, v):
                continue
            #v = v if isinstance(v, list) else [v]
            v = v(ctx) if is_pipeline_action(v) else v
            if v:
                # Only the first item of a list result is kept
                v = v[0] if isinstance(v, list) else v
                ctx.variables[k] = v

    (o, r, t, a) = ctx.current_link
    if isinstance(typ, COPY):
        # Copy mode: reuse the input resource's ID, first type and links
        object_copy = typ
        object_copy.id = o
        _typ = next(util.resourcetypes(ctx.input_model, o), None)
        object_copy.links = []
        for stmt in ctx.input_model.match(o):
            if object_copy.rels is None or stmt[RELATIONSHIP] in typ.rels:
                # FIXME: Attributes?
                object_copy.links.append((stmt[RELATIONSHIP], stmt[TARGET]))
    else:
        _typ = typ(ctx) if is_pipeline_action(typ) else typ
        object_copy = None
    _fprint = fprint(ctx) if is_pipeline_action(fprint) else fprint
    # FIXME: On redesign implement split using function composition instead
    targets = [ sub_t.strip() for sub_t in t.split(split) if sub_t.strip() ] if split else [t]

    # If the rel in the incoming context is null and there is no rel passed in, nothing to attach
    # Especially useful signal in a pipeline's fingerprinting stage
    attach_ = False if rel is None and r is None else attach

    if '@added-links' not in ctx.extras:
        ctx.extras['@added-links'] = set()

    # Make sure we end up with a list or None
    rels = rel if isinstance(rel, list) else ([rel] if rel else [r])
    log_debug(f'materialize action. Type: {_typ}. Anchoring rels: {rels} Initial context current link: {ctx.current_link}')
    log_debug(f'Variables (including from vars= arg): {ctx.variables}')
    objids = []

    # Botanical analogy: stem context is from the caller (e.g. connection point of newly materialized resource)
    # vein contexts derive from the stem
    for target in targets:
        ctx_stem = ctx.copy(current_link=(ctx.current_link[ORIGIN], ctx.current_link[RELATIONSHIP], target, ctx.current_link[ATTRIBUTES]))
        if origin:
            # Have been given enough info to derive the origin from context. Ignore origin in current link
            o = origin(ctx_stem)
        if not o:  #Defensive coding
            continue

        computed_fprint = [] if _fprint else None
        rtypes = set([_typ])
        if _fprint:
            # strip None values from computed unique list, including pairs where v is None
            for k, v in _fprint:
                if None in (k, v):
                    continue
                for subitem in (v if isinstance(v, list) else [v]):
                    subval = subitem(ctx_stem) if is_pipeline_action(subitem) else subitem
                    if subval:
                        subval = subval if isinstance(subval, list) else [subval]
                        if k == VTYPE_REL:
                            rtypes.update(set(subval))
                        computed_fprint.extend([(k, s) for s in subval])
        log_debug(f'Provided fingerprinting info: {computed_fprint}')

        if object_copy:
            objid = object_copy.id
        else:
            objid = materialize_entity(ctx_stem, _typ, fprint=computed_fprint)
        objids.append(objid)
        log_debug(f'Newly materialized object: {objid}')
        # rels = [ ('_' + curr_rel if curr_rel.isdigit() else curr_rel) for curr_rel in rels if curr_rel ]
        computed_rels = []
        for curr_relobj in rels:
            # e.g. scenario if passed in rel=ifexists(...)
            curr_rels = curr_relobj(ctx_stem) if is_pipeline_action(curr_relobj) else curr_relobj
            curr_rels = curr_rels if isinstance(curr_rels, list) else [curr_rels]
            for curr_rel in curr_rels:
                if not curr_rel:
                    continue
                # FIXME: Fix properly, by slugifying & making sure slugify handles all numeric case (prepend '_')
                curr_rel = '_' + curr_rel if curr_rel.isdigit() else curr_rel
                if attach_:
                    _smart_add(ctx_stem.output_model, I(o), I(iri.absolutize(curr_rel, ctx_stem.base)), I(objid), (), ctx.extras['@added-links'])
                computed_rels.append(curr_rel)
        # print((objid, ctx_.existing_ids))
        # XXX: Means links are only processed on new objects! This needs some thought
        if objid not in ctx_stem.existing_ids:
            if _typ:
                _smart_add(ctx_stem.output_model, I(objid), VTYPE_REL, I(iri.absolutize(_typ, ctx_stem.base)), (), ctx.extras['@added-links'])
            if preserve_fprint:
                # Consolidate types
                # computed_fprint is None when no fingerprint info was
                # supplied; treat that as empty rather than failing to iterate
                computed_fprint = [ (k, v) for (k, v) in (computed_fprint or []) if k != VTYPE_REL ]
                # computed_fprint +=
                attrs = tuple(computed_fprint + [(VTYPE_REL, r) for r in rtypes])
                _smart_add(ctx_stem.output_model, I(objid), VFPRINT_REL, _typ, attrs, ctx.extras['@added-links'])

            # XXX: Use Nones to mark blanks, or should Versa define some sort of null resource?
            all_links = object_copy.links + links if object_copy else links
            for l in all_links:
                # NOTE(review): a link spec of any other length leaves
                # lo/lr/lt unset (or stale from the previous item) -- confirm
                # that only 2- and 3-item specs can reach here
                if len(l) == 2:
                    lo = I(objid)
                    lr, lt = l
                elif len(l) == 3:
                    lo, lr, lt = l
                # This context is in effect

                # First of all, hold on to the inbound origin so that it can be accessed in embedded actions
                vein_vars = ctx_stem.variables.copy()
                vein_vars['@stem'] = ctx_stem.current_link[ORIGIN]

                # Newly materialized resource is the origin. The overall context target for embedded actions
                ctx_vein = ctx_stem.copy(current_link=(objid, ctx_stem.current_link[RELATIONSHIP], ctx_stem.current_link[TARGET], ctx_stem.current_link[ATTRIBUTES]), variables=vein_vars)

                # Blank parts of the link spec default to the vein context's link
                lo = lo or ctx_vein.current_link[ORIGIN]
                lr = lr or ctx_vein.current_link[RELATIONSHIP]
                lt = lt or ctx_vein.current_link[TARGET]

                lo = lo(ctx_vein) if is_pipeline_action(lo) else lo
                lo = lo if isinstance(lo, list) else [lo]
                lr = lr(ctx_vein) if is_pipeline_action(lr) else lr

                # Update lr
                # XXX This needs cleaning up
                ctx_vein = ctx_stem.copy(current_link=(ctx_vein.current_link[ORIGIN], lr, ctx_vein.current_link[TARGET], ctx_stem.current_link[ATTRIBUTES]), variables=vein_vars)

                # If k is a list of contexts use it to dynamically execute functions
                if isinstance(lr, list):
                    if lr and isinstance(lr[0], context):
                        for newctx in lr:
                            #The function in question will generate any needed links in the output model
                            lt(newctx)
                        continue

                # import traceback; traceback.print_stack() #For looking up the call stack e.g. to debug nested materialize
                # Check that the links key is not None, which is a signal not to
                # generate the item. For example if the key is an ifexists and the
                # test expression result is False, it will come back as None,
                # and we don't want to run the v function
                if lr:
                    lt = lt(ctx_vein) if is_pipeline_action(lt) else lt

                    # If k or v come from pipeline functions as None it signals to skip generating anything else for this link item
                    if lt is not None:
                        # FIXME: Fix properly, by slugifying & making sure slugify handles all-numeric case
                        if lr.isdigit():
                            lr = '_' + lr
                        _lr = I(iri.absolutize(lr, ctx_vein.base))
                        log_debug(f'Generated link: {lo, _lr, lt}')
                        if isinstance(lt, list):
                            for valitems in lt:
                                if valitems:
                                    for loi in lo:
                                        _smart_add(ctx_vein.output_model, loi, _lr, valitems, (), ctx.extras['@added-links'])
                        else:
                            for loi in lo:
                                _smart_add(ctx_vein.output_model, loi, _lr, lt, (), ctx.extras['@added-links'])
            # Record the ID as seen, then notify any registered new-entity hooks
            ctx_stem.existing_ids.add(objid)
            for func in ctx.extras.get('@new-entity-hook', []):
                func(objid)
    log_debug(f'End materialize')
    return objids
def transform_by_rel_helper(self, rules, origins=None, handle_misses=None, root_context=DUMMY_CONTEXT):
    '''
    Implements a common transform strategy where each fingerprinted
    input model resource is examined for outbound links, and each one matched
    by relationship to the passed-in rules. If matched the corresponding action
    is run to update the output model

    rules - mapping from rule key to action. A key is either a simple, scalar
        relationship or a (rel, T1, T2...) tuple
    origins - optional mapping from input resource ID to a (mains, others)
        pair of output resource ID collections; defaults to self.fingerprints
    handle_misses - optional callable invoked with the (rid, r, t, attribs)
        input link whenever no rule matches that link
    root_context - context from which each rule's context is derived

    Returns the number of rule applications performed. For each output
    resource, matching rules are applied in the iteration order of the rules
    mapping, each (rule, output resource) pair at most once per input link.
    '''
    origins = origins or self.fingerprints
    # Really just for lightweight sanity checks
    applied_rules_count = 0
    # Output resource types are looked up once per resource and then reused
    types_cache = {}
    for rid in origins:
        (mains, others) = origins[rid]
        # import pprint; pprint.pprint([mains, others])
        # Go over all the links for the input resource
        for o, r, t, attribs in self.input_model.match(rid):
            # Match input resource against the rules mapping keys.
            # The mains can match on just rel if it's a simple, scalar key
            # Either mains or others can match on a (rel, T1, T2...) tuple key
            # whether a main or other if TN is one of the output resource's types
            # Collect node/match pairs. A dict serves as an insertion-ordered
            # set, so rule order does not depend on hashing of the actions
            matches = {}
            for out_rid in itertools.chain(mains, others):
                for (rspec, rule) in rules.items():
                    if (out_rid in mains) and rspec == r:
                        matches[(rule, out_rid)] = None
                    # Only genuine tuple keys carry type constraints. Indexing
                    # a scalar string key would compare its first character
                    elif isinstance(rspec, tuple) and rspec and rspec[0] == r:
                        if out_rid in types_cache:
                            out_rid_types = types_cache[out_rid]
                        else:
                            out_rid_types = frozenset(util.resourcetypes(self.output_model, out_rid))
                            types_cache[out_rid] = out_rid_types
                        _, *typs = rspec
                        if any(typ in out_rid_types for typ in typs):
                            matches[(rule, out_rid)] = None

            # If nothing matched, trigger caller's miss handler, if any
            if not matches:
                if handle_misses:
                    handle_misses((rid, r, t, attribs))
                continue

            for (rule, out_rid) in matches:
                # At the heart of the Versa pipeline context is a prototype link,
                # which looks like the link that triggered the current rule, but with the
                # origin changed to the output resource
                link = (out_rid, r, t, attribs)
                # Build the rest of the context
                variables = root_context.variables.copy()
                variables.update({'input-resource': rid})
                extras = root_context.extras.copy()
                # Each context gets its own fresh input-ID -> main-IDs mapping
                extras.update({'@resource': {
                    k: list(m) for (k, (m, _others)) in self.fingerprints.items()
                }})
                ctx = root_context.copy(current_link=link, input_model=self.input_model, output_model=self.output_model, variables=variables, extras=extras)
                # Run the rule, expecting the side effect of data added to the output model
                rule(ctx)
                applied_rules_count += 1
    return applied_rules_count