Example #1
def main(source):
    'Transform CSV SOURCE file to BF Lite in Versa'
    ppl = csv_bibframe_pipeline()
    input_model = newmodel()
    with open(source) as csvfp:
        for row_model in csv.parse_iter(csvfp, VLITERATE_TEMPLATE):
            if row_model: input_model.update(row_model)

    # Debug print of input model
    # literate.write([input_model], out=sys.stdout)
    output_model = ppl.run(input_model=input_model)
    print('Low level JSON dump of output data model: ')
    util.jsondump(output_model, sys.stdout)
    print('\n')  # 2 newlines, i.e. a blank line
    print('Versa literate form of output: ')
    literate.write(output_model, out=sys.stdout)

    print('Diagram from an extracted sample: ')
    out_resources = []
    for vs in ppl.fingerprints.values():
        out_resources.extend(vs)
    ITYPE = BF_NS('Instance')
    instances = [
        r for r in out_resources
        if ITYPE in util.resourcetypes(output_model, r)
    ]
    zoomed, _ = util.zoom_in(output_model, random.choice(instances), depth=2)
    mermaid.write(zoomed)
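
A minimal sketch of driving this entry point from the command line. The wrapper below is an assumption rather than part of the original module, and transform.py / SOURCE_CSV are hypothetical names standing in for the actual script and input file.

if __name__ == '__main__':
    import sys
    # Expect exactly one argument: the path to the source CSV file
    if len(sys.argv) != 2:
        print('Usage: python transform.py SOURCE_CSV', file=sys.stderr)
        sys.exit(1)
    main(sys.argv[1])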
Example #2
    def labelize_helper(self, rules, label_rel=VLABEL_REL, origins=None,
                        handle_misses=None, root_context=DUMMY_CONTEXT):
        '''
        Implements a common label-making strategy where output
        resources are put through pattern/action rules according to type
        in order to determine the output label
        '''
        new_labels = {}
        # Anything with a Versa type is an output resource
        # FIXME weird, redundant logic
        for out_rid in util.all_origins(self.output_model, of_types='*'):
            for typ in util.resourcetypes(self.output_model, out_rid):
                if typ in rules:
                    rule = rules[typ]
                    link = (out_rid, VTYPE_REL, typ, {})
                    # Notice that it reads from the output model and also updates that same model
                    ctx = root_context.copy(current_link=link, input_model=self.output_model,
                                            output_model=self.output_model)
                    out_labels = rule(ctx)
                    if not out_labels: continue
                    for label in out_labels:
                        if not label or not str(label).strip():
                            if handle_misses:
                                handle_misses(out_rid, typ)
                            # Skip blank labels rather than adding empty text to the model
                            continue
                        # Stripped because labels are for human reading, so it's conventional not to differentiate by whitespace
                        # FIXME: fully normalize
                        label = str(label).strip()
                        new_labels[out_rid] = label
                        self.output_model.add(out_rid, label_rel, label)
        return new_labels
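
A hedged sketch of what a rules mapping for labelize_helper could look like. The type and relationship IRIs below are hypothetical, and the rule is a plain callable that takes the pipeline context and returns candidate label strings, which is all the helper requires.

BOOK_TYPE = 'http://example.org/vocab/Book'   # hypothetical output resource type
NAME_REL = 'http://example.org/vocab/name'    # hypothetical label-bearing relationship

def label_from_name(ctx):
    # The origin of the prototype link is the output resource being labelized
    out_rid = ctx.current_link[0]
    # Candidate labels are the targets of the name relationship on that resource
    return [ t for _, r, t, _ in ctx.output_model.match(out_rid) if r == NAME_REL ]

LABELIZE_RULES = {BOOK_TYPE: label_from_name}
# new_labels = pipeline.labelize_helper(LABELIZE_RULES)  # pipeline: hypothetical pipeline instance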
Example #3
def write(model, out=sys.stdout, base=None, schema=None, shorteners=None):
    '''
    model - input Versa model from which output is generated
    '''
    shorteners = shorteners or {}

    all_schema = [schema] if schema else []
    all_schema.append(VERSA_BASEIRI)

    if any((base, schema, shorteners)):
        out.write('# @docheader\n\n* @iri:\n')
    if base:
        out.write('    * @base: {0}\n'.format(base))
    if schema:
        out.write('    * @schema: {0}\n'.format(schema))
    #for k, v in shorteners:
    #    out.write('    * @base: {0}'.format(base))

    out.write('\n\n')

    origin_space = set(all_origins(model))

    for o in origin_space:
        # First type found
        # XXX: Maybe there could be a standard primary-type attribute
        # to flag the property with the type to highlight
        first_type = next(resourcetypes(model, o), None)
        if first_type:
            first_type_str = abbreviate(first_type, all_schema)
            out.write(f'# {o} [{first_type_str}]\n\n')
        else:
            out.write(f'# {o}\n\n')
        for o_, r, t, a in model.match(o):
            if (r, t) == (VTYPE_REL, first_type): continue
            rendered_r = abbreviate(r, all_schema)
            if isinstance(rendered_r, I):
                rendered_r = f'<{rendered_r}>'
            rendered_t = value_format(t)
            out.write(f'* {rendered_r}: {rendered_t}\n')
            for k, v in a.items():
                rendered_k = abbreviate(k, all_schema)
                if isinstance(rendered_k, I):
                    rendered_k = f'<{rendered_k}>'
                out.write(f'    * {rendered_k}: {value_format(v)}\n')

        out.write('\n')
    return
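
A minimal usage sketch for the serializer above, assuming newmodel() from Example #1 is importable from the same library; the resource and relationship IRIs are purely hypothetical.

import sys

model = newmodel()                            # empty Versa model, as in Example #1
BOOK = 'http://example.org/book/1'            # hypothetical resource IRI
model.add(BOOK, 'http://example.org/vocab/title', 'Example title')  # hypothetical statement
write(model, out=sys.stdout, base='http://example.org/')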
Example #4
    def fingerprint_helper(self, rules, root_context=DUMMY_CONTEXT):
        '''
        Implements a common fingerprinting strategy where the input model
        is scanned for resources, and each one is matched by type to the passed-in rules.
        If any type is matched, the corresponding action is run to determine
        the new resource ID & type
        '''
        # All output resources, whether or not from a direct fingerprint of an input resource
        new_rids = set()

        resources = list(util.all_origins(self.input_model))
        for rid in resources:
            for typ in util.resourcetypes(self.input_model, rid):
                if typ in rules:
                    rule_tup = rules[typ]
                    if not isinstance(rule_tup, (list, tuple)):
                        rule_tup = (rule_tup,)
                    for rule in rule_tup:
                        out_rids = set()
                        def new_entity(eid):
                            '''
                            Called on Versa pipeline materialization of new entity
                            Ensures we capture additional entities created by
                            pipeline actions during this fingerprint phase
                            '''
                            out_rids.add(eid)

                        # None relationship here acts as a signal to actions
                        # such as materialize to not try to attach the newly created
                        # resource anywhere in the output, since this is just the
                        # fingerprinting stage
                        link = (rid, None, typ, {})
                        ctx = root_context.copy(current_link=link, input_model=self.input_model,
                            output_model=self.output_model)
                        ne_hook = ctx.extras.setdefault('@new-entity-hook', [])
                        ctx.extras['@new-entity-hook'] = make_list(ne_hook, new_entity)
                        main_ridouts = rule(ctx)
                        main_ridouts = set(main_ridouts) if isinstance(main_ridouts, list) else {main_ridouts}
                        mains, others = self.fingerprints.setdefault(rid, (set(), set()))
                        mains.update(main_ridouts)
                        others.update(out_rids)
                        others -= mains
                        new_rids.update(out_rids)
        return new_rids
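
A hedged sketch of a fingerprint rules mapping for the helper above. The input type IRI and the output ID scheme are hypothetical, and the rule is a plain callable standing in for a pipeline action such as materialize.

MARC_BOOK_TYPE = 'http://example.org/marc/Book'   # hypothetical input resource type

def fingerprint_book(ctx):
    # During the fingerprint phase the prototype link is (input_rid, None, type, {})
    input_rid = ctx.current_link[0]
    out_rid = 'http://example.org/resource/' + input_rid.rsplit('/', 1)[-1]  # hypothetical ID scheme
    # Report the new entity through the hook the helper registered, as pipeline actions do
    for hook in ctx.extras.get('@new-entity-hook', []):
        hook(out_rid)
    return [out_rid]

FINGERPRINT_RULES = {MARC_BOOK_TYPE: fingerprint_book}
# new_rids = pipeline.fingerprint_helper(FINGERPRINT_RULES)  # pipeline: hypothetical pipeline instance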
Example #5
    def _materialize(ctx):
        '''
        Inserts at least two main links in the context's output_model, one or more for
        the relationship from the origin to the materialized resource, one for the
        type of the materialized resource, and links according to the links parameter

        :param ctx: Runtime Versa context used in processing (e.g. includes the prototype link)
        :return: None

        This function is intricate in its use and shifting of Versa context, but the
        intricacies are all designed to make the marcpatterns mini language more natural.
        '''
        # FIXME: Part of the datachef sorting out
        if not ctx.idgen: ctx.idgen = idgen
        if debug is None:
            def log_debug(msg): return
        elif not hasattr(debug, 'write'):
            raise TypeError('debug argument to materialize must be file-like object or None')
        else:
            def log_debug(msg):
                print(msg, file=debug)

        # Set up variables to be made available in any derived contexts
        vars_items = list((vars or {}).items())
        if vars_items:
            # First make sure we're not tainting the passed-in context
            ctx = ctx.copy(variables=ctx.variables.copy())
            for k, v in vars_items:
                if None in (k, v): continue
                #v = v if isinstance(v, list) else [v]
                v = v(ctx) if is_pipeline_action(v) else v
                if v:
                    v = v[0] if isinstance(v, list) else v
                    ctx.variables[k] = v

        (o, r, t, a) = ctx.current_link
        if isinstance(typ, COPY):
            object_copy = typ
            object_copy.id = o
            _typ = next(util.resourcetypes(ctx.input_model, o), None)
            object_copy.links = []
            for stmt in ctx.input_model.match(o):
                if object_copy.rels is None or stmt[RELATIONSHIP] in object_copy.rels:
                    # FIXME: Attributes?
                    object_copy.links.append((stmt[RELATIONSHIP], stmt[TARGET]))
        else:
            _typ = typ(ctx) if is_pipeline_action(typ) else typ
            object_copy = None
        _fprint = fprint(ctx) if is_pipeline_action(fprint) else fprint
        # FIXME: On redesign implement split using function composition instead
        targets = [ sub_t.strip() for sub_t in t.split(split) if sub_t.strip() ] if split else [t]

        # If the rel in the incoming context is null and there is no rel passed in, nothing to attach
        # Especially useful signal in a pipeline's fingerprinting stage
        attach_ = False if rel is None and r is None else attach

        if '@added-links' not in ctx.extras: ctx.extras['@added-links'] = set()

        # Make sure we end up with a list or None
        rels = rel if isinstance(rel, list) else ([rel] if rel else [r])
        log_debug(f'materialize action. Type: {_typ}. Anchoring rels: {rels} Initial context current link: {ctx.current_link}')
        log_debug(f'Variables (including from vars= arg): {ctx.variables}')
        objids = []

        # Botanical analogy: stem context is from the caller (e.g. connection point of newly materialized resource)
        # vein contexts derive from the stem
        for target in targets:
            ctx_stem = ctx.copy(current_link=(ctx.current_link[ORIGIN], ctx.current_link[RELATIONSHIP], target, ctx.current_link[ATTRIBUTES]))
            if origin:
                # Have been given enough info to derive the origin from context. Ignore origin in current link
                o = origin(ctx_stem)
            if not o: #Defensive coding
                continue

            computed_fprint = [] if _fprint else None
            rtypes = set([_typ])
            if _fprint:
                # strip None values from computed unique list, including pairs where v is None
                for k, v in _fprint:
                    if None in (k, v): continue
                    for subitem in (v if isinstance(v, list) else [v]):
                        subval = subitem(ctx_stem) if is_pipeline_action(subitem) else subitem
                        if subval:
                            subval = subval if isinstance(subval, list) else [subval]
                            if k == VTYPE_REL: rtypes.update(set(subval))
                            computed_fprint.extend([(k, s) for s in subval])
            log_debug(f'Provided fingerprinting info: {computed_fprint}')

            if object_copy:
                objid = object_copy.id
            else:
                objid = materialize_entity(ctx_stem, _typ, fprint=computed_fprint)
            objids.append(objid)
            log_debug(f'Newly materialized object: {objid}')
            # rels = [ ('_' + curr_rel if curr_rel.isdigit() else curr_rel) for curr_rel in rels if curr_rel ]
            computed_rels = []
            for curr_relobj in rels:
                # e.g. scenario if passed in rel=ifexists(...)
                curr_rels = curr_relobj(ctx_stem) if is_pipeline_action(curr_relobj) else curr_relobj
                curr_rels = curr_rels if isinstance(curr_rels, list) else [curr_rels]
                for curr_rel in curr_rels:
                    if not curr_rel: continue
                    # FIXME: Fix properly, by slugifying & making sure slugify handles all numeric case (prepend '_')
                    curr_rel = '_' + curr_rel if curr_rel.isdigit() else curr_rel
                    if attach_:
                        _smart_add(ctx_stem.output_model, I(o), I(iri.absolutize(curr_rel, ctx_stem.base)), I(objid), (), ctx.extras['@added-links'])
                    computed_rels.append(curr_rel)
            # print((objid, ctx_.existing_ids))
            # XXX: Means links are only processed on new objects! This needs some thought
            if objid not in ctx_stem.existing_ids:
                if _typ:
                    _smart_add(ctx_stem.output_model, I(objid), VTYPE_REL, I(iri.absolutize(_typ, ctx_stem.base)), (), ctx.extras['@added-links'])
                if preserve_fprint:
                    # Consolidate types
                    computed_fprint = [ (k, v) for (k, v) in computed_fprint if k != VTYPE_REL ]
                    # computed_fprint += 
                    attrs = tuple(computed_fprint + [(VTYPE_REL, r) for r in rtypes])
                    _smart_add(ctx_stem.output_model, I(objid), VFPRINT_REL, _typ, attrs, ctx.extras['@added-links'])

                # XXX: Use Nones to mark blanks, or should Versa define some sort of null resource?
                all_links = object_copy.links + links if object_copy else links
                for l in all_links:
                    if len(l) == 2:
                        lo = I(objid)
                        lr, lt = l
                    elif len(l) == 3:
                        lo, lr, lt = l
                    # This context is in effect 

                    # First of all, hold on to the inbound origin so that it can be accessed in embedded actions
                    vein_vars = ctx_stem.variables.copy()
                    vein_vars['@stem'] = ctx_stem.current_link[ORIGIN]

                    # Newly materialized resource is the origin. The overall context target for embedded actions
                    ctx_vein = ctx_stem.copy(current_link=(objid, ctx_stem.current_link[RELATIONSHIP], ctx_stem.current_link[TARGET], ctx_stem.current_link[ATTRIBUTES]), variables=vein_vars)

                    lo = lo or ctx_vein.current_link[ORIGIN]
                    lr = lr or ctx_vein.current_link[RELATIONSHIP]
                    lt = lt or ctx_vein.current_link[TARGET]

                    lo = lo(ctx_vein) if is_pipeline_action(lo) else lo
                    lo = lo if isinstance(lo, list) else [lo]
                    lr = lr(ctx_vein) if is_pipeline_action(lr) else lr

                    # Update lr
                    # XXX This needs cleaning up
                    ctx_vein = ctx_stem.copy(current_link=(ctx_vein.current_link[ORIGIN], lr, ctx_vein.current_link[TARGET], ctx_stem.current_link[ATTRIBUTES]), variables=vein_vars)

                    # If lr is a list of contexts, use it to dynamically execute functions
                    if isinstance(lr, list):
                        if lr and isinstance(lr[0], context):
                            for newctx in lr:
                                #The function in question will generate any needed links in the output model
                                lt(newctx)
                            continue

                    # import traceback; traceback.print_stack() #For looking up the call stack e.g. to debug nested materialize
                    # Check that the links key is not None, which is a signal not to
                    # generate the item. For example if the key is an ifexists and the
                    # test expression result is False, it will come back as None,
                    # and we don't want to run the v function
                    if lr:
                        lt = lt(ctx_vein) if is_pipeline_action(lt) else lt

                        # If lr or lt come from pipeline functions as None it signals to skip generating anything else for this link item
                        if lt is not None:
                            # FIXME: Fix properly, by slugifying & making sure slugify handles all-numeric case
                            if lr.isdigit(): lr = '_' + lr
                            _lr = I(iri.absolutize(lr, ctx_vein.base))
                            log_debug(f'Generated link: {lo, _lr, lt}')
                            if isinstance(lt, list):
                                for valitems in lt:
                                    if valitems:
                                        for loi in lo:
                                            _smart_add(ctx_vein.output_model, loi, _lr, valitems, (), ctx.extras['@added-links'])
                            else:
                                for loi in lo:
                                    _smart_add(ctx_vein.output_model, loi, _lr, lt, (), ctx.extras['@added-links'])
                ctx_stem.existing_ids.add(objid)
                for func in ctx.extras.get('@new-entity-hook', []):
                    func(objid)
        log_debug('End materialize')
            
        return objids
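
The function above is the inner closure of a factory-style pipeline action; the free variables it references (typ, rel, origin, fprint, links, split, attach, preserve_fprint, vars, debug) imply an enclosing factory roughly shaped like the sketch below. The parameter names follow what the closure actually uses, but the defaults and the overall signature are assumptions, not the library's documented API.

def materialize(typ, rel=None, origin=None, fprint=None, links=None, split=None,
                attach=True, preserve_fprint=False, vars=None, debug=None):
    '''Return a pipeline action that creates (materializes) a resource in the output model'''
    links = links or []
    def _materialize(ctx):
        ...  # body as shown above
    return _materialize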
Example #6
    def transform_by_rel_helper(self, rules, origins=None, handle_misses=None,
                                    root_context=DUMMY_CONTEXT):
        '''
        Implements a common transform strategy where each fingerprinted
        input model resource is examined for outbound links, and each one matched
        by relationship to the passed-in rules. If matched, the corresponding action
        is run to update the output model
        '''
        origins = origins or self.fingerprints
        # Really just for lightweight sanity checks
        applied_rules_count = 0
        types_cache = {}
        for rid in origins:
            (mains, others) = origins[rid]
            # import pprint; pprint.pprint([mains, others])

            # Go over all the links for the input resource
            for o, r, t, attribs in self.input_model.match(rid):
                # Match the input link against the rules mapping keys.
                # Resources in mains can match on just the rel, if the key is a simple, scalar key
                # Resources in either mains or others can match on a (rel, T1, T2...) tuple key,
                # as long as one of the TN is among the output resource's types

                # Collect node/match pairs
                match_sets = set()
                for out_rid in itertools.chain(mains, others):
                    for (rspec, rule) in rules.items():
                        if (out_rid in mains) and rspec == r:
                            match_sets.add((rule, out_rid))
                        elif isinstance(rspec, tuple) and rspec[0] == r:
                            if out_rid in types_cache:
                                out_rid_types = types_cache[out_rid]
                            else:
                                out_rid_types = frozenset(util.resourcetypes(self.output_model, out_rid))
                                types_cache[out_rid] = out_rid_types
                            _, *typs = rspec
                            for typ in typs:
                                if typ in out_rid_types:
                                    match_sets.add((rule, out_rid))
                                    break

                # If nothing matched, trigger caller's miss handler, if any
                if not match_sets:
                    if handle_misses:
                        handle_misses((rid, r, t, attribs))
                    continue

                for (rule, out_rid) in match_sets:
                    # At the heart of the Versa pipeline context is a prototype link,
                    # which looks like the link that triggered the current rule, but with the
                    # origin changed to the output resource
                    link = (out_rid, r, t, attribs)
                    # Build the rest of the context
                    variables = root_context.variables.copy()
                    variables.update({'input-resource': rid})
                    extras = root_context.extras.copy()
                    extras.update({'@resource': { k: list(m) for (k, (m, o)) in self.fingerprints.items() }})
                    ctx = root_context.copy(current_link=link, input_model=self.input_model,
                                                output_model=self.output_model, variables=variables,
                                                extras=extras)
                    # Run the rule, expecting the side effect of data added to the output model
                    rule(ctx)
                    applied_rules_count += 1
        return applied_rules_count
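
A hedged sketch of a transform rules mapping for the helper above. The IRIs below are hypothetical; each rule is a plain callable that reads the prototype link from the context and writes into the output model, which matches how the helper invokes it.

TITLE_REL = 'http://example.org/marc/title'   # hypothetical input relationship
NAME_REL = 'http://example.org/marc/name'     # hypothetical input relationship
WORK_TYPE = 'http://example.org/vocab/Work'   # hypothetical output resource type

def copy_link(ctx):
    # The prototype link's origin has already been rewritten to the output resource
    out_rid, r, t, attribs = ctx.current_link
    ctx.output_model.add(out_rid, r, t)   # copy the triggering relationship onto the output resource

TRANSFORM_RULES = {
    TITLE_REL: copy_link,                 # scalar key: matches main output resources by rel alone
    (NAME_REL, WORK_TYPE): copy_link,     # tuple key: matches by rel plus the output resource's type
}
# applied = pipeline.transform_by_rel_helper(TRANSFORM_RULES)  # pipeline: hypothetical pipeline instance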