Пример #1
0
def materialize_entity(ctx, etype, unique=None):
    '''
    Routine for creating a BIBFRAME resource. Takes the entity (resource) type and a data mapping
    according to the resource type. Implements the Libhub Resource Hash Convention
    As a convenience, if a vocabulary base is provided in the context, concatenate it to etype and the data keys

    ctx - context information governing creation of the new entity; this routine reads ctx.base and ctx.idgen
    etype - type IRI for the new entity
    unique - scalar or ordered dict of data to use in generating its unique ID, or None in which case one is just randomly generated

    Returns the ID produced by ctx.idgen
    '''
    if ctx.base:
        etype = ctx.base + etype

    unique_full = unique
    if isinstance(unique, OrderedDict):
        #Rebuild the mapping with every relative key resolved against the vocabulary base
        unique_full = OrderedDict()
        for (k, v) in unique.items():
            if not iri.is_absolute(k):
                k = iri.absolutize(k, ctx.base)
            unique_full[k] = v

    if unique_full:
        #The hash plaintext is the JSON form of the type plus the distinguishing data
        plaintext = json.dumps([etype, unique_full], cls=OrderedJsonEncoder)
        eid = ctx.idgen.send(plaintext)
    else:
        #We only have a type; no other distinguishing data. Generate a random hash
        eid = next(ctx.idgen)
    return eid
Пример #2
0
    def _link(ctx):
        '''
        Versa action function: add a link to the output model for each computed
        relationship/value pair

        :param ctx: Versa context used in processing; this function reads
            current_link, base, output_model and the 'logger' entry of extras
        :return: None; links are added to ctx.output_model as a side effect
        '''
        (origin, _, t, a) = ctx.current_link
        if derive_origin:
            #Have enough info to derive the origin from context. Ignore origin in current link
            origin = derive_origin(ctx)

        #If need be call the Versa action function to determine the relationship to the materialized resource
        rels = rel(ctx) if callable(rel) else rel
        if not isinstance(rels, list): rels = [rels]

        #With no explicit value, fall back to the target of the current link
        values = value(ctx) if callable(value) else (t if value is None else value)
        if not isinstance(values, list): values = [values]

        def recurse_values(vs):
            #Any callable among the values is invoked with the context and its results are flattened in
            for v in vs:
                if callable(v):
                    yield from recurse_values(v(ctx))
                else:
                    yield v

        for _value in recurse_values(values):
            #If asked to convert value to resource, do so unless ignore_refs is set and the value is a relative reference
            if res and not (ignore_refs and not iri.is_absolute(_value)):
                try:
                    _value = I(_value)
                except ValueError:
                    #Report the resolved relationships: the raw rel argument may be an action
                    #function or a list, neither of which can be absolutized for the message
                    rel_iris = [ I(iri.absolutize(r, ctx.base)) for r in rels ]
                    reported_rel = rel_iris[0] if len(rel_iris) == 1 else rel_iris
                    ctx.extras['logger'].warn('Requirement to convert link target to IRI failed for invalid input, causing the corresponding output link to be omitted entirely: {0}'.format(repr((I(origin), reported_rel, _value))))
                    #XXX How do we really want to handle this error?
                    #return []
                    continue

            for r in rels:
                ctx.output_model.add(I(origin), I(iri.absolutize(r, ctx.base)), _value, {})

        return
Пример #3
0
    def _toiri(ctx):
        '''
        Versa action function: coerce the argument value(s) into IRI references

        :param ctx: Versa context used in processing; the 'logger' entry of its extras receives warnings
        :return: List with one entry per input value; an entry is left as given when it
            is skipped as a relative reference or cannot be converted
        '''
        items = arg(ctx) if is_pipeline_action(arg) else arg
        if not isinstance(items, list):
            items = [items]

        converted = []
        for item in items:
            if ignore_refs and not iri.is_absolute(item):
                # relative references are passed through untouched when ignore_refs is set
                converted.append(item)
                continue

            # coerce into an IRIref, but fallout as untyped text otherwise
            try:
                item = I(item)
            except ValueError:
                # attempt to recover by percent encoding
                try:
                    item = I(iri.percent_encode(item))
                except ValueError as err:
                    ctx.extras['logger'].warn(
                        'Unable to convert "{}" to IRI reference:\n{}'.format(item, err))

            if base is not None and isinstance(item, I):
                item = I(iri.absolutize(item, base))

            converted.append(item)

        return converted
Пример #4
0
def resource_id(etype, unique=None, idgen=default_idgen(None), vocabbase=None):
    '''
    Very low level routine for generating an ID value using the hash algorithm
    outlined by the Libhub initiative for BIBFRAME Lite (Libhub Resource Hash Convention).
    Takes the entity (resource) type and an ordered data mapping.

    etype - type IRI for the new entity
    unique - list of key/value tuples of data to use in generating its unique ID, or None in which case one is just randomly generated
    idgen - ID generator; the default is created once, when the function is defined, and is
            therefore shared by every call that does not pass its own
    vocabbase - for convenience, if provided, used to resolve relative etype & data keys

    Returns the ID produced by idgen
    '''
    #XXX: Use proper URI normalization? Have a philosophical discussion with Mark about this :)
    if vocabbase: etype = vocabbase + etype

    unique_computed = []
    #unique defaults to None, which is treated the same as having no distinguishing data
    for k, v in unique or []:
        if vocabbase and not iri.is_absolute(k):
            #XXX OK absolutize used here. Go figure
            k = iri.absolutize(k, vocabbase)
        unique_computed.append((k, v))

    if unique_computed:
        # XXX Is OrderedJsonEncoder needed now that we're using list of tuples rather than ordered dict?
        plaintext = json.dumps([etype, unique_computed], cls=OrderedJsonEncoder)
        eid = idgen.send(plaintext)
    else:
        #We only have a type; no other distinguishing data. Generate a random hash
        eid = next(idgen)
    return eid
Пример #5
0
    def _link(ctx):
        '''
        Versa action function: add a link to the output model for each computed
        relationship, all pointing at the single computed value

        :param ctx: Versa context used in processing; this function reads
            current_link, base, output_model and the 'logger' entry of extras
        :return: None once the links are added; an empty list when the value could not
            be converted to an IRI and the link was therefore omitted
        '''
        (origin, _, t, a) = ctx.current_link
        if derive_origin:
            #Have enough info to derive the origin from context. Ignore origin in current link
            origin = derive_origin(ctx)

        #If need be call the Versa action function to determine the relationship to the materialized resource
        rels = rel(ctx) if callable(rel) else rel
        if not isinstance(rels, list): rels = [rels]

        #With no explicit value, fall back to the target of the current link
        _value = value(ctx) if callable(value) else (
            t if value is None else value)
        #Just work with the first provided statement, for now
        #Convert the value to a resource if asked, unless ignore_refs is set and the value is a relative reference
        if res and not (ignore_refs and not iri.is_absolute(_value)):
            try:
                _value = I(_value)
            except ValueError:
                #Report the resolved relationships: the raw rel argument may be an action
                #function or a list, neither of which can be absolutized for the message
                rel_iris = [I(iri.absolutize(r, ctx.base)) for r in rels]
                reported_rel = rel_iris[0] if len(rel_iris) == 1 else rel_iris
                ctx.extras['logger'].warn(
                    'Requirement to convert link target to IRI failed for invalid input, causing the corresponding output link to be omitted entirely: {0}'
                    .format(repr((I(origin), reported_rel, _value))))
                #XXX How do we really want to handle this error?
                return []
        for r in rels:
            ctx.output_model.add(I(origin), I(iri.absolutize(r, ctx.base)),
                                 _value, {})
        return
Пример #6
0
def materialize_entity(etype, ctx_params=None, model_to_update=None, data=None, addtype=True, loop=None, logger=logging):
    '''
    Routine for creating a BIBFRAME resource. Takes the entity (resource) type and a data mapping
    according to the resource type. Implements the Libhub Resource Hash Convention
    As a convenience, if a vocabulary base is provided, concatenate it to etype and the data keys

    etype - type IRI for the new entity; prefixed with the vocabulary base when not absolute
    ctx_params - optional dict of settings. Keys read here: 'vocabbase', 'entbase', 'existing_ids',
            'plugins', 'logger', 'output_model' and 'ids'
    model_to_update - if truthy, model to which the type statement for the new entity is added
    data - list of key/value pairs used to compute the hash. If empty the hash will be a default for the entity type
            WARNING: THIS FUNCTION MANGLES THE data ARG
    addtype - if true, the type relationship is inserted at the front of data before hashing
    loop - passed through as the first argument to each plugin task
    logger - logger to use when ctx_params carries no 'logger' entry

    Returns the ID produced by the ID generator for the hash plaintext
    '''
    ctx_params = ctx_params or {}
    vocabbase = ctx_params.get('vocabbase', BL)
    entbase = ctx_params.get('entbase')
    existing_ids = ctx_params.get('existing_ids', set())
    plugins = ctx_params.get('plugins')
    #An entry in ctx_params wins, but an explicitly passed logger is honored as the fallback
    logger = ctx_params.get('logger', logger)
    output_model = ctx_params.get('output_model')
    ids = ctx_params.get('ids', default_idgen(entbase))
    if vocabbase and not iri.is_absolute(etype):
        etype = vocabbase + etype
    params = {'logger': logger}

    data = data or []
    #Inserts into the caller's own list when one was passed in (see the warning above)
    if addtype: data.insert(0, [TYPE_REL, etype])
    data_full =  [ ((vocabbase + k if not iri.is_absolute(k) else k), v) for (k, v) in data ]
    plaintext = json.dumps(data_full, separators=(',', ':'), cls=OrderedJsonEncoder)

    eid = ids.send(plaintext)

    if model_to_update:
        model_to_update.add(I(eid), TYPE_REL, I(etype))

    params['materialized_id'] = eid
    #NOTE(review): this is True when eid is already in existing_ids; confirm that matches the intended meaning of 'first_seen'
    params['first_seen'] = eid in existing_ids
    params['plaintext'] = plaintext
    for plugin in plugins or ():
        #Not using yield from
        if BF_MATRES_TASK in plugin:
            #Exhaust the task so that it runs to completion; its yielded values are not used
            for p in plugin[BF_MATRES_TASK](loop, output_model, params): pass
        #logger.debug("Pending tasks: %s" % asyncio.Task.all_tasks(loop))
    return eid
Пример #7
0
def materialize_entity(etype, ctx_params=None, model_to_update=None, data=None, addtype=True, logger=logging):
    '''
    Routine for creating a BIBFRAME resource. Takes the entity (resource) type and a data mapping
    according to the resource type. Implements the Libhub Resource Hash Convention
    As a convenience, if a vocabulary base is provided, concatenate it to etype and the data keys

    etype - type IRI for the new entity; prefixed with the vocabulary base when not absolute
    ctx_params - optional dict of settings. Keys read here: 'vocabbase', 'entbase', 'existing_ids',
            'plugins', 'logger', 'output_model' and 'ids'
    model_to_update - if truthy, model to which the type statement for the new entity is added
    data - list of key/value pairs used to compute the hash. If empty the hash will be a default for the entity type
            WARNING: THIS FUNCTION MANGLES THE data ARG
    addtype - if true, the type relationship is inserted at the front of data before hashing
    logger - logger to use when ctx_params carries no 'logger' entry

    Returns the ID produced by the ID generator for the hash plaintext
    '''
    ctx_params = ctx_params or {}
    vocabbase = ctx_params.get('vocabbase', BL)
    entbase = ctx_params.get('entbase')
    existing_ids = ctx_params.get('existing_ids', set())
    plugins = ctx_params.get('plugins')
    #An entry in ctx_params wins, but an explicitly passed logger is honored as the fallback
    logger = ctx_params.get('logger', logger)
    output_model = ctx_params.get('output_model')
    ids = ctx_params.get('ids', default_idgen(entbase))
    if vocabbase and not iri.is_absolute(etype):
        etype = vocabbase + etype
    params = {'logger': logger}

    data = data or []
    #Inserts into the caller's own list when one was passed in (see the warning above)
    if addtype: data.insert(0, [VTYPE_REL, etype])
    data_full =  [ ((vocabbase + k if not iri.is_absolute(k) else k), v) for (k, v) in data ]
    plaintext = json.dumps(data_full, separators=(',', ':'), cls=OrderedJsonEncoder)

    eid = ids.send(plaintext)

    if model_to_update:
        model_to_update.add(I(eid), VTYPE_REL, I(etype))

    params['materialized_id'] = eid
    #NOTE(review): this is True when eid is already in existing_ids; confirm that matches the intended meaning of 'first_seen'
    params['first_seen'] = eid in existing_ids
    params['plaintext'] = plaintext
    for plugin in plugins or ():
        #Not using yield from
        if BF_MATRES_TASK in plugin:
            #Exhaust the task so that it runs to completion; its yielded values are not used
            for p in plugin[BF_MATRES_TASK](output_model, params): pass
    return eid
Пример #8
0
def resource_id(etype, fprint=None, idgen=default_idgen(None), vocabbase=None):
    '''
    Lowest level routine for generating an ID value using the Versa convention

    The Versa convention originated as the hash algorithm outlined by
    the Libhub initiative for BIBFRAME Lite, and is now codified in the document
    [Computing Versa Resource Hashes](https://github.com/uogbuji/versa/wiki/Computing-Versa-Resource-Hashes).

    etype - type IRI for the new entity (if the entity has multiple types, this is the primary and additional types
    can be provided in the fingerprint set)
    fprint - fingerprint set. List of key/value tuples of data to use in generating its unique ID, or None in which
    case one is just randomly generated
    idgen - ID generator; the default is created once, when the function is defined, and is
    therefore shared by every call that does not pass its own
    vocabbase - for convenience, if provided, called with each relative etype & fingerprint key to resolve it

    Returns the generated ID wrapped as an I instance

    >>> from versa.pipeline import resource_id
    >>> resource_id("http://schema.org/Person", [("http://schema.org/name", "Jonathan Bruce Postel"), ("http://schema.org/birthDate", "1943-08-06")])
    '-7hP9d_Xo8M'
    >>> resource_id("http://schema.org/Person", [("http://schema.org/name", "Augusta Ada King")])
    'xjgOrUFiw_o'
    '''
    if vocabbase and not iri.is_absolute(etype):
        etype = vocabbase(etype)

    fprint_processed = []
    for k, v in fprint or []:
        if vocabbase and not iri.is_absolute(k):
            k = vocabbase(k)
        fprint_processed.append((k, v))

    if fprint_processed:
        #The type is part of the fingerprint; sorting makes the hash independent of the order the pairs were given in
        fprint_processed.append((VTYPE_REL, etype))
        fprint_processed.sort()
        plaintext = json.dumps(fprint_processed, separators=(',', ':'), cls=OrderedJsonEncoder)
        eid = idgen.send(plaintext)
    else:
        #We only have a type; no other distinguishing data. Generate a random hash
        eid = next(idgen)
    return I(eid)
Пример #9
0
def materialize_entity(etype, ctx_params=None, loop=None, model_to_update=None, data=None, addtype=True):
    '''
    Routine for creating a BIBFRAME resource. Takes the entity (resource) type and a data mapping
    according to the resource type. Implements the Libhub Resource Hash Convention
    As a convenience, if a vocabulary base is provided, concatenate it to etype and the data keys

    etype - type IRI for the new entity; prefixed with the vocabulary base when not absolute
    ctx_params - optional dict of settings. Keys read here: 'vocabbase', 'existing_ids', 'plugins',
            'logger', 'output_model' and 'ids'. No fallback ID generator is created here, so
            'ids' has to be supplied
    loop - passed through as the first argument to each plugin task
    model_to_update - if truthy, model to which the type statement for the new entity is added
    data - list of key/value pairs used to compute the hash; when addtype is true and a list
            was passed in, the type pair is inserted into that same list
    addtype - if true, the type relationship is inserted at the front of data before hashing

    Returns the ID produced by the ID generator for the hash plaintext
    '''
    ctx_params = ctx_params or {}
    vocabbase = ctx_params.get('vocabbase', BL)
    #Default to an empty set so the membership test below works when no IDs are being tracked
    existing_ids = ctx_params.get('existing_ids', set())
    plugins = ctx_params.get('plugins')
    logger = ctx_params.get('logger', logging)
    output_model = ctx_params.get('output_model')
    ids = ctx_params.get('ids')
    if vocabbase and not iri.is_absolute(etype):
        etype = vocabbase + etype
    params = {'logger': logger}

    data = data or []
    if addtype: data.insert(0, [TYPE_REL, etype])
    data_full =  [ ((vocabbase + k if not iri.is_absolute(k) else k), v) for (k, v) in data ]
    plaintext = json.dumps(data_full, separators=(',', ':'), cls=OrderedJsonEncoder)

    eid = ids.send(plaintext)

    if model_to_update:
        model_to_update.add(I(eid), TYPE_REL, I(etype))

    params['materialized_id'] = eid
    #NOTE(review): this is True when eid is already in existing_ids; confirm that matches the intended meaning of 'first_seen'
    params['first_seen'] = eid in existing_ids
    params['plaintext'] = plaintext
    for plugin in plugins or ():
        #Not using yield from
        if BF_MATRES_TASK in plugin:
            #Exhaust the task so that it runs to completion; its yielded values are not used
            for p in plugin[BF_MATRES_TASK](loop, output_model, params): pass
        #logger.debug("Pending tasks: %s" % asyncio.Task.all_tasks(loop))
    return eid
Пример #10
0
    def _res(ctx):
        '''
        Versa action function: convert the argument value(s) into IRI references

        :param ctx: Versa context used in processing; its logger attribute is called with a
            message for each value that cannot be converted
        :return: List of the converted values; values that could not be converted are left out
        '''
        candidates = arg(ctx) if callable(arg) else arg
        if not isinstance(candidates, list):
            candidates = [candidates]

        resources = []
        for raw in candidates:
            try:
                ref = I(raw)
            except ValueError:
                # attempt to recover by percent encoding
                try:
                    ref = I(iri.percent_encode(raw))
                except ValueError as err:
                    ctx.logger('Unable to convert "{}" to IRI reference:\n{}'.format(raw, err))
                    continue

            # resolve relative references, but only when a base was given
            if ref and not iri.is_absolute(ref) and base is not None:
                ref = I(iri.absolutize(ref, base))

            resources.append(ref)

        return resources
Пример #11
0
    def _res(ctx):
        '''
        Versa action function: coerce the argument value(s) into IRI references

        :param ctx: Versa context used in processing; the 'logger' entry of its extras receives warnings
        :return: List with one entry per input value; an entry is left as given when it
            is skipped as a relative reference or cannot be converted
        '''
        candidates = arg(ctx) if callable(arg) else arg
        if not isinstance(candidates, list):
            candidates = [candidates]

        results = []
        for entry in candidates:
            if ignore_refs and not iri.is_absolute(entry):
                # relative references are passed through untouched when ignore_refs is set
                results.append(entry)
                continue

            # coerce into an IRIref, but fallout as untyped text otherwise
            try:
                entry = I(entry)
            except ValueError:
                # attempt to recover by percent encoding
                try:
                    entry = I(iri.percent_encode(entry))
                except ValueError as err:
                    ctx.extras['logger'].warn('Unable to convert "{}" to IRI reference:\n{}'.format(entry, err))

            if base is not None and isinstance(entry, I):
                entry = I(iri.absolutize(entry, base))

            results.append(entry)

        return results
Пример #12
0
def resource_id(etype, unique=None, idgen=default_idgen(None), vocabbase=None):
    '''
    Very low level routine for generating an ID value using the hash algorithm
    outlined by the Libhub initiative for BIBFRAME Lite (Libhub Resource Hash Convention).
    https://github.com/zepheira/pybibframe/wiki/From-Records-to-Resources:-the-Library.Link-resource-ID-generation-algorithm
    Takes the entity (resource) type and an ordered data mapping.

    etype - type IRI for the new entity
    unique - list of key/value tuples of data to use in generating its unique ID, or None in which case one is just randomly generated
    idgen - ID generator; the default is created once, when the function is defined, and is
            therefore shared by every call that does not pass its own
    vocabbase - for convenience, if provided, used to resolve relative etype & data keys

    Returns the ID produced by idgen

    >>> from bibframe.util import resource_id
    >>> resource_id("http://schema.org/Person", [("http://schema.org/name", "Jonathan Bruce Postel"), ("http://schema.org/birthDate", "1943-08-06")])
    '-7hP9d_Xo8M'
    >>> resource_id("http://schema.org/Person", [("http://schema.org/name", "Augusta Ada King")])
    'xjgOrUFiw_o'
    '''
    #XXX: Use proper URI normalization? Have a philosophical discussion with Mark about this :)
    if vocabbase: etype = vocabbase + etype

    unique_computed = []
    #unique defaults to None, which is treated the same as having no distinguishing data
    for k, v in unique or []:
        if vocabbase and not iri.is_absolute(k):
            #XXX OK absolutize used here. Go figure
            k = iri.absolutize(k, vocabbase)
        unique_computed.append((k, v))

    if unique_computed:
        #The type pair always leads the plaintext that gets hashed
        unique_computed.insert(0, [VTYPE_REL, etype])
        plaintext = json.dumps(unique_computed, separators=(',', ':'))
        eid = idgen.send(plaintext)
    else:
        #We only have a type; no other distinguishing data. Generate a random hash
        eid = next(idgen)
    return eid
Пример #13
0
    :return: target of the context's current link
    '''
    #Action function generator to multiplex a relationship at processing time
    def _target(ctx):
        '''
        Versa action function utility: look up the target of the link currently being processed

        :param ctx: Versa context used in processing (e.g. includes the prototype link)
        :return: Target of the context's current link
        '''
        current = ctx.current_link
        return current[TARGET]
    return _target


# Prefix key k with namespace ns unless k is already an absolute IRI; the value v is passed through as is
NS_PATCH = lambda ns, k, v: (k, v) if iri.is_absolute(k) else (ns + k, v)
def all_subfields(ctx):
    '''
    Utility to return a hash key from all subfields mentioned in the MARC prototype link

    :param ctx: Versa context used in processing (e.g. includes the prototype link
    :return: Tuple of key/value tuples from the attributes; suitable for hashing
    '''
    #result = [ valitem for keys, values in ctx.linkset[0][ATTRIBUTES].items() for valitem in values ]
    #print(result)
    #for valitem in ctx.linkset[0][ATTRIBUTES].items():
    #    result.extend(valitem)
        #sorted(functools.reduce(lambda a, b: a.extend(b), ))
    #ctx.logger('GRIPPO' + repr(sorted(functools.reduce(lambda a, b: a.extend(b), ctx.linkset[0][ATTRIBUTES].items()))))

    attrs = ctx.current_link[ATTRIBUTES]
Пример #14
0
    def __init__(self,
                 obj,
                 siri=None,
                 encoding=None,
                 streamopenmode='rb',
                 sourcetype=inputsourcetype.unknown):
        '''
        Set up the stream and IRI for an input source, based on the kind of object given

        obj - byte string, proper string (only if you really know what you're doing),
            file-like object (stream), file path or URI.
        siri - optional override URI.  Base URI for the input source will be set to
            this value
        encoding - not used by this initializer
        streamopenmode - mode used to open obj when it is treated as a file path
        sourcetype - inputsourcetype member stating what obj is; when it does not
            decide the matter the type is worked out from obj itself

        Raises ValueError if obj is an empty string or is not recognized as an input source

        >>> from amara3 import inputsource
        >>> inp = inputsource('abc')
        >>> inp.stream
        <_io.StringIO object at 0x1056fbf78>
        >>> inp.iri
        >>> print(inp.iri)
        None
        >>> inp = inputsource(['abc', 'def']) #Now multiple streams in one source
        >>> inp.stream
        <_io.StringIO object at 0x1011aff78>
        >>> print(inp.iri)
        None
        >>> inp = next(inp)
        >>> inp.stream
        <_io.StringIO object at 0x1011af5e8>
        >>> print(inp.iri)
        None
        >>>
        '''
        # from amara3 import inputsource; inp = inputsource('foo.zip')
        # from amara3 import inputsource; inp = inputsource('test/resource/std-examples.zip')
        # s = inp.stream.read(100)
        # s
        # b'<?xml version="1.0" encoding="UTF-8"?>\r\n<!-- edited with XML Spy v4.3 U (http://www.xmlspy.com) by M'
        # s
        # b'<?xml version="1.0" encoding="UTF-8"?>\r\n<collection xmlns="http://www.loc.gov/MARC21/slim">\r\n  <reco'

        self.stream = None
        self.iri = siri
        self.sourcetype = sourcetype

        if obj in ('', b''):
            raise ValueError("Cannot parse an empty string as XML")

        if hasattr(obj, 'read'):
            #Create dummy Uri to use as base
            #uri = uri or uuid4().urn
            self.stream = obj
        #elif sourcetype == inputsourcetype.xmlstring:
        #See this article about XML detection heuristics
        #http://www.xml.com/pub/a/2007/02/28/what-does-xml-smell-like.html
        #uri = uri or uuid4().urn
        elif self.sourcetype == inputsourcetype.iri or (
                siri and iri.matches_uri_syntax(obj)):
            self.iri = siri or obj
            #Open the resolved IRI string, not the iri module
            self.stream = urlopen(self.iri)
        elif self.sourcetype == inputsourcetype.filename or (
                siri and iri.is_absolute(obj) and not os.path.isfile(obj)):
            #NOTE(review): the heuristic selects this branch when obj is NOT an existing file; confirm that is intended
            #FIXME: convert path to URI
            self.iri = siri or iri.os_path_to_uri(obj)
            self.stream = open(obj, streamopenmode)
        elif self.sourcetype == inputsourcetype.string or isinstance(
                obj, (str, bytes)):
            if isinstance(obj, bytes):
                #A text stream cannot be initialized from bytes, so byte strings get a binary stream
                from io import BytesIO
                self.stream = BytesIO(obj)
            else:
                self.stream = StringIO(obj)
            #If obj is beyond a certain length, don't even try it as a URI
            #if len(obj) < MAX_URI_LENGTH_FOR_HEURISTIC:
            #    self.iri = iri.os_path_to_uri(obj)
            #    self.stream = urlopen(siri)
        else:
            raise ValueError("Unable to recognize as an inputsource")
        return
Пример #15
0
    def __init__(self, obj, siri=None, encoding=None, streamopenmode='rb',
                    sourcetype=inputsourcetype.unknown):
        '''
        Set up the stream and IRI for an input source, based on the kind of object given

        obj - byte string, proper string (only if you really know what you're doing),
            file-like object (stream), file path or URI.
        siri - optional override URI.  Base URI for the input source will be set to
            this value
        encoding - not used by this initializer
        streamopenmode - mode used to open obj when it is treated as a file path
        sourcetype - inputsourcetype member stating what obj is; when it does not
            decide the matter the type is worked out from obj itself

        Raises ValueError if obj is an empty string or is not recognized as an input source

        >>> from amara3 import inputsource
        >>> inp = inputsource('abc')
        >>> inp.stream
        <_io.StringIO object at 0x1056fbf78>
        >>> inp.iri
        >>> print(inp.iri)
        None
        >>> inp = inputsource(['abc', 'def']) #Now multiple streams in one source
        >>> inp.stream
        <_io.StringIO object at 0x1011aff78>
        >>> print(inp.iri)
        None
        >>> inp = next(inp)
        >>> inp.stream
        <_io.StringIO object at 0x1011af5e8>
        >>> print(inp.iri)
        None
        >>>
        '''
        # from amara3 import inputsource; inp = inputsource('foo.zip')
        # from amara3 import inputsource; inp = inputsource('test/resource/std-examples.zip')
        # s = inp.stream.read(100)
        # s
        # b'<?xml version="1.0" encoding="UTF-8"?>\r\n<!-- edited with XML Spy v4.3 U (http://www.xmlspy.com) by M'
        # s
        # b'<?xml version="1.0" encoding="UTF-8"?>\r\n<collection xmlns="http://www.loc.gov/MARC21/slim">\r\n  <reco'

        self.stream = None
        self.iri = siri
        self.sourcetype = sourcetype

        if obj in ('', b''):
            raise ValueError("Cannot parse an empty string as XML")

        if hasattr(obj, 'read'):
            #Create dummy Uri to use as base
            #uri = uri or uuid4().urn
            self.stream = obj
        #elif sourcetype == inputsourcetype.xmlstring:
            #See this article about XML detection heuristics
            #http://www.xml.com/pub/a/2007/02/28/what-does-xml-smell-like.html
            #uri = uri or uuid4().urn
        elif self.sourcetype == inputsourcetype.iri or (siri and iri.matches_uri_syntax(obj)):
            self.iri = siri or obj
            #Open the resolved IRI string, not the iri module
            self.stream = urlopen(self.iri)
        elif self.sourcetype == inputsourcetype.filename or (siri and iri.is_absolute(obj) and not os.path.isfile(obj)):
            #NOTE(review): the heuristic selects this branch when obj is NOT an existing file; confirm that is intended
            #FIXME: convert path to URI
            self.iri = siri or iri.os_path_to_uri(obj)
            self.stream = open(obj, streamopenmode)
        elif self.sourcetype == inputsourcetype.string or isinstance(obj, (str, bytes)):
            if isinstance(obj, bytes):
                #A text stream cannot be initialized from bytes, so byte strings get a binary stream
                from io import BytesIO
                self.stream = BytesIO(obj)
            else:
                self.stream = StringIO(obj)
            #If obj is beyond a certain length, don't even try it as a URI
            #if len(obj) < MAX_URI_LENGTH_FOR_HEURISTIC:
            #    self.iri = iri.os_path_to_uri(obj)
            #    self.stream = urlopen(siri)
        else:
            raise ValueError("Unable to recognize as an inputsource")
        return
Пример #16
0
    '''

    #Action function generator to multiplex a relationship at processing time
    def _origin(ctx):
        '''
        Versa action function utility: look up the origin of the link currently being processed

        :param ctx: Versa context used in processing (e.g. includes the prototype link)
        :return: Origin of the context's current link
        '''
        current = ctx.current_link
        return current[ORIGIN]

    return _origin


# Prefix key k with namespace ns unless k is already an absolute IRI; the value v is passed through as is
NS_PATCH = lambda ns, k, v: (k, v) if iri.is_absolute(k) else (ns + k, v)


def all_subfields(ctx):
    '''
    Utility to return a hash key from all subfields mentioned in the MARC prototype link

    :param ctx: Versa context used in processing (e.g. includes the prototype link
    :return: Tuple of key/value tuples from the attributes; suitable for hashing
    '''
    #result = [ valitem for keys, values in ctx.linkset[0][ATTRIBUTES].items() for valitem in values ]
    #print(result)
    #for valitem in ctx.linkset[0][ATTRIBUTES].items():
    #    result.extend(valitem)
    #sorted(functools.reduce(lambda a, b: a.extend(b), ))
    #ctx.logger('GRIPPO' + repr(sorted(functools.reduce(lambda a, b: a.extend(b), ctx.linkset[0][ATTRIBUTES].items()))))