示例#1
0
def bfconvert(inputs, base=None, out=None, limit=None, rdfttl=None, config=None, verbose=False, logger=logging):
    '''
    inputs - List of MARC/XML files to be parsed and converted to BIBFRAME RDF (Note: want to allow singular input strings)
    out - file where raw Versa JSON dump output should be written (default: write to stdout)
    rdfttl - stream to where RDF Turtle output should be written
    config - configuration information
    limit - Limit the number of records processed to this number. If omitted, all records will be processed.
    base - Base IRI to be used for creating resources.
    verbose - If true show additional messages and information (default: False)
    logger - logging object for messages
    '''
    #if stats:
    #    register_service(statsgen.statshandler)

    if hasattr(inputs, 'read') and hasattr(inputs, 'close'):
        #It's a file type?
        inputs = [inputs]
    if limit is not None:
        try:
            limit = int(limit)
        except ValueError:
            logger.debug('Limit must be a number, not "{0}". Ignoring.'.format(limit))

    ids = marc.idgen(base)
    m = memory.connection()
    g = rdflib.Graph()
    g.bind('bf', BFNS)
    g.bind('bfc', BFCNS)
    g.bind('bfd', BFDNS)
    g.bind('v', VNS)

    def postprocess(rec):
        #No need to bother with Versa -> RDF translation if we were not asked to generate Turtle
        if rdfttl is not None: rdf.process(m, g, logger=logger)
        m.create_space()

    #Initialize auxiliary services (i.e. plugins)
    plugins = []
    for pc in config.get(u'plugins', []):
        try:
            plugins.append(g_services[pc[u'id']](
                config=pc,
                logger=logger,
            )
            )
        except KeyError:
            raise Exception(u'Unknown plugin {0}'.format(pc[u'id']))

    limiting = [0, limit]
    for inf in inputs:
        sink = marc.record_handler(m, idbase=base, limiting=limiting, plugins=plugins, ids=ids, postprocess=postprocess, out=out, logger=logger)
        parse_marcxml(inf, sink)

    if rdfttl is not None: rdfttl.write(g.serialize(format="turtle"))
    for plugin in plugins:
        plugin.close()
    return
示例#2
0
def bfconvert(inputs=None, base=None, out=None, limit=None, rdfttl=None, config=None, verbose=False, mods=None):
    '''
    inputs - One or more MARC/XML files to be parsed and converted to BIBFRAME RDF
    out - file where raw Versa JSON dump output should be written (default: write to stdout)
    rdfttl - file where RDF Turtle output should be written
    config - file containing config in JSON format
    stats - file where statistics output should be written in JSON format
    limit - Limit the number of records processed to this number. If omitted, all records will be processed.
    base - Base IRI to be used for creating resources.
    mod - Python module to be imported in order to register plugins.
    verbose - If true show additional messages and information (default: False)
    '''
    if config is None:
        config = {}
    else:
        config = json.load(config)
    logger = logging.getLogger('marc2bfrdf')
    if verbose:
        logger.setLevel(logging.DEBUG)

    for mod in mods or []:
        __import__(mod, globals(), locals(), [])
    from bibframe import g_services

    #if stats:
    #    register_service(statsgen.statshandler)

    if limit is not None:
        try:
            limit = int(limit)
        except ValueError:
            logger.debug('Limit must be a number, not "{0}". Ignoring.'.format(limit))

    ids = marc.idgen(base)
    m = memory.connection()
    g = rdflib.Graph()
    g.bind('bf', BFNS)
    g.bind('bfc', BFCNS)
    g.bind('bfd', BFDNS)
    g.bind('v', VNS)

    def postprocess(rec):
        #No need to bother with Versa -> RDF translation if we were not asked to generate Turtle
        if rdfttl is not None: rdf.process(m, g, logger=logger)
        m.create_space()

    #Initialize auxiliary services (i.e. plugins)
    plugins = []
    for pc in config.get(u'plugins', []):
        try:
            plugins.append(g_services[pc[u'id']](
                config=pc,
                logger=logger,
            )
            )
        except KeyError:
            raise Exception(u'Unknown plugin {0}'.format(pc[u'id']))

    limiting = [0, limit]
    for inf in inputs:
        sink = marc.record_handler(m, idbase=base, limiting=limiting, plugins=plugins, ids=ids, postprocess=postprocess, out=out, logger=logger)
        parse_marcxml(inf, sink)

    if rdfttl is not None: rdfttl.write(g.serialize(format="turtle"))
    for plugin in plugins:
        plugin.close()
    return
示例#3
0
def bfconvert(inputs, handle_marc_source=handle_marcxml_source, entbase=None, model=None,
                out=None, limit=None, rdfttl=None, rdfxml=None, xml=None, config=None,
                verbose=False, logger=logging, loop=None, canonical=False,
                lax=False, defaultsourcetype=inputsourcetype.unknown):
    '''
    inputs - One or more open file-like object, string with MARC content, or filename or IRI. If filename or
                IRI it's a good idea to indicate this via the defaultsourcetype parameter
    handle_marc_source - Function to turn a source of MARC data (e.g. XML or JSON) into the internal format for processing
    entbase - Base IRI to be used for creating resources.
    model - model instance for internal use
    out - file where raw Versa JSON dump output should be written (default: write to stdout)
    limit - Limit the number of records processed to this number. If omitted, all records will be processed.
    rdfttl - stream to where RDF Turtle output should be written
    rdfxml - stream to where RDF/XML output should be written
    config - configuration information
    verbose - If true show additional messages and information (default: False)
    logger - logging object for messages
    loop - optional asyncio event loop to use
    canonical - output Versa's canonical form?
    lax - If True signal to the handle_marc_source function that relaxed syntax rules should be applied
            (e.g. accept XML with namespace problems)
    defaultsourcetype - Signal indicating how best to interpret inputs to create an inputsource
    '''
    #if stats:
    #    register_service(statsgen.statshandler)

    config = config or {}
    if limit is not None:
        try:
            limit = int(limit)
        except ValueError:
            logger.debug('Limit must be a number, not "{0}". Ignoring.'.format(limit))

    def resolve_class(fullname):
        '''
        Given a full name for a Python class, return the class object
        '''
        import importlib
        modpath, name = fullname.rsplit('.', 1)
        module = importlib.import_module(modpath)
        cls = getattr(module, name)
        return cls

    attr_cls = resolve_class(config.get('versa-attr-cls', 'builtins.dict'))
    attr_list_cls = resolve_class(config.get('versa-attr-list-cls', 'builtins.list'))
    #attr_ordered_cls = resolve_class(config.get('versa-attr-cls', 'collections.OrderedDict'))

    model_factory = functools.partial(memory.connection, attr_cls=attr_cls) #,logger=logger)
    model_factory.attr_list_cls = attr_list_cls
    #model_odict_factory = functools.partial(memory.connection, attr_cls=attr_ordered_cls) #,logger=logger)
    #model_odict_factory.attr_list_cls = attr_list_cls

    if 'marc_record_handler' in config:
        handle_marc_source = AVAILABLE_MARC_HANDLERS[config['marc_record_handler']]

    readmode = handle_marc_source.readmode
    #inputs = ( inputsource(open(i, readmode)) for i in inputs )
    #if not isinstance(inputs[0], inputsource):
    #    inputs = ( inputsource(i, streamopenmode=readmode) for i in inputs )

    if handle_marc_source.makeinputsource:
        inputs = factory(inputs, defaultsourcetype=defaultsourcetype, streamopenmode=readmode)
    #inputs = ( inputsource(i, streamopenmode=readmode) for i in inputs )

    ids = marc.idgen(entbase)
    if model is None: model = model_factory()
    g = rdflib.Graph()
    #Intentionally not using either factory
    if canonical: global_model = memory.connection() #logger=logger)

    if xml is not None:
        xmlw = writer.raw(xml, indent='  ')
        xmlw.start_element('bibframe')

    extant_resources = None
    #extant_resources = set()
    def postprocess():
        #No need to bother with Versa -> RDF translation if we were not asked to generate Turtle
        if any((rdfttl, rdfxml)): rdf.process(model, g, to_ignore=extant_resources, logger=logger)
        if canonical: global_model.add_many([(o,r,t,a) for (rid,(o,r,t,a)) in model])

        if xml is not None:
            microxml.process(model, xmlw, to_ignore=extant_resources, logger=logger)

        model.create_space()

    #Set up event loop if not provided
    if not loop:
        loop = asyncio.get_event_loop()

    #Allow configuration of a separate base URI for vocab items (classes & properties)
    #XXX: Is this the best way to do this, or rather via a post-processing plug-in
    vb = config.get('vocab-base-uri', BL)

    transform_iris = config.get('transforms', {})
    if transform_iris:
        transforms = {}
        for tiri in transform_iris:
            try:
                transforms.update(AVAILABLE_TRANSFORMS[tiri])
            except KeyError:
                raise Exception('Unknown transforms set {0}'.format(tiri))
    else:
        transforms = TRANSFORMS

    marcextras_vocab = config.get('marcextras-vocab')

    #Initialize auxiliary services (i.e. plugins)
    plugins = []
    for pc in config.get('plugins', []):
        try:
            pinfo = g_services[pc['id']]
            plugins.append(pinfo)
            pinfo[BF_INIT_TASK](pinfo, config=pc)
        except KeyError:
            raise Exception('Unknown plugin {0}'.format(pc['id']))

    limiting = [0, limit]
    #logger=logger,

    #raise(Exception(repr(inputs)))
    for source in inputs:
        @asyncio.coroutine
        #Wrap the parse operation to make it a task in the event loop
        def wrap_task(): #source=source
            sink = marc.record_handler( loop,
                                        model,
                                        entbase=entbase,
                                        vocabbase=vb,
                                        limiting=limiting,
                                        plugins=plugins,
                                        ids=ids,
                                        postprocess=postprocess,
                                        out=out,
                                        logger=logger,
                                        transforms=transforms,
                                        extra_transforms=extra_transforms(marcextras_vocab),
                                        canonical=canonical,
                                        model_factory=model_factory)

            args = dict(lax=lax)
            handle_marc_source(source, sink, args, logger, model_factory)
            sink.close()
            yield
        task = asyncio.async(wrap_task(), loop=loop)

        try:
            loop.run_until_complete(task)
        except Exception as ex:
            raise ex
        finally:
            loop.close()

    if canonical:
        out.write(repr(global_model))

    if vb == BFZ:
        g.bind('bf', BFNS)
        g.bind('bfc', BFCNS)
        g.bind('bfd', BFDNS)
    else:
        g.bind('vb', rdflib.Namespace(vb))
    if entbase:
        g.bind('ent', entbase)

    if rdfttl is not None:
        logger.debug('Converting to RDF (Turtle).')
        rdfttl.write(g.serialize(format="turtle"))

    if rdfxml is not None:
        logger.debug('Converting to RDF (XML).')
        rdfxml.write(g.serialize(format="pretty-xml"))

    if xml is not None:
        logger.debug('Converting to XML.')
        xmlw.end_element('bibframe')
    return
示例#4
0
def bfconvert(inputs, entbase=None, model=None, out=None, limit=None, rdfttl=None, rdfxml=None, config=None, verbose=False, logger=logging, loop=None, canonical=False, lax=False, zipcheck=False):
    '''
    inputs - List of MARC/XML files to be parsed and converted to BIBFRAME RDF (Note: want to allow singular input strings)
    entbase - Base IRI to be used for creating resources.
    model - model instance for internal use
    out - file where raw Versa JSON dump output should be written (default: write to stdout)
    limit - Limit the number of records processed to this number. If omitted, all records will be processed.
    rdfttl - stream to where RDF Turtle output should be written
    rdfxml - stream to where RDF/XML output should be written
    config - configuration information
    verbose - If true show additional messages and information (default: False)
    logger - logging object for messages
    loop - optional asyncio event loop to use
    canonical - output Versa's canonical form?
    zipcheck - whether to check for zip files among the inputs
    '''
    #if stats:
    #    register_service(statsgen.statshandler)

    config = config or {}
    if hasattr(inputs, 'read') and hasattr(inputs, 'close'):
        #It's a file type?
        inputs = [inputs]
    if limit is not None:
        try:
            limit = int(limit)
        except ValueError:
            logger.debug('Limit must be a number, not "{0}". Ignoring.'.format(limit))

    ids = marc.idgen(entbase)
    if model is None: model = memory.connection(logger=logger)
    g = rdflib.Graph()
    if canonical: global_model = memory.connection(attr_cls=OrderedDict)

    extant_resources = None
    #extant_resources = set()
    def postprocess():
        #No need to bother with Versa -> RDF translation if we were not asked to generate Turtle
        if any((rdfttl, rdfxml)): rdf.process(model, g, to_ignore=extant_resources, logger=logger)
        if canonical: global_model.add_many([(o,r,t,a) for (rid,(o,r,t,a)) in model])

        model.create_space()

    #Set up event loop if not provided
    if not loop:
        loop = asyncio.get_event_loop()

    #Allow configuration of a separate base URI for vocab items (classes & properties)
    #XXX: Is this the best way to do this, or rather via a post-processing plug-in
    vb = config.get('vocab-base-uri', BL)

    transform_iris = config.get('transforms', {})
    if transform_iris:
        transforms = {}
        for tiri in transform_iris:
            try:
                transforms.update(AVAILABLE_TRANSFORMS[tiri])
            except KeyError:
                raise Exception('Unknown transforms set {0}'.format(tiri))
    else:
        transforms = TRANSFORMS

    marcextras_vocab = config.get('marcextras-vocab')

    #Initialize auxiliary services (i.e. plugins)
    plugins = []
    for pc in config.get('plugins', []):
        try:
            pinfo = g_services[pc['id']]
            plugins.append(pinfo)
            pinfo[BF_INIT_TASK](pinfo, config=pc)
        except KeyError:
            raise Exception('Unknown plugin {0}'.format(pc['id']))

    limiting = [0, limit]
    #logger=logger,
    
    if zipcheck:
        warnings.warn("The zipcheck option is not working yet.", RuntimeWarning)
    
    for source_file in inputs:
        #Note: 
        def input_fileset(sf):
            if zipcheck and zipfile.is_zipfile(sf):
                zf = zipfile.ZipFile(sf, 'r')
                for info in list(zf.infolist()):
                    #From the doc: Note If the ZipFile was created by passing in a file-like object as the first argument to the constructor, then the object returned by open() shares the ZipFile’s file pointer. Under these circumstances, the object returned by open() should not be used after any additional operations are performed on the ZipFile object.
                    sf.seek(0, 0)
                    zf = zipfile.ZipFile(sf, 'r')
                    yield zf.open(info, mode='r')
            else:
                if zipcheck:
                    #Because zipfile.is_zipfile fast forwards to EOF
                    sf.seek(0, 0)
                yield sf

        for inf in input_fileset(source_file):
            @asyncio.coroutine
            #Wrap the parse operation to make it a task in the event loop
            def wrap_task(inf=inf):
                #Cannot reuse a pyexpat parser, so must create a new one for each input file
                sink = marc.record_handler( loop,
                                            model,
                                            entbase=entbase,
                                            vocabbase=vb,
                                            limiting=limiting,
                                            plugins=plugins,
                                            ids=ids,
                                            postprocess=postprocess,
                                            out=out,
                                            logger=logger,
                                            transforms=transforms,
                                            extra_transforms=extra_transforms(marcextras_vocab),
                                            canonical=canonical)

                if lax:
                    parser = xml.parsers.expat.ParserCreate()
                else:
                    parser = xml.parsers.expat.ParserCreate(namespace_separator=NSSEP)
                handler = expat_callbacks(sink, parser, lax)

                parser.StartElementHandler = handler.start_element
                parser.EndElementHandler = handler.end_element
                parser.CharacterDataHandler = handler.char_data
                parser.buffer_text = True

                parser.ParseFile(inf)
                if handler.no_records:
                    warnings.warn("No records found in this file. Possibly an XML namespace problem (try using the 'lax' flag).", RuntimeWarning)
                sink.close()
                yield
            task = asyncio.async(wrap_task(), loop=loop)

    try:
        loop.run_until_complete(task)
    except Exception as ex:
        raise ex
    finally:
        loop.close()

    if canonical:
        out.write(repr(global_model))

    if vb == BFZ:
        g.bind('bf', BFNS)
        g.bind('bfc', BFCNS)
        g.bind('bfd', BFDNS)
    else:
        g.bind('vb', rdflib.Namespace(vb))
    if entbase:
        g.bind('ent', entbase)

    if rdfttl is not None:
        logger.debug('Converting to RDF.')
        rdfttl.write(g.serialize(format="turtle"))

    if rdfxml is not None:
        logger.debug('Converting to RDF.')
        rdfxml.write(g.serialize(format="pretty-xml"))
    return
示例#5
0
def bfconvert(inputs, base=None, out=None, limit=None, rdfttl=None, rdfxml=None, config=None, verbose=False, logger=logging):
    '''
    inputs - List of MARC/XML files to be parsed and converted to BIBFRAME RDF (Note: want to allow singular input strings)
    out - file where raw Versa JSON dump output should be written (default: write to stdout)
    rdfttl - stream to where RDF Turtle output should be written
    config - configuration information
    limit - Limit the number of records processed to this number. If omitted, all records will be processed.
    base - Base IRI to be used for creating resources.
    verbose - If true show additional messages and information (default: False)
    logger - logging object for messages
    '''
    #if stats:
    #    register_service(statsgen.statshandler)

    config = config or {}
    if hasattr(inputs, 'read') and hasattr(inputs, 'close'):
        #It's a file type?
        inputs = [inputs]
    if limit is not None:
        try:
            limit = int(limit)
        except ValueError:
            logger.debug('Limit must be a number, not "{0}". Ignoring.'.format(limit))

    ids = marc.idgen(base)
    m = memory.connection()
    g = rdflib.Graph()
    g.bind('bf', BFNS)
    g.bind('bfc', BFCNS)
    g.bind('bfd', BFDNS)
    g.bind('v', VNS)
    if base:
        g.bind('ent', base)

    extant_resources = None
    #extant_resources = set()
    def postprocess(rec):
        #No need to bother with Versa -> RDF translation if we were not asked to generate Turtle
        if any((rdfttl, rdfxml)): rdf.process(m, g, to_ignore=extant_resources, logger=logger)
        m.create_space()

    #Set up event loop
    loop = asyncio.get_event_loop()

    #Allow configuration of a separate base URI for vocab items (classes & properties)
    #XXX: Is this the best way to do this, or rather via a post-processing plug-in
    vb = config.get(u'vocab-base-uri', BFZ)

    #Initialize auxiliary services (i.e. plugins)
    plugins = []
    for pc in config.get(u'plugins', []):
        try:
            pinfo = g_services[pc[u'id']]
            plugins.append(pinfo)
            pinfo[BF_INIT_TASK](pinfo, config=pc)
        except KeyError:
            raise Exception(u'Unknown plugin {0}'.format(pc[u'id']))

    limiting = [0, limit]
    #logger=logger,
    
    for inf in inputs:
        sink = marc.record_handler(loop, m, entbase=base, vocabbase=vb, limiting=limiting, plugins=plugins, ids=ids, postprocess=postprocess, out=out, logger=logger)
        parser = sax.make_parser()
        #parser.setContentHandler(marcxmlhandler(receive_recs()))
        parser.setContentHandler(marcxmlhandler(sink))
        parser.setFeature(sax.handler.feature_namespaces, 1)
        @asyncio.coroutine
        #Wrap the parse operation to make it a task in the event loop
        def wrap_task():
            parser.parse(inf)
            yield
        task = asyncio.Task(wrap_task())
        #parse_marcxml(inf, sink)
        try:
            loop.run_until_complete(task)
        except Exception as ex:
            raise ex
        finally:
            loop.close()

    if rdfttl is not None:
        logger.debug('Converting to RDF.')
        rdfttl.write(g.serialize(format="turtle"))

    if rdfxml is not None:
        logger.debug('Converting to RDF.')
        rdfxml.write(g.serialize(format="pretty-xml"))
    return