Exemplo n.º 1
0
            def wrap_task(inf=inf):
                '''
                Generator that parses one input with its own expat parser, feeding the
                parse events to a newly created MARC record handler, then closes that
                handler and yields once.

                inf - input object handed to the parser's ParseFile; defaults to the
                      value of `inf` in the enclosing scope at definition time
                '''
                record_sink = marc.record_handler(
                    loop,
                    model,
                    entbase=entbase,
                    vocabbase=vb,
                    limiting=limiting,
                    plugins=plugins,
                    ids=ids,
                    postprocess=postprocess,
                    out=out,
                    logger=logger,
                    transforms=transforms,
                    extra_transforms=extra_transforms(marcextras_vocab),
                    canonical=canonical,
                )

                #A pyexpat parser cannot be reused, so a new one is built for each input file.
                #The namespace separator is only supplied when not running in lax mode.
                parser_options = {} if lax else {'namespace_separator': NSSEP}
                xml_parser = xml.parsers.expat.ParserCreate(**parser_options)
                callbacks = expat_callbacks(record_sink, xml_parser, lax)

                xml_parser.buffer_text = True
                xml_parser.CharacterDataHandler = callbacks.char_data
                xml_parser.EndElementHandler = callbacks.end_element
                xml_parser.StartElementHandler = callbacks.start_element

                xml_parser.ParseFile(inf)
                if handler_saw_nothing(callbacks) if False else callbacks.no_records:
                    warnings.warn("No records found in this file. Possibly an XML namespace problem (try using the 'lax' flag).", RuntimeWarning)
                record_sink.close()
                yield
Exemplo n.º 2
0
def bfconvert(inputs, base=None, out=None, limit=None, rdfttl=None, config=None, verbose=False, logger=logging):
    '''
    Run each MARC/XML input through a MARC record handler and, when requested,
    write the accumulated RDF graph out as Turtle.

    inputs - List of MARC/XML files to be parsed and converted to BIBFRAME RDF.
             A single file-like object (one with read and close) is also accepted.
    out - file where raw Versa JSON dump output should be written (default: write to stdout)
    rdfttl - stream to where RDF Turtle output should be written
    config - configuration information (mapping); None is treated as an empty configuration
    limit - Limit the number of records processed to this number. If omitted, or not
            convertible to an integer, no limit is passed on.
    base - Base IRI to be used for creating resources.
    verbose - accepted for interface compatibility; not used in this function
    logger - logging object for messages

    Raises Exception if a configured plugin ID is not registered in g_services.
    '''
    #if stats:
    #    register_service(statsgen.statshandler)

    #config defaults to None, so normalize it before calling .get() on it below
    config = config or {}
    if hasattr(inputs, 'read') and hasattr(inputs, 'close'):
        #It's a file type?
        inputs = [inputs]
    if limit is not None:
        try:
            limit = int(limit)
        except ValueError:
            logger.debug('Limit must be a number, not "{0}". Ignoring.'.format(limit))
            #Actually ignore the bad value rather than handing it on to the record handler
            limit = None

    ids = marc.idgen(base)
    m = memory.connection()
    g = rdflib.Graph()
    g.bind('bf', BFNS)
    g.bind('bfc', BFCNS)
    g.bind('bfd', BFDNS)
    g.bind('v', VNS)

    def postprocess(rec):
        #No need to bother with Versa -> RDF translation if we were not asked to generate Turtle
        if rdfttl is not None:
            rdf.process(m, g, logger=logger)
        #NOTE(review): presumably starts a clean model space for the next record - confirm
        m.create_space()

    #Initialize auxiliary services (i.e. plugins)
    plugins = []
    for pc in config.get(u'plugins', []):
        plugin_id = pc[u'id']
        #Only the registry lookup is guarded, so a KeyError raised inside a plugin's
        #own constructor is not misreported as an unknown plugin
        try:
            plugin_factory = g_services[plugin_id]
        except KeyError:
            raise Exception(u'Unknown plugin {0}'.format(plugin_id))
        plugins.append(plugin_factory(config=pc, logger=logger))

    #The same list object is handed to every record handler, so it is shared across inputs
    limiting = [0, limit]
    for inf in inputs:
        sink = marc.record_handler(m, idbase=base, limiting=limiting, plugins=plugins, ids=ids, postprocess=postprocess, out=out, logger=logger)
        parse_marcxml(inf, sink)

    if rdfttl is not None:
        rdfttl.write(g.serialize(format="turtle"))
    for plugin in plugins:
        plugin.close()
Exemplo n.º 3
0
        def wrap_task(): #source=source
            '''
            Generator that builds a MARC record handler, runs handle_marc_source over
            `source` with it, closes the handler and then yields once.

            `source` is read from the enclosing scope rather than being a parameter.
            '''
            record_sink = marc.record_handler(
                loop,
                model,
                entbase=entbase,
                vocabbase=vb,
                limiting=limiting,
                plugins=plugins,
                ids=ids,
                postprocess=postprocess,
                out=out,
                logger=logger,
                transforms=transforms,
                extra_transforms=extra_transforms(marcextras_vocab),
                canonical=canonical,
                model_factory=model_factory,
            )

            #Options forwarded to the source handler; only the lax flag is passed
            source_options = {'lax': lax}
            handle_marc_source(source, record_sink, source_options, logger, model_factory)
            record_sink.close()
            yield
Exemplo n.º 4
0
def bfconvert(inputs=None, base=None, out=None, limit=None, rdfttl=None, config=None, verbose=False, mods=None):
    '''
    Run each MARC/XML input through a MARC record handler and, when requested,
    write the accumulated RDF graph out as Turtle.

    inputs - One or more MARC/XML files to be parsed and converted to BIBFRAME RDF.
             None means no inputs; a single file-like object (one with read and close)
             is also accepted.
    out - file where raw Versa JSON dump output should be written (default: write to stdout)
    rdfttl - file where RDF Turtle output should be written
    config - file containing config in JSON format
    limit - Limit the number of records processed to this number. If omitted, or not
            convertible to an integer, no limit is passed on.
    base - Base IRI to be used for creating resources.
    mods - Python modules to be imported (by name) in order to register plugins.
    verbose - If true show additional messages and information (default: False)

    Raises Exception if a configured plugin ID is not registered in g_services.
    '''
    if config is None:
        config = {}
    else:
        config = json.load(config)
    logger = logging.getLogger('marc2bfrdf')
    if verbose:
        logger.setLevel(logging.DEBUG)

    #Plugin modules are imported first; g_services is only imported from bibframe afterwards
    for mod in mods or []:
        __import__(mod, globals(), locals(), [])
    from bibframe import g_services

    #if stats:
    #    register_service(statsgen.statshandler)

    #inputs defaults to None, which cannot be iterated; treat it as "nothing to convert"
    if inputs is None:
        inputs = []
    elif hasattr(inputs, 'read') and hasattr(inputs, 'close'):
        #A lone file-like object: wrap it so the loop below sees one input, not its lines
        inputs = [inputs]

    if limit is not None:
        try:
            limit = int(limit)
        except ValueError:
            logger.debug('Limit must be a number, not "{0}". Ignoring.'.format(limit))
            #Actually ignore the bad value rather than handing it on to the record handler
            limit = None

    ids = marc.idgen(base)
    m = memory.connection()
    g = rdflib.Graph()
    g.bind('bf', BFNS)
    g.bind('bfc', BFCNS)
    g.bind('bfd', BFDNS)
    g.bind('v', VNS)

    def postprocess(rec):
        #No need to bother with Versa -> RDF translation if we were not asked to generate Turtle
        if rdfttl is not None:
            rdf.process(m, g, logger=logger)
        #NOTE(review): presumably starts a clean model space for the next record - confirm
        m.create_space()

    #Initialize auxiliary services (i.e. plugins)
    plugins = []
    for pc in config.get(u'plugins', []):
        plugin_id = pc[u'id']
        #Only the registry lookup is guarded, so a KeyError raised inside a plugin's
        #own constructor is not misreported as an unknown plugin
        try:
            plugin_factory = g_services[plugin_id]
        except KeyError:
            raise Exception(u'Unknown plugin {0}'.format(plugin_id))
        plugins.append(plugin_factory(config=pc, logger=logger))

    #The same list object is handed to every record handler, so it is shared across inputs
    limiting = [0, limit]
    for inf in inputs:
        sink = marc.record_handler(m, idbase=base, limiting=limiting, plugins=plugins, ids=ids, postprocess=postprocess, out=out, logger=logger)
        parse_marcxml(inf, sink)

    if rdfttl is not None:
        rdfttl.write(g.serialize(format="turtle"))
    for plugin in plugins:
        plugin.close()
Exemplo n.º 5
0
def bfconvert(inputs, base=None, out=None, limit=None, rdfttl=None, rdfxml=None, config=None, verbose=False, logger=logging):
    '''
    Parse each MARC/XML input with a SAX parser, as a task on the asyncio event
    loop, and, when requested, write the accumulated RDF graph as Turtle and/or RDF/XML.

    inputs - List of MARC/XML files to be parsed and converted to BIBFRAME RDF.
             A single file-like object (one with read and close) is also accepted.
    out - file where raw Versa JSON dump output should be written (default: write to stdout)
    rdfttl - stream to where RDF Turtle output should be written
    rdfxml - stream to where RDF/XML ("pretty-xml") output should be written
    config - configuration information (mapping); None is treated as an empty configuration
    limit - Limit the number of records processed to this number. If omitted, or not
            convertible to an integer, no limit is passed on.
    base - Base IRI to be used for creating resources.
    verbose - accepted for interface compatibility; not used in this function
    logger - logging object for messages

    Raises Exception if a configured plugin cannot be looked up or initialized
    because of a KeyError.

    Note: the event loop obtained from asyncio.get_event_loop() is closed before
    this function returns.
    '''
    #if stats:
    #    register_service(statsgen.statshandler)

    config = config or {}
    if hasattr(inputs, 'read') and hasattr(inputs, 'close'):
        #It's a file type?
        inputs = [inputs]
    if limit is not None:
        try:
            limit = int(limit)
        except ValueError:
            logger.debug('Limit must be a number, not "{0}". Ignoring.'.format(limit))
            #Actually ignore the bad value rather than handing it on to the record handler
            limit = None

    ids = marc.idgen(base)
    m = memory.connection()
    g = rdflib.Graph()
    g.bind('bf', BFNS)
    g.bind('bfc', BFCNS)
    g.bind('bfd', BFDNS)
    g.bind('v', VNS)
    if base:
        g.bind('ent', base)

    extant_resources = None

    def postprocess(rec):
        #No need to bother with Versa -> RDF translation if we were not asked to generate RDF output
        if any((rdfttl, rdfxml)):
            rdf.process(m, g, to_ignore=extant_resources, logger=logger)
        #NOTE(review): presumably starts a clean model space for the next record - confirm
        m.create_space()

    #Set up event loop
    loop = asyncio.get_event_loop()

    #Allow configuration of a separate base URI for vocab items (classes & properties)
    #XXX: Is this the best way to do this, or rather via a post-processing plug-in
    vb = config.get(u'vocab-base-uri', BFZ)

    #Initialize auxiliary services (i.e. plugins)
    #Any KeyError in the lookup or in the init task is reported as an unknown plugin
    plugins = []
    for pc in config.get(u'plugins', []):
        try:
            pinfo = g_services[pc[u'id']]
            plugins.append(pinfo)
            pinfo[BF_INIT_TASK](pinfo, config=pc)
        except KeyError:
            raise Exception(u'Unknown plugin {0}'.format(pc[u'id']))

    #The same list object is handed to every record handler, so it is shared across inputs
    limiting = [0, limit]

    #The loop is closed exactly once, after the last input or on the first error.
    #Closing it inside the per-input loop would leave a closed event loop for the
    #second and later inputs.
    try:
        for inf in inputs:
            sink = marc.record_handler(loop, m, entbase=base, vocabbase=vb, limiting=limiting, plugins=plugins, ids=ids, postprocess=postprocess, out=out, logger=logger)
            parser = sax.make_parser()
            parser.setContentHandler(marcxmlhandler(sink))
            parser.setFeature(sax.handler.feature_namespaces, 1)

            #Wrap the parse operation to make it a task in the event loop.
            #parser and inf are bound as defaults so the task keeps this iteration's objects.
            #NOTE: asyncio.coroutine is deprecated since Python 3.8 and removed in 3.11.
            @asyncio.coroutine
            def wrap_task(parser=parser, inf=inf):
                parser.parse(inf)
                yield

            #run_until_complete schedules the coroutine as a task on this loop itself
            loop.run_until_complete(wrap_task())
    finally:
        loop.close()

    if rdfttl is not None:
        logger.debug('Converting to RDF.')
        rdfttl.write(g.serialize(format="turtle"))

    if rdfxml is not None:
        logger.debug('Converting to RDF.')
        rdfxml.write(g.serialize(format="pretty-xml"))