def wrap_task(inf=inf):
    '''
    Generator-based task body for one input file.

    A pyexpat parser cannot be reused, so every input file gets a freshly
    created parser wired to a freshly created record-handler sink. The final
    bare `yield` makes this a generator so it can be scheduled as a task.
    '''
    record_sink = marc.record_handler(
        loop, model,
        entbase=entbase, vocabbase=vb, limiting=limiting,
        plugins=plugins, ids=ids, postprocess=postprocess,
        out=out, logger=logger, transforms=transforms,
        extra_transforms=extra_transforms(marcextras_vocab),
        canonical=canonical)
    #Namespace-aware parsing unless lax handling was requested
    expat_parser = (xml.parsers.expat.ParserCreate() if lax
                    else xml.parsers.expat.ParserCreate(namespace_separator=NSSEP))
    callbacks = expat_callbacks(record_sink, expat_parser, lax)
    expat_parser.StartElementHandler = callbacks.start_element
    expat_parser.EndElementHandler = callbacks.end_element
    expat_parser.CharacterDataHandler = callbacks.char_data
    expat_parser.buffer_text = True
    expat_parser.ParseFile(inf)
    if callbacks.no_records:
        warnings.warn("No records found in this file. Possibly an XML namespace problem (try using the 'lax' flag).", RuntimeWarning)
    record_sink.close()
    yield
def bfconvert(inputs, base=None, out=None, limit=None, rdfttl=None, config=None, verbose=False, logger=logging):
    '''
    Parse MARC/XML inputs and convert them to BIBFRAME RDF (raw Versa JSON
    dump, plus optional RDF Turtle).

    inputs - List of MARC/XML files to be parsed and converted to BIBFRAME RDF (Note: want to allow singular input strings)
    out - file where raw Versa JSON dump output should be written (default: write to stdout)
    rdfttl - stream to where RDF Turtle output should be written
    config - configuration information (mapping); may be omitted
    limit - Limit the number of records processed to this number. If omitted, all records will be processed.
    base - Base IRI to be used for creating resources.
    verbose - If true show additional messages and information (default: False)
    logger - logging object for messages
    '''
    #if stats:
    #    register_service(statsgen.statshandler)

    #FIX: config defaults to None but is dereferenced below via config.get(),
    #which raised AttributeError whenever no config was supplied. Normalize to
    #an empty dict (consistent with the other bfconvert variant in this file).
    config = config or {}

    if hasattr(inputs, 'read') and hasattr(inputs, 'close'):
        #It's a file type?
        inputs = [inputs]

    if limit is not None:
        try:
            limit = int(limit)
        except ValueError:
            logger.debug('Limit must be a number, not "{0}". Ignoring.'.format(limit))

    ids = marc.idgen(base)
    m = memory.connection()
    g = rdflib.Graph()
    g.bind('bf', BFNS)
    g.bind('bfc', BFCNS)
    g.bind('bfd', BFDNS)
    g.bind('v', VNS)

    def postprocess(rec):
        #No need to bother with Versa -> RDF translation if we were not asked to generate Turtle
        if rdfttl is not None:
            rdf.process(m, g, logger=logger)
        m.create_space()

    #Initialize auxiliary services (i.e. plugins)
    plugins = []
    for pc in config.get(u'plugins', []):
        try:
            plugins.append(g_services[pc[u'id']](
                config=pc,
                logger=logger,
            ))
        except KeyError:
            raise Exception(u'Unknown plugin {0}'.format(pc[u'id']))

    #limiting[0] counts records processed so far; limiting[1] is the cap (None = unlimited)
    limiting = [0, limit]
    for inf in inputs:
        sink = marc.record_handler(m, idbase=base, limiting=limiting, plugins=plugins, ids=ids, postprocess=postprocess, out=out, logger=logger)
        parse_marcxml(inf, sink)

    if rdfttl is not None:
        rdfttl.write(g.serialize(format="turtle"))

    for plugin in plugins:
        plugin.close()
    return
def wrap_task():
    '''
    Generator-based task body: run the MARC source through a freshly created
    record-handler sink, then close the sink. The trailing bare `yield` makes
    this a generator so it can be scheduled as a task.
    '''
    record_sink = marc.record_handler(
        loop, model,
        entbase=entbase, vocabbase=vb, limiting=limiting,
        plugins=plugins, ids=ids, postprocess=postprocess,
        out=out, logger=logger, transforms=transforms,
        extra_transforms=extra_transforms(marcextras_vocab),
        canonical=canonical, model_factory=model_factory)
    handler_params = {'lax': lax}
    handle_marc_source(source, record_sink, handler_params, logger, model_factory)
    record_sink.close()
    yield
def bfconvert(inputs=None, base=None, out=None, limit=None, rdfttl=None, config=None, verbose=False, mods=None):
    '''
    Parse MARC/XML files and convert them to BIBFRAME RDF (raw Versa JSON
    dump, plus optional RDF Turtle).

    inputs - One or more MARC/XML files to be parsed and converted to BIBFRAME RDF
    out - file where raw Versa JSON dump output should be written (default: write to stdout)
    rdfttl - file where RDF Turtle output should be written
    config - file containing config in JSON format
    limit - Limit the number of records processed to this number. If omitted, all records will be processed.
    base - Base IRI to be used for creating resources.
    mods - Python modules to be imported in order to register plugins.
    verbose - If true show additional messages and information (default: False)
    '''
    if config is None:
        config = {}
    else:
        config = json.load(config)
    #FIX: inputs defaults to None but was iterated unconditionally below,
    #raising TypeError when omitted. Treat a missing input list as empty.
    inputs = inputs or []

    logger = logging.getLogger('marc2bfrdf')
    if verbose:
        logger.setLevel(logging.DEBUG)

    #Import plugin-registration modules for their side effects before
    #g_services is consulted
    for mod in mods or []:
        __import__(mod, globals(), locals(), [])
    from bibframe import g_services

    #if stats:
    #    register_service(statsgen.statshandler)

    if limit is not None:
        try:
            limit = int(limit)
        except ValueError:
            logger.debug('Limit must be a number, not "{0}". Ignoring.'.format(limit))

    ids = marc.idgen(base)
    m = memory.connection()
    g = rdflib.Graph()
    g.bind('bf', BFNS)
    g.bind('bfc', BFCNS)
    g.bind('bfd', BFDNS)
    g.bind('v', VNS)

    def postprocess(rec):
        #No need to bother with Versa -> RDF translation if we were not asked to generate Turtle
        if rdfttl is not None:
            rdf.process(m, g, logger=logger)
        m.create_space()

    #Initialize auxiliary services (i.e. plugins)
    plugins = []
    for pc in config.get(u'plugins', []):
        try:
            plugins.append(g_services[pc[u'id']](
                config=pc,
                logger=logger,
            ))
        except KeyError:
            raise Exception(u'Unknown plugin {0}'.format(pc[u'id']))

    #limiting[0] counts records processed so far; limiting[1] is the cap (None = unlimited)
    limiting = [0, limit]
    for inf in inputs:
        sink = marc.record_handler(m, idbase=base, limiting=limiting, plugins=plugins, ids=ids, postprocess=postprocess, out=out, logger=logger)
        parse_marcxml(inf, sink)

    if rdfttl is not None:
        rdfttl.write(g.serialize(format="turtle"))

    for plugin in plugins:
        plugin.close()
    return
def bfconvert(inputs, base=None, out=None, limit=None, rdfttl=None, rdfxml=None, config=None, verbose=False, logger=logging):
    '''
    Parse MARC/XML inputs and convert them to BIBFRAME RDF (raw Versa JSON
    dump, plus optional RDF Turtle and/or RDF/XML), driving record handling
    through an asyncio event loop.

    inputs - List of MARC/XML files to be parsed and converted to BIBFRAME RDF (Note: want to allow singular input strings)
    out - file where raw Versa JSON dump output should be written (default: write to stdout)
    rdfttl - stream to where RDF Turtle output should be written
    rdfxml - stream to where RDF/XML (pretty) output should be written
    config - configuration information
    limit - Limit the number of records processed to this number. If omitted, all records will be processed.
    base - Base IRI to be used for creating resources.
    verbose - If true show additional messages and information (default: False)
    logger - logging object for messages
    '''
    #if stats:
    #    register_service(statsgen.statshandler)

    config = config or {}

    if hasattr(inputs, 'read') and hasattr(inputs, 'close'):
        #It's a file type?
        inputs = [inputs]

    if limit is not None:
        try:
            limit = int(limit)
        except ValueError:
            logger.debug('Limit must be a number, not "{0}". Ignoring.'.format(limit))

    ids = marc.idgen(base)
    m = memory.connection()
    g = rdflib.Graph()
    g.bind('bf', BFNS)
    g.bind('bfc', BFCNS)
    g.bind('bfd', BFDNS)
    g.bind('v', VNS)
    if base:
        g.bind('ent', base)

    extant_resources = None
    #extant_resources = set()
    def postprocess(rec):
        #No need to bother with Versa -> RDF translation if we were not asked to generate RDF output
        if any((rdfttl, rdfxml)):
            rdf.process(m, g, to_ignore=extant_resources, logger=logger)
        m.create_space()

    #Set up event loop
    loop = asyncio.get_event_loop()

    #Allow configuration of a separate base URI for vocab items (classes & properties)
    #XXX: Is this the best way to do this, or rather via a post-processing plug-in
    vb = config.get(u'vocab-base-uri', BFZ)

    #Initialize auxiliary services (i.e. plugins)
    plugins = []
    for pc in config.get(u'plugins', []):
        try:
            pinfo = g_services[pc[u'id']]
            plugins.append(pinfo)
            pinfo[BF_INIT_TASK](pinfo, config=pc)
        except KeyError:
            raise Exception(u'Unknown plugin {0}'.format(pc[u'id']))

    #limiting[0] counts records processed so far; limiting[1] is the cap (None = unlimited)
    limiting = [0, limit]
    #logger=logger,
    try:
        for inf in inputs:
            sink = marc.record_handler(loop, m, entbase=base, vocabbase=vb, limiting=limiting, plugins=plugins, ids=ids, postprocess=postprocess, out=out, logger=logger)
            parser = sax.make_parser()
            #parser.setContentHandler(marcxmlhandler(receive_recs()))
            parser.setContentHandler(marcxmlhandler(sink))
            parser.setFeature(sax.handler.feature_namespaces, 1)

            @asyncio.coroutine
            #Wrap the parse operation to make it a task in the event loop
            def wrap_task(inf=inf, parser=parser):
                #Bind inf/parser as defaults to avoid late-binding surprises
                parser.parse(inf)
                yield

            task = asyncio.Task(wrap_task())
            #parse_marcxml(inf, sink)
            loop.run_until_complete(task)
    finally:
        #FIX: the loop was previously closed in a finally block INSIDE the for
        #loop, so any second input file failed against a closed event loop.
        #Close it exactly once, after all inputs are processed.
        #(The former `except Exception as ex: raise ex` was a no-op re-raise
        #and has been dropped; exceptions still propagate unchanged.)
        loop.close()

    if rdfttl is not None:
        logger.debug('Converting to RDF.')
        rdfttl.write(g.serialize(format="turtle"))
    if rdfxml is not None:
        logger.debug('Converting to RDF.')
        rdfxml.write(g.serialize(format="pretty-xml"))
    return