def get_record(target, identifier):
    """Fetch a single record from the configured OAI target and convert it.

    :param target: dict with 'url', 'metadata_prefix' and 'title' keys,
        or None (in which case None is returned)
    :param identifier: OAI identifier of the record to fetch
    :returns: the converted record, or None when no target is given
    """
    if target is None:
        return None
    client = Client(target['url'], registry)
    record = client.getRecord(identifier=identifier,
                              metadataPrefix=target['metadata_prefix'])
    return convert_record(record, target['metadata_prefix'], target['title'])
class Repository(object):
    """
    Repository handles interaction with the various interfaces provided by
    the dspace repository.
    """

    def __init__(self, url=None, **kwargs):
        """
        :param url: deprecated full OAI-PMH URL; use ``base_url`` and
            ``oai_path`` instead.
        :param kwargs: recognises ``base_url``, ``oai_path``,
            ``oai_enabled``, ``sword_enabled``, ``metadata_registry`` and
            ``sword_*`` options (forwarded to :class:`SwordService`); any
            remaining keyword arguments are passed to the OAI ``Client``.
        :raises ValueError: when no base URL can be determined.
        """
        self.base_url = kwargs.pop('base_url', None)
        self.oai_path = kwargs.pop('oai_path', None)
        self.oai_enabled = bool(kwargs.pop('oai_enabled', True))
        self.sword_enabled = bool(kwargs.pop('sword_enabled', False))
        if url is not None:
            warn(
                'The url paramater will not be supported in version 3, '
                'use base_url and oai_path instead', DeprecationWarning)
            # Best effort: split the legacy single url into base_url and
            # oai_path, depending on which parts were supplied explicitly.
            if (self.base_url and url.startswith(self.base_url)
                    and self.oai_path is None):
                self.oai_path = url.replace(self.base_url, '', 1).lstrip('/')
            elif not self.base_url:
                if self.oai_path is None:
                    self.oai_path = 'dspace-oai/request'
                if url.endswith(self.oai_path):
                    self.base_url = url[:-(len(self.oai_path) + 1)]
        if self.base_url is None:
            raise ValueError('base_url argument must be specified')
        if 'metadata_registry' not in kwargs:
            kwargs['metadata_registry'] = MetadataRegistry()
            kwargs['metadata_registry'].registerReader('mets',
                                                       dspace_mets_reader)
        if self.sword_enabled:
            skwargs = {'base_url': self.base_url}
            # Iterate over a snapshot of the keys: popping from the dict
            # while iterating the live view raises RuntimeError on Python 3.
            for key in list(kwargs.keys()):
                if key.startswith('sword_'):
                    skwargs[key[6:]] = kwargs.pop(key)
            self.sword = SwordService(**skwargs)
        if self.oai_enabled:
            self.oai = Client('/'.join((
                self.base_url,
                self.oai_path,
            )), **kwargs)
        self.identifier_base = self._extractIdentifierBase(self.base_url)

    def _extractIdentifierBase(self, url):
        """ From a given URL, extract the OAI identifier base (hostname) """
        return urlparse(url).hostname

    def _extractSet(self, handle):
        """ Determine the OAI set from a collection handle """
        if not isinstance(handle, basestring):
            raise ValueError('Collection handles must be strings')
        return 'hdl_' + handle.replace('/', '_').replace(':', '_')

    def getName(self):
        """ Get the configured name of the repository """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        return self.oai.identify().repositoryName()

    def getCollections(self):
        """ Get a list of the collections in the repository """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        # List comprehension keeps the list return type on Python 3 too,
        # where map() is lazy.
        return [c[0:2] for c in self.oai.listSets()]

    def getItemHandles(self, collection=None, **kw):
        """ Get item handles from the OAI-PMH interface """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        for item in self.getItemIdentifiers(collection=collection, **kw):
            # identifier looks like 'oai:<base>:<handle>'; keep the handle.
            yield item.identifier().split(':', 2)[2]

    def getItemIdentifiers(self, collection=None, **kw):
        """ Get item identifiers from the OAI-PMH interface """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        kw.setdefault('metadataPrefix', 'mets')
        if collection:
            kw['set'] = self._extractSet(collection)
        return self.oai.listIdentifiers(**kw)

    def getItems(self, collection=None, **kw):
        """ Get full items from the OAI-PMH interface """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        kw.setdefault('metadataPrefix', 'mets')
        if collection:
            kw['set'] = self._extractSet(collection)
        return self.oai.listRecords(**kw)

    def getItem(self, handle=None, identifier=None, **kwargs):
        """ Get a single item from the OAI-PMH interface either by handle
        or identifier """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        kwargs.setdefault('metadataPrefix', 'mets')
        if handle is None and identifier is None:
            raise ValueError('Either handle or identifier must be provided')
        if handle is not None:
            if identifier is not None:
                raise ValueError('Either a handle or identifier must be '
                                 'provided, not both')
            identifier = 'oai:%s:%s' % (
                self.identifier_base,
                handle,
            )
        return self.oai.getRecord(identifier=identifier, **kwargs)

    def getOAIItemIdentifier(self, handle):
        """ Build the OAI identifier for the given item handle """
        return 'oai:%s:%s' % (self._extractIdentifierBase(
            self.base_url), handle)

    def getSwordCollections(self):
        pass

    def getSwordCollection(self, args):
        pass
def get_record(target, identifier):
    """Retrieve one OAI record for *identifier* via *target* and convert it.

    Does nothing (returns None) when *target* is not configured.
    """
    if target is not None:
        prefix = target['metadata_prefix']
        oai_client = Client(target['url'], registry)
        fetched = oai_client.getRecord(identifier=identifier,
                                       metadataPrefix=prefix)
        return convert_record(fetched, prefix, target['title'])
class ZoraAPI:
    """Client for the ZORA OAI-PMH repository.

    Loads the available institutes and resource types on construction and
    offers helpers to fetch and parse paper records.
    """

    METADATA_PREFIX = 'oai_dc'

    # Subjects starting with three digits and whitespace are dewey decimal
    # classifications.  Compiled once (raw string) instead of per subject.
    DDC_REGEX = re.compile(r'^\d\d\d\s+\w')

    # In the constructor, we register to the ZORA API and initialize the
    # necessary class variables
    def __init__(self, url):
        registry = MetadataRegistry()
        registry.registerReader(ZoraAPI.METADATA_PREFIX, oai_dc_reader)
        self.client = Client(url, registry)
        self.institutes = {}
        self.resource_types = []
        self.load_institutes_and_types()

    # Returns the hierarchical dictionary of institutes
    def get_institutes(self):
        return self.institutes

    # Returns the list of resource types
    def get_resource_types(self):
        return self.resource_types

    def load_institutes_and_types(self):
        """Load all institutes and resource types from the OAI sets.

        The institutes also get parsed into a hierarchical dictionary.
        """
        institutes_list = []
        resource_type_list = []
        for item in self.client.listSets():
            # Set descriptions look like '<type> = <value>'.
            split = item[1].split(' = ')
            if len(split) != 2:
                continue
            set_type, set_value = split
            if set_type == 'Subjects':
                institutes_list.append(set_value)
            elif set_type == 'Type':
                resource_type_list.append(set_value)
        self.institutes = self.parse_institutes(institutes_list)
        self.resource_types = resource_type_list

    @staticmethod
    def parse_institutes(institute_list_raw):
        """Parse a list of ': '-separated institute paths into a
        hierarchical dictionary."""
        institutes_dict = {}
        for institute_raw in institute_list_raw:
            institutes = institute_raw.split(': ')
            parent = institutes_dict
            for institute in institutes:
                if parent.get(institute) is None:
                    parent[institute] = {}
                parent = parent[institute]
        return institutes_dict

    def get_metadata_dicts(self, from_):
        """Get all metadata dictionaries from ZORA."""
        record_list = self.get_records(from_)
        metadata_dict_list = self.parse_records(record_list)
        return metadata_dict_list

    def get_record(self, uid):
        """Get one specific paper from the ZORA repository and return its
        record."""
        record = self.client.getRecord(identifier=uid,
                                       metadataPrefix=ZoraAPI.METADATA_PREFIX)
        return record

    def get_records(self, from_):
        """Get the papers from the ZORA repository as a list of records.

        :param from_: optional lower bound timestamp, used to fetch only
            the most recent papers/changes
        :returns: a (possibly empty) list of records; errors are printed
            and swallowed (best effort)
        """
        args = {'metadataPrefix': ZoraAPI.METADATA_PREFIX}
        if from_:
            args['from_'] = from_

        record_list = []
        try:
            print('Loading records from ZORA API...')
            record_iterator = self.client.listRecords(**args)
            count = 0
            for record in record_iterator:
                record_list.append(record)
                count += 1
                if is_debug() and count % 1000 == 0:
                    print(str(count))
            print(count)
            print('Done')
        except NoRecordsMatchError:
            print('No records were found')
        except RemoteDisconnected as error:
            print(error)
        except Exception as error:
            print(error)
        # Return after the try/except instead of inside ``finally`` so that
        # SystemExit/KeyboardInterrupt are no longer silently swallowed.
        return record_list

    def parse_records(self, record_list):
        """Parse a list of records from ZORA into metadata dictionaries."""
        metadata_dict_list = []
        print('Parsing records...')
        for record in record_list:
            metadata_dict = self.parse_record(record)
            if metadata_dict:
                metadata_dict_list.append(metadata_dict)
        print('Done')
        return metadata_dict_list

    @staticmethod
    def _first_or_none(metadata_dict, key):
        """Return the first element of the list stored under *key*, or
        None when the field is missing or empty."""
        values = metadata_dict.get(key)
        return values[0] if values else None

    @staticmethod
    def parse_record(record):
        """Parse a record into a dictionary with a structure similar to the
        Paper database object.

        Turns some unnecessary lists into single values and parses the
        'subject' field into 'ddcs' (dewey decimal classifications),
        'keywords' and 'institutes'.

        NOTE: It is not possible to parse the 'subject' field properly since
        we lack the ability to distinguish between keywords and institutes
        (some institutes contain commas --> they will get recognized as
        lists of keywords).
        """
        metadata_dict = {}
        metadata_dict['uid'] = record[0].identifier()

        # If there is no metadata, we assume that the paper has been deleted
        # and store that information in the dict
        if not record[1]:
            metadata_dict['deleted'] = True
            return metadata_dict

        # If there is metadata available, we parse it into a convenient form
        metadata_dict = {**metadata_dict, **dict(record[1].getMap())}

        metadata_dict['title'] = ZoraAPI._first_or_none(metadata_dict,
                                                        'title')
        metadata_dict['creators'] = metadata_dict.pop(
            'creator') if 'creator' in metadata_dict else []

        # If the field 'subject' starts with three digits, it is a ddc
        # (dewey decimal classification). If it contains a comma-separated
        # list, it is a list of keywords. Otherwise it is an institute.
        #
        # NOTE: There are some dewey decimal classifications that contain
        # commas, therefore we check for the three digits before we look for
        # comma separated lists. Some institutes contain commas as well.
        # This leads to some institutes getting recognized as a list of
        # keywords. With the information available this problem
        # unfortunately cannot be solved properly.
        institute_list = []
        ddc_list = []
        keyword_list = []
        if 'subject' in metadata_dict:
            for item in metadata_dict['subject']:
                # Three digits and a space: a dewey decimal classification.
                if ZoraAPI.DDC_REGEX.match(item):
                    ddc_list.append(item)
                # Same name as a known institute: an institute.
                elif db.session.query(Institute).filter(
                        Institute.name == item).first():
                    institute_list.append(item)
                # Otherwise: assume a comma-separated list of keywords.
                else:
                    for keyword in item.split(','):
                        keyword_list.append(keyword)
        metadata_dict['institutes'] = institute_list
        metadata_dict['ddcs'] = ddc_list
        metadata_dict['keywords'] = keyword_list

        metadata_dict['description'] = ZoraAPI._first_or_none(
            metadata_dict, 'description')
        metadata_dict['publisher'] = ZoraAPI._first_or_none(
            metadata_dict, 'publisher')
        metadata_dict['date'] = ZoraAPI._first_or_none(metadata_dict, 'date')

        # We filter the 'type' field and only store the paper type
        type_list = metadata_dict.pop(
            'type') if 'type' in metadata_dict else []
        resource_type_list = []
        for resource_type in type_list:
            if db.session.query(ResourceType).filter(
                    ResourceType.name == resource_type).first():
                resource_type_list.append(resource_type)
        metadata_dict['resource_types'] = resource_type_list

        metadata_dict['language'] = ZoraAPI._first_or_none(
            metadata_dict, 'language')
        metadata_dict['relation'] = ZoraAPI._first_or_none(
            metadata_dict, 'relation')

        return metadata_dict
class OaiPaperSource(PaperSource):  # TODO: this should not inherit from PaperSource
    """
    A paper source that fetches records from the OAI-PMH proxy
    (typically: proaixy).

    It uses the ListRecord verb to fetch records from the OAI-PMH
    source. Each record is then converted to a :class:`BarePaper`
    by an :class:`OaiTranslator` that handles the format
    the metadata is served in.
    """

    def __init__(self, endpoint, day_granularity=False, *args, **kwargs):
        """
        This sets up the paper source.

        :param endpoint: the address of the OAI-PMH endpoint to fetch from.
        :param day_granularity: should we use day-granular timestamps
            to fetch from the proxy or full timestamps (default: False,
            full timestamps)

        See the protocol reference for more information on timestamp
        granularity:
        https://www.openarchives.org/OAI/openarchivesprotocol.html
        """
        super(OaiPaperSource, self).__init__(*args, **kwargs)
        self.registry = MetadataRegistry()
        self.registry.registerReader('oai_dc', oai_dc_reader)
        self.registry.registerReader('base_dc', base_dc_reader)
        self.registry.registerReader('citeproc', citeproc_reader)
        self.client = Client(endpoint, self.registry)
        self.client._day_granularity = day_granularity
        if settings.PROAIXY_API_KEY:
            self.client.extra_parameters = {
                'key': settings.PROAIXY_API_KEY}
        self.translators = {}

    # Translator management

    def add_translator(self, translator):
        """
        Adds the given translator to the paper source,
        so that we know how to translate papers in the given format.

        The paper source cannot hold more than one translator
        per OAI format (it decides what translator to use
        solely based on the format) so if there is already a
        translator for that format, it will be overriden.
        """
        self.translators[translator.format()] = translator

    # Record ingestion

    def ingest(self, from_date=None, metadataPrefix='any',
               resumptionToken=None):
        """
        Main method to fill Dissemin with papers!

        :param from_date: only fetch papers modified after that date in
            the proxy (useful for incremental fetching)
        :param metadataPrefix: restrict the ingest for this metadata format
        """
        args = {'metadataPrefix': metadataPrefix}
        if from_date:
            args['from_'] = from_date
        if resumptionToken:
            args['resumptionToken'] = resumptionToken
        records = self.client.listRecords(**args)
        self.process_records(records)

    def create_paper_by_identifier(self, identifier, metadataPrefix):
        """
        Queries the OAI-PMH proxy for a single paper.

        :param identifier: the OAI identifier to fetch
        :param metadataPrefix: the format to use (a translator
            has to be registered for that format, otherwise
            we return None with a warning message)
        :returns: a Paper or None
        """
        record = self.client.getRecord(
            metadataPrefix=metadataPrefix,
            identifier=identifier)
        return self.process_record(record[0], record[1]._map)

    # Record search utilities

    def listRecords_or_empty(self, source, *args, **kwargs):
        """
        pyoai raises :class:`NoRecordsMatchError` when no records match,
        we would rather like to get an empty list in that case.
        """
        try:
            return source.listRecords(*args, **kwargs)
        except NoRecordsMatchError:
            return []

    def process_record(self, header, metadata):
        """
        Saves the record given by the header and metadata (as returned by
        pyoai) into a Paper, or None if anything failed.
        """
        translator = self.translators.get(header.format())
        if translator is None:
            print("Warning: unknown metadata format %s, skipping"
                  % header.format())
            return

        paper = translator.translate(header, metadata)
        if paper is not None:
            try:
                with transaction.atomic():
                    saved = Paper.from_bare(paper)
                return saved
            except ValueError as e:
                # print() calls instead of Python-2 print statements, so
                # this block also runs (and parses) under Python 3.
                print("Ignoring invalid paper:")
                print(header.identifier())
                print(e)

    def process_records(self, listRecords):
        """
        Save as :class:`Paper` all the records contained in this list
        """
        # check that we have at least one translator, otherwise
        # it's not really worth trying…
        if not self.translators:
            raise ValueError("No OAI translators have been set up: " +
                             "We cannot save any record.")

        last_report = datetime.now()
        processed_since_report = 0

        for record in listRecords:
            header = record[0]
            metadata = record[1]._map
            self.process_record(header, metadata)

            # rate reporting
            processed_since_report += 1
            if processed_since_report >= 1000:
                td = datetime.now() - last_report
                rate = 'infty'
                if td.seconds:
                    # str() instead of py2-only unicode(); same digits.
                    rate = str(processed_since_report / td.seconds)
                print("current rate: %s records/s" % rate)
                processed_since_report = 0
                last_report = datetime.now()
# 'marc21'
sets = oai.listSets()
for s in sets:
    print(s)  # print() call instead of a Python-2 print statement

# 'MZK03'
recids = oai.listIdentifiers(metadataPrefix='marc21', set='MZK03')
# from_='2003-01-01T00:00:00Z', until=''

# for example: 'MZK03-907223' is in the list of maps
# or 356050 *not a map
# 238208 problematic
r = oai.getRecord(identifier='MZK03-1479', metadataPrefix='marc21')

# from lxml import etree
# print etree.tostring(r[1],pretty_print=True)
# xpath_evaluator = etree.XPathEvaluator(r[1][0], namespaces={'marc21':'http://www.loc.gov/MARC21/slim'})
# e = xpath_evaluator.evaluate
#s = etree.tostring(r[1][0],pretty_print=True)

rec = r[1]  # this returns parsed MARC record

# Processing of the MARC record:
# link is in rec['856']
class OaiPaperSource(PaperSource):  # TODO: this should not inherit from PaperSource
    """
    A paper source that fetches records from the OAI-PMH proxy
    (typically: proaixy).

    It uses the ListRecord verb to fetch records from the OAI-PMH
    source. Each record is then converted to a :class:`BarePaper`
    by an :class:`OaiTranslator` that handles the format
    the metadata is served in.
    """

    def __init__(self, oaisource, day_granularity=False, *args, **kwargs):
        """
        This sets up the paper source.

        :param oaisource: the OAISource to fetch from.
        :param day_granularity: should we use day-granular timestamps
            to fetch from the proxy or full timestamps (default: False,
            full timestamps)

        See the protocol reference for more information on timestamp
        granularity:
        https://www.openarchives.org/OAI/openarchivesprotocol.html
        """
        super(OaiPaperSource, self).__init__(*args, **kwargs)
        if not oaisource.endpoint:
            raise ValueError('No OAI endpoint was configured for this OAI source.')
        self.registry = MetadataRegistry()
        self.registry.registerReader('oai_dc', oai_dc_reader)
        self.registry.registerReader('base_dc', base_dc_reader)
        self.client = Client(oaisource.endpoint, self.registry)
        self.client._day_granularity = day_granularity
        self.translators = {
            'oai_dc': OAIDCTranslator(oaisource),
            'base_dc': BASEDCTranslator(oaisource),
        }

    # Translator management

    def add_translator(self, translator):
        """
        Adds the given translator to the paper source,
        so that we know how to translate papers in the given format.

        The paper source cannot hold more than one translator
        per OAI format (it decides what translator to use
        solely based on the format) so if there is already a
        translator for that format, it will be overriden.
        """
        self.translators[translator.format()] = translator

    # Record ingestion

    def ingest(self, from_date=None, metadataPrefix='oai_dc',
               resumptionToken=None):
        """
        Main method to fill Dissemin with papers!

        :param from_date: only fetch papers modified after that date in
            the proxy (useful for incremental fetching)
        :param metadataPrefix: restrict the ingest for this metadata format
        """
        args = {'metadataPrefix': metadataPrefix}
        if from_date:
            args['from_'] = from_date
        if resumptionToken:
            args['resumptionToken'] = resumptionToken
        records = self.client.listRecords(**args)
        self.process_records(records, metadataPrefix)

    def create_paper_by_identifier(self, identifier, metadataPrefix):
        """
        Queries the OAI-PMH proxy for a single paper.

        :param identifier: the OAI identifier to fetch
        :param metadataPrefix: the format to use (a translator
            has to be registered for that format, otherwise
            we return None with a warning message)
        :returns: a Paper or None
        """
        record = self.client.getRecord(
            metadataPrefix=metadataPrefix,
            identifier=identifier)
        return self.process_record(record[0], record[1]._map, metadataPrefix)

    # Record search utilities

    def listRecords_or_empty(self, source, *args, **kwargs):
        """
        pyoai raises :class:`NoRecordsMatchError` when no records match,
        we would rather like to get an empty list in that case.
        """
        try:
            return source.listRecords(*args, **kwargs)
        except NoRecordsMatchError:
            return []

    def process_record(self, header, metadata, format):
        """
        Saves the record given by the header and metadata (as returned by
        pyoai) into a Paper, or None if anything failed.
        """
        translator = self.translators.get(format)
        if translator is None:
            # Lazy %-args: the message is only formatted when it is emitted.
            logger.warning("Unknown metadata format %s, skipping",
                           header.format())
            return

        paper = translator.translate(header, metadata)
        if paper is not None:
            try:
                with transaction.atomic():
                    saved = Paper.from_bare(paper)
                return saved
            except ValueError:
                logger.exception("Ignoring invalid paper with header %s",
                                 header.identifier())

    def process_records(self, listRecords, format):
        """
        Save as :class:`Paper` all the records contained in this list
        """
        # check that we have at least one translator, otherwise
        # it's not really worth trying…
        if not self.translators:
            raise ValueError("No OAI translators have been set up: " +
                             "We cannot save any record.")

        last_report = datetime.now()
        processed_since_report = 0

        with ParallelGenerator(listRecords, max_lookahead=1000) as g:
            for record in g:
                header = record[0]
                metadata = record[1]._map
                self.process_record(header, metadata, format)

                # rate reporting
                processed_since_report += 1
                if processed_since_report >= 1000:
                    td = datetime.now() - last_report
                    rate = 'infty'
                    if td.seconds:
                        rate = str(processed_since_report / td.seconds)
                    logger.info("current rate: %s records/s", rate)
                    processed_since_report = 0
                    last_report = datetime.now()
registry = metadata.MetadataRegistry()
registry.registerReader('marc21', XMLReader())

#### OAI-PMH Client processing

from oaipmh.client import Client
from lxml import etree

oai = Client('http://snape.mzk.cz/OAI-script', registry)

#recs = oai.listRecords(metadataPrefix='marc21', set='MZK03')
#rec = recs.next()
#for rec in recs:
rec = oai.getRecord(identifier='MZK03-907223', metadataPrefix='marc21')
if rec:
    # print() calls instead of Python-2 print statements.
    print(rec[0].identifier())
    r = rec[1]  # Get XML tree for record
    print(etree.tostring(r, pretty_print=True))
    if r:
        xpath_evaluator = etree.XPathEvaluator(
            r, namespaces={'marc': 'http://www.loc.gov/MARC21/slim'})
        e = xpath_evaluator.evaluate
        print(e("//marc:datafield[@tag='856']"))
        print(e("//marc:datafield[@tag='034']"))
from oaipmh import metadata

registry = metadata.MetadataRegistry()
registry.registerReader('marc21', XMLReader())

#### OAI-PMH Client processing

from oaipmh.client import Client
from lxml import etree

oai = Client('http://snape.mzk.cz/OAI-script', registry)

#recs = oai.listRecords(metadataPrefix='marc21', set='MZK03')
#rec = recs.next()
#for rec in recs:
rec = oai.getRecord(identifier='MZK03-907223', metadataPrefix='marc21')
if rec:
    # print() calls instead of Python-2 print statements.
    print(rec[0].identifier())
    r = rec[1]  # Get XML tree for record
    print(etree.tostring(r, pretty_print=True))
    if r:
        xpath_evaluator = etree.XPathEvaluator(
            r, namespaces={'marc': 'http://www.loc.gov/MARC21/slim'})
        e = xpath_evaluator.evaluate
        print(e("//marc:datafield[@tag='856']"))
        print(e("//marc:datafield[@tag='034']"))