Example #1
0
def get_record(target, identifier):
    """Fetch a single OAI-PMH record from *target* and convert it.

    Returns the converted record, or None when *target* is None.
    """
    if target is None:
        return None
    prefix = target['metadata_prefix']
    oai_client = Client(target['url'], registry)
    raw = oai_client.getRecord(identifier=identifier, metadataPrefix=prefix)
    return convert_record(raw, prefix, target['title'])
Example #2
0
class Repository(object):
    """ Repository handles interaction with the various interfaces provided by
    the dspace repository (OAI-PMH and, optionally, SWORD). """

    def __init__(self, url=None, **kwargs):
        """Configure the repository endpoints.

        :param url: deprecated combined URL (use ``base_url`` and
            ``oai_path`` keyword arguments instead).
        :raises ValueError: if no base URL can be determined.
        """
        self.base_url = kwargs.pop('base_url', None)
        self.oai_path = kwargs.pop('oai_path', None)

        self.oai_enabled = bool(kwargs.pop('oai_enabled', True))
        self.sword_enabled = bool(kwargs.pop('sword_enabled', False))

        if url is not None:
            warn(
                'The url parameter will not be supported in version 3, '
                'use base_url and oai_path instead', DeprecationWarning)

            # Split the legacy combined url into base_url + oai_path.
            if (self.base_url and url.startswith(self.base_url)
                    and self.oai_path is None):
                self.oai_path = url.replace(self.base_url, '', 1).lstrip('/')
            elif not self.base_url:
                if self.oai_path is None:
                    self.oai_path = 'dspace-oai/request'
                if url.endswith(self.oai_path):
                    self.base_url = url[:-(len(self.oai_path) + 1)]

        if self.base_url is None:
            raise ValueError('base_url argument must be specified')

        if 'metadata_registry' not in kwargs:
            kwargs['metadata_registry'] = MetadataRegistry()
            kwargs['metadata_registry'].registerReader('mets',
                                                       dspace_mets_reader)

        if self.sword_enabled:
            skwargs = {'base_url': self.base_url}

            # Iterate over a snapshot of the keys: popping while iterating
            # the live view raises RuntimeError on Python 3.
            for key in list(kwargs):
                if key.startswith('sword_'):
                    skwargs[key[6:]] = kwargs.pop(key)

            self.sword = SwordService(**skwargs)

        if self.oai_enabled:
            self.oai = Client('/'.join((
                self.base_url,
                self.oai_path,
            )), **kwargs)

        self.identifier_base = self._extractIdentifierBase(self.base_url)

    def _extractIdentifierBase(self, url):
        """ From a given URL, extract the OAI identifier base (hostname) """
        return urlparse(url).hostname

    def _extractSet(self, handle):
        """ Determine the OAI set from a collection handle """
        # Was ``basestring`` (Python 2 only); ``str`` is the Python 3
        # equivalent. NOTE(review): on Python 2 this no longer accepts
        # ``unicode`` handles — confirm no caller passes them.
        if not isinstance(handle, str):
            raise ValueError('Collection handles must be strings')
        return 'hdl_' + handle.replace('/', '_').replace(':', '_')

    def getName(self):
        """ Get the configured name of the repository """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        return self.oai.identify().repositoryName()

    def getCollections(self):
        """ Get a list of the collections in the repository """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        # Materialized so callers get a real list on both Python 2 and 3
        # (map() is lazy on Python 3).
        return [c[0:2] for c in self.oai.listSets()]

    def getItemHandles(self, collection=None, **kw):
        """ Get item handles from the OAI-PMH interface """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        # Identifiers look like oai:<base>:<handle>; keep everything after
        # the second colon.
        for item in self.getItemIdentifiers(collection=collection, **kw):
            yield item.identifier().split(':', 2)[2]

    def getItemIdentifiers(self, collection=None, **kw):
        """ Get item identifiers from the OAI-PMH interface """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        kw.setdefault('metadataPrefix', 'mets')

        if collection:
            kw['set'] = self._extractSet(collection)

        return self.oai.listIdentifiers(**kw)

    def getItems(self, collection=None, **kw):
        """ Get full items from the OAI-PMH interface """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        kw.setdefault('metadataPrefix', 'mets')

        if collection:
            kw['set'] = self._extractSet(collection)

        return self.oai.listRecords(**kw)

    def getItem(self, handle=None, identifier=None, **kwargs):
        """ Get a single item from the OAI-PMH interface either by handle or
        identifier.

        :raises ValueError: if neither or both of ``handle`` and
            ``identifier`` are given.
        """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        kwargs.setdefault('metadataPrefix', 'mets')

        if handle is None and identifier is None:
            raise ValueError('Either handle or identifier must be provided')

        if handle is not None:
            if identifier is not None:
                raise ValueError('Either a handle or identifier must be '
                                 'provided, not both')

            identifier = 'oai:%s:%s' % (
                self.identifier_base,
                handle,
            )

        return self.oai.getRecord(identifier=identifier, **kwargs)

    def getOAIItemIdentifier(self, handle):
        """ Build the full OAI identifier for a handle """
        return 'oai:%s:%s' % (self._extractIdentifierBase(
            self.base_url), handle)

    def getSwordCollections(self):
        # Not implemented yet.
        pass

    def getSwordCollection(self, args):
        # Not implemented yet.
        pass
Example #3
0
def get_record(target, identifier):
    """Look up *identifier* on the target endpoint and convert the result.

    A None *target* yields None.
    """
    if target is None:
        return
    record = Client(target['url'], registry).getRecord(
        identifier=identifier, metadataPrefix=target['metadata_prefix'])
    return convert_record(record, target['metadata_prefix'], target['title'])
Example #4
0
class ZoraAPI:
    """Thin client for the ZORA OAI-PMH repository."""

    METADATA_PREFIX = 'oai_dc'

    def __init__(self, url):
        """Register with the ZORA API and preload institutes/resource types."""
        registry = MetadataRegistry()
        registry.registerReader(ZoraAPI.METADATA_PREFIX, oai_dc_reader)
        self.client = Client(url, registry)
        self.institutes = {}
        self.resource_types = []
        self.load_institutes_and_types()

    def get_institutes(self):
        """Return the hierarchical dictionary of institutes."""
        return self.institutes

    def get_resource_types(self):
        """Return the list of resource types."""
        return self.resource_types

    def load_institutes_and_types(self):
        """Load all institutes and resource types from the OAI set list.

        Sets named ``Subjects = X`` are institutes (parsed into a
        hierarchical dictionary), sets named ``Type = X`` are resource types.
        """
        institutes_list = []
        resource_type_list = []
        for item in self.client.listSets():
            split = item[1].split(' = ')
            if len(split) != 2:
                continue
            set_type, set_value = split
            if set_type == 'Subjects':
                institutes_list.append(set_value)
            elif set_type == 'Type':
                resource_type_list.append(set_value)
        self.institutes = self.parse_institutes(institutes_list)
        self.resource_types = resource_type_list

    @staticmethod
    def parse_institutes(institute_list_raw):
        """Parse ``'Parent: Child: ...'`` strings into a nested dictionary."""
        institutes_dict = {}
        for institute_raw in institute_list_raw:
            parent = institutes_dict
            for institute in institute_raw.split(': '):
                # setdefault walks/creates one level of the hierarchy.
                parent = parent.setdefault(institute, {})
        return institutes_dict

    def get_metadata_dicts(self, from_):
        """Fetch all records (optionally since *from_*) and parse them."""
        return self.parse_records(self.get_records(from_))

    def get_record(self, uid):
        """Fetch a single record from ZORA by its OAI identifier."""
        return self.client.getRecord(identifier=uid,
                                     metadataPrefix=ZoraAPI.METADATA_PREFIX)

    def get_records(self, from_):
        """Fetch records from ZORA and return them as a list.

        :param from_: optional lower bound, used to harvest only the most
            recent papers/changes.
        :returns: the records fetched so far (possibly partial if the
            connection failed mid-harvest).
        """
        args = {'metadataPrefix': ZoraAPI.METADATA_PREFIX}
        if from_:
            args['from_'] = from_

        record_list = []
        try:
            print('Loading records from ZORA API...')
            count = 0
            for record in self.client.listRecords(**args):
                record_list.append(record)
                count += 1
                if is_debug() and count % 1000 == 0:
                    print(str(count))
            print(count)
            print('Done')
        except NoRecordsMatchError:
            print('No records were found')
        except RemoteDisconnected as error:
            print(error)
        except Exception as error:
            # Best effort: report and fall through to return the partial list.
            print(error)
        # NOTE: the original returned from a ``finally`` block, which also
        # swallowed KeyboardInterrupt/SystemExit raised during iteration;
        # returning here keeps the partial-result behaviour without hiding
        # those.
        return record_list

    def parse_records(self, record_list):
        """Parse a list of ZORA records into metadata dictionaries."""
        metadata_dict_list = []
        print('Parsing records...')
        for record in record_list:
            metadata_dict = self.parse_record(record)
            if metadata_dict:
                metadata_dict_list.append(metadata_dict)
        print('Done')
        return metadata_dict_list

    @staticmethod
    def _first(metadata_dict, key):
        """Return the first element of the list under *key*, or None."""
        values = metadata_dict.get(key)
        return values[0] if values else None

    # This function parses a record into a dictionary with a similar
    # structure to the Paper database object. It turns some unnecessary
    # lists into single values and parses the 'subject' field into 'ddcs'
    # (dewey decimal classifications), 'keywords' and 'institutes'.
    #
    # NOTE: It is not possible to parse the 'subject' field perfectly since
    # we lack the ability to distinguish between keywords and institutes
    # (some institutes contain commas --> they get recognized as lists of
    # keywords).
    @staticmethod
    def parse_record(record):
        metadata_dict = {'uid': record[0].identifier()}

        # If there is no metadata, we assume that the paper has been deleted
        # and store that information in the dict.
        if not record[1]:
            metadata_dict['deleted'] = True
            return metadata_dict

        # Merge the record's metadata map into the dict.
        metadata_dict = {**metadata_dict, **dict(record[1].getMap())}

        metadata_dict['title'] = ZoraAPI._first(metadata_dict, 'title')
        metadata_dict['creators'] = metadata_dict.pop('creator', [])

        # If the field 'subject' starts with three digits, it is a ddc (dewey
        # decimal classification). If it matches a known institute name, it
        # is an institute. Otherwise it is treated as a comma-separated list
        # of keywords.
        #
        # NOTE: Some institutes contain commas, so they can get recognized
        # as lists of keywords; with the information available this cannot
        # be solved properly.
        institute_list = []
        ddc_list = []
        keyword_list = []
        # Raw string ('\d' in a plain literal is an invalid escape on
        # modern Python); compiled once instead of per loop iteration.
        ddc_regex = re.compile(r'^\d\d\d\s+\w')
        if 'subject' in metadata_dict:
            for item in metadata_dict['subject']:
                if ddc_regex.match(item):
                    ddc_list.append(item)
                elif db.session.query(Institute).filter(
                        Institute.name == item).first():
                    institute_list.append(item)
                else:
                    keyword_list.extend(item.split(','))

        metadata_dict['institutes'] = institute_list
        metadata_dict['ddcs'] = ddc_list
        metadata_dict['keywords'] = keyword_list
        metadata_dict['description'] = ZoraAPI._first(metadata_dict,
                                                      'description')
        metadata_dict['publisher'] = ZoraAPI._first(metadata_dict,
                                                    'publisher')
        metadata_dict['date'] = ZoraAPI._first(metadata_dict, 'date')

        # We filter the 'type' field and only keep known resource types.
        type_list = metadata_dict.pop('type', [])
        metadata_dict['resource_types'] = [
            resource_type for resource_type in type_list
            if db.session.query(ResourceType).filter(
                ResourceType.name == resource_type).first()
        ]
        metadata_dict['language'] = ZoraAPI._first(metadata_dict, 'language')
        metadata_dict['relation'] = ZoraAPI._first(metadata_dict, 'relation')

        return metadata_dict
Example #5
0
class OaiPaperSource(PaperSource):  # TODO: this should not inherit from PaperSource
    """
    A paper source that fetches records from the OAI-PMH proxy
    (typically: proaixy).

    It uses the ListRecord verb to fetch records from the OAI-PMH
    source. Each record is then converted to a :class:`BarePaper`
    by an :class:`OaiTranslator` that handles the format
    the metadata is served in.
    """

    def __init__(self, endpoint, day_granularity=False, *args, **kwargs):
        """
        This sets up the paper source.

        :param endpoint: the address of the OAI-PMH endpoint
            to fetch from.
        :param day_granularity: should we use day-granular timestamps
            to fetch from the proxy or full timestamps (default: False,
            full timestamps)

        See the protocol reference for more information on timestamp
        granularity:
        https://www.openarchives.org/OAI/openarchivesprotocol.html
        """
        super(OaiPaperSource, self).__init__(*args, **kwargs)
        self.registry = MetadataRegistry()
        self.registry.registerReader('oai_dc', oai_dc_reader)
        self.registry.registerReader('base_dc', base_dc_reader)
        self.registry.registerReader('citeproc', citeproc_reader)
        self.client = Client(endpoint, self.registry)
        self.client._day_granularity = day_granularity
        if settings.PROAIXY_API_KEY:
            self.client.extra_parameters = {
                'key': settings.PROAIXY_API_KEY}
        self.translators = {}

    # Translator management

    def add_translator(self, translator):
        """
        Adds the given translator to the paper source,
        so that we know how to translate papers in the given format.

        The paper source cannot hold more than one translator
        per OAI format (it decides what translator to use
        solely based on the format) so if there is already a translator
        for that format, it will be overriden.
        """
        self.translators[translator.format()] = translator

    # Record ingestion

    def ingest(self, from_date=None, metadataPrefix='any',
               resumptionToken=None):
        """
        Main method to fill Dissemin with papers!

        :param from_date: only fetch papers modified after that date in
                          the proxy (useful for incremental fetching)
        :param metadataPrefix: restrict the ingest for this metadata
                          format
        """
        # NOTE: this body was indented with tabs in a space-indented file
        # (a syntax error on Python 3); normalized to spaces.
        args = {'metadataPrefix': metadataPrefix}
        if from_date:
            args['from_'] = from_date
        if resumptionToken:
            args['resumptionToken'] = resumptionToken
        records = self.client.listRecords(**args)
        self.process_records(records)

    def create_paper_by_identifier(self, identifier, metadataPrefix):
        """
        Queries the OAI-PMH proxy for a single paper.

        :param identifier: the OAI identifier to fetch
        :param metadataPrefix: the format to use (a translator
                    has to be registered for that format, otherwise
                    we return None with a warning message)
        :returns: a Paper or None
        """
        record = self.client.getRecord(
                    metadataPrefix=metadataPrefix,
                    identifier=identifier)
        return self.process_record(record[0], record[1]._map)

    # Record search utilities

    def listRecords_or_empty(self, source, *args, **kwargs):
        """
        pyoai raises :class:`NoRecordsMatchError` when no records match,
        we would rather like to get an empty list in that case.
        """
        try:
            return source.listRecords(*args, **kwargs)
        except NoRecordsMatchError:
            return []

    def process_record(self, header, metadata):
        """
        Saves the record given by the header and metadata (as returned by
        pyoai) into a Paper, or None if anything failed.
        """
        translator = self.translators.get(header.format())
        if translator is None:
            print("Warning: unknown metadata format %s, skipping" %
                  header.format())
            return

        paper = translator.translate(header, metadata)
        if paper is not None:
            try:
                with transaction.atomic():
                    saved = Paper.from_bare(paper)
                return saved
            except ValueError as e:
                # Python 2 print statements converted to function calls so
                # the module parses on both Python 2 and 3.
                print("Ignoring invalid paper:")
                print(header.identifier())
                print(e)

    def process_records(self, listRecords):
        """
        Save as :class:`Paper` all the records contained in this list
        """
        # check that we have at least one translator, otherwise
        # it's not really worth trying…
        if not self.translators:
            raise ValueError("No OAI translators have been set up: " +
                             "We cannot save any record.")

        last_report = datetime.now()
        processed_since_report = 0

        for record in listRecords:
            header = record[0]
            metadata = record[1]._map

            self.process_record(header, metadata)

            # rate reporting
            processed_since_report += 1
            if processed_since_report >= 1000:
                td = datetime.now() - last_report
                rate = 'infty'
                if td.seconds:
                    # str() instead of Python-2-only unicode(); matches the
                    # later revision of this class elsewhere in the file.
                    rate = str(processed_since_report / td.seconds)
                print("current rate: %s records/s" % rate)
                processed_since_report = 0
                last_report = datetime.now()
Example #6
0
# NOTE(review): script fragment — assumes ``oai`` is an oaipmh Client,
# configured elsewhere with a 'marc21' metadata reader. Python 2 syntax.

# 'marc21'

# Enumerate the OAI sets exposed by the repository.
sets = oai.listSets()
for s in sets:
	print s

# 'MZK03'

# List record identifiers for the 'MZK03' set in marc21 format.
recids = oai.listIdentifiers(metadataPrefix='marc21', set='MZK03') # from_='2003-01-01T00:00:00Z', until=''

# for example: 'MZK03-907223' is in the list of maps
# or 356050 *not a map

# 238208 problematic
# Fetch a single record; the metadata lives at r[1] (see below).
r = oai.getRecord(identifier='MZK03-1479', metadataPrefix='marc21')

# from lxml import etree
# print etree.tostring(r[1],pretty_print=True)

# xpath_evaluator = etree.XPathEvaluator(r[1][0], namespaces={'marc21':'http://www.loc.gov/MARC21/slim'})
# e = xpath_evaluator.evaluate

#s = etree.tostring(r[1][0],pretty_print=True)

rec = r[1] # this returns parsed MARC record

# Processing of the MARC record:

# link is in
# presumably a mapping keyed by MARC field tag — confirm against the
# registered reader.
rec['856']
Example #7
0
class OaiPaperSource(PaperSource):  # TODO: this should not inherit from PaperSource
    """
    Fetches records from an OAI-PMH proxy (typically: proaixy) with the
    ListRecord verb and converts each one to a :class:`BarePaper` using
    the :class:`OaiTranslator` registered for its metadata format.
    """

    def __init__(self, oaisource, day_granularity=False, *args, **kwargs):
        """
        Set up the paper source.

        :param oaisource: the OAISource to fetch from.
        :param day_granularity: use day-granular timestamps when fetching
            from the proxy instead of full timestamps (default: False).

        See https://www.openarchives.org/OAI/openarchivesprotocol.html
        for details on timestamp granularity.
        """
        super(OaiPaperSource, self).__init__(*args, **kwargs)
        if not oaisource.endpoint:
            raise ValueError('No OAI endpoint was configured for this OAI source.')

        self.registry = MetadataRegistry()
        for prefix, reader in (('oai_dc', oai_dc_reader),
                               ('base_dc', base_dc_reader)):
            self.registry.registerReader(prefix, reader)
        self.client = Client(oaisource.endpoint, self.registry)
        self.client._day_granularity = day_granularity
        self.translators = {
            'oai_dc': OAIDCTranslator(oaisource),
            'base_dc': BASEDCTranslator(oaisource),
        }

    # Translator management

    def add_translator(self, translator):
        """
        Register *translator* under its OAI format. Only one translator is
        kept per format (the format alone decides which translator is
        used), so any previously registered one is replaced.
        """
        self.translators[translator.format()] = translator

    # Record ingestion

    def ingest(self, from_date=None, metadataPrefix='oai_dc',
               resumptionToken=None):
        """
        Main entry point: harvest records and save them as papers.

        :param from_date: only fetch papers modified after that date in
            the proxy (useful for incremental fetching)
        :param metadataPrefix: restrict the ingest to this metadata format
        """
        request = {'metadataPrefix': metadataPrefix}
        if from_date:
            request['from_'] = from_date
        if resumptionToken:
            request['resumptionToken'] = resumptionToken
        self.process_records(self.client.listRecords(**request),
                             metadataPrefix)

    def create_paper_by_identifier(self, identifier, metadataPrefix):
        """
        Query the OAI-PMH proxy for a single paper.

        :param identifier: the OAI identifier to fetch
        :param metadataPrefix: the format to use (a translator must be
            registered for that format, otherwise None is returned with
            a warning message)
        :returns: a Paper or None
        """
        record = self.client.getRecord(
                    metadataPrefix=metadataPrefix,
                    identifier=identifier)
        return self.process_record(record[0], record[1]._map, metadataPrefix)

    # Record search utilities

    def listRecords_or_empty(self, source, *args, **kwargs):
        """
        Like ``source.listRecords`` but returns an empty list instead of
        letting pyoai's :class:`NoRecordsMatchError` propagate.
        """
        try:
            return source.listRecords(*args, **kwargs)
        except NoRecordsMatchError:
            return []

    def process_record(self, header, metadata, format):
        """
        Turn one pyoai record (header + metadata) into a saved Paper.
        Returns the saved Paper, or None when translation or validation
        fails.
        """
        translator = self.translators.get(format)
        if translator is None:
            logger.warning("Unknown metadata format %s, skipping" % header.format())
            return None

        paper = translator.translate(header, metadata)
        if paper is None:
            return None
        try:
            with transaction.atomic():
                return Paper.from_bare(paper)
        except ValueError:
            logger.exception("Ignoring invalid paper with header %s" % header.identifier())

    def process_records(self, listRecords, format):
        """
        Save every record in *listRecords* as a :class:`Paper`.
        """
        # Without at least one translator nothing can be saved, so bail
        # out early.
        if not self.translators:
            raise ValueError("No OAI translators have been set up: "
                             "We cannot save any record.")

        last_report = datetime.now()
        seen_since_report = 0

        with ParallelGenerator(listRecords, max_lookahead=1000) as g:
            for record in g:
                self.process_record(record[0], record[1]._map, format)

                # Log the processing rate every 1000 records.
                seen_since_report += 1
                if seen_since_report >= 1000:
                    td = datetime.now() - last_report
                    rate = str(seen_since_report / td.seconds) if td.seconds else 'infty'
                    logger.info("current rate: %s records/s" % rate)
                    seen_since_report = 0
                    last_report = datetime.now()
Example #8
0
# NOTE(review): assumes ``metadata`` (presumably oaipmh.metadata) and
# ``XMLReader`` are imported/defined earlier in the original script.
# Python 2 print syntax.
registry = metadata.MetadataRegistry()
registry.registerReader('marc21', XMLReader())

#### OAI-PMH Client processing

from oaipmh.client import Client
from lxml import etree

# OAI-PMH endpoint of the MZK repository.
oai = Client('http://snape.mzk.cz/OAI-script', registry)

#recs = oai.listRecords(metadataPrefix='marc21', set='MZK03')

#rec = recs.next()
#for rec in recs:

# Fetch one record: rec[0] is the OAI header, rec[1] the parsed XML tree.
rec = oai.getRecord(identifier='MZK03-907223', metadataPrefix='marc21')

if rec:
    print rec[0].identifier()
    r = rec[1]  # Get XML tree for record
    print etree.tostring(r, pretty_print=True)

    if r:
        # XPath evaluation bound to the MARCXML namespace.
        xpath_evaluator = etree.XPathEvaluator(
            r, namespaces={'marc': 'http://www.loc.gov/MARC21/slim'})
        e = xpath_evaluator.evaluate

        # Dump MARC datafields 856 (the record link — see the companion
        # snippet) and 034.
        print e("//marc:datafield[@tag='856']")
        print e("//marc:datafield[@tag='034']")
Example #9
0
from oaipmh import metadata

# Registry that parses marc21 payloads with the (externally provided)
# XMLReader.
registry = metadata.MetadataRegistry()
registry.registerReader('marc21', XMLReader())

#### OAI-PMH Client processing

from oaipmh.client import Client
from lxml import etree

oai = Client('http://snape.mzk.cz/OAI-script', registry)

#recs = oai.listRecords(metadataPrefix='marc21', set='MZK03')

#rec = recs.next()
#for rec in recs:

rec = oai.getRecord(identifier='MZK03-907223', metadataPrefix='marc21')

if rec:
    print(rec[0].identifier())
    # rec[1] holds the parsed XML tree for the record.
    r = rec[1]
    print(etree.tostring(r, pretty_print=True))

    if r:
        xpath_evaluator = etree.XPathEvaluator(
            r, namespaces={'marc': 'http://www.loc.gov/MARC21/slim'})
        e = xpath_evaluator.evaluate

        print(e("//marc:datafield[@tag='856']"))
        print(e("//marc:datafield[@tag='034']"))