예제 #1
0
    def scrape(self):
        """Harvest OAI-PMH records from the configured endpoint and pickle
        them into the work directory's data file.

        NOTE(review): the ``raise`` on the first line makes the entire body
        unreachable -- presumably left in deliberately while the feature is
        unfinished. Remove it to activate the method.

        Python 2 code (print statements). Relies on names not visible here:
        ``MetadataRegistry``, ``oai_dc_reader``, ``Client``, ``datetime``,
        ``os``, ``pickle`` and the host class's ``setting()``/``work_dir()``.
        """
        raise Exception("not finished")
        registry = MetadataRegistry()
        registry.registerReader('oai_dc', oai_dc_reader)
        url = self.setting('pmh-endpoint')
        client = Client(url, registry)

        # Show which sets the repository offers before harvesting.
        print "  OAI Repository", url
        print "  Available sets:"
        for s in client.listSets():
            print "   ", s

        # Optional harvesting restrictions, all read from settings.
        oai_set = self.setting('set')
        oai_from = self.setting('from')
        oai_until = self.setting('until')

        kwargs = {}

        if oai_set:
            kwargs['set'] = oai_set

        # 'from'/'until' are dash-separated date strings (e.g. 2020-01-31);
        # split into ints and build datetime boundaries for listRecords.
        if oai_from is not None:
            date_args = [int(arg) for arg in oai_from.split("-")]
            kwargs['from_'] = datetime.datetime(*date_args)

        if oai_until is not None:
            date_args = [int(arg) for arg in oai_until.split("-")]
            kwargs['until'] = datetime.datetime(*date_args)

        # Materialize the listRecords generator so it can be pickled.
        records = [r for r in client.listRecords(metadataPrefix='oai_dc', **kwargs)]

        # Persist the harvested records for later pipeline stages.
        data_filepath = os.path.join(self.work_dir(), self.setting('data-file'))
        with open(data_filepath, 'wb') as f:
            print "  picking", len(records), "records"
            pickle.dump(records, f)
예제 #2
0
def list_sets(target):
    """List the OAI sets exposed by *target* and convert each setspec.

    target -- mapping with a 'url' key pointing at the OAI endpoint,
              or None (in which case None is returned, as before).
    Returns a list of converted setspecs ([] when the endpoint reports
    none), or None for a missing target.

    Uses module-level ``Client``, ``registry`` and ``convert_setspec``.
    """
    # Guard clause keeps the original implicit-None contract explicit.
    if target is None:
        return None

    client = Client(target['url'], registry)
    setspecs = client.listSets()
    if setspecs is None:
        return []
    return [convert_setspec(setspec) for setspec in setspecs]
예제 #3
0
def list_sets(target):
    """Fetch and convert the OAI setspecs from the endpoint described by
    *target* (a mapping with a 'url' key).  Returns None when *target* is
    None, otherwise a list of converted setspecs (possibly empty)."""
    results = None
    if target is not None:
        oai_client = Client(target['url'], registry)
        specs = oai_client.listSets()
        # A None response means "no sets": report an empty list.
        results = ([convert_setspec(spec) for spec in specs]
                   if specs is not None else [])
    return results
예제 #4
0
    def list_oai_community_sets(self, repository):
        """Populate ``self.communities`` with [set_id, set_name] pairs for
        every OAI set whose identifier starts with 'com' (DSpace community
        sets), sorted by human-readable name.

        repository -- object exposing ``base_url`` of the OAI endpoint.

        On any failure while contacting the repository the method returns
        early and leaves ``self.communities`` untouched (best-effort, as in
        the original -- but now only ``Exception`` is caught so that
        SystemExit/KeyboardInterrupt propagate).
        """
        try:
            registry = MetadataRegistry()
            registry.registerReader('oai_dc', oai_dc_reader)
            client = Client(repository.base_url, registry)
            sets = client.listSets()
        except Exception:
            # Network/protocol failure: keep previous state, report nothing.
            return

        # Filter the set list down to community sets, building
        # [id, human readable name] pairs.
        self.communities = []
        for entry in sets:
            set_id, set_name = entry[0], entry[1]
            if set_id.startswith('com'):
                self.communities.append([set_id, set_name])

        self.communities = sorted(
            self.communities, key=lambda community: community[1])
예제 #5
0
def harvest(metadata_set, dest_folder, log_file, content_type,
            from_date, until_date):
    """Harvest one OAI-PMH set from the EFG endpoint into XML files.

    Every record of ``metadata_set`` is downloaded, filtered (only records
    carrying the 'IMediaCities' keyword survive; when ``content_type`` is
    given, only items of that type), and written to ``dest_folder`` as
    ``<set>_<sourceID>_<timestamp>.xml``.  A JSON report of the run is
    written to ``log_file`` at the end.

    ``from_date`` / ``until_date`` optionally bound the harvesting window;
    both are converted through the module-level ``parse_date`` helper.

    NOTE(review): relies on module-level names not visible in this chunk:
    ``log``, ``parse_date``, ``tag`` (presumably qualifies a tag name with
    the EFG XML namespace -- confirm), ``Client``, ``MetadataRegistry``,
    ``oai_dc_reader``, ``etree``, ``NoRecordsMatchError``.
    """

    #############################
    # ### FILESYSTEM CHECKS ### #
    #############################
    try:
        if not os.path.isdir(dest_folder):
            os.makedirs(dest_folder)
        # Verify write permission inside the folder:
    except BaseException as e:
        log.error(str(e))
        log.exit("Unable to create destination folder: %s" % dest_folder)

    # Probe writability by creating and removing a scratch subfolder.
    try:
        test_path = os.path.join(dest_folder, '__test_permissions__')
        os.makedirs(test_path)
        os.rmdir(test_path)
    except BaseException as e:
        log.error(str(e))
        log.exit("Unable to use destination folder: %s" % dest_folder)

    # Probe that the log file is creatable/appendable before doing any work.
    try:
        log_handle = open(log_file, 'a+')
        log_handle.close()
    except BaseException as e:
        log.error(str(e))
        log.exit("Unable to create log_file: %s" % log_file)

    #################################
    # ### OAI-PMH CONFIGURATION ### #
    #################################
    URL = 'https://node0-d-efg.d4science.org/efg/mvc/oai/oai.do'
    metadata_prefix = 'efg'

    ###################################
    # ### OPEN OAI-PMH CONNECTION ### #
    ###################################
    registry = MetadataRegistry()
    registry.registerReader(metadata_prefix, oai_dc_reader)

    #print ("URL=" + str(URL))

    client = Client(URL, registry)

    ####################################
    # ### CHECK IF THIS SET EXISTS ### #
    ####################################
    set_found = False
    for s in client.listSets():
        if metadata_set == s[0]:
            set_found = True

    if not set_found:
        log.exit("Unable to find this set: %s" % metadata_set)

    #############################
    # ### RETRIEVE METADATA ### #
    #############################

    # Convert the optional window boundaries; abort on unparsable dates.
    if from_date is not None:
        from_date = parse_date(from_date)
        if from_date is None:
            log.exit("Unable to convert from date")

    if until_date is not None:
        until_date = parse_date(until_date)
        if until_date is None:
            log.exit("Unable to convert until date")

    # Run counters and per-category record lists for the final JSON report.
    report_data = {
        'downloaded': 0,
        'filtered': 0,
        'saved': 0,
        'saved_files': [],
        'missing_sourceid': [],
        'wrong_content_type': []
    }
    # Millisecond timestamp shared by every file written in this run.
    timestamp = int(1000 * time.time())
    log.info("Retrieving records for %s..." % metadata_set)
    try:
        records = client.listRecords(
            metadataPrefix=metadata_prefix,
            set=metadata_set,
            from_=from_date,
            until=until_date)
    except NoRecordsMatchError as e:
        log.exit(e)

    log.info("Records retrieved, extracting...")
    try:

        for record in records:
            element = record[1].element()
            # Obtained eTree is based on namespaced XML
            # Read: 19.7.1.6. Parsing XML with Namespaces
            # https://docs.python.org/2/library/xml.etree.elementtree.html

            # find(match)
            # Finds the first subelement matching match.
            #   match may be a tag name or path.
            #   Returns an element instance or None.

            # findall(match)
            # Finds all matching subelements, by tag name or path.
            #   Returns a list containing all matching elements
            #   in document order.

            report_data['downloaded'] += 1

            # Progress indicator: a dot every 100 records, a summary line
            # every 5000.
            if report_data['downloaded'] % 100 == 0:
                print('.', end='', flush=True)

                if report_data['downloaded'] % 5000 == 0:
                    print(
                        ' %s downloaded - %s saved' % (
                            report_data['downloaded'],
                            report_data['saved']
                        ), flush=True)

            efgEntity = element.find(tag("efgEntity"))
            if efgEntity is None:
                # log.warning("efgEntity not found, skipping record")
                continue
            # A record is either audiovisual (avcreation) or not
            # (nonavcreation); the two branches mirror each other but use
            # differently named child elements.
            avcreation = efgEntity.find(tag("avcreation"))
            nonavcreation = efgEntity.find(tag("nonavcreation"))

            if avcreation is not None:
                manifestation = avcreation.find(tag("avManifestation"))
                recordSource = avcreation.find(tag("recordSource"))
                keywords = avcreation.findall(tag("keywords"))
                title_el = avcreation.find(tag("identifyingTitle"))
                title = (title_el.text
                         if title_el is not None
                         else "Unknown title")
            elif nonavcreation is not None:
                manifestation = nonavcreation.find(tag("nonAVManifestation"))
                recordSource = nonavcreation.find(tag("recordSource"))
                keywords = nonavcreation.findall(tag("keywords"))
                title_el = nonavcreation.find(tag("title"))
                title = (title_el.find(tag("text")).text
                         if title_el is not None
                         else "Unknown title")
            else:
                # NOTE(review): dead assignment -- ``title`` is never used
                # after this ``continue``.
                title = "Unknown title"
                # log.warning("(non)avcreation not found, skipping record")
                continue

            # Keep only records tagged with the IMediaCities keyword.
            filter_keyword = "IMediaCities"
            is_good = False
            for keyword in keywords:
                term = keyword.find(tag("term"))
                if term.text == filter_keyword:
                    is_good = True
                    break

            if not is_good:
                continue

            report_data['filtered'] += 1

            if manifestation is None:
                report_data['missing_sourceid'].append(title)
                # log.warning("avManifestation not found, skipping record")
                continue

            # Optional item-type filter (case-insensitive match against
            # <item><type> under the manifestation).
            if content_type is not None:
                content_type = content_type.lower()

                item = manifestation.find(tag("item"))
                if item is None:
                    # missing <item> => type cannot be found
                    report_data['wrong_content_type'].append(title)
                    continue

                item_type = item.find(tag("type"))
                if item_type is None:
                    # missing <type>
                    report_data['wrong_content_type'].append(title)
                    continue

                if item_type.text.lower() != content_type:
                    # wrong type
                    report_data['wrong_content_type'].append(title)
                    continue



            # WARNING: the sourceID must be taken from the recordSource that
            #          sits under avcreation/nonavcreation and NOT from the
            #          one under avManifestation/nonAVManifestation

            #recordSource = manifestation.find(tag("recordSource"))
            if recordSource is None:
                report_data['missing_sourceid'].append(title)
                # log.warning("recordSource not found, skipping record")
                continue

            sourceID = recordSource.find(tag("sourceID"))
            if sourceID is None:
                report_data['missing_sourceid'].append(title)
                # log.warning("sourceID not found, skipping record")
                continue

            content = etree.tostring(efgEntity, pretty_print=False)

            # id_text = urllib.parse.quote_plus(sourceID.text.strip())
            # replace non alpha-numeric characters with a dash
            id_text = re.sub(r'[\W_]+', '-', sourceID.text.strip())
            # end of cinzia's changes

            filename = "%s_%s_%s.xml" % (
                metadata_set,
                id_text,
                timestamp
            )
            filepath = os.path.join(dest_folder, filename)
            # with open(filepath, 'wb') as f:
            with codecs.open(filepath, 'wb', "utf-8") as f:
                f.write(content.decode('utf-8'))
            # OLD
            #with codecs.open(filepath, 'wb', "utf-8") as f:
            #    f.write(html.unescape(content.decode('utf-8')))

            report_data['saved'] += 1
            report_data['saved_files'].append(filename)

    except NoRecordsMatchError as e:
        log.warning("No more records after filtering?")
        log.warning(e)

        # ###################
        # Write report file
        # ###################

        # the procedure writes a report file containing the results
        #     of the harvesting:
        # the list of records that do not contain the record ID
        #     (by writing the content of the element title)

    # The permission-probe log file is overwritten ('w+') with the JSON
    # report of this run.
    with open(log_file, 'w+') as f:
        json.dump(report_data, f)

    # NOTE(review): redundant -- the ``with`` block above already closed the
    # file; close() on a closed file object is a no-op.
    f.close()

    # Just to close previous dot line
    print("")

    log.info("""

%s records from set [%s] downloaded
open log file [%s] for details
""" % (report_data['saved'], metadata_set, log_file)
    )
예제 #6
0
class Repository(object):
    """Handles interaction with the various interfaces provided by a DSpace
    repository (OAI-PMH and, optionally, SWORD).

    Keyword arguments consumed here:
      base_url          -- root URL of the DSpace instance (required unless
                           derivable from the deprecated ``url``)
      oai_path          -- OAI-PMH endpoint path relative to ``base_url``
      oai_enabled       -- enable the OAI-PMH client (default True)
      sword_enabled     -- enable the SWORD service (default False)
      sword_*           -- forwarded (prefix stripped) to SwordService
      metadata_registry -- oaipmh MetadataRegistry; one with a 'mets'
                           reader is created when omitted
    All remaining keyword arguments are forwarded to the OAI-PMH Client.
    """

    def __init__(self, url=None, **kwargs):
        self.base_url = kwargs.pop('base_url', None)
        self.oai_path = kwargs.pop('oai_path', None)

        self.oai_enabled = bool(kwargs.pop('oai_enabled', True))
        self.sword_enabled = bool(kwargs.pop('sword_enabled', False))

        if url is not None:
            warn(
                'The url paramater will not be supported in version 3, '
                'use base_url and oai_path instead', DeprecationWarning)

            # Derive whichever of base_url/oai_path is missing from the
            # legacy combined url.
            if (self.base_url and url.startswith(self.base_url)
                    and self.oai_path is None):
                self.oai_path = url.replace(self.base_url, '', 1).lstrip('/')
            elif not self.base_url:
                if self.oai_path is None:
                    self.oai_path = 'dspace-oai/request'
                if url.endswith(self.oai_path):
                    self.base_url = url[:-(len(self.oai_path) + 1)]

        if self.base_url is None:
            raise ValueError('base_url argument must be specified')

        if 'metadata_registry' not in kwargs:
            kwargs['metadata_registry'] = MetadataRegistry()
            kwargs['metadata_registry'].registerReader('mets',
                                                       dspace_mets_reader)

        if self.sword_enabled:
            skwargs = {'base_url': self.base_url}

            # Materialize the key list first: popping while iterating a
            # live .keys() view raises RuntimeError on Python 3.
            for key in list(kwargs.keys()):
                if key.startswith('sword_'):
                    skwargs[key[6:]] = kwargs.pop(key)

            self.sword = SwordService(**skwargs)

        if self.oai_enabled:
            self.oai = Client('/'.join((
                self.base_url,
                self.oai_path,
            )), **kwargs)

        self.identifier_base = self._extractIdentifierBase(self.base_url)

    def _extractIdentifierBase(self, url):
        """ From a given URL, extract the OAI identifier base (hostname) """
        return urlparse(url).hostname

    def _extractSet(self, handle):
        """ Determine the OAI set from a collection handle """
        try:
            string_types = basestring  # Python 2
        except NameError:
            string_types = str  # Python 3: basestring no longer exists
        if not isinstance(handle, string_types):
            raise ValueError('Collection handles must be strings')
        return 'hdl_' + handle.replace('/', '_').replace(':', '_')

    def getName(self):
        """ Get the configured name of the repository """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        return self.oai.identify().repositoryName()

    def getCollections(self):
        """ Get a list of the collections in the repository """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        # List comprehension (not map) so Python 3 also returns a list,
        # matching the Python 2 behaviour.
        return [c[0:2] for c in self.oai.listSets()]

    def getItemHandles(self, collection=None, **kw):
        """ Get item handles from the OAI-PMH interface """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        # Identifiers look like 'oai:<host>:<handle>'; yield the handle part.
        for item in self.getItemIdentifiers(collection=collection, **kw):
            yield item.identifier().split(':', 2)[2]

    def getItemIdentifiers(self, collection=None, **kw):
        """ Get item identifiers from the OAI-PMH interface """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        kw.setdefault('metadataPrefix', 'mets')

        if collection:
            kw['set'] = self._extractSet(collection)

        return self.oai.listIdentifiers(**kw)

    def getItems(self, collection=None, **kw):
        """ Get full items from the OAI-PMH interface """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        kw.setdefault('metadataPrefix', 'mets')

        if collection:
            kw['set'] = self._extractSet(collection)

        return self.oai.listRecords(**kw)

    def getItem(self, handle=None, identifier=None, **kwargs):
        """ Get a single item from the OAI-PMH interface either by handle or 
        identifier """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        kwargs.setdefault('metadataPrefix', 'mets')

        if handle is None and identifier is None:
            raise ValueError('Either handle or identifier must be provided')

        if handle is not None:
            if identifier is not None:
                raise ValueError('Either a handle or identifier must be '
                                 'provided, not both')

            identifier = 'oai:%s:%s' % (
                self.identifier_base,
                handle,
            )

        return self.oai.getRecord(identifier=identifier, **kwargs)

    def getOAIItemIdentifier(self, handle):
        """ Build the full OAI identifier for a handle """
        return 'oai:%s:%s' % (self._extractIdentifierBase(
            self.base_url), handle)

    def getSwordCollections(self):
        """ Not implemented yet """
        pass

    def getSwordCollection(self, args):
        """ Not implemented yet """
        pass
예제 #7
0
        provider_name = journals['titulo'][i]  #
        url_provider = journals['url'][i]  #armazena a url do provedor
        provider_issn = journals['issn'][i]  #
        print(provider_name, url_provider)

        try:

            print("Acessando os dados de provedor ", provider_name)

            #Conecta com o provedor OAI-PMH
            registry = MetadataRegistry()
            registry.registerReader('oai_dc', oai_dc_reader)
            client = Client(url_provider, registry)

            print("Conexão estabelecida")
            sets = client.listSets()  #lista os conjuntos
            print("Conjuntos encontrados")

            for setSpec, setName, setDescription in sets:  #percorre cada conjunto do provedor

                try:

                    records = client.listRecords(
                        metadataPrefix='oai_dc',
                        set=setSpec)  #lista os registros

                    print("Coletando dados do conjunto {}, do provedor {} \n".
                          format(setName, provider_name))

                    count = 1
예제 #8
0
class ZoraAPI:
    """Thin wrapper around the ZORA OAI-PMH endpoint: loads the institute
    hierarchy and resource-type list at construction time and parses ZORA
    records into flat metadata dictionaries."""

    METADATA_PREFIX = 'oai_dc'

    # Dewey Decimal Classification subjects start with exactly three digits,
    # whitespace, then a word character (e.g. "004 Computer science").
    # Compiled once at class level instead of per record item.
    _DDC_RE = re.compile(r'^\d\d\d\s+\w')

    def __init__(self, url):
        """Register with the ZORA API at *url* and initialize the
        institute/resource-type caches."""
        registry = MetadataRegistry()
        registry.registerReader(ZoraAPI.METADATA_PREFIX, oai_dc_reader)
        self.client = Client(url, registry)
        self.institutes = {}
        self.resource_types = []
        self.load_institutes_and_types()

    def get_institutes(self):
        """Return the hierarchical dictionary of institutes."""
        return self.institutes

    def get_resource_types(self):
        """Return the list of resource types."""
        return self.resource_types

    def load_institutes_and_types(self):
        """Load all institutes and resource types from the OAI set list.

        ZORA encodes both in set names of the form '<kind> = <value>':
        'Subjects' entries feed the institute hierarchy, 'Type' entries
        the resource-type list.  Entries of any other shape are skipped.
        """
        institutes_list = []
        resource_type_list = []
        for item in self.client.listSets():
            split = item[1].split(' = ')
            if len(split) != 2:
                continue
            set_type, set_value = split
            if set_type == 'Subjects':
                institutes_list.append(set_value)
            elif set_type == 'Type':
                resource_type_list.append(set_value)
        self.institutes = self.parse_institutes(institutes_list)
        self.resource_types = resource_type_list

    @staticmethod
    def parse_institutes(institute_list_raw):
        """Parse 'A: B: C' strings into a nested dict {A: {B: {C: {}}}}."""
        institutes_dict = {}
        for institute_raw in institute_list_raw:
            parent = institutes_dict
            for institute in institute_raw.split(': '):
                # setdefault both creates the level and descends into it.
                parent = parent.setdefault(institute, {})
        return institutes_dict

    def get_metadata_dicts(self, from_):
        """Fetch records (changed since *from_*, when given) and parse them
        into metadata dictionaries."""
        record_list = self.get_records(from_)
        return self.parse_records(record_list)

    def get_record(self, uid):
        """Return the single ZORA record identified by *uid*."""
        record = self.client.getRecord(identifier=uid,
                                       metadataPrefix=ZoraAPI.METADATA_PREFIX)
        return record

    def get_records(self, from_):
        """Return a list of records from the ZORA repository.

        from_ -- optional lower bound passed to listRecords; used to fetch
                 only the most recent papers/changes.

        Best-effort: on any handled error the records collected so far are
        returned.
        """
        args = {'metadataPrefix': ZoraAPI.METADATA_PREFIX}
        if from_:
            args['from_'] = from_

        record_list = []
        try:
            print('Loading records from ZORA API...')
            count = 0
            for record in self.client.listRecords(**args):
                record_list.append(record)
                count += 1
                if is_debug() and count % 1000 == 0:
                    print(str(count))
            print(count)
            print('Done')
        except NoRecordsMatchError:
            print('No records were found')
        except RemoteDisconnected as error:
            print(error)
        except Exception as error:
            print(error)
        # Plain return (not inside ``finally``) so KeyboardInterrupt and
        # SystemExit propagate instead of being silently swallowed.
        return record_list

    def parse_records(self, record_list):
        """Parse a list of ZORA records into metadata dictionaries,
        dropping records that parse to an empty dict."""
        metadata_dict_list = []
        print('Parsing records...')
        for record in record_list:
            metadata_dict = self.parse_record(record)
            if metadata_dict:
                metadata_dict_list.append(metadata_dict)
        print('Done')
        return metadata_dict_list

    @staticmethod
    def _first_or_none(metadata_dict, key):
        """Return the first element of the list stored at *key*, or None
        when the key is absent or its list is empty."""
        values = metadata_dict.get(key)
        return values[0] if values else None

    # This function parses a record into a dictionary with a similar
    # structure to the Paper database object.  It turns some unnecessary
    # lists into single values and parses the 'subject' field into 'ddcs'
    # (dewey decimal classifications), 'keywords' and 'institutes'.
    #
    # NOTE: It is not possible to parse the 'subject' field properly since we
    # lack the ability to distinguish between keywords and institutes (some
    # institutes contain commas --> they will get recognized as lists of
    # keywords).
    @staticmethod
    def parse_record(record):
        metadata_dict = {'uid': record[0].identifier()}

        # If there is no metadata, we assume that the paper has been deleted
        # and store that information in the dict.
        if not record[1]:
            metadata_dict['deleted'] = True
            return metadata_dict

        # If there is metadata available, we parse it into a convenient form.
        metadata_dict = {**metadata_dict, **dict(record[1].getMap())}

        metadata_dict['title'] = ZoraAPI._first_or_none(metadata_dict, 'title')
        metadata_dict['creators'] = metadata_dict.pop(
            'creator') if 'creator' in metadata_dict else []

        # If the field 'subject' starts with three digits, it is a ddc
        # (dewey decimal classification).  If it matches a known institute
        # name it is an institute.  Otherwise it is treated as a
        # comma-separated list of keywords (see NOTE above).
        institute_list = []
        ddc_list = []
        keyword_list = []
        if 'subject' in metadata_dict:
            for item in metadata_dict['subject']:
                if ZoraAPI._DDC_RE.match(item):
                    ddc_list.append(item)
                elif db.session.query(Institute).filter(
                        Institute.name == item).first():
                    institute_list.append(item)
                else:
                    keyword_list.extend(item.split(','))

        metadata_dict['institutes'] = institute_list
        metadata_dict['ddcs'] = ddc_list
        metadata_dict['keywords'] = keyword_list
        metadata_dict['description'] = ZoraAPI._first_or_none(
            metadata_dict, 'description')
        metadata_dict['publisher'] = ZoraAPI._first_or_none(
            metadata_dict, 'publisher')
        metadata_dict['date'] = ZoraAPI._first_or_none(metadata_dict, 'date')

        # We filter the 'type' field and only keep known paper types.
        type_list = metadata_dict.pop(
            'type') if 'type' in metadata_dict else []
        resource_type_list = []
        for resource_type in type_list:
            if db.session.query(ResourceType).filter(
                    ResourceType.name == resource_type).first():
                resource_type_list.append(resource_type)
        metadata_dict['resource_types'] = resource_type_list
        metadata_dict['language'] = ZoraAPI._first_or_none(
            metadata_dict, 'language')
        metadata_dict['relation'] = ZoraAPI._first_or_none(
            metadata_dict, 'relation')

        return metadata_dict
예제 #9
0
#### OAI-PMH Client processing
# Exercises the OAI-PMH interface of the MZK repository: identify the
# repository, list metadata formats and sets, then fetch identifiers and a
# single record.  Requires a module-level ``registry`` (MetadataRegistry).

oai = Client('http://snape.mzk.cz/OAI-script', registry)

# Repository self-description (Identify verb).  Renamed from ``id`` to
# avoid shadowing the builtin.
identity = oai.identify()
print(identity.repositoryName())
print(identity.adminEmails())
print(identity.baseURL())

formats = oai.listMetadataFormats()
pprint(formats)  # was ``pprint formats`` -- a syntax error in the original

# 'marc21'

sets = oai.listSets()
for s in sets:
    print(s)

# 'MZK03'

recids = oai.listIdentifiers(metadataPrefix='marc21', set='MZK03') # from_='2003-01-01T00:00:00Z', until=''

# for example: 'MZK03-907223' is in the list of maps
# or 356050 *not a map

# 238208 problematic
r = oai.getRecord(identifier='MZK03-1479', metadataPrefix='marc21')

# from lxml import etree
# print etree.tostring(r[1],pretty_print=True)
예제 #10
0
import bz2
import json

import pandas as pd
from numpy import record
from oaipmh.client import Client
from oaipmh.metadata import MetadataRegistry, oai_dc_reader

from PdfParser.PdfParser import Preprocessing
baseurl = 'http://export.arxiv.org/oai2?'
corpuspath = '/Users/goksukara/Desktop/Projects/EclipseWorkspace/Specilization/PhytonCode/Data/corpus.csv'
if __name__ == "__main__":

    # Connect to the arXiv OAI-PMH endpoint and print every available set.
    url = baseurl
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(url, registry)
    # Renamed from ``record``: that name shadowed ``numpy.record`` imported
    # above (and is reused later as a loop variable).
    set_list = client.listSets()
    for word in set_list:
        print(word)
    #Write to file
    #with bz2.BZ2File('out.json', 'wb') as outfile:

for record in client.listRecords(metadataPrefix='oai_dc', set='cs'):
    header, metadata, _ = record
    doc = {}
    #Extract identifier
    #doc["id"] = header.identifier()
    #Extract title and other metadata
    doc["title"] = "\n".join(metadata["title"])
    doc["abstract"] = "\n".join(metadata["description"])
    #doc["authors"] = metadata["creator"]