def scrape(self):
    # NOTE: this guard was left in by the author; everything below it is
    # unreachable until the method is finished and the raise is removed.
    raise Exception("not finished")
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    url = self.setting('pmh-endpoint')
    client = Client(url, registry)

    print(" OAI Repository", url)
    print(" Available sets:")
    for s in client.listSets():
        print("  ", s)

    oai_set = self.setting('set')
    oai_from = self.setting('from')
    oai_until = self.setting('until')

    kwargs = {}
    if oai_set:
        kwargs['set'] = oai_set

    # The 'from'/'until' settings are "YYYY-MM-DD" strings; convert them
    # into the datetime objects that listRecords() expects.
    if oai_from is not None:
        date_args = [int(arg) for arg in oai_from.split("-")]
        kwargs['from_'] = datetime.datetime(*date_args)
    if oai_until is not None:
        date_args = [int(arg) for arg in oai_until.split("-")]
        kwargs['until'] = datetime.datetime(*date_args)

    records = [r for r in client.listRecords(metadataPrefix='oai_dc', **kwargs)]

    data_filepath = os.path.join(self.work_dir(), self.setting('data-file'))
    with open(data_filepath, 'wb') as f:
        print("  pickling", len(records), "records")
        pickle.dump(records, f)
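# A minimal sketch of reading the pickled records back, assuming the dump
# above succeeded; the file name is hypothetical, and each record is the
# (header, metadata, about) tuple that listRecords() yielded:
import pickle

with open('oai-records.pickle', 'rb') as f:  # hypothetical data-file name
    records = pickle.load(f)
for header, metadata, about in records:
    print(header.identifier())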
def list_sets(target):
    # Assumes a module-level MetadataRegistry named `registry` and a
    # `convert_setspec` helper defined elsewhere in the module.
    if target is not None:
        client = Client(target['url'], registry)
        setspecs = client.listSets()
        results = []
        if setspecs is not None:
            for setspec in setspecs:
                results.append(convert_setspec(setspec))
        return results
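# convert_setspec() is not shown in this fragment; a plausible sketch,
# assuming it flattens the (setSpec, setName, setDescription) tuples
# yielded by listSets() into plain dictionaries:
def convert_setspec(setspec):
    spec, name, description = setspec
    return {'spec': spec, 'name': name, 'description': description}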
def list_oai_community_sets(self, repository):
    try:
        registry = MetadataRegistry()
        registry.registerReader('oai_dc', oai_dc_reader)
        client = Client(repository.base_url, registry)
        sets = client.listSets()
    except Exception:
        return

    # Filter records to build the list of community sets
    self.communities = []
    for i in sets:
        set_id = i[0]
        set_name = i[1]
        # Build collection tuples (id, human-readable name)
        if set_id[:3] == 'com':
            set_data = [set_id, set_name]
            self.communities.append(set_data)

    self.communities = sorted(
        self.communities, key=lambda community: community[1])
def harvest(metadata_set, dest_folder, log_file, content_type,
            from_date, until_date):

    #############################
    # ### FILESYSTEM CHECKS ### #
    #############################
    try:
        if not os.path.isdir(dest_folder):
            os.makedirs(dest_folder)
    except BaseException as e:
        log.error(str(e))
        log.exit("Unable to create destination folder: %s" % dest_folder)

    # Verify write permission inside the folder
    try:
        test_path = os.path.join(dest_folder, '__test_permissions__')
        os.makedirs(test_path)
        os.rmdir(test_path)
    except BaseException as e:
        log.error(str(e))
        log.exit("Unable to use destination folder: %s" % dest_folder)

    try:
        log_handle = open(log_file, 'a+')
        log_handle.close()
    except BaseException as e:
        log.error(str(e))
        log.exit("Unable to create log_file: %s" % log_file)

    #################################
    # ### OAI-PMH CONFIGURATION ### #
    #################################
    URL = 'https://node0-d-efg.d4science.org/efg/mvc/oai/oai.do'
    metadata_prefix = 'efg'

    ###################################
    # ### OPEN OAI-PMH CONNECTION ### #
    ###################################
    registry = MetadataRegistry()
    # Note: the generic oai_dc reader is registered for the 'efg' prefix
    registry.registerReader(metadata_prefix, oai_dc_reader)
    client = Client(URL, registry)

    ####################################
    # ### CHECK IF THIS SET EXISTS ### #
    ####################################
    set_found = False
    for s in client.listSets():
        if metadata_set == s[0]:
            set_found = True
    if not set_found:
        log.exit("Unable to find this set: %s" % metadata_set)

    #############################
    # ### RETRIEVE METADATA ### #
    #############################
    if from_date is not None:
        from_date = parse_date(from_date)
        if from_date is None:
            log.exit("Unable to convert from date")
    if until_date is not None:
        until_date = parse_date(until_date)
        if until_date is None:
            log.exit("Unable to convert until date")

    report_data = {
        'downloaded': 0,
        'filtered': 0,
        'saved': 0,
        'saved_files': [],
        'missing_sourceid': [],
        'wrong_content_type': []
    }

    timestamp = int(1000 * time.time())

    log.info("Retrieving records for %s..." % metadata_set)
    try:
        records = client.listRecords(
            metadataPrefix=metadata_prefix,
            set=metadata_set,
            from_=from_date,
            until=until_date)
    except NoRecordsMatchError as e:
        log.exit(e)

    log.info("Records retrieved, extracting...")
    try:
        for record in records:
            element = record[1].element()
            # The eTree obtained here uses namespaced XML; see
            # "Parsing XML with Namespaces" in the ElementTree docs:
            # https://docs.python.org/2/library/xml.etree.elementtree.html
            # find(match) returns the first subelement matching match
            # (a tag name or path), or None; findall(match) returns a
            # list of all matching subelements in document order.
            report_data['downloaded'] += 1
            if report_data['downloaded'] % 100 == 0:
                print('.', end='', flush=True)
            if report_data['downloaded'] % 5000 == 0:
                print(
                    ' %s downloaded - %s saved' % (
                        report_data['downloaded'],
                        report_data['saved']
                    ),
                    flush=True)

            # tag() is a project helper, defined elsewhere, that expands
            # a local name into its namespace-qualified form
            efgEntity = element.find(tag("efgEntity"))
            if efgEntity is None:
                # log.warning("efgEntity not found, skipping record")
                continue

            avcreation = efgEntity.find(tag("avcreation"))
            nonavcreation = efgEntity.find(tag("nonavcreation"))

            if avcreation is not None:
                manifestation = avcreation.find(tag("avManifestation"))
                recordSource = avcreation.find(tag("recordSource"))
                keywords = avcreation.findall(tag("keywords"))
                title_el = avcreation.find(tag("identifyingTitle"))
                title = (title_el.text
                         if title_el is not None else "Unknown title")
            elif nonavcreation is not None:
                manifestation = nonavcreation.find(tag("nonAVManifestation"))
                recordSource = nonavcreation.find(tag("recordSource"))
                keywords = nonavcreation.findall(tag("keywords"))
                title_el = nonavcreation.find(tag("title"))
                title = (title_el.find(tag("text")).text
                         if title_el is not None else "Unknown title")
            else:
                title = "Unknown title"
                # log.warning("(non)avcreation not found, skipping record")
                continue

            filter_keyword = "IMediaCities"
            is_good = False
            for keyword in keywords:
                term = keyword.find(tag("term"))
                if term.text == filter_keyword:
                    is_good = True
                    break
            if not is_good:
                continue
            report_data['filtered'] += 1

            if manifestation is None:
                report_data['missing_sourceid'].append(title)
                # log.warning("avManifestation not found, skipping record")
                continue

            if content_type is not None:
                content_type = content_type.lower()
                item = manifestation.find(tag("item"))
                if item is None:
                    # missing <item> => type cannot be found
                    report_data['wrong_content_type'].append(title)
                    continue
                item_type = item.find(tag("type"))
                if item_type is None:
                    # missing <type>
                    report_data['wrong_content_type'].append(title)
                    continue
                if item_type.text.lower() != content_type:
                    # wrong type
                    report_data['wrong_content_type'].append(title)
                    continue

            # NOTE: the sourceID must be taken from the recordSource that
            # sits under avcreation/nonavcreation, NOT from the one under
            # avManifestation/nonAVManifestation
            if recordSource is None:
                report_data['missing_sourceid'].append(title)
                # log.warning("recordSource not found, skipping record")
                continue

            sourceID = recordSource.find(tag("sourceID"))
            if sourceID is None:
                report_data['missing_sourceid'].append(title)
                # log.warning("sourceID not found, skipping record")
                continue

            content = etree.tostring(efgEntity, pretty_print=False)
            # Replace non-alphanumeric characters with a dash
            id_text = re.sub(r'[\W_]+', '-', sourceID.text.strip())

            filename = "%s_%s_%s.xml" % (metadata_set, id_text, timestamp)
            filepath = os.path.join(dest_folder, filename)
            with codecs.open(filepath, 'wb', "utf-8") as f:
                f.write(content.decode('utf-8'))

            report_data['saved'] += 1
            report_data['saved_files'].append(filename)
    except NoRecordsMatchError as e:
        log.warning("No more records after filtering?")
        log.warning(e)

    # ###################
    # Write report file
    # ###################
    # The report file contains the results of the harvesting, including
    # the list of records that are missing the record ID (reported by the
    # content of their title element).
    with open(log_file, 'w+') as f:
        json.dump(report_data, f)

    # Close the line of progress dots
    print("")

    log.info("""
%s records from set [%s] downloaded
open log file [%s] for details
""" % (report_data['saved'], metadata_set, log_file))
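# A hypothetical invocation of harvest(); the set name, folders, and
# content type below are placeholders, not values from the original
# project, and log/parse_date come from the surrounding codebase:
harvest(
    metadata_set='imediacities',        # placeholder OAI set name
    dest_folder='/tmp/efg_harvest',     # created if missing
    log_file='/tmp/efg_harvest.json',   # the JSON report is written here
    content_type='video',               # keep only <item><type> == "video"
    from_date=None,                     # no lower date bound
    until_date=None)                    # no upper date bound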
class Repository(object):
    """
    Repository handles interaction with the various interfaces
    provided by the dspace repository.
    """

    def __init__(self, url=None, **kwargs):
        self.base_url = kwargs.pop('base_url', None)
        self.oai_path = kwargs.pop('oai_path', None)

        self.oai_enabled = bool(kwargs.pop('oai_enabled', True))
        self.sword_enabled = bool(kwargs.pop('sword_enabled', False))

        if url is not None:
            warn(
                'The url parameter will not be supported in version 3, '
                'use base_url and oai_path instead',
                DeprecationWarning)
            if (self.base_url and url.startswith(self.base_url)
                    and self.oai_path is None):
                self.oai_path = url.replace(self.base_url, '', 1).lstrip('/')
            elif not self.base_url:
                if self.oai_path is None:
                    self.oai_path = 'dspace-oai/request'
                if url.endswith(self.oai_path):
                    self.base_url = url[:-(len(self.oai_path) + 1)]

        if self.base_url is None:
            raise ValueError('base_url argument must be specified')

        if 'metadata_registry' not in kwargs:
            kwargs['metadata_registry'] = MetadataRegistry()
            kwargs['metadata_registry'].registerReader('mets',
                                                       dspace_mets_reader)

        if self.sword_enabled:
            skwargs = {'base_url': self.base_url}
            for key in list(kwargs.keys()):
                if key.startswith('sword_'):
                    skwargs[key[6:]] = kwargs.pop(key)
            self.sword = SwordService(**skwargs)

        if self.oai_enabled:
            self.oai = Client('/'.join((self.base_url, self.oai_path)),
                              **kwargs)

        self.identifier_base = self._extractIdentifierBase(self.base_url)

    def _extractIdentifierBase(self, url):
        """ From a given URL, extract the OAI identifier base (hostname) """
        return urlparse(url).hostname

    def _extractSet(self, handle):
        """ Determine the OAI set from a collection handle """
        if not isinstance(handle, basestring):
            raise ValueError('Collection handles must be strings')
        return 'hdl_' + handle.replace('/', '_').replace(':', '_')

    def getName(self):
        """ Get the configured name of the repository """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        return self.oai.identify().repositoryName()

    def getCollections(self):
        """ Get a list of the collections in the repository """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        return map(lambda c: c[0:2], self.oai.listSets())

    def getItemHandles(self, collection=None, **kw):
        """ Get item handles from the OAI-PMH interface """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        for item in self.getItemIdentifiers(collection=collection, **kw):
            yield item.identifier().split(':', 2)[2]

    def getItemIdentifiers(self, collection=None, **kw):
        """ Get item identifiers from the OAI-PMH interface """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        kw.setdefault('metadataPrefix', 'mets')
        if collection:
            kw['set'] = self._extractSet(collection)
        return self.oai.listIdentifiers(**kw)

    def getItems(self, collection=None, **kw):
        """ Get full items from the OAI-PMH interface """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        kw.setdefault('metadataPrefix', 'mets')
        if collection:
            kw['set'] = self._extractSet(collection)
        return self.oai.listRecords(**kw)

    def getItem(self, handle=None, identifier=None, **kwargs):
        """
        Get a single item from the OAI-PMH interface either by handle
        or identifier
        """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        kwargs.setdefault('metadataPrefix', 'mets')
        if handle is None and identifier is None:
            raise ValueError('Either handle or identifier must be provided')
        if handle is not None:
            if identifier is not None:
                raise ValueError('Either a handle or identifier must be '
                                 'provided, not both')
            identifier = 'oai:%s:%s' % (self.identifier_base, handle)
        return self.oai.getRecord(identifier=identifier, **kwargs)

    def getOAIItemIdentifier(self, handle):
        return 'oai:%s:%s' % (
            self._extractIdentifierBase(self.base_url), handle)

    def getSwordCollections(self):
        pass

    def getSwordCollection(self, args):
        pass
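# A hypothetical usage sketch for the Repository class; the base URL,
# OAI path, and collection handle are placeholders:
repo = Repository(base_url='https://dspace.example.org',
                  oai_path='dspace-oai/request')
print(repo.getName())
for handle in repo.getItemHandles(collection='123456789/2'):
    print(handle)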
provider_name = journals['titulo'][i]
url_provider = journals['url'][i]  # the provider's URL
provider_issn = journals['issn'][i]
try:
    print("Accessing data of provider", provider_name)
    # Connect to the OAI-PMH provider
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(url_provider, registry)
    print("Connection established")
    sets = client.listSets()  # list the provider's sets
    print("Sets found")
    # Iterate over each of the provider's sets
    for setSpec, setName, setDescription in sets:
        try:
            # List the records of this set
            records = client.listRecords(
                metadataPrefix='oai_dc', set=setSpec)
            print("Collecting data from set {} of provider {}\n"
                  .format(setName, provider_name))
            count = 1
        # The except clauses below are assumptions: the original fragment
        # breaks off before its error handling.
        except NoRecordsMatchError:
            continue
except Exception as e:
    print("Could not harvest provider {}: {}".format(provider_name, e))
class ZoraAPI:
    METADATA_PREFIX = 'oai_dc'

    # In the constructor, we connect to the ZORA API and initialize the
    # necessary class variables
    def __init__(self, url):
        registry = MetadataRegistry()
        registry.registerReader(ZoraAPI.METADATA_PREFIX, oai_dc_reader)
        self.client = Client(url, registry)
        self.institutes = {}
        self.resource_types = []
        self.load_institutes_and_types()

    # Returns the hierarchical dictionary of institutes
    def get_institutes(self):
        return self.institutes

    # Returns the list of resource types
    def get_resource_types(self):
        return self.resource_types

    # Loads all institutes and resource types. The institutes also get
    # parsed into a hierarchical dictionary.
    def load_institutes_and_types(self):
        institutes_list = []
        resource_type_list = []
        for item in self.client.listSets():
            split = item[1].split(' = ')
            if len(split) != 2:
                continue
            set_type, set_value = split
            if set_type == 'Subjects':
                institutes_list.append(set_value)
            elif set_type == 'Type':
                resource_type_list.append(set_value)
        institutes_dict = self.parse_institutes(institutes_list)
        self.institutes = institutes_dict
        self.resource_types = resource_type_list

    # Parses a list of institutes into a hierarchical dictionary
    @staticmethod
    def parse_institutes(institute_list_raw):
        institutes_dict = {}
        for institute_raw in institute_list_raw:
            institutes = institute_raw.split(': ')
            parent = institutes_dict
            for institute in institutes:
                if parent.get(institute) is None:
                    parent[institute] = {}
                parent = parent[institute]
        return institutes_dict

    # Get all metadata dictionaries from ZORA
    def get_metadata_dicts(self, from_):
        record_list = self.get_records(from_)
        metadata_dict_list = self.parse_records(record_list)
        return metadata_dict_list

    # Gets one specific paper from the ZORA repository and returns its
    # record
    def get_record(self, uid):
        record = self.client.getRecord(
            identifier=uid, metadataPrefix=ZoraAPI.METADATA_PREFIX)
        return record

    # Gets the papers from the ZORA repository and returns their records
    # as a list
    def get_records(self, from_):
        args = {'metadataPrefix': ZoraAPI.METADATA_PREFIX}

        # Add the from_ argument if it is defined (this is used to get
        # only the most recent papers/changes)
        if from_:
            args['from_'] = from_

        # Get the relevant papers from ZORA and collect them
        record_list = []
        try:
            print('Loading records from ZORA API...')
            record_iterator = self.client.listRecords(**args)
            count = 0
            for record in record_iterator:
                record_list.append(record)
                count += 1
                if is_debug() and count % 1000 == 0:
                    print(str(count))
            print(count)
            print('Done')
        except NoRecordsMatchError:
            print('No records were found')
        except RemoteDisconnected as error:
            print(error)
        except Exception as error:
            print(error)
        return record_list

    # This method parses a list of records from ZORA into easier-to-use
    # metadata dictionaries.
    def parse_records(self, record_list):
        metadata_dict_list = []
        print('Parsing records...')
        for record in record_list:
            metadata_dict = self.parse_record(record)
            if metadata_dict:
                metadata_dict_list.append(metadata_dict)
        print('Done')
        return metadata_dict_list

    # This function parses a record into a dictionary with a structure
    # similar to the Paper database object. To do so, it turns some
    # unnecessary lists into single values and parses the 'subject' field
    # into 'ddcs' (dewey decimal classifications), 'keywords' and
    # 'institutes'.
    # NOTE: It is not possible to parse the 'subject' field perfectly,
    # since we lack the ability to distinguish between keywords and
    # institutes (some institutes contain commas, so they get recognized
    # as lists of keywords).
    @staticmethod
    def parse_record(record):
        metadata_dict = {}
        metadata_dict['uid'] = record[0].identifier()

        # If there is no metadata, we assume that the paper has been
        # deleted and store that information in the dict
        if not record[1]:
            metadata_dict['deleted'] = True
            return metadata_dict

        # If there is metadata available, we parse it into a convenient
        # form
        metadata_dict = {**metadata_dict, **dict(record[1].getMap())}

        metadata_dict['title'] = (metadata_dict['title'][0]
                                  if metadata_dict.get('title') else None)
        metadata_dict['creators'] = metadata_dict.pop('creator', [])

        # If a 'subject' entry starts with three digits, it is a ddc
        # (dewey decimal classification). If it contains a comma-separated
        # list, it is a list of keywords. Otherwise it is an institute.
        #
        # NOTE: Some dewey decimal classifications contain commas, so we
        # check for the three digits before looking for comma-separated
        # lists. Some institutes contain commas as well, which leads to
        # those institutes being recognized as lists of keywords. With the
        # information available, this cannot be solved properly.
        institute_list = []
        ddc_list = []
        keyword_list = []
        if 'subject' in metadata_dict:
            for item in metadata_dict['subject']:
                # Three digits and a space: assume a dewey decimal
                # classification
                regex = re.compile(r'^\d\d\d\s+\w')
                if regex.match(item):
                    ddc_list.append(item)
                # Same name as a known institute: assume an institute
                elif db.session.query(Institute).filter(
                        Institute.name == item).first():
                    institute_list.append(item)
                # Otherwise: assume a comma-separated list of keywords
                else:
                    for keyword in item.split(','):
                        keyword_list.append(keyword)
        metadata_dict['institutes'] = institute_list
        metadata_dict['ddcs'] = ddc_list
        metadata_dict['keywords'] = keyword_list

        metadata_dict['description'] = (
            metadata_dict['description'][0]
            if metadata_dict.get('description') else None)
        metadata_dict['publisher'] = (
            metadata_dict['publisher'][0]
            if metadata_dict.get('publisher') else None)
        metadata_dict['date'] = (metadata_dict['date'][0]
                                 if metadata_dict.get('date') else None)

        # We filter the 'type' field and only store known paper types
        type_list = metadata_dict.pop('type', [])
        resource_type_list = []
        for resource_type in type_list:
            if db.session.query(ResourceType).filter(
                    ResourceType.name == resource_type).first():
                resource_type_list.append(resource_type)
        metadata_dict['resource_types'] = resource_type_list

        metadata_dict['language'] = (
            metadata_dict['language'][0]
            if metadata_dict.get('language') else None)
        metadata_dict['relation'] = (
            metadata_dict['relation'][0]
            if metadata_dict.get('relation') else None)

        return metadata_dict
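# A hypothetical usage sketch for ZoraAPI; the endpoint URL is an
# assumption, and is_debug() plus the db session must be provided by the
# surrounding application:
zora = ZoraAPI('https://www.zora.uzh.ch/cgi/oai2')  # assumed OAI endpoint
for metadata_dict in zora.get_metadata_dicts(from_='2020-01-01'):
    print(metadata_dict['uid'], metadata_dict.get('title'))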
#### OAI-PMH Client processing
from pprint import pprint

oai = Client('http://snape.mzk.cz/OAI-script', registry)

id = oai.identify()
print(id.repositoryName())
print(id.adminEmails())
print(id.baseURL())

formats = oai.listMetadataFormats()
pprint(formats)  # 'marc21'

sets = oai.listSets()
for s in sets:
    print(s)  # 'MZK03'

recids = oai.listIdentifiers(metadataPrefix='marc21', set='MZK03')
# from_='2003-01-01T00:00:00Z', until=''

# for example: 'MZK03-907223' is in the list of maps
# or 356050 *not a map
# 238208 problematic
r = oai.getRecord(identifier='MZK03-1479', metadataPrefix='marc21')
# from lxml import etree
# print(etree.tostring(r[1], pretty_print=True))
from oaipmh.client import Client
from oaipmh.metadata import MetadataRegistry, oai_dc_reader
from PdfParser.PdfParser import Preprocessing
import json
import bz2
import pandas as pd

baseurl = 'http://export.arxiv.org/oai2?'
corpuspath = '/Users/goksukara/Desktop/Projects/EclipseWorkspace/Specilization/PhytonCode/Data/corpus.csv'

if __name__ == "__main__":
    url = baseurl
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(url, registry)

    # List the repository's sets
    sets = client.listSets()
    for word in sets:
        print(word)

    # Write to file
    # with bz2.BZ2File('out.json', 'wb') as outfile:
    for record in client.listRecords(metadataPrefix='oai_dc', set='cs'):
        header, metadata, _ = record
        doc = {}
        # Extract identifier
        # doc["id"] = header.identifier()
        # Extract title and other metadata
        doc["title"] = "\n".join(metadata["title"])
        doc["abstract"] = "\n".join(metadata["description"])
        # doc["authors"] = metadata["creator"]
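    # A sketch of the write-out step hinted at by the commented bz2 lines
    # above, assuming one JSON object per line in a compressed file (the
    # file name and field choices are assumptions):
    with bz2.open('out.json.bz2', 'wt', encoding='utf-8') as outfile:
        for header, metadata, _ in client.listRecords(
                metadataPrefix='oai_dc', set='cs'):
            doc = {
                "id": header.identifier(),
                "title": "\n".join(metadata["title"]),
                "abstract": "\n".join(metadata["description"]),
            }
            outfile.write(json.dumps(doc) + '\n')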