def list_oai_collections(self, community):
    """ Retrieve the header data for each record in the current community repo """
    try:
        registry = MetadataRegistry()
        registry.registerReader('oai_dc', oai_dc_reader)
        client = Client(community.repository.base_url, registry)
        records = client.listIdentifiers(
            metadataPrefix='oai_dc', set=community.identifier)
    except:
        community_collections = set()
        return

    """ Filter records to build list of collections in the community set """
    community_collections = set()
    for i in records:
        for j in i.setSpec():
            if j[:3] == 'col':
                community_collections.add(j)
    print len(community_collections)

    """ Build collection tuples (identifier, name) """
    for i in community_collections:
        # print i
        # print community_collections
        set_data = []
        set_data.append(i)  # Store identifier
        set_data.append('Collection: %s' % i)  # Store human readable name
        # print set_data
        self.collections.append(set_data)
def insertAll(time, time2):
    registry = MetadataRegistry()
    registry.registerReader('arXivRaw', arXivRaw_reader)
    client = Client(URL, registry)
    client.updateGranularity()
    # renamed from `list` to avoid shadowing the builtin
    records = client.listRecords(metadataPrefix='arXivRaw', from_=time, until=time2)
    errors = 0
    for a in records:
        try:
            title = '\n'.join(a[1]['title'])
            sr2 = str(' '.join(a[1]['categories']).replace('-', '_')).split(' ')
            abstract = '\n'.join(a[1]['abstract'])
            url = 'http://arxiv.org/abs/' + a[1]['id'][0]
            date = datetime.strptime(a[1]['created'][0], '%a, %d %b %Y %H:%M:%S %Z')
            authors = a[1]['authors'][0]  # '; '.join(a[1]['keynames'])
            abstract = abstract + '\nBy: ' + authors + '\nIn: ' + ', '.join(sr2)
            print title
            print sr2
            print abstract
            print url
            print date
            print authors
            insert(title + ' (' + authors + ')', "fullarxiv", url, abstract,
                   date=date, cross_srs=sr2)
        except:
            print 'ERROR'
            print a
            errors = errors + 1
    print 'Completed with %s errors' % errors
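The snippet above relies on an `arXivRaw_reader` that pyoai does not ship. A minimal sketch of how such a reader could be built with pyoai's `MetadataReader` follows; the field names, XPaths, and namespace URI are illustrative assumptions, not the original project's definition.

from oaipmh.metadata import MetadataReader

# Hypothetical reader for the arXivRaw format (assumed XPaths and namespace URI).
arXivRaw_reader = MetadataReader(
    fields={
        'title':      ('textList', 'arXivRaw:arXivRaw/arXivRaw:title/text()'),
        'abstract':   ('textList', 'arXivRaw:arXivRaw/arXivRaw:abstract/text()'),
        'categories': ('textList', 'arXivRaw:arXivRaw/arXivRaw:categories/text()'),
        'id':         ('textList', 'arXivRaw:arXivRaw/arXivRaw:id/text()'),
        'authors':    ('textList', 'arXivRaw:arXivRaw/arXivRaw:authors/text()'),
        'created':    ('textList', 'arXivRaw:arXivRaw/arXivRaw:version/arXivRaw:date/text()'),
    },
    namespaces={'arXivRaw': 'http://arxiv.org/OAI/arXivRaw/'},
)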
def arxiv_oai_scraper(subject, start, end, sleep_time=0):
    base_url = "http://export.arxiv.org/oai2"
    output = list()

    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(base_url, registry)
    client.updateGranularity()

    records = client.listRecords(metadataPrefix='oai_dc', set="{}".format(subject),
                                 from_=start, until=end)

    for _, md, _ in records:
        # print md.getField("title")
        # checks for the case in 2010 when there is no title for something
        if md is not None:
            txt_dict = {"title": md["title"],
                        "abstract": md["description"],
                        "date": md["date"],
                        "subject": md["subject"],
                        "url": md["identifier"],
                        "authors": md['creator']}
            output.append(txt_dict)

        time.sleep(sleep_time)

    return output
def __init__(self, base_url, metadata_registry=None, applicationContext=None,
             dayGranularity=False, credentials=None):
    Client.__init__(self, base_url, metadata_registry, credentials)
    SwissbibPreImportProcessor.__init__(self, applicationContext)

    self._day_granularity = dayGranularity
    # self.writeContext = writeContext

    # how is this different from Aleph?
    if not self.context.getConfiguration().getIteratorOAIStructure() is None:
        self.pIterSingleRecord = re.compile(
            self.context.getConfiguration().getIteratorOAIStructure(),
            re.UNICODE | re.DOTALL | re.IGNORECASE
        )
    else:
        self.pIterSingleRecord = re.compile("<record>.*?</record>",
                                            re.UNICODE | re.DOTALL | re.IGNORECASE)
    # GH: 16.10.2015 this works for Nebis because we are looking for the outer 'shell'
    # of all <record>...</record> not qualified with additional namespaces.
    # We can use this for deleted as well as for full records. Compare the example in
    # exampleContentStructures/alma/deletedAndUpdatedRecords.xml
    # With Aleph this isn't as easy.
    # self.pIterSingleRecordNebis = re.compile('<record>.*?</record>', re.UNICODE | re.DOTALL | re.IGNORECASE)

    self.pResumptionToken = re.compile(
        "<resumptionToken.*?>(.{1,}?)</resumptionToken>",
        re.UNICODE | re.DOTALL | re.IGNORECASE
    )
    self.harvestingErrorPattern = re.compile(
        "(<error.*?>.*?</error>|<html>.*?HTTP.*?Status.*?4\d\d)",
        re.UNICODE | re.DOTALL | re.IGNORECASE
    )
def scrape(self):
    raise Exception("not finished")
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    url = self.setting('pmh-endpoint')
    client = Client(url, registry)

    print "  OAI Repository", url
    print "  Available sets:"
    for s in client.listSets():
        print "   ", s

    oai_set = self.setting('set')
    oai_from = self.setting('from')
    oai_until = self.setting('until')

    kwargs = {}

    if oai_set:
        kwargs['set'] = oai_set

    if oai_from is not None:
        date_args = [int(arg) for arg in oai_from.split("-")]
        kwargs['from_'] = datetime.datetime(*date_args)

    if oai_until is not None:
        date_args = [int(arg) for arg in oai_until.split("-")]
        kwargs['until'] = datetime.datetime(*date_args)

    records = [r for r in client.listRecords(metadataPrefix='oai_dc', **kwargs)]

    data_filepath = os.path.join(self.work_dir(), self.setting('data-file'))
    with open(data_filepath, 'wb') as f:
        print "  pickling", len(records), "records"
        pickle.dump(records, f)
def getClient(self):
    if self.format == 'citeproc':
        return FakeOaiClientForCrossref()
    registry = MetadataRegistry()
    client = Client(self.url, registry)
    client.get_method = self.get_method
    client._day_granularity = self.day_granularity
    return client
def list_sets(target):
    if target is not None:
        client = Client(target['url'], registry)
        setspecs = client.listSets()
        results = []
        if setspecs is not None:
            for setspec in setspecs:
                results.append(convert_setspec(setspec))
        return results
def list_metadata_formats(target, identifier):
    if target is not None:
        client = Client(target['url'], registry)
        metadata_formats = client.listMetadataFormats(identifier=identifier)
        results = []
        if metadata_formats is not None:
            for metadata_format in metadata_formats:
                results.append(convert_metadata_formats(metadata_format))
        return results
def list_identifiers(target, date_from, date_until, setspec):
    if target is not None:
        client = Client(target['url'], registry)
        headers = client.listIdentifiers(metadataPrefix=target['metadata_prefix'],
                                         from_=date_from, until=date_until,
                                         set=setspec)
        results = []
        if headers is not None:
            for header in headers:
                results.append(convert_header(header))
        return results
def _listRecords(self, baseUrl, metadataPrefix="oai_dc", **kwargs):
    # Generator to yield records from baseUrl in the given metadataPrefix
    # Add metadataPrefix to args
    kwargs['metadataPrefix'] = metadataPrefix
    client = Client(baseUrl, metadata_registry)
    # Check server timestamp granularity support
    client.updateGranularity()
    for record in client.listRecords(**kwargs):
        yield record
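`_listRecords` above refers to a module-level `metadata_registry` that is not shown here. A minimal sketch of one, assuming the harvest only needs pyoai's bundled Dublin Core reader:

from oaipmh.metadata import MetadataRegistry, oai_dc_reader

metadata_registry = MetadataRegistry()
metadata_registry.registerReader('oai_dc', oai_dc_reader)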
def list_records(target, date_from, date_until, setspec):
    logging.debug("list_records")
    if target is not None:
        client = Client(target['url'], registry)
        # Build the ListRecords arguments from whichever filters were supplied,
        # instead of enumerating every from/until/set combination.
        kwargs = {'metadataPrefix': target['metadata_prefix']}
        if date_from is not None:
            kwargs['from_'] = date_from
        if date_until is not None:
            kwargs['until'] = date_until
        if setspec is not None:
            kwargs['set'] = setspec
        records = client.listRecords(**kwargs)
        if records is not None:
            for record in records:
                yield convert_record(record, target['metadata_prefix'], target['title'])
def list_records(target, date_from, date_until, setspec):
    if target is not None:
        client = Client(target['url'], registry)
        # Build the ListRecords arguments from whichever filters were supplied,
        # instead of enumerating every from/until/set combination.
        kwargs = {'metadataPrefix': target['metadata_prefix']}
        if date_from is not None:
            kwargs['from_'] = date_from
        if date_until is not None:
            kwargs['until'] = date_until
        if setspec is not None:
            kwargs['set'] = setspec
        records = client.listRecords(**kwargs)
        results = []
        if records is not None:
            for record in records:
                results.append(convert_record(record, target['metadata_prefix'], target['title']))
        return results
def pull_data(source): list_of_records = [] registry = MetadataRegistry() registry.registerReader('oai_dc', oai_dc_reader) # Get list of public experiments at sources registry = MetadataRegistry() registry.registerReader('oai_dc', oai_dc_reader) client = Client(source + "/apps/oaipmh/?verb=ListRecords&metadataPrefix=oai_dc", registry) try: exps_date = [] exps_metadata = [] for (header, meta, extra) in client.listRecords(metadataPrefix='oai_dc'): exps_date.append(str(header._datestamp)) exps_metadata.append(meta) logger.debug('Date=%s' % header._datestamp) except AttributeError as e: msg = "Error reading experiment %s" % e logger.error(msg) raise OAIPMHError(msg) except error.NoRecordsMatchError as e: msg = "no public records found on source %s" % e logger.warn(msg) return exp_counter = 0 for exp_metadata in exps_metadata: user_id = exp_metadata.getField('creator')[0] user_profile = json.loads(_get_user(source, user_id)) data_tobe_indexed = dict(user_profile) data_tobe_indexed['user_id'] = user_id exp_id = exp_metadata.getField('identifier')[0] description = exp_metadata.getField('description')[0] title = exp_metadata.getField('title')[0] if settings.EXPERIMENT_PATH[0] == '/': settings.EXPERIMENT_PATH = settings.EXPERIMENT_PATH[1:] experiment_url = os.path.join(source, settings.EXPERIMENT_PATH % exp_id) data_tobe_indexed['experiment_id'] = exp_id data_tobe_indexed['experiment_title'] = title data_tobe_indexed['experiment_description'] = description data_tobe_indexed['experiment_url'] = experiment_url data_tobe_indexed['id'] = experiment_url data_tobe_indexed['experiment_date'] = exps_date[exp_counter] exp_counter += 1 for k, v in data_tobe_indexed.items(): logger.debug('%s = %s' % (k, v)) logger.debug('') list_of_records.append(json.dumps(data_tobe_indexed)) return list_of_records
def test(request):
    URL = 'http://www.kulturarv.dk/ffrepox/OAIHandler'
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(URL, registry)
    identifyResponse = client.identify()
    print dir(identifyResponse)
    #for record in client.listRecords(metadataPrefix='oai_dc'):
    #    result += record
    return HttpResponse(identifyResponse.repositoryName())
def harvest_oai_collection_records(self, collection):
    records = []
    try:
        registry = MetadataRegistry()
        registry.registerReader('oai_dc', oai_dc_reader)
        client = Client(collection.community.repository.base_url, registry)
        records = client.listRecords(
            metadataPrefix='oai_dc', set=collection.identifier)
    except:
        return
    return records
def clean(self):
    cleaned_data = super(CreateRepositoryForm, self).clean()
    try:
        registry = MetadataRegistry()
        registry.registerReader('oai_dc', oai_dc_reader)
        client = Client(cleaned_data.get('base_url'), registry)
        server = client.identify()
        # set the repository name and apply it to the model instance when saved.
        cleaned_data['name'] = server.repositoryName()
    except:
        raise ValidationError('Repository base url is invalid.')
    return cleaned_data
def get_client(url, transforms):
    transforms = fix_transforms(transforms)
    registry = MetadataRegistry()
    c = Client(url, registry)
    metadata = c.listMetadataFormats()
    metadata[0] = ['fbb',
                   'http://www.kulturarv.dk/fbb/fbb.xsd',
                   'http://www.kulturarv.dk/fbb']
    namespaces = dict((x[0], x[2]) for x in metadata)
    fields = dict((transform['field'], ('textList', transform['path']))
                  for transform in transforms)
    namespace = metadata[0][0]
    print namespaces, fields
    registry.registerReader(namespace, MetadataReader(fields=fields, namespaces=namespaces))
    return c, namespace
def index_documents(main_url, database_name, url, reader, prefix, format):
    registry = MetadataRegistry()
    registry.registerReader(prefix, reader)
    client = Client(url, registry)
    return_stuff = []
    for record in client.listRecords(metadataPrefix=prefix):
        r = record[1]
        value = format(r, record[0].identifier())
        if value is not None:
            return_stuff.append(value)
        if len(return_stuff) >= 10000:
            sync_files(main_url, database_name, return_stuff)
            return_stuff = []
    sync_files(main_url, database_name, return_stuff)
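`index_documents` calls its `format` argument as `format(metadata, identifier)` and skips records for which it returns `None`. A minimal sketch of such a callable (the field choice is an illustrative assumption, not the original project's formatter):

def format_oai_dc(metadata, identifier):
    # Return a dict for indexing, or None to skip the record.
    titles = metadata.getField('title')
    if not titles:
        return None
    return {'identifier': identifier, 'title': titles[0]}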
def read_base_records(self):
    registry = MetadataRegistry()
    registry.registerReader('base_dc', base_dc_reader)
    client = Client('http://doai.io/oai', registry)
    for header, record, _ in client.listRecords(metadataPrefix='base_dc'):
        # only process records for which base was unsure
        if '2' not in record['oa']:
            continue
        # extract splash_url
        for link in record['identifier']:
            metadata = {'base_oa': ''.join(record['oa']),
                        'splash_url': link,
                        'from_identifier': header.identifier()}
            yield self.filter_url(link, metadata, looking_for='any')
def __init__(self, oaisource, day_granularity=False, *args, **kwargs):
    """
    This sets up the paper source.

    :param oaisource: the OAISource to fetch from.
    :param day_granularity: should we use day-granular timestamps
        to fetch from the proxy or full timestamps
        (default: False, full timestamps)

    See the protocol reference for more information on timestamp
    granularity:
    https://www.openarchives.org/OAI/openarchivesprotocol.html
    """
    super(OaiPaperSource, self).__init__(*args, **kwargs)
    if not oaisource.endpoint:
        raise ValueError('No OAI endpoint was configured for this OAI source.')

    self.registry = MetadataRegistry()
    self.registry.registerReader('oai_dc', oai_dc_reader)
    self.registry.registerReader('base_dc', base_dc_reader)
    self.client = Client(oaisource.endpoint, self.registry)
    self.client._day_granularity = day_granularity
    self.translators = {
        'oai_dc': OAIDCTranslator(oaisource),
        'base_dc': BASEDCTranslator(oaisource),
    }
def __init__(self, url, prefix=nsdl.LR_NSDL_PREFIX, reader=None, fields=None, namespaces=None, fieldMap=None): ''' Constructor ''' if fields == None: self._fields = nsdl.LR_NSDL_DC_FIELDS else: self._fields = fields if fieldMap == None: self._fieldMap = nsdl.NSDL_TO_LR_MAP else: self._fieldMap = fieldMap if namespaces == None: self._namespaces = nsdl.LR_NSDL_DC_NAMESPACES else: self._namespaces = namespaces if reader == None: reader = MetadataReader(fields = self._fields, namespaces = self._namespaces) self._url = url self._registry = MetadataRegistry() self._prefix = prefix self._registry.registerReader(prefix, reader) self._client = Client(url, self._registry)
def _init_clients(self):
    try:
        self._c = OaipmhClient(self.store.state['base_oai_url'])  #, metadata_registry = dumbMetadataRegistry)
        self.identify()
    except OSError:
        logger.error("Cannot make OAIPMH client")
        raise Exception("Cannot make OAIPMH client")
def scrape(start=START, end=END, set=SET_THESIS, type='Thesis'): """ Create an OAI-PMH client, gather metadata and output it. """ total = num = 0 msg = "Fetching records between " + str(start) + " and " + str(end) sys.stderr.write(msg + "\n") # # Set up metadata readers # registry = MetadataRegistry() registry.registerReader('oai_dc', oai_dc_reader) registry.registerReader('qdc', qdc_reader) # registry.registerReader('rdf', rdf_reader) # no reader yet # registry.registerReader('ore', ore_reader) # no reader yet # registry.registerReader('mets', mets_reader) # no reader yet client = Client(URL, registry) records = client.listRecords(metadataPrefix='qdc', from_=start, until=end, set=set) for (h, m, a) in records: print h, m, a if not m: sys.stderr.write("o") continue total = total + 1 handle = m.getField('identifier') if not handle: sys.stderr.write("Record without a handle.\n") continue r = dict({ 'handle' : handle[0] }) for key in qdc_reader._fields.keys(): r[key] = m.getField(key) RECORDS.append(r) sys.stderr.write('.') sys.stderr.flush() num = num + 1 msg = "\nCollected " + str(num) + " records, out of " + str(total) sys.stderr.write('\n' + msg + '\n'); if options.store: pickle.dump(RECORDS, open(options.store, "wb"))
def update(self, from_date=None):
    self._log.info('Harvesting oai server: %s' % self._url)
    registry = MetadataRegistry()
    registry.registerReader(self._prefix, lambda el: el)
    client = Client(self._url, registry)
    try:
        for header, element, about in client.listRecords(
                metadataPrefix=self._prefix, from_=from_date):
            added = self._process_record(header, element)
            if added:
                yield self._get_id(header)
    except NoRecordsMatchError:
        pass
    super(OAIBasedContentProvider, self).update()
def processItems(): oai_oi_reader = MetadataReader( fields={ 'title': ('textList', 'oai_oi:oi/oi:title/text()'), 'alternative': ('textList', 'oai_oi:oi/oi:alternative/text()'), 'creator': ('textList', 'oai_oi:oi/oi:creator/text()'), 'subject': ('textList', 'oai_oi:oi/oi:subject/text()'), 'description': ('textList', 'oai_oi:oi/oi:description/text()'), 'abstract': ('textList', 'oai_oi:oi/oi:abstract/text()'), 'publisher': ('textList', 'oai_oi:oi/oi:publisher/text()'), 'contributor': ('textList', 'oai_oi:oi/oi:contributor/text()'), 'date': ('textList', 'oai_oi:oi/oi:date/text()'), 'type': ('textList', 'oai_oi:oi/oi:type/text()'), 'extent': ('textList', 'oai_oi:oi/oi:extend/text()'), 'medium': ('textList', 'oai_oi:oi/oi:medium/text()'), 'identifier': ('textList', 'oai_oi:oi/oi:identifier/text()'), 'source': ('textList', 'oai_oi:oi/oi:source/text()'), 'language': ('textList', 'oai_oi:oi/oi:language/text()'), 'references': ('textList', 'oai_oi:oi/oi:references/text()'), 'spatial': ('textList', 'oai_oi:oi/oi:spatial/text()'), 'attributionName': ('textList', 'oai_oi:oi/oi:attributionName/text()'), 'attributionURL': ('textList', 'oai_oi:oi/oi:attributionURL/text()'), 'license': ('textList', 'oai_oi:oi/oi:license/text()'), #Zitten er niet in #'rights': ('textList', 'oai_oi:oi/oi:rights/text()'), #'relation': ('textList', 'oai_oi:oi/oi:relation/text()'), #'coverage': ('textList', 'oai_oi:oi/oi:coverage/text()'), #'format': ('textList', 'oai_oi:oi/oi:format/text()'), }, namespaces={ 'oi' : 'http://www.openbeelden.nl/oai/', 'oai_oi' : 'http://www.openarchives.org/OAI/2.0/oai_dc/', 'dc' : 'http://purl.org/dc/elements/1.1/', 'dcterms' : 'http://purl.org/dc/terms', } ) url = u'http://www.openbeelden.nl/feeds/oai/' registry = MetadataRegistry() registry.registerReader('oai_oi', oai_oi_reader) client = Client(url, registry) for record in client.listRecords(metadataPrefix='oai_oi'): processItem(record)
def iter_items(self, partition): """ Partition is an OAI-PMH endpoint """ # source = "oai:%s" % partition registry = MetadataRegistry() registry.registerReader('oai_dc', oai_dc_reader) client = Client(partition, registry) for record in client.listRecords(metadataPrefix='oai_dc'): header, metadata, _ = record if header.isDeleted(): continue # _id = header.identifier() # date = header.datestamp() meta = metadata.getMap() # TODO: there are much validation and heuristics to be done here! # format0 = (meta.get("format") or [None])[0] # if not format0: # continue # if format0 not in ("application/pdf", ): # continue url0 = (meta.get("identifier") or [None])[0] if not url0: continue title0 = (meta.get("title") or [""])[0].encode("utf-8") desc0 = (meta.get("description") or [""])[0].encode("utf-8") # TODO: validate that the url0 is not on another domain?! yield url0, {}, "html", 2, """ <html><head><title>%s</title></head><body>%s</body></html> """ % (title0, desc0)
def setupOAIPMHConnection(self): oai_oi_reader = MetadataReader( fields={ 'title': ('textList', 'oai_oi:oi/oi:title/text()'), 'alternative': ('textList', 'oai_oi:oi/oi:alternative/text()'), 'creator': ('textList', 'oai_oi:oi/oi:creator/text()'), 'subject': ('textList', 'oai_oi:oi/oi:subject/text()'), 'description': ('textList', 'oai_oi:oi/oi:description/text()'), 'abstract': ('textList', 'oai_oi:oi/oi:abstract/text()'), 'publisher': ('textList', 'oai_oi:oi/oi:publisher/text()'), 'contributor': ('textList', 'oai_oi:oi/oi:contributor/text()'), 'date': ('textList', 'oai_oi:oi/oi:date/text()'), 'type': ('textList', 'oai_oi:oi/oi:type/text()'), 'extent': ('textList', 'oai_oi:oi/oi:extent/text()'), 'medium': ('textList', 'oai_oi:oi/oi:medium/text()'), 'identifier': ('textList', 'oai_oi:oi/oi:identifier/text()'), 'source': ('textList', 'oai_oi:oi/oi:source/text()'), 'language': ('textList', 'oai_oi:oi/oi:language/text()'), 'references': ('textList', 'oai_oi:oi/oi:references/text()'), 'spatial': ('textList', 'oai_oi:oi/oi:spatial/text()'), 'attributionName': ('textList', 'oai_oi:oi/oi:attributionName/text()'), 'attributionURL': ('textList', 'oai_oi:oi/oi:attributionURL/text()'), 'license': ('textList', 'oai_oi:oi/oi:license/text()') }, namespaces={ 'oai_oi': 'http://www.openbeelden.nl/feeds/oai/', #'http://www.openarchives.org/OAI/2.0/oai_oi/', 'oi': 'http://www.openbeelden.nl/oai/' } ) URL = 'http://www.openbeelden.nl/feeds/oai/' #Initialize the OAI client self.registry = MetadataRegistry() self.registry.registerReader('oai_oi', oai_oi_reader) self.client = Client(URL, self.registry) #Test if the connection to the OAI-PMH provider works x = self.client.updateGranularity() x = self.client.identify() print 'identity %s' % x.repositoryName() print 'identity %s' % x.protocolVersion() print 'identity %s' % x.baseURL() """ for s in client.listSets(): print s """ #initialize the OpenSKOSHandler self.openSKOSHandler = OpenSKOSHandler()
def acquire_and_publish_documents(oai_url, publish_url, reader, prefix, pwd): registry = MetadataRegistry() registry.registerReader(prefix, reader) client = Client(oai_url, registry) documents = [] count = 0 for record in client.listRecords(metadataPrefix=prefix): header = record[0] metadata = record[1] rawMetadata = urllib2.urlopen("{0}?verb=GetRecord&metadataPrefix={1}&identifier={2}".format(oai_url,prefix,header.identifier())).read() # re-format Jorum id identifier = header.identifier() identifier = identifier.replace("oai:dspace.jorum.ac.uk:","") uri = "http://dspace.jorum.ac.uk/xmlui/handle/" + identifier print(uri) # create keys from dc.subject terms fo = StringIO.StringIO(rawMetadata) tree = parse(fo) # can only parse files or file objects keys = [] for elem in tree.getiterator(): # print("tag " + str(elem.tag)) # print("text " + str(elem.text)) if elem.tag == "{http://purl.org/dc/elements/1.1/}subject": keys.append(elem.text) fo.close() print(keys) print("\n") value = convert_to_envelope(metadata, rawMetadata, uri, keys) # print (value) # print(dir(header)) if value != None: documents.append(value) count += 1 if (count % 10 == 0) or (count == 3): publish_documents(publish_url, documents, pwd) documents = [] publish_documents(publish_url, documents, pwd)
def list_oai_community_sets(self, repository):
    try:
        registry = MetadataRegistry()
        registry.registerReader('oai_dc', oai_dc_reader)
        client = Client(repository.base_url, registry)
        sets = client.listSets()
    except:
        return

    """ Filter records to build list of community sets """
    self.communities = []
    for i in sets:
        set_id = i[0]
        set_name = i[1]
        """ Build collection tuples (id, human readable name) """
        if set_id[:3] == 'com':
            set_data = []
            set_data.append(set_id)
            set_data.append(set_name)
            self.communities.append(set_data)

    self.communities = sorted(
        self.communities, key=lambda community: community[1])
def _listRecords(self, baseUrl, metadataPrefix="oai_dc", **kwargs):
    # Generator to yield records from baseUrl in the given metadataPrefix
    # Add metadataPrefix to args
    kwargs['metadataPrefix'] = metadataPrefix
    client = Client(baseUrl, self._mdRegistry)
    # Check that baseUrl actually represents an OAI-PMH target
    try:
        client.identify()
    except IndexError:
        raise NotOAIPMHBaseURLException(
            "{0} does not appear to be an OAI-PMH compatible base URL"
            "".format(baseUrl)
        )
    # Check server timestamp granularity support
    client.updateGranularity()
    for record in client.listRecords(**kwargs):
        yield record
def getClient(self):
    registry = MetadataRegistry()
    client = Client(self.url, registry)
    client.get_method = self.get_method
    client._day_granularity = self.day_granularity
    return client
def _client(base_URL, registry=None):
    """ instantiate client """
    client = Client(base_URL, registry)
    return client
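A hedged usage sketch for `_client`: building a registry with pyoai's bundled `oai_dc` reader and issuing an Identify request. The endpoint URL is only an example.

from oaipmh.metadata import MetadataRegistry, oai_dc_reader

registry = MetadataRegistry()
registry.registerReader('oai_dc', oai_dc_reader)
client = _client('http://export.arxiv.org/oai2', registry)
print(client.identify().repositoryName())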
return element # Defining of metadata Readers in the Registry from oaipmh import metadata registry = metadata.MetadataRegistry() registry.registerReader('marc21', XMLReader()) #### OAI-PMH Client processing from oaipmh.client import Client from lxml import etree oai = Client('http://snape.mzk.cz/OAI-script', registry) #recs = oai.listRecords(metadataPrefix='marc21', set='MZK03') #rec = recs.next() #for rec in recs: rec = oai.getRecord(identifier='MZK03-907223', metadataPrefix='marc21') if rec: print rec[0].identifier() r = rec[1] # Get XML tree for record print etree.tostring(r, pretty_print=True) if r: xpath_evaluator = etree.XPathEvaluator(
def transfer_experiment(source): """ Pull public experiments from source into current mytardis. """ #TODO: Cleanup error messages #TODO: does not transfer liences as not part of METS format. #NOTE: As this is a pull we trust the data from the other tardis # Check identity of the feed from oaipmh.client import Client from oaipmh import error from oaipmh.metadata import MetadataRegistry, oai_dc_reader from django.core.cache import cache from django.utils.hashcompat import md5_constructor as md5 # The cache key consists of the task name and the MD5 digest # of the feed URL. cache_key = md5("token").hexdigest() lock_id = "%s-lock-%s" % ("consume_experiment", cache_key) LOCK_EXPIRE = 60 * 5 # cache.add fails if if the key already exists acquire_lock = lambda: cache.add(lock_id, "true", LOCK_EXPIRE) # memcache delete is very slow, but we have to use it to take # advantage of using add() for atomic locking release_lock = lambda: cache.delete(lock_id) registry = MetadataRegistry() registry.registerReader('oai_dc', oai_dc_reader) source_url = "%s/apps/oaipmh/?verb=Identify" % source client = Client(source_url, registry) try: identify = client.identify() except AttributeError as e: msg = "Error reading repos identity: %s:%s" % (source, e) logger.error(msg) raise ReposReadError(msg) except error.ErrorBase as e: msg = "OAIPMH error: %s" % e logger.error(msg) raise OAIPMHError(msg) except URLError as e: logger.error(e) raise repos = identify.baseURL() import urlparse repos_url = urlparse.urlparse(repos) dest_name = "%s://%s" % (repos_url.scheme, repos_url.netloc) if dest_name != source: msg = "Source directory reports incorrect name: %s" % dest_name logger.error(msg) raise BadAccessError(msg) # Get list of public experiments at sources registry = MetadataRegistry() registry.registerReader('oai_dc', oai_dc_reader) client = Client( source + "/apps/oaipmh/?verb=ListRecords&metadataPrefix=oai_dc", registry) try: exps_metadata = [ meta for (header, meta, extra) in client.listRecords(metadataPrefix='oai_dc') ] except AttributeError as e: msg = "Error reading experiment %s" % e logger.error(msg) raise OAIPMHError(msg) except error.NoRecordsMatchError as e: msg = "no public records found on source %s" % e logger.warn(msg) return local_ids = [] for exp_metadata in exps_metadata: exp_id = exp_metadata.getField('identifier')[0] user = exp_metadata.getField('creator')[0] found_user = _get_or_create_user(source, user) #make sure experiment is publicish try: xmldata = getURL("%s/apps/reposproducer/expstate/%s/" % (source, exp_id)) except HTTPError as e: msg = "cannot get public state of experiment %s" % exp_id logger.error(msg) raise BadAccessError(msg) try: exp_state = json.loads(xmldata) except ValueError as e: msg = "cannot parse public state of experiment %s" % exp_id logger.error(msg) raise BadAccessError(msg) if not exp_state in [ Experiment.PUBLIC_ACCESS_FULL, Experiment.PUBLIC_ACCESS_METADATA ]: msg = 'cannot ingest private experiments.' 
% exp_id logger.error(msg) raise BadAccessError(msg) # Get the usernames of isOwner django_user ACLs for the experiment try: xmldata = getURL("%s/apps/reposproducer/acls/%s/" % (source, exp_id)) except HTTPError as e: msg = "Cannot get acl list of experiment %s" % exp_id logger.error(msg) raise ReposReadError(msg) try: acls = json.loads(xmldata) except ValueError as e: msg = "cannot parse acl list of experiment %s" % exp_id logger.error(msg) raise BadAccessError(msg) owners = [] for acl in acls: if acl['pluginId'] == 'django_user' and acl['isOwner']: user = _get_or_create_user(source, acl['entityId']) owners.append(user.username) else: # FIXME: skips all other types of acl for now pass # Get the METS for the experiment metsxml = "" try: metsxml = getURL("%s/experiment/metsexport/%s/?force_http_urls" % (source, exp_id)) #metsxml = getURL("%s/experiment/metsexport/%s/" #% (source, exp_id)) except HTTPError as e: msg = "cannot get METS for experiment %s" % exp_id logger.error(msg) raise ReposReadError(msg) # load schema and parametername for experiment keys try: key_schema = Schema.objects.get(namespace=settings.KEY_NAMESPACE) except Schema.DoesNotExist as e: msg = "No ExperimentKeyService Schema found" logger.error(msg) raise BadAccessError(msg) try: key_name = ParameterName.objects.get(name=settings.KEY_NAME) except ParameterName.DoesNotExist as e: msg = "No ExperimentKeyService ParameterName found" logger.error(msg) raise BadAccessError(msg) try: xmldata = getURL("%s/apps/reposproducer/key/%s/" % (source, exp_id)) except HTTPError as e: msg = "cannot get key of experiment %s" % exp_id logger.error(msg) raise BadAccessError(msg) if not xmldata: logger.warn( "Unable to retrieve experiment %s key. Will try again later" % exp_id) return try: key_value = json.loads(xmldata) except ValueError as e: msg = "cannot parse key list of experiment %s" % exp_id logger.error(msg) raise BadAccessError(msg) if not key_value: logger.warn( "Unable to retrieve experiment %s key value. Will try again later" % exp_id) return logger.debug("retrieved key %s from experiment %s" % (key_value, exp_id)) exps = Experiment.objects.all() got_lock = True if not acquire_lock(): logger.warning("another worker has access to consume experiment") return duplicate_exp = 0 for exp in exps: #logger.warn("exp = %s" % exp.id) params = ExperimentParameter.objects.filter( name=key_name, parameterset__schema=key_schema, parameterset__experiment=exp) #logger.warn("params.count() = %s" % params.count()) if params.count() >= 1: key = params[0].string_value if key == key_value: duplicate_exp = exp.id #logger.warn("found duplicate for %s" % duplicate_exp) break if duplicate_exp: logger.warn( "Found duplicate experiment form %s exp %s to exp %s" % (source, exp_id, duplicate_exp)) if got_lock: release_lock() return # TODO: Need someway of updating and existing experiment. Problem is # that copy will have different id from original, so need unique identifier # to allow matching # We have not pulled everything we need from producer and are ready to create # experiment. # Make placeholder experiment and ready metadata e = Experiment( title='Placeholder Title', approved=True, created_by=found_user, public_access=exp_state, locked=False # so experiment can then be altered. 
) e.save() # store the key #eps, was_created = ExperimentParameterSet.objects.\ # get_or_create(experiment=e, schema=key_schema) #if was_created: # logger.warn("was created") #ep, was_created = ExperimentParameter.objects.get_or_create(parameterset=eps, # name=key_name, # string_value=key_value) #if was_created: # logger.warn("was created again") #ep.save() if got_lock: release_lock() local_id = e.id filename = path.join(e.get_or_create_directory(), 'mets_upload.xml') f = open(filename, 'wb+') f.write(metsxml) f.close() # Ingest this experiment META data and isOwner ACLS eid = None try: eid, sync_path = _registerExperimentDocument(filename=filename, created_by=found_user, expid=local_id, owners=owners) logger.info('=== processing experiment %s: DONE' % local_id) except: # FIXME: what errors can mets return? msg = '=== processing experiment %s: FAILED!' \ % local_id logger.error(msg) raise MetsParseError(msg) # FIXME: if METS parse fails then we should go back and delete the placeholder experiment exp = Experiment.objects.get(id=eid) # so that tardis does not copy the data for datafile in exp.get_datafiles(): datafile.stay_remote = True datafile.save() #import nose.tools #nose.tools.set_trace() # FIXME: reverse lookup of URLs seem quite slow. # TODO: put this information into specific metadata schema attached to experiment exp.description += get_audit_message(source, exp_id) exp.save() local_ids.append(local_id) return local_ids
def run(event, context): #pull data from OAI endpoint registry = MetadataRegistry() registry.registerReader('oai_dc', oai_dc_reader) url_base = 'http://contentdm.marinlibrary.org' URL = url_base + '/oai/oai.php' client = Client(URL, registry, force_http_get=True) harvested_data = [] for record in client.listRecords(metadataPrefix='oai_dc'): if not record[0].isDeleted(): fields = record[1].getMap() fields['subjects'] = fields['subject'][0].split(';') del fields['subject'] fields['set'] = record[0].setSpec() identifier = record[0].identifier().split(':')[2] fields[ 'image_url_base'] = url_base + '/digital/iiif/' + identifier + '/' harvested_data.append(fields) es.indices.delete(index='digital_collection_recs', ignore=[400, 404]) mapping = { "mappings": { "_doc": { "properties": { "title": { "type": "text" }, "creator": { "type": "text" }, "subjects": { "type": "text" }, "description": { "type": "text" }, "publisher": { "type": "text" }, "contributor": { "type": "text" }, "date": { "type": "text" }, "type": { "type": "text", "fielddata": "true" }, "format": { "type": "text", "fielddata": "true" }, "identifier": { "type": "text" }, "source": { "type": "text" }, "language": { "type": "text", "fielddata": "true" }, "relation": { "type": "text" }, "coverage": { "type": "text" }, "rights": { "type": "text" }, "set": { "type": "text", "fielddata": "true" }, "image_url_base": { "type": "text" } } } } } es.indices.create(index='digital_collection_recs', body=mapping) helpers.bulk(es, harvested_data, index='digital_collection_recs', doc_type='_doc') return "success"
class NSDLDCImport(object): ''' Class exports the required fields from the UCAR OAI-PMH data repository using NSDL_DC. ''' def __init__(self, url, prefix=nsdl.LR_NSDL_PREFIX, reader=None, fields=None, namespaces=None, fieldMap=None): ''' Constructor ''' if fields == None: self._fields = nsdl.LR_NSDL_DC_FIELDS else: self._fields = fields if fieldMap == None: self._fieldMap = nsdl.NSDL_TO_LR_MAP else: self._fieldMap = fieldMap if namespaces == None: self._namespaces = nsdl.LR_NSDL_DC_NAMESPACES else: self._namespaces = namespaces if reader == None: reader = MetadataReader(fields=self._fields, namespaces=self._namespaces) self._url = url self._registry = MetadataRegistry() self._prefix = prefix self._registry.registerReader(prefix, reader) self._client = Client(url, self._registry) def _format(self, doc): value = {} # merge all the fields for (fieldname, fieldconfig) in self._fieldMap.items(): if fieldconfig["type"] == "const" and "const" in fieldconfig: value[fieldname] = fieldconfig["const"] elif fieldconfig["type"] == "[string]" and len( fieldconfig["fields"]) > 0: value[fieldname] = [] for field in fieldconfig["fields"]: value[fieldname].extend(doc.getField(field)) elif fieldconfig["type"] == "string" and len( fieldconfig["fields"]) > 0: value[fieldname] = "" for field in fieldconfig["fields"]: value[fieldname] += ", ".join(doc.getField(field)) elif fieldconfig["type"] == "boolean" and len( fieldconfig["fields"]) > 0: value[fieldname] = True for field in fieldconfig["fields"]: value[fieldname] &= doc.getField(field) return value def fetch_documents(self, range=10000): return_stuff = [] for record in self._client.listRecords(metadataPrefix=self._prefix): r = record[1] value = self._format(r) if value != None: return_stuff.append(value) if len(return_stuff) >= range: yield return_stuff return_stuff = []
def run(self): # Check that ElasticSearch is alive self.check_index() # If the user specified the --REBUILD flag, recreate the index if self.options['rebuild']: self.rebuild_index() # Connect to the repository registry = MetadataRegistry() registry.registerReader(self.settings["metadata_format"], self.settings["metadata_reader"]) client = Client(self.settings["uri"], registry) identity = client.identify() print "Connected to repository: %s" % identity.repositoryName() # got to update granularity or we barf with: # oaipmh.error.BadArgumentError: Max granularity is YYYY-MM-DD:2003-04-10T00:00:00Z client.updateGranularity() # Initialise some variables batcher = Batch.Batch() total_records = 0 start = time.time() # Now do the synchonisation # If the user specified an identifier, then synchronise this record if (self.options['identifier'] is not None): total_records += self.synchronise_record( client, batcher, self.options['identifier']) else: # Else, synchronise using the date-range provided by the user, or failing that, # the date-range based on the last sync # Get the synchronisation config record synchronisation_config = self.get_synchronisation_config() if self.options["from_date"] is not None: # If the user specified a from-date argument, use it from_date = self.options[ "from_date"] # already a date (not a datetime) elif synchronisation_config is not None and "to_date" in synchronisation_config: # Else read the last synchronised to_date from the config, and add on a day from_date = dateutil.parser.parse( synchronisation_config["to_date"]).date() + timedelta( days=1) else: # Else use the default_from_date in the config from_date = dateutil.parser.parse( self.settings['default_from_date']).date() if self.options["to_date"] is not None: to_date = self.options[ "to_date"] # already a date (not a datetime) else: to_date = (date.today() - timedelta(days=1)) # Force the from_date to use time 00:00:00 from_date = datetime.combine( from_date, _time(hour=0, minute=0, second=0, microsecond=0)) # Force the to_date to use time 23:59:59 to_date = datetime.combine( to_date, _time(hour=23, minute=59, second=59, microsecond=0)) print "Synchronising from %s - %s" % (from_date, to_date) while from_date < to_date: next_date = datetime.combine( from_date.date() + timedelta(days=(self.settings['delta_days'] - 1)), _time(hour=23, minute=59, second=59, microsecond=0)) number_of_records = self.synchronise_period( client, batcher, from_date, next_date) batcher.clear() #Store the records in elasticsearch self.put_synchronisation_config(from_date, next_date, number_of_records) from_date += timedelta(days=(self.settings['delta_days'])) total_records += number_of_records # Pause so as not to get banned. to = 20 print "Sleeping for %i seconds so as not to get banned." % to time.sleep(to) # Store the records in the index batcher.clear() # Print out some statistics time_spent = time.time() - start print 'Total time spent: %d seconds' % (time_spent) if time_spent > 0.001: # careful as its not an integer print 'Total records synchronised: %i records (%d records/second)' % ( total_records, (total_records / time_spent)) else: print 'Total records synchronised: %i records' % (total_records) return total_records sys.exit()
#aka oaijson
import sys

from oaipmh.client import Client
from oaipmh.metadata import MetadataRegistry, oai_dc_reader
import simplejson as json
import couchdb

server = couchdb.Server()
db = server['dcat']

URL = 'http://cardinalscholar.bsu.edu/cgi/oai2'

registry = MetadataRegistry()
registry.registerReader('oai_dc', oai_dc_reader)

client = Client(URL, registry)
records = client.listRecords(metadataPrefix='oai_dc')

i = 0
for hdr, metadata, _ in records:
    i = i + 1
    print hdr.identifier()
    print hdr.datestamp()
    doc = metadata.getMap()  # renamed from `map` to avoid shadowing the builtin
    doc.update({'cdmcollection': 'cardinalscholar'})
    db.save(doc)
    print 'saved ' + str(i)
return def get_bool(prompt): while True: try: return {"true": True, "false": False}[input(prompt).lower()] except KeyError: print("Invalid input please enter True or False!") # init registry = MetadataRegistry() registry.registerReader('marc21', MarcXML) client = Client(URL, registry) start = valid_date(start_date) stop = valid_date(stop_date) # main while start < stop: from_date = start start = start + timedelta(days=1) # increase days one by one until_date = start try: records = client.listRecords(metadataPrefix='marc21', set='SKC', from_=from_date, until=until_date)
from oaipmh.client import Client from oaipmh.metadata import MetadataRegistry, oai_dc_reader from PdfParser.PdfParser import Preprocessing import json import bz2 from numpy import record import pandas as pd baseurl = 'http://export.arxiv.org/oai2?' corpuspath = '/Users/goksukara/Desktop/Projects/EclipseWorkspace/Specilization/PhytonCode/Data/corpus.csv' if __name__ == "__main__": url = baseurl registry = MetadataRegistry() registry.registerReader('oai_dc', oai_dc_reader) client = Client(url, registry) record = client.listSets() for word in record: print(word) #Write to file #with bz2.BZ2File('out.json', 'wb') as outfile: for record in client.listRecords(metadataPrefix='oai_dc', set='cs'): header, metadata, _ = record doc = {} #Extract identifier #doc["id"] = header.identifier() #Extract title and other metadata doc["title"] = "\n".join(metadata["title"]) doc["abstract"] = "\n".join(metadata["description"]) #doc["authors"] = metadata["creator"]
class OaiPaperSource(PaperSource): # TODO: this should not inherit from PaperSource """ A paper source that fetches records from the OAI-PMH proxy (typically: proaixy). It uses the ListRecord verb to fetch records from the OAI-PMH source. Each record is then converted to a :class:`BarePaper` by an :class:`OaiTranslator` that handles the format the metadata is served in. """ def __init__(self, endpoint, day_granularity=False, *args, **kwargs): """ This sets up the paper source. :param endpoint: the address of the OAI-PMH endpoint to fetch from. :param day_granularity: should we use day-granular timestamps to fetch from the proxy or full timestamps (default: False, full timestamps) See the protocol reference for more information on timestamp granularity: https://www.openarchives.org/OAI/openarchivesprotocol.html """ super(OaiPaperSource, self).__init__(*args, **kwargs) self.registry = MetadataRegistry() self.registry.registerReader('oai_dc', oai_dc_reader) self.registry.registerReader('base_dc', base_dc_reader) self.registry.registerReader('citeproc', citeproc_reader) self.client = Client(endpoint, self.registry) self.client._day_granularity = day_granularity if settings.PROAIXY_API_KEY: self.client.extra_parameters = { 'key': settings.PROAIXY_API_KEY} self.translators = {} # Translator management def add_translator(self, translator): """ Adds the given translator to the paper source, so that we know how to translate papers in the given format. The paper source cannot hold more than one translator per OAI format (it decides what translator to use solely based on the format) so if there is already a translator for that format, it will be overriden. """ self.translators[translator.format()] = translator # Record ingestion def ingest(self, from_date=None, metadataPrefix='any', resumptionToken=None): """ Main method to fill Dissemin with papers! :param from_date: only fetch papers modified after that date in the proxy (useful for incremental fetching) :param metadataPrefix: restrict the ingest for this metadata format """ args = {'metadataPrefix':metadataPrefix} if from_date: args['from_'] = from_date if resumptionToken: args['resumptionToken'] = resumptionToken records = self.client.listRecords(**args) self.process_records(records) def create_paper_by_identifier(self, identifier, metadataPrefix): """ Queries the OAI-PMH proxy for a single paper. :param identifier: the OAI identifier to fetch :param metadataPrefix: the format to use (a translator has to be registered for that format, otherwise we return None with a warning message) :returns: a Paper or None """ record = self.client.getRecord( metadataPrefix=metadataPrefix, identifier=identifier) return self.process_record(record[0], record[1]._map) # Record search utilities def listRecords_or_empty(self, source, *args, **kwargs): """ pyoai raises :class:`NoRecordsMatchError` when no records match, we would rather like to get an empty list in that case. """ try: return source.listRecords(*args, **kwargs) except NoRecordsMatchError: return [] def process_record(self, header, metadata): """ Saves the record given by the header and metadata (as returned by pyoai) into a Paper, or None if anything failed. 
""" translator = self.translators.get(header.format()) if translator is None: print("Warning: unknown metadata format %s, skipping" % header.format()) return paper = translator.translate(header, metadata) if paper is not None: try: with transaction.atomic(): saved = Paper.from_bare(paper) return saved except ValueError as e: print "Ignoring invalid paper:" print header.identifier() print e def process_records(self, listRecords): """ Save as :class:`Paper` all the records contained in this list """ # check that we have at least one translator, otherwise # it's not really worth trying… if not self.translators: raise ValueError("No OAI translators have been set up: " + "We cannot save any record.") last_report = datetime.now() processed_since_report = 0 for record in listRecords: header = record[0] metadata = record[1]._map self.process_record(header, metadata) # rate reporting processed_since_report += 1 if processed_since_report >= 1000: td = datetime.now() - last_report rate = 'infty' if td.seconds: rate = unicode(processed_since_report / td.seconds) print("current rate: %s records/s" % rate) processed_since_report = 0 last_report = datetime.now()
import sys

from oaipmh.client import Client
from oaipmh.metadata import MetadataRegistry, oai_dc_reader

URL = sys.argv[1]
METADATA_PREFIX = sys.argv[2]
if len(sys.argv) == 4:
    SETSPEC = sys.argv[3]
else:
    SETSPEC = None

registry = MetadataRegistry()
registry.registerReader('oai_dc', oai_dc_reader)
registry.registerReader(METADATA_PREFIX, oai_dc_reader)
client = Client(URL, registry)

record_count = 0
deleted_count = 0

if SETSPEC:
    records = client.listRecords(metadataPrefix=METADATA_PREFIX, set=SETSPEC)
else:
    records = client.listRecords(metadataPrefix=METADATA_PREFIX)

for num, record in enumerate(records):
    record_count += 1
    delinfo = ''
    if record[0].isDeleted():
        deleted_count += 1
        delinfo = '(deleted)'
def get_names (dataname): record_prefix = "rdf:RDF/edm:ProvidedCHO" edm_reader = MetadataReader( fields={ 'objectId': ('textList', record_prefix + '/@rdf:about'), 'spatial': ('textList', record_prefix + '/dcterms:spatial/text()'), }, namespaces={ 'oai_dc': 'http://www.openarchives.org/OAI/2.0/oai_dc/', 'dc':'http://purl.org/dc/elements/1.1/', 'dcterms':'http://purl.org/dc/terms/', 'dct': 'http://purl.org/dc/terms/', 'edm' : 'http://www.europeana.eu/schemas/edm/', 'foaf': 'http://xmlns.com/foaf/0.1/', 'owl' : 'http://www.w3.org/2002/07/owl#', 'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'rdfs': 'http://www.w3.org/2000/01/rdf-schema#', 'skos': 'http://www.w3.org/2004/02/skos/core#', 'xsi' : 'http://www.w3.org/2001/XMLSchema-instance', 'ore': 'http://www.openarchives.org/ore/terms/' } ) dictnames={} identifier=[] if __name__ == "__main__": URL = 'https://data.jhn.ngo/oai' registry = MetadataRegistry() registry.registerReader('edm', edm_reader) client = Client(URL, registry ) k = 0 for record in client.listRecords(metadataPrefix='edm' , set= dataname ): output = record[1].getMap() k = k + 1 print(k) if output['spatial'] !=[]: if len(output['spatial']) ==1: if len(output['spatial'][0])>3: if [output['spatial'][0],output['objectId'][0]] not in identifier: identifier.append([output['spatial'][0],output['objectId'][0]]) if output['spatial'][0] not in dictnames.keys(): key = output['spatial'][0] dictnames.setdefault(key,[]) dictnames[key].append(output['objectId'][0]) else: key = output['spatial'][0] dictnames[key].append(output['objectId'][0]) else: for j in range (0,len(output['spatial'])): if len(output['spatial'][j])>3: if [output['spatial'][j],output['objectId'][0]] not in identifier: identifier.append([output['spatial'][j],output['objectId'][0]]) if output['spatial'][j] not in dictnames.keys(): key = output['spatial'][j] dictnames.setdefault(key,[]) dictnames[key].append(output['objectId'][0]) else: key = output['spatial'][j] dictnames[key].append(output['objectId'][0]) #print (identifier) return dictnames
def get_names(dataname): record_prefix = "rdf:RDF/edm:ProvidedCHO" # Modidy/add Xpath mappings to get other fields and other objects (agent, place etc) edm_reader = MetadataReader( fields={ 'title': ('textList', record_prefix + '/dc:title/text()'), 'creator': ('textList', record_prefix + '/dc:creator/text()'), 'subject': ('textList', record_prefix + '/dc:subject/text()'), 'description': ('textList', record_prefix + '/dc:description/text()'), 'publisher': ('textList', record_prefix + '/dc:publisher/text()'), 'contributor': ('textList', record_prefix + '/dc:contributor/text()'), 'date': ('textList', record_prefix + '/dc:date/text()'), 'type': ('textList', record_prefix + '/dc:type/text()'), 'format': ('textList', record_prefix + '/dc:format/text()'), 'identifier': ('textList', record_prefix + '/dc:identifier/text()'), 'source': ('textList', record_prefix + '/dc:source/text()'), 'language': ('textList', record_prefix + '/dc:language/text()'), 'relation': ('textList', record_prefix + '/dc:relation/text()'), 'coverage': ('textList', record_prefix + '/dc:coverage/text()'), 'rights': ('textList', record_prefix + '/dc:rights/text()'), 'spatial': ('textList', record_prefix + '/dc:spatial/text()'), 'objectId': ('textList', record_prefix + '/@rdf:about'), }, namespaces={ 'oai_dc': 'http://www.openarchives.org/OAI/2.0/oai_dc/', 'dc': 'http://purl.org/dc/elements/1.1/', 'dcterms': 'http://purl.org/dc/terms/', 'dct': 'http://purl.org/dc/terms/', 'edm': 'http://www.europeana.eu/schemas/edm/', 'foaf': 'http://xmlns.com/foaf/0.1/', 'owl': 'http://www.w3.org/2002/07/owl#', 'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'rdfs': 'http://www.w3.org/2000/01/rdf-schema#', 'skos': 'http://www.w3.org/2004/02/skos/core#', 'xsi': 'http://www.w3.org/2001/XMLSchema-instance', 'ore': 'http://www.openarchives.org/ore/terms/' }) names = [] identifier = [] if __name__ == "__main__": URL = 'https://data.jhn.ngo/oai' registry = MetadataRegistry() registry.registerReader('edm', edm_reader) client = Client(URL, registry) # To harvest specific dataset, use "set" parameter: set='AIUJE1_MARC21' k = 0 for record in client.listRecords(metadataPrefix='edm', set=dataname): output = record[1].getMap() k = k + 1 print(k) if output['creator'] != []: if len(output['creator']) == 1: names.append([output['creator'][0]]) identifier.append( [output['creator'][0], output['objectId'][0]]) else: for j in range(0, len(output['creator'])): names.append([output['creator'][j]]) identifier.append( [output['creator'][j], output['objectId'][0]]) if output['contributor'] != []: if len(output['contributor']) == 1: names.append([output['contributor'][0]]) identifier.append( [output['contributor'][0], output['objectId'][0]]) else: for s in range(0, len(output['contributor'])): names.append([output['contributor'][s]]) identifier.append( [output['contributor'][s], output['objectId'][0]]) print(names) return identifier
def add_provider(cxn, args): """Add a new provider to the registry database. Process ``args`` to add a new provider to the registry database. Return 0 for success, 1 for failure (error message should be logged). ``cxn`` => instance of ``sqlite3.Connection`` ``args`` => instance of ``argparse.Namespace`` """ global logger, MAX_NAME_LENGTH addlogger = logger.getChild('add') # Validate name if len(args.name) > MAX_NAME_LENGTH: addlogger.critical('Short name for new provider must be no more than ' '{0} characters long'.format(MAX_NAME_LENGTH)) return 1 elif args.name.startswith(('http://', 'https://')) or args.name == 'all': addlogger.critical('Short name for new provider may not be "all" nor ' 'may it begin "http://" or "https://"') return 1 # Try to create row now to avoid unnecessary validation if duplicate try: cxn.execute( "INSERT INTO providers(name, lastHarvest) values " "(?, ?)", (args.name, datetime.fromtimestamp(0))) except sqlite3.IntegrityError: addlogger.critical('Unable to add provider "{0}"; ' 'provider with this name already exists' ''.format(args.name)) return 1 else: addlogger.info('Adding provider "{0}"'.format(args.name)) # Get any missing information # Base URL if args.url is None: args.url = raw_input('Base URL: '.ljust(20)) if not args.url: addlogger.critical('Base URL for new provider not supplied') return 1 # Set up an OAI-PMH client for validating providers md_registry = MetadataRegistry() md_registry.registerReader('oai_dc', oai_dc_reader) client = Client(args.url, md_registry) # Validate Base URL by fetching Identify try: client.identify() except (XMLSyntaxError, HTTPError): addlogger.critical('Base URL for new provider does not return a valid ' 'response to an `Identify` request') return 1 # Destination if args.dest is None: args.dest = raw_input('Destination directory: '.ljust(20)) if args.dest: # Expand user dir args.dest = os.path.expanduser(args.dest) else: addlogger.info('Destination for data for new provider not supplied' ' using default `pwd`: {0}'.format(os.getcwd())) args.dest = os.getcwd() # metadataPrefix # Check that selected metadataPrefix is available from provider # Fetch list of available formats mdps = dict( (mdpinfo[0], mdpinfo[1:]) for mdpinfo in client.listMetadataFormats()) while args.metadataPrefix not in mdps: print "Available metadataPrefix values:" # List available formats for mdp in mdps: print mdp, '-', mdps[mdp][1] args.metadataPrefix = raw_input('metadataPrefix [oai_dc]:'.ljust(20)) if not args.metadataPrefix: addlogger.info('metadataPrefix for new provider not supplied. ' 'using default: oai_dc') args.metadataPrefix = 'oai_dc' cxn.execute( "UPDATE providers SET " "url=?, " "destination=?, " "metadataPrefix=? " "WHERE name=?", (args.url, args.dest, args.metadataPrefix, args.name)) addlogger.info('URL for next harvest: {0}?verb=ListRecords' '&metadataPrefix={1}' '&from={2:%Y-%m-%dT%H:%M:%SZ%z}' ''.format(args.url, args.metadataPrefix, datetime.fromtimestamp(0))) # All done, commit database cxn.commit() return 0
def identifiy(target):
    if target is not None:
        client = Client(target['url'], registry)
        identify = client.identify()
        return convert_identifiy(identify)
return handler.records[0] marcxml_reader = MARCXMLReader() registry = metadata.MetadataRegistry() registry.registerReader('marc21', marcxml_reader) g = Graph() g.namespace_manager.bind('skos', SKOS) g.namespace_manager.bind('cn', CN) g.namespace_manager.bind('dc', DC) g.namespace_manager.bind('dct', DCT) g.namespace_manager.bind('rdaa', RDAA) g.namespace_manager.bind('rdac', RDAC) oai = Client('https://fennica.linneanet.fi/cgi-bin/oai-pmh-fennica-asteri-aut.cgi', registry) #recs = oai.listRecords(metadataPrefix='marc21', set='corporateNames', from_=datetime(2016,2,10)) recs = oai.listRecords(metadataPrefix='marc21', set='corporateNames') lang_cache = {} if os.path.exists(LANG_CACHE_FILE): lcf = codecs.open(LANG_CACHE_FILE, 'r', 'utf-8') for line in lcf: lang, text = line.rstrip("\r\n").split("\t") if lang == '': lang = None lang_cache[text] = lang lcf.close() label_to_uri = {}
    return handler.records[0]

marcxml_reader = MARCXMLReader()

# Defining of metadata Readers in the Registry
from oaipmh import metadata

registry = metadata.MetadataRegistry()
registry.registerReader('oai_dc', metadata.oai_dc_reader)
registry.registerReader('marc21', marcxml_reader)

#### OAI-PMH Client processing

oai = Client('http://snape.mzk.cz/OAI-script', registry)

id = oai.identify()
print id.repositoryName()
print id.adminEmails()
print id.baseURL()

formats = oai.listMetadataFormats()
pprint(formats)  # 'marc21'

sets = oai.listSets()
for s in sets:
    print s
'abstract': ('textList', 'oai_dc:dc/dc:description/text()'), 'date': ('textList', 'oai_dc:dc/dc:date/text()'), }, namespaces={ 'oai_dc': 'http://www.openarchives.org/OAI/2.0/oai_dc/', 'dc': 'http://purl.org/dc/elements/1.1/' }) # And create a registry for parsing the oai info, linked to the reader registry = MetadataRegistry() registry.registerReader('oai_dc', oai_dc_reader) # arXiv OAI url we will query URL = "http://export.arxiv.org/oai2" # Create OAI client; now we're all set for listing some records client = Client(URL, registry) # Open files for writing titlef = open(title_file, 'w') #abstractf = open(abstr_file, 'w') # Keep track of run-time and number of papers start_time = time.time() count = 0 # Harvest for record in client.listRecords(metadataPrefix='oai_dc', set=section): try: # Extract the title title = record[1].getField('title')[0] # Extract the abstract
def oai_metadata(oai_endpoint):
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(oai_endpoint, registry)
    return make_graphs(client.listRecords(metadataPrefix='oai_dc'))
def oaiSpider(subject="hep-ex", section="physics", start=None, end=None, sleep_time = 0): ''' Pull articles using the Open Archives Initiative protocol subject - String defining the subset of the main section section - String defining the main section (typically physics or nothing) start - A datetime.datetime object restricting the starting date of returned articles end - A datetime.datetime object restricting the ending date of the returned articles sleep_time - A number specifying how many ms to wait between the record queries Examples oaiSpider("hep-ex", "physics") ==> returns all HEP experiment articles oaiSpider("cs", "", datetime(2011,06,24)) ==> returns all computer science articles submitted after June 24th, 2011 oaiSpider("hep-ph", "physics", None, datetime(2011,06, 24)) ==> returns all HEP phenomenology articles submitted before June 24th, 2011 Returns a list of dictionaries containing the article metadata ''' from oaipmh.client import Client from oaipmh.metadata import MetadataRegistry, oai_dc_reader base_url = "http://export.arxiv.org/oai2" output = [] registry = MetadataRegistry() registry.registerReader('oai_dc', oai_dc_reader) client = Client(base_url, registry) client.updateGranularity() if section == None: section = "" if len(section) > 0 and section[-1] != ":": section += ":" # sets = client.listSets() # for entry in sets: # print entry ### OAIPMH module sucks donkey balls # Causes some error when I use the from_ or until keys records = client.listRecords(metadataPrefix='oai_dc' , set='%s%s' % (section, subject) , from_=start #, from_=datestamp , until=end ) counter = 0 for (header, metadata, aux) in records: print counter # for key in metadata._map.keys(): # print key, metadata[key] output.append({"title" : cleanText(metadata["title"][0]), "abstract" : cleanText(metadata["description"][0]), "date" : convertDate(max(metadata["date"])), "subject" : subject, "url" : metadata["identifier"][0], "authors" : "; ".join( metadata['creator']), }) print output[-1] counter += 1 # break # if counter > 15: # break time.sleep(sleep_time) return output
"""Retrieve arXiv abstracts with the OAI-PMH protocol The parser is messed up: it returns empty lists for the metadata tags; Do not use it""" from oaipmh.client import Client from oaipmh.metadata import MetadataRegistry, oai_dc_reader URL = 'http://export.arxiv.org/oai2' registry = MetadataRegistry() registry.registerReader('arXiv', oai_dc_reader) client = Client(URL, registry) counter = 0 for record in client.listRecords(metadataPrefix='arXiv', set='physics:hep-ex'): if counter == 150: break # print record[0].datestamp() # print record[0].identifier() # print record[1].getField(name='title') print(vars(record[1])) counter += 1 # for a_set in client.listSets(): # # if counter == 50: # # break # print a_set # counter += 1