def __init__(self, endpoint, day_granularity=False, *args, **kwargs):
    """
    Set up the paper source.

    :param endpoint: the address of the OAI-PMH endpoint to fetch from.
    :param day_granularity: should we use day-granular timestamps
        to fetch from the proxy or full timestamps (default: False,
        full timestamps)

    See the protocol reference for more information on timestamp
    granularity:
    https://www.openarchives.org/OAI/openarchivesprotocol.html
    """
    super(OaiPaperSource, self).__init__(*args, **kwargs)

    # One reader per metadata format we know how to parse.
    self.registry = MetadataRegistry()
    for prefix, reader in (('oai_dc', oai_dc_reader),
                           ('base_dc', base_dc_reader),
                           ('citeproc', citeproc_reader)):
        self.registry.registerReader(prefix, reader)

    self.client = Client(endpoint, self.registry)
    self.client._day_granularity = day_granularity
    # Authenticate against the proxy when an API key is configured.
    if settings.PROAIXY_API_KEY:
        self.client.extra_parameters = {'key': settings.PROAIXY_API_KEY}
    self.translators = {}
def __init__(self, oaisource, day_granularity=False, *args, **kwargs):
    """
    Set up the paper source.

    :param oaisource: the OAISource to fetch from.
    :param day_granularity: should we use day-granular timestamps
        to fetch from the proxy or full timestamps (default: False,
        full timestamps)

    See the protocol reference for more information on timestamp
    granularity:
    https://www.openarchives.org/OAI/openarchivesprotocol.html
    """
    super(OaiPaperSource, self).__init__(*args, **kwargs)

    # Refuse to build a client without a configured endpoint.
    if not oaisource.endpoint:
        raise ValueError(
            'No OAI endpoint was configured for this OAI source.')

    self.registry = MetadataRegistry()
    for prefix, reader in (('oai_dc', oai_dc_reader),
                           ('base_dc', base_dc_reader)):
        self.registry.registerReader(prefix, reader)

    self.client = Client(oaisource.endpoint, self.registry)
    self.client._day_granularity = day_granularity
    # One translator per supported metadata format.
    self.translators = {
        'oai_dc': OAIDCTranslator(oaisource),
        'base_dc': BASEDCTranslator(oaisource),
    }
def __init__(self, url, prefix=nsdl.LR_NSDL_PREFIX, reader=None,
             fields=None, namespaces=None, fieldMap=None):
    '''
    Constructor.

    :param url: base URL of the OAI-PMH endpoint.
    :param prefix: metadataPrefix to register the reader under.
    :param reader: optional pre-built metadata reader; when None one is
        constructed from the effective fields/namespaces.
    :param fields: XPath field map (default: nsdl.LR_NSDL_DC_FIELDS).
    :param namespaces: XML namespace map (default: nsdl.LR_NSDL_DC_NAMESPACES).
    :param fieldMap: NSDL-to-LR field mapping (default: nsdl.NSDL_TO_LR_MAP).
    '''
    # Use `is None` (not `== None`) so callers may pass objects with
    # custom __eq__ and still get the intended behaviour.
    self._fields = nsdl.LR_NSDL_DC_FIELDS if fields is None else fields
    self._fieldMap = nsdl.NSDL_TO_LR_MAP if fieldMap is None else fieldMap
    self._namespaces = (nsdl.LR_NSDL_DC_NAMESPACES
                        if namespaces is None else namespaces)
    if reader is None:
        reader = MetadataReader(fields=self._fields,
                                namespaces=self._namespaces)
    self._url = url
    self._registry = MetadataRegistry()
    self._prefix = prefix
    self._registry.registerReader(prefix, reader)
    self._client = Client(url, self._registry)
def setUp(self):
    """Parse the DSpace METS fixture and read it through the registry."""
    registry = MetadataRegistry()
    registry.registerReader('mets', dspace_mets_reader)
    self.registry = registry

    # Fixture document lives next to this test module.
    fixture = os.path.join(os.path.dirname(__file__), 'dspace_mets.xml')
    self.element = etree.parse(fixture).getroot()
    self.item = self.registry.readMetadata('mets', self.element)
def harvest(url):
    """Harvest all record identifiers from the OAI-PMH endpoint at *url*
    and write them, one per line, to ``philarchive-2.txt`` next to this
    module.

    :param url: base URL of the OAI-PMH repository.
    """
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(url, registry)
    # Some records contain characters that are invalid XML; skip them
    # instead of aborting the whole harvest.
    client.ignoreBadCharacters(true_or_false=True)

    identifiers = []
    for header in client.listIdentifiers(metadataPrefix='oai_dc'):
        print(f"Found identifier {header.identifier()}")
        identifiers.append(header.identifier())
    print(f"Total number of identifiers: {len(identifiers)}")

    # Only keep the identifier string at the end of the url.
    identifiers = [x.split('/')[-1] for x in identifiers]

    dirname = os.path.dirname(__file__)
    filename = os.path.join(dirname, 'philarchive-2.txt')
    with open(filename, 'w') as f:
        # BUG FIX: the f-string previously contained no placeholder and
        # printed a literal "(unknown)"; report the actual target path.
        print(f"Writing to {filename}")
        f.writelines('\n'.join(identifiers))
def __init__(self, url):
    """Create an OAI-PMH client for the ZORA endpoint and preload the
    institute/resource-type caches."""
    metadata_registry = MetadataRegistry()
    metadata_registry.registerReader(ZoraAPI.METADATA_PREFIX, oai_dc_reader)
    self.client = Client(url, metadata_registry)
    self.institutes = {}
    self.resource_types = []
    self.load_institutes_and_types()
def init(user):
    """Build and return an OAI-PMH client for the community *user*."""
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(URL + user, registry)
    logging.info('The community %s harvested', user)
    return client
def __init__(self, configuration_file):
    """Constructor: load the OAI configuration and build the client."""
    self.oai_config = ConfigParser.SafeConfigParser()
    self.oai_config.read(configuration_file)
    self.current_config = 'ToulouseBis'

    metadata_registry = MetadataRegistry()
    metadata_registry.registerReader('oai_dc', oai_dc_reader)
    # Endpoint URL comes from the configuration file.
    self.client = Client(self._get_config_value('url'), metadata_registry)
def _create_metadata_registry(self):
    """Return a MetadataRegistry with every reader this harvester supports."""
    readers = (
        ('oai_dc', oai_dc_reader),
        ('oai_ddi', oai_ddi_reader),
        ('dif', dif_reader2),          # TODO: Change back?
        ('datacite', datacite_reader),  # HDR
    )
    registry = MetadataRegistry()
    for prefix, reader in readers:
        registry.registerReader(prefix, reader)
    return registry
def test(request):
    """Django view: issue an OAI-PMH ``Identify`` request against the
    kulturarv.dk repository and return its repository name as the
    response body."""
    URL = 'http://www.kulturarv.dk/ffrepox/OAIHandler'
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(URL, registry)
    identifyResponse = client.identify()
    # Debugging aid: dump the attributes of the Identify response.
    print dir(identifyResponse)
    #for record in client.listRecords(metadataPrefix='oai_dc'):
    #    result += record
    return HttpResponse(identifyResponse.repositoryName())
def _registerReader(metadata_format): """ """ #TODO, check namespaces if metadata_format in ("metashare", "cmdi", "olac"): metadata_registry = MetadataRegistry() metadata_registry.registerReader(metadata_format, Reader()) return metadata_registry else: raise NotImplementedError("The %s metadata format is " \ "currently not supported." % metadata_format)
def clean(self):
    """Validate the form: the base URL must answer an OAI-PMH Identify
    request. On success the repository name is stored in cleaned_data
    so it is applied to the model instance when saved.

    :raises ValidationError: when the base URL is unreachable or does
        not behave like an OAI-PMH endpoint.
    """
    cleaned_data = super(CreateRepositoryForm, self).clean()
    try:
        registry = MetadataRegistry()
        registry.registerReader('oai_dc', oai_dc_reader)
        client = Client(cleaned_data.get('base_url'), registry)
        server = client.identify()
        # set the repository name apply to model instance when saved.
        cleaned_data['name'] = server.repositoryName()
    # BUG FIX: a bare `except:` also swallowed SystemExit and
    # KeyboardInterrupt; narrow it to Exception.
    except Exception:
        raise ValidationError('Repository base url is invalid.')
    return cleaned_data
def index_documents(main_url, database_name, url, reader, prefix, format):
    """Harvest every record from the OAI endpoint at *url* and push the
    formatted results to *main_url*/*database_name* in batches.

    :param main_url: base URL of the sync target.
    :param database_name: target database name.
    :param url: OAI-PMH endpoint to harvest.
    :param reader: metadata reader to register for *prefix*.
    :param prefix: metadataPrefix to request.
    :param format: callable(record_metadata, identifier) -> document or
        None (None results are skipped).
    """
    registry = MetadataRegistry()
    registry.registerReader(prefix, reader)
    client = Client(url, registry)
    return_stuff = []
    for record in client.listRecords(metadataPrefix=prefix):
        header, metadata = record[0], record[1]
        value = format(metadata, header.identifier())
        # Idiom fix: `is not None` instead of `!= None`.
        if value is not None:
            return_stuff.append(value)
        # Flush in batches so memory stays bounded on large repositories.
        if len(return_stuff) >= 10000:
            sync_files(main_url, database_name, return_stuff)
            return_stuff = []
    # Flush whatever remains after the last full batch.
    sync_files(main_url, database_name, return_stuff)
def scrape(start=START, end=END, set=SET_THESIS, type='Thesis'):
    """
    Create an OAI-PMH client, gather metadata and output it.

    Harvests qdc records between *start* and *end* from the set *set*,
    appends one dict per record to the module-level RECORDS list, and
    optionally pickles RECORDS to options.store.
    NOTE(review): depends on module globals URL, RECORDS and options.
    """
    total = num = 0
    msg = "Fetching records between " + str(start) + " and " + str(end)
    sys.stderr.write(msg + "\n")
    #
    # Set up metadata readers
    #
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    registry.registerReader('qdc', qdc_reader)
    # registry.registerReader('rdf', rdf_reader)   # no reader yet
    # registry.registerReader('ore', ore_reader)   # no reader yet
    # registry.registerReader('mets', mets_reader) # no reader yet
    client = Client(URL, registry)
    records = client.listRecords(metadataPrefix='qdc', from_=start,
                                 until=end, set=set)
    for (h, m, a) in records:
        print h, m, a
        # "o" marks a record with no metadata payload.
        if not m:
            sys.stderr.write("o")
            continue
        total = total + 1
        handle = m.getField('identifier')
        if not handle:
            sys.stderr.write("Record without a handle.\n")
            continue
        # Collect every qdc field declared by the reader.
        r = dict({'handle': handle[0]})
        for key in qdc_reader._fields.keys():
            r[key] = m.getField(key)
        RECORDS.append(r)
        sys.stderr.write('.')
        sys.stderr.flush()
        num = num + 1
    msg = "\nCollected " + str(num) + " records, out of " + str(total)
    sys.stderr.write('\n' + msg + '\n')
    if options.store:
        pickle.dump(RECORDS, open(options.store, "wb"))
def _create_metadata_registry(self):
    """Build the metadata registry, choosing readers from the configured
    format (self.md_format) and application profile (self.md_application)."""
    registry = MetadataRegistry()
    epos_iso = self.md_format == 'iso19139' and self.md_application == 'EPOS'
    ilab_datacite = (self.md_format == 'datacite'
                     and self.md_application == 'ILAB')
    if epos_iso:
        registry.registerReader(self.md_format, iso19139_reader)
        log.debug('Format -> iso19139')
    elif ilab_datacite:
        registry.registerReader(self.md_format, datacite_ilab)
        log.debug('->datacite ILAB reader')
    else:
        # Generic fallback: register every general-purpose reader.
        registry.registerReader('oai_dc', oai_dc_reader)
        registry.registerReader('oai_ddi', oai_ddi_reader)
        registry.registerReader('dif', dif_reader2)
    return registry
def setUp(self):
    """Load the two fixture books and build an OAI XML tree server."""
    super(BookMetadataTest, self).setUp()
    fixtures_dir = path.dirname(__file__)
    self.book = models.Book.from_xml_file(
        path.join(fixtures_dir, 'files/lubie-kiedy-kobieta.xml'))
    self.book2 = models.Book.from_xml_file(
        path.join(fixtures_dir, 'files/antygona.xml'))
    mr = MetadataRegistry()
    self.catalogue = Catalogue(mr)
    mr.registerWriter('oai_dc', oai_dc_writer)
    nsmap = {'oai_dc': NS_OAIDC, 'dc': NS_DC, 'xsi': NS_XSI}
    self.xml = XMLTreeServer(self.catalogue, mr, nsmap)
def update(self, from_date=None):
    """Harvest the OAI endpoint and yield the id of every record that
    was newly processed.

    An empty harvest (NoRecordsMatchError) is not an error and is
    silently ignored.
    """
    self._log.info('Harvesting oai server: %s' % self._url)
    registry = MetadataRegistry()
    # Identity reader: hand the raw element straight to _process_record.
    registry.registerReader(self._prefix, lambda el: el)
    client = Client(self._url, registry)
    try:
        records = client.listRecords(metadataPrefix=self._prefix,
                                     from_=from_date)
        for header, element, about in records:
            if self._process_record(header, element):
                yield self._get_id(header)
    except NoRecordsMatchError:
        pass
    super(OAIBasedContentProvider, self).update()
def get_client(url, transforms):
    """Build an OAI client for *url* with a reader derived from
    *transforms*; returns (client, metadata namespace prefix).

    Each transform supplies a 'field' name and an XPath 'path'.
    """
    transforms = fix_transforms(transforms)
    registry = MetadataRegistry()
    c = Client(url, registry)
    metadata = c.listMetadataFormats()
    # NOTE(review): the first advertised format is overwritten with a
    # hard-coded 'fbb' format -- presumably the server misreports it;
    # confirm against the endpoint.
    metadata[0] = [
        'fbb',
        'http://www.kulturarv.dk/fbb/fbb.xsd',
        'http://www.kulturarv.dk/fbb'
    ]
    # prefix -> namespace URI for every advertised format.
    namespaces = dict((x[0], x[2]) for x in metadata)
    # field name -> ('textList', xpath) reader spec.
    fields = dict((transform['field'], ('textList', transform['path']))
                  for transform in transforms)
    namespace = metadata[0][0]
    print namespaces, fields
    registry.registerReader(
        namespace, MetadataReader(fields=fields, namespaces=namespaces))
    return c, namespace
def processItems():
    """Harvest every record from the Open Images (openbeelden.nl) OAI-PMH
    feed using the custom 'oai_oi' metadata format and hand each record
    to processItem()."""
    # Reader mapping each Open Images field to an XPath text list.
    oai_oi_reader = MetadataReader(
        fields={
            'title': ('textList', 'oai_oi:oi/oi:title/text()'),
            'alternative': ('textList', 'oai_oi:oi/oi:alternative/text()'),
            'creator': ('textList', 'oai_oi:oi/oi:creator/text()'),
            'subject': ('textList', 'oai_oi:oi/oi:subject/text()'),
            'description': ('textList', 'oai_oi:oi/oi:description/text()'),
            'abstract': ('textList', 'oai_oi:oi/oi:abstract/text()'),
            'publisher': ('textList', 'oai_oi:oi/oi:publisher/text()'),
            'contributor': ('textList', 'oai_oi:oi/oi:contributor/text()'),
            'date': ('textList', 'oai_oi:oi/oi:date/text()'),
            'type': ('textList', 'oai_oi:oi/oi:type/text()'),
            # NOTE(review): 'extent' maps to the element 'oi:extend' --
            # looks like a typo in the feed's schema path; confirm against
            # the endpoint before changing it.
            'extent': ('textList', 'oai_oi:oi/oi:extend/text()'),
            'medium': ('textList', 'oai_oi:oi/oi:medium/text()'),
            'identifier': ('textList', 'oai_oi:oi/oi:identifier/text()'),
            'source': ('textList', 'oai_oi:oi/oi:source/text()'),
            'language': ('textList', 'oai_oi:oi/oi:language/text()'),
            'references': ('textList', 'oai_oi:oi/oi:references/text()'),
            'spatial': ('textList', 'oai_oi:oi/oi:spatial/text()'),
            'attributionName': ('textList',
                                'oai_oi:oi/oi:attributionName/text()'),
            'attributionURL': ('textList',
                               'oai_oi:oi/oi:attributionURL/text()'),
            'license': ('textList', 'oai_oi:oi/oi:license/text()'),
            # These are not present in the feed:
            #'rights': ('textList', 'oai_oi:oi/oi:rights/text()'),
            #'relation': ('textList', 'oai_oi:oi/oi:relation/text()'),
            #'coverage': ('textList', 'oai_oi:oi/oi:coverage/text()'),
            #'format': ('textList', 'oai_oi:oi/oi:format/text()'),
        },
        namespaces={
            'oi': 'http://www.openbeelden.nl/oai/',
            'oai_oi': 'http://www.openarchives.org/OAI/2.0/oai_dc/',
            'dc': 'http://purl.org/dc/elements/1.1/',
            'dcterms': 'http://purl.org/dc/terms',
        })
    url = u'http://www.openbeelden.nl/feeds/oai/'
    registry = MetadataRegistry()
    registry.registerReader('oai_oi', oai_oi_reader)
    client = Client(url, registry)
    for record in client.listRecords(metadataPrefix='oai_oi'):
        processItem(record)
def __init__(self, url=None, **kwargs):
    """Set up a DSpace repository client.

    Keyword arguments consumed here:
      base_url      -- root URL of the DSpace instance
      oai_path      -- path of the OAI-PMH endpoint below base_url
      oai_enabled   -- enable the OAI client (default True)
      sword_enabled -- enable the SWORD client (default False)
    Remaining kwargs are passed through to the OAI ``Client``.

    :param url: deprecated full OAI URL; use base_url + oai_path instead.
    :raises ValueError: when no base_url can be determined.
    """
    self.base_url = kwargs.pop('base_url', None)
    self.oai_path = kwargs.pop('oai_path', None)
    self.oai_enabled = bool(kwargs.pop('oai_enabled', True))
    self.sword_enabled = bool(kwargs.pop('sword_enabled', False))
    if url is not None:
        warn(
            'The url paramater will not be supported in version 3, '
            'use base_url and oai_path instead', DeprecationWarning)
        # Try to split the legacy url into base_url + oai_path.
        if (self.base_url and url.startswith(self.base_url)
                and self.oai_path is None):
            self.oai_path = url.replace(self.base_url, '', 1).lstrip('/')
        elif not self.base_url:
            if self.oai_path is None:
                self.oai_path = 'dspace-oai/request'
            if url.endswith(self.oai_path):
                # Strip "/<oai_path>" from the end to recover base_url.
                self.base_url = url[:-(len(self.oai_path) + 1)]
    if self.base_url is None:
        raise ValueError('base_url argument must be specified')
    if not 'metadata_registry' in kwargs:
        # Default to a METS reader when the caller supplies no registry.
        kwargs['metadata_registry'] = MetadataRegistry()
        kwargs['metadata_registry'].registerReader('mets', dspace_mets_reader)
    if self.sword_enabled:
        # Forward any 'sword_*' kwargs (prefix stripped) to SwordService.
        # NOTE(review): popping from kwargs while iterating kwargs.keys()
        # is only safe on Python 2 (keys() returns a list) -- confirm the
        # target interpreter.
        skwargs = {'base_url': self.base_url}
        for key in kwargs.keys():
            if key.startswith('sword_'):
                skwargs[key[6:]] = kwargs.pop(key)
        self.sword = SwordService(**skwargs)
    if self.oai_enabled:
        self.oai = Client('/'.join((
            self.base_url,
            self.oai_path,
        )), **kwargs)
    self.identifier_base = self._extractIdentifierBase(self.base_url)
def iter_items(self, partition): """ Partition is an OAI-PMH endpoint """ # source = "oai:%s" % partition registry = MetadataRegistry() registry.registerReader('oai_dc', oai_dc_reader) client = Client(partition, registry) for record in client.listRecords(metadataPrefix='oai_dc'): header, metadata, _ = record if header.isDeleted(): continue # _id = header.identifier() # date = header.datestamp() meta = metadata.getMap() # TODO: there are much validation and heuristics to be done here! # format0 = (meta.get("format") or [None])[0] # if not format0: # continue # if format0 not in ("application/pdf", ): # continue url0 = (meta.get("identifier") or [None])[0] if not url0: continue title0 = (meta.get("title") or [""])[0].encode("utf-8") desc0 = (meta.get("description") or [""])[0].encode("utf-8") # TODO: validate that the url0 is not on another domain?! yield url0, {}, "html", 2, """ <html><head><title>%s</title></head><body>%s</body></html> """ % (title0, desc0)
def _initialise_client(self, url):
    """Return an OAI client for *url* that understands oai_dc and ORE."""
    logging.info('Initialising OAI client with URL [%s]', url)
    registry = MetadataRegistry()
    for prefix, reader in (('oai_dc', oai_dc_reader),
                           ('ore', oai_ore_reader)):
        registry.registerReader(prefix, reader)
    return Client(url, registry)
def indexCollection(URL, url_base, metadata_prefix, collection, action):
    """Harvest *collection* from the OAI endpoint at *URL* and index the
    records into the 'digital_collection_recs' Elasticsearch index.

    :param URL: OAI-PMH endpoint URL.
    :param url_base: site root used to derive IIIF image URLs.
    :param metadata_prefix: metadata format to request.
    :param collection: OAI set spec to harvest.
    :param action: 'reindex' drops and recreates the index first.
    :returns: the string "success".
    """
    # pull data from OAI endpoint
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(URL, registry, force_http_get=True)
    harvested_data = []
    for record in client.listRecords(metadataPrefix=metadata_prefix,
                                     set=collection):
        if record[0].isDeleted():
            continue
        fields = record[1].getMap()
        # Split the single semicolon-separated subject into a list.
        if fields['subject']:
            fields['subjects'] = fields['subject'][0].split(';')
            del fields['subject']
        fields['set'] = record[0].setSpec()
        # OAI identifiers look like oai:<domain>:<id>; keep the last part.
        identifier = record[0].identifier().split(':')[2]
        fields['image_url_base'] = (
            url_base + '/digital/iiif/' + identifier + '/')
        harvested_data.append(fields)
    # BUG FIX: was `action is 'reindex'` -- identity comparison with a
    # string literal is implementation-dependent; use equality.
    if action == 'reindex':
        es.indices.delete(index='digital_collection_recs',
                          ignore=[400, 404])
        # All properties are text; some additionally enable fielddata
        # so they can be aggregated/sorted on.
        plain = ["title", "creator", "subjects", "description",
                 "publisher", "contributor", "date", "identifier",
                 "source", "relation", "coverage", "rights",
                 "image_url_base"]
        with_fielddata = ["type", "format", "language", "set"]
        properties = {name: {"type": "text"} for name in plain}
        properties.update({name: {"type": "text", "fielddata": "true"}
                           for name in with_fielddata})
        mapping = {"mappings": {"_doc": {"properties": properties}}}
        es.indices.create(index='digital_collection_recs', body=mapping)
    helpers.bulk(es, harvested_data, index='digital_collection_recs',
                 doc_type='_doc')
    return "success"
'type', 'format', 'identifier', 'source', 'language', 'relation', 'coverage', 'rights', ]: for value in map.get(name, []): e = SubElement(element, nsdc(name), nsmap=nsmap) e.text = value for name in ['hasPart', 'isPartOf']: for value in map.get(name, []): e = SubElement(element, nsdcterms(name), nsmap=nsmap) e.text = value metadata_registry = MetadataRegistry() metadata_registry.registerWriter('oai_dc', fbc_oai_dc_writer) metadata_registry.registerWriter('qdc', qdc_writer) server = ServerBase(Catalogue(metadata_registry), metadata_registry, {'topxsi': NS_XSI}) def oaipmh(request): resp = server.handleRequest(request.GET) return HttpResponse(resp, content_type='application/xml')
def harvest(metadata_set, dest_folder, log_file, content_type,
            from_date, until_date):
    """Harvest EFG records belonging to *metadata_set* from the EFG
    OAI-PMH endpoint, keep only records tagged "IMediaCities" (and,
    optionally, matching *content_type*), write each surviving record as
    an XML file into *dest_folder*, and dump a JSON harvest report into
    *log_file*.

    :param metadata_set: OAI set spec to harvest (must exist on server).
    :param dest_folder: directory for the per-record XML files.
    :param log_file: path of the JSON report written at the end.
    :param content_type: optional item type filter (case-insensitive).
    :param from_date: optional lower bound (parsed with parse_date).
    :param until_date: optional upper bound (parsed with parse_date).
    """
    #############################
    # ### FILESYSTEM CHECKS ### #
    #############################
    try:
        if not os.path.isdir(dest_folder):
            os.makedirs(dest_folder)
    except BaseException as e:
        log.error(str(e))
        log.exit("Unable to create destination folder: %s" % dest_folder)
    # Verify write permission inside the folder:
    try:
        test_path = os.path.join(dest_folder, '__test_permissions__')
        os.makedirs(test_path)
        os.rmdir(test_path)
    except BaseException as e:
        log.error(str(e))
        log.exit("Unable to use destination folder: %s" % dest_folder)
    try:
        log_handle = open(log_file, 'a+')
        log_handle.close()
    except BaseException as e:
        log.error(str(e))
        log.exit("Unable to create log_file: %s" % log_file)
    #################################
    # ### OAI-PMH CONFIGURATION ### #
    #################################
    URL = 'https://node0-d-efg.d4science.org/efg/mvc/oai/oai.do'
    metadata_prefix = 'efg'
    ###################################
    # ### OPEN OAI-PMH CONNECTION ### #
    ###################################
    registry = MetadataRegistry()
    registry.registerReader(metadata_prefix, oai_dc_reader)
    #print ("URL=" + str(URL))
    client = Client(URL, registry)
    ####################################
    # ### CHECK IF THIS SET EXISTS ### #
    ####################################
    set_found = False
    for s in client.listSets():
        if metadata_set == s[0]:
            set_found = True
    if not set_found:
        log.exit("Unable to find this set: %s" % metadata_set)
    #############################
    # ### RETRIEVE METADATA ### #
    #############################
    if from_date is not None:
        from_date = parse_date(from_date)
        if from_date is None:
            log.exit("Unable to convert from date")
    if until_date is not None:
        until_date = parse_date(until_date)
        if until_date is None:
            log.exit("Unable to convert until date")
    # Counters and lists for the final JSON report.
    report_data = {
        'downloaded': 0,
        'filtered': 0,
        'saved': 0,
        'saved_files': [],
        'missing_sourceid': [],
        'wrong_content_type': []
    }
    # Millisecond timestamp shared by all files of this harvest run.
    timestamp = int(1000 * time.time())
    log.info("Retrieving records for %s..." % metadata_set)
    try:
        records = client.listRecords(
            metadataPrefix=metadata_prefix,
            set=metadata_set,
            from_=from_date,
            until=until_date)
    except NoRecordsMatchError as e:
        log.exit(e)
    log.info("Records retrieved, extracting...")
    try:
        for record in records:
            element = record[1].element()
            # Obtained eTree is based on namespaced XML
            # Read: 19.7.1.6. Parsing XML with Namespaces
            # https://docs.python.org/2/library/xml.etree.elementtree.html
            # find(match)
            #     Finds the first subelement matching match. match may be
            #     a tag name or path. Returns an element instance or None.
            # findall(match)
            #     Finds all matching subelements, by tag name or path.
            #     Returns a list containing all matching elements in
            #     document order.
            report_data['downloaded'] += 1
            # Progress indicator: a dot every 100, a summary every 5000.
            if report_data['downloaded'] % 100 == 0:
                print('.', end='', flush=True)
            if report_data['downloaded'] % 5000 == 0:
                print(
                    ' %s downloaded - %s saved' % (
                        report_data['downloaded'],
                        report_data['saved']
                    ), flush=True)
            efgEntity = element.find(tag("efgEntity"))
            if efgEntity is None:
                # log.warning("efgEntity not found, skipping record")
                continue
            # A record is either an audiovisual or a non-AV creation.
            avcreation = efgEntity.find(tag("avcreation"))
            nonavcreation = efgEntity.find(tag("nonavcreation"))
            if avcreation is not None:
                manifestation = avcreation.find(tag("avManifestation"))
                recordSource = avcreation.find(tag("recordSource"))
                keywords = avcreation.findall(tag("keywords"))
                title_el = avcreation.find(tag("identifyingTitle"))
                title = (title_el.text
                         if title_el is not None else "Unknown title")
            elif nonavcreation is not None:
                manifestation = nonavcreation.find(tag("nonAVManifestation"))
                recordSource = nonavcreation.find(tag("recordSource"))
                keywords = nonavcreation.findall(tag("keywords"))
                title_el = nonavcreation.find(tag("title"))
                title = (title_el.find(tag("text")).text
                         if title_el is not None else "Unknown title")
            else:
                title = "Unknown title"
                # log.warning("(non)avcreation not found, skipping record")
                continue
            # Keep only records tagged with the IMediaCities keyword.
            filter_keyword = "IMediaCities"
            is_good = False
            for keyword in keywords:
                term = keyword.find(tag("term"))
                if term.text == filter_keyword:
                    is_good = True
                    break
            if not is_good:
                continue
            report_data['filtered'] += 1
            if manifestation is None:
                report_data['missing_sourceid'].append(title)
                # log.warning("avManifestation not found, skipping record")
                continue
            # Optional content-type filter on the manifestation's item.
            if content_type is not None:
                content_type = content_type.lower()
                item = manifestation.find(tag("item"))
                if item is None:
                    # missing <item> => type cannot be found
                    report_data['wrong_content_type'].append(title)
                    continue
                item_type = item.find(tag("type"))
                if item_type is None:
                    # missing <type>
                    report_data['wrong_content_type'].append(title)
                    continue
                if item_type.text.lower() != content_type:
                    # wrong type
                    report_data['wrong_content_type'].append(title)
                    continue
            # NOTE: the sourceID must be taken from the recordSource that
            # sits under avcreation/nonavcreation, NOT from the one under
            # avManifestation/nonAVManifestation.
            #recordSource = manifestation.find(tag("recordSource"))
            if recordSource is None:
                report_data['missing_sourceid'].append(title)
                # log.warning("recordSource not found, skipping record")
                continue
            sourceID = recordSource.find(tag("sourceID"))
            if sourceID is None:
                report_data['missing_sourceid'].append(title)
                # log.warning("sourceID not found, skipping record")
                continue
            content = etree.tostring(efgEntity, pretty_print=False)
            # id_text = urllib.parse.quote_plus(sourceID.text.strip())
            # replace non alpha-numeric characters with a dash
            id_text = re.sub(r'[\W_]+', '-', sourceID.text.strip())
            filename = "%s_%s_%s.xml" % (
                metadata_set,
                id_text,
                timestamp
            )
            filepath = os.path.join(dest_folder, filename)
            # with open(filepath, 'wb') as f:
            with codecs.open(filepath, 'wb', "utf-8") as f:
                f.write(content.decode('utf-8'))
            # OLD
            #with codecs.open(filepath, 'wb', "utf-8") as f:
            #    f.write(html.unescape(content.decode('utf-8')))
            report_data['saved'] += 1
            report_data['saved_files'].append(filename)
    except NoRecordsMatchError as e:
        log.warning("No more records after filtering?")
        log.warning(e)
    # ###################
    # Write report file
    # ###################
    # the procedure writes a report file containing the results
    # of the harvesting:
    # the list of records that do not contain the record ID
    # (by writing the content of the element title)
    with open(log_file, 'w+') as f:
        json.dump(report_data, f)
        f.close()
    # Just to close previous dot line
    print("")
    log.info("""
%s records from set [%s] downloaded
open log file [%s] for details
""" % (report_data['saved'], metadata_set, log_file)
    )
def _create_metadata_registry(self):
    """Return a registry supporting Dublin Core (oai_dc) and DDI."""
    registry = MetadataRegistry()
    for prefix, reader in (('oai_dc', oai_dc_reader),
                           ('oai_ddi', oai_ddi_reader)):
        registry.registerReader(prefix, reader)
    return registry
def get_names(dataname): record_prefix = "rdf:RDF/edm:ProvidedCHO" # Modidy/add Xpath mappings to get other fields and other objects (agent, place etc) edm_reader = MetadataReader( fields={ 'title': ('textList', record_prefix + '/dc:title/text()'), 'creator': ('textList', record_prefix + '/dc:creator/text()'), 'subject': ('textList', record_prefix + '/dc:subject/text()'), 'description': ('textList', record_prefix + '/dc:description/text()'), 'publisher': ('textList', record_prefix + '/dc:publisher/text()'), 'contributor': ('textList', record_prefix + '/dc:contributor/text()'), 'date': ('textList', record_prefix + '/dc:date/text()'), 'type': ('textList', record_prefix + '/dc:type/text()'), 'format': ('textList', record_prefix + '/dc:format/text()'), 'identifier': ('textList', record_prefix + '/dc:identifier/text()'), 'source': ('textList', record_prefix + '/dc:source/text()'), 'language': ('textList', record_prefix + '/dc:language/text()'), 'relation': ('textList', record_prefix + '/dc:relation/text()'), 'coverage': ('textList', record_prefix + '/dc:coverage/text()'), 'rights': ('textList', record_prefix + '/dc:rights/text()'), 'spatial': ('textList', record_prefix + '/dc:spatial/text()'), 'objectId': ('textList', record_prefix + '/@rdf:about'), }, namespaces={ 'oai_dc': 'http://www.openarchives.org/OAI/2.0/oai_dc/', 'dc': 'http://purl.org/dc/elements/1.1/', 'dcterms': 'http://purl.org/dc/terms/', 'dct': 'http://purl.org/dc/terms/', 'edm': 'http://www.europeana.eu/schemas/edm/', 'foaf': 'http://xmlns.com/foaf/0.1/', 'owl': 'http://www.w3.org/2002/07/owl#', 'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'rdfs': 'http://www.w3.org/2000/01/rdf-schema#', 'skos': 'http://www.w3.org/2004/02/skos/core#', 'xsi': 'http://www.w3.org/2001/XMLSchema-instance', 'ore': 'http://www.openarchives.org/ore/terms/' }) names = [] identifier = [] if __name__ == "__main__": URL = 'https://data.jhn.ngo/oai' registry = MetadataRegistry() registry.registerReader('edm', edm_reader) 
client = Client(URL, registry) # To harvest specific dataset, use "set" parameter: set='AIUJE1_MARC21' for record in client.listRecords(metadataPrefix='edm', set=dataname): output = record[1].getMap() if output['creator'] != []: names.append([output['creator'][0]]) identifier.append( [output['creator'][0], output['objectId'][0]]) if output['contributor'] != []: names.append([output['contributor'][0]]) identifier.append( [output['contributor'][0], output['objectId'][0]]) print(names) return identifier
def _client(self, dataset):
    """Return an OAI client bound to *dataset*'s PMH endpoint."""
    md_registry = MetadataRegistry()
    md_registry.registerReader(dataset.metadata_prefix, oai_dc_reader)
    return Client(dataset.pmh_url, md_registry)
def add_provider(cxn, args):
    """Add a new provider to the registry database.

    Process ``args`` to add a new provider to the registry database.
    Return 0 for success, 1 for failure (error message should be logged).

    ``cxn`` => instance of ``sqlite3.Connection``
    ``args`` => instance of ``argparse.Namespace``
    (expects attributes: name, url, dest, metadataPrefix; missing values
    are prompted for interactively)
    """
    global logger, MAX_NAME_LENGTH
    addlogger = logger.getChild('add')
    # Validate name
    if len(args.name) > MAX_NAME_LENGTH:
        addlogger.critical('Short name for new provider must be no more than '
                           '{0} characters long'.format(MAX_NAME_LENGTH))
        return 1
    elif args.name.startswith(('http://', 'https://')) or args.name == 'all':
        addlogger.critical('Short name for new provider may not be "all" nor '
                           'may it begin "http://" or "https://"')
        return 1
    # Try to create row now to avoid unnecessary validation if duplicate
    try:
        cxn.execute(
            "INSERT INTO providers(name, lastHarvest) values "
            "(?, ?)",
            (args.name, datetime.fromtimestamp(0)))
    except sqlite3.IntegrityError:
        addlogger.critical('Unable to add provider "{0}"; '
                           'provider with this name already exists'
                           ''.format(args.name))
        return 1
    else:
        addlogger.info('Adding provider "{0}"'.format(args.name))
    # Get any missing information
    # Base URL
    if args.url is None:
        args.url = raw_input('Base URL:'.ljust(20))
    if not args.url:
        addlogger.critical('Base URL for new provider not supplied')
        return 1
    # Set up an OAI-PMH client for validating providers
    md_registry = MetadataRegistry()
    md_registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(args.url, md_registry)
    # Validate Base URL by fetching Identify
    try:
        client.identify()
    except (XMLSyntaxError, HTTPError):
        addlogger.critical('Base URL for new provider does not return a valid '
                           'response to an `Identify` request')
        return 1
    # Destination
    if args.dest is None:
        args.dest = raw_input('Destination directory: '.ljust(20))
    if args.dest:
        # Expand user dir
        args.dest = os.path.expanduser(args.dest)
    else:
        addlogger.info('Destination for data for new provider not supplied'
                       ' using default `pwd`: {0}'.format(os.getcwd()))
        args.dest = os.getcwd()
    # metadataPrefix
    # Check that selected metadataPrefix is available from provider
    # Fetch list of available formats
    mdps = dict(
        (mdpinfo[0], mdpinfo[1:])
        for mdpinfo in client.listMetadataFormats())
    # Keep prompting until a prefix the provider supports is chosen;
    # an empty answer falls back to oai_dc.
    while args.metadataPrefix not in mdps:
        print "Available metadataPrefix values:"
        # List available formats
        for mdp in mdps:
            print mdp, '-', mdps[mdp][1]
        args.metadataPrefix = raw_input('metadataPrefix [oai_dc]:'.ljust(20))
        if not args.metadataPrefix:
            addlogger.info('metadataPrefix for new provider not supplied. '
                           'using default: oai_dc')
            args.metadataPrefix = 'oai_dc'
    cxn.execute(
        "UPDATE providers SET "
        "url=?, "
        "destination=?, "
        "metadataPrefix=? "
        "WHERE name=?",
        (args.url, args.dest, args.metadataPrefix, args.name))
    addlogger.info('URL for next harvest: {0}?verb=ListRecords'
                   '&metadataPrefix={1}'
                   '&from={2:%Y-%m-%dT%H:%M:%SZ%z}'
                   ''.format(args.url,
                             args.metadataPrefix,
                             datetime.fromtimestamp(0)))
    # All done, commit database
    cxn.commit()
    return 0
def oai_metadata(oai_endpoint):
    """Harvest Dublin Core records from *oai_endpoint* and return the
    graphs built from them."""
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    oai_client = Client(oai_endpoint, registry)
    records = oai_client.listRecords(metadataPrefix='oai_dc')
    return make_graphs(records)