def run(self):
    """Page through the REST source and pass each page of records to addBatch.

    The first URL comes from ``self.getNextURL()``; later pages come from
    ``self.getNextURL(offset, count)``, with ``offset`` advanced by ``count``
    after every page. Paging stops when ``getNextURL`` returns a falsy
    value, when a page comes back empty, or when the same URL has failed
    ``max_failures`` times in a row.

    When ``self.recordType`` is ``'list'`` the decoded page itself is used as
    the record list; otherwise ``self.getRecords(url)`` supplies the records.
    """
    # Upper bound on consecutive failures for one URL. Previously a failing
    # URL was retried forever with no delay.
    max_failures = 3
    failures = 0
    offset = self.defaultOffset
    count = self.defaultCount
    url = self.getNextURL()
    while url:
        self.logger.info("Pulling REST records from " + str(url))
        try:
            response = Utils.getResponse(url)
            data = Utils.getJSONFromResponse(response)
        except Exception as e:
            self.logger.warning("Could not get content from " + str(url) +
                                " ERROR: " + str(e))
            failures += 1
            if failures >= max_failures:
                self.logger.critical("Giving up on " + str(url) + " after " +
                                     str(failures) + " failed attempts.")
                break
            continue
        failures = 0
        if not data:
            # An empty page ends paging. Previously url was set to None here
            # but then overwritten by the getNextURL call below.
            break
        if self.recordType == 'list':
            records = list(data)
        else:
            records = self.getRecords(url)
        self.addBatch(records)
        offset = offset + count
        url = self.getNextURL(offset, count)
def addResource(self, uri, sourceNamespace, setNamespace, batchTag=None):
    """Validate *uri* and POST it to ``self.resourceURI`` as a new resource.

    ``batchTag`` is added to the posted parameters only when it is truthy.

    Returns:
        A ``(record, message)`` tuple: the JSON decoded from the response and
        the value ``self.getMessage`` extracts from it. A truthy message is
        also logged as a warning.

    Raises:
        Exception: if ``Utils.checkURI`` raises ``URIException``.
        BadResourceURL: if ``Utils.postRSData`` raises.
    """
    self.logger.info("Adding resource with uri: " + str(uri))
    try:
        Utils.checkURI(uri)
    except URIException:
        raise Exception("Resource uri did not validate. uri: " + str(uri))

    payload = {
        'sourceNamespace': sourceNamespace,
        'setNamespace': setNamespace,
        'uri': uri,
    }
    if batchTag:
        payload['batchTag'] = batchTag

    try:
        posted = Utils.postRSData(self.resourceURI, payload)
    except Exception as err:
        raise BadResourceURL(
            "Could not add resource. resourceURI: " + str(self.resourceURI),
            err)

    record = Utils.getJSONFromResponse(posted)
    message = self.getMessage(record)
    if message:
        self.logger.warning(message)
    return record, message
def addCapability(self, capURL, sourceNamespace, setNamespace, capType):
    """Validate *capURL* and POST it to ``self.capabilityURI``.

    Returns:
        A ``(record, message)`` tuple: the JSON decoded from the response and
        the value ``self.getMessage`` extracts from it. A truthy message is
        also logged as a warning.

    Raises:
        Exception: if ``Utils.checkURI`` raises (logged as a warning first).
        BadResourceURL: if ``Utils.postRSData`` raises (logged as critical
            first).
    """
    self.logger.info("Adding capability with url:" + str(capURL))
    try:
        Utils.checkURI(capURL)
    except Exception as err:
        problem = ("Capability URL did not validate. url: " + str(capURL) +
                   " ERROR: " + str(err))
        self.logger.warning(problem)
        raise Exception(problem)

    payload = {
        'sourceNamespace': sourceNamespace,
        'setNamespace': setNamespace,
        'uri': capURL,
        'capabilityType': capType,
    }
    try:
        posted = Utils.postRSData(self.capabilityURI, payload)
    except Exception as err:
        self.logger.critical("Could not add capability. capabiltyURI: " +
                             str(self.capabilityURI) + " ERROR: " + str(err))
        raise BadResourceURL(str(err))

    record = Utils.getJSONFromResponse(posted)
    message = self.getMessage(record)
    if message:
        self.logger.warning(message)
    return record, message
def getMessage(self, record):
    """Return the record's 'message' attribute, else its 'msg', else None.

    Both attributes are looked up through ``Utils.getRecordAttr``; the first
    truthy one wins, with 'message' taking precedence.
    """
    primary = Utils.getRecordAttr(record, 'message')
    fallback = Utils.getRecordAttr(record, 'msg')
    return primary or fallback or None
def getManifest(self, batchTag, sourceNamespace, setNamespace):
    """Return the manifest content for one batch of a namespace pair.

    The URL is
    ``<endpointURI>/static/<sourceNamespace>/<setNamespace>/<batchTag>/manifest``.

    Returns:
        Whatever ``Utils.getContent`` returns for that URL, or ``False`` when
        ``Utils.checkURI`` returns a falsy value.
    """
    manifestPath = "/".join([
        "/static",
        str(sourceNamespace),
        str(setNamespace),
        str(batchTag),
        "manifest",
    ])
    url = self.endpointURI + manifestPath
    if not Utils.checkURI(url):
        return False
    return Utils.getContent(url)
def getResourceContent(self, resID):
    """Return the decoded JSON found at ``<targetURI>/content/<resID>``.

    Raises:
        BadResourceURL: if fetching the URL or decoding its JSON raises.
    """
    url = self.targetURI + '/content/' + resID
    try:
        record = Utils.getJSONFromResponse(Utils.getResponse(url))
    except Exception as err:
        raise BadResourceURL("Could not get data from url", err, self.logger)
    return record
def postLog(self, level, msg):
    """Send one log entry for this object's namespace and batch to Utils.postToLog.

    The entry carries the message, the level, the namespace as
    ``<sourceNamespace>/<setNamespace>`` and ``self.batchTag``.
    """
    entry = dict(
        MSG=msg,
        LEVEL=level,
        NAMESPACE=self.sourceNamespace + "/" + self.setNamespace,
        BATCHTAG=self.batchTag,
    )
    Utils.postToLog(entry)
def getResources(self, offset=0, count=20):
    """Return one page of the endpoint's resource listing.

    The URL is ``<endpointURI>resource?offset=<offset>&count=<count>``;
    ``endpointURI`` is concatenated as-is, so it is expected to already end
    with a slash.

    Args:
        offset: value sent as the ``offset`` query parameter.
        count: value sent as the ``count`` query parameter.

    Returns:
        Whatever ``Utils.getContent`` returns for that URL, or ``False`` when
        ``Utils.checkURI`` returns a falsy value.
    """
    url = self.endpointURI + "resource"
    url = url + "?offset=" + str(offset) + "&count=" + str(count)
    if not Utils.checkURI(url):
        return False
    # The earlier code also called urllib.request.urlopen(url) here and never
    # read or closed the result: one leaked connection and one duplicate
    # request per call. Utils.getContent is the only fetch that is used.
    return Utils.getContent(url)
def run(self):
    """Fetch ``self.ListSource``, decode it as JSON and pass it to addBatch.

    Raises:
        RSLoaderError: if fetching ``self.ListSource`` raises. Errors from
            JSON decoding or ``addBatch`` are not wrapped.
    """
    try:
        response = Utils.getResponse(self.ListSource)
    except Exception as err:
        raise RSLoaderError(
            "Could not load URLs from listSource. Error: " + str(err), None,
            self.logger)
    self.addBatch(Utils.getJSONFromResponse(response))
def __init__(self, sourceNamespace, setNamespace, opts, mode='latest'):
    """Set up the publisher: options, batch tag, logger and reader.

    Every entry of *opts* is copied onto the instance as an attribute before
    ``Utils.validateRequired`` checks that ``targetURL`` is present. The
    reader is created for the same namespaces and shares this object's
    batch tag.
    """
    for optName, optValue in opts.items():
        setattr(self, optName, optValue)
    Utils.validateRequired(opts, ['targetURL'])

    self.batchTag = Utils.getCurrentBatchTimestamp()
    self.logger = gl.gleanomaticLogger(sourceNamespace, setNamespace,
                                       self.batchTag)
    self.logger.info("Initializing RESTPublisher")

    readerOpts = {"batchTag": self.batchTag}
    self.reader = RSReader(sourceNamespace, setNamespace, readerOpts, mode)
def loadCapabilityList(self, sourceNamespace, setNamespace):
    """Return the ``urlset/url`` value of a namespace pair's capability list.

    The list is read from
    ``<endpointURI>/RS/<sourceNamespace>/<setNamespace>/capabilitylist.json``.

    Returns:
        ``data['urlset']['url']`` when both keys are present, ``[]`` when
        either is missing, or ``False`` when ``Utils.checkURI`` returns a
        falsy value.
    """
    url = (self.endpointURI + "/RS/" + str(sourceNamespace) + "/" +
           str(setNamespace) + "/capabilitylist.json")
    if not Utils.checkURI(url):
        return False

    data = Utils.getJSONFromResponse(Utils.getResponse(url))
    if 'urlset' not in data:
        return []
    urlset = data['urlset']
    if 'url' not in urlset:
        return []
    return urlset['url']
def __init__(self, sourceNamespace, setNamespace, opts):
    """Initialise the base loader, then validate the OAI-specific options.

    Raises:
        RSLoaderError: if ``OAISource`` or ``OAIMetaDataPrefix`` is missing
            from *opts* (``validateRequired`` raised ``ValueError``), or if
            checking ``<OAISource>?verb=Identify`` raises.
    """
    super().__init__(sourceNamespace, setNamespace, opts)
    self.logger.info("initializing OAILoader")

    required = ['OAISource', 'OAIMetaDataPrefix']
    try:
        Utils.validateRequired(opts, required)
    except ValueError as err:
        raise RSLoaderError("Missing required parameter.", err, self.logger)

    try:
        Utils.checkURI(str(self.OAISource) + "?verb=Identify")
    except Exception as err:
        raise RSLoaderError("OAISource url did not validate. ", err,
                            self.logger)
def __init__(self, sourceNamespace, setNamespace, opts):
    """Initialise the base loader, then validate the ``ListSource`` option.

    Raises:
        RSLoaderError: if ``ListSource`` is missing from *opts*
            (``validateRequired`` raised ``ValueError``), or if checking the
            ``ListSource`` URL raises.
    """
    super().__init__(sourceNamespace, setNamespace, opts)
    self.logger.info("initializing ListLoader")

    required = ['ListSource']
    try:
        Utils.validateRequired(opts, required)
    except ValueError as err:
        raise RSLoaderError("Missing required parameter.", err, self.logger)

    try:
        Utils.checkURI(str(self.ListSource))
    except Exception as err:
        raise RSLoaderError("ListSource url did not validate. ", err,
                            self.logger)
def addFromBatch(datum,attempts=1):
    """Post one batch entry to the target's ``/resource`` URL, retrying on failure.

    *datum* is a ``||``-separated string whose first four fields are the
    resource uri, source namespace, set namespace and batch tag.

    A response without an ``ID`` key is retried after a 3 second pause until
    ``attempts`` exceeds 3. An empty or falsy decoded response is reported as
    a failure immediately, without a retry. Outcomes are printed and sent to
    ``Utils.postToLog``.

    Returns:
        Always ``True``.
    """
    fields = datum.split("||")
    uri = fields[0]
    sourceNamespace = fields[1]
    setNamespace = fields[2]
    batchTag = fields[3]

    params = {'uri': uri,
              'sourceNamespace': sourceNamespace,
              'setNamespace': setNamespace,
              'batchTag': batchTag}
    resourceURI = str(appConfig.targetURI) + "/resource"
    namespace = sourceNamespace + "/" + setNamespace

    def reportFailure(respJ):
        # Shared by the "no usable response" and "out of retries" exits.
        print("Failed to post " + uri)
        Utils.postToLog({"LEVEL": "WARNING",
                         "MSG": "Failed to post " + uri,
                         "NAMESPACE": namespace,
                         "BATCHTAG": batchTag,
                         "RESP": respJ})

    while True:
        response = Utils.postRSData(resourceURI, params)
        respJ = Utils.getJSONFromResponse(response)
        if not respJ:
            reportFailure(respJ)
            return True
        if 'ID' in respJ:
            print("Posted " + uri)
            Utils.postToLog({"LEVEL": "INFO",
                             "MSG": "Posted " + uri + " to resource/" + str(respJ['ID']),
                             "NAMESPACE": namespace,
                             "BATCHTAG": batchTag})
            return True
        if attempts > 3:
            reportFailure(respJ)
            return True
        attempts = attempts + 1
        time.sleep(3)
def __init__(self, sourceNamespace, setNamespace, opts):
    """Initialise the base loader, then validate the OAI-specific options.

    Raises:
        Exception: if the base class initialiser raises.
        ValueError: if checking ``<OAISource>?verb=Identify`` raises (logged
            as critical first). Errors from ``Utils.validateRequired`` are
            not wrapped.
    """
    try:
        super().__init__(sourceNamespace, setNamespace, opts)
    except Exception as err:
        raise Exception("Could not start RSLoader. " + str(err))

    self.logger.info("initializing OAILoader")
    Utils.validateRequired(opts, ['OAISource', 'OAIMetaDataPrefix'])

    try:
        Utils.checkURI(str(self.OAISource) + "?verb=Identify")
    except Exception as err:
        problem = "OAISource url did not validate. " + str(err)
        self.logger.critical(self.msg(problem))
        raise ValueError(problem)
def initRecord(self, resID):
    """Load resource *resID* and its OAI-PMH content, ready for mapping.

    Steps:
      1. Fetch ``<targetURI>/resource/<resID>`` into ``self.resource`` and
         build ``self.logger`` from its namespaces.
      2. Look up the mapper configuration for those namespaces in ``maps``
         and set ``self.prefix`` (when configured) and ``self.mapper``.
      3. Fetch ``<targetURI>/content/<resID>``, parse it as XML and take
         ``self.header`` / ``self.metadata`` from
         ``OAI-PMH/GetRecord/record``.
      4. A record without metadata is accepted only when its header status
         (``status`` or ``@status``) is ``deleted``; ``self.deleted`` is then
         set to True.
      5. For records that are not deleted, start ``self.resultRecord`` from
         ``MOHUBBaseRecord().getBaseRecord()``.

    Returns:
        The ``record`` element of the parsed response.

    Raises:
        Exception: if the content cannot be converted from XML.
        BadOAIRecord: if the record or its header is missing, or metadata is
            missing and the header status is absent or not ``deleted``.
    """
    resURL = str(self.config.targetURI) + "/resource/" + str(resID)
    resourceJSON = Utils.getContent(resURL)
    self.resource = Utils.jsonToDict(resourceJSON)
    self.logger = gl.gleanomaticLogger(self.resource['sourceNamespace'],
                                       self.resource['setNamespace'],
                                       'OAIMap')
    self.logger.info("Initializing record in OAIMap.")

    mapConfig = maps[self.resource["sourceNamespace"]][
        self.resource["setNamespace"]]
    if 'prefix' in mapConfig:
        self.prefix = mapConfig['prefix']
    self.mapper = self.getMapper(mapConfig["mapper"])

    url = str(self.config.targetURI) + "/content/" + str(resID)
    content = Utils.getContent(url)
    try:
        data = Utils.getDictFromXML(content)
    except Exception as e:
        self.logger.critical("Could not get dict from xml. ERROR: " + str(e))
        raise Exception(str(e))

    # The record and its header are mandatory. Previously a missing
    # OAI-PMH/GetRecord/record path left `record` unbound and the handler
    # then failed with UnboundLocalError instead of BadOAIRecord.
    try:
        record = data["OAI-PMH"]["GetRecord"]["record"]
        self.header = record["header"]
    except KeyError as e:
        self.logger.critical("Could not find metadata or header in record.")
        raise BadOAIRecord(str(e))

    try:
        self.metadata = record["metadata"]
    except KeyError:
        # No metadata: only acceptable for a record flagged as deleted.
        try:
            status = self.header["status"]
        except KeyError:
            try:
                status = self.header["@status"]
            except KeyError as e:
                self.logger.critical("No status in header.")
                raise BadOAIRecord(str(e))
        if status == 'deleted':
            self.deleted = True
        else:
            self.logger.critical("Unknown status type: " + str(status))
            raise BadOAIRecord("No metadata. Unknown status: " + str(status))

    if not self.deleted:
        mbr = MOHUBBaseRecord()
        self.resultRecord = mbr.getBaseRecord()
    return record
def addDump(self, batchTag, sourceNamespace, setNamespace):
    """Request a dump for one batch by POSTing to ``self.capabilityURI``.

    Returns:
        The decoded JSON response after ``self.convertToRSDomain`` has been
        applied to it.

    Raises:
        AddDumpException: if ``Utils.postRSData`` raises.
    """
    payload = {
        'sourceNamespace': sourceNamespace,
        'setNamespace': setNamespace,
        'batchTag': batchTag,
    }
    try:
        response = Utils.postRSData(self.capabilityURI, payload)
    except Exception as err:
        raise AddDumpException("Could not post dump.", err)
    decoded = Utils.getJSONFromResponse(response)
    return self.convertToRSDomain(decoded)
def __init__(self, sourceNamespace, setNamespace, opts, mode='latest'):
    """Set up a transformer: options, target loader, source reader, logger.

    Every entry of *opts* is copied onto the instance as an attribute before
    ``transformName`` and ``targetSet`` are validated as required.
    ``targetSet`` is split on ``/`` into the target source namespace and the
    target set namespace. The reader shares the loader's batch tag and the
    transformer reuses the loader's logger.
    """
    self.transformURI = appConfig.transformURI
    for optName, optValue in opts.items():
        setattr(self, optName, optValue)
    Utils.validateRequired(opts, ['transformName', 'targetSet'])

    # parse target namespaces out of targetSet
    nsParts = self.targetSet.split('/')
    self.targetSourceNS = nsParts[0]
    self.targetSetNS = nsParts[1]

    self.loader = RSLoader(self.targetSourceNS, self.targetSetNS)
    readerOpts = {"batchTag": self.loader.batchTag}
    self.reader = RSReader(sourceNamespace, setNamespace, readerOpts, mode)

    self.logger = self.loader.logger
    self.logger.info("initializing Transformer")
def pullDynamicOAIByURL(self, url):
    """Harvest identifiers starting at *url* and queue GetRecord URLs.

    Each response is split on ``<identifier>`` tags. Every identifier becomes
    ``<OAISource>?verb=GetRecord&metadataPrefix=<prefix>&identifier=<id>``
    and each page of such URLs is passed to ``self.addBatch``. When
    ``self.getResumptionToken`` yields a token the next page is requested
    with it; otherwise harvesting ends.

    Raises:
        RSLoaderError: if ``self.getError`` finds an error in a response, or
            if the same URL fails to load ``max_failures`` times in a row.
    """
    # Upper bound on consecutive failures for one URL. Previously a failing
    # URL was retried forever with no delay.
    max_failures = 3
    failures = 0
    while url:
        self.logger.info("Pulling dynamic OAI from " + str(url))
        try:
            data = Utils.getContent(url)
        except Exception as e:
            self.logger.warning("Could not get content from " + str(url) +
                                " ERROR: " + str(e))
            failures += 1
            if failures >= max_failures:
                raise RSLoaderError(
                    "Could not get content from " + str(url) + " after " +
                    str(failures) + " attempts.", e, self.logger)
            continue
        failures = 0

        OAIerror = self.getError(data)
        if OAIerror:
            raise RSLoaderError(
                "Could not pull OAI records. OAIError: " + str(OAIerror),
                None, self.logger)

        rawIDs = data.split('<identifier>')
        # first item is the header
        del rawIDs[0]
        recordBase = (str(self.OAISource) +
                      "?verb=GetRecord&metadataPrefix=" +
                      str(self.OAIMetaDataPrefix) + "&identifier=")
        records = [
            recordBase + str(rawID.split('</identifier>')[0])
            for rawID in rawIDs
        ]
        self.addBatch(records)

        rToken = self.getResumptionToken(data)
        if rToken:
            url = (str(self.OAISource) +
                   "?verb=ListIdentifiers&resumptionToken=" + str(rToken))
        else:
            url = None
def makeDump(self):
    """Create a dump for the current batch and register it as a capability.

    Does nothing and returns ``False`` when ``self.createDump`` is falsy.
    Otherwise asks the target endpoint for a dump, polls the returned zip URI
    until ``Utils.checkURI`` accepts it, then registers it through
    ``self.addCapability(zipURI, 'dump')``.

    Returns:
        The result of ``self.addCapability``, or ``False`` when dumps are
        disabled.

    Raises:
        AddDumpException: if the dump request raises, or the zip URI is still
            not available after ``max_retries`` polls.
    """
    if not self.createDump:
        return False

    try:
        contents = self.targetEndpoint.addDump(self.batchTag,
                                               self.sourceNamespace,
                                               self.setNamespace)
    except Exception as e:
        logger.critical(self.msg("Could not add dump."))
        raise AddDumpException("Could not add dump.", e)
    zipURI = contents

    # allow up to 1 hour for zip creation - sleep 60 seconds and try 60 times
    max_retries = 60
    wait_seconds = 60
    # The counter must live outside the loop. Previously it was reset to 0 on
    # every iteration, so the retry limit could never be reached.
    retries = 0
    while True:
        try:
            uriResponse = Utils.checkURI(zipURI)
        except Exception:
            uriResponse = None
        if uriResponse:
            logger.info("Found zipURI.")
            break
        # A falsy result counts as "not ready yet", the same as an exception;
        # previously it spun in a tight loop without sleeping.
        retries = retries + 1
        if retries > max_retries:
            logger.critical(
                self.msg("Too many retries waiting for " + str(zipURI)))
            raise AddDumpException(
                "Too many retries waiting for " + str(zipURI))
        time.sleep(wait_seconds)

    return self.addCapability(zipURI, 'dump')
def __init__(self, endpointURI):
    """Store the endpoint (with a trailing slash) and derive its sub-URIs.

    ``resourceURI`` is ``<endpointURI>resource`` and is validated with
    ``Utils.checkURI``; ``capabilityURI`` is ``<endpointURI>capability`` and
    is only set once that validation has passed.

    Raises:
        TargetURIException: if validating ``resourceURI`` raises (logged as
            critical first).
    """
    logger.info("Initializing RSRestClient")
    # ensure that there is a trailing slash on the endpoint
    hasSlash = endpointURI[-1] == "/"
    self.endpointURI = endpointURI if hasSlash else str(endpointURI) + "/"

    self.resourceURI = str(self.endpointURI) + "resource"
    logger.info("Checking resourceURI: " + str(self.resourceURI))
    try:
        Utils.checkURI(self.resourceURI)
    except Exception as err:
        logger.critical("ResourceURI did not validate: " +
                        str(self.resourceURI) + " ERROR:" + str(err))
        raise TargetURIException(
            "ResourceURI did not validate: " + str(self.resourceURI), err)

    self.capabilityURI = str(self.endpointURI) + "capability"
def initRecord(self):
    """Load the resource ``self.resID`` and its JSON content for mapping.

    Populates ``self.resource`` from ``<targetURI>/resource/<resID>``, builds
    ``self.logger`` from its namespaces, then reads the JSON record at
    ``<targetURI>/content/<resID>``. For records that are not deleted,
    ``self.resultRecord`` is started from
    ``MOHUBBaseRecord().getBaseRecord()``.

    Returns:
        The decoded JSON record.

    Raises:
        gError: if fetching or decoding the content raises.
    """
    base = str(self.config.targetURI)
    resID = str(self.resID)

    self.resource = Utils.jsonToDict(
        Utils.getContent(base + "/resource/" + resID))
    self.logger = gl.gleanomaticLogger(self.resource['sourceNamespace'],
                                       self.resource['setNamespace'],
                                       'MimsyMap')
    self.logger.info("Initializing record in MimsyMap.")

    contentURL = base + "/content/" + resID
    try:
        record = Utils.getJSONFromResponse(Utils.getResponse(contentURL))
    except Exception as err:
        raise gError("Could not get data from url", err, self.logger)

    if not self.deleted:
        self.resultRecord = MOHUBBaseRecord().getBaseRecord()
    return record
def loadManifestIDs(self, sourceNamespace, setNamespace, batchTag):
    """Return the resource IDs listed in one batch's manifest.

    The manifest is read from
    ``<endpointURI>/static/<sourceNamespace>/<setNamespace>/<batchTag>/manifest``.
    For each line, the text after the last ``><`` is taken and the
    ``/resource/`` prefix and any ``>`` characters are removed.

    Returns:
        A list of ID strings, or ``False`` when ``Utils.checkURI`` returns a
        falsy value.
    """
    url = (self.endpointURI + "/static/" + str(sourceNamespace) + "/" +
           str(setNamespace) + "/" + str(batchTag) + "/manifest")
    if not Utils.checkURI(url):
        return False

    contents = Utils.getContent(url)
    ids = []
    for line in contents.split("\n"):
        resourceID = line.split('><')[-1]
        resourceID = resourceID.replace('/resource/', '').replace('>', '')
        resourceID = resourceID.strip()
        # Blank lines (for example after a trailing newline) used to be
        # returned as empty-string IDs; skip them.
        if resourceID:
            ids.append(resourceID)
    return ids
def __init__(self, sourceNamespace, setNamespace, opts):
    """Initialise the base loader and connect to the Elasticsearch index.

    Requires ``ESHost``, ``ESPort``, ``ESIndex``, ``ESType`` and ``body`` in
    *opts*. The client is created with ``self.timeout``, which is not in the
    required list, so it must be supplied through *opts* or set by the base
    class. ``self.baseRecordURL`` is set to
    ``http://<ESHost>:<ESPort>/<ESIndex>/<ESType>/``.

    Raises:
        RSLoaderError: if a required option is missing
            (``validateRequired`` raised ``ValueError``) or the index does
            not exist.
    """
    super().__init__(sourceNamespace, setNamespace, opts)
    self.logger.info("initializing ESLoader")
    try:
        Utils.validateRequired(
            opts, ['ESHost', 'ESPort', 'ESIndex', 'ESType', 'body'])
    except ValueError as e:
        raise RSLoaderError("Missing required parameter.", e, self.logger)

    self.es = Elasticsearch([{
        'host': self.ESHost,
        'port': self.ESPort
    }], timeout=self.timeout)

    if not self.es.indices.exists(index=self.ESIndex):
        # Same three-argument form as the raise above. str() keeps a
        # non-string index name from turning this into a TypeError.
        raise RSLoaderError("ES Index " + str(self.ESIndex) + " not exist.",
                            None, self.logger)

    self.baseRecordURL = ('http://' + str(self.ESHost) + ':' +
                          str(self.ESPort) + '/' + str(self.ESIndex) + '/' +
                          str(self.ESType) + '/')
def loadResourceListIndex(self, sourceNamespace, setNamespace):
    """Return the JSON resource-list URLs from a namespace pair's index.

    The index is read from
    ``<endpointURI>/RS/<sourceNamespace>/<setNamespace>/resourcelistindex.json``.
    For every ``sitemapindex/sitemap`` entry whose ``rs:ln`` link has an
    ``@type`` of ``application/json`` (compared case-insensitively), the
    link's ``@href`` is collected.

    Returns:
        A list of hrefs (possibly empty), or ``False`` when
        ``Utils.checkURI`` returns a falsy value.
    """
    url = (self.endpointURI + "/RS/" + str(sourceNamespace) + "/" +
           str(setNamespace) + "/resourcelistindex.json")
    if not Utils.checkURI(url):
        return False

    data = Utils.getJSONFromResponse(Utils.getResponse(url))
    urls = []
    if 'sitemapindex' not in data:
        return urls
    if 'sitemap' not in data['sitemapindex']:
        return urls

    for entry in data['sitemapindex']['sitemap']:
        if 'rs:ln' not in entry:
            continue
        link = entry['rs:ln']
        if '@type' not in link:
            continue
        if str(link['@type']).lower() == 'application/json':
            urls.append(link['@href'])
    return urls
def __init__(self,sourceNamespace,setNamespace,opts=None):
    """Set up a loader: batch tag, logger, target endpoint and options.

    Args:
        sourceNamespace: stored on the instance and passed to the logger.
        setNamespace: stored on the instance and passed to the logger.
        opts: optional mapping; every entry is copied onto the instance as an
            attribute after the defaults are set, so it can override them.
            ``None`` (the default) means no extra options.
    """
    self.batchTag = Utils.getCurrentBatchTimestamp()
    self.logger = gl.gleanomaticLogger(sourceNamespace,setNamespace,self.batchTag)
    self.logger.info("Initializing RSLoader")
    self.targetURI = appConfig.targetURI
    self.targetEndpoint = rc.RSRestClient(self.targetURI,self.logger)
    self.sourceNamespace = sourceNamespace
    self.setNamespace = setNamespace
    self.createDump = appConfig.createDump
    # The default used to be a shared mutable dict (opts={}); None avoids
    # that pitfall and also lets callers pass None explicitly.
    for key, value in (opts or {}).items():
        setattr(self, key, value)
def loadResourceListIDs(self, url):
    """Return the resource IDs referenced by the resource list at *url*.

    *url* is first passed through ``self.convertToRSDomain``. For every
    ``urlset/url`` entry whose ``rs:ln`` link has a ``rel`` of
    ``describedby`` (compared case-insensitively), the link's ``href`` with
    ``/resource/`` removed is collected.

    Returns:
        A list of IDs (possibly empty), or ``False`` when ``Utils.checkURI``
        returns a falsy value.
    """
    # NOTE(review): this reads 'rel'/'href' while loadResourceListIndex reads
    # '@type'/'@href' - confirm which key style the JSON really uses.
    url = self.convertToRSDomain(url)
    if not Utils.checkURI(url):
        return False

    data = Utils.getJSONFromResponse(Utils.getResponse(url))
    ids = []
    if 'urlset' not in data:
        return ids
    if 'url' not in data['urlset']:
        return ids

    for entry in data['urlset']['url']:
        if 'rs:ln' not in entry:
            continue
        link = entry['rs:ln']
        if 'rel' not in link:
            continue
        if str(link['rel']).lower() == 'describedby':
            ids.append(link['href'].replace('/resource/', ''))
    return ids
def loadIDs(self, limit=None):
    """Fill ``self.resourceIDs`` according to ``self.mode``.

    * ``'all'``: load the resource list index, then keep appending the pages
      returned by ``self.getNextIDs()`` until a page is empty or, when
      *limit* is set, more than *limit* IDs have been collected.
    * ``'latest'``: read the capability list (only its first *limit* entries
      when *limit* is set), derive a numeric batch tag from each entry, and
      load the manifest IDs of the highest tag. The tag comes from
      ``rs:md/@until`` when present, otherwise from a ``.zip`` file name in
      ``loc``.
    * any other mode: print a notice and load nothing.

    Returns:
        Always ``True``.
    """
    if self.mode == 'all':
        self.index = self.targetEndpoint.loadResourceListIndex(
            self.sourceNamespace, self.setNamespace)
        while True:
            ids = self.getNextIDs()
            if not ids:
                break
            self.resourceIDs.extend(ids)
            if limit and len(self.resourceIDs) > limit:
                break
    elif self.mode == 'latest':
        # The endpoint may hand back a falsy value instead of a list (the
        # loadCapabilityList in this file returns False when its URL check
        # fails); treat that as "no capabilities" rather than slicing it.
        capURLs = self.targetEndpoint.loadCapabilityList(
            self.sourceNamespace, self.setNamespace) or []
        if limit:
            capURLs = capURLs[:limit]

        batchTags = []
        for record in capURLs:
            thisTag = None
            if 'rs:md' in record and '@until' in record['rs:md']:
                thisTag = Utils.getBatchTimestamp(record['rs:md']['@until'])
            elif 'loc' in record and '.zip' in record['loc']:
                # is it a zip file named by batchtag?
                filename = record['loc'].split('/')[-1]
                thisTag = filename.replace('.zip', '')
            if thisTag:
                batchTags.append(int(thisTag))

        # With no usable tag there is nothing to load; previously
        # batchTags[-1] raised IndexError here.
        if batchTags:
            latestTag = max(batchTags)
            ids = self.targetEndpoint.loadManifestIDs(
                self.sourceNamespace, self.setNamespace, latestTag)
            if ids:
                self.resourceIDs.extend(ids)
    else:
        # Previously this referenced an undefined local `mode` and raised
        # NameError instead of reporting the problem.
        print("Unknown mode: " + str(self.mode))
    return True
def pullDynamicOAI(self):
    """Harvest all identifiers from ``self.OAISource`` and queue GetRecord URLs.

    Starts with a ``ListIdentifiers`` request for ``self.OAIMetaDataPrefix``
    (restricted to ``self.OAIset`` when that is truthy). Each response is
    split on ``<identifier>`` tags; every identifier becomes a ``GetRecord``
    URL and each page of URLs is passed to ``self.addBatch``. Harvesting
    continues while ``self.getResumptionToken`` yields a token.

    Raises:
        ValueError: if ``self.getError`` finds an error in a response
            (logged as critical first).
    """
    url = (str(self.OAISource) + "?verb=ListIdentifiers&metadataPrefix=" +
           str(self.OAIMetaDataPrefix))
    if self.OAIset:
        url = url + "&set=" + str(self.OAIset)

    while url:
        logger.info("Pulling dynamic OAI from " + str(url))
        data = Utils.getContent(url)

        OAIerror = self.getError(data)
        if OAIerror:
            logger.critical(
                self.msg("Could not pull OAI records. Error: " +
                         str(OAIerror)))
            raise ValueError("Could not pull OAI records. ERROR: " +
                             str(OAIerror))

        # first item is the header
        chunks = data.split('<identifier>')[1:]
        recordBase = (str(self.OAISource) +
                      "?verb=GetRecord&metadataPrefix=" +
                      str(self.OAIMetaDataPrefix) + "&identifier=")
        records = [
            recordBase + str(chunk.split('</identifier>')[0])
            for chunk in chunks
        ]
        self.addBatch(records)

        rToken = self.getResumptionToken(data)
        if not rToken:
            url = None
            continue
        url = (str(self.OAISource) +
               "?verb=ListIdentifiers&resumptionToken=" + str(rToken))
def deleteResource(self, uri):
    """Delete the resource at *uri* and return the body of the response.

    Returns:
        The result of calling ``read()`` on the response from
        ``Utils.deleteContent``.

    Raises:
        Exception: if ``Utils.deleteContent`` returns a falsy value.
    """
    response = Utils.deleteContent(uri)
    if response:
        return response.read()
    raise Exception("Could not delete resource at " + str(uri))