def __init__(self, opts=Opts()):
    """Wire up the fetcher, transformer, date window and CouchDB tracker.

    opts -- options object exposing ``settings`` (with "namespaces",
            "config" and "identity" entries) and ``signtool``; it may also
            carry ``prev_success``, ``from_date`` and ``until_date``.

    NOTE: the default ``Opts()`` is evaluated once, when this ``def`` is
    executed, and that single instance is shared by every call that omits
    ``opts``.
    """
    self.opts = opts
    self.namespaces = opts.settings["namespaces"]
    self.config = opts.settings["config"]
    self.identity = opts.settings["identity"]
    self.signtool = opts.signtool
    self.fetcher = oaipmh.Fetcher(namespaces=opts.settings["namespaces"],
                                  conf=opts.settings["config"])
    self.docList = {}

    # Select the transformer that matches the configured metadata prefix;
    # unknown prefixes leave the transformer unset.
    prefix = self.opts.settings["config"]["metadataPrefix"]
    if prefix == "nsdl_dc":
        col_names = self.fetcher.fetchCollections()
        self.transformer = NSDL(identity=self.identity, config=self.config,
                                namespaces=self.namespaces, col_map=col_names)
    elif prefix == "comm_para":
        self.transformer = CommPara(identity=self.identity, config=self.config,
                                    namespaces=self.namespaces)
    elif prefix == "oai_dc":
        self.transformer = OAIDC(identity=self.identity, config=self.config,
                                 namespaces=self.namespaces)
    else:
        self.transformer = None

    # Each optional attribute falls back to a default when reading it fails.
    # ``except Exception`` (rather than a bare except) keeps the fallback
    # but lets KeyboardInterrupt / SystemExit propagate.
    try:
        self.prev_success = self.opts.prev_success
    except Exception:
        self.prev_success = False

    try:
        self.from_date = self.opts.from_date
    except Exception:
        # No explicit start date: begin at the earliest datestamp the
        # fetcher reports and disregard any previous-success flag.
        self.from_date = self.fetcher.fetchEarliestDatestamp()
        self.prev_success = False

    try:
        self.until_date = self.opts.until_date
    except Exception:
        # No explicit end date: harvest up to the current UTC time.
        now = datetime.utcnow()
        self.until_date = now.strftime("%Y-%m-%dT%H:%M:%SZ")
        self.prev_success = False

    self.couch = CouchDB(self.opts, self.from_date, self.until_date)
    self.completed_set = False
class Run():
    """Harvest records, transform and sign them, and publish them in chunks.

    Records come from ``oaipmh.Fetcher``; the transformer is chosen from the
    configured ``metadataPrefix``; documents are POSTed to
    ``<LEARNING_REGISTRY_URL>/publish`` or, when that URL is "-", dumped to
    stdout instead.
    """

    def __init__(self, opts=Opts()):
        """Wire up the fetcher, transformer, date window and CouchDB tracker.

        opts -- options object exposing ``settings`` (with "namespaces",
                "config" and "identity" entries) and ``signtool``; it may
                also carry ``prev_success``, ``from_date`` and
                ``until_date``.

        NOTE: the default ``Opts()`` is evaluated once, when the class body
        is executed, and that instance is shared by every call that omits
        ``opts``.
        """
        self.opts = opts
        self.namespaces = opts.settings["namespaces"]
        self.config = opts.settings["config"]
        self.identity = opts.settings["identity"]
        self.signtool = opts.signtool
        self.fetcher = oaipmh.Fetcher(namespaces=opts.settings["namespaces"],
                                      conf=opts.settings["config"])
        self.docList = {}
        # Counter for the dry-run chunk banner printed by publishToNode().
        # It lives here so it is not reset each time the endpoint is built.
        self.chunk = 0

        # Select the transformer that matches the configured metadata
        # prefix; unknown prefixes leave the transformer unset.
        prefix = self.opts.settings["config"]["metadataPrefix"]
        if prefix == "nsdl_dc":
            col_names = self.fetcher.fetchCollections()
            self.transformer = NSDL(identity=self.identity, config=self.config,
                                    namespaces=self.namespaces, col_map=col_names)
        elif prefix == "comm_para":
            self.transformer = CommPara(identity=self.identity, config=self.config,
                                        namespaces=self.namespaces)
        elif prefix == "oai_dc":
            self.transformer = OAIDC(identity=self.identity, config=self.config,
                                     namespaces=self.namespaces)
        else:
            self.transformer = None

        # Each optional attribute falls back to a default when reading it
        # fails. ``except Exception`` keeps the fallback but lets
        # KeyboardInterrupt / SystemExit propagate.
        try:
            self.prev_success = self.opts.prev_success
        except Exception:
            self.prev_success = False

        try:
            self.from_date = self.opts.from_date
        except Exception:
            # No explicit start date: begin at the earliest datestamp the
            # fetcher reports and disregard any previous-success flag.
            self.from_date = self.fetcher.fetchEarliestDatestamp()
            self.prev_success = False

        try:
            self.until_date = self.opts.until_date
        except Exception:
            # No explicit end date: harvest up to the current UTC time.
            now = datetime.utcnow()
            self.until_date = now.strftime("%Y-%m-%dT%H:%M:%SZ")
            self.prev_success = False

        self.couch = CouchDB(self.opts, self.from_date, self.until_date)
        self.completed_set = False

    def getPublishEndpoint(self):
        """Build and return the publish request, or None in dry-run mode.

        When ``LEARNING_REGISTRY_URL`` is "-" no request is built and None
        is returned. Otherwise a ``urllib2.Request`` for ``<url>/publish``
        is returned, carrying a Basic Authorization header when both
        ``publish_user`` and ``publish_passwd`` are configured.
        The result is also stored on ``self.publishEndpoint``.
        """
        if self.opts.LEARNING_REGISTRY_URL == "-":
            self.publishEndpoint = None
        else:
            hdrs = {"Content-Type": "application/json; charset=utf-8"}
            try:
                user = self.config["publish_user"]
                passwd = self.config["publish_passwd"]
                if user is not None and passwd is not None:
                    creds = "{u}:{p}".format(u=user.strip(), p=passwd.strip())
                    # b64encode emits no line breaks; encodestring wraps its
                    # output every 76 characters, which corrupts the header
                    # for long credentials.
                    hdrs['Authorization'] = 'Basic ' + base64.b64encode(creds)
            except KeyError:
                # Credentials are optional: publish without authentication.
                pass
            except Exception:
                # Still best-effort, but leave a trace instead of hiding it.
                log.exception("Could not build the Authorization header; "
                              "publishing without credentials.")
            self.publishEndpoint = urllib2.Request(
                "{server}/publish".format(server=self.opts.LEARNING_REGISTRY_URL),
                headers=hdrs)
        return self.publishEndpoint

    def sign(self, doc):
        """Return ``doc`` signed by the sign tool, or ``doc`` unchanged.

        The document is returned as-is when it is None or no sign tool is
        configured. A missing or empty signature is logged, but the signed
        document is still returned.
        """
        if doc is None or self.signtool is None:
            return doc

        signed = self.signtool.sign(doc)
        try:
            if len(signed["digital_signature"]["signature"]) == 0:
                log.error("Problem signing document")
        except Exception:
            log.exception("There's a problem with the digital_signature")
        return signed

    @LogStartStop()
    def connect(self):
        """Run the harvest: fetch, transform, sign, queue and publish.

        Any error stops the harvest and is logged rather than raised; the
        harvest state is stored in every case.
        """
        try:
            for recset in self.fetcher.fetchRecords():
                for rec in recset:
                    if self.transformer is not None:
                        (repo_id, doc) = self.transformer.format(rec)
                        seen = self.couch.have_i_seen(repo_id)
                        # Queue records never seen before, or seen but not
                        # successfully published.
                        if not seen or seen["published"] == False:
                            doc = self.sign(doc)
                            if doc is not None and repo_id is not None:
                                self.docList[repo_id] = doc
                    # Publishes only once a full chunk has accumulated.
                    self.publishToNode()
                self.publishToNode()
            # Flush whatever is left, regardless of chunk size.
            self.publishToNode(force=True)
            self.completed_set = True
        except (Exception, KeyboardInterrupt):
            # Top-level boundary: log and fall through so the state below
            # is still written; an interrupted run counts as incomplete.
            log.exception("Stopping")
        finally:
            self.storeHistory()

    def storeHistory(self):
        """Record the harvest outcome and date window in the config file.

        On a completed harvest the CouchDB tracker is told to forget
        everything. The state is merged into the "config" section of
        ``CONFIG_FILE`` when that file exists.
        """
        state = {
            "harvest_completed": self.completed_set,
            "harvest_from": self.from_date,
            "harvest_until": self.until_date
        }
        if self.completed_set:
            self.couch.forget_everything()

        conf_path = self.opts.CONFIG_FILE
        if conf_path is not None and os.path.exists(conf_path):
            # Read and close before reopening the same path for writing.
            with codecs.open(conf_path, "r", encoding="utf-8") as src:
                extConf = json.load(src)
            extConf["config"].update(state)
            with codecs.open(conf_path, "w", encoding="utf-8") as out:
                out.write(json.dumps(extConf, indent=4, sort_keys=True))

    def publishToNode(self, force=False):
        """Publish the queued documents to the Learning Registry node.

        Nothing is sent until at least ``CHUNKSIZE`` documents are queued,
        unless ``force`` is true and the queue is non-empty. In dry-run mode
        (no endpoint) the chunk is printed to stdout instead. The queue is
        cleared after a successful publish or print.

        Raises whatever the HTTP call or response handling raises, after
        logging it; in that case the queue is left untouched.
        """
        numDocs = len(self.docList)
        if not (self.opts.CHUNKSIZE <= numDocs or (numDocs > 0 and force)):
            log.debug("Nothing is being updated.")
            return

        try:
            # Keep ids and documents in matching order so each publish
            # result can be attributed to its repository id.
            repo_ids = list(self.docList.keys())
            docList = [self.docList[repo_id] for repo_id in repo_ids]
            body = {"documents": docList}

            endpoint = self.getPublishEndpoint()
            if endpoint is not None:
                payload = json.dumps(body)
                response = urllib2.urlopen(endpoint, data=payload)
                publishStatus = json.load(response)
                if not publishStatus["OK"]:
                    log.error(publishStatus["error"])

                nonpubcount = 0
                for repo_id, result in zip(repo_ids, publishStatus["document_results"]):
                    if not result["OK"]:
                        nonpubcount += 1
                        published = False
                        log.error("REPOID:{repoid} DOCID:{docid} ERROR: {msg}".format(
                            repoid=repo_id,
                            docid=result.get("doc_ID", "Unknown ID"),
                            msg=result.get("error", "Unknown publishing error.")))
                    else:
                        published = True
                        log.info("Published doc id : %s", result["doc_ID"])
                    self.couch.saw(repo_id, published)

                pubcount = numDocs - nonpubcount
                # Size of what was actually sent; sys.getsizeof on the dict
                # would only measure the container, not the documents.
                size = len(payload) // 1024
                log.info("Published {pub} documents ({kbytes}KB), {nonpub} documents were not published.".format(
                    pub=pubcount, nonpub=nonpubcount, kbytes=size))
            else:
                self.chunk += 1
                print("/********* CHUNK {chunkNumber} *********/".format(chunkNumber=self.chunk))
                print(json.dumps(body, indent=4))
            self.docList.clear()
        except HTTPError as e:
            # HTTPError carries the status in .code and the reason in .msg.
            log.error("HTTP Error encoutered:{0} message:{1}".format(e.code, e.msg))
            raise
        except Exception:
            log.exception("Unexpected error while trying to publish to node.")
            raise