예제 #1
0
파일: run.py 프로젝트: navnorth/LRDataPump
    def __init__(self, opts=Opts()):
        """Build the fetcher, transformer and history store described by ``opts``.

        ``opts`` must provide ``settings`` (with ``namespaces``, ``config`` and
        ``identity`` entries) and ``signtool``.  ``prev_success``,
        ``from_date`` and ``until_date`` are optional; when reading one of
        them fails a fallback value is used, and a missing date additionally
        forces ``prev_success`` to ``False``.
        """
        self.opts = opts

        settings = opts.settings
        self.namespaces = settings["namespaces"]
        self.config = settings["config"]
        self.identity = settings["identity"]
        self.signtool = opts.signtool

        self.fetcher = oaipmh.Fetcher(namespaces=self.namespaces, conf=self.config)
        # Documents waiting to be published, keyed by repository id.
        self.docList = {}

        # Choose the transformer that matches the configured metadata format.
        prefix = self.config["metadataPrefix"]
        if prefix == "nsdl_dc":
            # Only the NSDL transformer is given the collection map.
            col_names = self.fetcher.fetchCollections()
            self.transformer = NSDL(identity=self.identity, config=self.config,
                                    namespaces=self.namespaces, col_map=col_names)
        elif prefix == "comm_para":
            self.transformer = CommPara(identity=self.identity, config=self.config,
                                        namespaces=self.namespaces)
        elif prefix == "oai_dc":
            self.transformer = OAIDC(identity=self.identity, config=self.config,
                                     namespaces=self.namespaces)
        else:
            # Unknown format: no transformer is available.
            self.transformer = None

        # The lookups below are best-effort.  ``except Exception`` keeps that
        # behaviour without also trapping KeyboardInterrupt/SystemExit the way
        # a bare ``except:`` does.
        try:
            self.prev_success = self.opts.prev_success
        except Exception:
            self.prev_success = False

        try:
            self.from_date = self.opts.from_date
        except Exception:
            # No start date supplied: ask the fetcher for its earliest datestamp.
            self.from_date = self.fetcher.fetchEarliestDatestamp()
            self.prev_success = False

        try:
            self.until_date = self.opts.until_date
        except Exception:
            # No end date supplied: use the current UTC time.
            self.until_date = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")
            self.prev_success = False

        self.couch = CouchDB(self.opts, self.from_date, self.until_date)
        # Set to True by the harvest loop once every record set was processed.
        self.completed_set = False
예제 #2
0
파일: run.py 프로젝트: navnorth/LRDataPump
class Run():
    """Fetch records through ``oaipmh.Fetcher``, transform and sign them, and
    publish them in chunks to the configured Learning Registry URL.

    When ``opts.LEARNING_REGISTRY_URL`` is ``"-"`` nothing is sent; each chunk
    is printed to stdout instead.
    """

    def __init__(self, opts=Opts()):
        """Build the fetcher, transformer and history store described by ``opts``.

        ``opts`` must provide ``settings`` (with ``namespaces``, ``config`` and
        ``identity`` entries) and ``signtool``.  ``prev_success``,
        ``from_date`` and ``until_date`` are optional; when reading one of
        them fails a fallback value is used, and a missing date additionally
        forces ``prev_success`` to ``False``.
        """
        self.opts = opts

        settings = opts.settings
        self.namespaces = settings["namespaces"]
        self.config = settings["config"]
        self.identity = settings["identity"]
        self.signtool = opts.signtool

        self.fetcher = oaipmh.Fetcher(namespaces=self.namespaces, conf=self.config)
        # Documents waiting to be published, keyed by repository id.
        self.docList = {}
        # Number of chunks printed so far when running without an endpoint.
        self.chunk = 0
        self.publishEndpoint = None

        # Choose the transformer that matches the configured metadata format.
        prefix = self.config["metadataPrefix"]
        if prefix == "nsdl_dc":
            # Only the NSDL transformer is given the collection map.
            col_names = self.fetcher.fetchCollections()
            self.transformer = NSDL(identity=self.identity, config=self.config,
                                    namespaces=self.namespaces, col_map=col_names)
        elif prefix == "comm_para":
            self.transformer = CommPara(identity=self.identity, config=self.config,
                                        namespaces=self.namespaces)
        elif prefix == "oai_dc":
            self.transformer = OAIDC(identity=self.identity, config=self.config,
                                     namespaces=self.namespaces)
        else:
            # Unknown format: connect() will fetch but never queue documents.
            self.transformer = None

        # The lookups below are best-effort.  ``except Exception`` keeps that
        # behaviour without also trapping KeyboardInterrupt/SystemExit.
        try:
            self.prev_success = self.opts.prev_success
        except Exception:
            self.prev_success = False

        try:
            self.from_date = self.opts.from_date
        except Exception:
            # No start date supplied: ask the fetcher for its earliest datestamp.
            self.from_date = self.fetcher.fetchEarliestDatestamp()
            self.prev_success = False

        try:
            self.until_date = self.opts.until_date
        except Exception:
            # No end date supplied: use the current UTC time.
            self.until_date = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")
            self.prev_success = False

        self.couch = CouchDB(self.opts, self.from_date, self.until_date)
        # Set to True by connect() once every record set was processed.
        self.completed_set = False

    def getPublishEndpoint(self):
        """Return the ``urllib2.Request`` to POST documents to, or ``None``.

        ``None`` is returned when ``opts.LEARNING_REGISTRY_URL`` is ``"-"``.
        A Basic ``Authorization`` header is added when both ``publish_user``
        and ``publish_passwd`` are present and not ``None`` in the config.
        The result is also stored on ``self.publishEndpoint``.
        """
        if self.opts.LEARNING_REGISTRY_URL == "-":
            self.publishEndpoint = None
            # Make sure the counter exists, but do not reset it: this method
            # runs once per chunk, and resetting it made every printed chunk
            # report number 1.
            self.chunk = getattr(self, "chunk", 0)
            return self.publishEndpoint

        hdrs = {"Content-Type": "application/json; charset=utf-8"}

        try:
            user = self.config["publish_user"]
            passwd = self.config["publish_passwd"]
            if user is not None and passwd is not None:
                creds = "{u}:{p}".format(u=user.strip(), p=passwd.strip())
                # b64encode never inserts line breaks; encodestring adds one
                # per 76 output characters, which corrupts the header for
                # long credentials.
                hdrs['Authorization'] = 'Basic ' + base64.b64encode(creds)
        except KeyError:
            # Credentials are optional; publish without authentication.
            pass
        except Exception:
            # Still best-effort, but no longer silent.
            log.exception("Unable to build the Authorization header; publishing without credentials.")

        self.publishEndpoint = urllib2.Request(
            "{server}/publish".format(server=self.opts.LEARNING_REGISTRY_URL),
            headers=hdrs)
        return self.publishEndpoint

    def sign(self, doc):
        """Return ``doc`` signed by ``self.signtool``.

        ``doc`` is returned unchanged when it is ``None`` or when no signing
        tool is configured.  An empty or unreadable signature is logged, and
        the signed document is returned regardless.
        """
        if doc is None or self.signtool is None:
            return doc

        signed = self.signtool.sign(doc)
        try:
            if len(signed["digital_signature"]["signature"]) == 0:
                log.error("Problem signing document")
        except Exception:
            log.exception("There's a problem with the digital_signature")
        return signed

    @LogStartStop()
    def connect(self):
        """Run the harvest: fetch, transform, sign, queue and publish records.

        Documents are queued unless the history store reports them as already
        published.  ``publishToNode`` is invoked after every record and every
        record set (it only sends once enough documents are queued) and
        forced at the end.  Errors are logged rather than raised, and
        ``storeHistory`` always runs.
        """
        try:
            for recset in self.fetcher.fetchRecords():
                for rec in recset:
                    if self.transformer is not None:
                        repo_id, doc = self.transformer.format(rec)
                        seen = self.couch.have_i_seen(repo_id)
                        if not seen or seen["published"] == False:
                            doc = self.sign(doc)
                            if doc is not None and repo_id is not None:
                                self.docList[repo_id] = doc

                    self.publishToNode()
                self.publishToNode()
            self.publishToNode(force=True)
            self.completed_set = True
        except:
            # Deliberately broad: any interruption ends the harvest here, is
            # logged, and still lets the history be stored below.
            log.exception("Stopping")
        finally:
            self.storeHistory()

    def storeHistory(self):
        """Record the harvest outcome in the external config file.

        When the harvest completed, the history store is told to forget
        everything.  If ``opts.CONFIG_FILE`` is set and exists, its
        ``"config"`` section is updated with the completion flag and the
        from/until dates, and the file is rewritten.
        """
        state = {
            "harvest_completed": self.completed_set,
            "harvest_from": self.from_date,
            "harvest_until": self.until_date,
        }

        if self.completed_set:
            self.couch.forget_everything()

        conf_path = self.opts.CONFIG_FILE
        if conf_path is None or not os.path.exists(conf_path):
            return

        # Close the read handle deterministically instead of leaking it.
        with codecs.open(conf_path, "r", encoding="utf-8") as src:
            extConf = json.load(src)
        extConf["config"].update(state)

        # Serialize before opening for writing, so a serialization failure
        # cannot leave a truncated config file behind.
        serialized = json.dumps(extConf, indent=4, sort_keys=True)
        with codecs.open(conf_path, "w", encoding="utf-8") as out:
            out.write(serialized)

    def publishToNode(self, force=False):
        '''
        Save to Learning Registry

        Sends the queued documents once at least ``opts.CHUNKSIZE`` of them
        are waiting, or, when ``force`` is true, as soon as any are waiting.
        Without an endpoint the chunk is printed to stdout instead.  Each
        per-document result is recorded through ``self.couch.saw`` and the
        queue is cleared afterwards.  Errors are logged and re-raised.
        '''
        numDocs = len(self.docList)
        if not (self.opts.CHUNKSIZE <= numDocs or (numDocs > 0 and force)):
            log.debug("Nothing is being updated.")
            return

        try:
            # Fix the order once so ids and documents stay aligned.
            repo_ids = list(self.docList.keys())
            body = {"documents": [self.docList[key] for key in repo_ids]}
            endpoint = self.getPublishEndpoint()
            if endpoint is not None:
                payload = json.dumps(body)
                response = urllib2.urlopen(endpoint, data=payload)
                try:
                    publishStatus = json.load(response)
                finally:
                    response.close()

                if not publishStatus["OK"]:
                    log.error(publishStatus["error"])

                nonpubcount = 0
                # Results are matched to repository ids by position.
                for repo_id, result in zip(repo_ids, publishStatus["document_results"]):
                    if not result["OK"]:
                        nonpubcount += 1
                        if "doc_ID" not in result:
                            result["doc_ID"] = "Unknown ID"
                        if "error" not in result:
                            result["error"] = "Unknown publishing error."
                        published = False
                        log.error("REPOID:{repoid} DOCID:{docid} ERROR: {msg}".format(repoid=repo_id, docid=result["doc_ID"], msg=result["error"]))
                    else:
                        published = True
                        log.info("Published doc id : %s", result["doc_ID"])
                    self.couch.saw(repo_id, published)

                pubcount = numDocs - nonpubcount
                # Report the size of what was sent; sys.getsizeof on the dict
                # is shallow and ignores the documents themselves.
                size = len(payload) // 1024
                log.info("Published {pub} documents ({kbytes}KB), {nonpub} documents were not published.".format(pub=pubcount, nonpub=nonpubcount, kbytes=size))
            else:
                # Dry run: number the chunk and print it.
                self.chunk = getattr(self, "chunk", 0) + 1
                print("/********* CHUNK {chunkNumber} *********/".format(chunkNumber=self.chunk))
                print(json.dumps(body, indent=4))

            self.docList.clear()

        except HTTPError as e:
            # HTTPError carries the status in ``code`` and the reason in
            # ``msg``; ``errno``/``strerror`` are not populated for it.
            log.error("HTTP Error encoutered:{0}  message:{1}".format(e.code, e.msg))
            raise
        except Exception:
            log.exception("Unexpected error while trying to publish to node.")
            raise