def load(cache, auId, doUrls, expire, trials, sleep, timeout):
    act = Action.GETAUSUMMARY
    if (doUrls):
        act = act + "," + Action.GETURLLIST
    log.info('Start %s: %s expire=%s ...' % (act, auId, str(expire)))
    success = False
    try:
        for _ in range(trials):
            try:
                if (doUrls):
                    UrlReport.loadTry(cache.ui, auId, expire)
                else:
                    LockssCacheAuSummary.__loadTry(cache.ui, auId, expire)
                success = True
                break
            except urllib2.HTTPError as inst:
                cache.reconnect(sleep, timeout)
            except ExpatError:
                log.error("XML Parser error; could not %s %s" % (act, auId))
                success = False  # try again
        if (not success):
            log.error("exhausted trials for %s; could not load %s" % (auId, act))
    except LockssError as inst:
        log.warn("LockssException: %s" % inst)  # output is scanned for the ERROR string
        log.warn("could not digest %s for %s" % (act, auId.auId))
    finally:
        log.debug2('Stop %s: %s Success = %s ...' % (act, auId, success))
    return success
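# Hedged usage sketch (not part of the original module): 'cache' and 'auid'
# below stand for a connected cache object and a LockssCacheAuId obtained
# elsewhere; 'expire' is assumed to be a datetime.timedelta, since it is
# compared against report ages in the loaders below.
#
#   from datetime import timedelta
#   ok = load(cache, auid, doUrls=True, expire=timedelta(days=1),
#             trials=3, sleep=10, timeout=30)
#   if not ok:
#       log.error("giving up on %s" % auid)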
def collectAuIdInstances(self, cache):
    ids = []
    if (self.options.all):
        ids = cache.locksscacheauid_set.all()
    else:
        ids = LockssCacheAuId.get(self.options.auids, self.options.auidprefixlist, cache)
    log.info("#Matching AUIDS: %d" % len(ids))
    for au in ids:
        log.debug2("Matching AUIDS: %s" % au.auId)
    return ids
def mkdir(self, action, server):
    dirname = "%s/%s" % (self.options.dir, server)
    if not os.path.exists(dirname):
        try:
            os.makedirs(dirname, 0777)
            log.info("created output directory %s" % dirname)
        except OSError:
            log.error("Could not create %s" % dirname)
            return None
    else:
        log.debug2("using output directory %s" % dirname)
    return dirname
def loadTry(ui, auId, datafresh):
    '''
    create/update the urlReport and its associated urls
    (deleting any existing urls)
    '''
    urlReport = None
    reportDate = datetime.utcnow().replace(tzinfo=pytz.UTC)
    try:
        # check for existing au summary
        urlReport = UrlReport.objects.get(auId=auId)
        if ((reportDate - urlReport.reportDate) < datafresh):
            log.debug2("uptodate UrlReport available - not querying cache")
            return urlReport
    except UrlReport.DoesNotExist:
        pass
    log.info("get UrlReport %s" % auId)
    (reportDate, urllist) = LockssCacheAuSummary.loadTryIt(ui, auId, True)
    if (not urllist):
        raise LockssError("%s on cache %s reports 0 urls" % (str(auId), auId.cache.name))
    if (not urlReport):
        urlReport = UrlReport.objects.create(reportDate=reportDate, auId=auId)
    else:
        Url.objects.filter(urlReport=urlReport).delete()
        urlReport.reportDate = reportDate
        urlReport.save()
    log.debug("urlReport %s" % urlReport)
    try:
        for url in urllist:
            # work only on urls with real content
            if (not url.has_key('NodeContentSize') or url['NodeContentSize'] == '-'):
                continue
            u = {'urlReport': urlReport}
            u['name'] = url[u'NodeName']
            u['childCount'] = url[u'NodeChildCount'] if url.has_key(u'NodeChildCount') else 0
            u['treeSize'] = url[u'NodeTreeSize'] if url.has_key(u'NodeTreeSize') else 0
            u['size'] = url[u'NodeContentSize'] if url.has_key(u'NodeContentSize') else 0
            if url.has_key(u'NodeVersion'):
                if (url[u'NodeVersion'] == '1'):
                    u['version'] = 1
                else:
                    u['version'] = url[u'NodeVersion']['value']
            else:
                u['version'] = 0
            lurl = Url.objects.create(**u)
            lurl.save()
            log.debug2("Url %s " % lurl.name)
    except Exception as e:
        urlReport.delete()  # deletes dependent urls
        raise LockssError("Failed to read Url Info for %s %s\nException %s" %
                          (auId.cache.name, str(auId), str(e)))
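# Illustrative shape of the per-url records consumed above, inferred from the
# keys referenced in the loop rather than from documented output of loadTryIt:
# each entry is assumed to look roughly like
#
#   {u'NodeName': u'http://example.org/file.html',
#    u'NodeChildCount': u'0',
#    u'NodeTreeSize': u'1234',
#    u'NodeContentSize': u'1234',
#    u'NodeVersion': u'1'}              # or {u'value': u'2'} for later versions
#
# Entries whose NodeContentSize is missing or '-' are skipped.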
def printcsv(f, auids, sort, hdrs, sep):
    if (not auids):
        log.info('NOOP %s: No auids for %s' % (Action.PRTAUSUMMARY, f.name))
        return
    cache = auids[0].cache
    log.info('Start %s: Cache %s File %s ...' % (Action.PRTAUSUMMARY, cache, f.name))
    f.write(sep.join(hdrs) + "\n")
    # build query
    qu = Q()
    for auid in auids:
        qu = qu | Q(auId=auid.id)
    sums = LockssCacheAuSummary.objects.filter(qu).order_by(sort)
    for sm in sums:
        f.write(sm.csv(hdrs, sep) + "\n")
    f.close()
    log.debug2('Stop %s: Cache %s File %s ...' % (Action.PRTAUSUMMARY, cache, f.name))
def __loadTry(ui, auId, datafresh):
    '''
    delete the existing LockssCacheAuSummary and create a new one by reading
    status info from the cache; reuse the existing summary if it is still fresh
    '''
    lockssSummary = None
    try:
        # check for existing au summary
        lockssSummary = LockssCacheAuSummary.objects.get(auId=auId)
        if ((datetime.utcnow().replace(tzinfo=pytz.UTC) - lockssSummary.reportDate) < datafresh):
            log.debug2("uptodate LockssCacheAuSummary available - not querying cache")
            return lockssSummary
    except LockssCacheAuSummary.DoesNotExist:
        pass
    log.info("get LockssCacheAuSummary %s" % auId)
    LockssCacheAuSummary.loadTryIt(ui, auId, False)
def load(cache, auId, trials, sleep, timeout):
    success = False
    log.debug2('Start %s: %s ...' % (Action.GETCRAWLSTATUS, auId))
    try:
        log.info('get %s: %s ...' % (Action.GETCRAWLSTATUS, auId))
        for i in range(trials):
            try:
                LockssCrawlStatus.__loadTry(cache.ui, auId)
                success = True
                break
            except urllib2.HTTPError as inst:
                cache.reconnect(sleep, timeout)
        if (not success):
            log.error("exhausted trials for %s, could not load crawlstatus" % (auId))
    except LockssError as inst:
        log.warn("LockssException: %s" % inst)  # output is scanned for the ERROR string
        log.warn("could not digest %s for %s" % (Action.GETCRAWLSTATUS, auId.auId))
    finally:
        log.debug2('Stop %s: %s Success = %s ...' % (Action.GETCRAWLSTATUS, auId, success))
    return success
def __loadTry(ui, auId):
    '''
    update an existing LockssCrawlStatus or create a new one by reading
    status info from the cache
    '''
    log.debug("try %s" % (auId))
    st = {'auId': auId}
    status = ui.getCrawlStatus(auId.masterAuId.getLockssAu())
    reportDate = datetime.utcnow().replace(tzinfo=pytz.UTC)
    if (not status):
        log.debug2("No CrawlStatus Info for %s %s" % (auId.cache, auId.auId))
    else:
        for s in status:
            # work on status info
            if (not s):
                raise LockssError("CrawlStatusTable returned empty info")
            try:
                st['reportDate'] = reportDate
                st['type'] = s['crawl_type']
                st['startTime'] = datetime.strptime(s['start'], utils.UI_STRFTIME)
                st['nBytesFetched'] = s['content_bytes_fetched'].replace(",", "")
                st['status'] = s['crawl_status']['value']
                st['nMimeTypes'] = str(s['num_of_mime_types']['value'].replace(",", ""))
                st['duration'] = str(s['dur'])
                for f in LockssCrawlStatus.INTFIELDMAP:
                    val = s[LockssCrawlStatus.INTFIELDMAP[f]]
                    if (val.__class__ == unicode):
                        st[f] = int(val.replace(",", ""))
                    else:
                        st[f] = str(val['value'].replace(",", ""))
            except KeyError:
                raise LockssError("CrawlStatusTable returned faulty info for %s: %s" % (auId.auId, s))
            try:
                # update existing crawl status
                crawl = LockssCrawlStatus.objects.get(auId=auId, startTime=st['startTime'])
                crawl.__dict__.update(st)
                log.debug("LockssCrawlStatus UPD %s %s" % (crawl.startTime, str(crawl.auId)))
            except ObjectDoesNotExist:
                # create new crawl status
                crawl = LockssCrawlStatus.objects.create(**st)
                log.debug("LockssCrawlStatus NEW %s %s" % (crawl.startTime, str(crawl.auId)))
            crawl.save()
def printcsv(f, auids, sort, limit, hdrs, sep):
    if (not auids):
        log.info('NOOP %s: No auids for %s' % (Action.PRTCRAWLSTATUS, f.name))
        return
    cache = auids[0].cache
    log.info('Start %s: Cache %s File %s ...' % (Action.PRTCRAWLSTATUS, cache, f.name))
    f.write(sep.join(hdrs) + "\n")
    if (limit > 0):
        crawls = []
        for auid in auids:
            crawls = crawls + LockssCrawlStatus.recents(auid, limit)
        crawls = sorted(crawls, key=lambda crawl: crawl.__dict__.get(sort))
    else:
        qu = Q()
        for auid in auids:
            qu = qu | Q(auId=auid.id)
        crawls = LockssCrawlStatus.objects.filter(qu).order_by(sort)
    for st in crawls:
        f.write(st.csv(hdrs, sep) + "\n")
    log.debug2('Stop %s: File %s ...' % (Action.PRTCRAWLSTATUS, f.name))
def printcsv(folder, auids, orderby, hdrs, sep, minrev=1):
    '''
    print url reports for all given auids, including urls that have a version
    at least as great as minrev, which defaults to 1
    '''
    if (not auids):
        log.info('NOOP %s: No auids to print to %s' % (Action.PRTURLLIST, folder))
        return
    for auid in auids:
        urls = []
        try:
            if (orderby == 'minversion' or orderby == 'replication'):
                urls = auid.urlreport.url_set.filter(version__gte=minrev).all()
            else:
                urls = auid.urlreport.url_set.filter(version__gte=minrev).order_by(orderby).all()
            ext = ".tsv"
            if (sep == ","):
                ext = ".csv"
            f = open(folder + "/" + auid.auId + ext, 'w')
            if (urls.count() == 0):
                log.info("NOOP %s: file %s No Urls with version >= %s" % (Action.PRTURLLIST, f.name, minrev))
            log.info('Start %s: file %s version %s' % (Action.PRTURLLIST, f.name, minrev))
            try:
                reportDate = auid.urlreport.reportDate
                f.write("ReportDate\t%s\nIncluding Urls with version >= %s\n\n" % (str(reportDate), minrev))
                f.write(sep.join(hdrs) + "\n")
                for url in urls:
                    f.write(url.csv(hdrs, sep) + "\n")
                log.debug2('Stop %s: file %s version %s' % (Action.PRTURLLIST, f.name, minrev))
                f.close()
            except IndexError:
                log.info("NOOP %s: file %s No Urls at all" % (Action.PRTURLLIST, f.name))
        except ObjectDoesNotExist:
            log.warn('Start %s: No UrlReport for %s at %s' % (Action.PRTURLLIST, auid, auid.cache.name))
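# Hedged usage sketch (names are illustrative, not prescribed by this module):
# write per-AU url reports as tab-separated files into an existing output
# directory, keeping only urls with at least one version. 'processor' stands
# for the object that owns mkdir() above; the header names are examples drawn
# from the fields stored by UrlReport.loadTry.
#
#   outdir = processor.mkdir(Action.PRTURLLIST, cache.name)
#   if outdir:
#       UrlReport.printcsv(outdir, auids, 'name',
#                          ['name', 'version', 'size'], '\t', minrev=1)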
def process_server(self, server):
    '''
    if dryrun, collect matching auids and log.info them;
    otherwise perform all requested actions
    '''
    log.info("------ Start Processing %s" % server)
    options = self.options
    try:
        self.cache = self.get_cache(server.domain, server.port,
                                    options.need_credentials,
                                    options.username, options.password)
        if (not options.dryrun):
            if (Action.GETAUIDLIST in options.action):
                success = LockssCacheAuId.load(self.cache)
                if (not success):
                    log.error('Exiting: could not load auids from cache %s' % (self.cache))
                    raise RuntimeError('could not load auids from cache')
        auids = self.collectAuIdInstances(self.cache)
        if (options.dryrun):
            return
        if (Action.PRTAUIDLIST in options.action):
            f = self.open_file(self.cache.name, Action.PRTAUIDLIST)
            # TODO get all auids for server
            if (f):
                LockssCacheAuId.printcsv(f, auids, "\t")
                f.close()
        if (Action.GETREPOSSPACE in options.action):
            self.getreposspace()
        if (Action.PRTREPOSSPACE in options.action):
            f = self.open_file(self.cache.name, Action.PRTREPOSSPACE)
            if (f):
                RepositorySpace.printcsv(f, [self.cache], "\t")
                f.close()
        # actions below need auids to operate on
        if (not auids):
            log.info("no matching auids")
            return
        doUrls = Action.GETURLLIST in options.action
        success = None
        if (Action.GETAUSUMMARY in options.action):
            self.getausummaries(auids, options.dir, doUrls, options.expire, options.noquit)
        if (Action.PRTAUSUMMARY in options.action):
            f = self.open_file(self.cache.name, Action.PRTAUSUMMARY)
            if (f):
                LockssCacheAuSummary.printcsv(f, auids, options.ausummarysort,
                                              options.ausummaryheaders, "\t")
                f.close()
        if (Action.PRTURLLIST in options.action):
            dr = self.mkdir(options.action, self.cache.name)
            if (dr):
                UrlReport.printcsv("%s/%s" % (self.options.dir, server.name),  # dir
                                   auids, options.urlsort, options.urlheaders,
                                   '\t', options.urlminversion)
        if (Action.GETCRAWLSTATUS in options.action):
            self.getcrawlstatus(auids, options.dir, options.noquit)
        if (Action.PRTCRAWLSTATUS in options.action):
            f = self.open_file(self.cache.name, Action.PRTCRAWLSTATUS)
            if (f):
                LockssCrawlStatus.printcsv(f, auids, options.crawlsort,
                                           options.ncrawllimit, options.crawlheaders, "\t")
                f.close()
        if (Action.GETCOMMPEERS in options.action):
            self.getcommpeers(options.dir, options.noquit)
        if (Action.PRTCOMMPEERS in options.action):
            f = self.open_file(self.cache.name, Action.PRTCOMMPEERS)
            if (f):
                # TODO
                LockssCacheCommPeer.printcsv(f, self.cache)
                f.close()
    except LockssError as e:
        log.error("EXCEPTION %s" % str(e))
    finally:
        log.debug2("------ Stop Processing %s" % server)
def process(self):
    log.debug2("--- Start Processing")
    for server in self.options.cachelist:
        self.process_server(server)
    log.debug2("--- Stop Processing")