Example #1
 def load(cache, auId, doUrls, expire, trials, sleep, timeout):
     act = Action.GETAUSUMMARY
     if doUrls:
         act = act + "," + Action.GETURLLIST
     log.info('Start %s: %s expire=%s ...' % (act, auId, str(expire)))
     success = False
     try:
         for _ in range(trials):
             try:
                 if doUrls:
                     UrlReport.loadTry(cache.ui, auId, expire)
                 else:
                     LockssCacheAuSummary.__loadTry(cache.ui, auId, expire)
                 success = True
                 break
             except urllib2.HTTPError:
                 cache.reconnect(sleep, timeout)
             except ExpatError:
                 log.error("XML Parser error; could not %s %s" % (act, auId))
                 success = False  # try again
         if not success:
             log.error("exhausted trials for %s; could not load %s" % (auId, act))
     except LockssError as inst:
         log.warn("LockssException: %s" % inst)  # output is scanned for the ERROR string
         log.warn("could not digest %s for %s" % (act, auId.auId))
     finally:
         log.debug2('Stop %s: %s Success = %s ...' % (act, auId, success))
     return success
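A minimal invocation sketch, assuming this is the static `load` on `LockssCacheAuSummary` (as the mangled `__loadTry` call suggests), with a connected `cache` and a `LockssCacheAuId` instance `auId`; `expire` is compared against report age as a `timedelta`, and the retry values below are illustrative only:

 # hypothetical call: three attempts, refresh summaries older than a day
 from datetime import timedelta
 ok = LockssCacheAuSummary.load(cache, auId, doUrls=True,
                                expire=timedelta(days=1),
                                trials=3, sleep=10, timeout=30)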
Example #2
 def collectAuIdInstances(self, cache):
     if self.options.all:
         ids = cache.locksscacheauid_set.all()
     else:
         ids = LockssCacheAuId.get(self.options.auids, self.options.auidprefixlist, cache)
     log.info("#Matching AUIDS: %d" % len(ids))
     for au in ids:
         log.debug2("Matching AUIDS: %s" % au.auId)
     return ids
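`cache.locksscacheauid_set` is Django's default reverse relation from the cache row to its `LockssCacheAuId` rows; a hedged equivalent with an explicit filter, assuming the foreign key on `LockssCacheAuId` is named `cache`:

 # hypothetical equivalent of the reverse-relation lookup
 ids = LockssCacheAuId.objects.filter(cache=cache)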
Example #3
 def mkdir(self, action, server):
     dirname = "%s/%s" % (self.options.dir, server)
     if not os.path.exists(dirname):
         try:
             os.makedirs(dirname, 0777)
             log.info("created output directory %s" % dirname)
         except OSError:
             log.error("Could not create %s" % dirname)
             return None
     else:
         log.debug2("using output directory %s" % dirname)
     return dirname
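One caveat: the exists-check followed by `os.makedirs` is racy when several processes share the output tree. A common Python 2 guard is to attempt the creation and tolerate `EEXIST`, as in this sketch (hypothetical helper, not part of the original module):

 import errno
 import os

 def ensure_dir(dirname):
     # create dirname, tolerating a concurrently created directory
     try:
         os.makedirs(dirname, 0777)
     except OSError as e:
         if e.errno != errno.EEXIST:
             raise
     return dirname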
Example #4
    def loadTry(ui, auId, datafresh):
        '''
        create/update urlReport and associated urls
        (deleting existing urls)
        '''
        urlReport = None
        reportDate = datetime.utcnow().replace(tzinfo=pytz.UTC)
        try:
            # check for existing au summary
            urlReport = UrlReport.objects.get(auId=auId)
            if (reportDate - urlReport.reportDate) < datafresh:
                log.debug2("uptodate UrlReport available - not querying cache")
                return urlReport
        except UrlReport.DoesNotExist:
            pass

        log.info("get UrlReport %s" % auId)
        (reportDate, urllist) = LockssCacheAuSummary.loadTryIt(ui, auId, True)
        if not urllist:
            raise LockssError("%s on cache %s reports 0 urls" % (str(auId), auId.cache.name))

        if not urlReport:
            urlReport = UrlReport.objects.create(reportDate=reportDate, auId=auId)
        else:
            Url.objects.filter(urlReport=urlReport).delete()
        urlReport.reportDate = reportDate
        urlReport.save()
        log.debug2("urlReport %s" % urlReport)
        try:
            for url in urllist:
                # work only on urls with real content
                if u'NodeContentSize' not in url or url[u'NodeContentSize'] == '-':
                    continue
                u = {'urlReport': urlReport}
                u['name'] = url[u'NodeName']
                u['childCount'] = url[u'NodeChildCount'] if u'NodeChildCount' in url else 0
                u['treeSize'] = url[u'NodeTreeSize'] if u'NodeTreeSize' in url else 0
                u['size'] = url[u'NodeContentSize'] if u'NodeContentSize' in url else 0
                if u'NodeVersion' in url:
                    # NodeVersion is either the string '1' or a dict carrying a 'value' key
                    v = url[u'NodeVersion']
                    u['version'] = 1 if v == '1' else v['value']
                else:
                    u['version'] = 0
                lurl = Url.objects.create(**u)  # create() already saves the row
                log.debug2("Url %s " % lurl.name)
        except Exception as e:
            urlReport.delete()  # deletes dependent urls
            raise LockssError("Failed to read Url Info for %s %s\nException %s" %
                              (auId.cache.name, str(auId), str(e)))
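For reference, a hedged sketch of the per-url dictionaries this loop expects from `LockssCacheAuSummary.loadTryIt` (field names taken from the keys used above; all values are illustrative):

 # hypothetical url entry as consumed by the loop above
 url = {
     u'NodeName': u'http://example.com/file.html',
     u'NodeContentSize': u'12,345',     # '-' marks a content-less node
     u'NodeChildCount': u'3',
     u'NodeTreeSize': u'45,678',
     u'NodeVersion': {u'value': u'2'},  # or the plain string u'1'
 }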
Example #5
 def printcsv(f, auids, sort, hdrs, sep):
     if not auids:
         log.info('NOOP %s: No auids for %s' % (Action.PRTAUSUMMARY, f.name))
         return
     cache = auids[0].cache
     log.info('Start %s: Cache %s File %s ...' % (Action.PRTAUSUMMARY, cache, f.name))

     f.write(sep.join(hdrs) + "\n")
     # build query
     qu = Q()
     for auid in auids:
         qu = qu | Q(auId=auid.id)
     sums = LockssCacheAuSummary.objects.filter(qu).order_by(sort)
     for sm in sums:
         f.write(sm.csv(hdrs, sep) + "\n")
     f.close()
     log.debug2('Stop %s: Cache %s File %s ...' % (Action.PRTAUSUMMARY, cache, f.name))
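The OR-ed `Q` chain above grows with the number of auids; a hedged equivalent for large lists is Django's `__in` lookup:

 # hypothetical equivalent of the Q-object chain
 sums = LockssCacheAuSummary.objects.filter(
     auId__in=[auid.id for auid in auids]).order_by(sort)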
Example #6
    def __loadTry(ui, auId, datafresh):
        '''
        delete existing LockssCacheAuSummary and create a new summary
        by reading status info from the cache
        '''
        lockssSummary = None
        try:
            # check for existing au summary
            lockssSummary = LockssCacheAuSummary.objects.get(auId=auId)
            if (datetime.utcnow().replace(tzinfo=pytz.UTC) - lockssSummary.reportDate) < datafresh:
                log.debug2("uptodate LockssCacheAuSummary available - not querying cache")
                return lockssSummary
        except LockssCacheAuSummary.DoesNotExist:
            pass

        log.info("get LockssCacheAuSummary %s" % auId)
        LockssCacheAuSummary.loadTryIt(ui, auId, False)
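Note that the double-underscore name is private to the class: inside `LockssCacheAuSummary`, Python mangles `__loadTry` to `_LockssCacheAuSummary__loadTry`, which is why Example #1 can call it only from within the same class body. A minimal, self-contained illustration of the mangling:

 class Demo(object):
     def __hidden(self):
         return 42

 d = Demo()
 # d.__hidden() raises AttributeError; the mangled name works:
 print d._Demo__hidden()  # -> 42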
Example #7
 def load(cache, auId, trials, sleep, timeout):
     success = False
     log.debug2('Start %s: %s ...' % (Action.GETCRAWLSTATUS, auId))
     try:
         log.info('get %s: %s ...' % (Action.GETCRAWLSTATUS, auId))
         for _ in range(trials):
             try:
                 LockssCrawlStatus.__loadTry(cache.ui, auId)
                 success = True
                 break
             except urllib2.HTTPError:
                 cache.reconnect(sleep, timeout)
         if not success:
             log.error("exhausted trials for %s; could not load crawlstatus" % auId)
     except LockssError as inst:
         log.warn("LockssException: %s" % inst)  # output is scanned for the ERROR string
         log.warn("could not digest %s for %s" % (Action.GETCRAWLSTATUS, auId.auId))
     finally:
         log.debug2('Stop %s: %s Success = %s ...' % (Action.GETCRAWLSTATUS, auId, success))
     return success
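As in Example #1, a hedged invocation sketch, assuming this is the static `load` on `LockssCrawlStatus` with a connected `cache`; the retry values are illustrative:

 # hypothetical call: three attempts, reconnecting on HTTP errors
 ok = LockssCrawlStatus.load(cache, auId, trials=3, sleep=10, timeout=30)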
Example #8
    def __loadTry(ui, auId):
        '''
        delete existing LockssAuCrawlStatus and create new by reading status info from cache
        '''
        log.debug("try %s" % auId)
        st = {'auId': auId}
        status = ui.getCrawlStatus(auId.masterAuId.getLockssAu())
        reportDate = datetime.utcnow().replace(tzinfo=pytz.utc)

        if not status:
            log.debug2("No CrawlStatus Info for %s %s" % (auId.cache, auId.auId))
        else:
            for s in status:
                # work on status info
                if not s:
                    raise LockssError("CrawlStatusTable returned empty info")
                try:
                    st['reportDate'] = reportDate
                    st['type'] = s['crawl_type']
                    st['startTime'] = datetime.strptime(s['start'], utils.UI_STRFTIME)
                    st['nBytesFetched'] = s['content_bytes_fetched'].replace(",", "")
                    st['status'] = s['crawl_status']['value']
                    st['nMimeTypes'] = str(s['num_of_mime_types']['value'].replace(",", ""))
                    st['duration'] = str(s['dur'])
                    for f in LockssCrawlStatus.INTFIELDMAP:
                        val = s[LockssCrawlStatus.INTFIELDMAP[f]]
                        if isinstance(val, unicode):
                            st[f] = int(val.replace(",", ""))
                        else:
                            st[f] = str(val['value'].replace(",", ""))
                except KeyError:
                    raise LockssError("CrawlStatusTable returned faulty info for %s: %s" % (auId.auId, s))

                try:
                    # update existing crawl status
                    crawl = LockssCrawlStatus.objects.get(auId=auId, startTime=st['startTime'])
                    crawl.__dict__.update(st)
                    log.debug("LockssCrawlStatus UPD %s %s" % (crawl.startTime, str(crawl.auId)))
                except ObjectDoesNotExist:
                    # create new crawlstatus
                    crawl = LockssCrawlStatus.objects.create(**st)
                    log.debug("LockssCrawlStatus NEW %s %s" % (crawl.startTime, str(crawl.auId)))
                crawl.save()
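A caveat on `crawl.__dict__.update(st)`: writing directly to the instance dict may bypass Django's field descriptors, which is fine for plain value fields but can misbehave for foreign keys such as `auId`. A more conventional sketch of the same update, assuming `st` holds concrete field values:

 # hypothetical field-by-field update via setattr
 for field, value in st.items():
     setattr(crawl, field, value)
 crawl.save()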
Example #9
 def printcsv(f, auids, sort, limit, hdrs, sep):
     if not auids:
         log.info('NOOP %s: No auids for %s' % (Action.PRTCRAWLSTATUS, f.name))
         return
     cache = auids[0].cache
     log.info('Start %s: Cache %s File %s ...' % (Action.PRTCRAWLSTATUS, cache, f.name))
     f.write(sep.join(hdrs) + "\n")
     if limit > 0:
         # most recent crawls per auid, merged and sorted in memory
         crawls = []
         for auid in auids:
             crawls = crawls + LockssCrawlStatus.recents(auid, limit)
         crawls = sorted(crawls, key=lambda crawl: crawl.__dict__.get(sort))
     else:
         # single query over all auids, sorted by the database
         qu = Q()
         for auid in auids:
             qu = qu | Q(auId=auid.id)
         crawls = LockssCrawlStatus.objects.filter(qu).order_by(sort)

     for st in crawls:
         f.write(st.csv(hdrs, sep) + "\n")
     log.debug2('Stop %s: File %s ...' % (Action.PRTCRAWLSTATUS, f.name))
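The in-memory branch keys the sort on `crawl.__dict__.get(sort)`; for concrete model fields a roughly equivalent and slightly more idiomatic key is `operator.attrgetter`:

 from operator import attrgetter
 crawls = sorted(crawls, key=attrgetter(sort))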
Example #10
 def printcsv(folder, auids, orderby, hdrs, sep, minrev=1):
     '''
     print url reports for all given auids including urls that have a
     version at least as great as minrev, which defaults to 1
     '''
     if not auids:
         log.info('NOOP %s: No auids to print to %s' % (Action.PRTURLLIST, folder))
         return

     for auid in auids:
         try:
             if orderby == 'minversion' or orderby == 'replication':
                 urls = auid.urlreport.url_set.filter(version__gte=minrev).all()
             else:
                 urls = auid.urlreport.url_set.filter(version__gte=minrev).order_by(orderby).all()
             ext = ".csv" if sep == "," else ".tsv"
             f = open(folder + "/" + auid.auId + ext, 'w')
             if urls.count() == 0:
                 log.info("NOOP %s: file %s No Urls with version >= %s" % (Action.PRTURLLIST, f.name, minrev))
             log.info('Start %s: file %s version %s' % (Action.PRTURLLIST, f.name, minrev))
             try:
                 reportDate = auid.urlreport.reportDate
                 f.write("ReportDate\t%s\nIncluding Urls with version >= %s\n\n" % (str(reportDate), minrev))
                 f.write(sep.join(hdrs) + "\n")
                 for url in urls:
                     f.write(url.csv(hdrs, sep) + "\n")
                 log.debug2('Stop %s: file %s version %s' % (Action.PRTURLLIST, f.name, minrev))
             except IndexError:
                 log.info("NOOP %s: file %s No Urls at all" % (Action.PRTURLLIST, f.name))
             finally:
                 f.close()

         except ObjectDoesNotExist:
             log.warn('Start %s: No UrlReport for %s at %s' %
                      (Action.PRTURLLIST, auid, auid.cache.name))
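A portability note on the path building: plain '/' concatenation assumes a POSIX-style filesystem; `os.path.join` is the usual hedge:

 import os
 path = os.path.join(folder, auid.auId + ext)  # e.g. <folder>/<auid>.tsv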
Example #11
 def process_server(self, server):
     '''
     if dryrun, collect matching auids and log.info them;
     otherwise perform all requested actions
     '''
     log.info("------ Start Processing %s" % server)
     options = self.options
     try:
         self.cache = self.get_cache(server.domain, server.port,
                                     options.need_credentials, options.username,
                                     options.password)

         if not options.dryrun:
             if Action.GETAUIDLIST in options.action:
                 success = LockssCacheAuId.load(self.cache)
                 if not success:
                     log.error('Exiting: could not load auids from cache %s' % self.cache)
                     raise RuntimeError('could not load auids from cache')
         auids = self.collectAuIdInstances(self.cache)

         if options.dryrun:
             return

         if Action.PRTAUIDLIST in options.action:
             f = self.open_file(self.cache.name, Action.PRTAUIDLIST)
             # TODO get all auids for server
             if f:
                 LockssCacheAuId.printcsv(f, auids, "\t")
                 f.close()

         if Action.GETREPOSSPACE in options.action:
             self.getreposspace()

         if Action.PRTREPOSSPACE in options.action:
             f = self.open_file(self.cache.name, Action.PRTREPOSSPACE)
             if f:
                 RepositorySpace.printcsv(f, [self.cache], "\t")
                 f.close()

         # actions below need auids to operate on
         if not auids:
             log.info("no matching auids")
             return

         doUrls = Action.GETURLLIST in options.action

         if Action.GETAUSUMMARY in options.action:
             self.getausummaries(auids, options.dir, doUrls, options.expire, options.noquit)

         if Action.PRTAUSUMMARY in options.action:
             f = self.open_file(self.cache.name, Action.PRTAUSUMMARY)
             if f:
                 LockssCacheAuSummary.printcsv(f, auids, options.ausummarysort, options.ausummaryheaders, "\t")
                 f.close()

         if Action.PRTURLLIST in options.action:
             dr = self.mkdir(options.action, self.cache.name)
             if dr:
                 UrlReport.printcsv(dr, auids, options.urlsort, options.urlheaders,
                                    '\t', options.urlminversion)

         if Action.GETCRAWLSTATUS in options.action:
             self.getcrawlstatus(auids, options.dir, options.noquit)

         if Action.PRTCRAWLSTATUS in options.action:
             f = self.open_file(self.cache.name, Action.PRTCRAWLSTATUS)
             if f:
                 LockssCrawlStatus.printcsv(f, auids, options.crawlsort, options.ncrawllimit, options.crawlheaders, "\t")
                 f.close()

         if Action.GETCOMMPEERS in options.action:
             self.getcommpeers(options.dir, options.noquit)

         if Action.PRTCOMMPEERS in options.action:
             f = self.open_file(self.cache.name, Action.PRTCOMMPEERS)
             if f:
                 # TODO LockssCacheCommPeer.printcsv(f, self.cache)
                 f.close()
     except LockssError as e:
         log.error("EXCEPTION %s" % str(e))
     finally:
         log.debug2("------ Stop Processing %s" % server)
Example #12
 def process(self):
     log.debug2("--- Start Processing")
     for server in self.options.cachelist:
         self.process_server(server)
     log.debug2("--- Stop Processing")