def exists(self, pfn):
    """
    Checks if the requested file is known by the referred RSE.

    :param pfn: Physical file name
    :returns: True if the file exists, False if it doesn't
    :raises ServiceUnavailable: if the storage cannot be queried
    """
    dp = DataPoint(str(pfn), self.cfg)
    fileinfo = arc.FileInfo()
    status = dp.h.Stat(fileinfo)
    if not status:
        if status.GetErrno() == errno.ENOENT:
            return False
        raise ServiceUnavailable(str(status))
    return True
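The same single-file Stat pattern can be exercised outside the RSE plugin. A minimal standalone sketch, assuming the DataPoint wrapper above simply exposes an ARC data point created from a DataHandle (the check_exists helper and its command-line usage are illustrative only, not part of the original code):

import errno
import sys

import arc


def check_exists(url, usercfg):
    # Hypothetical standalone version of exists(): stat a single URL and
    # map ENOENT to False, any other error to an exception.
    handle = arc.DataHandle(arc.URL(url), usercfg)
    point = handle.__ref__()
    fileinfo = arc.FileInfo()
    status = point.Stat(fileinfo)
    if not status:
        if status.GetErrno() == errno.ENOENT:
            return False
        raise RuntimeError(str(status))
    return True


if __name__ == '__main__':
    usercfg = arc.UserConfig()
    print(check_exists(sys.argv[1], usercfg))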
def checkOutputFiles(self, surldict):
    '''
    Check if SURLs are working. Returns a dict of arcjobid:file status.
    Do bulk arc.DataPoint.Stat() with max 100 files per request.
    The list of surls passed here all belong to the same SE.
    '''

    if self.arcconf.get(['downtime', 'srmdown']) == 'True':
        self.log.info("SRM down, will validate later")
        return dict((surl['arcjobid'], self.retry) for surls in surldict.values() for surl in surls)

    result = {}
    datapointlist = arc.DataPointList()
    surllist = []
    dummylist = []
    bulklimit = 100
    for surls in surldict.values():
        count = 0
        for surl in surls:
            count += 1
            if not surl['surl']:
                self.log.error("Missing surl for %s, cannot validate" % surl['arcjobid'])
                result[surl['arcjobid']] = self.failed
                continue
            dp = aCTUtils.DataPoint(str(surl['surl']), self.uc)
            if not dp or not dp.h:
                self.log.warning("URL %s not supported, skipping validation" % str(surl['surl']))
                result[surl['arcjobid']] = self.ok
                continue
            datapointlist.append(dp.h)
            dummylist.append(dp)  # to not destroy objects
            surllist.append(surl)

            if count % bulklimit != 0 and count != len(surls):
                continue

            # do bulk call
            (files, status) = dp.h.Stat(datapointlist)
            if not status and status.GetErrno() != errno.EOPNOTSUPP:
                # If the call fails it is generally a server or connection problem
                # and in most cases should be retryable
                if status.Retryable():
                    self.log.warning("Failed to query files on %s, will retry later: %s" %
                                     (dp.h.GetURL().Host(), str(status)))
                    result.update(dict((k['arcjobid'], self.retry) for k in surllist))
                else:
                    self.log.error("Failed to query files on %s: %s" %
                                   (dp.h.GetURL().Host(), str(status)))
                    result.update(dict((k['arcjobid'], self.failed) for k in surllist))
            else:
                # files is a list of FileInfo objects. If a file is not found or has
                # another error in the listing, its FileInfo object will be invalid
                for i in range(len(datapointlist)):
                    if status.GetErrno() == errno.EOPNOTSUPP:
                        # Bulk stat was not supported, do non-bulk here
                        f = arc.FileInfo()
                        st = datapointlist[i].Stat(f)
                        if not st or not f:
                            if status.Retryable():
                                self.log.warning("Failed to query files on %s, will retry later: %s" %
                                                 (datapointlist[i].GetURL().Host(), str(st)))
                                result[surllist[i]['arcjobid']] = self.retry
                            else:
                                self.log.warning("%s: Failed to find info on %s" %
                                                 (surllist[i]['arcjobid'], datapointlist[i].GetURL().str()))
                                result[surllist[i]['arcjobid']] = self.failed
                            files.append(None)
                        else:
                            files.append(f)

                    if not files[i]:
                        self.log.warning("%s: Failed to find info on %s" %
                                         (surllist[i]['arcjobid'], datapointlist[i].GetURL().str()))
                        result[surllist[i]['arcjobid']] = self.failed
                    else:
                        # compare metadata
                        try:
                            self.log.debug("File %s for %s: expected size %d, checksum %s, actual size %d, checksum %s" %
                                           (datapointlist[i].GetURL().str(), surllist[i]['arcjobid'],
                                            int(surllist[i]['fsize']), surllist[i]['checksum'],
                                            int(files[i].GetSize()), files[i].GetCheckSum()))
                        except:
                            self.log.warning("Unhandled issue %d", i)
                            result[surllist[i]['arcjobid']] = self.failed
                            continue
                        if int(surllist[i]['fsize']) != int(files[i].GetSize()):
                            self.log.warning("File %s for %s: size on storage (%d) differs from expected size (%d)" %
                                             (datapointlist[i].GetURL().str(), surllist[i]['arcjobid'],
                                              int(files[i].GetSize()), int(surllist[i]['fsize'])))
                            result[surllist[i]['arcjobid']] = self.failed
                            continue
                        if not files[i].CheckCheckSum():
                            self.log.warning("File %s for %s: no checksum information available" %
                                             (datapointlist[i].GetURL().str(), surllist[i]['arcjobid']))
                        elif surllist[i]['checksum'] != files[i].GetCheckSum():
                            self.log.warning("File %s for %s: checksum on storage (%s) differs from expected checksum (%s)" %
                                             (datapointlist[i].GetURL().str(), surllist[i]['arcjobid'],
                                              files[i].GetCheckSum(), surllist[i]['checksum']))
                            result[surllist[i]['arcjobid']] = self.failed
                            continue

                        self.log.info("File %s validated for %s" %
                                      (datapointlist[i].GetURL().str(), surllist[i]['arcjobid']))
                        # don't overwrite a previous failed file for this job
                        if surllist[i]['arcjobid'] not in result:
                            result[surllist[i]['arcjobid']] = self.ok

            # Clear lists and go to next round
            datapointlist = arc.DataPointList()
            surllist = []
            dummylist = []

    return result
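The bulk Stat call used above can be reproduced in isolation. A minimal sketch, assuming a few URLs on the same storage endpoint passed on the command line; the DataHandle/__ref__ pattern stands in for the aCTUtils.DataPoint wrapper used in the method, and the Python-side handles list plays the role of dummylist:

import sys

import arc

usercfg = arc.UserConfig()
urls = sys.argv[1:]
if not urls:
    sys.exit("Usage: python bulk_stat.py url [url ...]")

datapointlist = arc.DataPointList()
handles = []  # keep DataHandle objects alive so the underlying DataPoints are not destroyed
for u in urls:
    handle = arc.DataHandle(arc.URL(u), usercfg)
    handles.append(handle)
    datapointlist.append(handle.__ref__())

# One bulk Stat over all data points (some protocols do not support this
# and return EOPNOTSUPP, as handled in checkOutputFiles above)
(files, status) = datapointlist[0].Stat(datapointlist)
if not status:
    sys.stdout.write("Bulk stat failed: %s\n" % str(status))
else:
    for u, f in zip(urls, files):
        if f:
            sys.stdout.write("%s: size %d\n" % (u, f.GetSize()))
        else:
            sys.stdout.write("%s: no information\n" % u)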
def fetchJobs(self, arcstate, nextarcstate):

    # Get list of jobs in the right state
    jobstofetch = self.db.getArcJobs("arcstate='" + arcstate + "' and cluster='" + self.cluster + "'" + " limit 100")

    if not jobstofetch:
        return
    self.log.info("Fetching %i jobs" % sum(len(v) for v in jobstofetch.values()))

    fetched = []
    notfetched = []
    notfetchedretry = []
    for proxyid, jobs in jobstofetch.items():
        self.uc.CredentialString(self.db.getProxy(proxyid))

        # Clean the download dir just in case something was left from previous attempt
        for job in jobs:
            shutil.rmtree(self.conf.get(['tmp', 'dir']) + job[2].JobID[job[2].JobID.rfind('/'):], True)

        # Get list of downloadable files for these jobs
        filestodl = self.db.getArcJobsInfo("arcstate='" + arcstate + "' and cluster='" + self.cluster +
                                           "' and proxyid='" + str(proxyid) + "'",
                                           ['id', 'downloadfiles'])
        # id: downloadfiles
        downloadfiles = dict((row['id'], row['downloadfiles']) for row in filestodl)
        # jobs to download all files
        jobs_downloadall = dict((j[0], j[2]) for j in jobs if j[0] in downloadfiles and not downloadfiles[j[0]])
        # jobs to download specific files
        jobs_downloadsome = dict((j[0], j[2]) for j in jobs if j[0] in downloadfiles and downloadfiles[j[0]])

        # We don't know if a failure from JobSupervisor is retryable or not
        # so always retry
        (f, r) = self.fetchAll(jobs_downloadall)
        fetched.extend(f)
        notfetchedretry.extend(r)

        (f, n, r) = self.fetchSome(jobs_downloadsome, downloadfiles)
        fetched.extend(f)
        notfetched.extend(n)
        notfetchedretry.extend(r)

    # Check for massive failure, and back off before trying again
    # TODO: downtime awareness
    if len(notfetched) > 10 and len(notfetched) == len(jobstofetch) or \
       len(notfetchedretry) > 10 and len(notfetchedretry) == len(jobstofetch):
        self.log.error("Failed to get any jobs from %s, sleeping for 5 mins" % self.cluster)
        time.sleep(300)
        return

    for proxyid, jobs in jobstofetch.items():
        for (id, appjobid, job, created) in jobs:
            if job.JobID in notfetchedretry:
                self.log.warning("%s: Could not get output from job %s" % (appjobid, job.JobID))
                # Remove download directory to allow retry
                shutil.rmtree(self.conf.get(['tmp', 'dir']) + job.JobID[job.JobID.rfind('/'):], True)
                # Check if job still exists
                fileinfo = arc.FileInfo()
                self.uc.CredentialString(self.db.getProxy(proxyid))
                dp = aCTUtils.DataPoint(job.JobID, self.uc)
                status = dp.h.Stat(fileinfo)
                # TODO Check other permanent errors
                if not status and status.GetErrno() == errno.ENOENT:
                    self.log.warning("%s: Job %s no longer exists" % (appjobid, job.JobID))
                    self.db.updateArcJob(id, {"arcstate": "donefailed",
                                              "tarcstate": self.db.getTimeStamp()})
                # Otherwise try again next time
            elif job.JobID in notfetched:
                self.log.error("%s: Failed to download job %s" % (appjobid, job.JobID))
                self.db.updateArcJob(id, {"arcstate": "donefailed",
                                          "tarcstate": self.db.getTimeStamp()})
            else:
                self.log.info("%s: Downloaded job %s" % (appjobid, job.JobID))
                self.db.updateArcJob(id, {"arcstate": nextarcstate,
                                          "tarcstate": self.db.getTimeStamp()})
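fetchJobs derives the per-job download directory from the last path component of the job ID URL before cleaning or retrying it. A small sketch of that path construction; the tmpdir value and job ID below are made up for illustration, while in fetchJobs they come from self.conf and the job record:

import shutil

# Hypothetical values; fetchJobs() reads these from configuration and the database
tmpdir = '/var/act/tmp'
jobid = 'gsiftp://ce.example.org:2811/jobs/abcdef12345'

# Download directory = <tmpdir> plus the trailing '/<job id tail>' of the job ID URL
downloaddir = tmpdir + jobid[jobid.rfind('/'):]
print(downloaddir)  # /var/act/tmp/abcdef12345

# Remove it before retrying a download, ignoring errors, as fetchJobs does
shutil.rmtree(downloaddir, True)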
import arc
import sys

if len(sys.argv) != 2:
    sys.stdout.write("Usage: python partial_copy.py filename\n")
    sys.exit(1)

desired_size = 512
usercfg = arc.UserConfig()
url = arc.URL(sys.argv[1])
handle = arc.DataHandle(url, usercfg)
point = handle.__ref__()
point.SetSecure(False)  # GridFTP servers generally do not have encrypted data channel

info = arc.FileInfo("")
point.Stat(info)
sys.stdout.write("Name: %s\n" % str(info.GetName()))
fsize = info.GetSize()
if fsize > desired_size:
    point.Range(fsize - desired_size, fsize - 1)

databuffer = arc.DataBuffer()
point.StartReading(databuffer)
while True:
    n = 0
    length = 0
    offset = 0
    (r, n, length, offset, buf) = databuffer.for_write(True)
    if not r:
        break
    sys.stdout.write("BUFFER: %d : %d : %s\n" % (offset, length, str(buf)))
    databuffer.is_written(n)
point.StopReading()