Пример #1
0
    def submitFTS3(self, pinTime=False):
        """Submit this FTS job through the FTS3 REST API.

        One transfer is created for every file obtained by iterating over
        ``self``; they are grouped into a single FTS3 job which is submitted
        to ``self.FTSServer``.  The value returned by ``fts3.submit`` is
        stored in ``self.FTSGUID``.

        :param pinTime: used as ``copy_pin_lifetime``; any truthy value also
                        sets ``bring_online`` to 86400.  A falsy value leaves
                        both options unset (``None``).
        :returns: ``S_ERROR`` when ``self.FTSGUID`` is already set or when the
                  submission raises.  Nothing is returned explicitly after a
                  successful submission.
        """
        if self.FTSGUID:
            return S_ERROR("FTSJob already has been submitted")

        transfers = []
        for ftsFile in self:
            transfers.append(fts3.new_transfer(ftsFile.SourceSURL,
                                               ftsFile.TargetSURL,
                                               checksum=ftsFile.Checksum,
                                               filesize=ftsFile.Size))

        # Empty / unset values are sent to FTS3 as None.
        source_spacetoken = self.SourceToken or None
        dest_spacetoken = self.TargetToken or None
        copy_pin_lifetime = pinTime or None
        # bring_online is only set when a pin time was requested.
        bring_online = 86400 if pinTime else None

        job = fts3.new_job(transfers=transfers,
                           overwrite=True,
                           source_spacetoken=source_spacetoken,
                           spacetoken=dest_spacetoken,
                           bring_online=bring_online,
                           copy_pin_lifetime=copy_pin_lifetime,
                           retry=3)

        try:
            context = fts3.Context(self.FTSServer)
            self.FTSGUID = fts3.submit(context, job)
        # The "as" form is valid on Python 2.6+ and on Python 3.
        except Exception as e:
            return S_ERROR("Error at submission: %s" % e)
Пример #2
0
    def generateContext(ftsServer, ucert, lifetime=25200):
        """Create an fts3 context and explicitly delegate the proxy.

        :param ftsServer: address of the fts3 server
        :param ucert: path to the certificate to be used
        :param lifetime: duration (in seconds) of the delegation to the FTS3
                         server (the default, 25200 s, is 7 hours)

        :returns: S_OK(context) on success, S_ERROR(repr(exception)) when an
                  FTS3ClientException is raised
        """
        try:
            ftsContext = fts3.Context(endpoint=ftsServer,
                                      ucert=ucert,
                                      request_class=ftsSSLRequest,
                                      verify=False)

            # Delegate explicitly so that the lifetime is the one we ask for.
            # Notes kept from the original author:
            #  - the FTS server re-delegates only when it decides that not
            #    enough time is left (1 hour at the time of writing; a later
            #    FTS3 release is expected to use one third of the lifetime),
            #    so without a submission for more than 1h there is no valid
            #    proxy on the FTS servers anymore;
            #  - the proxy given as parameter may have less than "lifetime"
            #    left because it is cached; the FTS3Agent renews it often
            #    enough for this not to matter;
            #  - FTS3 does not check the whole chain when handling the proxy
            #    lifetime, see https://its.cern.ch/jira/browse/FTS-1575
            delegationLifetime = datetime.timedelta(seconds=lifetime)
            fts3.delegate(ftsContext, lifetime=delegationLifetime)
        except FTS3ClientException as e:
            errorText = repr(e)
            gLogger.exception("Error generating context", errorText)
            return S_ERROR(errorText)

        return S_OK(ftsContext)
Пример #3
0
    def _banStorageElement(self, storageElement):
        """Ban a storage element on every FTS3 server returned by getFTS3Servers.

        For each endpoint the proxy location is read from getProxyInfo, the
        storage element is banned with ``fts3.ban_se`` and the content of the
        server's ``ban/se`` resource is collected.

        :param storageElement: storage element passed to ``fts3.ban_se``
        :returns: S_OK({endpoint: decoded "ban/se" JSON}); the failed result of
                  getFTS3Servers or getProxyInfo if one of them is not OK;
                  S_ERROR if the proxy path cannot be extracted
        """
        endpoints = getFTS3Servers()
        # Propagate the failure instead of raising a KeyError on 'Value'.
        if not endpoints['OK']:
            return endpoints

        endpoints = endpoints['Value']

        blacklist = {}
        for endpoint in endpoints:
            # endpoint = 'https://fts3-pilot.cern.ch:8446'

            # TODO: maybe proxyPath is not needed since it is picked from the environment by the REST API
            proxyPath = getProxyInfo()
            if not proxyPath['OK']:
                return proxyPath

            try:
                proxyPath = proxyPath['Value']['path']
            except Exception as e:
                return S_ERROR(repr(e).replace(',)', ')'))

            context = fts3.Context(endpoint, proxyPath)
            timeout = 3600  # or...?
            status = 'wait'  # or...?
            allow_submit = False  # or...?

            # TODO: ban_se returns the list of jobIDs interrupted by the banning
            pausedJobIDs = fts3.ban_se(context, storageElement, status,
                                       timeout, allow_submit)
            self.log.info("fts3.ban_se: %s" % pausedJobIDs)

            blacklist[endpoint] = json.loads(context.get("ban/se"))

        return S_OK(blacklist)
Пример #4
0
def recoverFINISHEDDIRTY(s, ftsFileName):
    (status, ftsJID, fStat, fServer) = getNewStatus(s, ftsFileName)
    # This set of files has transferred successfully. Move it to DONE directory
    # print "Finished dirty for ", ftsFileName, " with ftsID", ftsJID, " status", status, "."
    context = fts3.Context(fServer)
    jobStat = fts3.get_job_status(context, ftsJID, list_files=True)
    failedFiles = []
    missFiles = []
    for fileInfo in jobStat['files']:
        if fileInfo["file_state"] == "FINISHED": continue
        reason = fileInfo["reason"]
        if "Probably stalled" in reason:
            failedFiles.append(
                (fileInfo["source_surl"], fileInfo["dest_surl"]))
        elif "globus_ftp_control_local_pasv failed" in reason:
            failedFiles.append(
                (fileInfo["source_surl"], fileInfo["dest_surl"]))
        elif "500 No such file or directory" in reason:
            print fServer, fileInfo["source_surl"], reason
            print fServer[:-1] + "9/fts3/ftsmon/#/job/" + ftsJID
            missFiles.append((fileInfo["source_surl"], fileInfo["dest_surl"]))
        else:
            print ftsFileName, fileInfo["source_surl"], fileInfo["reason"][:50]
            failedFiles.append(
                (fileInfo["source_surl"], fileInfo["dest_surl"]))
    # print failedFiles
    cleanUpTransfer(failedFiles, ftsFileName)
    writeTransfer(failedFiles, "TODO/", "D", ftsFileName)
    writeTransfer(missFiles, "DONE/Bad/", "M", ftsFileName)
Пример #5
0
    def _banStorageElement(self, storageElement):
        """Ban a storage element on every FTS3 server returned by getFTS3Servers.

        For each endpoint the proxy location is read from getProxyInfo, the
        storage element is banned with ``fts3.ban_se`` and the content of the
        server's ``ban/se`` resource is collected.

        :param storageElement: storage element passed to ``fts3.ban_se``
        :returns: S_OK({endpoint: decoded "ban/se" JSON}); the failed result of
                  getFTS3Servers if it is not OK; S_ERROR if no proxy is found
                  or its path cannot be extracted
        """
        endpoints = getFTS3Servers()
        # Propagate the failure instead of raising a KeyError on 'Value'.
        if not endpoints['OK']:
            return endpoints

        endpoints = endpoints['Value']

        blacklist = {}
        for endpoint in endpoints:
            # endpoint = 'https://fts3-pilot.cern.ch:8446'

            # TODO: maybe proxyPath is not needed since it is picked from the environment by the REST API
            proxyPath = getProxyInfo()
            if not proxyPath.get('OK'):
                return S_ERROR("Proxy not found!")

            try:
                proxyPath = proxyPath.get('Value').get('path')
            except Exception as e:
                # str(e) works on Python 2 and 3; e.message only exists on Python 2.
                return S_ERROR(str(e))

            context = fts3.Context(endpoint, proxyPath)
            timeout = 3600  # or...?
            status = 'wait'  # or...?
            allow_submit = False  # or...?

            # TODO: ban_se returns the list of jobIDs interrupted by the banning;
            # it is not used here.
            fts3.ban_se(context, storageElement, status, timeout, allow_submit)

            blacklist[endpoint] = json.loads(context.get("ban/se"))

        return S_OK(blacklist)


################################################################################
#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF
Пример #6
0
    def _unbanStorageElement(self, storageElement):
        """Unban a storage element on every FTS3 server returned by getFTS3Servers.

        For each endpoint the proxy location is read from getProxyInfo,
        ``fts3.unban_se`` is called and the content of the server's ``ban/se``
        resource is collected.

        :param storageElement: storage element passed to ``fts3.unban_se``
        :returns: S_OK({endpoint: decoded "ban/se" JSON}); the failed result of
                  getFTS3Servers if it is not OK; S_ERROR if no proxy is found
                  or its path cannot be extracted
        """
        endpoints = getFTS3Servers()
        # Propagate the failure instead of raising a KeyError on 'Value'.
        if not endpoints['OK']:
            return endpoints

        endpoints = endpoints['Value']

        blacklist = {}
        for endpoint in endpoints:
            # endpoint = 'https://fts3-pilot.cern.ch:8446'

            # TODO: maybe proxyPath is not needed since it is picked from the environment by the REST API
            proxyPath = getProxyInfo()
            if not proxyPath.get('OK'):
                return S_ERROR("Proxy not found!")

            try:
                proxyPath = proxyPath.get('Value').get('path')
            except Exception as e:
                # str(e) works on Python 2 and 3; e.message only exists on Python 2.
                return S_ERROR(str(e))

            context = fts3.Context(endpoint, proxyPath)

            fts3.unban_se(context, storageElement)

            blacklist[endpoint] = json.loads(context.get("ban/se"))

        return S_OK(blacklist)
Пример #7
0
def submit(proxy, toTrans, source, destination):
    """Submit the files of ``toTrans`` to FTS3, one job per chunk of 200.

    For every chunk produced by ``chunks(toTrans, 200)`` the source and
    destination of each LFN are obtained from ``apply_tfc_to_lfn``, the proxy
    is delegated to https://fts3.cern.ch:8446, a job is submitted and the keys
    of the job status are printed.

    :param proxy: path used both as certificate and key for the FTS3 context
    :param toTrans: collection of LFNs to transfer
    :param source: source passed to ``apply_tfc_to_lfn``
    :param destination: destination passed to ``apply_tfc_to_lfn``
    """
    for files in chunks(toTrans, 200):
        # Start from an empty list for every chunk, otherwise each job would
        # also contain the files of all the previous chunks.
        transfers = []

        c = pycurl.Curl()
        try:
            # create destination and source pfns for job
            for lfn in files:
                print(lfn)
                transfers.append(
                    fts3.new_transfer(apply_tfc_to_lfn(source, lfn, c),
                                      apply_tfc_to_lfn(destination, lfn, c)))
        finally:
            # Release the curl handle even if a lookup fails.
            c.close()

        # Submit fts job
        context = fts3.Context('https://fts3.cern.ch:8446',
                               proxy,
                               proxy,
                               verify=True)
        print(fts3.delegate(context, lifetime=timedelta(hours=48),
                            force=False))

        job = fts3.new_job(transfers)
        jobid = fts3.submit(context, job)

        # Iterating the status dictionary yields its keys on Python 2 and 3.
        for key in fts3.get_job_status(context, jobid, list_files=True):
            print(key)
Пример #8
0
def submitTheFTSJob(ftsFile):
  """Submit the transfers listed in DOING/<ftsFile> as one FTS3 job.

  The file is read into the module-level ``listOfPairs``.  When
  ``checkStatus`` is set, ``checkStatusOnCastor()`` is called (the original
  comment says it fills ``okayFiles``); otherwise every line of at least 10
  characters is split on two spaces into a (source, target) pair stored in
  ``okayFiles``.

  :param ftsFile: name of the file inside ``ceBase + "DOING/"``
  :returns: (FTS job id, server) or ("-1", "-1") when ``okayFiles`` is empty
  """
  # The server is picked at random among the three configured ones.
  fList = [ftsServ1, ftsServ2, ftsServ3]
  ftsServ = random.choice(fList)
  context = fts3.Context(ftsServ)
  # Read the whole list; the with-block closes the file once it is read.
  with open(ceBase + "DOING/" + ftsFile) as pairFile:
    listOfPairs[:] = pairFile.read().split("\n")
  transfers = []
  if checkStatus:
    checkStatusOnCastor()
  else:
    okayFiles[:] = []
    for onePair in listOfPairs:
      # Skip empty or obviously truncated lines.
      if len(onePair) < 10:
        continue
      (sourceSURL, targetSURL) = onePair.split("  ")
      okayFiles.append((sourceSURL, targetSURL))
  if not okayFiles:
    # None of the files in this lot were good!
    return "-1", "-1"
  for oneSet in okayFiles:
    transfers.append(fts3.new_transfer(oneSet[0], oneSet[1]))
  # Job options requested by Andrea Manzi.
  job = fts3.new_job(transfers=transfers, overwrite=True, verify_checksum=True, reuse=False, retry=5)
  ftsJobID = fts3.submit(context, job)
  return ftsJobID, ftsServ
Пример #9
0
    def _unbanStorageElement(self, storageElement):
        """Lift the ban of a storage element on all the FTS3 servers.

        :param storageElement: storage element passed to ``fts3.unban_se``
        :returns: S_OK({endpoint: decoded "ban/se" JSON}); the failed result of
                  getFTS3Servers or getProxyInfo if one of them is not OK;
                  S_ERROR if the proxy path cannot be extracted
        """
        serversRes = getFTS3Servers()
        if not serversRes['OK']:
            return serversRes

        banLists = {}
        for ftsServer in serversRes['Value']:
            # ftsServer looks like 'https://fts3-pilot.cern.ch:8446'

            # TODO: maybe the proxy path is not needed since it is picked from the environment by the REST API
            proxyRes = getProxyInfo()
            if not proxyRes['OK']:
                return proxyRes

            try:
                proxyFile = proxyRes['Value']['path']
            except Exception as e:
                return S_ERROR(repr(e).replace(',)', ')'))

            ftsContext = fts3.Context(ftsServer, proxyFile)
            fts3.unban_se(ftsContext, storageElement)

            # Record what the server reports as banned after the call.
            banLists[ftsServer] = json.loads(ftsContext.get("ban/se"))

        return S_OK(banLists)
Пример #10
0
    def __init__(self, config, quiet, debug, test=False):
        """Set up configuration, logging, the user queue and the FTS3 context.

        :param config: configuration object; its ``Getter`` and ``Monitor``
                       attributes are stored on the instance
        :param quiet: if true the root logger level is WARNING
        :param debug: if true the root logger level is DEBUG (checked after
                      ``quiet``, so it wins when both are set)
        :param test: if true log to the console instead of logs/monitor.txt
        """
        self.config_getter = config.Getter
        self.config = config.Monitor
        # The flag selects console logging in setRootLogger below.
        self.TEST = test

        createLogdir('Done')

        def setRootLogger(quiet, debug):
            """
            Taken from CRABServer TaskWorker
            Sets the root logger with the desired verbosity level
               The root logger logs to logs/monitor.txt and every single
               logging instruction is propagated to it (not really nice
               to read)

            :arg bool quiet: it tells if a quiet logger is needed
            :arg bool debug: it tells if needs a verbose logger
            :return logger: a logger with the appropriate logger level."""

            createLogdir('logs')

            if self.TEST:
                # if we are testing log to the console is easier
                logging.getLogger().addHandler(logging.StreamHandler())
            else:
                logHandler = MultiProcessingLog('logs/monitor.txt',
                                                when='midnight')
                logFormatter = \
                    logging.Formatter("%(asctime)s:%(levelname)s:%(module)s:%(message)s")
                logHandler.setFormatter(logFormatter)
                logging.getLogger().addHandler(logHandler)
            loglevel = logging.INFO
            if quiet:
                loglevel = logging.WARNING
            if debug:
                loglevel = logging.DEBUG
            logging.getLogger().setLevel(loglevel)
            logger = setProcessLogger("master")
            logger.debug("PID %s.", os.getpid())
            logger.debug("Logging level initialized to %s.", loglevel)
            return logger

        self.STOP = False
        self.logger = setRootLogger(quiet, debug)
        self.active_users = list()
        self.q = Queue()
        # The ops proxy is used both as certificate and as key.
        self.context = fts3.Context(self.config_getter.serverFTS,
                                    self.config_getter.opsProxy,
                                    self.config_getter.opsProxy,
                                    verify=True)
Пример #11
0
def getNewStatus(s, f, fid=""):
    """Return the current FTS state of the job associated with file ``f``.

    :param s: passed through to getStatusForJob
    :param f: name of the transfer file
    :param fid: FTS job id; when shorter than 3 characters it is looked up
                with getStatusForJob
    :returns: (job_state, job id, full status dictionary, server), or
              ("Unknown-notsubmitted", -1, 0, "-1") when the job id is "-1"
    """
    fServer = None
    if len(fid) < 3:
        (fid, fstat, fIter, fServer) = getStatusForJob(s, f)
    if fid == "-1":
        print("File  %s not submitted to FTS?" % (f,))
        return "Unknown-notsubmitted", -1, 0, "-1"
    if fServer is None:
        # The caller supplied the job id, so the lookup above was skipped and
        # the server is still unknown.  Take it from the stored record.
        # NOTE(review): assumes the record of ``f`` points at the server that
        # holds ``fid`` - confirm against the callers.
        fServer = getStatusForJob(s, f)[3]
    context = fts3.Context(fServer)
    ftsStat = fts3.get_job_status(context, fid)
    return ftsStat["job_state"], fid, ftsStat, fServer
Пример #12
0
    def monitorFTS3(self, full=False):
        """Query the FTS3 server for the status of the submitted job.

        :param full: not used in the visible part of this method
        :returns: ``S_ERROR`` if ``self.FTSGUID`` is not set or if the status
                  query raises
        """
        if not self.FTSGUID:
            return S_ERROR("FTSGUID not set, FTS job not submitted?")

        jobStatusDict = None
        try:
            context = fts3.Context(endpoint=self.FTSServer)
            jobStatusDict = fts3.get_job_status(context,
                                                self.FTSGUID,
                                                list_files=True)
        # The "as" form is valid on Python 2.6+ and on Python 3.
        except Exception as e:
            return S_ERROR("Error getting the job status %s" % e)
Пример #13
0
def getNewStatus(s, f, fid="", context=0):
  """Return the current FTS state of the job associated with file ``f``.

  :param s: passed through to getStatusForJob
  :param f: name of the transfer file
  :param fid: FTS job id; when shorter than 3 characters it is looked up
              with getStatusForJob
  :param context: fts3 context to reuse; 0 means "create one for the server"
  :returns: (job_state, job id, full status dictionary, server);
            ("Unknown-notsubmitted", -1, 0, "-1") when the job id is "-1";
            ("Unknow", -1, 0, "-1") when the status query fails
  """
  fServer = None
  if len(fid) < 3:
    (fid, fstat, fIter, fServer) = getStatusForJob(s, f)
  if fid == "-1":
    return "Unknown-notsubmitted", -1, 0, "-1"
  if fServer is None:
    # The caller supplied the job id, so the lookup above was skipped and the
    # server is still unknown.  Take it from the stored record.
    # NOTE(review): assumes the record of ``f`` points at the server that
    # holds ``fid`` - confirm against the callers.
    fServer = getStatusForJob(s, f)[3]
  if context == 0:
    context = fts3.Context(fServer)
  try:
    ftsStat = fts3.get_job_status(context, fid)
    return ftsStat["job_state"], fid, ftsStat, fServer
  except Exception:
    # Best effort: any failure of the query is reported as an unknown job.
    # NOTE(review): the state string is spelled "Unknow" - check what the
    # callers compare against before changing it.
    print("File  %s unknown to FTS?" % (f,))
    return "Unknow", -1, 0, "-1"
Пример #14
0
def fts3_delegate(fts3_endpoint='https://fts3-pilot.cern.ch:8446'):
    """Make sure a proxy exists and renew the FTS3 delegation when needed.

    A new proxy is created with ``voms_proxy_init`` when
    ``voms_proxy_expired`` says so; the function returns early if that
    creation fails.  The delegation is renewed for 12 hours (forced) when no
    valid delegation is found or when the current one ends within one hour.

    :param fts3_endpoint: address of the FTS3 server
    :returns: None
    """
    if voms_proxy_expired():
        print("INFO: creating new proxy.")
        proxy = voms_proxy_init()
        if proxy:
            print("Proxy info:")
            print("path: {}".format(proxy['path']))
            print("expiration: {}".format(proxy['expiration']))
            print("timestamp: {}".format(proxy['TS']))
        else:
            print("FATAL: proxy creation failed.")
            return
    else:
        proxy = voms_proxy_info()
        print("INFO: proxy valid, avoiding recreation.")

    fts3_context = fts3.Context(fts3_endpoint, verify=True)
    whoami = fts3.whoami(fts3_context)

    no_valid_delegation = False
    # Defaults to "now", which forces a renewal unless the check below
    # finds a later termination time.
    termination_time = datetime.utcnow()
    elapsed_threshold = timedelta(hours=1)

    try:
        delegation_ID = whoami['delegation_id']
        check_delegation_json = fts3_check_delegation(delegation_ID, proxy,
                                                      fts3_endpoint)

        if check_delegation_json:
            termination_time = datetime.strptime(
                check_delegation_json['termination_time'].replace('T', ' '),
                '%Y-%m-%d %H:%M:%S')
            print('INFO: Delegation valid until {} UTC'.format(
                termination_time.strftime('%H:%M:%S %Y-%m-%d')))
        else:
            no_valid_delegation = True
    except Exception:
        # If the check itself fails the delegation cannot be trusted.
        no_valid_delegation = True

    if no_valid_delegation:
        print("INFO: no valid delegation found")

    expires_soon = (termination_time - elapsed_threshold) < datetime.utcnow()
    if expires_soon or no_valid_delegation:
        print('INFO: Renewing delegation!')
        delegation_ID_2 = fts3.delegate(fts3_context,
                                        lifetime=timedelta(hours=12),
                                        force=True)
        print('INFO: New delegation ID = {}'.format(delegation_ID_2))
    else:
        print('INFO: Nothing to do...')
Пример #15
0
def submitTheFTSJob(ftsFile):
    """Submit the transfers listed in DOING/<ftsFile> as one FTS3 job.

    Every line of at least 10 characters is split on two spaces into a
    (source, target) pair.  ``gfal-stat`` is run on the source; when its
    error output starts with the "No such file or directory" message the line
    is appended to DONE/badFileList.txt, otherwise a transfer is created.

    :param ftsFile: name of the file inside ``ceBase + "DOING/"``
    :returns: (FTS job id, server) or ("-1", "-1") when no transfer is left
    """
    # The server is picked at random among the three configured ones.
    fList = [ftsServ1, ftsServ2, ftsServ3]
    ftsServ = random.choice(fList)
    context = fts3.Context(ftsServ)
    with open(ceBase + "DOING/" + ftsFile) as pairFile:
        filecontent = pairFile.read().split("\n")
    transfers = []
    for ftra in filecontent:
        # Skip empty or obviously truncated lines.
        if len(ftra) < 10:
            continue
        (sourceSURL, targetSURL) = ftra.split("  ")
        try:
            # An argument list (no shell) passes the SURL unchanged even when
            # it contains characters such as "?", "&" or spaces.
            runComm = subprocess.Popen(["gfal-stat", sourceSURL],
                                       stdin=subprocess.PIPE,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE,
                                       close_fds=True)
            theInfo = runComm.communicate()[1].strip()
        except OSError:
            # gfal-stat could not be started: the source cannot be checked,
            # so the file is submitted anyway.
            theInfo = ""
        if theInfo.startswith(
                "gfal-stat error: 2 (No such file or directory)"):
            with open(ceBase + "DONE/badFileList.txt", "a") as bFTS:
                bFTS.write(ftra + "\n")
        else:
            transfers.append(fts3.new_transfer(sourceSURL, targetSURL))
    if not transfers:
        return "-1", "-1"
    # retry=0 to avoid deleted files snarling up the system for hours
    job = fts3.new_job(
        transfers=transfers,
        overwrite=True,
        verify_checksum=True,
        reuse=False,
        retry=0)
    ftsJobID = fts3.submit(context,
                           job,
                           delegation_lifetime=fts3.timedelta(hours=72))
    return ftsJobID, ftsServ
Пример #16
0
    def algorithm(self):
        """Run the monitor main loop until ``self.STOP`` is set.

        - start ``self.config.max_threads_num`` daemon worker threads
        - delegate the ops proxy when the counter is 0 (i.e. every 12h),
          unless ``self.config.TEST`` is set
        - look into the Monitor user folders: a user with jobs that is not
          active yet is queued, a user without jobs is removed from the
          active list

        :return: None
        """
        # TODO: monitor is probably better with multiproc
        workers = list()
        for i in range(self.config.max_threads_num):
            worker = Thread(target=self.worker, args=(i, self.q))
            # The attribute replaces the deprecated setDaemon() call.
            worker.daemon = True
            worker.start()
            workers.append(worker)

        count = 0
        while not self.STOP:
            if count == 0 and not self.config.TEST:
                self.context = fts3.Context(self.config_getter.serverFTS,
                                            self.config_getter.opsProxy,
                                            self.config_getter.opsProxy,
                                            verify=True)
                self.logger.debug(
                    fts3.delegate(self.context,
                                  lifetime=timedelta(hours=48),
                                  force=False))

            for user in os.listdir('Monitor'):
                jobs = os.listdir('Monitor/' + user)
                is_active = user in self.active_users
                if jobs and not is_active:
                    self.active_users.append(user)
                    self.q.put(user)
                elif not jobs and is_active:
                    self.active_users.remove(user)

            # One iteration lasts about 10 s, so 6 * 60 * 12 iterations
            # are 12 hours.
            if count < 6 * 60 * 12:  # delegate every 12h
                count += 1
            else:
                count = 0

            self.logger.info('%s active users' % len(self.active_users))
            self.logger.debug('Active users are: %s' % self.active_users)
            self.logger.debug('Queue lenght: %s' % self.q.qsize())
            time.sleep(10)

        for w in workers:
            w.join()

        self.logger.info('Monitor stopped.')
Пример #17
0
    def submitFTS3(self, pinTime=False):
        """Submit this FTS job through the FTS3 REST API.

        A transfer is built for each file obtained by iterating over ``self``
        (the checksum is sent as ``ADLER32:<Checksum>``).  On success the job
        and all its files get the new GUID and the status "Submitted".

        :param pinTime: used as ``copy_pin_lifetime``; any truthy value also
                        sets ``bring_online`` to 86400
        :returns: S_OK() on success; S_ERROR when the job already has a GUID
                  or when the submission raises
        """
        if self.FTSGUID:
            return S_ERROR("FTSJob already has been submitted")

        transfers = [
            fts3.new_transfer(oneFile.SourceSURL,
                              oneFile.TargetSURL,
                              checksum='ADLER32:%s' % oneFile.Checksum,
                              filesize=oneFile.Size)
            for oneFile in self
        ]

        # Unset tokens and a missing pin time are passed to FTS3 as None.
        job = fts3.new_job(transfers=transfers,
                           overwrite=True,
                           source_spacetoken=self.SourceToken or None,
                           spacetoken=self.TargetToken or None,
                           bring_online=86400 if pinTime else None,
                           copy_pin_lifetime=pinTime or None,
                           retry=3)

        try:
            # The context is created once and then reused.
            if not self._fts3context:
                self._fts3context = fts3.Context(endpoint=self.FTSServer,
                                                 request_class=ftsSSLRequest,
                                                 verify=False)
            self.FTSGUID = fts3.submit(self._fts3context, job)
        except Exception as e:
            return S_ERROR("Error at submission: %s" % e)

        self.Status = "Submitted"
        loggerName = "req_%s/FTSJob-%s" % (self.RequestID, self.FTSGUID)
        self._log = gLogger.getSubLogger(loggerName, True)
        for oneFile in self:
            oneFile.FTSGUID = self.FTSGUID
            oneFile.Status = "Submitted"
        return S_OK()
Пример #18
0
def recoverFINISHEDDIRTY(s, ftsFileName):
    (status, ftsJID, fStat, fServer) = getNewStatus(s, ftsFileName)
    # This set of files has transferred successfully. Move it to DONE directory
    print "Finished dirty for ", ftsFileName, " with ftsID", ftsJID, " status", status, "."

    context = fts3.Context(fServer)
    jobStat = fts3.get_job_status(context, ftsJID, list_files=True)
    failedFiles = []
    for fileInfo in jobStat['files']:
        if fileInfo["file_state"] == "FINISHED": continue
        failedFiles.append((fileInfo["source_surl"], fileInfo["dest_surl"]))
    # print failedFiles
    for fF in failedFiles:
        print fF

    cleanUpTransfer(failedFiles, ftsFileName)
    retryFailedTransfer(failedFiles, ftsFileName)
Пример #19
0
    def generateContext(ftsServer, ucert):
        """Create an fts3 context for the given server and certificate.

        :param ftsServer: address of the fts3 server
        :param ucert: path to the certificate to be used

        :returns: S_OK(context) on success, S_ERROR(repr(exception)) when an
                  FTS3ClientException is raised
        """
        try:
            ftsContext = fts3.Context(endpoint=ftsServer,
                                      ucert=ucert,
                                      request_class=ftsSSLRequest,
                                      verify=False)
        except FTS3ClientException as e:
            errorText = repr(e)
            gLogger.exception("Error generating context", errorText)
            return S_ERROR(errorText)

        return S_OK(ftsContext)
Пример #20
0
    def _do_ftscall(self, binding=None, url=None):
        """Call the FTS server, retrying up to 10 times with a growing delay.

        :param binding: optional (method name, args, kwargs) triple; the
                        method is looked up on the ``fts3`` module and called
                        with the context as first argument
        :param url: resource fetched with ``context.get`` and decoded as JSON
                    when ``binding`` is None
        :returns: the result of the fts3 call, or the decoded JSON
        :raises RuntimeError: when all the attempts failed
        """
        if self._context is None:
            # request_class = Request -> use "requests"-based https call (instead of default PyCURL,
            # which may not be able to handle proxy certificates depending on the cURL installation)
            # verify = False -> do not verify the server certificate
            context = fts3.Context(self.server_url,
                                   ucert=self.x509proxy,
                                   ukey=self.x509proxy,
                                   request_class=Request,
                                   verify=False)

            if self.keep_context:
                self._context = context
        else:
            context = self._context

        if binding is not None:
            reqstring = binding[0]
        else:
            reqstring = url

        LOG.debug('FTS: %s', reqstring)

        max_attempts = 10
        wait_time = 1.
        # range() exists on Python 2 and 3 (xrange does not).
        for attempt in range(max_attempts):
            try:
                if binding is not None:
                    method, args, kwd = binding
                    return getattr(fts3, method)(context, *args, **kwd)
                else:
                    return json.loads(context.get(url))
            except fts_exceptions.ServerError as exc:
                # NOTE(review): every ServerError is retried, not only the
                # 500 ones - the check below has no effect.  Confirm whether
                # other codes should be propagated instead.
                if str(exc.reason) == '500':
                    # Internal server error - let's try again
                    pass
            except fts_exceptions.TryAgain:
                pass

            # No point in waiting after the last attempt: we give up anyway.
            if attempt < max_attempts - 1:
                time.sleep(wait_time)
                wait_time *= 1.5

        LOG.error('Failed to communicate with FTS server: %s', reqstring)
        raise RuntimeError('Failed to communicate with FTS server: %s' %
                           reqstring)
Пример #21
0
def recoverFINISHEDDIRTY(s, ftsFileName):
  (status, ftsJID, fStat, fServer) = getNewStatus(s, ftsFileName)
  if status == "Unknown":
    # Try again ...
    shutil.move(ceBase + "DONE/Dirty/" + ftsFileName, ceBase + "TODO/" + ftsFileName)
    return
  # This set of files has transferred successfully. Move it to DONE directory
  print "Finished dirty for ", ftsFileName, " with ftsID", ftsJID, " status", status, "."
  if ftsJID == -1:
    print "Probably in old sqlite dB. Could not check - retry"
    shutil.move(ceBase + "DONE/Dirty/" + ftsFileName, ceBase + "TODO/" + ftsFileName)
    return
  context = fts3.Context(fServer)
  jobStat = fts3.get_job_status(context, ftsJID, list_files=True)
  failedFiles = []
  for fileInfo in jobStat['files']:
    if fileInfo["file_state"] == "FINISHED": continue
    failedFiles.append((fileInfo["source_surl"], fileInfo["dest_surl"]))
  # print failedFiles
  if len(failedFiles) < 1: return
  cleanUpTransfer(failedFiles, ftsFileName)
  retryFailedTransfer(failedFiles, ftsFileName)
Пример #22
0
    def _banStorageElement(self, storageElement):
        """Ban a storage element on all the FTS3 servers.

        :param storageElement: storage element passed to ``fts3.ban_se``
        :returns: S_OK({endpoint: decoded "ban/se" JSON}); the failed result of
                  getFTS3Servers or getProxyInfo if one of them is not OK;
                  S_ERROR if the proxy path cannot be extracted
        """
        serversRes = getFTS3Servers()
        if not serversRes['OK']:
            return serversRes

        banLists = {}
        for ftsServer in serversRes['Value']:
            # ftsServer looks like 'https://fts3-pilot.cern.ch:8446'

            # TODO: maybe the proxy path is not needed since it is picked from the environment by the REST API
            proxyRes = getProxyInfo()
            if not proxyRes['OK']:
                return proxyRes

            try:
                proxyFile = proxyRes['Value']['path']
            except Exception as e:
                return S_ERROR(repr(e).replace(',)', ')'))

            ftsContext = fts3.Context(ftsServer, proxyFile)

            # 'wait' leaves the jobs queued. The only alternative is "cancel"
            pausedJobIDs = fts3.ban_se(ftsContext,
                                       storageElement,
                                       'wait',
                                       timeout=3600,
                                       allow_submit=False)
            pausedText = ','.join(pausedJobIDs)
            self.log.info("fts3.ban_se: paused jobs: %s" % pausedText)

            # Record what the server reports as banned after the call.
            banLists[ftsServer] = json.loads(ftsContext.get("ban/se"))

        return S_OK(banLists)
Пример #23
0
    def monitor(self, context=None, ftsServer=None, ucert=None):
        """ Queries the fts server to monitor the job

        This method assumes that the attribute self.ftsGUID is set.  It
        updates ``lastMonitor``, ``status``/``lastUpdate``/``error`` (when
        the job state changed) and ``completeness`` (integer percentage of
        files in one of ``FTS3File.FTS_FINAL_STATES``).

        :param context: fts3 context. If not given, it is created (see ftsServer & ucert param)
        :param ftsServer: the address of the fts server to submit to. Used only if context is
                          not given. if not given either, use the ftsServer object attribute

        :param ucert: path to the user certificate/proxy. Might be inferred by the fts cli (see its doc)

        :returns: S_OK({FileID: {'status': ..., 'error': ...}}), or S_ERROR
                  when the GUID is not set or the status query fails
        """

        if not self.ftsGUID:
            return S_ERROR("FTSGUID not set, FTS job not submitted?")

        if not context:
            if not ftsServer:
                ftsServer = self.ftsServer
            context = fts3.Context(endpoint=ftsServer,
                                   ucert=ucert,
                                   request_class=ftsSSLRequest,
                                   verify=False)

        jobStatusDict = None
        try:
            jobStatusDict = fts3.get_job_status(context,
                                                self.ftsGUID,
                                                list_files=True)
        except FTS3ClientException as e:
            return S_ERROR("Error getting the job status %s" % e)

        now = datetime.datetime.utcnow().replace(microsecond=0)
        self.lastMonitor = now

        newStatus = jobStatusDict['job_state'].capitalize()
        if newStatus != self.status:
            self.status = newStatus
            self.lastUpdate = now
            self.error = jobStatusDict['reason']

        if newStatus in self.FINAL_STATES:
            self._fillAccountingDict(jobStatusDict)

        filesInfoList = jobStatusDict['files']
        filesStatus = {}
        statusSummary = {}

        for fileDict in filesInfoList:
            file_state = fileDict['file_state'].capitalize()
            # The file identifier is read from the transfer metadata.
            file_id = fileDict['file_metadata']
            file_error = fileDict['reason']
            filesStatus[file_id] = {'status': file_state, 'error': file_error}

            statusSummary[file_state] = statusSummary.get(file_state, 0) + 1

        total = len(filesInfoList)
        completed = sum(
            statusSummary.get(state, 0) for state in FTS3File.FTS_FINAL_STATES)
        # Without any file there is nothing to compute (and the division
        # would fail), so the previous completeness is left untouched.
        if total:
            # Floor division keeps an integer percentage on Python 2 and 3.
            self.completeness = 100 * completed // total

        return S_OK(filesStatus)
def main():
    """Run the FTS datalake tests described by a JSON configuration file.

    Command line:
      * ``-i``: path to the JSON configuration file (required)
      * ``--cleanup``: passed on to ``_gfal_setup_folders``
      * ``--exit``: stop right after the folder setup, without testing

    The configuration must provide the keys ``protocols``, ``num_of_files``,
    ``filesizes``, ``num_of_jobs``, ``testing_folder``, ``checksum``,
    ``overwrite`` and ``metadata``.

    For every job round, protocol and ordered (source, destination) endpoint
    pair, one FTS job is submitted per filesize / number-of-files combination.
    If the source does not already hold suitable files, random files are
    generated locally, uploaded to the source and then removed locally.
    Finally the function waits for all the submitted jobs.
    """
    # endpoint for which checksum verification is disabled
    # (ad-hoc temporary solution for lapp-webdav)
    no_checksum_endpoint = "lapp-esc02.in2p3.fr:8001/webdav"

    parser = argparse.ArgumentParser(description="Run FTS Datalake Tests")

    parser.add_argument("-i",
                        required=True,
                        dest="conf_file",
                        help="Configuration file")
    parser.add_argument("--cleanup",
                        required=False,
                        action='store_true',
                        default=False,
                        help="Clean up src/dst directories")
    parser.add_argument("--exit",
                        required=False,
                        action='store_true',
                        default=False,
                        help="Exit after cleanup")

    arg = parser.parse_args()
    conf_file = str(arg.conf_file)
    cleanup = arg.cleanup
    exit_after_setup = arg.exit

    # open configuration file to get test details; the file is only needed
    # while parsing, so it is closed before the (long) test run starts
    with open(conf_file) as json_file:
        data = json.load(json_file)

    # assign json variables
    protocol_map = data['protocols']
    num_of_files_list = data['num_of_files']
    filesize_list = data['filesizes']
    num_of_jobs = data['num_of_jobs']
    testing_folder = data['testing_folder']
    checksum = data["checksum"]
    overwrite = data["overwrite"]
    metadata = data['metadata']

    # figure out the unique endpoints from the configuration: two entries
    # are the same endpoint when host and path match, whatever the port
    endpoints = []
    seen_endpoints = set()
    for protocol in protocol_map:
        for endpoint in protocol_map[protocol]:
            # example: endpoint = door05.pic.es:8452//rucio/pic_dcache
            endpoint_parts = endpoint.split(":", 1)
            # example: endpoint_host = door05.pic.es
            endpoint_host = endpoint_parts[0]
            # example: endpoint_path = //rucio/pic_dcache
            endpoint_path = re.split('[0-9]*', endpoint_parts[1], 1)[1]
            endpoint_key = endpoint_host + endpoint_path
            if endpoint_key not in seen_endpoints:
                seen_endpoints.add(endpoint_key)
                endpoints.append("{}://{}".format(protocol, endpoint))

    # setup folders at the testing endpoints if needed
    _flush_logging_msg("Setting up folders at endpoints")
    prob_endpoints = _gfal_setup_folders(endpoints, testing_folder, cleanup)

    # we have some problematic endpoints
    if prob_endpoints:
        _flush_logging_msg(
            "Problematic endpoints (will not be tested): {})".format(
                prob_endpoints))

    # the script is used as a setup script so do not perform testing
    if exit_after_setup:
        # NOTE(review): exit status 1 is kept as-is for backward
        # compatibility, although this looks like a normal termination
        sys.exit(1)

    # ----------------------------------------------------------------------

    # authenticate @ FTS endpoint
    # https://gitlab.cern.ch/fts/fts-rest/-/blob/develop/src/fts3/rest/client/context.py#L148
    _flush_logging_msg('Authenticating at {}'.format(FTS_ENDPOINT))
    context = fts3.Context(FTS_ENDPOINT, verify=True)

    # list that holds a dictionary per each job
    # this is later used to poll for the jobs until they finish
    job_map_list = []

    # for every job
    for _ in xrange(num_of_jobs):
        # for every protocol to be checked
        for protocol in protocol_map:
            # get endpoints
            protocol_endpoints = protocol_map[protocol]
            # for every ordered (source, destination) pair
            for endpnt_pair in itertools.permutations(protocol_endpoints, 2):
                source_endpoint = endpnt_pair[0]
                dest_endpoint = endpnt_pair[1]

                # The checksum override applies to this pair only: the
                # configured value must not be overwritten, otherwise all the
                # following pairs would silently run without checksum too
                if no_checksum_endpoint in endpnt_pair:
                    pair_checksum = "none"
                else:
                    pair_checksum = checksum

                abort_source = False
                source_url = "{}://{}".format(protocol, source_endpoint)
                dest_url = "{}://{}".format(protocol, dest_endpoint)

                # if the source endpoint is faulty, abort this run
                # NOTE(review): prob_endpoints is compared against the bare
                # endpoint (no protocol prefix) - confirm that this matches
                # what _gfal_setup_folders returns
                if source_endpoint in prob_endpoints:
                    _flush_logging_msg("Aborting run for source: {}".format(
                        source_endpoint))
                    continue

                _flush_logging_msg(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
                _flush_logging_msg("Source: {}".format(source_url))
                _flush_logging_msg("Destination: {}".format(dest_url))

                # for every filesize combination
                for filesize in filesize_list:
                    if abort_source:
                        _flush_logging_msg(
                            "Aborting run for source: {}".format(source_url))
                        break

                    # for every files per job combination
                    for numfile in num_of_files_list:

                        # configure destination filenames
                        local_file_paths = []
                        dest_filenames = []
                        for _nfile in xrange(numfile):
                            random_suffix = str(uuid.uuid1())
                            random_filename = "{}.{}".format(
                                FILE_PREFIX, random_suffix)
                            dest_filenames.append(random_filename)
                            local_file_paths.append(
                                str(os.path.join(LOCALPATH_TEMP_DIR,
                                                 random_filename)))

                        source_dir = os.path.join(source_url, testing_folder,
                                                  "src")

                        # check if source has adequate number of files of
                        # the desired filesize
                        _flush_logging_msg(
                            "Checking source for {} existing {}MB files".
                            format(numfile, filesize))
                        src_filenames = _gfal_check_files(
                            source_dir, filesize, numfile)

                        if src_filenames == -1:
                            abort_source = True
                            _flush_logging_msg(
                                "Aborting run for source: {}".format(
                                    source_url))
                            break

                        # nothing usable at the source: the files have to be
                        # generated locally and uploaded first
                        generated_locally = not src_filenames
                        upload_failed = False
                        if generated_locally:
                            for filename in dest_filenames:
                                src_filenames.append("{}_{}mb".format(
                                    filename, filesize))

                            # generate random files locally
                            _flush_logging_msg(
                                "Locally generating {} random files of size:{}MB"
                                .format(numfile, filesize))
                            for file_path in local_file_paths:
                                with open(file_path, 'wb') as fout:
                                    fout.write(os.urandom(filesize * MB))

                            # upload files to the source for this job
                            _flush_logging_msg("Uploading files to source")
                            rcode = _gfal_upload_files(
                                local_file_paths, source_dir, src_filenames)
                            upload_failed = rcode == -1

                        if not upload_failed:
                            # submit fts transfer
                            _flush_logging_msg('Submitting FTS job')
                            job_id = _fts_submit_job(source_url, dest_url,
                                                     src_filenames,
                                                     dest_filenames,
                                                     pair_checksum, overwrite,
                                                     testing_folder, context,
                                                     metadata)
                            if job_id == -1:
                                _flush_logging_msg('Job aborted')
                            else:
                                _flush_logging_msg(
                                    'FTS job id:{}'.format(job_id))
                                job_map_list.append({
                                    'job_id': job_id,
                                    'directory': os.path.join(
                                        dest_url, testing_folder, "dest"),
                                    'files_to_purge': dest_filenames,
                                })

                        # The local copies are removed whatever happened to
                        # the upload or the submission, so that failed runs
                        # do not pile up files in the temporary directory
                        if generated_locally:
                            _flush_logging_msg(
                                "Removing files from LOCALPATH: {}".format(
                                    LOCALPATH_TEMP_DIR))
                            for local_path in local_file_paths:
                                os.remove(local_path)

                        if upload_failed:
                            abort_source = True
                            _flush_logging_msg(
                                "Aborting run for source: {}".format(
                                    source_url))
                            break

    _flush_logging_msg(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
    _fts_wait_jobs(context, job_map_list)
    _flush_logging_msg("Testing DONE, program is going to exit now!")
Пример #25
0
    def submit(self,
               context=None,
               ftsServer=None,
               ucert=None,
               pinTime=36000,
               protocols=None):
        """ Submit the job to the FTS server

        Some attributes are expected to be defined for the submission to work:
          * type (set by FTS3Operation)
          * sourceSE (only for Transfer jobs)
          * targetSE
          * activity (optional)
          * priority (optional)
          * username
          * userGroup
          * filesToSubmit
          * operationID (optional, used as metadata for the job)

        We also expect the FTSFiles have an ID defined, as it is given as transfer metadata

        :param context: fts3 context. If not given, it is created (see ftsServer & ucert param)
        :param ftsServer: the address of the fts server to submit to. Used only if context is
                          not given. if not given either, use the ftsServer object attribute
        :param ucert: path to the user certificate/proxy. Might be inferred by the fts cli (see its doc)
        :param pinTime: Time the file should be pinned on disk (used for transfers and staging)
                        Used only if the source SE is a tape storage
        :param protocols: list of protocols from which we should choose the protocol to use

        :returns: S_OK([FTSFiles ids of files submitted]), or S_ERROR if the space token
                  cannot be fetched, the job type is not supported, the job cannot be
                  constructed or the submission fails
        """

        log = gLogger.getSubLogger(
            "submit/%s/%s_%s" %
            (self.operationID, self.sourceSE, self.targetSE), True)

        if not context:
            if not ftsServer:
                ftsServer = self.ftsServer
            context = fts3.Context(endpoint=ftsServer,
                                   ucert=ucert,
                                   request_class=ftsSSLRequest,
                                   verify=False)

        # Fetch the space token of the target SE
        res = self.__fetchSpaceToken(self.targetSE)
        if not res['OK']:
            return res
        target_spacetoken = res['Value']

        allLFNs = [ftsFile.lfn for ftsFile in self.filesToSubmit]

        if self.type == 'Transfer':
            res = self._constructTransferJob(pinTime,
                                             allLFNs,
                                             target_spacetoken,
                                             protocols=protocols)
        elif self.type == 'Staging':
            res = self._constructStagingJob(pinTime, allLFNs,
                                            target_spacetoken)
        # elif self.type == 'Removal':
        #   res = self._constructRemovalJob(context, allLFNs, failedLFNs, target_spacetoken)
        else:
            # Without this guard, `res` would still hold the space token result
            # and would wrongly be unpacked below as (job, fileIDs)
            return S_ERROR("Cannot submit a job of unknown type %s" % self.type)

        if not res['OK']:
            return res

        job, fileIDsInTheJob = res['Value']
        setFileIdsInTheJob = set(fileIDsInTheJob)

        try:
            self.ftsGUID = fts3.submit(context, job)
            log.info("Got GUID %s" % self.ftsGUID)

            # The attempt counter is increased for all the files we tried to
            # submit, but only those which are really part of the job are
            # flagged as Submitted
            for ftsFile in self.filesToSubmit:
                ftsFile.attempt += 1
                if ftsFile.fileID in setFileIdsInTheJob:
                    ftsFile.status = 'Submitted'

            now = datetime.datetime.utcnow().replace(microsecond=0)
            self.submitTime = now
            self.lastUpdate = now
            self.lastMonitor = now

        except FTS3ClientException as e:
            log.exception("Error at submission", repr(e))
            return S_ERROR("Error at submission: %s" % e)

        return S_OK(fileIDsInTheJob)
Пример #26
0
    def monitor(self, context=None, ftsServer=None, ucert=None):
        """ Queries the fts server to monitor the job.
        The internal state of the object is updated depending on the
        monitoring result.

        In case the job is not found on the server, the status is set to 'Failed'

        Within a job, only the transfers having a `fileID` metadata are considered.
        This is to allow for multihop jobs doing a staging

        This method assumes that the attribute self.ftsGUID is set

        :param context: fts3 context. If not given, it is created (see ftsServer & ucert param)
        :param ftsServer: the address of the fts server to submit to. Used only if context is
                          not given. if not given either, use the ftsServer object attribute

        :param ucert: path to the user certificate/proxy. Might be inferred by the fts cli (see its doc)

        :returns: {FileID: { status, error } }
                  (with an extra `ftsGUID: None` entry for the files in a final FTS state)

                  Possible error numbers

                  * errno.ESRCH: If the job does not exist on the server
                  * errno.EDEADLK: In case the job and file status are inconsistent (see comments inside the code)

        The `completeness` attribute is set to the percentage of considered transfers
        that are in a final state, and to 0 if there is no transfer to consider.
        """

        if not self.ftsGUID:
            return S_ERROR("FTSGUID not set, FTS job not submitted?")

        if not context:
            if not ftsServer:
                ftsServer = self.ftsServer
            context = fts3.Context(endpoint=ftsServer,
                                   ucert=ucert,
                                   request_class=ftsSSLRequest,
                                   verify=False)

        jobStatusDict = None
        try:
            jobStatusDict = fts3.get_job_status(context,
                                                self.ftsGUID,
                                                list_files=True)
        # The job is not found
        # Set its status to Failed and return
        except NotFound:
            self.status = 'Failed'
            return S_ERROR(
                errno.ESRCH,
                "FTSGUID %s not found on %s" % (self.ftsGUID, self.ftsServer))
        except FTS3ClientException as e:
            return S_ERROR("Error getting the job status %s" % e)

        now = datetime.datetime.utcnow().replace(microsecond=0)
        self.lastMonitor = now

        newStatus = jobStatusDict['job_state'].capitalize()
        if newStatus != self.status:
            self.status = newStatus
            self.lastUpdate = now
            self.error = jobStatusDict['reason']

        # First accounting pass, done on the full list of transfers.
        # It is redone below once the transfers without fileID are filtered out
        if newStatus in self.FINAL_STATES:
            self._fillAccountingDict(jobStatusDict)

        filesInfoList = jobStatusDict['files']
        filesStatus = {}
        statusSummary = {}

        # Make a copy, since we are potentially
        # deleting objects
        for fileDict in list(filesInfoList):
            file_state = fileDict['file_state'].capitalize()
            file_metadata = fileDict['file_metadata']

            # previous version of the code did not have dictionary as
            # file_metadata
            if isinstance(file_metadata, dict):
                file_id = file_metadata.get('fileID')
            else:
                file_id = file_metadata

            # The transfer does not have a fileID attached to it
            # so it does not correspond to a file in our DB: skip it
            # (typical of jobs with different staging protocol == CTA)
            # We also remove it from the fileInfoList, such that it is
            # not considered for accounting
            if not file_id:
                filesInfoList.remove(fileDict)
                continue

            file_error = fileDict['reason']
            filesStatus[file_id] = {'status': file_state, 'error': file_error}

            # If the state of the file is final for FTS, set ftsGUID of the file to None,
            # such that it is "released" from this job and not updated anymore in future
            # monitoring calls
            if file_state in FTS3File.FTS_FINAL_STATES:
                filesStatus[file_id]['ftsGUID'] = None

            # If the file is not in a final state, but the job is, we return an error
            # FTS can have inconsistencies where the FTS Job is in a final state
            # but not all the files.
            # The inconsistencies are cleaned every hour on the FTS side.
            # https://its.cern.ch/jira/browse/FTS-1482
            elif self.status in self.FINAL_STATES:
                return S_ERROR(
                    errno.EDEADLK,
                    "Job %s in a final state (%s) while File %s is not (%s)" %
                    (self.ftsGUID, self.status, file_id, file_state))

            statusSummary[file_state] = statusSummary.get(file_state, 0) + 1

        # We've removed all the intermediate transfers that we are not interested in
        # so we put this back into the monitoring data such that the accounting is done properly
        jobStatusDict['files'] = filesInfoList
        if newStatus in self.FINAL_STATES:
            self._fillAccountingDict(jobStatusDict)

        total = len(filesInfoList)
        completed = sum(
            statusSummary.get(state, 0)
            for state in FTS3File.FTS_FINAL_STATES)
        # No transfer left to consider (empty job, or no transfer carrying
        # a fileID): avoid the division by zero and report no progress
        if total:
            self.completeness = int(100 * completed / total)
        else:
            self.completeness = 0

        return S_OK(filesStatus)
Пример #27
0
    def monitorFTS3(self, full=False):
        """ Monitor the job using the FTS3 rest API

        Updates the `Status` and `Completeness` attributes from the job status
        returned by the server. `Completeness` is set to 0 if the server
        reports no file for the job.

        :param full: if False, return right after the summary is computed.
                     If True, also update Status, Error and _duration of each
                     FTSFile of this job matching a (source SURL, target SURL)
                     pair reported by the server

        :returns: S_ERROR if the FTSGUID is not set or the status cannot be fetched,
                  S_OK({file state: number of files}) if not `full`,
                  otherwise the result of self.finalize() when the job is in
                  one of FTSJob.FINALSTATES, else S_OK()
        """
        if not self.FTSGUID:
            return S_ERROR("FTSGUID not set, FTS job not submitted?")

        jobStatusDict = None
        try:
            # the context is created once and then reused for the next calls
            if not self._fts3context:
                self._fts3context = fts3.Context(endpoint=self.FTSServer,
                                                 request_class=ftsSSLRequest,
                                                 verify=False)
            context = self._fts3context
            jobStatusDict = fts3.get_job_status(context,
                                                self.FTSGUID,
                                                list_files=True)
        except Exception as e:
            return S_ERROR("Error getting the job status %s" % e)

        self.Status = jobStatusDict['job_state'].capitalize()

        filesInfoList = jobStatusDict['files']
        statusSummary = {}
        for fileDict in filesInfoList:
            file_state = fileDict['file_state'].capitalize()
            statusSummary[file_state] = statusSummary.get(file_state, 0) + 1

        total = len(filesInfoList)
        completed = sum(
            statusSummary.get(state, 0) for state in FTSFile.FINAL_STATES)
        # A job without any file must not make the monitoring crash
        # with a division by zero
        if total:
            self.Completeness = 100 * completed / total
        else:
            self.Completeness = 0

        if not full:
            return S_OK(statusSummary)

        # the list of all our FTSFiles is logged only once per call
        ftsFilesPrinted = False
        for fileDict in filesInfoList:
            sourceURL = fileDict['source_surl']
            targetURL = fileDict['dest_surl']
            fileStatus = fileDict['file_state'].capitalize()
            reason = fileDict['reason']
            duration = fileDict['tx_duration']

            # first FTSFile of this job with the same source and target SURLs
            candidateFile = None
            for ftsFile in self:
                if ftsFile.SourceSURL == sourceURL and ftsFile.TargetSURL == targetURL:
                    candidateFile = ftsFile
                    break

            if candidateFile is None:
                self._log.warn(
                    'FTSFile not found',
                    'Source: %s, Target: %s' % (sourceURL, targetURL))
                if not ftsFilesPrinted:
                    ftsFilesPrinted = True
                    if not len(self):
                        self._log.warn('Monitored FTS job is empty!')
                    else:
                        self._log.warn(
                            'All FTS files are:', '\n' + '\n'.join([
                                'Source: %s, Target: %s' %
                                (ftsFile.SourceSURL, ftsFile.TargetSURL)
                                for ftsFile in self
                            ]))
                continue

            candidateFile.Status = fileStatus
            candidateFile.Error = reason
            candidateFile._duration = duration

            # failures whose reason matches one of the known patterns
            # are flagged as MissingSource
            if candidateFile.Status == "Failed":
                for missingSource in self.missingSourceErrors:
                    if missingSource.match(reason):
                        candidateFile.Error = "MissingSource"

        # # register successful files
        if self.Status in FTSJob.FINALSTATES:
            return self.finalize()
        return S_OK()
Пример #28
0
        return "-1", "-1"


# Resubmission script: every job list file found under DOING/RALSpecific/ is
# moved back to DOING/, submitted again through submitTheFTSJob, and the new
# FTS job id is stored on the matching `ftsjob` record.
# NOTE(review): `sess` is presumably an SQLAlchemy-style session (it is only
# used through sess.query(...).filter(...).all()) - confirm against
# doTheSQLiteAndGetItsPointer.
sess = doTheSQLiteAndGetItsPointer()
# submittedFiles = glob.glob(ceBase + "DOING/*.txt")
submittedFiles = glob.glob(ceBase + "DOING/RALSpecific/*.txt")
# print getStatusForJob(sess, "R-uFTSList-02062018-230631.txt")
# NOTE(review): kount is not used in the visible part of this script.
kount = 0
for ff in submittedFiles:
    # Keep only the file name: it is the key used for the lookups below.
    tFN = ff.split("/")[-1]
    # tFN = "R-uFTSList-02062018-230631.txt"
    # Only fServ is used below; fID is referenced solely by the commented-out
    # cancel code.
    (fID, fStat, fIter, fServ) = getStatusForJob(sess, tFN)
    # The first file recorded against any other server stops the whole script
    # (the softer `continue` alternative is commented out).
    if fServ != "https://lcgfts3.gridpp.rl.ac.uk:8446":
        # continue
        sys.exit()
    context = fts3.Context(fServ)
    # We have jobs submitted  to the RAL FTS server only
    # First cancel the job
    # print "Cancelling job : ", fID, " file :", tFN
    # stat = fts3.cancel(context, fID)
    # The file is moved out of RALSpecific/ before the submission and is not
    # moved back afterwards (the move back is commented out below).
    shutil.move(ceBase + "DOING/RALSpecific/" + tFN, ceBase + "DOING/" + tFN)
    ftsJobID, ftsServ = submitTheFTSJob(context, tFN)
    # The URL replaces the last character of the server address by "9"
    # (presumably turning port 8446 into the 8449 monitoring port - confirm).
    # This line is printed before the "-1" check, so also for skipped jobs.
    print "Submitted file : ", tFN, " with fts ID : ", ftsJobID, " to server ", ftsServ, " URL:", ftsServ[:
                                                                                                          -1] + "9/fts3/ftsmon/#/job/" + ftsJobID
    # shutil.move(ceBase + "DOING/" + tFN, ceBase + "DOING/RALSpecific/" + tFN)
    # A job id of "-1" is skipped: nothing is written to the DB for this file.
    if ftsJobID == "-1": continue
    # Now I have a (file, job id) pair - write them to the SQLite DB.
    m = sess.query(ftsjob).filter(ftsjob.ftsFile == tFN).all()
    # Only the first matching record is updated; no commit is visible in this
    # part of the file.
    if m:
        m = m[0]
        m.ftsID = ftsJobID
Пример #29
0
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.

import json
import logging
import fts3.rest.client.easy as fts3
from optparse import OptionParser

# Dump a snapshot of the FTS3 server state as indented JSON on stdout.
# The only option is the FTS3 REST endpoint to query (-s / --endpoint).
opts = OptionParser()
opts.add_option('-s',
                '--endpoint',
                dest='endpoint',
                default='https://fts3-pilot.cern.ch:8446')

(options, args) = opts.parse_args()

# Make the FTS3 REST client log at DEBUG level
logging.getLogger('fts3.rest.client').setLevel(logging.DEBUG)

context = fts3.Context(options.endpoint)
snapshot = fts3.get_snapshot(context)
# Single-argument call form: prints the same text with the Python 2 print
# statement and with the Python 3 print function
print(json.dumps(snapshot, indent=2))
# Cancel the FTS job recorded in the DB for every job list file under DOING/
jobFiles = glob.glob(ceBase + "DOING/*.txt")

for jFile in jobFiles:
    # The DB records are keyed by the bare file name
    jFileName = jFile.split("/")[-1]
    jobID = sess.query(ftsjob).filter(ftsjob.ftsFile == jFileName).all()
    if len(jobID) > 0:
        # only the first matching record is considered
        jobID = jobID[0]
    else:
        # no record for this file: show the (empty) result and move on
        print(jobID)
        continue
    ftsJID = jobID.ftsID.strip()
    ftsServer = jobID.ftsServer.strip()
    # if not ("fts3.gridpp" in ftsServer): continue
    try:
        context = fts3.Context(ftsServer)
    except Exception as e:
        print("Exception creating FTS context  %s" % (e, ))
        continue
    # The messages are built as single strings (keeping the exact text the
    # former print statements produced) so that they are valid both with the
    # Python 2 print statement and the Python 3 print function
    print("Cancelling job :  %s  file : %s  status  %s" %
          (ftsJID, jFileName, jobID.ftsStatus.strip()))
    try:
        stat = fts3.cancel(context, ftsJID)
    except Exception as e:
        print("Exception in cancelling :  %s" % (e, ))
        # sys.exit()
        continue

# for jobID in rows:
#   if jobID.ftsStatus.strip() in finalStatuses: continue