Example #1
    def __init__(self, resthost, jsonDoc, logger=None):
        if not logger:
            self.logger = logging.getLogger(__name__)
            handler = logging.StreamHandler(sys.stdout)
            formatter = logging.Formatter(
                "%(asctime)s:%(levelname)s:%(module)s %(message)s")
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)
            self.logger.setLevel(logging.DEBUG)
        else:
            self.logger = logger

        self.jsonDoc = jsonDoc
        self.resthost = resthost
        self.pool = ''
        self.schedds = []
        self.resthost = "cmsweb.cern.ch:8443"
        self.crabserver = CRABRest(hostname=resthost,
                                   localcert='/data/certs/servicecert.pem',
                                   localkey='/data/certs/servicekey.pem',
                                   retry=10,
                                   userAgent='CRABTaskWorker')
        self.crabserver.setDbInstance(dbInstance='prod')
        # use child collector on port 9620 to get schedd attributes
        collName = "cmsgwms-collector-global.cern.ch:9620,cmsgwms-collector-global.fnal.gov:9620"
        self.coll = htcondor.Collector(collName)
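
The constructor above only builds the Collector object; a minimal sketch of how its schedd classads might then be retrieved is shown below. The helper name and the projection list are illustrative assumptions, not code from the original class.

import htcondor  # already imported by the module that defines the class above

def listScheddNames(coll, logger):
    """Illustrative helper: query the child collector built in __init__ for schedd classads."""
    ads = coll.query(htcondor.AdTypes.Schedd,
                     projection=["Name", "TotalRunningJobs", "TotalIdleJobs"])
    for ad in ads:
        logger.debug("schedd %s", ad.get("Name"))
    return [ad.get("Name") for ad in ads]
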
Example #2
def handleKill(resthost, dbInstance, config, task, procnum, *args, **kwargs):
    """Asks to kill jobs

    :arg str resthost: the hostname where the rest interface is running
    :arg str dbInstance: the rest base url to contact
    :arg WMCore.Configuration config: input configuration
    :arg TaskWorker.DataObjects.Task task: the task to work on
    :arg int procnum: the process number taking care of the work
    :*args and *kwargs: extra parameters currently not defined
    :return: the result of the handler operation."""
    crabserver = CRABRest(resthost,
                          config.TaskWorker.cmscert,
                          config.TaskWorker.cmskey,
                          retry=20,
                          logger=logging.getLogger(str(procnum)),
                          userAgent='CRABTaskWorker',
                          version=__version__)
    crabserver.setDbInstance(dbInstance)
    handler = TaskHandler(task, procnum, crabserver, config, 'handleKill')
    handler.addWork(
        MyProxyLogon(config=config,
                     crabserver=crabserver,
                     procnum=procnum,
                     myproxylen=60 * 5))
    handler.addWork(
        DagmanKiller(config=config, crabserver=crabserver, procnum=procnum))

    return handler.actionWork(args, kwargs)
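
The CRABRest construction followed by setDbInstance recurs in every example on this page; a hypothetical helper that factors out the pattern could look like the sketch below. The function name makeCrabRest and its default arguments are assumptions, not part of the CRAB code base.

def makeCrabRest(hostname, cert, key, dbInstance, userAgent='CRABTaskWorker', retry=3):
    """Hypothetical convenience wrapper around CRABRest + setDbInstance."""
    server = CRABRest(hostname=hostname,
                      localcert=cert,
                      localkey=key,
                      retry=retry,
                      userAgent=userAgent)
    server.setDbInstance(dbInstance=dbInstance)
    return server
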
Example #3
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('--r',
                        help='REST host name (e.g. cmsweb-testbed)',
                        required=True)
    parser.add_argument('--i',
                        help='DB instance (prod/preprod/dev)',
                        default='preprod')
    parser.add_argument('--n', help='# of ids to update (1-10)', default=1)

    args = parser.parse_args()
    n = int(args.n)
    hostname = args.r
    instance = args.i
    ids = all_ids[0:n]

    print("test: %s on db %s" % (hostname, instance))
    print("test with %d ids" % n)
    crabserver = CRABRest(hostname=hostname,
                          localcert=proxy,
                          localkey=proxy,
                          userAgent='CRABtestSB')
    crabserver.setDbInstance(dbInstance=instance)
    elapsed = mark_transferred(ids, crabserver)
    print("elapsed time: %d sec" % elapsed)
    return
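
This script relies on module-level globals (proxy, all_ids, mark_transferred) that are not shown, and note that the --n flag is not bounds-checked despite the "(1-10)" hint in its help text. A hypothetical invocation, assuming the code is saved as testMarkTransferred.py, would be:

# $ python testMarkTransferred.py --r cmsweb-testbed --i preprod --n 5
if __name__ == '__main__':
    main()
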
Example #4
def getData(subresource):
    """This function will fetch data from Oracle table"""

    crabserver = CRABRest(hostname=CMSWEB,
                          localcert=CERTIFICATE,
                          localkey=KEY,
                          retry=3,
                          userAgent='CRABTaskWorker')
    crabserver.setDbInstance(dbInstance=DBINSTANCE)
    result = crabserver.get(api='filetransfers',
                            data=encodeRequest({
                                'subresource': subresource,
                                'grouping': 0
                            }))

    return oracleOutputMapping(result)
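
Assuming the module-level constants CMSWEB, CERTIFICATE, KEY and DBINSTANCE are defined as the function implies, getData can be called with any of the filetransfers subresources used elsewhere on this page; the call and the dictionary keys below are illustrative, taken from Example #8's active_tasks docstring.

# illustrative call; 'acquiredPublication' is one of the subresources used in Example #8
docs = getData('acquiredPublication')
for doc in docs:
    print("%s %s" % (doc['taskname'], doc['transfer_state']))
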
Example #5
    def get_backendurls(self):
        # need to deal with the fact that TaskWorkerConfig may only specify a name instance
        # following code snipped is copy/pasted from MasterWorker.py
        try:
            instance = self.config.TaskWorker.instance
        except:
            msg = "No instance provided: need to specify config.TaskWorker.instance in the configuration"
            raise ConfigException(msg)
        if instance in SERVICE_INSTANCES:
            self.logger.info('Will connect to CRAB service: %s', instance)
            self.restHost = SERVICE_INSTANCES[instance]['restHost']
            self.dbInstance = SERVICE_INSTANCES[instance]['dbInstance']
        else:
            msg = "Invalid instance value '%s'" % instance
            raise ConfigException(msg)
        if instance == 'other':
            self.logger.info(
                'Will use restHost and dbInstance from config file')
            try:
                self.restHost = self.config.TaskWorker.restHost
                self.dbInstance = self.config.TaskWorker.dbInstance
            except:
                msg = "Need to specify config.TaskWorker.restHost and dbInstance in the configuration"
                raise ConfigException(msg)

        self.logger.info(
            "Querying server %s for HTCondor schedds and pool names.",
            self.restHost)
        crabserver = CRABRest(self.restHost,
                              self.config.TaskWorker.cmscert,
                              self.config.TaskWorker.cmskey,
                              retry=2,
                              userAgent='CRABTaskWorker')
        crabserver.setDbInstance(self.dbInstance)
        result = crabserver.get(api='info',
                                data={'subresource':
                                      'backendurls'})[0]['result'][0]
        self.pool = str(result['htcondorPool'])
        self.schedds = [str(i) for i in result['htcondorSchedds']]
        self.logger.info("Resulting pool %s; schedds %s", self.pool,
                         ",".join(self.schedds))
Example #6
def monitor(user, taskname, log):
    """ function monitoring the Rucio replica locks of a rule 
    and updating db statuses accordingly 

    :param user: user HN name
    :type user: str
    :param taskname: CRAB taskname
    :type taskname: str
    :param log: log object
    :type log: logging
    """
    os.environ["X509_CERT_DIR"] = os.getcwd()

    proxy = None
    if os.path.exists('task_process/RestInfoForFileTransfers.json'):
        with open('task_process/RestInfoForFileTransfers.json') as fp:
            restInfo = json.load(fp)
            proxy = os.getcwd() + "/" + restInfo['proxyfile']
            log.info("Proxy: %s", proxy)
            os.environ["X509_USER_PROXY"] = proxy

    if not proxy:
        log.info('No proxy available yet - waiting for first post-job')
        return None

    # Prepare user and task info for monitoring
    scope = "user."+user
    name = taskname
    log.info("Initializing Monitor Rucio client for %s", taskname)
    crabInj = CRABDataInjector("", "", scope=scope, account=user, auth_type='x509_proxy')

    id_map = {}
    lfn_map = {}
    source_rse = {}

    # create maps for lfn --> oracle id, source rse 
    if os.path.exists('task_process/transfers.txt'):
        with open('task_process/transfers.txt', 'r') as _list:
            for _data in _list.readlines():
                try:
                    doc = json.loads(_data)
                    id_map.update({doc['destination_lfn']: doc['id']})
                    lfn_map.update({doc['id']: doc['destination_lfn']})
                    source_rse.update({doc['destination_lfn']: doc['source']+"_Temp"})
                except Exception:
                    continue
    if os.path.exists('task_process/transfers_direct.txt'):
        with open('task_process/transfers_direct.txt', 'r') as _list:
            for _data in _list.readlines():
                try:
                    doc = json.loads(_data)
                    id_map.update({doc['destination_lfn']: doc['id']})
                    lfn_map.update({doc['id']: doc['destination_lfn']})
                except Exception:
                    continue

    # get the rule for this rucio dataset
    try:
        rules_ = crabInj.cli.list_did_rules(scope, name)
        # {u'name': u'/store/user/dciangot/DStarToD0Pi_D0KPi_DStarFilter_TuneCP5_13TeV-pythia8-evtgen/crab_DStar_rucio_rucio_198_7/190129_085050/0000/DS2b_17_1.root', u'rse': u'T2_IT_Pisa', u'state': u'OK', u'scope': u'user.dciangot', u'rse_id': u'200b6830ca424d87a2e0ae855341b084', u'rule_id': u'4bc56a77ac6743e791dfedaa11db1e1c'}
        list_good = []
        list_failed = []
        list_failed_tmp = []
        list_stuck = []
        list_update = []

        rules = next(rules_)
        log.debug("RULES %s", rules)

    except Exception:
        log.exception("Failed to retrieve rule information")
        return

    locks_generator = None

    # get replica locks and monitor status
    try:
        locks_generator = crabInj.cli.list_replica_locks(rules['id'])
    except Exception:
        if rules['state'] == 'STUCK':
            transfers = crabInj.cli.examine_replication_rule(rules['id'])['transfers']
            for lfn in transfers:
                list_stuck.append((lfn['name'], 'Rule STUCK.'))
        else:
            log.exception('Unable to get replica locks')
            return

    # analyze replica locks info for each file
    # NOTE: locks_generator stays None when the rule was STUCK, hence the guard below
    sitename = None
    # TODO: should we split in threads ?
    for file_ in locks_generator or []:
        log.debug("LOCK %s", file_)
        filename = file_['name']
        if filename not in id_map:
            # This is needed because in Rucio we allow user to publish 2 different tasks
            # within the same Rucio dataset
            log.info("Skipping file from previous tasks: %s", filename)
            continue
        status = file_['state']
        log.info("state %s", status)
        sitename = file_['rse']

        if status == "OK":
            list_good.append(filename)
        if status == "STUCK":
            list_failed_tmp.append((filename, "Transfer Stuck", sitename))
        if status == "REPLICATING":
            try:
                ftsJobID = crabInj.cli.list_request_by_did(filename, sitename, scope)["external_id"]
                if ftsJobID:
                    list_update.append((filename, ftsJobID))
            except Exception:
                log.exception("Replica lock not found")

    # Expose FTS job ID in case of failure (if available)   
    for name_ in [x[0] for x in list_failed_tmp]:
        try:
            ftsJobID = crabInj.cli.list_request_by_did(name_, sitename, scope)["external_id"]
            if ftsJobID:
                list_failed.append((name_, "FTS job ID: %s" % ftsJobID))
            else:
                list_failed.append((name_, "No FTS job ID available for stuck transfers. Rucio could have failed to submit FTS job."))
        except Exception:
            log.error("No FTS job ID available for stuck transfer %s. Rucio could have failed to submit FTS job." % name_)
            list_failed.append((name_, "No FTS job ID available for stuck transfers. Rucio could have failed to submit FTS job."))

    # Filter out files already staged directly from the wn
    direct_files = []
    if os.path.exists('task_process/transfers/registered_direct_files.txt'):
        with open("task_process/transfers/registered_direct_files.txt", "r") as list_file:
            direct_files = [x.split('\n')[0] for x in list_file.readlines()]
            log.debug("Checking if some failed files were directly staged from wn: {0}".format(str(direct_files)))
            list_failed = [x for x in list_failed if x[0] not in direct_files]
            log.debug("{0} files to be marked as failed.".format(str(len(list_failed))))

    try:
        crabserver = CRABRest(restInfo['host'], localcert=proxy, localkey=proxy,
                              userAgent='CRABSchedd')
        crabserver.setDbInstance(restInfo['dbInstance'])
    except Exception:
        log.exception("Failed to set connection to crabserver")
        return

    # Mark FAILED files on the DB and remove them from dataset and rucio replicas
    try:
        if len(list_failed) > 0:
            list_failed_name = [{'scope': scope, 'name': x[0]} for x in list_failed]
            log.debug("Detaching %s" % list_failed_name)
            crabInj.cli.detach_dids(scope, name, list_failed_name)
            sources = list(set([source_rse[x['name']] for x in list_failed_name]))
            for source in sources:
                to_delete = [x for x in list_failed_name if source_rse[x['name']] == source]
                log.debug("Deleting %s from %s" % (to_delete, source))
                crabInj.delete_replicas(source, to_delete)
            mark_failed([id_map[x[0]] for x in list_failed], [x[1] for x in list_failed], crabserver)
    except ReplicaNotFound:
        try:
            mark_failed([id_map[x[0]] for x in list_failed], [x[1] for x in list_failed], crabserver)
        except Exception:
            log.exception("Failed to update status for failed files")
    except Exception:
        log.exception("Failed to update status for failed files")

    # Mark files of STUCK rules on the DB and remove them from dataset and rucio replicas
    try:
        if len(list_stuck) > 0:
            list_stuck_name = [{'scope': scope, 'name': x[0]} for x in list_stuck]
            log.debug("Detaching %s" % list_stuck_name)
            crabInj.cli.detach_dids(scope, name, list_stuck_name)
            sources = list(set([source_rse[x['name']] for x in list_stuck_name]))
            for source in sources:
                to_delete = [x for x in list_stuck_name if source_rse[x['name']] == source]
                log.debug("Deleting %s from %s" % (to_delete, source))
                crabInj.delete_replicas(source, to_delete)
            mark_failed([id_map[x[0]] for x in list_stuck], [x[1] for x in list_stuck], crabserver)
    except ReplicaNotFound:
        try:
            mark_failed([id_map[x[0]] for x in list_stuck], [x[1] for x in list_stuck], crabserver)
        except Exception:
            log.exception("Failed to update status for stuck files")
    except Exception:
        log.exception("Failed to update status for stuck rule")

    # Mark successful transfers as done on oracle DB
    try:
        mark_transferred([id_map[x] for x in list_good], crabserver)
    except Exception:
        log.exception("Failed to update status for transferred files")

    try:
        already_list = []
        list_update_filt = []

        # Keep track of what has been already marked. Avoiding double updates at next iteration
        if os.path.exists("task_process/transfers/submitted_files.txt"):
            with open("task_process/transfers/submitted_files.txt", "r") as list_file:
                for _data in list_file.readlines():
                    already_list.append(_data.split("\n")[0])

        list_update_filt = [x for x in list_update if x not in already_list and x[0] not in direct_files]

        # Insert FTS job ID in oracle DB for all the available transfers
        if len(list_update_filt) > 0:
            list_update = list_update_filt
            fileDoc = dict()
            fileDoc['asoworker'] = 'rucio'
            fileDoc['subresource'] = 'updateTransfers'
            fileDoc['list_of_ids'] = [id_map[x[0]] for x in list_update]
            fileDoc['list_of_transfer_state'] = ["SUBMITTED" for _ in list_update]
            fileDoc['list_of_fts_instance'] = ['https://fts3-cms.cern.ch:8446/' for _ in list_update]
            fileDoc['list_of_fts_id'] = [x[1] for x in list_update]
            crabserver.post(api='filetransfers',
                            data=encodeRequest(fileDoc))
            log.debug("Marked submitted %s" % [id_map[x[0]] for x in list_update])

            with open("task_process/transfers/submitted_files.txt", "a+") as list_file:
                for update in list_update:
                    log.debug("{0}\n".format(str(update)))
                    list_file.write("{0}\n".format(str(update)))
        else:
            log.info("Nothing to update (fts job ID)")
    except Exception:
        log.exception('Failed to update file status for FTSJobID inclusion.')
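
mark_transferred and mark_failed are imported helpers whose bodies are not shown on this page. Based on the updateTransfers call visible at the end of this example, a sketch of what mark_transferred might do is given below; the helper signature and the 'DONE' state name are assumptions inferred from the fileDoc fields used above.

def mark_transferred(ids, crabserver):
    """Sketch: flag the given Oracle ids as transferred via the filetransfers API."""
    fileDoc = {
        'asoworker': 'rucio',
        'subresource': 'updateTransfers',
        'list_of_ids': ids,
        # 'DONE' is assumed to be the terminal transfer state name
        'list_of_transfer_state': ['DONE' for _ in ids],
    }
    crabserver.post(api='filetransfers', data=encodeRequest(fileDoc))
    return 0
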
Example #7
    def __init__(self, confFile=None, quiet=False, debug=True, testMode=False):
        """
        Initialise class members

        :arg WMCore.Configuration config: input Publisher configuration
        :arg bool quiet: it tells if a quiet logger is needed
        :arg bool debug: it tells if needs a verbose logger
        :arg bool testMode: it tells if to run in test (no subprocesses) mode.
        """
        def createLogdir(dirname):
            """ Create the directory dirname ignoring erors in case it exists. Exit if
                the directory cannot be created.
            """
            try:
                os.makedirs(dirname)
            except OSError as ose:
                if ose.errno != 17:  #ignore the "Directory already exists error"
                    print(str(ose))
                    print(
                        "The Publisher Worker needs to access the '%s' directory"
                        % dirname)
                    sys.exit(1)

        def setRootLogger(logsDir, quiet=False, debug=True, console=False):
            """Sets the root logger with the desired verbosity level
               The root logger logs to logs/log.txt and every single
               logging instruction is propagated to it (not really nice
               to read)

            :arg bool quiet: it tells if a quiet logger is needed
            :arg bool debug: it tells if needs a verbose logger
            :arg bool console: it tells if to direct all printout to the console rather than files, useful for debugging
            :return logger: a logger with the appropriate logger level."""

            createLogdir(logsDir)
            createLogdir(os.path.join(logsDir, 'processes'))
            createLogdir(os.path.join(logsDir, 'tasks'))

            if console:
                # if we are testing log to the console is easier
                logging.getLogger().addHandler(logging.StreamHandler())
            else:
                logHandler = MultiProcessingLog(os.path.join(
                    logsDir, 'log.txt'),
                                                when='midnight')
                logFormatter = logging.Formatter(
                    "%(asctime)s:%(levelname)s:%(module)s,%(lineno)d:%(message)s"
                )
                logHandler.setFormatter(logFormatter)
                logging.getLogger().addHandler(logHandler)
            loglevel = logging.INFO
            if quiet:
                loglevel = logging.WARNING
            if debug:
                loglevel = logging.DEBUG
            logging.getLogger().setLevel(loglevel)
            logger = setMasterLogger()
            logger.debug("PID %s.", os.getpid())
            logger.debug("Logging level initialized to %s.", loglevel)
            return logger

        def logVersionAndConfig(config=None, logger=None):
            """
            log version number and major config. parameters
            args: config : a configuration object loaded from file
            args: logger : the logger instance to use
            """
            pubstartDict = {}
            pubstartDict['version'] = __version__
            pubstartDict['asoworker'] = config.General.asoworker
            pubstartDict['instance'] = config.General.instance
            if config.General.instance == 'other':
                pubstartDict['restHost'] = config.General.restHost
                pubstartDict['dbInstance'] = config.General.dbInstance
            pubstartDict['max_slaves'] = config.General.max_slaves
            pubstartDict['DBShost'] = config.TaskPublisher.DBShost
            pubstartDict['dryRun'] = config.TaskPublisher.dryRun
            # one line for automatic parsing
            logger.info('PUBSTART: %s', json.dumps(pubstartDict))
            # multiple lines for humans to read
            for k, v in pubstartDict.items():
                logger.info('%s: %s', k, v)
            return

        self.configurationFile = confFile  # remember this, will have to pass it to TaskPublish
        config = loadConfigurationFile(confFile)
        self.config = config.General
        self.TPconfig = config.TaskPublisher

        # these are used for talking to DBS
        os.putenv('X509_USER_CERT', self.config.serviceCert)
        os.putenv('X509_USER_KEY', self.config.serviceKey)
        self.block_publication_timeout = self.config.block_closure_timeout
        self.lfn_map = {}
        self.force_publication = False
        self.force_failure = False
        self.TestMode = testMode
        self.taskFilesDir = self.config.taskFilesDir
        createLogdir(self.taskFilesDir)
        createLogdir(os.path.join(self.taskFilesDir, 'FailedBlocks'))

        self.logger = setRootLogger(self.config.logsDir,
                                    quiet=quiet,
                                    debug=debug,
                                    console=self.TestMode)

        logVersionAndConfig(config, self.logger)

        from WMCore.Credential.Proxy import Proxy
        proxy = Proxy({'logger': self.logger})
        from ServerUtilities import tempSetLogLevel
        with tempSetLogLevel(self.logger, logging.ERROR):
            self.myDN = proxy.getSubjectFromCert(
                certFile=self.config.serviceCert)

        try:
            instance = self.config.instance
        except:
            msg = "No instance provided: need to specify config.General.instance in the configuration"
            raise ConfigException(msg)

        if instance in SERVICE_INSTANCES:
            self.logger.info('Will connect to CRAB service: %s', instance)
            restHost = SERVICE_INSTANCES[instance]['restHost']
            dbInstance = SERVICE_INSTANCES[instance]['dbInstance']
        else:
            msg = "Invalid instance value '%s'" % instance
            raise ConfigException(msg)
        if instance == 'other':
            self.logger.info(
                'Will use restHost and dbInstance from config file')
            try:
                restHost = self.config.restHost
                dbInstance = self.config.dbInstance
            except:
                msg = "Need to specify config.General.restHost and dbInstance in the configuration"
                raise ConfigException(msg)

        self.logger.info(
            'Will connect to CRAB Data Base %s instance via URL: https://%s',
            dbInstance, restHost)

        # CRAB REST API's
        self.max_files_per_block = self.config.max_files_per_block
        self.crabServer = CRABRest(hostname=restHost,
                                   localcert=self.config.serviceCert,
                                   localkey=self.config.serviceKey,
                                   retry=3,
                                   userAgent='CRABPublisher')
        self.crabServer.setDbInstance(dbInstance=dbInstance)
        self.startTime = time.time()
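
createLogdir above re-implements "ignore if it already exists" by checking errno 17. On Python 3 the same behaviour can be written more compactly, as in the sketch below; it is not a drop-in replacement, since it does not print a message and call sys.exit on other failures.

import os

def createLogdir(dirname):
    """Create dirname, silently ignoring the case where it already exists (Python 3)."""
    os.makedirs(dirname, exist_ok=True)
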
Example #8
class Master(object):
    """I am the main daemon kicking off all Publisher work via slave Publishers"""
    def __init__(self, confFile=None, quiet=False, debug=True, testMode=False):
        """
        Initialise class members

        :arg WMCore.Configuration config: input Publisher configuration
        :arg bool quiet: it tells if a quiet logger is needed
        :arg bool debug: it tells if needs a verbose logger
        :arg bool testMode: it tells if to run in test (no subprocesses) mode.
        """
        def createLogdir(dirname):
            """ Create the directory dirname ignoring erors in case it exists. Exit if
                the directory cannot be created.
            """
            try:
                os.makedirs(dirname)
            except OSError as ose:
                if ose.errno != 17:  #ignore the "Directory already exists error"
                    print(str(ose))
                    print(
                        "The Publisher Worker needs to access the '%s' directory"
                        % dirname)
                    sys.exit(1)

        def setRootLogger(logsDir, quiet=False, debug=True, console=False):
            """Sets the root logger with the desired verbosity level
               The root logger logs to logs/log.txt and every single
               logging instruction is propagated to it (not really nice
               to read)

            :arg bool quiet: it tells if a quiet logger is needed
            :arg bool debug: it tells if needs a verbose logger
            :arg bool console: it tells if to direct all printout to the console rather than files, useful for debugging
            :return logger: a logger with the appropriate logger level."""

            createLogdir(logsDir)
            createLogdir(os.path.join(logsDir, 'processes'))
            createLogdir(os.path.join(logsDir, 'tasks'))

            if console:
                # if we are testing log to the console is easier
                logging.getLogger().addHandler(logging.StreamHandler())
            else:
                logHandler = MultiProcessingLog(os.path.join(
                    logsDir, 'log.txt'),
                                                when='midnight')
                logFormatter = logging.Formatter(
                    "%(asctime)s:%(levelname)s:%(module)s,%(lineno)d:%(message)s"
                )
                logHandler.setFormatter(logFormatter)
                logging.getLogger().addHandler(logHandler)
            loglevel = logging.INFO
            if quiet:
                loglevel = logging.WARNING
            if debug:
                loglevel = logging.DEBUG
            logging.getLogger().setLevel(loglevel)
            logger = setMasterLogger()
            logger.debug("PID %s.", os.getpid())
            logger.debug("Logging level initialized to %s.", loglevel)
            return logger

        def logVersionAndConfig(config=None, logger=None):
            """
            log version number and major config. parameters
            args: config : a configuration object loaded from file
            args: logger : the logger instance to use
            """
            pubstartDict = {}
            pubstartDict['version'] = __version__
            pubstartDict['asoworker'] = config.General.asoworker
            pubstartDict['instance'] = config.General.instance
            if config.General.instance == 'other':
                pubstartDict['restHost'] = config.General.restHost
                pubstartDict['dbInstance'] = config.General.dbInstance
            pubstartDict['max_slaves'] = config.General.max_slaves
            pubstartDict['DBShost'] = config.TaskPublisher.DBShost
            pubstartDict['dryRun'] = config.TaskPublisher.dryRun
            # one line for automatic parsing
            logger.info('PUBSTART: %s', json.dumps(pubstartDict))
            # multiple lines for humans to read
            for k, v in pubstartDict.items():
                logger.info('%s: %s', k, v)
            return

        self.configurationFile = confFile  # remember this, will have to pass it to TaskPublish
        config = loadConfigurationFile(confFile)
        self.config = config.General
        self.TPconfig = config.TaskPublisher

        # these are used for talking to DBS
        os.putenv('X509_USER_CERT', self.config.serviceCert)
        os.putenv('X509_USER_KEY', self.config.serviceKey)
        self.block_publication_timeout = self.config.block_closure_timeout
        self.lfn_map = {}
        self.force_publication = False
        self.force_failure = False
        self.TestMode = testMode
        self.taskFilesDir = self.config.taskFilesDir
        createLogdir(self.taskFilesDir)
        createLogdir(os.path.join(self.taskFilesDir, 'FailedBlocks'))

        self.logger = setRootLogger(self.config.logsDir,
                                    quiet=quiet,
                                    debug=debug,
                                    console=self.TestMode)

        logVersionAndConfig(config, self.logger)

        from WMCore.Credential.Proxy import Proxy
        proxy = Proxy({'logger': self.logger})
        from ServerUtilities import tempSetLogLevel
        with tempSetLogLevel(self.logger, logging.ERROR):
            self.myDN = proxy.getSubjectFromCert(
                certFile=self.config.serviceCert)

        try:
            instance = self.config.instance
        except:
            msg = "No instance provided: need to specify config.General.instance in the configuration"
            raise ConfigException(msg)

        if instance in SERVICE_INSTANCES:
            self.logger.info('Will connect to CRAB service: %s', instance)
            restHost = SERVICE_INSTANCES[instance]['restHost']
            dbInstance = SERVICE_INSTANCES[instance]['dbInstance']
        else:
            msg = "Invalid instance value '%s'" % instance
            raise ConfigException(msg)
        if instance == 'other':
            self.logger.info(
                'Will use restHost and dbInstance from config file')
            try:
                restHost = self.config.restHost
                dbInstance = self.config.dbInstance
            except:
                msg = "Need to specify config.General.restHost and dbInstance in the configuration"
                raise ConfigException(msg)

        self.logger.info(
            'Will connect to CRAB Data Base %s instance via URL: https://%s',
            dbInstance, restHost)

        # CRAB REST API's
        self.max_files_per_block = self.config.max_files_per_block
        self.crabServer = CRABRest(hostname=restHost,
                                   localcert=self.config.serviceCert,
                                   localkey=self.config.serviceKey,
                                   retry=3,
                                   userAgent='CRABPublisher')
        self.crabServer.setDbInstance(dbInstance=dbInstance)
        self.startTime = time.time()

    def active_tasks(self, crabserver):
        """
        :param crabserver: CRABRest object to access the proper REST, as created in the __init__ method
        TODO: detail here the structure it returns
        :return: a list of tuples [(task, info)]. One element for each task which has jobs to be published;
                 each list element has the format (task, info) where task and info are lists
                 task=[] a set of 4 attributes formatted as a list of strings:
                    [username, usergroup, userrole, taskname]
                    this is a subset of the information present in each element of the next list !
                 info=[filedic1, filedic2, filedic3...] a list of dictionaries, one per file, with keys:
                   u'username', u'cache_url', u'source_lfn', u'publication_state', u'destination',
                   u'user_role', u'last_update', u'input_dataset', u'dbs_url', u'aso_worker',
                   u'user_group', u'taskname', u'transfer_state', u'destination_lfn'
        """

        self.logger.debug("Retrieving publications from oracleDB")
        filesToPublish = []
        asoworkers = self.config.asoworker
        # asoworkers can be a string or a list of strings
        # but if it is a string, do not turn it into a list of chars !
        asoworkers = [asoworkers] if isinstance(asoworkers,
                                                basestring) else asoworkers
        for asoworker in asoworkers:
            self.logger.info(
                "Processing publication requests for asoworker: %s", asoworker)
            fileDoc = {}
            fileDoc['asoworker'] = asoworker
            fileDoc['subresource'] = 'acquirePublication'
            data = encodeRequest(fileDoc)
            try:
                result = crabserver.post(api='filetransfers', data=data)  # pylint: disable=unused-variable
            except Exception as ex:
                self.logger.error(
                    "Failed to acquire publications from crabserver: %s", ex)
                return []

            self.logger.debug(
                "Retrieving max.100000 acquired publications from oracleDB")
            fileDoc = dict()
            fileDoc['asoworker'] = asoworker
            fileDoc['subresource'] = 'acquiredPublication'
            fileDoc['grouping'] = 0
            fileDoc['limit'] = 100000
            data = encodeRequest(fileDoc)
            try:
                results = crabserver.get(api='filetransfers', data=data)
            except Exception as ex:
                self.logger.error(
                    "Failed to acquire publications from crabserver: %s", ex)
                return []
            files = oracleOutputMapping(results)
            self.logger.info(
                "%s acquired publications retrieved for asoworker %s",
                len(files), asoworker)
            filesToPublish.extend(files)

        # TODO: join query for publisher (same of submitter)
        unique_tasks = [
            list(i) for i in set(
                tuple([
                    x['username'], x['user_group'], x['user_role'],
                    x['taskname']
                ]) for x in filesToPublish if x['transfer_state'] == 3)
        ]

        info = []
        for task in unique_tasks:
            info.append(
                [x for x in filesToPublish if x['taskname'] == task[3]])
        return zip(unique_tasks, info)

    def getPublDescFiles(self, workflow, lfn_ready, logger):
        """
        Download and read the files describing what needs to be published
        CRAB REST does not have any good way to select from the DB only what we need
        most efficient way is to get full list for the task, and then trim it here
        see: https://github.com/dmwm/CRABServer/issues/6124
        """
        out = []

        dataDict = {}
        dataDict['taskname'] = workflow
        dataDict['filetype'] = 'EDM'
        data = encodeRequest(dataDict)
        try:
            res = self.crabServer.get(api='filemetadata', data=data)
            # res is a 3-tuple: (result, exit code, status)
            res = res[0]
        except Exception as ex:
            logger.error(
                "Error during metadata retrieving from crabserver:\n%s", ex)
            return out

        metadataList = [json.loads(md) for md in res['result']
                        ]  # CRAB REST returns a list of JSON objects
        for md in metadataList:
            # pick only the metadata we need
            if md['lfn'] in lfn_ready:
                out.append(md)

        logger.info('Got filemetadata for %d LFNs', len(out))
        return out

    def algorithm(self):
        """
        1. Get a list of files to publish from the REST and organize by taskname
        2. For each task get a suitably sized input to publish
        3. Submit the publish to a subprocess
        """

        self.startTime = time.time()
        tasks = self.active_tasks(self.crabServer)

        # example code, uncomment to pick only one task
        # myTask='180912_142016:arizzi_crab_NanoDnRMXDYJetsToLL_M-105To160_TuneCUETP8M1_13TeV-amcaRunIISummer16MiniAODv2-PUMorio__heIV_v6-v22201'
        # tasksToDo=[]
        # for t in tasks:
        #  if t[0][3]==myTask:
        #  tasksToDo.append(t)
        # tasks = tasksToDo

        maxSlaves = self.config.max_slaves
        self.logger.info(
            'kicking off pool for %s tasks using up to %s concurrent slaves',
            len(tasks), maxSlaves)
        #self.logger.debug('list of tasks %s', [x[0][3] for x in tasks])
        ## print one line per task with the number of files to be published. Allows spotting stuck tasks
        self.logger.debug(' # of acquired files : taskname')
        for task in tasks:
            taskName = task[0][3]
            acquiredFiles = len(task[1])
            flag = '(***)' if acquiredFiles > 1000 else '     '  # mark suspicious tasks
            self.logger.debug('%s %5d : %s', flag, acquiredFiles, taskName)

        processes = []

        try:
            for task in tasks:
                taskname = str(task[0][3])
                # this IF is for testing on preprod or dev DB's, which are full of old unpublished tasks
                if int(taskname[0:4]) < 2008:
                    self.logger.info(
                        "Skipped %s. Ignore tasks created before August 2020.",
                        taskname)
                    continue
                username = task[0][0]
                if username in self.config.skipUsers:
                    self.logger.info("Skipped user %s task %s", username,
                                     taskname)
                    continue
                if self.TestMode:
                    self.startSlave(
                        task)  # sequentially do one task after another
                    continue
                else:  # deal with each task in a separate process
                    p = Process(target=self.startSlave, args=(task, ))
                    p.start()
                    self.logger.info('Starting process %s  pid=%s', p, p.pid)
                    self.logger.info('PID %s will work on task %s', p.pid,
                                     taskname)
                    processes.append(p)
                if len(processes) == maxSlaves:
                    while len(processes) == maxSlaves:
                        # wait until one process has completed
                        time.sleep(10)
                        for proc in processes:
                            if not proc.is_alive():
                                self.logger.info('Terminated: %s pid=%s', proc,
                                                 proc.pid)
                                processes.remove(proc)

        except Exception:
            self.logger.exception("Error during process mapping")
        self.logger.info(
            'No more tasks to care for. Wait for remaining %d processes to terminate',
            len(processes))
        while (processes):
            time.sleep(10)
            for proc in processes:
                if not proc.is_alive():
                    self.logger.info('Terminated: %s pid=%s', proc, proc.pid)
                    processes.remove(proc)

        self.logger.info("Algorithm iteration completed")
        self.logger.info("Wait %d sec for next cycle", self.pollInterval())
        newStartTime = time.strftime(
            "%H:%M:%S", time.localtime(time.time() + self.pollInterval()))
        # BEWARE: any change to message in next line needs to be synchronized with
        # a change in Publisher/stop.sh otherwise that script will break
        self.logger.info("Next cycle will start at %s", newStartTime)

    def startSlave(self, task):
        """
        start a slave process to deal with publication for a single task
        :param task: one tuple describing a task as returned by active_tasks()
        :return: 0  It will always terminate normally, if publication fails it will mark it in the DB
        """
        # TODO: lock task!
        # - process logger
        logger = setSlaveLogger(str(task[0][3]))
        logger.info("Process %s is starting. PID %s", task[0][3], os.getpid())

        self.force_publication = False
        workflow = str(task[0][3])

        if len(task[1]) > self.max_files_per_block:
            self.force_publication = True
            msg = "All datasets have more than %s ready files." % (
                self.max_files_per_block)
            msg += " No need to retrieve task status nor last publication time."
            logger.info(msg)
        else:
            msg = "At least one dataset has less than %s ready files." % (
                self.max_files_per_block)
            logger.info(msg)
            # Retrieve the workflow status. If the status can not be retrieved, continue
            # with the next workflow.
            workflow_status = ''
            msg = "Retrieving status"
            logger.info(msg)
            data = encodeRequest({'workflow': workflow})
            try:
                res = self.crabServer.get(api='workflow', data=data)
            except Exception as ex:
                logger.warn(
                    'Error retrieving status from crabserver for %s:\n%s',
                    workflow, str(ex))
                return 0

            try:
                workflow_status = res[0]['result'][0]['status']
                msg = "Task status is %s." % workflow_status
                logger.info(msg)
            except ValueError:
                msg = "Workflow removed from WM."
                logger.error(msg)
                workflow_status = 'REMOVED'
            except Exception as ex:
                msg = "Error loading task status!"
                msg += str(ex)
                msg += str(traceback.format_exc())
                logger.error(msg)
            # If the workflow status is terminal, go ahead and publish all the ready files
            # in the workflow.
            if workflow_status in ['COMPLETED', 'FAILED', 'KILLED', 'REMOVED']:
                self.force_publication = True
                if workflow_status in ['KILLED', 'REMOVED']:
                    self.force_failure = True
                msg = "Considering task status as terminal. Will force publication."
                logger.info(msg)
            # Otherwise...
            else:  ## TODO put this else in a function like def checkForPublication()
                msg = "Task status is not considered terminal."
                logger.info(msg)
                msg = "Getting last publication time."
                logger.info(msg)
                # Get when was the last time a publication was done for this workflow (this
                # should be more or less independent of the output dataset in case there are
                # more than one).
                last_publication_time = None
                data = encodeRequest({
                    'workflow': workflow,
                    'subresource': 'search'
                })
                try:
                    result = self.crabServer.get(api='task', data=data)
                    logger.debug("task: %s ", str(result[0]))
                    last_publication_time = getColumn(result[0],
                                                      'tm_last_publication')
                except Exception as ex:
                    logger.error("Error during task doc retrieving:\n%s", ex)
                if last_publication_time:
                    date = last_publication_time  # datetime in Oracle format
                    timetuple = datetime.strptime(
                        date, "%Y-%m-%d %H:%M:%S.%f").timetuple(
                        )  # convert to time tuple
                    last_publication_time = time.mktime(
                        timetuple)  # convert to seconds since Epoch (float)

                msg = "Last publication time: %s." % str(last_publication_time)
                logger.debug(msg)
                # If this is the first time a publication would be done for this workflow, go
                # ahead and publish.
                if not last_publication_time:
                    self.force_publication = True
                    msg = "There was no previous publication. Will force publication."
                    logger.info(msg)
                # Otherwise...
                else:
                    last = last_publication_time
                    msg = "Last published block time: %s" % last
                    logger.debug(msg)
                    # If the last publication was long time ago (> our block publication timeout),
                    # go ahead and publish.
                    now = int(time.time()) - time.timezone
                    time_since_last_publication = now - last
                    hours = int(time_since_last_publication / 60 / 60)
                    minutes = int(
                        (time_since_last_publication - hours * 60 * 60) / 60)
                    timeout_hours = int(self.block_publication_timeout / 60 /
                                        60)
                    timeout_minutes = int((self.block_publication_timeout -
                                           timeout_hours * 60 * 60) / 60)
                    msg = "Last publication was %sh:%sm ago" % (hours, minutes)
                    if time_since_last_publication > self.block_publication_timeout:
                        self.force_publication = True
                        msg += " (more than the timeout of %sh:%sm)." % (
                            timeout_hours, timeout_minutes)
                        msg += " Will force publication."
                    else:
                        msg += " (less than the timeout of %sh:%sm)." % (
                            timeout_hours, timeout_minutes)
                        msg += " Not enough to force publication."
                    logger.info(msg)

        # logger.info(task[1])
        try:
            if self.force_publication:
                # - get info
                active_ = [{
                    'key': [
                        x['username'], x['user_group'], x['user_role'],
                        x['taskname']
                    ],
                    'value': [
                        x['destination'], x['source_lfn'],
                        x['destination_lfn'], x['input_dataset'], x['dbs_url'],
                        x['last_update']
                    ]
                } for x in task[1] if x['transfer_state'] == 3
                           and x['publication_state'] not in [2, 3, 5]]

                lfn_ready = []
                wf_jobs_endtime = []
                pnn, input_dataset, input_dbs_url = "", "", ""
                for active_file in active_:
                    job_end_time = active_file['value'][5]
                    if job_end_time:
                        wf_jobs_endtime.append(
                            int(job_end_time) - time.timezone)
                    source_lfn = active_file['value'][1]
                    dest_lfn = active_file['value'][2]
                    self.lfn_map[dest_lfn] = source_lfn
                    if not pnn or not input_dataset or not input_dbs_url:
                        pnn = str(active_file['value'][0])
                        input_dataset = str(active_file['value'][3])
                        input_dbs_url = str(active_file['value'][4])
                    lfn_ready.append(dest_lfn)

                username = task[0][0]

                # Get metadata
                toPublish = []
                toFail = []
                publDescFiles_list = self.getPublDescFiles(
                    workflow, lfn_ready, logger)
                for file_ in active_:
                    metadataFound = False
                    for doc in publDescFiles_list:
                        # logger.info(type(doc))
                        # logger.info(doc)
                        if doc["lfn"] == file_["value"][2]:
                            doc["User"] = username
                            doc["Group"] = file_["key"][1]
                            doc["Role"] = file_["key"][2]
                            doc["UserDN"] = self.myDN
                            doc["Destination"] = file_["value"][0]
                            doc["SourceLFN"] = file_["value"][1]
                            toPublish.append(doc)
                            metadataFound = True
                            break

                    # if we failed to find metadata mark publication as failed to avoid to keep looking
                    # at same files over and over
                    if not metadataFound:
                        toFail.append(file_["value"][1])
                with open(self.taskFilesDir + workflow + '.json',
                          'w') as outfile:
                    json.dump(toPublish, outfile)
                logger.debug(
                    'Unitarity check: active_:%d toPublish:%d toFail:%d',
                    len(active_), len(toPublish), len(toFail))
                if len(toPublish) + len(toFail) != len(active_):
                    logger.error("SOMETHING WRONG IN toPublish vs toFail !!")
                if toFail:
                    logger.info(
                        'Did not find useful metadata for %d files. Mark as failed',
                        len(toFail))
                    from ServerUtilities import getHashLfn
                    nMarked = 0
                    for lfn in toFail:
                        source_lfn = lfn
                        docId = getHashLfn(source_lfn)
                        data = dict()
                        data['asoworker'] = self.config.asoworker
                        data['subresource'] = 'updatePublication'
                        data['list_of_ids'] = docId
                        data['list_of_publication_state'] = 'FAILED'
                        data['list_of_retry_value'] = 1
                        data[
                            'list_of_failure_reason'] = 'File type not EDM or metadata not found'
                        try:
                            result = self.crabServer.post(
                                api='filetransfers', data=encodeRequest(data))
                            #logger.debug("updated DocumentId: %s lfn: %s Result %s", docId, source_lfn, result)
                        except Exception as ex:
                            logger.error(
                                "Error updating status for DocumentId: %s lfn: %s",
                                docId, source_lfn)
                            logger.error("Error reason: %s", ex)

                        nMarked += 1
                        #if nMarked % 10 == 0:
                    logger.info('marked %d files as Failed', nMarked)

                # find the location in the current environment of the script we want to run
                import Publisher.TaskPublish as tp
                taskPublishScript = tp.__file__
                cmd = "python %s " % taskPublishScript
                cmd += " --configFile=%s" % self.configurationFile
                cmd += " --taskname=%s" % workflow
                if self.TPconfig.dryRun:
                    cmd += " --dry"
                logger.info("Now execute: %s", cmd)
                stdout, stderr, exitcode = executeCommand(cmd)
                if exitcode != 0:
                    errorMsg = 'Failed to execute command: %s.\n StdErr: %s.' % (
                        cmd, stderr)
                    raise Exception(errorMsg)
                else:
                    logger.info('TaskPublishScript done : %s', stdout)

                jsonSummary = stdout.split()[-1]
                with open(jsonSummary, 'r') as fd:
                    summary = json.load(fd)
                result = summary['result']
                reason = summary['reason']

                taskname = summary['taskname']
                if result == 'OK':
                    if reason == 'NOTHING TO DO':
                        logger.info('Taskname %s is OK. Nothing to do',
                                    taskname)
                    else:
                        msg = 'Taskname %s is OK. Published %d files in %d blocks.' % \
                              (taskname, summary['publishedFiles'], summary['publishedBlocks'])
                        if summary['nextIterFiles']:
                            msg += ' %d files left for next iteration.' % summary[
                                'nextIterFiles']
                        logger.info(msg)
                if result == 'FAIL':
                    logger.error('Taskname %s : TaskPublish failed with: %s',
                                 taskname, reason)
                    if reason == 'DBS Publication Failure':
                        logger.error(
                            'Taskname %s : %d blocks failed for a total of %d files',
                            taskname, summary['failedBlocks'],
                            summary['failedFiles'])
                        logger.error(
                            'Taskname %s : Failed block(s) details have been saved in %s',
                            taskname, summary['failedBlockDumps'])
        except Exception as ex:
            logger.exception("Exception when calling TaskPublish!\n%s",
                             str(ex))

        return 0

    def pollInterval(self):
        """
        Decide when the next polling cycle must start so that there is a start every
        self.config.pollInterval seconds
        :return: timeToWait: integer: the interval to wait before starting the next cycle, in seconds
        """
        now = time.time()
        elapsed = int(now - self.startTime)
        timeToWait = self.config.pollInterval - elapsed
        timeToWait = max(timeToWait, 0)  # if too much time has passed, start now!
        return timeToWait
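
The class never shows the outer loop that drives algorithm(); given the pollInterval() logic and the "Next cycle will start at" message, the surrounding daemon would plausibly look like the sketch below. The loop itself and the configuration file name are assumptions, not code from the Publisher.

# hypothetical driver loop for the Master class above
import time

master = Master(confFile='PublisherConfig.py')  # config file name is a placeholder
while True:
    master.algorithm()
    time.sleep(master.pollInterval())
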
Example #9
def algorithm():
    """

    script algorithm
    - create fts REST HTTPRequest
    - delegate user proxy to fts if needed
    - check for fts jobs to monitor and update states in oracle
    - get last line from last_transfer.txt
    - gather list of files to transfer
        + group by source
        + submit ftsjob and save fts jobid
        + update info in oracle
    - append new fts job ids to fts_jobids.txt
    """

    # TODO: pass by configuration
    fts = HTTPRequests(hostname=FTS_ENDPOINT.split("https://")[1],
                       localcert=proxy, localkey=proxy)

    logging.info("using user's proxy from %s", proxy)
    ftsContext = fts3.Context(FTS_ENDPOINT, proxy, proxy, verify=True)
    logging.info("Delegating proxy to FTS...")
    delegationId = fts3.delegate(ftsContext, lifetime=timedelta(hours=48), delegate_when_lifetime_lt=timedelta(hours=24), force=False)
    delegationStatus = fts.get("delegation/"+delegationId)
    logging.info("Delegated proxy valid until %s", delegationStatus[0]['termination_time'])

    # instantiate an object to talk with CRAB REST server

    try:
        crabserver = CRABRest(restInfo['host'], localcert=proxy, localkey=proxy,
                              userAgent='CRABSchedd')
        crabserver.setDbInstance(restInfo['dbInstance'])
    except Exception:
        logging.exception("Failed to set connection to crabserver")
        return

    with open("task_process/transfers.txt") as _list:
        _data = _list.readlines()[0]
        try:
            doc = json.loads(_data)
            username = doc["username"]
            taskname = doc["taskname"]
            destination = doc["destination"]
        except Exception as ex:
            msg = "Username gathering failed with\n%s" % str(ex)
            logging.warn(msg)
            raise ex

    try:
        logging.info("Initializing Rucio client")
        os.environ["X509_USER_PROXY"] = proxy
        logging.info("Initializing Rucio client for %s", taskname)
        rucioClient = CRABDataInjector(taskname,
                                       destination,
                                       account=username,
                                       scope="user." + username,
                                       auth_type='x509_proxy')
    except Exception as exc:
        msg = "Rucio initialization failed with\n%s" % str(exc)
        logging.warn(msg)
        raise exc

    jobs_ongoing = state_manager(fts, crabserver)
    new_jobs = submission_manager(rucioClient, ftsContext, crabserver)

    logging.info("Transfer jobs ongoing: %s, new: %s ", jobs_ongoing, new_jobs)

    return
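
algorithm() uses a restInfo dictionary and a proxy path that are never built inside the function; Example #6 shows where they come from, and a matching sketch for this script (assuming the same working-directory layout) would be:

import json
import os

with open('task_process/RestInfoForFileTransfers.json') as fp:
    restInfo = json.load(fp)
proxy = os.getcwd() + "/" + restInfo['proxyfile']
os.environ["X509_USER_PROXY"] = proxy
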
Example #10
    def __init__(self,
                 config,
                 logWarning,
                 logDebug,
                 sequential=False,
                 console=False,
                 name='master'):
        """Initializer

        :arg WMCore.Configuration config: input TaskWorker configuration
        :arg bool logWarning: it tells if a quiet logger is needed
        :arg bool logDebug: it tells if needs a verbose logger
        :arg bool sequential: it tells if to run in sequential (no subprocesses) mode.
        :arg bool console: it tells if to log to console.
        :arg string name: defines a name for the log of this master process"""
        def createLogdir(dirname):
            """ Create the directory dirname ignoring errors in case it exists. Exit if
                the directory cannot be created.
            """
            try:
                os.mkdir(dirname)
            except OSError as ose:
                if ose.errno != 17:  #ignore the "Directory already exists error"
                    print(str(ose))
                    print("The task worker need to access the '%s' directory" %
                          dirname)
                    sys.exit(1)

        def createAndCleanLogDirectories(logsDir):
            # it can be named with the time stamp a TW started
            createLogdir(logsDir)
            createLogdir(logsDir + '/tasks')
            currentProcessesDir = logsDir + '/processes/'
            createLogdir(currentProcessesDir)
            # when running inside a container process logs will start with same
            # process numbers, i.e. same name, at any container restart.
            # to avoid clashes and confusion, we will put away all previous processes
            # logs when a TW instance starts. To this goal each TW which runs
            # creates a directory where new containers will move its logs, so
            # identify LastLogs_timestamp directory
            latestLogDir = None  # the logs directory could be empty
            files = os.listdir(currentProcessesDir)
            files.sort(
                reverse=True
            )  # if there are multiple Latest*, will hit the latest first
            for f in files:
                if f.startswith('Latest'):
                    latestLogDir = currentProcessesDir + f
                    break
            if files and latestLogDir:
                # rename from Latest to Old
                oldLogsDir = latestLogDir.replace('Latest', 'Old')
                shutil.move(latestLogDir, oldLogsDir)
            else:
                print(
                    "LatestLogDir not found in logs/processes, create a dummy dir to store old files"
                )
                oldLogsDir = currentProcessesDir + 'OldLog-Unknwown'
                createLogdir(oldLogsDir)
            # move process logs for latest TW run to old directory
            for f in files:
                if f.startswith('proc.c3id'):
                    shutil.move(currentProcessesDir + f, oldLogsDir)

            # create a new LateastLogs directory where to store logs from this TaskWorker
            YYMMDD_HHMMSS = time.strftime('%y%m%d_%H%M%S', time.localtime())
            myDir = currentProcessesDir + 'LatestLogs-' + YYMMDD_HHMMSS
            createLogdir(myDir)

        def setRootLogger(logWarning, logDebug, console, name):
            """Sets the root logger with the desired verbosity level
               The root logger logs to logsDir/twlog.txt and every single
               logging instruction is propagated to it (not really nice
               to read)

            :arg bool logWarning: it tells if a quiet logger is needed
            :arg bool logDebug: it tells if needs a verbose logger
            :arg bool console: it tells if to log to console
            :arg string name: define a name for the log file of this master process
            :return logger: a logger with the appropriate logger level."""

            # this must only done for real Master, not when it is used by TapeRecallStatus
            logsDir = config.TaskWorker.logsDir
            if name == 'master':
                createAndCleanLogDirectories(logsDir)

            if console:
                logging.getLogger().addHandler(logging.StreamHandler())
            else:
                logHandler = MultiProcessingLog(logsDir + '/twlog.txt',
                                                when='midnight')
                logFormatter = \
                    logging.Formatter("%(asctime)s:%(levelname)s:%(module)s,%(lineno)d:%(message)s")
                logHandler.setFormatter(logFormatter)
                logging.getLogger().addHandler(logHandler)
            loglevel = logging.INFO
            if logWarning:
                loglevel = logging.WARNING
            if logDebug:
                loglevel = logging.DEBUG
            logging.getLogger().setLevel(loglevel)
            logger = setProcessLogger(name, logsDir)
            logger.info("PID %s.", os.getpid())
            logger.info("Logging level initialized to %s.", loglevel)
            return logger

        def logVersionAndConfig(config=None, logger=None):
            """
            log version number and major config. parameters
            args: config : a configuration object loaded from file
            args: logger : the logger instance to use
            """
            twstartDict = {}
            twstartDict['version'] = __version__
            twstartDict['DBSHostName'] = config.Services.DBSHostName
            twstartDict['name'] = config.TaskWorker.name
            twstartDict['instance'] = config.TaskWorker.instance
            if config.TaskWorker.instance == 'other':
                twstartDict['restHost'] = config.TaskWorker.restHost
                twstartDict['dbInstance'] = config.TaskWorker.dbInstance
            twstartDict['nslaves'] = config.TaskWorker.nslaves
            twstartDict[
                'recurringActions'] = config.TaskWorker.recurringActions
            # one line for automatic parsing
            logger.info('TWSTART: %s', json.dumps(twstartDict))
            # multiple lines for humans to read
            for k, v in twstartDict.items():
                logger.info('%s: %s', k, v)
            return

        self.STOP = False
        self.TEST = sequential
        self.logger = setRootLogger(logWarning, logDebug, console, name)
        self.config = config
        self.restHost = None
        dbInstance = None

        logVersionAndConfig(self.config, self.logger)

        try:
            instance = self.config.TaskWorker.instance
        except:
            msg = "No instance provided: need to specify config.TaskWorker.instance in the configuration"
            raise ConfigException(msg)

        if instance in SERVICE_INSTANCES:
            self.logger.info('Will connect to CRAB service: %s', instance)
            self.restHost = SERVICE_INSTANCES[instance]['restHost']
            dbInstance = SERVICE_INSTANCES[instance]['dbInstance']
        else:
            msg = "Invalid instance value '%s'" % instance
            raise ConfigException(msg)
        if instance == 'other':
            self.logger.info(
                'Will use restHost and dbInstance from config file')
            try:
                self.restHost = self.config.TaskWorker.restHost
                dbInstance = self.config.TaskWorker.dbInstance
            except:
                msg = "Need to specify config.TaskWorker.restHost and dbInstance in the configuration"
                raise ConfigException(msg)
        self.dbInstance = dbInstance

        self.logger.info('Will connect via URL: https://%s/%s', self.restHost,
                         self.dbInstance)

        #Let's increase the server's retries for recoverable errors in the MasterWorker
        #60 means we'll keep retrying for 1 hour basically (we retry at 20*NUMRETRY seconds, so at: 20s, 60s, 120s, 200s, 300s ...)
        self.crabserver = CRABRest(self.restHost,
                                   self.config.TaskWorker.cmscert,
                                   self.config.TaskWorker.cmskey,
                                   retry=20,
                                   logger=self.logger,
                                   userAgent='CRABTaskWorker')
        self.crabserver.setDbInstance(self.dbInstance)
        self.logger.debug("Hostcert: %s, hostkey: %s",
                          str(self.config.TaskWorker.cmscert),
                          str(self.config.TaskWorker.cmskey))
        # Retries for any failures
        if not hasattr(self.config.TaskWorker, 'max_retry'):
            self.config.TaskWorker.max_retry = 0
        if not hasattr(self.config.TaskWorker, 'retry_interval'):
            self.config.TaskWorker.retry_interval = [
                retry * 20 * 2
                for retry in range(self.config.TaskWorker.max_retry)
            ]
        if not len(self.config.TaskWorker.retry_interval
                   ) == self.config.TaskWorker.max_retry:
            raise ConfigException(
                "No correct max_retry and retry_interval specified; len of retry_interval must be equal to max_retry."
            )
        # use the config to pass some useful global stuff to all workers
        # will use TaskWorker.cmscert/key to talk with CMSWEB
        self.config.TaskWorker.envForCMSWEB = newX509env(
            X509_USER_CERT=self.config.TaskWorker.cmscert,
            X509_USER_KEY=self.config.TaskWorker.cmskey)

        if self.TEST:
            self.slaves = TestWorker(self.config, self.restHost,
                                     self.dbInstance)
        else:
            self.slaves = Worker(self.config, self.restHost, self.dbInstance)
        self.slaves.begin()
        recurringActionsNames = getattr(self.config.TaskWorker,
                                        'recurringActions', [])
        self.recurringActions = [
            self.getRecurringActionInst(name) for name in recurringActionsNames
        ]
Exemplo n.º 11
0
class MasterWorker(object):
    """I am the master of the TaskWorker"""
    def __init__(self,
                 config,
                 logWarning,
                 logDebug,
                 sequential=False,
                 console=False,
                 name='master'):
        """Initializer

        :arg WMCore.Configuration config: input TaskWorker configuration
        :arg bool logWarning: it tells if a quiet logger is needed
        :arg bool logDebug: it tells if needs a verbose logger
        :arg bool sequential: it tells if to run in sequential (no subprocesses) mode.
        :arg bool console: it tells if to log to console.
        :arg string name: defines a name for the log of this master process"""
        def createLogdir(dirname):
            """ Create the directory dirname ignoring errors in case it exists. Exit if
                the directory cannot be created.
            """
            try:
                os.mkdir(dirname)
            except OSError as ose:
                if ose.errno != 17:  #ignore the "Directory already exists error"
                    print(str(ose))
                    print("The task worker need to access the '%s' directory" %
                          dirname)
                    sys.exit(1)

        def createAndCleanLogDirectories(logsDir):
            # it can be named with the time stamp a TW started
            createLogdir(logsDir)
            createLogdir(logsDir + '/tasks')
            currentProcessesDir = logsDir + '/processes/'
            createLogdir(currentProcessesDir)
            # when running inside a container process logs will start with same
            # process numbers, i.e. same name, at any container restart.
            # to avoid clashes and confusion, we will put away all previous processes
            # logs when a TW instance starts. To this goal each TW which runs
            # creates a directory where new containers will move its logs, so
            # identify LastLogs_timestamp directory
            latestLogDir = None  # the logs directory could be empty
            files = os.listdir(currentProcessesDir)
            files.sort(
                reverse=True
            )  # if there are multiple Latest*, will hit the latest first
            for f in files:
                if f.startswith('Latest'):
                    latestLogDir = currentProcessesDir + f
                    break
            if files and latestLogDir:
                # rename from Latest to Old
                oldLogsDir = latestLogDir.replace('Latest', 'Old')
                shutil.move(latestLogDir, oldLogsDir)
            else:
                print(
                    "LatestLogDir not found in logs/processes, create a dummy dir to store old files"
                )
                oldLogsDir = currentProcessesDir + 'OldLog-Unknown'
                createLogdir(oldLogsDir)
            # move process logs for latest TW run to old directory
            for f in files:
                if f.startswith('proc.c3id'):
                    shutil.move(currentProcessesDir + f, oldLogsDir)

            # create a new LatestLogs directory in which to store logs from this TaskWorker
            YYMMDD_HHMMSS = time.strftime('%y%m%d_%H%M%S', time.localtime())
            myDir = currentProcessesDir + 'LatestLogs-' + YYMMDD_HHMMSS
            createLogdir(myDir)

        def setRootLogger(logWarning, logDebug, console, name):
            """Sets the root logger with the desired verbosity level
               The root logger logs to logsDir/twlog.txt and every single
               logging instruction is propagated to it (not really nice
               to read)

            :arg bool logWarning: it tells if a quiet logger is needed
            :arg bool logDebug: it tells if needs a verbose logger
            :arg bool console: it tells if to log to console
            :arg string name: define a name for the log file of this master process
            :return logger: a logger with the appropriate logger level."""

            # this must only be done for the real Master, not when it is used by TapeRecallStatus
            logsDir = config.TaskWorker.logsDir
            if name == 'master':
                createAndCleanLogDirectories(logsDir)

            if console:
                logging.getLogger().addHandler(logging.StreamHandler())
            else:
                logHandler = MultiProcessingLog(logsDir + '/twlog.txt',
                                                when='midnight')
                logFormatter = \
                    logging.Formatter("%(asctime)s:%(levelname)s:%(module)s,%(lineno)d:%(message)s")
                logHandler.setFormatter(logFormatter)
                logging.getLogger().addHandler(logHandler)
            loglevel = logging.INFO
            if logWarning:
                loglevel = logging.WARNING
            if logDebug:
                loglevel = logging.DEBUG
            logging.getLogger().setLevel(loglevel)
            logger = setProcessLogger(name, logsDir)
            logger.info("PID %s.", os.getpid())
            logger.info("Logging level initialized to %s.", loglevel)
            return logger

        def logVersionAndConfig(config=None, logger=None):
            """
            log version number and major config. parameters
            args: config : a configuration object loaded from file
            args: logger : the logger instance to use
            """
            twstartDict = {}
            twstartDict['version'] = __version__
            twstartDict['DBSHostName'] = config.Services.DBSHostName
            twstartDict['name'] = config.TaskWorker.name
            twstartDict['instance'] = config.TaskWorker.instance
            if config.TaskWorker.instance == 'other':
                twstartDict['restHost'] = config.TaskWorker.restHost
                twstartDict['dbInstance'] = config.TaskWorker.dbInstance
            twstartDict['nslaves'] = config.TaskWorker.nslaves
            twstartDict[
                'recurringActions'] = config.TaskWorker.recurringActions
            # one line for automatic parsing
            logger.info('TWSTART: %s', json.dumps(twstartDict))
            # multiple lines for humans to read
            for k, v in twstartDict.items():
                logger.info('%s: %s', k, v)
            return

        self.STOP = False
        self.TEST = sequential
        self.logger = setRootLogger(logWarning, logDebug, console, name)
        self.config = config
        self.restHost = None
        dbInstance = None

        logVersionAndConfig(self.config, self.logger)

        try:
            instance = self.config.TaskWorker.instance
        except:
            msg = "No instance provided: need to specify config.TaskWorker.instance in the configuration"
            raise ConfigException(msg)

        if instance in SERVICE_INSTANCES:
            self.logger.info('Will connect to CRAB service: %s', instance)
            self.restHost = SERVICE_INSTANCES[instance]['restHost']
            dbInstance = SERVICE_INSTANCES[instance]['dbInstance']
        else:
            msg = "Invalid instance value '%s'" % instance
            raise ConfigException(msg)
        if instance == 'other':
            self.logger.info(
                'Will use restHost and dbInstance from config file')
            try:
                self.restHost = self.config.TaskWorker.restHost
                dbInstance = self.config.TaskWorker.dbInstance
            except:
                msg = "Need to specify config.TaskWorker.restHost and dbInstance in the configuration"
                raise ConfigException(msg)
        self.dbInstance = dbInstance

        self.logger.info('Will connect via URL: https://%s/%s', self.restHost,
                         self.dbInstance)

        #Let's increase the server's retries for recoverable errors in the MasterWorker
        #60 means we'll keep retrying for 1 hour basically (we retry at 20*NUMRETRY seconds, so at: 20s, 60s, 120s, 200s, 300s ...)
        self.crabserver = CRABRest(self.restHost,
                                   self.config.TaskWorker.cmscert,
                                   self.config.TaskWorker.cmskey,
                                   retry=20,
                                   logger=self.logger,
                                   userAgent='CRABTaskWorker')
        self.crabserver.setDbInstance(self.dbInstance)
        self.logger.debug("Hostcert: %s, hostkey: %s",
                          str(self.config.TaskWorker.cmscert),
                          str(self.config.TaskWorker.cmskey))
        # Retries for any failures
        if not hasattr(self.config.TaskWorker, 'max_retry'):
            self.config.TaskWorker.max_retry = 0
        if not hasattr(self.config.TaskWorker, 'retry_interval'):
            self.config.TaskWorker.retry_interval = [
                retry * 20 * 2
                for retry in range(self.config.TaskWorker.max_retry)
            ]
        if not len(self.config.TaskWorker.retry_interval
                   ) == self.config.TaskWorker.max_retry:
            raise ConfigException(
                "No correct max_retry and retry_interval specified; len of retry_interval must be equal to max_retry."
            )
        # use the config to pass some useful global stuff to all workers
        # will use TaskWorker.cmscert/key to talk with CMSWEB
        self.config.TaskWorker.envForCMSWEB = newX509env(
            X509_USER_CERT=self.config.TaskWorker.cmscert,
            X509_USER_KEY=self.config.TaskWorker.cmskey)

        if self.TEST:
            self.slaves = TestWorker(self.config, self.restHost,
                                     self.dbInstance)
        else:
            self.slaves = Worker(self.config, self.restHost, self.dbInstance)
        self.slaves.begin()
        recurringActionsNames = getattr(self.config.TaskWorker,
                                        'recurringActions', [])
        self.recurringActions = [
            self.getRecurringActionInst(name) for name in recurringActionsNames
        ]

    def getRecurringActionInst(self, actionName):
        mod = __import__('TaskWorker.Actions.Recurring.%s' % actionName,
                         fromlist=actionName)
        return getattr(mod, actionName)(self.config.TaskWorker.logsDir)

    def _lockWork(self, limit, getstatus, setstatus):
        """Today this is always returning true, because we do not want the worker to die if
           the server endpoint is not available.
           Prints a log entry if answer is greater than 400:
            * the server call succeeded or
            * the server could not find anything to update or
            * the server has an internal error"""

        configreq = {
            'subresource': 'process',
            'workername': self.config.TaskWorker.name,
            'getstatus': getstatus,
            'limit': limit,
            'status': setstatus
        }
        try:
            #self.server.post(self.restURInoAPI + '/workflowdb', data=urllib.urlencode(configreq))
            self.crabserver.post(api='workflowdb',
                                 data=urllib.urlencode(configreq))
        except HTTPException as hte:
            msg = "HTTP Error during _lockWork: %s\n" % str(hte)
            msg += "HTTP Headers are %s: " % hte.headers
            self.logger.error(msg)
            return False
        except Exception:  #pylint: disable=broad-except
            self.logger.exception(
                "Server could not process the _lockWork request (prameters are %s)",
                configreq)
            return False

        return True

    def getWork(self, limit, getstatus, ignoreTWName=False):
        configreq = {
            'limit': limit,
            'workername': self.config.TaskWorker.name,
            'getstatus': getstatus
        }
        if ignoreTWName:
            configreq['workername'] = '%'

        pendingwork = []
        try:
            #pendingwork = self.server.get(self.restURInoAPI + '/workflowdb', data=configreq)[0]['result']
            pendingwork = self.crabserver.get(api='workflowdb',
                                              data=configreq)[0]['result']
        except HTTPException as hte:
            msg = "HTTP Error during getWork: %s\n" % str(hte)
            msg += "HTTP Headers are %s: " % hte.headers
            self.logger.error(msg)
        except Exception:  #pylint: disable=broad-except
            self.logger.exception(
                "Server could not process the getWork request (prameters are %s)",
                configreq)
        return pendingwork

    def quit_(self, dummyCode, dummyTraceback):
        self.logger.info(
            "Received kill request. Setting STOP flag in the master process..."
        )
        self.STOP = True

    def updateWork(self, taskname, command, status):
        """ Update taskname setting the status and the command for it
            Return True if the change succeeded, False otherwise
        """

        configreq = {
            'workflow': taskname,
            'command': command,
            'status': status,
            'subresource': 'state'
        }
        try:
            #self.server.post(self.restURInoAPI + '/workflowdb', data=urllib.urlencode(configreq))
            self.crabserver.post(api='workflowdb',
                                 data=urllib.urlencode(configreq))
        except HTTPException as hte:
            msg = "HTTP Error during updateWork: %s\n" % str(hte)
            msg += "HTTP Headers are %s: " % hte.headers
            self.logger.error(msg)
        except Exception:  #pylint: disable=broad-except
            self.logger.exception(
                "Server could not process the updateWork request (prameters are %s)",
                configreq)
        else:
            return True  #success
        return False  #failure

    def restartQueuedTasks(self):
        """ This method is used at the TW startup and it restarts QUEUED tasks
            setting them back to NEW.
        """
        limit = self.slaves.nworkers * 2
        total = 0
        while True:
            pendingwork = self.getWork(limit=limit, getstatus='QUEUED')
            for task in pendingwork:
                self.logger.debug("Restarting QUEUED task %s",
                                  task['tm_taskname'])
                self.updateWork(task['tm_taskname'], task['tm_task_command'],
                                'NEW')
            if not pendingwork:
                self.logger.info("Finished restarting QUEUED tasks (total %s)",
                                 total)
                break  #too bad "do..while" does not exist in python...
            else:
                total += len(pendingwork)
                self.logger.info(
                    "Restarted %s tasks (limit %s), getting next chunk of tasks",
                    len(pendingwork), limit)

    def failBannedTask(self, task):
        """ This method is used at the TW startup and it fails NEW tasks which I do not like
        The method put those task to SUBMITFAILED, KILLFAILED, RESUBMITFAILED depending on the value of
         the command field.
        Initial implementation bans based on a list of usernames, other task attributes can
          be checked if needed by adding a bit of code
        Returns:
            True : if the task was declared bad and was failed
            False: for normal (good) tasks
        """
        bannedUsernames = getattr(self.config.TaskWorker, 'bannedUsernames',
                                  [])
        if task['tm_username'] in bannedUsernames:
            self.logger.debug("Forcefully failing task %s",
                              task['tm_taskname'])
            if task['tm_task_command']:
                dummyWorktype, failstatus = STATE_ACTIONS_MAP[
                    task['tm_task_command']]
            else:
                failstatus = 'FAILED'
            self.updateWork(task['tm_taskname'], task['tm_task_command'],
                            failstatus)
            # TODO look into logging a message for the user
            return True
        return False

    def algorithm(self):
        """I'm the intelligent guy taking care of getting the work
           and distributing it to the slave processes."""

        self.logger.debug("Restarting QUEUED tasks before startup.")
        self.restartQueuedTasks()
        self.logger.debug("Master Worker Starting Main Cycle.")
        while not self.STOP:
            limit = self.slaves.queueableTasks()
            if not self._lockWork(
                    limit=limit, getstatus='NEW', setstatus='HOLDING'):
                time.sleep(self.config.TaskWorker.polling)
                continue

            pendingwork = self.getWork(limit=limit, getstatus='HOLDING')

            if pendingwork:
                keys = ['tm_task_command', 'tm_taskname']
                tasksInfo = [{k: v
                              for k, v in task.items() if k in keys}
                             for task in pendingwork]
                self.logger.info("Retrieved a total of %d works",
                                 len(pendingwork))
                self.logger.debug("Retrieved the following works: \n%s",
                                  str(tasksInfo))

            toInject = []
            for task in pendingwork:
                if self.failBannedTask(task):
                    continue
                if self.updateWork(task['tm_taskname'],
                                   task['tm_task_command'], 'QUEUED'):
                    worktype, failstatus = STATE_ACTIONS_MAP[
                        task['tm_task_command']]
                    toInject.append((worktype, task, failstatus, None))
                else:
                    #The task stays in HOLDING and will be acquired again later
                    self.logger.info(
                        "Skipping %s since it could not be updated to QUEUED. Will be retried in the next iteration",
                        task['tm_taskname'])

            self.slaves.injectWorks(toInject)

            for action in self.recurringActions:
                if action.isTimeToGo():
                    #Maybe we should use new slaves and not reuse the ones used for the tasks
                    self.logger.debug("Injecting recurring action: \n%s",
                                      (str(action.__module__)))
                    self.slaves.injectWorks([(handleRecurring, {
                        'tm_username': '******',
                        'tm_taskname': action.__module__
                    }, 'FAILED', action.__module__)])

            self.logger.info('Master Worker status:')
            self.logger.info(' - free slaves: %d', self.slaves.freeSlaves())
            self.logger.info(' - acquired tasks: %d',
                             self.slaves.queuedTasks())
            self.logger.info(' - tasks pending in queue: %d',
                             self.slaves.pendingTasks())

            time.sleep(self.config.TaskWorker.polling)

            dummyFinished = self.slaves.checkFinished()

        self.logger.debug("Master Worker Exiting Main Cycle.")
Exemplo n.º 12
0
def processWorkerLoop(inputs, results, resthost, dbInstance, procnum, logger, logsDir):
    procName = "Process-%s" % procnum
    while True:
        try:
            ## Get (and remove) an item from the input queue. If the queue is empty, wait
            ## until an item is available. Item content is:
            ##  workid : an integer assigned by the queue module
            ##  work   : a function handler to the needed action e.g. function handleNewTask
            ##  task   : a task dictionary
            ##  failstatus : the status to assign to the task if work fails (e.g. 'SUBMITFAILED')
            workid, work, task, failstatus, inputargs = inputs.get()
            if work == 'STOP':
                break
            taskhandler = addTaskLogHandler(logger, task['tm_username'], task['tm_taskname'], logsDir)
        except (EOFError, IOError):
            crashMessage = "Hit EOF/IO in getting new work\n"
            crashMessage += "Assuming this is a graceful break attempt.\n"
            logger.error(crashMessage)
            break

        outputs = None
        t0 = time.time()
        #log entry below is used for logs parsing, therefore, changing it might require to update logstash configuration
        logger.debug("%s: Starting %s on %s", procName, str(work), task['tm_taskname'])
        try:
            msg = None
            outputs = work(resthost, dbInstance, WORKER_CONFIG, task, procnum, inputargs)
        except TapeDatasetException as tde:
            outputs = Result(task=task, err=str(tde))
        except WorkerHandlerException as we:
            outputs = Result(task=task, err=str(we))
            msg = str(we)
        except Exception as exc: #pylint: disable=broad-except
            outputs = Result(task=task, err=str(exc))
            msg = "%s: I just had a failure for %s" % (procName, str(exc))
            msg += "\n\tworkid=" + str(workid)
            msg += "\n\ttask=" + str(task['tm_taskname'])
            msg += "\n" + str(traceback.format_exc())
        finally:
            if msg:
                crabserver = CRABRest(resthost, WORKER_CONFIG.TaskWorker.cmscert, WORKER_CONFIG.TaskWorker.cmskey,
                                      retry=20, logger=logger, userAgent='CRABTaskWorker')
                crabserver.setDbInstance(dbInstance)
                failTask(task['tm_taskname'], crabserver, msg, logger, failstatus)
        t1 = time.time()
        workType = task.get('tm_task_command', 'RECURRING')
        #log entry below is used for logs parsing, therefore, changing it might require to update logstash configuration
        logger.debug("%s: %s work on %s completed in %d seconds: %s", procName, workType, task['tm_taskname'], t1-t0, outputs)

        try:
            out, _, _ = executeCommand("ps u -p %s | awk '{sum=sum+$6}; END {print sum/1024}'" % os.getpid())
            msg = "RSS after finishing %s: %s MB" % (task['tm_taskname'], out.strip())
            logger.debug(msg)
        except Exception:
            logger.exception("Problem getting worker RSS:")

        removeTaskLogHandler(logger, taskhandler)

        results.put({
                     'workid': workid,
                     'out' : outputs
                    })
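
To make the tuple layout unpacked at the top of processWorkerLoop() concrete, a small sketch of feeding it work follows; the queue, the stand-in handler and the task dictionary are assumptions for illustration, not the real Worker plumbing.

from multiprocessing import Queue

def echoWork(resthost, dbInstance, config, task, procnum, *args):
    # trivial stand-in for a real handler such as handleNewTask
    return "done: %s" % task['tm_taskname']

inputs = Queue()
task = {'tm_taskname': '230101_120000:jdoe_crab_mytask',   # hypothetical task dict
        'tm_username': 'jdoe',
        'tm_task_command': 'SUBMIT'}
# one unit of work: (workid, work, task, failstatus, inputargs)
inputs.put((1, echoWork, task, 'SUBMITFAILED', None))
# the loop above breaks when the 'work' element is the string 'STOP'
inputs.put((2, 'STOP', None, None, None))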
Exemplo n.º 13
0
def handleNewTask(resthost, dbInstance, config, task, procnum, *args,
                  **kwargs):
    """Performs the injection of a new task

    :arg str resthost: the hostname where the rest interface is running
    :arg str dbInstance: the rest base url to contact
    :arg WMCore.Configuration config: input configuration
    :arg TaskWorker.DataObjects.Task task: the task to work on
    :arg int procnum: the process number taking care of the work
    :*args and *kwargs: extra parameters currently not defined
    :return: the handler."""
    crabserver = CRABRest(resthost,
                          config.TaskWorker.cmscert,
                          config.TaskWorker.cmskey,
                          retry=20,
                          logger=logging.getLogger(str(procnum)),
                          userAgent='CRABTaskWorker',
                          version=__version__)
    crabserver.setDbInstance(dbInstance)
    handler = TaskHandler(task,
                          procnum,
                          crabserver,
                          config,
                          'handleNewTask',
                          createTempDir=True)
    rucioClient = getNativeRucioClient(config=config, logger=handler.logger)
    handler.addWork(
        MyProxyLogon(config=config,
                     crabserver=crabserver,
                     procnum=procnum,
                     myproxylen=60 * 60 * 24))
    handler.addWork(
        StageoutCheck(config=config,
                      crabserver=crabserver,
                      procnum=procnum,
                      rucioClient=rucioClient))
    if task['tm_job_type'] == 'Analysis':
        if task.get('tm_user_files'):
            handler.addWork(
                UserDataDiscovery(config=config,
                                  crabserver=crabserver,
                                  procnum=procnum))
        else:
            handler.addWork(
                DBSDataDiscovery(config=config,
                                 crabserver=crabserver,
                                 procnum=procnum,
                                 rucioClient=rucioClient))
    elif task['tm_job_type'] == 'PrivateMC':
        handler.addWork(
            MakeFakeFileSet(config=config,
                            crabserver=crabserver,
                            procnum=procnum))
    handler.addWork(
        Splitter(config=config, crabserver=crabserver, procnum=procnum))
    handler.addWork(
        DagmanCreator(config=config,
                      crabserver=crabserver,
                      procnum=procnum,
                      rucioClient=rucioClient))
    if task['tm_dry_run'] == 'T':
        handler.addWork(
            DryRunUploader(config=config,
                           crabserver=crabserver,
                           procnum=procnum))
    else:
        handler.addWork(
            DagmanSubmitter(config=config,
                            crabserver=crabserver,
                            procnum=procnum))

    return handler.actionWork(args, kwargs)
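
As a usage sketch only: driving handleNewTask() by hand with a minimal task dictionary. Only the keys the function inspects are filled in; the values and the configuration file path are placeholders, and a real invocation needs the full TaskWorker environment.

from WMCore.Configuration import loadConfigurationFile

# Illustrative call, not a supported entry point: real tasks carry many more fields.
config = loadConfigurationFile('/data/srv/TaskManager/current/TaskWorkerConfig.py')
task = {
    'tm_taskname': '230101_120000:jdoe_crab_mytask',  # hypothetical task name
    'tm_username': 'jdoe',                            # hypothetical username
    'tm_job_type': 'Analysis',                        # 'PrivateMC' selects MakeFakeFileSet
    'tm_user_files': [],                              # empty -> DBSDataDiscovery branch
    'tm_dry_run': 'F',                                # 'T' selects DryRunUploader
}
result = handleNewTask('cmsweb-testbed.cern.ch', 'dev', config, task, procnum=1)
print(result)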
Exemplo n.º 14
0
            self.logger.error(hte.headers)


if __name__ == '__main__':
    """ Simple main to execute the action standalone. You just need to set the task worker environment.
        The main is set up to work with the production task worker. If you want to use it on your own
        instance you need to change resthost, dbinstance, and twconfig.
    """
    resthost = 'cmsweb.cern.ch'
    dbinstance = 'prod'
    twconfig = '/data/srv/TaskManager/current/TaskWorkerConfig.py'

    logger = logging.getLogger()
    handler = logging.StreamHandler(sys.stdout)
    formatter = logging.Formatter("%(asctime)s:%(levelname)s:%(module)s %(message)s", datefmt="%a, %d %b %Y %H:%M:%S %Z(%z)")
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    logger.setLevel(logging.DEBUG)

    from WMCore.Configuration import loadConfigurationFile
    cfg = loadConfigurationFile(twconfig)

    resthost = 'cmsweb-testbed.cern.ch'
    dbinstance = 'dev'
    fmdc = FMDCleaner(cfg.TaskWorker.logsDir)
    crabserver = CRABRest(hostname=resthost, localcert=cfg.TaskWorker.cmscert,
                          localkey=cfg.TaskWorker.cmskey, retry=2,
                          logger=logger)
    crabserver.setDbInstance(dbInstance=dbinstance)
    fmdc._execute(crabserver)
Exemplo n.º 15
0
def main():
    """
    Need a doc string here.
    """
    setupLog()

    if '_CONDOR_JOB_AD' not in os.environ or not os.path.exists(
            os.environ["_CONDOR_JOB_AD"]):
        printLog(
            "Exiting AdjustSites since _CONDOR_JOB_AD is not in the environment or does not exist"
        )
        sys.exit(0)

    printLog("Starting AdjustSites with _CONDOR_JOB_AD=%s" %
             os.environ['_CONDOR_JOB_AD'])

    with open(os.environ['_CONDOR_JOB_AD']) as fd:
        ad = classad.parseOne(fd)
    printLog("Parsed ad: %s" % ad)

    # instantiate a server object to talk with crabserver
    host = ad['CRAB_RestHost']
    dbInstance = ad['CRAB_DbInstance']
    cert = ad['X509UserProxy']
    crabserver = CRABRest(host, cert, cert, retry=3, userAgent='CRABSchedd')
    crabserver.setDbInstance(dbInstance)

    checkTaskInfo(crabserver, ad)

    # is this the first time this script runs for this task ? (it runs at each resubmit as well !)
    if not os.path.exists('WEB_DIR'):
        makeWebDir(ad)
        printLog(
            "Webdir has been set up. Uploading the webdir URL to the REST")

        retries = 0
        exitCode = 1
        maxRetries = 3
        while retries < maxRetries and exitCode != 0:
            exitCode = uploadWebDir(crabserver, ad)
            if exitCode != 0:
                time.sleep(retries * 20)
            retries += 1
        if exitCode != 0:
            printLog(
                "Exiting AdjustSites because the webdir upload failed %d times."
                % maxRetries)
            sys.exit(1)
        printLog(
            "Webdir URL has been uploaded, exit code is %s. Setting the classad for the proxied webdir"
            % exitCode)

        saveProxiedWebdir(crabserver, ad)
        printLog("Proxied webdir saved")

    printLog(
        "Clearing the automatic blacklist and handling RunJobs.dag.nodes.log for resubmissions"
    )

    clearAutomaticBlacklist()

    resubmitJobIds = []
    if 'CRAB_ResubmitList' in ad:
        resubmitJobIds = ad['CRAB_ResubmitList']
        try:
            resubmitJobIds = set(resubmitJobIds)
            resubmitJobIds = [str(i) for i in resubmitJobIds]
        except TypeError:
            resubmitJobIds = True

    # Hold and release processing and tail DAGs here so that modifications
    # to the submission and log files will be picked up.
    schedd = htcondor.Schedd()
    tailconst = "TaskType =?= \"TAIL\" && CRAB_ReqName =?= %s" % classad.quote(
        ad.get("CRAB_ReqName"))
    if resubmitJobIds and ad.get('CRAB_SplitAlgo') == 'Automatic':
        printLog("Holding processing and tail DAGs")
        schedd.edit(tailconst, "HoldKillSig", 'SIGKILL')
        schedd.act(htcondor.JobAction.Hold, tailconst)

    if resubmitJobIds:
        adjustedJobIds = []
        filenames = getGlob(ad, "RunJobs.dag.nodes.log",
                            "RunJobs[1-9]*.subdag.nodes.log")
        for fn in filenames:
            if hasattr(htcondor, 'lock'):
                # While dagman is not running at this point, the schedd may be writing events to this
                # file; hence, we only edit the file while holding an appropriate lock.
                # Note this lock method didn't exist until 8.1.6; prior to this, we simply
                # run dangerously.
                with htcondor.lock(open(fn, 'a'), htcondor.LockType.WriteLock):
                    adjustedJobIds.extend(
                        adjustPostScriptExitStatus(resubmitJobIds, fn))
            else:
                adjustedJobIds.extend(
                    adjustPostScriptExitStatus(resubmitJobIds, fn))
        ## Adjust the maximum allowed number of retries only for the job ids for which
        ## the POST script exit status was adjusted. Why only for these job ids and not
        ## for all job ids in resubmitJobIds? Because if resubmitJobIds = True, which as
        ## a general rule means "all failed job ids", we don't have a way to know if a
        ## job is in failed status or not just from the RunJobs.dag file, while job ids
        ## in adjustedJobIds correspond only to failed jobs.
        adjustMaxRetries(adjustedJobIds, ad)

    if 'CRAB_SiteAdUpdate' in ad:
        newSiteAd = ad['CRAB_SiteAdUpdate']
        with open("site.ad") as fd:
            siteAd = classad.parseOne(fd)
        siteAd.update(newSiteAd)
        with open("site.ad", "w") as fd:
            fd.write(str(siteAd))

    if resubmitJobIds and ad.get('CRAB_SplitAlgo') == 'Automatic':
        printLog("Releasing processing and tail DAGs")
        schedd.edit(tailconst, "HoldKillSig", 'SIGUSR1')
        schedd.act(htcondor.JobAction.Release, tailconst)

    printLog("Exiting AdjustSite")
Exemplo n.º 16
0
def algorithm():
    """
    script algorithm
    - instantiates FTS3 python easy client
    - delegate user proxy to fts if needed
    - check for fts jobs to monitor and update states in oracle
    - get last line from last_transfer.txt
    - gather the list of files to transfer
        + group by source
        + submit ftsjob and save fts jobid
        + update info in oracle
    - append new fts job ids to fts_jobids.txt
    """

    logging.info("using user's proxy from %s", proxy)
    ftsContext = fts3.Context(FTS_ENDPOINT, proxy, proxy, verify=True)
    logging.info("Delegating proxy to FTS...")
    delegationId = fts3.delegate(
        ftsContext,
        lifetime=timedelta(hours=48),  # pylint: disable=unused-variable
        delegate_when_lifetime_lt=timedelta(hours=24),
        force=False)
    # we never had problems with delegation since we put proper delegate_when in the above line
    # but if need to check delegation arise, it can be done with a query like
    # curl --cert proxy --key proxy https://fts3-cms.cern.ch:8446/delegation/<delegationId>
    # see: https://fts3-docs.web.cern.ch/fts3-docs/fts-rest/docs/api.html#get-delegationdlgid

    # instantiate an object to talk with CRAB REST server
    try:
        crabserver = CRABRest(restInfo['host'],
                              localcert=proxy,
                              localkey=proxy,
                              userAgent='CRABSchedd')
        crabserver.setDbInstance(restInfo['dbInstance'])
    except Exception:
        logging.exception("Failed to set connection to crabserver")
        return

    with open("task_process/transfers.txt") as _list:
        _data = _list.readlines()[0]
        try:
            doc = json.loads(_data)
            username = doc["username"]
            taskname = doc["taskname"]
            destination = doc["destination"]
        except Exception as ex:
            msg = "Username gathering failed with\n%s" % str(ex)
            logging.warning(msg)
            raise ex

    try:
        logging.info("Initializing Rucio client")
        os.environ["X509_USER_PROXY"] = proxy
        os.environ["X509_USER_PROXY"] = proxy
        rucioClient = rucioclient(account=username, auth_type='x509_proxy')
    except Exception as exc:
        msg = "Rucio initialization failed with\n%s" % str(exc)
        logging.warning(msg)
        raise exc

    jobs_ongoing = state_manager(ftsContext, crabserver)
    new_jobs = submission_manager(rucioClient, ftsContext, crabserver)

    logging.info("Transfer jobs ongoing: %s, new: %s ", jobs_ongoing, new_jobs)

    return
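
The comments above point to a plain curl query for checking the delegation by hand; here is a Python equivalent under the same assumptions (the endpoint path is copied from that comment, and the shape of the JSON reply is not guaranteed here).

import requests

def checkDelegation(delegationId, proxy, ftsRest="https://fts3-cms.cern.ch:8446"):
    # GET <ftsRest>/delegation/<delegationId> authenticating with the user proxy,
    # mirroring the curl example quoted in the comments above.
    url = "%s/delegation/%s" % (ftsRest, delegationId)
    resp = requests.get(url, cert=(proxy, proxy), verify=True)
    resp.raise_for_status()
    return resp.json()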
Exemplo n.º 17
0
def publishInDBS3(config, taskname, verbose):
    """
    Publish output from one task in DBS
    """

    def mark_good(files, crabServer, logger):
        """
        Mark the list of files as published
        """

        msg = "Marking %s file(s) as published." % len(files)
        logger.info(msg)
        if dryRun:
            logger.info("DryRun: skip marking good file")
            return

        nMarked = 0
        for lfn in files:
            data = {}
            source_lfn = lfn
            docId = getHashLfn(source_lfn)
            data['asoworker'] = config.General.asoworker
            data['subresource'] = 'updatePublication'
            data['list_of_ids'] = [docId]
            data['list_of_publication_state'] = ['DONE']
            data['list_of_retry_value'] = [1]
            data['list_of_failure_reason'] = ['']

            try:
                result = crabServer.post(api='filetransfers', data=encodeRequest(data))
                logger.debug("updated DocumentId: %s lfn: %s Result %s", docId, source_lfn, result)
            except Exception as ex:
                logger.error("Error updating status for DocumentId: %s lfn: %s", docId, source_lfn)
                logger.error("Error reason: %s", ex)

            nMarked += 1
            if nMarked % 10 == 0:
                logger.info('marked %d files', nMarked)

    def mark_failed(files, crabServer, logger, failure_reason=""):
        """
        Something failed for these files so increment the retry count
        """
        msg = "Marking %s file(s) as failed" % len(files)
        logger.info(msg)
        if dryRun:
            logger.debug("DryRun: skip marking failes files")
            return

        nMarked = 0
        for lfn in files:
            source_lfn = lfn
            docId = getHashLfn(source_lfn)
            data = dict()
            data['asoworker'] = config.General.asoworker
            data['subresource'] = 'updatePublication'
            data['list_of_ids'] = [docId]
            data['list_of_publication_state'] = ['FAILED']
            data['list_of_retry_value'] = [1]
            data['list_of_failure_reason'] = [failure_reason]

            logger.debug("data: %s ", data)
            try:
                result = crabServer.post(api='filetransfers', data=encodeRequest(data))
                logger.debug("updated DocumentId: %s lfn: %s Result %s", docId, source_lfn, result)
            except Exception as ex:
                logger.error("Error updating status for DocumentId: %s lfn: %s", docId, source_lfn)
                logger.error("Error reason: %s", ex)

            nMarked += 1
            if nMarked % 10 == 0:
                logger.info('marked %d files', nMarked)

    def createLogdir(dirname):
        """
        Create the directory dirname ignoring errors in case it exists. Exit if
        the directory cannot be created.
        """
        try:
            os.mkdir(dirname)
        except OSError as ose:
            if ose.errno != 17: #ignore the "Directory already exists error"
                print(str(ose))
                print("The task worker need to access the '%s' directory" % dirname)
                sys.exit(1)
        return

    def saveSummaryJson(logdir, summary):
        """
        Save a publication summary as JSON. Make a new file every time this script runs
        :param summary: a summary dictionary. Must at least have key 'taskname'
        :param logdir: the directory where to write the summary
        :return: the full path name of the written file
        """
        taskname = summary['taskname']
        counter = 1
        summaryFileName = os.path.join(logdir, taskname + '-1.json')
        while os.path.exists(summaryFileName):
            counter += 1
            summaryFileName = os.path.join(logdir, taskname + '-%d.json' % counter)
        with open(summaryFileName, 'w') as fd:
            json.dump(summary, fd)
        return summaryFileName

    taskFilesDir = config.General.taskFilesDir
    dryRun = config.TaskPublisher.dryRun
    username = taskname.split(':')[1].split('_')[0]
    logdir = os.path.join(config.General.logsDir, 'tasks', username)
    logfile = os.path.join(logdir, taskname + '.log')
    createLogdir(logdir)
    migrationLogDir = os.path.join(config.General.logsDir, 'migrations')
    createLogdir(migrationLogDir)
    logger = logging.getLogger(taskname)
    logging.basicConfig(filename=logfile, level=logging.INFO, format=config.TaskPublisher.logMsgFormat)
    if verbose:
        logger.setLevel(logging.DEBUG)

    logger.info("Start new iteration on taskname:  %s\nGet files to publish", taskname)

    # prepare a dummy summary JSON file in case there's nothing to do
    nothingToDo = {}
    nothingToDo['taskname'] = taskname
    nothingToDo['result'] = 'OK'
    nothingToDo['reason'] = 'NOTHING TO DO'
    nothingToDo['publishedBlocks'] = 0
    nothingToDo['failedBlocks'] = 0
    nothingToDo['failedBlockDumps'] = []
    nothingToDo['publishedFiles'] = 0
    nothingToDo['failedFiles'] = 0
    nothingToDo['nextIterFiles'] = 0

    toPublish = []
    # TODO move from new to done when processed
    fname = taskFilesDir + taskname + ".json"
    with open(fname) as f:
        toPublish = json.load(f)

    if not toPublish:
        logger.info("Empty data file %s", fname)
        summaryFileName = saveSummaryJson(logdir, nothingToDo)
        return summaryFileName

    pnn = toPublish[0]["Destination"]
    dataset = toPublish[0]['outdataset']
    logger.info("Will publish user files in %s", dataset)

    # CRABServer REST API's (see CRABInterface)
    try:
        instance = config.General.instance
    except:
        msg = "No instance provided: need to specify config.General.instance in the configuration"
        raise ConfigException(msg)

    if instance in SERVICE_INSTANCES:
        logger.info('Will connect to CRAB service: %s', instance)
        restHost = SERVICE_INSTANCES[instance]['restHost']
        dbInstance = SERVICE_INSTANCES[instance]['dbInstance']
    else:
        msg = "Invalid instance value '%s'" % instance
        raise ConfigException(msg)
    if instance == 'other':
        logger.info('Will use restHost and dbInstance from config file')
        try:
            restHost = config.General.restHost
            dbInstance = config.General.dbInstance
        except:
            msg = "Need to specify config.General.restHost and dbInstance in the configuration"
            raise ConfigException(msg)

    restURInoAPI = '/crabserver/' + dbInstance
    logger.info('Will connect to CRAB Data Base via URL: https://%s/%s', restHost, restURInoAPI)

    # CRAB REST API's
    crabServer = CRABRest(hostname=restHost, localcert=config.General.serviceCert,
                          localkey=config.General.serviceKey, retry=3,
                          userAgent='CRABPublisher')
    crabServer.setDbInstance(dbInstance=dbInstance)

    data = dict()
    data['subresource'] = 'search'
    data['workflow'] = taskname

    try:
        results = crabServer.get(api='task', data=encodeRequest(data))
    except Exception as ex:
        logger.error("Failed to get acquired publications from oracleDB: %s", ex)
        nothingToDo['result'] = 'FAIL'
        nothingToDo['reason'] = 'Error contacting CRAB REST'
        summaryFileName = saveSummaryJson(logdir, nothingToDo)
        return summaryFileName

    if verbose:
        logger.info(results[0]['desc']['columns'])

    try:
        inputDatasetIndex = results[0]['desc']['columns'].index("tm_input_dataset")
        inputDataset = results[0]['result'][inputDatasetIndex]
        sourceURLIndex = results[0]['desc']['columns'].index("tm_dbs_url")
        sourceURL = results[0]['result'][sourceURLIndex]
        publish_dbs_urlIndex = results[0]['desc']['columns'].index("tm_publish_dbs_url")
        publish_dbs_url = results[0]['result'][publish_dbs_urlIndex]

        if not sourceURL.endswith("/DBSReader") and not sourceURL.endswith("/DBSReader/"):
            sourceURL += "/DBSReader"
    except Exception:
        logger.exception("ERROR")

    # When looking up parents may need to look in global DBS as well.
    globalURL = sourceURL
    globalURL = globalURL.replace('phys01', 'global')
    globalURL = globalURL.replace('phys02', 'global')
    globalURL = globalURL.replace('phys03', 'global')
    globalURL = globalURL.replace('caf', 'global')

    # allow to use a DBS REST host different from cmsweb.cern.ch (which is the
    # default inserted by CRAB Client)
    sourceURL = sourceURL.replace('cmsweb.cern.ch', config.TaskPublisher.DBShost)
    globalURL = globalURL.replace('cmsweb.cern.ch', config.TaskPublisher.DBShost)
    publish_dbs_url = publish_dbs_url.replace('cmsweb.cern.ch', config.TaskPublisher.DBShost)

    # DBS client relies on X509 env. vars
    os.environ['X509_USER_CERT'] = config.General.serviceCert
    os.environ['X509_USER_KEY'] = config.General.serviceKey

    # create DBS API objects
    logger.info("DBS Source API URL: %s", sourceURL)
    sourceApi = dbsClient.DbsApi(url=sourceURL)
    logger.info("DBS Global API URL: %s", globalURL)
    globalApi = dbsClient.DbsApi(url=globalURL)

    if publish_dbs_url.endswith('/DBSWriter'):
        publish_read_url = publish_dbs_url[:-len('/DBSWriter')] + '/DBSReader'
        publish_migrate_url = publish_dbs_url[:-len('/DBSWriter')] + '/DBSMigrate'
    else:
        publish_migrate_url = publish_dbs_url + '/DBSMigrate'
        publish_read_url = publish_dbs_url + '/DBSReader'
        publish_dbs_url += '/DBSWriter'
    try:
        logger.info("DBS Destination API URL: %s", publish_dbs_url)
        destApi = dbsClient.DbsApi(url=publish_dbs_url)
        logger.info("DBS Destination read API URL: %s", publish_read_url)
        destReadApi = dbsClient.DbsApi(url=publish_read_url)
        logger.info("DBS Migration API URL: %s", publish_migrate_url)
        migrateApi = dbsClient.DbsApi(url=publish_migrate_url)
    except Exception:
        logger.exception('Wrong DBS URL %s', publish_dbs_url)
        nothingToDo['result'] = 'FAIL'
        nothingToDo['reason'] = 'Error contacting DBS'
        summaryFileName = saveSummaryJson(logdir, nothingToDo)
        return summaryFileName

    logger.info("inputDataset: %s", inputDataset)
    noInput = len(inputDataset.split("/")) <= 3

    if not noInput:
        try:
            existing_datasets = sourceApi.listDatasets(dataset=inputDataset, detail=True, dataset_access_type='*')
            primary_ds_type = existing_datasets[0]['primary_ds_type']
            # There's little chance this is correct, but it's our best guess for now.
            # CRAB2 uses 'crab2_tag' for all cases
            existing_output = destReadApi.listOutputConfigs(dataset=inputDataset)
        except Exception:
            logger.exception('Wrong DBS URL %s', publish_dbs_url)
            nothingToDo['result'] = 'FAIL'
            nothingToDo['reason'] = 'Error looking up input dataset in DBS'
            summaryFileName = saveSummaryJson(logdir, nothingToDo)
            return summaryFileName
        if not existing_output:
            msg = "Unable to list output config for input dataset %s." % (inputDataset)
            logger.error(msg)
            global_tag = 'crab3_tag'
        else:
            global_tag = existing_output[0]['global_tag']
    else:
        msg = "This publication appears to be for private MC."
        logger.info(msg)
        primary_ds_type = 'mc'
        global_tag = 'crab3_tag'

    acquisition_era_name = "CRAB"
    processing_era_config = {'processing_version': 1, 'description': 'CRAB3_processing_era'}

    appName = 'cmsRun'
    appVer = toPublish[0]["swversion"]
    pset_hash = toPublish[0]['publishname'].split("-")[-1]
    gtag = str(toPublish[0]['globaltag'])
    if gtag == "None":
        gtag = global_tag
    try:
        if toPublish[0]['acquisitionera'] and not toPublish[0]['acquisitionera'] in ["null"]:
            acquisitionera = str(toPublish[0]['acquisitionera'])
        else:
            acquisitionera = acquisition_era_name
    except Exception:
        acquisitionera = acquisition_era_name

    _, primName, procName, tier = toPublish[0]['outdataset'].split('/')

    primds_config = {'primary_ds_name': primName, 'primary_ds_type': primary_ds_type}
    msg = "About to insert primary dataset"
    logger.debug(msg)
    if dryRun:
        logger.info("DryRun: skip insertPrimaryDataset")
    else:
        destApi.insertPrimaryDataset(primds_config)
        msg = "Successfully inserted primary dataset %s." % (primName)
        logger.info(msg)

    final = {}
    failed = []
    publish_in_next_iteration = []
    published = []

    # Find all files already published in this dataset.
    try:
        existingDBSFiles = destReadApi.listFiles(dataset=dataset, detail=True)
        existingFiles = [f['logical_file_name'] for f in existingDBSFiles]
        existingFilesValid = [f['logical_file_name'] for f in existingDBSFiles if f['is_file_valid']]
        msg = "Dataset %s already contains %d files" % (dataset, len(existingFiles))
        msg += " (%d valid, %d invalid)." % (len(existingFilesValid), len(existingFiles) - len(existingFilesValid))
        logger.info(msg)
        final['existingFiles'] = len(existingFiles)
    except Exception as ex:
        msg = "Error when listing files in DBS: %s" % (str(ex))
        msg += "\n%s" % (str(traceback.format_exc()))
        logger.error(msg)
        nothingToDo['result'] = 'FAIL'
        nothingToDo['reason'] = 'Error listing existing files in DBS'
        summaryFileName = saveSummaryJson(logdir, nothingToDo)
        return summaryFileName

    # check if actions are needed
    workToDo = False

    for fileTo in toPublish:
        #print(existingFilesValid)
        if fileTo['lfn'] not in existingFiles:
            workToDo = True
            break

    if not workToDo:
        msg = "Nothing uploaded, output dataset has these files already."
        logger.info(msg)
        logger.info('Make sure those files are marked as Done')
        # docId is the hash of the source LFN, i.e. the file in the tmp area at the running site
        files = [f['SourceLFN'] for f in toPublish]
        mark_good(files, crabServer, logger)
        summaryFileName = saveSummaryJson(logdir, nothingToDo)
        return summaryFileName

    acquisition_era_config = {'acquisition_era_name': acquisitionera, 'start_date': 0}

    output_config = {'release_version': appVer,
                     'pset_hash': pset_hash,
                     'app_name': appName,
                     'output_module_label': 'o',
                     'global_tag': global_tag,
                    }

    dataset_config = {'dataset': dataset,
                      'processed_ds_name': procName,
                      'data_tier_name': tier,
                      'dataset_access_type': 'VALID',
                      'physics_group_name': 'CRAB3',
                      'last_modification_date': int(time.time()),
                     }
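    # The primds/processing_era/acquisition_era/output/dataset config dictionaries built above
    # are combined further down by createBulkBlock() into the single "block dump" document
    # passed to destApi.insertBulkBlock() (the DBS3 bulk-insert schema).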

    logger.info("Output dataset config: %s", str(dataset_config))

    # List of all files that must (and can) be published.
    dbsFiles = []
    dbsFiles_f = []
    # Set of all the parent files from all the files requested to be published.
    parentFiles = set()
    # Set of parent files for which the migration to the destination DBS instance
    # should be skipped (because they were not found in DBS).
    parentsToSkip = set()
    # Set of parent files to migrate from the source DBS instance
    # to the destination DBS instance.
    localParentBlocks = set()
    # Set of parent files to migrate from the global DBS instance
    # to the destination DBS instance.
    globalParentBlocks = set()

    # Loop over all files to publish.
    for file_ in toPublish:
        if verbose:
            logger.info(file_)
        # Check if this file was already published and if it is valid.
        if file_['lfn'] not in existingFilesValid:
            # We have a file to publish.
            # Get the parent files and for each parent file do the following:
            # 1) Add it to the list of parent files.
            # 2) Find the block to which it belongs and insert that block name in
            #    (one of) the set of blocks to be migrated to the destination DBS.
            for parentFile in list(file_['parents']):
                if parentFile not in parentFiles:
                    parentFiles.add(parentFile)
                    # Is this parent file already in the destination DBS instance?
                    # (If yes, then we don't have to migrate this block.)
                    blocksDict = destReadApi.listBlocks(logical_file_name=parentFile)
                    if not blocksDict:
                        # No, this parent file is not in the destination DBS instance.
                        # Maybe it is in the same DBS instance as the input dataset?
                        blocksDict = sourceApi.listBlocks(logical_file_name=parentFile)
                        if blocksDict:
                            # Yes, this parent file is in the same DBS instance as the input dataset.
                            # Add the corresponding block to the set of blocks from the source DBS
                            # instance that have to be migrated to the destination DBS.
                            localParentBlocks.add(blocksDict[0]['block_name'])
                        else:
                            # No, this parent file is not in the same DBS instance as input dataset.
                            # Maybe it is in global DBS instance?
                            blocksDict = globalApi.listBlocks(logical_file_name=parentFile)
                            if blocksDict:
                                # Yes, this parent file is in global DBS instance.
                                # Add the corresponding block to the set of blocks from global DBS
                                # instance that have to be migrated to the destination DBS.
                                globalParentBlocks.add(blocksDict[0]['block_name'])
                    # If this parent file is not in the destination DBS instance, is not
                    # the source DBS instance, and is not in global DBS instance, then it
                    # means it is not known to DBS and therefore we can not migrate it.
                    # Put it in the set of parent files for which migration should be skipped.
                    if not blocksDict:
                        parentsToSkip.add(parentFile)
                # If this parent file should not be migrated because it is not known to DBS,
                # we remove it from the list of parents in the file-to-publish info dictionary
                # (so that when publishing, this "parent" file will not appear as a parent).
                if parentFile in parentsToSkip:
                    msg = "Skipping parent file %s, as it doesn't seem to be known to DBS." % (parentFile)
                    logger.info(msg)
                    if parentFile in file_['parents']:
                        file_['parents'].remove(parentFile)
            # Add this file to the list of files to be published.
            dbsFiles.append(format_file_3(file_))
            dbsFiles_f.append(file_)
        #print file
        published.append(file_['SourceLFN'])
        #published.append(file_['lfn'].replace("/store","/store/temp"))

    # Print a message with the number of files to publish.
    msg = "Found %d files not already present in DBS which will be published." % (len(dbsFiles))
    logger.info(msg)

    # If there are no files to publish, continue with the next dataset.
    if not dbsFiles_f:
        msg = "No file to publish to do for this dataset."
        logger.info(msg)
        summaryFileName = saveSummaryJson(logdir, nothingToDo)
        return summaryFileName

    # Migrate parent blocks before publishing.
    # First migrate the parent blocks that are in the same DBS instance
    # as the input dataset.
    if localParentBlocks:
        msg = "List of parent blocks that need to be migrated from %s:\n%s" % (sourceApi.url, localParentBlocks)
        logger.info(msg)
        if dryRun:
            logger.info("DryRun: skipping migration request")
        else:
            statusCode, failureMsg = migrateByBlockDBS3(taskname, migrateApi, destReadApi, sourceApi,
                                                        inputDataset, localParentBlocks, migrationLogDir, verbose)
            if statusCode:
                failureMsg += " Not publishing any files."
                logger.info(failureMsg)
                summaryFileName = saveSummaryJson(logdir, nothingToDo)
                return summaryFileName
    # Then migrate the parent blocks that are in the global DBS instance.
    if globalParentBlocks:
        msg = "List of parent blocks that need to be migrated from %s:\n%s" % (globalApi.url, globalParentBlocks)
        logger.info(msg)
        if dryRun:
            logger.info("DryRun: skipping migration request")
        else:
            statusCode, failureMsg = migrateByBlockDBS3(taskname, migrateApi, destReadApi, globalApi,
                                                        inputDataset, globalParentBlocks, migrationLogDir, verbose)
            if statusCode:
                failureMsg += " Not publishing any files."
                logger.info(failureMsg)
                summaryFileName = saveSummaryJson(logdir, nothingToDo)
                return summaryFileName
    # Publish the files in blocks. The blocks must have exactly max_files_per_block
    # files, unless there are less than max_files_per_block files to publish to
    # begin with. If there are more than max_files_per_block files to publish,
    # publish as many blocks as possible and leave the tail of files for the next
    # PublisherWorker call, unless forced to publish.
    block_count = 0
    count = 0
    publishedBlocks = 0
    failedBlocks = 0
    max_files_per_block = config.General.max_files_per_block
    dumpList = []   # keep a list of files where blocks which fail publication are dumped
    while True:
        blockDump = None  # guard against a NameError in the except clause if createBulkBlock fails
        block_name = "%s#%s" % (dataset, str(uuid.uuid4()))
        files_to_publish = dbsFiles[count:count+max_files_per_block]
        try:
            block_config = {'block_name': block_name, 'origin_site_name': pnn, 'open_for_writing': 0}
            if verbose:
                msg = "Inserting files %s into block %s." % ([f['logical_file_name']
                                                              for f in files_to_publish], block_name)
                logger.info(msg)
            blockDump = createBulkBlock(output_config, processing_era_config,
                                        primds_config, dataset_config,
                                        acquisition_era_config, block_config, files_to_publish)
            #logger.debug("Block to insert: %s\n %s" % (blockDump, destApi.__dict__ ))

            if dryRun:
                logger.info("DryRun: skip insertBulkBlock")
            else:
                destApi.insertBulkBlock(blockDump)
            block_count += 1
            publishedBlocks += 1
        except Exception as ex:
            #logger.error("Error for files: %s" % [f['SourceLFN'] for f in toPublish])
            logger.error("Error for files: %s", [f['lfn'] for f in toPublish])
            failed.extend([f['SourceLFN'] for f in toPublish])
            #failed.extend([f['lfn'].replace("/store","/store/temp") for f in toPublish])
            msg = "Error when publishing (%s) " % ", ".join(failed)
            msg += str(ex)
            msg += str(traceback.format_exc())
            logger.error(msg)
            failure_reason = str(ex)
            taskFilesDir = config.General.taskFilesDir
            fname = os.path.join(taskFilesDir, 'FailedBlocks', 'failed-block-at-%s.txt' % time.time())
            with open(fname, 'w') as fd:
                fd.write(pprint.pformat(blockDump))
            dumpList.append(fname)
            failedBlocks += 1
            logger.error("FAILING BLOCK DUE TO %s SAVED AS %s", str(ex), fname)
        count += max_files_per_block
        files_to_publish_next = dbsFiles_f[count:count+max_files_per_block]
        if len(files_to_publish_next) < max_files_per_block:
            publish_in_next_iteration.extend([f["SourceLFN"] for f in files_to_publish_next])
            #publish_in_next_iteration.extend([f["lfn"].replace("/store","/store/temp") for f in files_to_publish_next])
            break
    published = [x for x in published if x not in failed + publish_in_next_iteration]
    # Fill number of files/blocks published for this dataset.
    final['files'] = len(dbsFiles) - len(failed) - len(publish_in_next_iteration)
    final['blocks'] = block_count
    # Print a publication status summary for this dataset.
    msg = "End of publication status:"
    msg += " failed %s" % len(failed)
    if verbose:
        msg += ": %s" % failed
    msg += ", published %s" % len(published)
    if verbose:
        msg += ": %s" % published
    msg += ", publish_in_next_iteration %s" % len(publish_in_next_iteration)
    if verbose:
        msg += ": %s" % publish_in_next_iteration
    msg += ", results %s" % (final)
    logger.info(msg)

    try:
        if published:
            mark_good(published, crabServer, logger)
            data['workflow'] = taskname
            data['subresource'] = 'updatepublicationtime'
            crabServer.post(api='task', data=encodeRequest(data))
        if failed:
            logger.debug("Failed files: %s ", failed)
            mark_failed(failed, crabServer, logger, failure_reason)
    except Exception as ex:
        logger.exception("Status update failed: %s", ex)

    summary = {}
    summary['taskname'] = taskname
    summary['result'] = 'OK' if not failed else 'FAIL'
    summary['reason'] = '' if not failed else 'DBS Publication Failure'
    summary['publishedBlocks'] = publishedBlocks
    summary['failedBlocks'] = failedBlocks
    summary['failedBlockDumps'] = dumpList
    summary['publishedFiles'] = len(published)
    summary['failedFiles'] = len(failed)
    summary['nextIterFiles'] = len(publish_in_next_iteration)

    summaryFileName = saveSummaryJson(logdir, summary)

    return summaryFileName
Example No. 18
class CRAB3CreateJson(object):
    def __init__(self, resthost, jsonDoc, logger=None):
        if not logger:
            self.logger = logging.getLogger(__name__)
            handler = logging.StreamHandler(sys.stdout)
            formatter = logging.Formatter(
                "%(asctime)s:%(levelname)s:%(module)s %(message)s")
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)
            self.logger.setLevel(logging.DEBUG)
        else:
            self.logger = logger

        self.jsonDoc = jsonDoc
        self.pool = ''
        self.schedds = []
        # the production REST host is hardcoded here and overrides the resthost argument,
        # which is still used below to build the CRABRest client
        self.resthost = "cmsweb.cern.ch:8443"
        self.crabserver = CRABRest(hostname=resthost,
                                   localcert='/data/certs/servicecert.pem',
                                   localkey='/data/certs/servicekey.pem',
                                   retry=10,
                                   userAgent='CRABTaskWorker')
        self.crabserver.setDbInstance(dbInstance='prod')
        # use child collector on port 9620 to get schedd attributes
        collName = "cmsgwms-collector-global.cern.ch:9620,cmsgwms-collector-global.fnal.gov:9620"
        self.coll = htcondor.Collector(collName)

    def getCountTasksByStatus(self):
        try:
            configreq = {'minutes': "120", 'subresource': "counttasksbystatus"}
            result = self.crabserver.get(api='task', data=configreq)
            return dict(result[0]['result'])
        except Exception as e:
            if hasattr(e, "headers"):
                self.logger.error(str(e.headers))
            self.logger.exception("Error in getCountTasksByStatus:\n%s", e)
            return {}

    def getCountTasksByStatusAbs(self):
        try:
            configreq = {
                'minutes': "144000",
                'subresource': "counttasksbystatus"
            }  # query last 100 days only
            result = self.crabserver.get(api='task', data=configreq)
            return dict(result[0]['result'])
        except Exception as e:
            if hasattr(e, "headers"):
                self.logger.error(str(e.headers))
            self.logger.exception("Error in getCountTasksByStatusAbs:\n%s", e)
            return {}

    def getScheddsInfo(self):
        data = []
        try:
            result = self.coll.query(
                htcondor.AdTypes.Schedd, 'CMSGWMS_Type=?="crabschedd"', [
                    'Name', 'ShadowsRunning', 'TotalSchedulerJobsRunning',
                    'TotalIdleJobs', 'TotalRunningJobs', 'TotalHeldJobs',
                    'TotalJobAds'
                ])
            for schedd in result:
                data.append([
                    schedd['Name'], schedd['ShadowsRunning'],
                    schedd['TotalSchedulerJobsRunning'],
                    schedd['TotalIdleJobs'], schedd['TotalRunningJobs'],
                    schedd['TotalHeldJobs'], schedd['TotalJobAds']
                ])
        except Exception as e:
            self.logger.debug("Error in getShadowsRunning: %s", e)
        return data

    def execute(self):
        subprocesses_config = 6
        # expected processes: 5 worker subprocesses + 1 MasterWorker process
        sub_grep_command = "ps -ef | grep MasterWorker | grep -v 'grep' | wc -l"
        # If any subprocess is dead or not working, report the service as degraded
        process_count = int(
            subprocess.Popen(sub_grep_command,
                             shell=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT).stdout.read())

        if subprocesses_config == process_count:
            # This means that everything is fine
            self.jsonDoc['status'] = "available"
        else:
            self.jsonDoc['status'] = "degraded"

        # Get the number of tasks per status
        twStatus = self.getCountTasksByStatus()

        states_filter = [
            'SUBMITTED', 'FAILED', 'QUEUED', 'NEW', 'KILLED', 'KILLFAILED',
            'RESUBMITFAILED', 'SUBMITFAILED', 'TAPERECALL'
        ]
        if len(twStatus) > 0:
            for state in twStatus.keys():
                if state in states_filter:
                    self.jsonDoc['current_task_states'].update(
                        {str(state): int(twStatus[state])})

        # get the absolute number of tasks per status
        twStatus = self.getCountTasksByStatusAbs()

        states_filter = [
            'KILL', 'RESUBMIT', 'NEW', 'QUEUED', 'KILLFAILED',
            'RESUBMITFAILED', 'SUBMITFAILED', 'UPLOADED', 'TAPERECALL'
        ]
        if len(twStatus) > 0:
            for state in twStatus.keys():
                if state in states_filter:
                    self.jsonDoc['abs_task_states'].update(
                        {str(state): int(twStatus[state])})

        # get the number of condor_shadow processes per schedd
        ListOfSchedds = self.getScheddsInfo()
        totalRunningTasks = 0
        totalIdleTasks = 0
        totalRunningTP = 0
        # see https://htcondor-wiki.cs.wisc.edu/index.cgi/wiki?p=MagicNumbers
        pickSchedulerIdle = 'JobUniverse==7 && JobStatus==1'
        pickLocalRunning = 'JobUniverse==12 && JobStatus==2'
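        # JobUniverse 7 is the scheduler universe (the task DAGs) and 12 the local universe
        # (here the task_process jobs); JobStatus 1 means Idle, 2 means Running.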

        if len(ListOfSchedds) > 0:
            metrics = []
            for oneSchedd in ListOfSchedds:
                scheddName = oneSchedd[0]
                # influxDB tags and fields are also added according to
                # https://monitdocs.web.cern.ch/monitdocs/ingestion/service_metrics.html#writing-to-influxdb
                # see https://github.com/dmwm/CRABServer/pull/6017  But they are irrelevant
                # as long as MONIT entry point data are sent to is an Elastic Search one. See comments
                # above in "send" function
                influxDb_measures = dict(
                    shadows=int(oneSchedd[1]),
                    running_schedulers=int(oneSchedd[2]),
                    idle_jobs=int(oneSchedd[3]),
                    running_jobs=int(oneSchedd[4]),
                    held_jobs=int(oneSchedd[5]),
                    all_jobs=int(oneSchedd[6]),
                )
                jsonDocSchedd = dict(
                    producer='crab',
                    type='schedd',
                    hostname=gethostname(),
                    name=scheddName,
                    idb_tags=["name"],  # for InfluxDB
                    idb_fields=list(influxDb_measures.keys()),  # for InfluxDB
                )
                jsonDocSchedd.update(influxDb_measures)

                metrics.append(jsonDocSchedd)
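                # each per-schedd document collected in metrics is shipped to MONIT via
                # send_and_check() after the loop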

                totalRunningTasks += int(oneSchedd[2])
                # if one schedd does not answer, go on and try the others
                try:
                    scheddAdd = self.coll.locate(htcondor.DaemonTypes.Schedd,
                                                 scheddName)
                except Exception:
                    continue
                schedd = htcondor.Schedd(scheddAdd)
                try:
                    idleDags = list(schedd.xquery(pickSchedulerIdle))
                except Exception:
                    idleDags = []
                try:
                    runningTPs = list(schedd.xquery(pickLocalRunning))
                except Exception:
                    runningTPs = []
                numDagIdle = len(idleDags)
                numTPRun = len(runningTPs)
                totalIdleTasks += numDagIdle
                totalRunningTP += numTPRun

            #print metrics
            try:
                send_and_check(metrics)
            except Exception as ex:
                print(ex)
        self.jsonDoc['total_running_tasks'] = totalRunningTasks
        self.jsonDoc['total_idle_tasks'] = totalIdleTasks
        self.jsonDoc['total_running_tp'] = totalRunningTP

        return self.jsonDoc
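
# Hypothetical usage sketch (the jsonDoc skeleton below is an assumption inferred from
# execute(), which fills 'status', 'current_task_states', 'abs_task_states' and the totals):
#   jsonDoc = {'current_task_states': {}, 'abs_task_states': {}}
#   monitor = CRAB3CreateJson('cmsweb.cern.ch:8443', jsonDoc)
#   print(monitor.execute())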
Example No. 19
def perform_transfers(inputFile, lastLine, direct=False):
    """
    get transfers submitted and save the last read line of the transfer list

    :param inputFile: file name containing post job files ready
    :type inputFile: str
    :param lastLine: last line processed
    :type lastLine: int
    :param direct: job output stored on temp or directly, defaults to False
    :type direct: bool, optional
    :return: (username, taskname), or (None, None) in case of critical error
    :rtype: tuple
    """

    if not os.path.exists(inputFile):
        return None, None

    # Get proxy and rest endpoint information
    proxy = None
    #if os.path.exists('task_process/rest_filetransfers.txt'):
    #    with open("task_process/rest_filetransfers.txt", "r") as _rest:
    #        rest_filetransfers = _rest.readline().split('\n')[0]
    #        proxy = os.getcwd() + "/" + _rest.readline()
    #        logging.info("Proxy: %s", proxy)
    #        os.environ["X509_USER_PROXY"] = proxy
    if os.path.exists('task_process/RestInfoForFileTransfers.json'):
        with open('task_process/RestInfoForFileTransfers.json') as fp:
            restInfo = json.load(fp)
            proxy = os.getcwd() + "/" + restInfo['proxyfile']
            #rest_filetransfers = restInfo['host'] + '/crabserver/' + restInfo['dbInstance']
            os.environ["X509_USER_PROXY"] = proxy

    # If there is no user proxy yet, just wait for the first post-job of the task to finish
    if not proxy:
        logging.info('No proxy available yet - waiting for first post-job')
        return None, None

    try:
        crabserver = CRABRest(restInfo['host'],
                              localcert=proxy,
                              localkey=proxy,
                              userAgent='CRABSchedd')
        crabserver.setDbInstance(restInfo['dbInstance'])
    except Exception:
        logging.exception("Failed to set connection to crabserver")
        return None, None

    logging.info("starting from line: %s", lastLine)

    # define ntuples and column names
    # TODO: make use of dict instead of this
    file_to_submit = []
    to_submit_columns = [
        "source_lfn", "destination_lfn", "id", "source", "destination",
        "checksums", "filesize", "publishname"
    ]
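    # These parallel, positionally-ordered lists mirror the (transfers, to_submit_columns)
    # interface of submit(); a dict per file (e.g. {'source_lfn': ..., 'id': ...}) would be
    # a less fragile layout, as the TODO above suggests.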
    transfers = []
    user = None
    taskname = None
    destination = None

    # get username and taskname from the input file list
    with open(inputFile) as _list:
        doc = json.loads(_list.readlines()[0])
        user = doc['username']
        taskname = doc["publishname"].replace(
            '-00000000000000000000000000000000', '/rucio/USER#000000')

    # Save needed info in ordered lists
    with open(inputFile) as _list:
        for _data in _list.readlines()[lastLine:]:
            file_to_submit = []
            try:
                lastLine += 1
                doc = json.loads(_data)
            except Exception:
                continue
            for column in to_submit_columns:
                # Save everything other than checksums and publishnames
                # They will be managed below
                if column not in ['checksums', 'publishname']:
                    file_to_submit.append(doc[column])
                # Change publishname for task with publication disabled
                # as discussed in https://github.com/dmwm/CRABServer/pull/6038#issuecomment-618654580
                if column == "publishname":
                    taskname = doc["publishname"].replace(
                        '-00000000000000000000000000000000',
                        '/rucio/USER#000000')
                    file_to_submit.append(taskname)
                # Save adler checksum in a form accepted by Rucio
                if column == "checksums":
                    file_to_submit.append(doc["checksums"]["adler32"].rjust(
                        8, '0'))
            transfers.append(file_to_submit)
            destination = doc["destination"]

    # Pass collected info to submit function
    if len(transfers) > 0:
        # Store general job metadata
        job_data = {
            'taskname': taskname,
            'username': user,
            'destination': destination,
            'proxy': proxy,
            'crabserver': crabserver
        }
        # Split the processing for the directly staged files
        success = False  # avoid a NameError below if submit() raises before assigning
        if not direct:
            try:
                # Start the submission process  that will be managed
                # in src/python/TransferInterface/RegisterFiles.py
                success = submit((transfers, to_submit_columns), job_data,
                                 logging)
            except Exception:
                logging.exception('Submission process failed.')

            # if succeeded go on and update the last processed file
            # otherwise retry at the next round of task process
            if success:
                # update last read line
                with open("task_process/transfers/last_transfer_new.txt",
                          "w+") as _last:
                    _last.write(str(lastLine))
                os.rename("task_process/transfers/last_transfer_new.txt",
                          "task_process/transfers/last_transfer.txt")

        elif direct:

            # In case of direct stageout do the same as above but with flag direct=True
            try:
                success = submit((transfers, to_submit_columns),
                                 job_data,
                                 logging,
                                 direct=True)
            except Exception:
                logging.exception('Registering direct stage files failed.')

            if success:
                # update last read line
                with open(
                        "task_process/transfers/last_transfer_direct_new.txt",
                        "w+") as _last:
                    _last.write(str(lastLine))
                os.rename(
                    "task_process/transfers/last_transfer_direct_new.txt",
                    "task_process/transfers/last_transfer_direct.txt")

    return user, taskname
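
# Hypothetical usage sketch (the file name and starting line are assumptions, not taken
# from this module):
#   user, taskname = perform_transfers('task_process/transfers.txt', lastLine=0)
#   if taskname:
#       print('submitted transfers for %s owned by %s' % (taskname, user))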
Example No. 20
if __name__ == "__main__":
    import os
    import logging
    from RESTInteractions import CRABRest
    from WMCore.Configuration import Configuration

    logging.basicConfig(level=logging.DEBUG)
    config = Configuration()

    config.section_("TaskWorker")
    #will use X509_USER_PROXY var for this test
    config.TaskWorker.cmscert = os.environ["X509_USER_PROXY"]
    config.TaskWorker.cmskey = os.environ["X509_USER_PROXY"]

    server_ = CRABRest('cmsweb-testbed.cern.ch', config.TaskWorker.cmscert,
                       config.TaskWorker.cmskey)
    server_.setDbInstance('dev')
    resubmitter = DagmanResubmitter(config, server_)
    resubmitter.execute(
        task={
            'tm_taskname':
            '141129_110306_crab3test-5:atanasi_crab_test_resubmit',
            'user_proxy': os.environ["X509_USER_PROXY"],
            'resubmit_site_whitelist': ['T2_IT_Bari'],
            'resubmit_site_blacklist': ['T2_IT_Legnaro'],
            'resubmit_priority': 2,
            'resubmit_numcores': 1,
            'resubmit_maxjobruntime': 1000,
            'resubmit_maxmemory': 1000
        })