def resolve_surl(fspec, protocol, ddmconf, **kwargs):
    """
    Get final destination SURL for file to be transferred to Objectstore
    Can be customized at the level of specific copytool

    :param protocol: suggested protocol
    :param ddmconf: full ddm storage data
    :param fspec: file spec data
    :return: dictionary {'surl': surl}
    :raise: PilotException if the ddmendpoint is unknown or non deterministic
    """
    ddm = ddmconf.get(fspec.ddmendpoint)
    if not ddm:
        raise PilotException('Failed to resolve ddmendpoint by name=%s' % fspec.ddmendpoint)

    if ddm.is_deterministic:
        # deterministic storage: path is derived from scope/lfn via the rucio convention
        surl = protocol.get('endpoint', '') + os.path.join(protocol.get('path', ''), get_rucio_path(fspec.scope, fspec.lfn))
    elif ddm.type in ['OS_ES', 'OS_LOGS']:
        # objectstore: flat layout, object name is the lfn itself
        surl = protocol.get('endpoint', '') + os.path.join(protocol.get('path', ''), fspec.lfn)
        fspec.protocol_id = protocol.get('id')
    else:
        # bug fix: message was passed with a trailing positional argument instead of
        # %-formatting, so the ddm name never appeared in the raised message
        raise PilotException('resolve_surl(): Failed to construct SURL for non deterministic ddm=%s: NOT IMPLEMENTED' % fspec.ddmendpoint)

    return {'surl': surl}
def copy_out(files, **kwargs):
    """
    Upload given files to S3 storage.

    :param files: list of `FileSpec` objects
    :raise: PilotException in case of controlled error
    """
    workdir = kwargs.pop('workdir')

    for fspec in files:
        path = os.path.join(workdir, fspec.lfn)

        # guard clause: fail fast when the local file is missing
        if not os.path.exists(path):
            diagnostics = 'local output file does not exist: %s' % path
            logger.warning(diagnostics)
            fspec.status = 'failed'
            fspec.status_code = errors.STAGEOUTFAILED
            raise PilotException(diagnostics, code=fspec.status_code, state=fspec.status)

        bucket = 'bucket'  # UPDATE ME
        logger.info('uploading %s to bucket=%s using object name=%s', path, bucket, fspec.lfn)
        status, diagnostics = upload_file(path, bucket, object_name=fspec.lfn)

        if not status:  ## an error occurred
            # create new error code(s) in ErrorCodes.py and set it/them in resolve_common_transfer_errors()
            error = resolve_common_transfer_errors(diagnostics, is_stagein=False)
            fspec.status = 'failed'
            fspec.status_code = error.get('rcode')
            raise PilotException(error.get('error'), code=error.get('rcode'), state=error.get('state'))

        fspec.status = 'transferred'
        fspec.status_code = 0

    return files
def resolve_surl(self, fspec, protocol, ddmconf, **kwargs):
    """
    Get final destination SURL for file to be transferred
    Can be customized at the level of specific copytool

    :param protocol: suggested protocol
    :param ddmconf: full ddmconf data
    :param activity: ordered list of preferred activity names to resolve SE protocols
    :return: dict with keys ('pfn', 'ddmendpoint')
    """
    # consider only deterministic sites (output destination)
    ddm = ddmconf.get(fspec.ddmendpoint)
    if not ddm:
        raise PilotException('Failed to resolve ddmendpoint by name=%s' % fspec.ddmendpoint)

    # path = protocol.get('path', '').rstrip('/')
    # if not (ddm.is_deterministic or (path and path.endswith('/rucio'))):
    if not ddm.is_deterministic:
        raise PilotException('resolve_surl(): Failed to construct SURL for non deterministic ddm=%s: NOT IMPLEMENTED' % fspec.ddmendpoint,
                             code=ErrorCodes.NONDETERMINISTICDDM)

    # assemble the SURL from the protocol endpoint plus the deterministic path
    endpoint = protocol.get('endpoint', '')
    relative = os.path.join(protocol.get('path', ''), self.get_path(fspec.scope, fspec.lfn))

    return {'surl': endpoint + relative}
def copy_in(files, **kwargs):
    """
    Download given files using xrdcp command.

    :param files: list of `FileSpec` objects
    :raise: PilotException in case of controlled error
    """
    #allow_direct_access = kwargs.get('allow_direct_access') or False
    setup = kwargs.pop('copytools', {}).get('xrdcp', {}).get('setup')
    coption = _resolve_checksum_option(setup, **kwargs)
    trace_report = kwargs.get('trace_report')
    # RUCIO_LOCAL_SITE_ID takes priority; otherwise fall back to the first file's ddmendpoint
    localsite = os.environ.get('RUCIO_LOCAL_SITE_ID', None)
    for fspec in files:
        # update the trace report
        localsite = localsite if localsite else fspec.ddmendpoint
        trace_report.update(localSite=localsite, remoteSite=fspec.ddmendpoint, filesize=fspec.filesize)
        trace_report.update(filename=fspec.lfn, guid=fspec.guid.replace('-', ''))
        trace_report.update(scope=fspec.scope, dataset=fspec.dataset)

        # continue loop for files that are to be accessed directly  ## TOBE DEPRECATED (anisyonk)
        #if fspec.is_directaccess(ensure_replica=False) and allow_direct_access and fspec.accessmode == 'direct':
        #    fspec.status_code = 0
        #    fspec.status = 'remote_io'
        #    trace_report.update(url=fspec.turl, clientState='FOUND_ROOT', stateReason='direct_access')
        #    trace_report.send()
        #    continue

        trace_report.update(catStart=time())

        dst = fspec.workdir or kwargs.get('workdir') or '.'
        destination = os.path.join(dst, fspec.lfn)
        try:
            # perform the actual transfer; raises PilotException on controlled failure
            filesize_cmd, checksum_cmd, checksum_type = _stagefile(coption, fspec.turl, destination, fspec.filesize,
                                                                   is_stagein=True, setup=setup, **kwargs)
            fspec.status_code = 0
            fspec.status = 'transferred'
        except PilotException as error:
            # transfer failed: record the failure on the fspec, send the trace, re-raise
            fspec.status = 'failed'
            fspec.status_code = error.get_error_code()
            diagnostics = error.get_detail()
            state = 'STAGEIN_ATTEMPT_FAILED'
            trace_report.update(clientState=state, stateReason=diagnostics, timeEnd=time())
            trace_report.send()
            raise PilotException(diagnostics, code=fspec.status_code, state=state)
        else:
            # compare checksums
            fspec.checksum[checksum_type] = checksum_cmd  # remote checksum
            state, diagnostics = verify_catalog_checksum(fspec, destination)
            if diagnostics != "":
                # checksum mismatch (or related problem) against the catalog value
                trace_report.update(clientState=state or 'STAGEIN_ATTEMPT_FAILED', stateReason=diagnostics,
                                    timeEnd=time())
                trace_report.send()
                raise PilotException(diagnostics, code=fspec.status_code, state=state)

        # successful transfer for this file
        trace_report.update(clientState='DONE', stateReason='OK', timeEnd=time())
        trace_report.send()

    return files
def resolve_surl(fspec, protocol, ddmconf, **kwargs):
    """
    Get final destination SURL for file to be transferred to Objectstore
    Can be customized at the level of specific copytool

    :param protocol: suggested protocol
    :param ddmconf: full ddm storage data
    :param fspec: file spec data
    :return: dictionary {'surl': surl}
    :raise: PilotException if the ddmendpoint is unknown or non deterministic
    """
    ddm = ddmconf.get(fspec.ddmendpoint)
    if not ddm:
        raise PilotException('failed to resolve ddmendpoint by name=%s' % fspec.ddmendpoint)

    if ddm.is_deterministic:
        surl = protocol.get('endpoint', '') + os.path.join(protocol.get('path', ''), get_rucio_path(fspec.scope, fspec.lfn))
    elif ddm.type in ['OS_ES', 'OS_LOGS']:
        surl = protocol.get('endpoint', '') + os.path.join(protocol.get('path', ''), fspec.lfn)
        fspec.protocol_id = protocol.get('id')
    else:
        # bug fix: message was passed with a trailing positional argument instead of
        # %-formatting, so the ddm name never appeared in the raised message
        raise PilotException('resolve_surl(): Failed to construct SURL for non deterministic ddm=%s: NOT IMPLEMENTED' % fspec.ddmendpoint)

    # example:
    # protocol = {u'path': u'/atlas-eventservice', u'endpoint': u's3://s3.cern.ch:443/', u'flavour': u'AWS-S3-SSL', u'id': 175}
    # surl = 's3://s3.cern.ch:443//atlas-eventservice/EventService_premerge_24706191-5013009653-24039149400-322-5.tar'
    return {'surl': surl}
def _stagefile(coption, source, destination, filesize, is_stagein, setup=None, **kwargs):
    """
    Stage the file (stagein or stageout)

    :return: destination file details (checksum, checksum_type) in case of success, throw exception in case of failure
    :raise: PilotException in case of controlled error
    """
    file_size, file_checksum, file_checksum_type = None, None, None

    # build the copy command; source the setup script first when one is provided
    command = '%s -np -f %s %s %s' % (copy_command, coption, source, destination)
    if setup:
        command = "source %s; %s" % (setup, command)

    #timeout = get_timeout(filesize)
    #logger.info("Executing command: %s, timeout=%s" % (cmd, timeout))

    exit_code, out, err = execute(command, **kwargs)
    logger.info('rcode=%d, stdout=%s, stderr=%s' % (exit_code, out, err))

    if exit_code:  ## error occurred
        error = resolve_common_transfer_errors(out + err, is_stagein=is_stagein)
        #rcode = error.get('rcode')  ## TO BE IMPLEMENTED
        #if not is_stagein and rcode == PilotErrors.ERR_CHKSUMNOTSUP:  ## stage-out, on fly checksum verification is not supported .. ignore
        #    logger.info('stage-out: ignore ERR_CHKSUMNOTSUP error .. will explicitly verify uploaded file')
        #    return None, None
        raise PilotException(error.get('error'), code=error.get('rcode'), state=error.get('state'))

    # extract filesize and checksum values from output
    if coption != "":
        file_size, file_checksum, file_checksum_type = get_file_info_from_output(out + err)

    ## verify transfer by returned checksum or call remote checksum calculation
    ## to be moved at the base level

    is_verified = True  ## TO BE IMPLEMENTED LATER

    if not is_verified:
        code = ErrorCodes.GETADMISMATCH if is_stagein else ErrorCodes.PUTADMISMATCH
        raise PilotException("Copy command failed", code=code, state='AD_MISMATCH')

    return file_size, file_checksum, file_checksum_type
def prepare_destinations(self, files, activities):
    """
    Resolve destination RSE (filespec.ddmendpoint) for each entry from `files` according to requested `activities`
    Apply Pilot-side logic to choose proper destination

    :param files: list of FileSpec objects to be processed
    :param activities: ordered list of activities to be used to resolve astorages
    :return: updated fspec entries
    :raise: PilotException if no activity or no associated storages can be resolved
    """
    if not self.infosys.queuedata:  ## infosys is not initialized: not able to fix destination if need, nothing to do
        return files

    # normalize a single activity name to a list (Python 2/3 compatible)
    try:
        if isinstance(activities, (str, unicode)):  # Python 2
            activities = [activities]
    except Exception:  # NameError on Python 3 where `unicode` is undefined
        if isinstance(activities, str):  # Python 3
            activities = [activities]

    if not activities:
        raise PilotException("Failed to resolve destination: passed empty activity list. Internal error.",
                             code=ErrorCodes.INTERNALPILOTPROBLEM, state='INTERNAL_ERROR')

    astorages = self.infosys.queuedata.astorages or {}

    # pick the first activity that has associated storages defined
    storages = None
    activity = activities[0]
    for a in activities:
        storages = astorages.get(a, {})
        if storages:
            break

    if not storages:
        raise PilotException("Failed to resolve destination: no associated storages defined for activity=%s (%s)"
                             % (activity, ','.join(activities)), code=ErrorCodes.NOSTORAGE, state='NO_ASTORAGES_DEFINED')

    # take the fist choice for now, extend the logic later if need
    ddm = storages[0]

    self.logger.info("[prepare_destinations][%s]: allowed (local) destinations: %s" % (activity, storages))
    self.logger.info("[prepare_destinations][%s]: resolved default destination ddm=%s" % (activity, ddm))

    for e in files:
        if not e.ddmendpoint:  ## no preferences => use default destination
            self.logger.info("[prepare_destinations][%s]: fspec.ddmendpoint is not set for lfn=%s"
                             " .. will use default ddm=%s as (local) destination" % (activity, e.lfn, ddm))
            e.ddmendpoint = ddm
        elif e.ddmendpoint not in storages:  ## fspec.ddmendpoint is not in associated storages => assume it as final (non local) alternative destination
            self.logger.info("[prepare_destinations][%s]: Requested fspec.ddmendpoint=%s is not in the list of allowed (local) destinations"
                             " .. will consider default ddm=%s for transfer and tag %s as alt. location"
                             % (activity, e.ddmendpoint, ddm, e.ddmendpoint))
            # bug fix: record the originally requested endpoint as the alternative BEFORE
            # overwriting it; previously ddmendpoint_alt ended up equal to ddm
            e.ddmendpoint_alt = e.ddmendpoint  ### consider me later
            e.ddmendpoint = ddm

    return files
def copy_out(files, **kwargs):
    """
    Upload given files to GS storage.

    :param files: list of `FileSpec` objects
    :raise: PilotException in case of controlled error
    """
    workdir = kwargs.pop('workdir')

    for fspec in files:
        logger.info('Going to process fspec.turl=%s', fspec.turl)

        import re
        # bucket = re.sub(r'gs://(.*?)/.*', r'\1', fspec.turl)
        match = re.match(r'gs://([^/]*)/(.*)', fspec.turl)
        bucket, remote_path = match.groups()

        # ["pilotlog.txt", "payload.stdout", "payload.stderr"]:
        for logfile in os.listdir(workdir):
            if logfile.endswith("gz"):
                # skip already-compressed files
                continue

            path = os.path.join(workdir, logfile)
            if not os.path.exists(path):
                diagnostics = 'local output file does not exist: %s' % path
                logger.warning(diagnostics)
                fspec.status = 'failed'
                fspec.status_code = errors.STAGEOUTFAILED
                raise PilotException(diagnostics, code=fspec.status_code, state=fspec.status)

            object_name = os.path.join(remote_path, logfile)
            logger.info('uploading %s to bucket=%s using object name=%s', path, bucket, object_name)
            status, diagnostics = upload_file(path, bucket, object_name=object_name)

            if not status:  ## an error occurred
                # create new error code(s) in ErrorCodes.py and set it/them in resolve_common_transfer_errors()
                error = resolve_common_transfer_errors(diagnostics, is_stagein=False)
                fspec.status = 'failed'
                fspec.status_code = error.get('rcode')
                raise PilotException(error.get('error'), code=error.get('rcode'), state=error.get('state'))

        fspec.status = 'transferred'
        fspec.status_code = 0

    return files
def resolve_protocol(fspec, activity, ddm):
    """
    Resolve protocols to be used to transfer the file with corresponding activity

    :param fspec: file spec data
    :param activity: activity name as string
    :param ddm: ddm storage data
    :return: protocol as dictionary
    :raise: PilotException if no allowed protocol can be resolved
    """
    logger.info("Resolving protocol for file(lfn: %s, ddmendpoint: %s) with activity(%s)" % (fspec.lfn, fspec.ddmendpoint, activity))

    activity = get_ddm_activity(activity)
    # bug fix: arprotocols.get() returns None for an unknown activity, which would
    # make the inner loop below raise TypeError; fall back to an empty list
    protocols = ddm.arprotocols.get(activity) or []

    # keep only protocols whose endpoint matches one of the allowed schemas
    # (a None schema acts as a wildcard)
    protocols_allow = []
    for schema in allowed_schemas:
        for protocol in protocols:
            if schema is None or protocol.get('endpoint', '').startswith("%s://" % schema):
                protocols_allow.append(protocol)

    if not protocols_allow:
        err = "No available allowed protocols for file(lfn: %s, ddmendpoint: %s) with activity(%s)" % (fspec.lfn, fspec.ddmendpoint, activity)
        logger.error(err)
        raise PilotException(err)

    # first match wins (allowed_schemas is ordered by preference)
    protocol = protocols_allow[0]
    logger.info("Resolved protocol for file(lfn: %s, ddmendpoint: %s) with activity(%s): %s" % (fspec.lfn, fspec.ddmendpoint, activity, protocol))

    return protocol
def init(self, pandaqueue, confinfo=None, extinfo=None, jobinfo=None):
    # Initialize the InfoService for the given PanDA queue: wire up the data
    # providers, reset caches, resolve queuedata and prefetch storage data.
    # :raise PilotException: if no queue name is given
    # :raise QueuedataFailure: if queuedata cannot be resolved for the queue
    self.confinfo = confinfo or PilotConfigProvider()
    self.jobinfo = jobinfo  # or JobInfoProvider()
    self.extinfo = extinfo or ExtInfoProvider(cache_time=self.cache_time)
    self.pandaqueue = pandaqueue
    if not self.pandaqueue:
        raise PilotException('Failed to initialize InfoService: panda queue name is not set')
    # drop any previously cached lookups before re-resolving
    self.queues_info = {}  ## reset cache data
    self.storages_info = {}  ## reset cache data
    #self.sites_info = {}  ## reset cache data
    self.queuedata = self.resolve_queuedata(self.pandaqueue)
    if not self.queuedata or not self.queuedata.name:
        raise QueuedataFailure("Failed to resolve queuedata for queue=%s, wrong PandaQueue name?" % self.pandaqueue)
    self.resolve_storage_data()  ## prefetch details for all storages
def resolve_storage_data(self, ddmendpoints=[]):  ## high level API
    """
    :return: dict of DDMEndpoint settings by DDMEndpoint name as a key
    """
    # accept a single ddmendpoint name as well as a list of names
    if isinstance(ddmendpoints, basestring):
        ddmendpoints = [ddmendpoints]

    cache = self.storages_info
    missing = set(ddmendpoints) - set(cache)

    if not ddmendpoints or missing:
        # not found in cache: do load and initialize data
        # the order of providers makes the priority
        data = self._resolve_data(self.whoami(),
                                  providers=(self.confinfo, self.jobinfo, self.extinfo),
                                  args=[missing], merge=True)
        if ddmendpoints:
            not_resolved = set(ddmendpoints) - set(data)
            if not_resolved:
                raise PilotException("internal error: Failed to load storage details for ddms=%s" % sorted(not_resolved))
        for name in data:
            cache[name] = StorageData(data[name])

    return cache
def resolve_protocols(self, files):
    """
    Populates filespec.protocols for each entry from `files` according to preferred `fspec.ddm_activity` value

    :param files: list of `FileSpec` objects
    fdat.protocols = [dict(endpoint, path, flavour), ..]
    :return: `files`
    :raise: PilotException if an fspec's ddmendpoint cannot be resolved
    """
    ddmconf = self.infosys.resolve_storage_data()

    for fdat in files:
        ddm = ddmconf.get(fdat.ddmendpoint)
        if not ddm:
            error = 'Failed to resolve output ddmendpoint by name=%s (from PanDA), please check configuration.' % fdat.ddmendpoint
            self.logger.error("resolve_protocols: %s, fspec=%s" % (error, fdat))
            raise PilotException(error, code=ErrorCodes.NOSTORAGE)

        # take the protocols of the first activity that has any defined
        protocols = []
        for aname in fdat.ddm_activity:
            # bug fix: arprotocols.get() may return None for an unknown activity;
            # coerce to an empty list so fdat.protocols is always a list
            protocols = ddm.arprotocols.get(aname) or []
            if protocols:
                break

        fdat.protocols = protocols

    return files
def create_output_list(files, init_dir, ddmconf):
    """
    Add files to the output list which tells ARC CE which files to upload
    """
    if not ddmconf:
        raise PilotException("copy_out() failed to resolve ddmconf from function arguments",
                             code=ErrorCodes.STAGEOUTFAILED, state='COPY_ERROR')

    for fspec in files:
        arcturl = fspec.turl

        if arcturl.startswith('s3://'):
            # Use Rucio proxy to upload to OS
            arcturl = re.sub(r'^s3', 's3+rucio', arcturl)
            # Add failureallowed option so failed upload does not fail job
            rucio = 'rucio://rucio-lb-prod.cern.ch;failureallowed=yes/objectstores'
            rse = fspec.ddmendpoint
            activity = 'write'
            arcturl = '/'.join([rucio, arcturl, rse, activity])
        else:
            # Add ARC options to TURL
            checksumtype, checksum = list(fspec.checksum.items())[0]  # Python 2/3

            # resolve token value from fspec.ddmendpoint
            token = ddmconf.get(fspec.ddmendpoint).token
            if token:
                arcturl = re.sub(r'((:\d+)/)', r'\2;autodir=no;spacetoken=%s/' % token, arcturl)
            else:
                logger.info('No space token info for %s', fspec.ddmendpoint)

            arcturl += ':checksumtype=%s:checksumvalue=%s' % (checksumtype, checksum)

        logger.info('Adding to output.list: %s %s', fspec.lfn, arcturl)
        # Write output.list
        with open(os.path.join(init_dir, 'output.list'), 'a') as f:
            f.write('%s %s\n' % (fspec.lfn, arcturl))
def main(): """ Main function of PanDA Pilot 2. Prepare for and execute the requested workflow. :return: exit code (int). """ # get the logger logger = logging.getLogger(__name__) # print the pilot version pilot_version_banner() # define threading events args.graceful_stop = threading.Event() args.abort_job = threading.Event() args.job_aborted = threading.Event() # define useful variables args.retrieve_next_job = True # go ahead and download a new job args.signal = None # to store any incoming signals args.signal_counter = 0 # keep track of number of received kill signal (suicide counter) args.kill_time = 0 # keep track of when first kill signal arrived # read and parse config file config.read(args.config) # perform https setup https_setup(args, get_pilot_version()) # initialize InfoService try: infosys.init(args.queue) # check if queue is ACTIVE if infosys.queuedata.state != 'ACTIVE': logger.critical('specified queue is NOT ACTIVE: %s -- aborting' % infosys.queuedata.name) raise PilotException("Panda Queue is NOT ACTIVE") except PilotException as error: logger.fatal(error) return error.get_error_code() # set the site name for rucio ## is it really used? environ['PILOT_RUCIO_SITENAME'] = infosys.queuedata.site # set requested workflow logger.info('pilot arguments: %s' % str(args)) logger.info('selected workflow: %s' % args.workflow) workflow = __import__('pilot.workflow.%s' % args.workflow, globals(), locals(), [args.workflow], -1) # execute workflow try: exit_code = workflow.run(args) except Exception as e: logger.fatal('main pilot function caught exception: %s' % e) exit_code = None return exit_code
def copy_in(files, **kwargs):
    """
    Download given files from an S3 bucket.

    :param files: list of `FileSpec` objects
    :raise: PilotException in case of controlled error
    """
    for fspec in files:
        destdir = fspec.workdir or kwargs.get('workdir') or '.'
        bucket = 'bucket'  # UPDATE ME
        path = os.path.join(destdir, fspec.lfn)

        logger.info('downloading object %s from bucket=%s to local file %s', fspec.lfn, bucket, path)
        status, diagnostics = download_file(path, bucket, object_name=fspec.lfn)

        if not status:  ## an error occurred
            error = resolve_common_transfer_errors(diagnostics, is_stagein=True)
            fspec.status = 'failed'
            fspec.status_code = error.get('rcode')
            raise PilotException(error.get('error'), code=error.get('rcode'), state=error.get('state'))

        fspec.status_code = 0
        fspec.status = 'transferred'

    return files
def copy_in(files, **kwargs):
    """
    Download given files using rucio copytool.

    :param files: list of `FileSpec` objects
    :raise: PilotException in case of controlled error
    """
    # don't spoil the output, we depend on stderr parsing
    os.environ['RUCIO_LOGGING_FORMAT'] = '%(asctime)s %(levelname)s [%(message)s]'

    ddmconf = kwargs.pop('ddmconf', {})
    activity = kwargs.pop('activity', None)
    # trace_report = kwargs.get('trace_report')

    for fspec in files:
        logger.info("To transfer file: %s" % fspec)

        cmd = []
        ddm = ddmconf.get(fspec.ddmendpoint)
        if ddm:
            # resolve protocol/SURL and any storage-specific setup command
            protocol = resolve_protocol(fspec, activity, ddm)
            surls = resolve_surl(fspec, protocol, ddmconf)
            if 'surl' in surls:
                fspec.surl = surls['surl']
            ddm_special_setup = ddm.get_special_setup(protocol.get('id', None))
            if ddm_special_setup:
                cmd += [ddm_special_setup]

        destdir = fspec.workdir or kwargs.get('workdir') or '.'
        cmd += ['/usr/bin/env', 'rucio', '-v', 'download', '--no-subdir', '--dir', destdir]
        if require_replicas:
            cmd += ['--rse', fspec.replicas[0][0]]
        if fspec.surl:
            if fspec.ddmendpoint:
                cmd.extend(['--rse', fspec.ddmendpoint])
            cmd.extend(['--pfn', fspec.surl])
        cmd += ['%s:%s' % (fspec.scope, fspec.lfn)]

        rc, out, err = execute(" ".join(cmd), **kwargs)

        if rc:  ## error occurred
            error = resolve_common_transfer_errors(err, is_stagein=True)
            fspec.status = 'failed'
            fspec.status_code = error.get('rcode')
            raise PilotException(error.get('error'), code=error.get('rcode'), state=error.get('state'))

        fspec.status_code = 0
        fspec.status = 'transferred'

    return files
def __init__(self, infosys_instance=None, acopytools=None, logger=None, default_copytools='rucio', trace_report=None):
    """
    If `acopytools` is not specified then it will be automatically resolved via infosys. In this case `infosys` requires initialization.

    :param acopytools: dict of copytool names per activity to be used for transfers. Accepts also list of names or string value without activity passed.
    :param logger: logging.Logger object to use for logging (None means no logging)
    :param default_copytools: copytool name(s) to be used in case of unknown activity passed. Accepts either list of names or single string value.
    :raise: PilotException if no acopytools settings can be resolved
    """
    super(StagingClient, self).__init__()

    if not logger:
        logger = logging.getLogger('%s.%s' % (__name__, 'null'))
        logger.disabled = True

    self.logger = logger
    self.infosys = infosys_instance or infosys

    # get an initialized trace report (has to be updated for get/put if not defined before)
    # bug fix: this assignment used to be the LAST statement of __init__, so the
    # BAD_COPYTOOL error branch below crashed with AttributeError on self.trace_report;
    # initialize it before any code path that may use it
    self.trace_report = trace_report if trace_report else TraceReport(pq=os.environ.get('PILOT_SITENAME', ''))

    # normalize acopytools: plain string or list/tuple maps to the 'default' activity
    if isinstance(acopytools, basestring):
        acopytools = {'default': [acopytools]} if acopytools else {}
    if isinstance(acopytools, (list, tuple)):
        acopytools = {'default': acopytools} if acopytools else {}

    self.acopytools = acopytools or {}

    if self.infosys.queuedata:
        if not self.acopytools:  ## resolve from queuedata.acopytools using infosys
            self.acopytools = (self.infosys.queuedata.acopytools or {}).copy()
        if not self.acopytools:  ## resolve from queuedata.copytools using infosys
            self.acopytools = dict(default=(self.infosys.queuedata.copytools or {}).keys())

    if not self.acopytools.get('default'):
        if isinstance(default_copytools, basestring):
            default_copytools = [default_copytools] if default_copytools else []
        self.acopytools['default'] = default_copytools

    if not self.acopytools:
        msg = 'failed to initilize StagingClient: no acopytools options found, acopytools=%s' % self.acopytools
        logger.error(msg)
        self.trace_report.update(clientState='BAD_COPYTOOL', stateReason=msg)
        self.trace_report.send()
        raise PilotException("failed to resolve acopytools settings")

    logger.info('configured copytools per activity: acopytools=%s' % self.acopytools)
def copy_out(files, **kwargs):
    """
    Upload given files using xrdcp command.

    :param files: list of `FileSpec` objects
    :raise: PilotException in case of controlled error
    """
    setup = kwargs.pop('copytools', {}).get('xrdcp', {}).get('setup')
    coption = _resolve_checksum_option(setup, **kwargs)
    trace_report = kwargs.get('trace_report')

    for fspec in files:
        trace_report.update(scope=fspec.scope, dataset=fspec.dataset, url=fspec.surl, filesize=fspec.filesize)
        trace_report.update(catStart=time(), filename=fspec.lfn, guid=fspec.guid.replace('-', ''))

        try:
            filesize_cmd, checksum_cmd, checksum_type = _stagefile(coption, fspec.surl, fspec.turl, fspec.filesize,
                                                                   is_stagein=False, setup=setup, **kwargs)
            fspec.status_code = 0
            fspec.status = 'transferred'
            # NOTE(review): DONE is reported before the catalog checksum verification in
            # the else-branch below; a mismatch additionally sends a failure trace
            trace_report.update(clientState='DONE', stateReason='OK', timeEnd=time())
            trace_report.send()
        except PilotException as error:
            fspec.status = 'failed'
            fspec.status_code = error.get_error_code()
            state = 'STAGEOUT_ATTEMPT_FAILED'
            diagnostics = error.get_detail()
            trace_report.update(clientState=state, stateReason=diagnostics, timeEnd=time())
            trace_report.send()
            raise PilotException(diagnostics, code=fspec.status_code, state=state)
        else:
            # compare checksums
            fspec.checksum[checksum_type] = checksum_cmd  # remote checksum
            state, diagnostics = verify_catalog_checksum(fspec, fspec.surl)
            if diagnostics != "":
                # bug fix: report a stage-OUT failure state here; the fallback was
                # previously 'STAGEIN_ATTEMPT_FAILED' (copy-paste from copy_in)
                trace_report.update(clientState=state or 'STAGEOUT_ATTEMPT_FAILED', stateReason=diagnostics,
                                    timeEnd=time())
                trace_report.send()
                raise PilotException(diagnostics, code=fspec.status_code, state=state)

    return files
def copy_out(files, **kwargs):
    """
    Upload given files using gfal command.

    :param files: Files to upload
    :raises: PilotException in case of errors
    """
    if not check_for_gfal():
        raise StageOutFailure("No GFAL2 tools found")

    trace_report = kwargs.get('trace_report')

    for fspec in files:
        trace_report.update(scope=fspec.scope, dataset=fspec.dataset, url=fspec.surl, filesize=fspec.filesize)
        trace_report.update(catStart=time(), filename=fspec.lfn, guid=fspec.guid.replace('-', ''))

        workdir = fspec.workdir or kwargs.get('workdir') or '.'
        timeout = get_timeout(fspec.filesize)
        source = "file://%s" % os.path.abspath(fspec.surl or os.path.join(workdir, fspec.lfn))
        destination = fspec.turl

        # assemble the gfal-copy command, optionally with checksum verification
        cmd = ['gfal-copy --verbose -f', ' -t %s' % timeout]
        if fspec.checksum:
            cmd += ['-K', '%s:%s' % list(fspec.checksum.items())[0]]  # Python 2/3
        cmd += [source, destination]

        rcode, stdout, stderr = execute(" ".join(cmd), **kwargs)

        if rcode:  ## error occurred
            if rcode in [errno.ETIMEDOUT, errno.ETIME]:
                error = {'rcode': ErrorCodes.STAGEOUTTIMEOUT,
                         'state': 'CP_TIMEOUT',
                         'error': 'Copy command timed out: %s' % stderr}
            else:
                error = resolve_common_transfer_errors(stdout + stderr, is_stagein=False)

            fspec.status = 'failed'
            fspec.status_code = error.get('rcode')
            trace_report.update(clientState=error.get('state', None) or 'STAGEOUT_ATTEMPT_FAILED',
                                stateReason=error.get('error', 'unknown error'),
                                timeEnd=time())
            trace_report.send()
            raise PilotException(error.get('error'), code=error.get('rcode'), state=error.get('state'))

        fspec.status_code = 0
        fspec.status = 'transferred'
        trace_report.update(clientState='DONE', stateReason='OK', timeEnd=time())
        trace_report.send()

    return files
def set_info(args):  ## should be DEPRECATED: use `infosys.init(queuename)`
    """
    Set up all necessary site information for given PandaQueue name.
    Resolve everything from the specified queue name (passed via `args.queue`)
    and fill extra lookup structure (Populate `args.info`).

    raise PilotException in case of errors.

    :param args: input (shared) agruments
    :return: None
    """

    # ## initialize info service
    infosys.init(args.queue)

    # NOTE(review): a namedtuple *class* (not instance) is stored and its attributes
    # are then set as class attributes -- works, but unusual; kept as-is (deprecated code)
    args.info = collections.namedtuple('info', ['queue', 'infoservice',
                                                # queuedata,
                                                'site', 'storages',
                                                # 'site_info',
                                                'storages_info'])
    args.info.queue = args.queue
    args.info.infoservice = infosys

    # ## THIS is actually for tests and redundant - the pilot.info.infosys should be used
    # args.infoservice = infosys  # ??

    # check if queue is ACTIVE
    if infosys.queuedata.state != 'ACTIVE':
        logger.critical('specified queue is NOT ACTIVE: %s -- aborting' % infosys.queuedata.name)
        raise PilotException("Panda Queue is NOT ACTIVE")

    # do we need explicit varible declaration (queuedata)?
    # same as args.location.infoservice.queuedata
    #args.location.queuedata = infosys.queuedata

    # do we need explicit varible declaration (Experiment site name)?
    # same as args.location.infoservice.queuedata.site
    #args.location.site = infosys.queuedata.site

    # do we need explicit varible declaration (storages_info)?
    # same as args.location.infoservice.storages_info
    #args.location.storages_info = infosys.storages_info

    # find all enabled storages at site
    # (iteritems: this function is Python 2 only)
    args.info.storages = [ddm for ddm, dat in infosys.storages_info.iteritems()
                          if dat.site == infosys.queuedata.site]
    #args.info.sites_info = infosys.sites_info

    logger.info('queue: %s' % args.info.queue)
def resolve_replica(self, fspec, primary_schemas=None, allowed_schemas=None):
    """
    Resolve input replica first according to `primary_schemas`,
    if not found then look up within `allowed_schemas`

    :param fspec: input `FileSpec` objects
    :param allowed_schemas: list of allowed schemas or any if None
    :return: dict(surl, ddmendpoint, pfn)
    :raise PilotException: if replica not found
    """
    if not fspec.replicas:
        self.logger.warning('resolve_replicas() recevied no fspec.replicas')
        return

    # [None] acts as a wildcard entry for get_preferred_replica()
    allowed_schemas = allowed_schemas or [None]
    replica = None

    # fspec.replicas is an ordered sequence of (ddmendpoint, replicas) pairs;
    # the first endpoint yielding a matching replica wins
    for ddmendpoint, replicas in fspec.replicas:
        if not replicas:  # ignore ddms with no replicas
            continue
        if primary_schemas:  ## look up primary schemas if requested
            replica = self.get_preferred_replica(replicas, primary_schemas)
        if not replica:
            replica = self.get_preferred_replica(replicas, allowed_schemas)
        if replica:
            # prefer SRM protocol for surl -- to be verified; fall back to the first replica
            surl = self.get_preferred_replica(replicas, ['srm']) or replicas[0]
            self.logger.info("[stage-in] surl (srm replica) from Rucio: pfn=%s, ddmendpoint=%s" % (surl, ddmendpoint))
            break

    if not replica:  # replica not found
        schemas = 'any' if not allowed_schemas[0] else ','.join(allowed_schemas)
        error = 'Failed to find replica for input file=%s, allowed_schemas=%s, fspec=%s' % (fspec.lfn, schemas, fspec)
        self.logger.error("resolve_replica: %s" % error)
        raise PilotException(error, code=ErrorCodes.REPLICANOTFOUND)

    # surl/ddmendpoint are bound by the loop iteration that found the replica
    return {'surl': surl, 'ddmendpoint': ddmendpoint, 'pfn': replica}
def resolve_surl(fspec, protocol, ddmconf, **kwargs):
    """
    Get final destination SURL for file to be transferred to Objectstore
    Can be customized at the level of specific copytool

    :param protocol: suggested protocol
    :param ddmconf: full ddm storage data
    :param fspec: file spec data
    :return: dictionary {'surl': surl}
    """
    ddm = ddmconf.get(fspec.ddmendpoint)
    if not ddm:
        raise PilotException('failed to resolve ddmendpoint by name=%s' % fspec.ddmendpoint)

    # expand the #{pandaid} placeholder in the dataset name (empty when unset)
    dataset = fspec.dataset.replace("#{pandaid}", os.environ['PANDAID']) if fspec.dataset else ""

    remote_path = os.path.join(protocol.get('path', ''), dataset)

    # pilot ID is passed by the envvar GTAG
    # try:
    #     rprotocols = ddm.rprotocols
    #     logger.debug('ddm.rprotocols=%s' % rprotocols)
    #     if "http_access" in rprotocols:
    #         http_access = rprotocols["http_access"]
    #         os.environ['GTAG'] = http_access + os.path.join(remote_path, config.Pilot.pilotlog)
    #         logger.debug('http_access=%s' % http_access)
    # except Exception:
    #     logger.warning("Failed in get 'http_access' in ddm.rprotocols")

    surl = protocol.get('endpoint', '') + remote_path
    logger.info('For GCS bucket, set surl=%s', surl)

    # example:
    # protocol = {u'path': u'/atlas-eventservice', u'endpoint': u's3://s3.cern.ch:443/', u'flavour': u'AWS-S3-SSL', u'id': 175}
    # surl = 's3://s3.cern.ch:443//atlas-eventservice/EventService_premerge_24706191-5013009653-24039149400-322-5.tar'
    return {'surl': surl}
def require_protocols(self, files, copytool, activity):
    """
    Populates fspec.protocols and fspec.turl for each entry in `files` according to preferred fspec.ddm_activity

    :param files: list of `FileSpec` objects
    :param activity: str or ordered list of transfer activity names to resolve acopytools related data
    :return: None
    :raise: PilotException if no protocol can be resolved for a file
    """
    allowed_schemas = getattr(copytool, 'allowed_schemas', None)

    if self.infosys and self.infosys.queuedata:
        copytool_name = copytool.__name__.rsplit('.', 1)[-1]
        allowed_schemas = self.infosys.queuedata.resolve_allowed_schemas(activity, copytool_name) or allowed_schemas

    files = self.resolve_protocols(files)
    ddmconf = self.infosys.resolve_storage_data()

    for fspec in files:
        protocols = self.resolve_protocol(fspec, allowed_schemas)
        if not protocols:  # no protocols found
            error = 'Failed to resolve protocol for file=%s, allowed_schemas=%s, fspec=%s' % (fspec.lfn, allowed_schemas, fspec)
            self.logger.error("resolve_protocol: %s" % error)
            raise PilotException(error, code=ErrorCodes.NOSTORAGEPROTOCOL)

        # take first available protocol for copytool: FIX ME LATER if need (do iterate over all allowed protocols?)
        protocol = protocols[0]

        # bug fix: format arguments were swapped (lfn=%s received the protocol dict)
        self.logger.info("Resolved protocol to be used for transfer lfn=%s: data=%s" % (fspec.lfn, protocol))

        # allow the copytool to override SURL construction
        resolve_surl = getattr(copytool, 'resolve_surl', None)
        if not callable(resolve_surl):
            resolve_surl = self.resolve_surl

        r = resolve_surl(fspec, protocol, ddmconf)  ## pass ddmconf for possible custom look up at the level of copytool
        if r.get('surl'):
            fspec.turl = r['surl']
        if r.get('ddmendpoint'):
            fspec.ddmendpoint = r['ddmendpoint']
def resolve_surl(fspec, protocol, ddmconf, **kwargs):
    """
    Build the final destination SURL for a file to be transferred to Objectstore.

    Can be customized at the level of specific copytool.

    :param fspec: file spec data.
    :param protocol: suggested protocol (dict with 'endpoint', 'path').
    :param ddmconf: full ddm storage data.
    :return: dictionary {'surl': surl}.
    :raises PilotException: if the ddmendpoint cannot be resolved from ddmconf.
    """
    # the panda queue name becomes a component of the remote path; fall back to an
    # empty component when infosys is unavailable or reports no queue name
    try:
        pandaqueue = infosys.pandaqueue or ""
    except Exception:
        pandaqueue = ""

    if not ddmconf.get(fspec.ddmendpoint):
        raise PilotException('failed to resolve ddmendpoint by name=%s' % fspec.ddmendpoint)

    # substitute the panda job id placeholder in the dataset name, if a dataset is set
    dataset = fspec.dataset or ""
    if dataset:
        dataset = dataset.replace("#{pandaid}", os.environ['PANDAID'])

    surl = protocol.get('endpoint', '') + os.path.join(protocol.get('path', ''), pandaqueue, dataset)
    logger.info('For GCS bucket, set surl=%s', surl)
    # example:
    # protocol = {u'path': u'/atlas-eventservice', u'endpoint': u's3://s3.cern.ch:443/', u'flavour': u'AWS-S3-SSL', u'id': 175}
    # surl = 's3://s3.cern.ch:443//atlas-eventservice/EventService_premerge_24706191-5013009653-24039149400-322-5.tar'
    return {'surl': surl}
def transfer(self, files, activity=['pw'], **kwargs):  # noqa: C901
    """
    Automatically stage passed files using copy tools related to given `activity`.

    :param files: list of `FileSpec` objects.
    :param activity: list of activity names used to determine appropriate copytool (prioritized list).
    :param kwargs: extra kwargs to be passed to copytool transfer handler.
    :raise: PilotException in case of controlled error.
    :return: output of copytool transfers (to be clarified).
    """
    if isinstance(activity, basestring):
        activity = [activity]

    result, errors = None, []
    avail_activity = False

    for act in activity:
        copytools = self.acopytools.get(act)
        storages = self.astorages.get(act)
        if not copytools:
            logger.warn("No available copytools for activity %s" % act)
            continue
        if act in ['pw', 'pls', 'es_events', 'es_failover'] and not storages:
            # for write activity, if corresponding storages are not defined, should use different activity
            logger.warn("Failed to find corresponding astorages for writing activity(%s), will try next activity" % act)
            continue

        storage = storages[0] if storages else None
        avail_activity = True

        for name in copytools:
            try:
                if name not in self.copytool_modules:
                    raise PilotException('passed unknown copytool with name=%s .. skipped' % name)
                module = self.copytool_modules[name]['module_name']
                logger.info('Trying to use copytool=%s for activity=%s' % (name, act))
                copytool = __import__('pilot.copytool.%s' % module, globals(), locals(), [module], -1)
            except PilotException as e:
                errors.append(e)
                logger.debug('Error: %s' % e)
                continue
            except Exception as e:
                logger.warning('Failed to import copytool module=%s, error=%s' % (module, e))
                logger.debug('Error: %s' % e)
                continue

            try:
                result = self.transfer_files(copytool, files, act, storage, **kwargs)
            except PilotException as e:  # fix: was legacy 'except PilotException, e', inconsistent with the rest of this function
                errors.append(e)
                logger.debug('Error: %s' % e)
            except Exception as e:
                logger.warning('Failed to transfer files using copytool=%s .. skipped; error=%s' % (copytool, e))
                logger.error(traceback.format_exc())
                errors.append(e)

            # fix: the original contained a duplicated copy of this post-transfer block (copy/paste garble);
            # it is deduplicated here. A missing-output-file error is fatal and aborts all further attempts.
            if errors and isinstance(errors[-1], PilotException) and errors[-1].get_error_code() == ErrorCodes.MISSINGOUTPUTFILE:
                raise errors[-1]
            if result:
                break
            else:
                logger.warn("Failed to transfer files using activity(%s) copytool(%s) with error=%s" % (act, name, errors))

        if result:
            break
        else:
            logger.warn("Failed to transfer files using activity(%s) with copytools(%s)" % (act, copytools))

    if not avail_activity:
        raise PilotException('Not available activity with both acopytools and astorages defined')

    if not result:
        raise PilotException('Failed to transfer files with activities %s' % (activity))

    return result


class StageInESClient(StagingESClient, StageInClient):

    def process_storage_id(self, files):
        """If storage_id is specified, replace ddmendpoint by parsing storage_id."""
        for fspec in files:
            if fspec.storage_token:
                storage_id, path_convention = fspec.get_storage_id_and_path_convention()
                # NOTE(review): the remainder of this method is not visible in this chunk --
                # the body appears truncated here; confirm against the full source file.
def get_command(job, xdata, queue, script, eventtype, localsite, remotesite, external_dir, label='stage-in', container_type='container'):
    """
    Get the middleware container execution command.

    Note: this function is tailor made for stage-in/out.

    :param job: job object.
    :param xdata: list of FileSpec objects.
    :param queue: queue name (string).
    :param script: name of stage-in/out script (string).
    :param eventtype: event type passed to the script (string).
    :param localsite: local site name passed to the script (string).
    :param remotesite: remote site name passed to the script (string).
    :param external_dir: input or output files directory (string).
    :param label: optional 'stage-[in|out]' (string).
    :param container_type: optional 'container/bash' (string).
    :return: stage-in/out command (string).
    :raises PilotException: for stage-in/out related failures.
    """
    if label == 'stage-out':
        filedata_dictionary = get_filedata_strings(xdata)
    else:
        filedata_dictionary = get_filedata(xdata)

    # write file data to file
    # NOTE(review): the stage-in dictionary path is used for both labels -- confirm whether
    # stage-out should use a dedicated config entry instead
    try:
        status = write_json(path.join(job.workdir, config.Container.stagein_replica_dictionary), filedata_dictionary)
    except Exception as exc:
        diagnostics = 'exception caught in get_command(): %s' % exc
        logger.warning(diagnostics)
        raise PilotException(diagnostics)
    else:
        if not status:
            diagnostics = 'failed to write replica dictionary to file'
            logger.warning(diagnostics)
            raise PilotException(diagnostics)

    # copy pilot source into container directory, unless it is already there
    diagnostics = copy_pilot_source(job.workdir)
    if diagnostics:
        raise PilotException(diagnostics)

    final_script_path = path.join(job.workdir, script)
    # bug fix: environ.get('PYTHONPATH') returns None when the variable is unset,
    # which made the string concatenation raise TypeError - default to ''
    environ['PYTHONPATH'] = environ.get('PYTHONPATH', '') + ':' + job.workdir
    script_path = path.join('pilot/scripts', script)
    full_script_path = path.join(job.workdir, script_path)  # simplified: the extra nested path.join() was a no-op
    copy(full_script_path, final_script_path)

    if container_type == 'container':
        # correct the path when containers have been used
        final_script_path = path.join('.', script)
        workdir = '/srv'
    else:
        # for container_type=bash we need to add the rucio setup
        pilot_user = environ.get('PILOT_USER', 'generic').lower()
        user = __import__('pilot.user.%s.container' % pilot_user, globals(), locals(), [pilot_user], 0)  # Python 2/3
        try:
            final_script_path = user.get_middleware_container_script('', final_script_path, asetup=True)
        except PilotException:
            final_script_path = 'python %s' % final_script_path
        workdir = job.workdir

    cmd = "%s -d -w %s -q %s --eventtype=%s --localsite=%s --remotesite=%s --produserid=\"%s\" --jobid=%s" % \
        (final_script_path, workdir, queue, eventtype, localsite, remotesite, job.produserid.replace(' ', '%20'), job.jobid)

    if label == 'stage-in':
        cmd += " --eventservicemerge=%s --usepcache=%s --usevp=%s --replicadictionary=%s" % \
            (job.is_eventservicemerge, job.infosys.queuedata.use_pcache, job.use_vp, config.Container.stagein_replica_dictionary)
        if external_dir:
            cmd += ' --inputdir=%s' % external_dir
    else:  # stage-out
        cmd += ' --lfns=%s --scopes=%s --datasets=%s --ddmendpoints=%s --guids=%s' % \
            (filedata_dictionary['lfns'], filedata_dictionary['scopes'], filedata_dictionary['datasets'],
             filedata_dictionary['ddmendpoints'], filedata_dictionary['guids'])
        if external_dir:
            cmd += ' --outputdir=%s' % external_dir

    cmd += ' --taskid=%s' % job.taskid
    cmd += ' --jobdefinitionid=%s' % job.jobdefinitionid
    cmd += ' --catchall=%s' % job.infosys.queuedata.catchall

    if container_type == 'bash':
        cmd += '\nexit $?'

    return cmd
def copy_in(files, **kwargs):
    """
    Download given files using the lsm-get command.

    :param files: list of `FileSpec` objects.
    :raise: PilotException in case of controlled error.
    :return: files `FileSpec` object.
    """
    exit_code = 0
    stdout = ""
    stderr = ""

    copytools = kwargs.get('copytools') or []
    copysetup = get_copysetup(copytools, 'lsm')
    trace_report = kwargs.get('trace_report')
    #allow_direct_access = kwargs.get('allow_direct_access')

    # note, env vars might be unknown inside middleware containers, if so get the value already in the trace report
    localsite = os.environ.get('RUCIO_LOCAL_SITE_ID', trace_report.get_value('localSite'))
    for fspec in files:
        # update the trace report for this file; fall back to the ddmendpoint when no local site id is known
        localsite = localsite if localsite else fspec.ddmendpoint
        trace_report.update(localSite=localsite, remoteSite=fspec.ddmendpoint, filesize=fspec.filesize)
        trace_report.update(filename=fspec.lfn, guid=fspec.guid.replace('-', ''))
        trace_report.update(scope=fspec.scope, dataset=fspec.dataset)

        # continue loop for files that are to be accessed directly  ## TO BE DEPRECATED (anisyonk)
        #if fspec.is_directaccess(ensure_replica=False) and allow_direct_access and fspec.accessmode == 'direct':
        #    fspec.status_code = 0
        #    fspec.status = 'remote_io'
        #    trace_report.update(url=fspec.turl, clientState='FOUND_ROOT', stateReason='direct_access')
        #    trace_report.send()
        #    continue

        trace_report.update(catStart=time())

        # destination directory: per-file workdir takes precedence over the caller-supplied one
        dst = fspec.workdir or kwargs.get('workdir') or '.'
        #timeout = get_timeout(fspec.filesize)

        source = fspec.turl
        destination = os.path.join(dst, fspec.lfn)

        logger.info("transferring file %s from %s to %s", fspec.lfn, source, destination)

        # perform the actual transfer via the lsm wrapper
        exit_code, stdout, stderr = move(source, destination, dst_in=True, copysetup=copysetup)

        if exit_code != 0:
            # a failed transfer aborts the whole loop: mark the file failed, send the trace and raise
            logger.warning("transfer failed: exit code = %d, stdout = %s, stderr = %s", exit_code, stdout, stderr)
            error = resolve_common_transfer_errors(stderr, is_stagein=True)
            fspec.status = 'failed'
            fspec.status_code = error.get('rcode')
            trace_report.update(clientState=error.get('state') or 'STAGEIN_ATTEMPT_FAILED',
                                stateReason=error.get('error'), timeEnd=time())
            trace_report.send()
            raise PilotException(error.get('error'), code=error.get('rcode'), state=error.get('state'))

        # verify checksum; compare local checksum with catalog value (fspec.checksum), use same checksum type
        state, diagnostics = verify_catalog_checksum(fspec, destination)
        if diagnostics != "":
            trace_report.update(clientState=state or 'STAGEIN_ATTEMPT_FAILED', stateReason=diagnostics,
                                timeEnd=time())
            trace_report.send()
            raise PilotException(diagnostics, code=fspec.status_code, state=state)

        fspec.status_code = 0
        fspec.status = 'transferred'
        trace_report.update(clientState='DONE', stateReason='OK', timeEnd=time())
        trace_report.send()

    # for testing kill signals
    #import signal
    #os.kill(os.getpid(), signal.SIGSEGV)

    return files
def copy_out(files, **kwargs):
    """
    Upload given files using lsm copytool.

    :param files: list of `FileSpec` objects.
    :raise: PilotException in case of controlled error.
    :return: files `FileSpec` object.
    """
    copytools = kwargs.get('copytools') or []
    copysetup = get_copysetup(copytools, 'lsm')
    trace_report = kwargs.get('trace_report')
    ddmconf = kwargs.get('ddmconf', None)
    if not ddmconf:
        raise PilotException("copy_out() failed to resolve ddmconf from function arguments",
                             code=ErrorCodes.STAGEOUTFAILED, state='COPY_ERROR')

    for fspec in files:
        trace_report.update(scope=fspec.scope, dataset=fspec.dataset, url=fspec.surl, filesize=fspec.filesize)
        trace_report.update(catStart=time(), filename=fspec.lfn, guid=fspec.guid.replace('-', ''))

        # resolve token value from fspec.ddmendpoint
        ddm = ddmconf.get(fspec.ddmendpoint)
        token = ddm.token
        if not token:
            diagnostics = "copy_out() failed to resolve token value for ddmendpoint=%s" % (fspec.ddmendpoint)
            trace_report.update(clientState='STAGEOUT_ATTEMPT_FAILED', stateReason=diagnostics, timeEnd=time())
            trace_report.send()
            raise PilotException(diagnostics, code=ErrorCodes.STAGEOUTFAILED, state='COPY_ERROR')

        # source directory: per-file workdir takes precedence over the caller-supplied one
        src = fspec.workdir or kwargs.get('workdir') or '.'
        #timeout = get_timeout(fspec.filesize)
        source = os.path.join(src, fspec.lfn)
        destination = fspec.turl

        # checksum has been calculated in the previous step - transfer_files() in api/data
        # note: pilot is handing over checksum to the command - which will/should verify it after the transfer
        checksum = "adler32:%s" % fspec.checksum.get('adler32')

        # define the command options
        opts = {'--size': fspec.filesize,
                '-t': token,
                '--checksum': checksum,
                '--guid': fspec.guid}
        try:
            opts = " ".join(["%s %s" % (k, v) for (k, v) in opts.iteritems()])  # Python 2
        except Exception:
            opts = " ".join(["%s %s" % (k, v) for (k, v) in list(opts.items())])  # Python 3

        logger.info("transferring file %s from %s to %s", fspec.lfn, source, destination)

        nretries = 1  # input parameter to function?
        for retry in range(nretries):
            exit_code, stdout, stderr = move(source, destination, dst_in=False, copysetup=copysetup, options=opts)

            if exit_code != 0:
                if stderr == "":
                    stderr = stdout
                error = resolve_common_transfer_errors(stderr, is_stagein=False)
                fspec.status = 'failed'
                # bug fix: resolve_common_transfer_errors() reports the error code under 'rcode'
                # (see the other copytools in this file), not 'exit_code' - the old lookup always yielded None
                fspec.status_code = error.get('rcode')
                trace_report.update(clientState=error.get('state', None) or 'STAGEOUT_ATTEMPT_FAILED',
                                    stateReason=error.get('error', 'unknown error'), timeEnd=time())
                trace_report.send()
                raise PilotException(error.get('error'), code=error.get('rcode'), state=error.get('state'))
            else:  # all successful
                logger.info('all successful')
                break

        fspec.status_code = 0
        fspec.status = 'transferred'
        trace_report.update(clientState='DONE', stateReason='OK', timeEnd=time())
        trace_report.send()

    return files
def copy_in(files, **kwargs):
    """
    Download given files using gfal-copy command.

    :param files: list of `FileSpec` objects.
    :raise: PilotException in case of controlled error.
    :raise: StageInFailure if the GFAL2 tools are not available.
    :return: files `FileSpec` object.
    """
    #allow_direct_access = kwargs.get('allow_direct_access') or False
    trace_report = kwargs.get('trace_report')

    # fail fast when the gfal tooling is missing from the environment
    if not check_for_gfal():
        raise StageInFailure("No GFAL2 tools found")

    localsite = os.environ.get('RUCIO_LOCAL_SITE_ID', os.environ.get('DQ2_LOCAL_SITE_ID', None))
    for fspec in files:
        # update the trace report for this file; fall back to the ddmendpoint when no local site id is known
        localsite = localsite if localsite else fspec.ddmendpoint
        trace_report.update(localSite=localsite, remoteSite=fspec.ddmendpoint, filesize=fspec.filesize)
        trace_report.update(filename=fspec.lfn, guid=fspec.guid.replace('-', ''))
        trace_report.update(scope=fspec.scope, dataset=fspec.dataset)

        # continue loop for files that are to be accessed directly  ## TO BE DEPRECATED (should be applied at top level) (anisyonk)
        #if fspec.is_directaccess(ensure_replica=False) and allow_direct_access and fspec.accessmode == 'direct':
        #    fspec.status_code = 0
        #    fspec.status = 'remote_io'
        #    trace_report.update(url=fspec.turl, clientState='FOUND_ROOT', stateReason='direct_access')
        #    trace_report.send()
        #    continue

        trace_report.update(catStart=time())

        # destination directory: per-file workdir takes precedence over the caller-supplied one
        dst = fspec.workdir or kwargs.get('workdir') or '.'
        timeout = get_timeout(fspec.filesize)
        source = fspec.turl
        destination = "file://%s" % os.path.abspath(os.path.join(dst, fspec.lfn))

        # assemble the gfal-copy command; -f forces overwrite, -t sets the transfer timeout
        cmd = ['gfal-copy --verbose -f', ' -t %s' % timeout]

        if fspec.checksum:
            # hand the catalog checksum to gfal-copy for verification (-K type:value)
            cmd += ['-K', '%s:%s' % list(fspec.checksum.items())[0]]  # Python 2/3

        cmd += [source, destination]

        rcode, stdout, stderr = execute(" ".join(cmd), **kwargs)

        if rcode:  ## error occurred
            # NOTE(review): rcode is the process exit code; comparing it against errno values
            # assumes the wrapper maps timeouts onto ETIMEDOUT/ETIME - confirm against execute()
            if rcode in [errno.ETIMEDOUT, errno.ETIME]:
                error = {'rcode': ErrorCodes.STAGEINTIMEOUT,
                         'state': 'CP_TIMEOUT',
                         'error': 'Copy command timed out: %s' % stderr}
            else:
                error = resolve_common_transfer_errors(stdout + stderr, is_stagein=True)
            fspec.status = 'failed'
            fspec.status_code = error.get('rcode')
            trace_report.update(clientState=error.get('state') or 'STAGEIN_ATTEMPT_FAILED',
                                stateReason=error.get('error'), timeEnd=time())
            trace_report.send()
            raise PilotException(error.get('error'), code=error.get('rcode'), state=error.get('state'))

        fspec.status_code = 0
        fspec.status = 'transferred'
        trace_report.update(clientState='DONE', stateReason='OK', timeEnd=time())
        trace_report.send()

    return files