def create_core_dump(pid=None, workdir=None):
    """
    Create a core dump for the given process and copy it to the work directory.

    Runs gdb with generate-core-file against the given pid, locates the
    resulting core file and copies it to workdir. Best effort: all failures
    are logged as warnings only.

    :param pid: process id (int).
    :param workdir: work directory (string).
    :return: None.
    """
    if not pid or not workdir:
        logger.warning('cannot create core file since pid or workdir is unknown')
        return

    cmd = 'gdb --pid %d -ex \'generate-core-file\'' % pid
    exit_code, stdout, stderr = execute(cmd)
    if exit_code:
        logger.warning('failed to execute command: %s, stdout+err=%s', cmd, stdout + stderr)
        return

    core_path = locate_core_file(pid=pid)
    if not core_path:
        return

    try:
        copy(core_path, workdir)
    except Exception as error:
        logger.warning('failed to copy core file: %s', error)
    else:
        logger.debug('copied core dump to workdir')
def post_memory_monitor_action(job):
    """
    Perform post action items for memory monitor.

    Waits (with short retries) for the memory monitor summary file to appear
    in the job work directory, then copies it to $PILOT_HOME. A failed copy
    is logged as a warning only.

    :param job: job object.
    :return: None.
    """
    nap = 3
    maxretry = 20
    summary_path = os.path.join(job.workdir, get_memory_monitor_summary_filename())
    destination = os.environ.get('PILOT_HOME')

    # poll for the summary file; same number of checks as the original
    # while-loop (attempts 0..maxretry inclusive)
    for attempt in range(maxretry + 1):
        if os.path.exists(summary_path):
            break
        logger.info(
            "taking a short nap (%d s) to allow the memory monitor to finish writing to the summary file (#%d/#%d)" % (nap, attempt, maxretry))
        time.sleep(nap)

    try:
        copy(summary_path, destination)
    except Exception as e:
        logger.warning('failed to copy memory monitor output: %s' % e)
def download_transform(url, transform_name, workdir):
    """
    Download the transform from the given url.

    If $HARVESTER_WORKDIR is set, the transform is copied from that directory
    instead and the curl download is skipped entirely.

    :param url: download URL with path to transform (string).
    :param transform_name: trf name (string).
    :param workdir: work directory (string).
    :return: status (Boolean), diagnostics (string).
    """
    status = False
    diagnostics = ""
    path = os.path.join(workdir, transform_name)
    cmd = 'curl -sS \"%s\" > %s' % (url, path)
    trial = 1
    max_trials = 3

    # test if $HARVESTER_WORKDIR is set
    harvester_workdir = os.environ.get('HARVESTER_WORKDIR')
    if harvester_workdir is not None:
        # skip curl by setting max_trials = 0
        max_trials = 0
        source_path = os.path.join(harvester_workdir, transform_name)
        try:
            copy(source_path, path)
        except Exception as error:
            status = False
            diagnostics = "Failed to copy file %s to %s : %s" % (source_path, path, error)
            logger.error(diagnostics)
        else:
            status = True

    # try to download the trf a maximum of 3 times
    while trial <= max_trials:
        logger.info("executing command [trial %d/%d]: %s" % (trial, max_trials, cmd))
        exit_code, stdout, stderr = execute(cmd, mute=True)
        stdout = stdout or "(None)"
        if exit_code == 0:
            logger.info("curl command returned: %s" % stdout)
            status = True
            break
        # analyze exit code / output
        diagnostics = "curl command failed: %d, %s, %s" % (exit_code, stdout, stderr)
        logger.warning(diagnostics)
        if trial == max_trials:
            logger.fatal('could not download transform: %s' % stdout)
            status = False
            break
        logger.info("will try again after 60 s")
        sleep(60)
        trial += 1

    return status, diagnostics
def copy_output(job, job_scratch_dir, work_dir):
    """
    Copy the job output files from the scratch directory to the access point.

    After a successful copy the current working directory is changed to
    work_dir. The elapsed copy time is always logged.

    :param job: job object (job.output_files is iterated for output file names).
    :param job_scratch_dir: scratch directory the outputs were written to (string).
    :param work_dir: destination (access point) directory (string).
    :return: 0.
    :raises FileHandlingFailure: if the copy from scratch dir fails.
    """
    cp_start = time.time()
    try:
        for outfile in job.output_files:
            # bug fix: check the actual source path; the previous bare-name
            # existence check was resolved against the current working
            # directory, which is not necessarily the scratch directory
            source = os.path.join(job_scratch_dir, outfile)
            if os.path.exists(source):
                copy(source, os.path.join(work_dir, outfile))
        os.chdir(work_dir)
    except IOError:
        raise FileHandlingFailure("Copy from scratch dir to access point failed")
    finally:
        cp_time = time.time() - cp_start
        logger.info("Copy of outputs took: {0} sec.".format(cp_time))
    return 0
def get_analysis_trf(transform, workdir):
    """
    Prepare to download the user analysis transform with curl.

    The function will verify the download location from a known list of hosts.

    :param transform: full trf path (url) (string).
    :param workdir: work directory (string).
    :return: exit code (int), diagnostics (string), transform_name (string)
    """
    ec = 0
    diagnostics = ""

    # test if $HARVESTER_WORKDIR is set
    harvester_workdir = os.environ.get('HARVESTER_WORKDIR')
    if harvester_workdir is not None:
        # stage in any harvester-provided job option tarballs (best effort)
        search_pattern = "%s/jobO.*.tar.gz" % harvester_workdir
        logger.debug("search_pattern - %s" % search_pattern)
        for jobopt_file in glob.glob(search_pattern):
            logger.debug("jobopt_file = %s workdir = %s" % (jobopt_file, workdir))
            try:
                copy(jobopt_file, workdir)
            except Exception as e:
                logger.error("could not copy file %s to %s : %s" % (jobopt_file, workdir, e))

    # extract the transform name from the url
    if '/' in transform:
        transform_name = transform.split('/')[-1]
    else:
        logger.warning('did not detect any / in %s (using full transform name)' % transform)
        transform_name = transform

    # is the command already available? (e.g. if already downloaded by a
    # preprocess/main process step)
    if os.path.exists(os.path.join(workdir, transform_name)):
        logger.info('script %s is already available - no need to download again' % transform_name)
        return ec, diagnostics, transform_name

    # verify the base URL
    original_base_url = ""
    for base_url in get_valid_base_urls():
        if transform.startswith(base_url):
            original_base_url = base_url
            break

    if original_base_url == "":
        diagnostics = "invalid base URL: %s" % transform
        return errors.TRFDOWNLOADFAILURE, diagnostics, ""

    # try to download from the required location, if not - switch to backup
    status = False
    for base_url in get_valid_base_urls(order=original_base_url):
        trf = re.sub(original_base_url, base_url, transform)
        logger.debug("attempting to download script: %s" % trf)
        status, diagnostics = download_transform(trf, transform_name, workdir)
        if status:
            break

    if not status:
        return errors.TRFDOWNLOADFAILURE, diagnostics, ""

    logger.info("successfully downloaded script")
    path = os.path.join(workdir, transform_name)
    logger.debug("changing permission of %s to 0o755" % path)
    try:
        os.chmod(path, 0o755)  # Python 2/3
    except Exception as e:
        diagnostics = "failed to chmod %s: %s" % (transform_name, e)
        return errors.CHMODTRF, diagnostics, ""

    return ec, diagnostics, transform_name
def get_command(job, xdata, queue, script, eventtype, localsite, remotesite, external_dir, label='stage-in', container_type='container'):
    """
    Get the middleware container execution command.

    Note: this function is tailor made for stage-in/out.

    :param job: job object.
    :param xdata: list of FileSpec objects.
    :param queue: queue name (string).
    :param script: name of stage-in/out script (string).
    :param eventtype:
    :param localsite:
    :param remotesite:
    :param external_dir: input or output files directory (string).
    :param label: optional 'stage-[in|out]' (string).
    :param container_type: optional 'container/bash' (string).
    :return: stage-in/out command (string).
    :raises PilotException: for stage-in/out related failures
    """
    if label == 'stage-out':
        filedata_dictionary = get_filedata_strings(xdata)
    else:
        filedata_dictionary = get_filedata(xdata)

    # write file data to file
    # NOTE(review): the stage-in replica dictionary file name is used for both
    # stage-in and stage-out here - confirm this is intended
    try:
        status = write_json(path.join(job.workdir, config.Container.stagein_replica_dictionary), filedata_dictionary)
    except Exception as exc:
        diagnostics = 'exception caught in get_command(): %s' % exc
        logger.warning(diagnostics)
        raise PilotException(diagnostics)
    else:
        if not status:
            diagnostics = 'failed to write replica dictionary to file'
            logger.warning(diagnostics)
            raise PilotException(diagnostics)

    # copy pilot source into container directory, unless it is already there
    diagnostics = copy_pilot_source(job.workdir)
    if diagnostics:
        raise PilotException(diagnostics)

    final_script_path = path.join(job.workdir, script)
    # bug fix: environ.get('PYTHONPATH') returns None when PYTHONPATH is not
    # set, which made the '+' concatenation raise TypeError - default to ''
    environ['PYTHONPATH'] = environ.get('PYTHONPATH', '') + ':' + job.workdir
    script_path = path.join('pilot/scripts', script)
    # redundant nested path.join(path.join(...)) removed
    full_script_path = path.join(job.workdir, script_path)
    copy(full_script_path, final_script_path)

    if container_type == 'container':
        # correct the path when containers have been used
        final_script_path = path.join('.', script)
        workdir = '/srv'
    else:
        # for container_type=bash we need to add the rucio setup
        pilot_user = environ.get('PILOT_USER', 'generic').lower()
        user = __import__('pilot.user.%s.container' % pilot_user, globals(), locals(), [pilot_user], 0)  # Python 2/3
        try:
            final_script_path = user.get_middleware_container_script('', final_script_path, asetup=True)
        except PilotException:
            final_script_path = 'python %s' % final_script_path
        workdir = job.workdir

    cmd = "%s -d -w %s -q %s --eventtype=%s --localsite=%s --remotesite=%s --produserid=\"%s\" --jobid=%s" % \
        (final_script_path, workdir, queue, eventtype, localsite, remotesite,
         job.produserid.replace(' ', '%20'), job.jobid)

    if label == 'stage-in':
        cmd += " --eventservicemerge=%s --usepcache=%s --usevp=%s --replicadictionary=%s" % \
            (job.is_eventservicemerge, job.infosys.queuedata.use_pcache, job.use_vp,
             config.Container.stagein_replica_dictionary)
        if external_dir:
            cmd += ' --inputdir=%s' % external_dir
    else:  # stage-out
        cmd += ' --lfns=%s --scopes=%s --datasets=%s --ddmendpoints=%s --guids=%s' % \
            (filedata_dictionary['lfns'], filedata_dictionary['scopes'], filedata_dictionary['datasets'],
             filedata_dictionary['ddmendpoints'], filedata_dictionary['guids'])
        if external_dir:
            cmd += ' --outputdir=%s' % external_dir

    cmd += ' --taskid=%s' % job.taskid
    cmd += ' --jobdefinitionid=%s' % job.jobdefinitionid
    cmd += ' --catchall=%s' % job.infosys.queuedata.catchall

    if container_type == 'bash':
        cmd += '\nexit $?'

    return cmd