def update_configuration(self):
    configfile = os.path.join(self.config.workdir, 'config.py')
    if self.__last_config_update < os.path.getmtime(configfile):
        try:
            logger.info('updating configuration')
            self.__last_config_update = time.time()
            new_config = imp.load_source('userconfig', configfile).config
            self.config.update(new_config)
            self.config.save()
            util.register_checkpoint(self.config.workdir, 'configuration_check', self.__last_config_update)
        except Exception:
            logger.exception('failed to update configuration:')

    util.PartiallyMutable.purge()
    for method, args in util.PartiallyMutable.changes():
        if method is None:
            continue
        logger.debug("executing callback '{}' with arguments {}".format(method, args))

        attrs = method.split('.')
        call = self
        if attrs[0] not in ['config', 'source']:
            logger.error('invalid registered callback: {}'.format(method))
            continue
        try:
            for attr in attrs:
                call = getattr(call, attr)
            call(*args)
        except Exception:
            logger.exception("caught exception while executing callback '{}' with arguments {}".format(method, args))
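# The loop above resolves each registered callback by walking a dotted
# attribute path rooted at this object ('config' or 'source') and invoking the
# resulting bound method.  A minimal sketch of that getattr walk, with
# hypothetical class and method names chosen only for illustration:
class _Source(object):
    def update_paused(self, flag):
        print("paused set to {0}".format(flag))

class _Owner(object):
    def __init__(self):
        self.source = _Source()

    def dispatch(self, method, args):
        call = self
        for attr in method.split('.'):  # e.g. 'source.update_paused'
            call = getattr(call, attr)
        call(*args)                     # bound method, called with the stored arguments

_Owner().dispatch('source.update_paused', (True,))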
def kill(args):
    logger.info("setting flag to quit at the next checkpoint")

    with open(args.configfile) as configfile:
        config = yaml.load(configfile)

    workdir = config['workdir']
    util.register_checkpoint(workdir, 'KILLED', 'PENDING')

    logger.info("reporting unfinished tasks as Aborted to the dashboard")
    # report unfinished tasks as aborted to the dashboard
    # note: even if lobster didn't terminate gracefully for some reason,
    # "lobster terminate" can still be run afterwards to properly update
    # the tasks as aborted to the dashboard.
    db_path = os.path.join(workdir, "lobster.db")
    db = sqlite3.connect(db_path)
    ids = db.execute("select id from jobs where status=1").fetchall()

    task_id = util.checkpoint(workdir, 'id')
    if task_id:
        for (id,) in ids:
            if config.get('use dashboard', True):
                dash = cmssw.dash.Monitor(task_id)
            else:
                dash = cmssw.dash.DummyMonitor(task_id)
            dash.update_job(id, cmssw.dash.ABORTED)
    else:
        logger.warning("taskid not found: could not report aborted jobs to the dashboard")
def kill(self, config):
    logger.info("setting flag to quit at the next checkpoint")
    logger.debug("the following stack trace doesn't indicate a crash; it's just for debugging purposes.")
    logger.debug("stack:\n{0}".format(''.join(traceback.format_stack())))

    util.register_checkpoint(config.workdir, 'KILLED', 'PENDING')

    if config.elk:
        config.elk.end()
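# Both kill() variants above communicate with the running process purely
# through the checkpoint store: they write KILLED=PENDING, and the main loop
# polls that key and shuts down at the next opportunity.  The real helpers
# live in lobster's util module; a minimal file-backed stand-in (an assumption
# for illustration, not the actual implementation) behaves roughly like this:
import json
import os

def register_checkpoint(workdir, key, value):
    # store or overwrite a single key/value pair for this working directory
    path = os.path.join(workdir, 'checkpoints.json')
    data = json.load(open(path)) if os.path.exists(path) else {}
    data[key] = value
    with open(path, 'w') as f:
        json.dump(data, f)

def checkpoint(workdir, key):
    # return the stored value, or None if it was never registered
    path = os.path.join(workdir, 'checkpoints.json')
    if not os.path.exists(path):
        return None
    return json.load(open(path)).get(key)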
def __init__(self, config, source):
    self.config = config
    self.source = source

    if config.plotdir:
        logger.info('plots in {0} will be updated automatically'.format(config.plotdir))
        if config.foremen_logs:
            logger.info('foremen logs will be included from: {0}'.format(', '.join(config.foremen_logs)))
        self.plotter = Plotter(config)

    self.__last = datetime.datetime.now()

    self.__last_config_update = util.checkpoint(config.workdir, 'configuration_check')
    if not self.__last_config_update:
        self.__last_config_update = time.time()
        util.register_checkpoint(config.workdir, 'configuration_check', self.__last_config_update)
def __init__(self, config):
    util.Timing.__init__(self, 'dash', 'handler', 'updates', 'elk',
                         'transfers', 'cleanup', 'propagate', 'sqlite')

    self.config = config
    self.basedirs = [config.base_directory, config.startup_directory]
    self.workdir = config.workdir
    self._storage = config.storage
    self.statusfile = os.path.join(self.workdir, 'status.json')
    self.siteconf = os.path.join(self.workdir, 'siteconf')

    self.parrot_path = os.path.dirname(util.which('parrot_run'))
    self.parrot_bin = os.path.join(self.workdir, 'bin')
    self.parrot_lib = os.path.join(self.workdir, 'lib')

    self.__algo = Algo(config)
    self.__host = socket.getfqdn()

    try:
        siteconf = loadSiteLocalConfig()
        self.__ce = siteconf.siteName
        self.__se = siteconf.localStageOutPNN()
        self.__frontier_proxy = siteconf.frontierProxies[0]
    except (SiteConfigError, IndexError):
        logger.error("can't load siteconfig, defaulting to hostname")
        self.__ce = socket.getfqdn()
        self.__se = socket.getfqdn()
        try:
            self.__frontier_proxy = os.environ['HTTP_PROXY']
        except KeyError:
            logger.error("can't determine proxy for Frontier via $HTTP_PROXY")
            sys.exit(1)

    try:
        with open('/etc/cvmfs/default.local') as f:
            lines = f.readlines()
    except IOError:
        lines = []

    for l in lines:
        m = re.match(r'\s*CVMFS_HTTP_PROXY\s*=\s*[\'"]?(.*)[\'"]?', l)
        if m:
            self.__cvmfs_proxy = m.group(1).strip("\"'")
            break
    else:
        try:
            self.__cvmfs_proxy = os.environ['HTTP_PROXY']
        except KeyError:
            logger.error("can't determine proxy for CVMFS via $HTTP_PROXY")
            sys.exit(1)

    logger.debug("using {} as proxy for CVMFS".format(self.__cvmfs_proxy))
    logger.debug("using {} as proxy for Frontier".format(self.__frontier_proxy))
    logger.debug("using {} as osg_version".format(self.config.advanced.osg_version))

    util.sendemail("Your Lobster project has started!", self.config)

    self.__taskhandlers = {}
    self.__store = unit.UnitStore(self.config)

    self.__setup_inputs()
    self.copy_siteconf()

    create = not util.checkpoint(self.workdir, 'id')
    if create:
        self.taskid = 'lobster_{0}_{1}'.format(
            self.config.label,
            sha1(str(datetime.datetime.utcnow())).hexdigest()[-16:])
        util.register_checkpoint(self.workdir, 'id', self.taskid)
        shutil.copy(self.config.base_configuration,
                    os.path.join(self.workdir, 'config.py'))
    else:
        self.taskid = util.checkpoint(self.workdir, 'id')
        util.register_checkpoint(self.workdir, 'RESTARTED', str(datetime.datetime.utcnow()))

    if not util.checkpoint(self.workdir, 'executable'):
        # We can actually have more than one exe name (one per task label).
        # Use 'cmsRun' if any of the tasks are of that type, the common
        # command if all tasks execute the same one, and 'noncmsRun'
        # otherwise.  This is used for dashboard executable-name reporting.
        cmsconfigs = [wflow.pset for wflow in self.config.workflows]
        cmds = [wflow.command for wflow in self.config.workflows]
        if any(cmsconfigs):
            exename = 'cmsRun'
        elif all(x == cmds[0] and x is not None for x in cmds):
            exename = cmds[0]
        else:
            exename = 'noncmsRun'
        util.register_checkpoint(self.workdir, 'executable', exename)

    for wflow in self.config.workflows:
        if create and not util.checkpoint(self.workdir, wflow.label):
            wflow.setup(self.workdir, self.basedirs)
            logger.info("querying backend for {0}".format(wflow.label))
            with fs.alternative():
                dataset_info = wflow.dataset.get_info()

            logger.info("registering {0} in database".format(wflow.label))
            self.__store.register_dataset(wflow, dataset_info, wflow.category.runtime)
            util.register_checkpoint(self.workdir, wflow.label, 'REGISTERED')
        elif os.path.exists(os.path.join(wflow.workdir, 'running')):
            for id in self.get_taskids(wflow.label):
                util.move(wflow.workdir, id, 'failed')

    for wflow in self.config.workflows:
        if wflow.parent:
            getattr(self.config.workflows, wflow.parent.label).register(wflow)
            if create:
                total_units = wflow.dataset.total_units * len(wflow.unique_arguments)
                self.__store.register_dependency(wflow.label, wflow.parent.label, total_units)

    if not util.checkpoint(self.workdir, 'sandbox cmssw version'):
        util.register_checkpoint(self.workdir, 'sandbox', 'CREATED')
        versions = set([w.version for w in self.config.workflows])
        if len(versions) == 1:
            util.register_checkpoint(self.workdir, 'sandbox cmssw version', list(versions)[0])

    if self.config.elk:
        if create:
            categories = {wflow.category.name: [] for wflow in self.config.workflows}
            for category in categories:
                for workflow in self.config.workflows:
                    if workflow.category.name == category:
                        categories[category].append(workflow.label)
            self.config.elk.create(categories)
        else:
            self.config.elk.resume()

    self.config.advanced.dashboard.setup(self.config)
    if create:
        self.config.save()
        self.config.advanced.dashboard.register_run()
    else:
        self.config.advanced.dashboard.update_task_status(
            (id_, dash.ABORTED) for id_ in self.__store.reset_units())

    for p in (self.parrot_bin, self.parrot_lib):
        if not os.path.exists(p):
            os.makedirs(p)

    for exe in ('parrot_run', 'chirp', 'chirp_put', 'chirp_get'):
        shutil.copy(util.which(exe), self.parrot_bin)
        subprocess.check_call(["strip", os.path.join(self.parrot_bin, exe)])

    p_helper = os.path.join(os.path.dirname(self.parrot_path), 'lib', 'lib64', 'libparrot_helper.so')
    shutil.copy(p_helper, self.parrot_lib)
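# The proxy discovery above scans /etc/cvmfs/default.local line by line and
# only falls back to $HTTP_PROXY, via the for/else, when no line matches.  On
# a typical (hypothetical) configuration line the regex behaves like this:
import re

line = 'CVMFS_HTTP_PROXY="http://squid.example.org:3128"\n'
m = re.match(r'\s*CVMFS_HTTP_PROXY\s*=\s*[\'"]?(.*)[\'"]?', line)
if m:
    # the greedy group keeps the closing quote, hence the extra strip()
    print(m.group(1).strip("\"'"))  # http://squid.example.org:3128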
def __init__(self, config):
    self.config = config
    self.basedirs = [config['configdir'], config['startdir']]
    self.workdir = config.get('workdir', os.getcwd())
    self.stageout = config.get('stageout location', os.getcwd())
    self.statusfile = os.path.join(self.workdir, 'status.yaml')

    self.parrot_path = os.path.dirname(util.which('parrot_run'))
    self.parrot_bin = os.path.join(self.workdir, 'bin')
    self.parrot_lib = os.path.join(self.workdir, 'lib')

    self.extra_inputs = {}
    self.args = {}
    self.outputs = {}
    self.outputformats = {}
    self.cmds = {}

    chirp_server = config.get('chirp server')
    chirp_root = config.get('chirp root')

    create = not util.checkpoint(self.workdir, 'id') and not self.config.get('merge', False)
    if create:
        self.taskid = 'lobster_{0}_{1}'.format(
            self.config['id'],
            sha1(str(datetime.datetime.utcnow())).hexdigest()[-16:])
        util.register_checkpoint(self.workdir, 'id', self.taskid)
    else:
        self.taskid = util.checkpoint(self.workdir, 'id')
        util.register_checkpoint(self.workdir, 'RESTARTED', str(datetime.datetime.utcnow()))

    self.config = apply_matching(self.config)
    for cfg in self.config['tasks']:
        label = cfg['label']
        self.extra_inputs[label] = map(
            partial(util.findpath, self.basedirs),
            cfg.get('extra inputs', []))
        self.outputs[label] = cfg.get('outputs', [])
        self.args[label] = cfg.get('parameters', [])
        self.outputformats[label] = cfg.get("output format", "{base}_{id}.{ext}")
        self.cmds[label] = cfg.get('cmd')

        taskdir = os.path.join(self.workdir, label)
        stageoutdir = os.path.join(self.stageout, label)
        if create:
            if not os.path.exists(taskdir):
                os.makedirs(taskdir)

            if chirp_root and stageoutdir.startswith(chirp_root):
                target = stageoutdir.replace(chirp_root, '', 1)
                if not chirp.exists(chirp_server, chirp_root, target):
                    chirp.makedirs(chirp_server, chirp_root, target)
            else:
                if not os.path.exists(stageoutdir):
                    os.makedirs(stageoutdir)

            shutil.copy(self.config['filename'], os.path.join(self.workdir, 'lobster_config.yaml'))

    for p in (self.parrot_bin, self.parrot_lib):
        if not os.path.exists(p):
            os.makedirs(p)

    for exe in ('parrot_run', 'chirp', 'chirp_put', 'chirp_get'):
        shutil.copy(util.which(exe), self.parrot_bin)
        subprocess.check_call(["strip", os.path.join(self.parrot_bin, exe)])

    p_helper = os.path.join(os.path.dirname(self.parrot_path), 'lib', 'lib64', 'libparrot_helper.so')
    shutil.copy(p_helper, self.parrot_lib)
def __init__(self, config):
    super(JobProvider, self).__init__(config)

    self.__chirp = self.config.get('stageout server', None)
    self.__sandbox = os.path.join(self.workdir, 'sandbox')

    self.__datasets = {}
    self.__configs = {}
    self.__jobhandlers = {}
    self.__interface = MetaInterface()
    self.__store = jobit.JobitStore(self.config)

    self.__grid_files = [(os.path.join('/cvmfs/grid.cern.ch', x), os.path.join('grid', x)) for x in
                         ['3.2.11-1/external/etc/profile.d/clean-grid-env-funcs.sh',
                          '3.2.11-1/external/etc/profile.d/grid-env-funcs.sh',
                          '3.2.11-1/external/etc/profile.d/grid-env.sh',
                          '3.2.11-1/etc/profile.d/grid-env.sh',
                          '3.2.11-1/glite/bin/voms-proxy-info',
                          '3.2.11-1/glite/lib64/libvomsapi_nog.so.0.0.0',
                          '3.2.11-1/glite/lib64/libvomsapi_nog.so.0',
                          'etc/grid-security/certificates']]

    if self.config.get('use dashboard', False):
        logging.info("using dashboard with task id {0}".format(self.taskid))
        self.__dash = dash.Monitor(self.taskid)
    else:
        self.__dash = dash.DummyMonitor(self.taskid)

    if not util.checkpoint(self.workdir, 'sandbox'):
        blacklist = self.config.get('sandbox blacklist', [])
        sandbox.package(os.environ['LOCALRT'], self.__sandbox, blacklist, self.config.get('recycle sandbox'))
        util.register_checkpoint(self.workdir, 'sandbox', 'CREATED')
        self.__dash.register_run()
    else:
        for id in self.__store.reset_jobits():
            self.__dash.update_job(id, dash.ABORTED)

    for cfg in self.config['tasks']:
        label = cfg['label']
        cfg['basedirs'] = self.basedirs

        cms_config = cfg.get('cmssw config')
        if cms_config:
            self.__configs[label] = os.path.basename(cms_config)

        self.__datasets[label] = cfg.get('dataset', cfg.get('files', ''))

        if cms_config and 'outputs' not in cfg:
            sys.argv = [sys.argv[0]]  # to avoid problems loading configs that use the VarParsing module
            with open(cms_config, 'r') as f:
                source = imp.load_source('cms_config_source', cms_config, f)
                cfg_interface = CfgInterface(source.process)
                if hasattr(cfg_interface.data.GlobalTag.globaltag, 'value'):  # possibility: make this mandatory?
                    cfg['global tag'] = cfg_interface.data.GlobalTag.globaltag.value()
                for m in cfg_interface.data.outputModules:
                    self.outputs[label].append(getattr(cfg_interface.data, m).fileName._value)

        taskdir = os.path.join(self.workdir, label)
        if not util.checkpoint(self.workdir, label):
            if cms_config:
                shutil.copy(util.findpath(self.basedirs, cms_config),
                            os.path.join(taskdir, os.path.basename(cms_config)))

            logging.info("querying backend for {0}".format(label))
            dataset_info = self.__interface.get_info(cfg)

            logging.info("registering {0} in database".format(label))
            self.__store.register(cfg, dataset_info)
            util.register_checkpoint(self.workdir, label, 'REGISTERED')
        elif os.path.exists(os.path.join(taskdir, 'running')):
            for d in os.listdir(os.path.join(taskdir, 'running')):
                shutil.move(os.path.join(taskdir, 'running', d), os.path.join(taskdir, 'failed'))
def sprint(self):
    with util.PartiallyMutable.unlock():
        self.source = TaskProvider(self.config)
    action = actions.Actions(self.config, self.source)

    logger.info("using wq from {0}".format(wq.__file__))
    logger.info("running Lobster version {0}".format(util.get_version()))
    logger.info("current PID is {0}".format(os.getpid()))

    wq.cctools_debug_flags_set("all")
    wq.cctools_debug_config_file(os.path.join(self.config.workdir, "work_queue_debug.log"))
    wq.cctools_debug_config_file_size(1 << 29)

    self.queue = wq.WorkQueue(self.config.advanced.wq_port)
    self.queue.specify_min_taskid(self.source.max_taskid() + 1)
    self.queue.specify_log(os.path.join(self.config.workdir, "work_queue.log"))
    self.queue.specify_transactions_log(os.path.join(self.config.workdir, "transactions.log"))
    self.queue.specify_name("lobster_" + self.config.label)
    self.queue.specify_keepalive_timeout(300)
    # self.queue.tune("short-timeout", 600)
    self.queue.tune("transfer-outlier-factor", 4)
    self.queue.specify_algorithm(wq.WORK_QUEUE_SCHEDULE_RAND)
    if self.config.advanced.full_monitoring:
        self.queue.enable_monitoring_full(None)
    else:
        self.queue.enable_monitoring(None)

    logger.info("starting queue as {0}".format(self.queue.name))

    abort_active = False
    abort_threshold = self.config.advanced.abort_threshold
    abort_multiplier = self.config.advanced.abort_multiplier

    wq_max_retries = self.config.advanced.wq_max_retries

    if util.checkpoint(self.config.workdir, 'KILLED') == 'PENDING':
        util.register_checkpoint(self.config.workdir, 'KILLED', 'RESTART')

    # time in seconds to wait for WQ to return tasks, with minimum wait
    # time in case no more tasks are waiting
    interval = 120
    interval_minimum = 30

    tasks_left = 0
    units_left = 0
    successful_tasks = 0

    categories = []

    self.setup_logging('all')
    # Workflows can be assigned categories, with each category having
    # different cpu/memory/walltime requirements that WQ will automatically
    # fine-tune
    for category in self.config.categories:
        constraints = category.wq()
        if category.name != 'merge':
            categories.append(category.name)
            self.setup_logging(category.name)
        self.queue.specify_category_mode(category.name, category.mode)
        if category.mode == wq.WORK_QUEUE_ALLOCATION_MODE_FIXED:
            self.queue.specify_category_max_resources(category.name, constraints)
        else:
            self.queue.specify_category_first_allocation_guess(category.name, constraints)
        logger.debug('Category {0}: {1}'.format(category.name, constraints))
        if 'wall_time' not in constraints:
            self.queue.activate_fast_abort_category(category.name, abort_multiplier)

    proxy_email_sent = False
    while not self.source.done():
        with self.measure('status'):
            tasks_left = self.source.tasks_left()
            units_left = self.source.work_left()

            logger.debug("expecting {0} tasks, still".format(tasks_left))
            self.queue.specify_num_tasks_left(tasks_left)

            for c in categories + ['all']:
                self.log(c, units_left)

            if util.checkpoint(self.config.workdir, 'KILLED') == 'PENDING':
                util.register_checkpoint(self.config.workdir, 'KILLED', str(datetime.datetime.utcnow()))

                # let the task source shut down gracefully
                logger.info("terminating task source")
                self.source.terminate()
                logger.info("terminating gracefully")
                break

        with self.measure('create'):
            have = {}
            for c in categories:
                cstats = self.queue.stats_category(c)
                have[c] = {'running': cstats.tasks_running, 'queued': cstats.tasks_waiting}

            stats = self.queue.stats_hierarchy
            tasks = self.source.obtain(stats.total_cores, have)

            expiry = None
            if self.config.advanced.proxy:
                expiry = self.config.advanced.proxy.expires()
                proxy_time_left = self.config.advanced.proxy.time_left()
                if proxy_time_left >= 24 * 3600:
                    proxy_email_sent = False
                if proxy_time_left < 24 * 3600 and not proxy_email_sent:
                    util.sendemail("Your proxy is about to expire.\n" +
                                   "Timeleft: " + str(datetime.timedelta(seconds=proxy_time_left)),
                                   self.config)
                    proxy_email_sent = True

            for category, cmd, id, inputs, outputs, env, dir in tasks:
                task = wq.Task(cmd)
                task.specify_category(category)
                task.specify_tag(id)
                task.specify_max_retries(wq_max_retries)
                task.specify_monitor_output(os.path.join(dir, 'resource_monitor'))

                for k, v in env.items():
                    task.specify_environment_variable(k, v)

                for (local, remote, cache) in inputs:
                    cache_opt = wq.WORK_QUEUE_CACHE if cache else wq.WORK_QUEUE_NOCACHE
                    if os.path.isfile(local) or os.path.isdir(local):
                        task.specify_input_file(str(local), str(remote), cache_opt)
                    else:
                        logger.critical("cannot send file to worker: {0}".format(local))
                        raise NotImplementedError

                for (local, remote) in outputs:
                    task.specify_output_file(str(local), str(remote))

                if expiry:
                    task.specify_end_time(expiry * 10 ** 6)
                self.queue.submit(task)

        with self.measure('status'):
            stats = self.queue.stats_hierarchy
            logger.info("{0} out of {1} workers busy; {2} tasks running, {3} waiting; {4} units left".format(
                stats.workers_busy,
                stats.workers_busy + stats.workers_ready,
                stats.tasks_running,
                stats.tasks_waiting,
                units_left))

        with self.measure('update'):
            self.source.update(self.queue)

        # recurring actions are triggered here; plotting etc should run
        # while we have WQ hand us back tasks w/o any database
        # interaction
        with self.measure('action'):
            if action:
                action.take()

        with self.measure('fetch'):
            starttime = time.time()
            task = self.queue.wait(interval)
            tasks = []
            while task:
                if task.return_status == 0:
                    successful_tasks += 1
                elif task.return_status in self.config.advanced.bad_exit_codes:
                    logger.warning("blacklisting host {0} due to bad exit code from task {1}".format(task.hostname, task.tag))
                    self.queue.blacklist(task.hostname)
                tasks.append(task)

                remaining = int(starttime + interval - time.time())
                if (interval - remaining < interval_minimum or self.queue.stats.tasks_waiting > 0) and remaining > 0:
                    task = self.queue.wait(remaining)
                else:
                    task = None

        # TODO do we really need this? We have everything based on
        # categories by now, so this should not be needed.
        if abort_threshold > 0 and successful_tasks >= abort_threshold and not abort_active:
            logger.info("activating fast abort with multiplier: {0}".format(abort_multiplier))
            abort_active = True
            self.queue.activate_fast_abort(abort_multiplier)

        if len(tasks) > 0:
            try:
                with self.measure('return'):
                    self.source.release(tasks)
            except Exception:
                tb = traceback.format_exc()
                logger.critical("cannot recover from the following exception:\n" + tb)
                util.sendemail("Your Lobster project has crashed from the following exception:\n" + tb,
                               self.config)
                for task in tasks:
                    logger.critical("tried to return task {0} from {1}".format(task.tag, task.hostname))
                raise

    if units_left == 0:
        logger.info("no more work left to do")
        util.sendemail("Your Lobster project is done!", self.config)
        if self.config.elk:
            self.config.elk.end()
        if action:
            action.take(True)
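# sprint() above is, at its core, the standard Work Queue submit/wait cycle.
# A stripped-down sketch using only the calls that already appear there; the
# port, command, and file names are hypothetical placeholders:
import work_queue as wq

queue = wq.WorkQueue(9123)                        # fixed port; -1 picks one at random
queue.specify_name("lobster_example")

task = wq.Task("./job.sh input.dat output.dat")   # command run on the worker
task.specify_tag("example_1")
task.specify_input_file("job.sh", "job.sh", wq.WORK_QUEUE_CACHE)
task.specify_input_file("input.dat", "input.dat", wq.WORK_QUEUE_NOCACHE)
task.specify_output_file("output.dat", "output.dat")
queue.submit(task)

done = queue.wait(60)                             # block up to 60 s for a finished task
if done and done.return_status == 0:
    print("task {0} finished on {1}".format(done.tag, done.hostname))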
def run(self, args):
    self.config = args.config

    if args.finalize:
        args.config.advanced.threshold_for_failure = 0
        args.config.advanced.threshold_for_skipping = 0

    if not os.path.exists(self.config.workdir):
        os.makedirs(self.config.workdir)

    if not util.checkpoint(self.config.workdir, "version"):
        util.register_checkpoint(self.config.workdir, "version", util.get_version())
    else:
        util.verify(self.config.workdir)

    if not args.foreground:
        ttyfile = open(os.path.join(self.config.workdir, 'process.err'), 'a')
        logger.info("saving stderr and stdout to {0}".format(
            os.path.join(self.config.workdir, 'process.err')))
        args.preserve.append(ttyfile)

    if self.config.advanced.dump_core:
        logger.info("setting core dump size to unlimited")
        resource.setrlimit(resource.RLIMIT_CORE,
                           (resource.RLIM_INFINITY, resource.RLIM_INFINITY))

    def localkill(num, frame):
        Terminate().run(args)

    signals = daemon.daemon.make_default_signal_map()
    signals[signal.SIGINT] = localkill
    signals[signal.SIGTERM] = localkill

    process = psutil.Process()
    preserved = [f.name for f in args.preserve]
    preserved += [os.path.realpath(os.path.abspath(f)) for f in preserved]
    openfiles = [f for f in process.open_files() if f.path not in preserved]
    openconns = process.connections()

    for c in openconns:
        logger.debug("open connection: {}".format(c))
        args.preserve.append(c.fd)

    if len(openfiles) > 0:
        logger.error("cannot daemonize due to open files")
        for f in openfiles:
            logger.error("open file: {}".format(f.path))
        raise RuntimeError("open files or connections")

    with daemon.DaemonContext(
            detach_process=not args.foreground,
            stdout=sys.stdout if args.foreground else ttyfile,
            stderr=sys.stderr if args.foreground else ttyfile,
            files_preserve=args.preserve,
            working_directory=self.config.workdir,
            pidfile=util.get_lock(self.config.workdir, args.force),
            prevent_core=False,
            initgroups=False,
            signal_map=signals):
        self.sprint()

        logger.info("lobster terminated")
        if not args.foreground:
            logger.info("stderr and stdout saved in {0}".format(
                os.path.join(self.config.workdir, 'process.err')))

        try:
            # Fails if something with working directory creation went wrong
            Status().run(args)
        except Exception:
            pass
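# The daemonization in run() reduces to the python-daemon pattern below,
# limited to the keyword arguments used above; the log file name and the work
# done inside the context are placeholders:
import signal
import daemon
import daemon.daemon

def _graceful_exit(signum, frame):
    # run() delegates to Terminate().run(args) here
    raise SystemExit(0)

signals = daemon.daemon.make_default_signal_map()
signals[signal.SIGINT] = _graceful_exit
signals[signal.SIGTERM] = _graceful_exit

log = open('process.err', 'a')
with daemon.DaemonContext(detach_process=True,
                          stdout=log,
                          stderr=log,
                          files_preserve=[log],
                          working_directory='.',
                          prevent_core=False,
                          signal_map=signals):
    pass  # long-running work goes here, e.g. the sprint() loop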
def __init__(self, config):
    self.config = config
    self.basedirs = [config['configdir'], config['startdir']]
    self.workdir = config.get('workdir', os.getcwd())
    self.stageout = config.get('stageout location', os.getcwd())
    self.statusfile = os.path.join(self.workdir, 'status.yaml')

    self.parrot_path = os.path.dirname(util.which('parrot_run'))
    self.parrot_bin = os.path.join(self.workdir, 'bin')
    self.parrot_lib = os.path.join(self.workdir, 'lib')

    self.extra_inputs = {}
    self.args = {}
    self.outputs = {}
    self.outputformats = {}
    self.cmds = {}

    create = not util.checkpoint(self.workdir, 'id') and not self.config.get('merge', False)
    if create:
        self.taskid = 'lobster_{0}_{1}'.format(
            self.config['id'],
            sha1(str(datetime.datetime.utcnow())).hexdigest()[-16:])
        with open(self.statusfile, 'wb') as f:
            yaml.dump({'id': self.taskid}, f, default_flow_style=False)
    else:
        self.taskid = util.checkpoint(self.workdir, 'id')
        util.register_checkpoint(self.workdir, 'RESTARTED', str(datetime.datetime.utcnow()))

    self.config = apply_matching(self.config)
    for cfg in self.config['tasks']:
        label = cfg['label']
        self.extra_inputs[label] = map(
            partial(util.findpath, self.basedirs),
            cfg.get('extra inputs', []))
        self.outputs[label] = cfg.get('outputs', [])
        self.args[label] = cfg.get('parameters', [])
        self.outputformats[label] = cfg.get("output format", "{base}_{id}.{ext}")
        self.cmds[label] = cfg.get('cmd')

        taskdir = os.path.join(self.workdir, label)
        stageoutdir = os.path.join(self.stageout, label)
        if create:
            for dir in [taskdir, stageoutdir]:
                if not os.path.exists(dir):
                    os.makedirs(dir)
                else:
                    # TODO warn about non-empty stageout directories
                    pass

            shutil.copy(self.config['filepath'], os.path.join(self.workdir, 'lobster_config.yaml'))

    for p in (self.parrot_bin, self.parrot_lib):
        if not os.path.exists(p):
            os.makedirs(p)

    for exe in ('parrot_run', 'chirp_put', 'chirp_get'):
        shutil.copy(util.which(exe), self.parrot_bin)
        subprocess.check_call(["strip", os.path.join(self.parrot_bin, exe)])
        for lib in util.ldd(exe):
            shutil.copy(lib, self.parrot_lib)

    p_helper = os.path.join(os.path.dirname(self.parrot_path), 'lib', 'lib64', 'libparrot_helper.so')
    shutil.copy(p_helper, self.parrot_lib)
def run(args):
    dash_checker = cmssw.dash.JobStateChecker(300)

    with open(args.configfile) as configfile:
        config = yaml.load(configfile)

    workdir = config['workdir']
    if not os.path.exists(workdir):
        os.makedirs(workdir)
        util.register_checkpoint(workdir, "version", get_distribution('Lobster').version)
    else:
        util.verify(workdir)

    cmsjob = False
    if config.get('type', 'cmssw') == 'cmssw':
        cmsjob = True

        from ProdCommon.Credential.CredentialAPI import CredentialAPI
        cred = CredentialAPI({'credential': 'Proxy'})
        if cred.checkCredential(Time=60):
            if 'X509_USER_PROXY' not in os.environ:
                os.environ['X509_USER_PROXY'] = cred.credObj.getUserProxy()
        else:
            if config.get('advanced', {}).get('renew proxy', True):
                try:
                    cred.ManualRenewCredential()
                except Exception:
                    print("could not renew proxy")
                    sys.exit(1)
            else:
                print("please renew your proxy")
                sys.exit(1)

    print "Saving log to {0}".format(os.path.join(workdir, 'lobster.log'))

    if not args.foreground:
        ttyfile = open(os.path.join(workdir, 'lobster.err'), 'a')
        print "Saving stderr and stdout to {0}".format(os.path.join(workdir, 'lobster.err'))

    signals = daemon.daemon.make_default_signal_map()
    signals[signal.SIGTERM] = lambda num, frame: kill(args)

    with daemon.DaemonContext(
            detach_process=not args.foreground,
            stdout=sys.stdout if args.foreground else ttyfile,
            stderr=sys.stderr if args.foreground else ttyfile,
            working_directory=workdir,
            pidfile=util.get_lock(workdir),
            signal_map=signals):
        fileh = logging.handlers.RotatingFileHandler(os.path.join(workdir, 'lobster.log'), maxBytes=500e6, backupCount=10)
        fileh.setFormatter(ShortPathFormatter("%(asctime)s [%(levelname)5s] - %(pathname)-40s %(lineno)4d: %(message)s"))
        fileh.setLevel(config.get('advanced', {}).get('log level', 2) * 10)
        logger.addHandler(fileh)
        logger.setLevel(config.get('advanced', {}).get('log level', 2) * 10)

        if args.foreground:
            console = logging.StreamHandler()
            console.setLevel(config.get('advanced', {}).get('log level', 2) * 10)
            console.setFormatter(ShortPathFormatter("%(asctime)s [%(levelname)5s] - %(pathname)-40s %(lineno)4d: %(message)s"))
            logger.addHandler(console)

        config['configdir'] = args.configdir
        config['filename'] = args.configfile
        config['startdir'] = args.startdir

        if cmsjob:
            job_src = cmssw.JobProvider(config)
            actions = cmssw.Actions(config)
        else:
            job_src = job.SimpleJobProvider(config)
            actions = None

        logger.info("using wq from {0}".format(wq.__file__))

        wq.cctools_debug_flags_set("all")
        wq.cctools_debug_config_file(os.path.join(workdir, "work_queue_debug.log"))
        wq.cctools_debug_config_file_size(1 << 29)

        queue = wq.WorkQueue(-1)
        queue.specify_log(os.path.join(workdir, "work_queue.log"))
        queue.specify_name("lobster_" + config["id"])
        queue.specify_keepalive_timeout(300)
        # queue.tune("short-timeout", 600)
        queue.tune("transfer-outlier-factor", 4)
        queue.specify_algorithm(wq.WORK_QUEUE_SCHEDULE_RAND)

        logger.info("starting queue as {0}".format(queue.name))
        logger.info("submit workers with: condor_submit_workers -M {0} <num>".format(queue.name))

        payload = config.get('advanced', {}).get('payload', 400)
        abort_active = False
        abort_threshold = config.get('advanced', {}).get('abort threshold', 400)
        abort_multiplier = config.get('advanced', {}).get('abort multiplier', 4)

        if util.checkpoint(workdir, 'KILLED') == 'PENDING':
            util.register_checkpoint(workdir, 'KILLED', 'RESTART')

        jobits_left = 0
        successful_jobs = 0

        creation_time = 0
        destruction_time = 0

        with open(os.path.join(workdir, "lobster_stats.log"), "a") as statsfile:
            statsfile.write(
                "#timestamp " +
                "total_workers_connected total_workers_joined total_workers_removed " +
                "workers_busy workers_idle " +
                "tasks_running " +
                "total_send_time total_receive_time " +
                "total_create_time total_return_time " +
                "idle_percentage " +
                "capacity " +
                "efficiency " +
                "total_memory " +
                "total_cores " +
                "jobits_left\n")

        while not job_src.done():
            jobits_left = job_src.work_left()
            stats = queue.stats

            with open(os.path.join(workdir, "lobster_stats.log"), "a") as statsfile:
                now = datetime.datetime.now()
                statsfile.write(" ".join(map(str,
                    [
                        int(int(now.strftime('%s')) * 1e6 + now.microsecond),
                        stats.total_workers_connected,
                        stats.total_workers_joined,
                        stats.total_workers_removed,
                        stats.workers_busy,
                        stats.workers_idle,
                        stats.tasks_running,
                        stats.total_send_time,
                        stats.total_receive_time,
                        creation_time,
                        destruction_time,
                        stats.idle_percentage,
                        stats.capacity,
                        stats.efficiency,
                        stats.total_memory,
                        stats.total_cores,
                        jobits_left
                    ])) + "\n")

            if util.checkpoint(workdir, 'KILLED') == 'PENDING':
                util.register_checkpoint(workdir, 'KILLED', str(datetime.datetime.utcnow()))

                # just in case, check for any remaining not done task that
                # hasn't been reported as aborted
                for task_id in queue._task_table.keys():
                    status = cmssw.dash.status_map[queue.task_state(task_id)]
                    if status not in (cmssw.dash.DONE, cmssw.dash.ABORTED):
                        job_src._JobProvider__dash.update_job(task_id, cmssw.dash.ABORTED)

                logger.info("terminating gracefully")
                break

            logger.info("{0} out of {1} workers busy; {3} jobs running, {4} waiting; {2} jobits left".format(
                stats.workers_busy,
                stats.workers_busy + stats.workers_ready,
                jobits_left,
                stats.tasks_running,
                stats.tasks_waiting))

            hunger = max(payload - stats.tasks_waiting, 0)

            t = time.time()
            while hunger > 0:
                jobs = job_src.obtain(50)

                if jobs is None or len(jobs) == 0:
                    break

                hunger -= len(jobs)

                cores = config.get('cores per job', 1)
                for id, cmd, inputs, outputs in jobs:
                    task = wq.Task(cmd)
                    task.specify_tag(id)
                    task.specify_cores(cores)
                    # temporary work-around?
                    # task.specify_memory(1000)
                    # task.specify_disk(4000)

                    for (local, remote, cache) in inputs:
                        if os.path.isfile(local):
                            cache_opt = wq.WORK_QUEUE_CACHE if cache else wq.WORK_QUEUE_NOCACHE
                            task.specify_input_file(str(local), str(remote), cache_opt)
                        elif os.path.isdir(local):
                            task.specify_directory(local, remote, wq.WORK_QUEUE_INPUT,
                                                   wq.WORK_QUEUE_CACHE, recursive=True)
                        else:
                            logger.critical("cannot send file to worker: {0}".format(local))
                            raise NotImplementedError

                    for (local, remote) in outputs:
                        task.specify_output_file(str(local), str(remote))

                    queue.submit(task)
            creation_time += int((time.time() - t) * 1e6)

            # update dashboard status for all not done tasks
            # report Done status only once when releasing the task
            # WAITING_RETRIEVAL is not a valid status in dashboard,
            # so skip it for now
            monitor = job_src._JobProvider__dash
            exclude_states = (cmssw.dash.DONE, cmssw.dash.WAITING_RETRIEVAL)
            try:
                dash_checker.update_dashboard_states(monitor, queue, exclude_states)
            except Exception:
                logger.warning("Could not update job states to dashboard")

            task = queue.wait(300)
            tasks = []
            while task:
                if task.return_status == 0:
                    successful_jobs += 1
                tasks.append(task)
                if queue.stats.tasks_complete > 0:
                    task = queue.wait(1)
                else:
                    task = None

            if len(tasks) > 0:
                try:
                    t = time.time()
                    job_src.release(tasks)
                    destruction_time += int((time.time() - t) * 1e6)
                except Exception:
                    tb = traceback.format_exc()
                    logger.critical("cannot recover from the following exception:\n" + tb)
                    for task in tasks:
                        logger.critical("tried to return task {0} from {1}".format(task.tag, task.hostname))
                    raise

            if successful_jobs >= abort_threshold and not abort_active:
                logger.info("activating fast abort with multiplier: {0}".format(abort_multiplier))
                abort_active = True
                queue.activate_fast_abort(abort_multiplier)

            # recurring actions are triggered here
            if actions:
                actions.take()

            if jobits_left == 0:
                logger.info("no more work left to do")
def run(args):
    with open(args.configfile) as configfile:
        config = yaml.load(configfile)

    workdir = config['workdir']
    if not os.path.exists(workdir):
        os.makedirs(workdir)

    cmsjob = False
    if config.get('type', 'cmssw') == 'cmssw':
        cmsjob = True

        from ProdCommon.Credential.CredentialAPI import CredentialAPI
        cred = CredentialAPI({'credential': 'Proxy'})
        if cred.checkCredential(Time=60):
            if 'X509_USER_PROXY' not in os.environ:
                os.environ['X509_USER_PROXY'] = cred.credObj.getUserProxy()
        else:
            if config.get('check proxy', True):
                try:
                    cred.ManualRenewCredential()
                except Exception:
                    logging.critical("could not renew proxy")
                    sys.exit(1)
            else:
                logging.critical("please renew your proxy")
                sys.exit(1)

    mode_label = 'merge_' if args.merge else ''
    print "Saving log to {0}".format(os.path.join(workdir, mode_label + 'lobster.log'))

    if not args.foreground:
        ttyfile = open(os.path.join(workdir, mode_label + 'lobster.err'), 'a')
        print "Saving stderr and stdout to {0}".format(os.path.join(workdir, mode_label + 'lobster.err'))

    with daemon.DaemonContext(
            detach_process=not args.foreground,
            stdout=sys.stdout if args.foreground else ttyfile,
            stderr=sys.stderr if args.foreground else ttyfile,
            working_directory=workdir,
            pidfile=get_lock(workdir)):
        logging.basicConfig(
            datefmt="%Y-%m-%d %H:%M:%S",
            format="%(asctime)s [%(levelname)s] - %(filename)s %(lineno)d: %(message)s",
            level=config.get('log level', 2) * 10,
            filename=os.path.join(workdir, mode_label + 'lobster.log'))

        if args.foreground:
            console = logging.StreamHandler()
            console.setLevel(config.get('log level', 2) * 10)
            console.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] - %(filename)s %(lineno)d: %(message)s"))
            logging.getLogger('').addHandler(console)

        config['configdir'] = args.configdir
        config['filepath'] = args.configfile
        config['startdir'] = args.startdir
        if args.merge:
            if args.server:
                config['stageout server'] = args.server
            config['max megabytes'] = args.max_megabytes
            job_src = cmssw.MergeProvider(config)
        elif cmsjob:
            job_src = cmssw.JobProvider(config)
        else:
            job_src = job.SimpleJobProvider(config)

        wq.cctools_debug_flags_set("all")
        wq.cctools_debug_config_file(os.path.join(workdir, mode_label + "work_queue_debug.log"))
        wq.cctools_debug_config_file_size(1 << 29)

        queue = wq.WorkQueue(-1)
        queue.specify_log(os.path.join(workdir, mode_label + "work_queue.log"))
        queue.specify_name("lobster_" + mode_label + config["id"])
        queue.specify_keepalive_timeout(300)
        # queue.tune("short-timeout", 600)
        queue.tune("transfer-outlier-factor", 4)

        logging.info("starting queue as {0}".format(queue.name))
        logging.info("submit workers with: condor_submit_workers -M {0} <num>".format(queue.name))

        payload = config.get('tune', {}).get('payload', 400)
        abort_active = False
        abort_threshold = config.get('tune', {}).get('abort threshold', 400)
        abort_multiplier = config.get('tune', {}).get('abort multiplier', 4)

        if util.checkpoint(workdir, 'KILLED') == 'PENDING':
            util.register_checkpoint(workdir, 'KILLED', 'RESTART')

        successful_jobs = 0

        creation_time = 0
        destruction_time = 0

        with open(os.path.join(workdir, mode_label + "lobster_stats.log"), "a") as statsfile:
            statsfile.write(
                "#timestamp " +
                "total_workers_connected total_workers_joined total_workers_removed " +
                "workers_busy workers_idle " +
                "tasks_running " +
                "total_send_time total_receive_time " +
                "total_create_time total_return_time " +
                "idle_percentage " +
                "capacity " +
                "efficiency " +
                "jobits_left\n")

        while not job_src.done():
            jobits_left = job_src.work_left()
            stats = queue.stats

            with open(os.path.join(workdir, mode_label + "lobster_stats.log"), "a") as statsfile:
                now = datetime.datetime.now()
                statsfile.write(" ".join(map(str,
                    [
                        int(int(now.strftime('%s')) * 1e6 + now.microsecond),
                        stats.total_workers_connected,
                        stats.total_workers_joined,
                        stats.total_workers_removed,
                        stats.workers_busy,
                        stats.workers_idle,
                        stats.tasks_running,
                        stats.total_send_time,
                        stats.total_receive_time,
                        creation_time,
                        destruction_time,
                        stats.idle_percentage,
                        stats.capacity,
                        stats.efficiency,
                        jobits_left
                    ])) + "\n")

            if util.checkpoint(workdir, 'KILLED') == 'PENDING':
                util.register_checkpoint(workdir, 'KILLED', str(datetime.datetime.utcnow()))
                logging.info("terminating gracefully")
                break

            logging.info("{0} out of {1} workers busy; {3} jobs running, {4} waiting; {2} jobits left".format(
                stats.workers_busy,
                stats.workers_busy + stats.workers_ready,
                jobits_left,
                stats.tasks_running,
                stats.tasks_waiting))

            hunger = max(payload - stats.tasks_waiting, 0)

            t = time.time()
            while hunger > 0:
                jobs = job_src.obtain(50)

                if jobs is None or len(jobs) == 0:
                    break

                hunger -= len(jobs)

                for id, cmd, inputs, outputs in jobs:
                    task = wq.Task(cmd)
                    task.specify_tag(id)
                    task.specify_cores(1)
                    # temporary work-around?
                    # task.specify_memory(1000)
                    # task.specify_disk(4000)

                    for (local, remote) in inputs:
                        if os.path.isfile(local):
                            task.specify_input_file(str(local), str(remote), wq.WORK_QUEUE_CACHE)
                        elif os.path.isdir(local):
                            task.specify_directory(local, remote, wq.WORK_QUEUE_INPUT,
                                                   wq.WORK_QUEUE_CACHE, recursive=True)
                        else:
                            logging.critical("cannot send file to worker: {0}".format(local))
                            raise NotImplementedError

                    for (local, remote) in outputs:
                        task.specify_output_file(str(local), str(remote))

                    queue.submit(task)
            creation_time += int((time.time() - t) * 1e6)

            task = queue.wait(300)
            tasks = []
            while task:
                if task.return_status == 0:
                    successful_jobs += 1
                tasks.append(task)
                if queue.stats.tasks_complete > 0:
                    task = queue.wait(1)
                else:
                    task = None

            if len(tasks) > 0:
                try:
                    t = time.time()
                    job_src.release(tasks)
                    destruction_time += int((time.time() - t) * 1e6)
                except Exception:
                    tb = traceback.format_exc()
                    logging.critical("cannot recover from the following exception:\n" + tb)
                    for task in tasks:
                        logging.critical("tried to return task {0} from {1}".format(task.tag, task.hostname))
                    raise

            if successful_jobs >= abort_threshold and not abort_active:
                logging.info("activating fast abort with multiplier: {0}".format(abort_multiplier))
                abort_active = True
                queue.activate_fast_abort(abort_multiplier)

            if jobits_left == 0:
                logging.info("no more work left to do")
def kill(args):
    with open(args.configfile) as configfile:
        config = yaml.load(configfile)

    workdir = config['workdir']
    util.register_checkpoint(workdir, 'KILLED', 'PENDING')