def savelogs(self, failed_jobs, samples=5):
    logdir = os.path.join(self.__plotdir, 'logs')
    if not os.path.exists(logdir):
        os.makedirs(logdir)

    pool = multiprocessing.Pool(processes=10)
    work = []
    codes = {}

    # group failed jobs by exit code and copy logs for a sample of each
    for exit_code, jobs in zip(*split_by_column(failed_jobs[['id', 'exit_code']], 'exit_code')):
        codes[exit_code] = [len(jobs), {}]

        logger.info("Copying sample logs for exit code {0}".format(exit_code))
        for id, e in list(jobs[-samples:]):
            codes[exit_code][1][id] = []

            try:
                source = glob.glob(os.path.join(self.__workdir, '*', 'failed', util.id2dir(id)))[0]
            except IndexError:
                continue

            target = os.path.join(logdir, str(id))
            if os.path.exists(target):
                shutil.rmtree(target)
            os.makedirs(target)

            # decompress the per-task logs asynchronously
            for l in ['cmssw.log.gz', 'job.log.gz']:
                s = os.path.join(source, l)
                t = os.path.join(target, l[:-3])
                if os.path.exists(s):
                    codes[exit_code][1][id].append(l[:-3])
                    work.append((exit_code, id, l[:-3], pool.apply_async(unpack, [s, t])))

    # drop logs whose decompression failed
    for (code, id, file, res) in work:
        if not res.get():
            codes[code][1][id].remove(file)

    pool.close()
    pool.join()

    # pad with empty placeholder entries so each exit code reports
    # `samples` slots
    for code in codes:
        for id in range(samples - len(codes[code][1])):
            codes[code][1][-id] = []

    return codes

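# Hypothetical sketch (not defined in this excerpt): savelogs dispatches a
# module-level `unpack` helper through the multiprocessing pool. Assuming it
# simply gunzips one log into place and reports success, it could look like
# this; the real helper may differ.
import gzip
import shutil


def unpack(source, target):
    try:
        with gzip.open(source, 'rb') as fin, open(target, 'wb') as fout:
            shutil.copyfileobj(fin, fout)
        return True
    except IOError:
        return False
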
def run(self, args):
    config = args.config
    logger = logging.getLogger('lobster.status')

    store = unit.UnitStore(config)
    data = list(store.workflow_status())

    # the first row holds multi-word column headers; stack them vertically
    headers = [x.split() for x in data.pop(0)]
    header_rows = max([len(x) for x in headers])
    for i in range(0, header_rows):
        data.insert(i, [x[i] if len(x) > i else '' for x in headers])

    # column widths: left-aligned labels first, right-aligned numbers after
    widths = \
        [max(map(len, (xs[0] for xs in data)))] + \
        [max(map(len, (str(xs[i]) for xs in data))) for i in range(1, len(data[0]))]
    data.insert(header_rows, ['=' * w for w in widths])

    headfmt = ' '.join('{{:^{0}}}'.format(w) for w in widths)
    mainfmt = '{{:{0}}} '.format(widths[0]) + ' '.join('{{:>{0}}}'.format(w) for w in widths[1:])

    report = '\n'.join(
        [headfmt.format(*data[i]) for i in range(0, header_rows)] +
        [mainfmt.format(*map(str, row)) for row in data[header_rows:]])
    logger.info("workflow summary:\n" + report)

    wdir = config.workdir
    for wflow in config.workflows:
        tasks = store.failed_units(wflow.label)
        files = store.skipped_files(wflow.label)

        if len(tasks) > 0:
            msg = "tasks with failed units for {0}:".format(wflow.label)
            for task in tasks:
                tdir = os.path.normpath(os.path.join(wdir, wflow.label, 'failed', util.id2dir(task)))
                msg += "\n" + tdir
            logger.info(msg)

        if len(files) > 0:
            msg = "files skipped for {0}:\n".format(wflow.label) + "\n".join(files)
            logger.info(msg)

def get_report(self, label, task):
    return os.path.join(self.workdir, label, 'successful', util.id2dir(task), 'report.json')

def get_jobdir(self, jobid, label='', status='running'):
    # See id2dir for job id formatting in filesystem paths
    return os.path.normpath(os.path.join(self.workdir, label, status, util.id2dir(jobid)))

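# Hypothetical sketch (not part of this excerpt): util.id2dir maps a numeric
# task id onto a nested directory path so that no single 'failed' or
# 'successful' directory accumulates an unbounded number of entries. The
# split width below (1000 entries per leaf directory) and the zero-padding
# are assumptions for illustration; the real implementation lives in
# lobster's util module.
import os


def id2dir(id):
    id = int(id)
    # e.g. 1234567 -> '0001234/567'
    return os.path.join(str(id // 1000).zfill(7), str(id % 1000).zfill(3))
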
def insert_block(self, dbs, primary_dataset, dataset, user, config,
                 basedir, datasetdir, stageoutdir, chunk):
    block = self.prepare_block(dataset, user)

    files = []
    tasks = []
    configs = []

    logger.info('preparing DBS entry for {} task block: {}'.format(
        len(chunk), block['block_name']))

    # collect one file entry and one file configuration per task
    for task, _ in chunk:
        taskdir = os.path.join(basedir, util.id2dir(task))
        try:
            files.append(self.prepare_file(dataset, block, user, taskdir, datasetdir, stageoutdir))
            cfg = config.copy()
            cfg['lfn'] = files[-1]['logical_file_name']
            configs.append(cfg)
            tasks.append(task)
        except ValueError as e:
            logger.warn('could not find expected output for task {}: {}'.format(task, e.message))

    block.update({
        'file_count': len(files),
        'block_size': sum([int(f['file_size']) for f in files])
    })

    dump = {
        'dataset_conf_list': [config],
        'file_conf_list': configs,
        'files': files,
        'processing_era': {'processing_version': 1, 'description': 'CRAB3_processing_era'},
        'primds': primary_dataset,
        'dataset': dataset,
        'acquisition_era': {'acquisition_era_name': user, 'start_date': 0},
        'block': block,
        'file_parent_list': []
    }

    # For debugging
    # from pprint import pprint
    # pprint(config)

    try:
        dbs['local'].insertBulkBlock(dump)
    except HTTPError as e:
        if e.code in (401, 412):
            raise e
        logger.exception(e)

    return tasks, block

def publish(args):
    with open(args.configfile) as f:
        config = yaml.load(f)

    config = apply_matching(config)
    if len(args.datasets) == 0:
        args.datasets = [task['label'] for task in config.get('tasks', [])]

    workdir = config['workdir']
    user = config.get('publish user', os.environ['USER'])
    publish_instance = config.get('dbs instance', 'phys03')
    published = {'dataset': '', 'dbs instance': publish_instance}

    print "Saving log to {0}".format(os.path.join(workdir, 'publish.log'))
    if not args.foreground:
        ttyfile = open(os.path.join(workdir, 'publish.err'), 'a')
        print "Saving stderr and stdout to {0}".format(os.path.join(workdir, 'publish.err'))

    with daemon.DaemonContext(
            detach_process=not args.foreground,
            stdout=sys.stdout if args.foreground else ttyfile,
            stderr=sys.stderr if args.foreground else ttyfile,
            working_directory=workdir,
            pidfile=util.get_lock(workdir)):
        logging.basicConfig(
            datefmt="%Y-%m-%d %H:%M:%S",
            format="%(asctime)s [%(levelname)s] - %(filename)s %(lineno)d: %(message)s",
            level=config.get('advanced', {}).get('log level', 2) * 10,
            filename=os.path.join(workdir, 'publish.log'))

        if args.foreground:
            console = logging.StreamHandler()
            console.setLevel(config.get('advanced', {}).get('log level', 2) * 10)
            console.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] - %(filename)s %(lineno)d: %(message)s"))
            logging.getLogger('').addHandler(console)

        db = JobitStore(config)
        das_interface = MetaInterface()

        # one DBS API handle per role: read globally, write/read/migrate locally
        dbs = {}
        for path, key in [[('global', 'DBSReader'), 'global'],
                          [(publish_instance, 'DBSWriter'), 'local'],
                          [(publish_instance, 'DBSReader'), 'reader'],
                          [(publish_instance, 'DBSMigrate'), 'migrator']]:
            dbs[key] = DbsApi('https://cmsweb.cern.ch/dbs/prod/{0}/'.format(os.path.join(*path)))

        for label in args.datasets:
            (dset,
             stageout_path,
             release,
             gtag,
             publish_label,
             cfg,
             pset_hash,
             ds_id,
             publish_hash) = [str(x) for x in db.dataset_info(label)]

            dset = dset.strip('/').split('/')[0]
            if not pset_hash or pset_hash == 'None':
                logging.info('the parameter set hash has not been calculated')
                logging.info('calculating parameter set hash now (may take a few minutes)')
                cfg_path = os.path.join(workdir, label, os.path.basename(cfg))
                tmp_path = cfg_path.replace('.py', '_tmp.py')
                with open(cfg_path, 'r') as infile:
                    with open(tmp_path, 'w') as outfile:
                        # cmssw configs expect sys.argv when loaded standalone
                        fix = "import sys \nif not hasattr(sys, 'argv'): sys.argv = ['{0}']\n"
                        outfile.write(fix.format(tmp_path))
                        outfile.write(infile.read())
                try:
                    pset_hash = createPSetHash(tmp_path)[-32:]
                    db.update_pset_hash(pset_hash, label)
                except Exception:
                    logging.warning('error calculating the cmssw parameter set hash')
                os.remove(tmp_path)

            block = BlockDump(user, dset, dbs['global'], publish_hash, publish_label, release, pset_hash, gtag)

            if len(dbs['local'].listAcquisitionEras(acquisition_era_name=user)) == 0:
                try:
                    dbs['local'].insertAcquisitionEra({'acquisition_era_name': user})
                except Exception as ex:
                    logging.warn(ex)
            try:
                dbs['local'].insertPrimaryDataset(block.data['primds'])
                dbs['local'].insertDataset(block.data['dataset'])
            except Exception as ex:
                logging.warn(ex)
                raise

            jobs = db.finished_jobs(label)

            first_job = 0
            inserted = False
            logging.info('found %d successful %s jobs to publish' % (len(jobs), label))
            missing = []

            # publish in blocks of args.block_size jobs
            while first_job < len(jobs):
                block.reset()

                chunk = jobs[first_job:first_job + args.block_size]
                logging.info('preparing DBS entry for %i job block: %s' % (len(chunk), block['block']['block_name']))

                for job, merged_job in chunk:
                    status = 'merged' if merged_job else 'successful'
                    id = merged_job if merged_job else job
                    tag = 'merged_{0}'.format(merged_job) if merged_job else str(job)

                    f = gzip.open(os.path.join(workdir, label, status, util.id2dir(id), 'report.xml.gz'), 'r')
                    report = readJobReport(f)[0]

                    PFN = os.path.join(stageout_path, report.files[0]['PFN'].replace('.root', '_%s.root' % tag))
                    LFN = block.get_LFN(PFN)
                    matched_PFN = block.get_matched_PFN(PFN, LFN)
                    if not matched_PFN:
                        logging.warn('could not find expected output for job(s) {0}'.format(job))
                        missing.append(job)
                    else:
                        logging.info('adding %s to block' % LFN)
                        block.add_file_config(LFN)
                        block.add_file(LFN, report.files[0], job, merged_job)
                        block.add_dataset_config()
                        if args.migrate_parents:
                            block.add_file_parents(LFN, report)

                if args.migrate_parents:
                    parents_to_migrate = list(set([p['parent_logical_file_name'] for p in block['file_parent_list']]))
                    migrate_parents(parents_to_migrate, dbs)

                if len(block.data['files']) > 0:
                    try:
                        inserted = True
                        dbs['local'].insertBulkBlock(block.data)
                        db.update_published(block.get_publish_update())
                        logging.info('block inserted: %s' % block['block']['block_name'])
                    except HTTPError as e:
                        logging.critical(e)

                first_job += args.block_size

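# Hypothetical sketch (not defined in this excerpt): `migrate_parents` above
# must bring the parent files' blocks from global DBS into the local instance
# before the new block can reference them. Assuming the DBS3 client's
# listBlocks and submitMigration calls, it could look roughly like this; the
# real helper may batch or poll migration status differently.
def migrate_parents(parents, dbs):
    blocks_to_migrate = set()
    for lfn in parents:
        # find the block each parent file belongs to in global DBS
        for block in dbs['global'].listBlocks(logical_file_name=lfn):
            blocks_to_migrate.add(block['block_name'])
    for name in blocks_to_migrate:
        dbs['migrator'].submitMigration({
            'migration_url': 'https://cmsweb.cern.ch/dbs/prod/global/DBSReader',
            'migration_input': name})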