def filelist(self, sample, working_dir):
    """Write the per-job bookkeeping files for `sample` into `working_dir`
    and return the number of jobs to submit.

    Produces: cs.json (copied from sample.json when set, else an empty
    placeholder), cs_filelist.py (file groups pickled, zlib-compressed,
    base64-encoded into the template), and the cs_* metadata files the
    batch scripts read (njobs, jobmap, primary dataset, sample name,
    timestamp).
    """
    # JMTBAD are there performance problems by not matching the json to the files per job?
    json_fn = os.path.join(working_dir, 'cs.json')
    if hasattr(sample, 'json') and sample.json:
        shutil.copy2(sample.json, json_fn)
    else:
        touch(json_fn)  # no json for this sample: leave an empty placeholder

    filenames = sample.filenames
    if not self.is_cmsRun:
        filenames = self.normalize_fns(filenames)

    if sample.split_by == 'events':
        # One group holding every file; each job selects its own slice of events.
        per = sample.events_per
        assert sample.nevents_orig > 0 or sample.total_events > 0
        nevents = sample.total_events if sample.total_events > 0 else sample.nevents_orig
        njobs = int_ceil(nevents, per)
        fn_groups = [filenames]
    else:
        # Split by files. files_per < 0 means "trust the requested njobs"
        # even if trailing groups come out empty.
        use_njobs = sample.files_per < 0
        per = abs(sample.files_per)
        if sample.total_files > 0:
            filenames = filenames[:sample.total_files]
        njobs = getattr(sample, 'njobs', int_ceil(len(filenames), per))
        fn_groups = [x for x in (filenames[i * per:(i + 1) * per] for i in xrange(njobs)) if x]
        if not use_njobs:
            njobs = len(fn_groups)  # let it fail downward

    # Submitter-level override: only submit the first self._njobs jobs.
    if self._njobs is not None:
        assert self._njobs <= njobs
        njobs = self._njobs

    encoded_filelist = base64.b64encode(zlib.compress(pickle.dumps(fn_groups, -1)))
    files_to_write = [
        ('cs_outputfiles', self.output_files),
        ('cs_stageoutfiles', self.stageout_files),
        ('cs_filelist.py', self.filelist_py_template.replace('__FILELIST__', encoded_filelist)),
        ('cs_njobs', str(njobs)),
        ('cs_jobmap', '\n'.join(str(i) for i in xrange(njobs)) + '\n'),  # will be more complicated for resubmits
        ('cs_primaryds', sample.primary_dataset),
        ('cs_samplename', sample.name),
        ('cs_timestamp', (self.timestamp + timedelta(seconds=self.nsubmits)).strftime('%y%m%d_%H%M%S')),
    ]
    for fn, content in files_to_write:
        # context manager so each file is flushed/closed deterministically,
        # instead of relying on refcount-triggered close
        with open(os.path.join(working_dir, fn), 'wt') as f:
            f.write(content)
    return njobs
def submit(self, sample): self.nsubmits += 1 print 'submit', self.batch_name, sample.name, if self.dataset: try: sample.set_curr_dataset(self.dataset) except KeyError: print "\033[1m warning: \033[0m sample %s not submitted, doesn't have dataset %s" % ( sample.name, self.dataset) return if sample.split_by == 'events' and not sample.is_mc: print "\033[1m warning: \033[0m sample %s not submitted because can't split by events on data sample" return working_dir = os.path.join(self.batch_dir, 'condor_%s' % sample.name) if os.path.exists(working_dir): print "\033[1m warning: \033[0m sample %s not submitted, working dir %s already exists" % ( sample.name, working_dir) return os.mkdir(working_dir) touch(os.path.join(working_dir, 'cs_dir')) open(os.path.join(working_dir, 'cs_ex'), 'wt').write(self.ex_str) njobs = self.filelist(sample, working_dir) pset_fn = self.pset(sample, working_dir) jdl_fn = os.path.join(working_dir, 'cs_submit.jdl') open(jdl_fn, 'wt').write(self.jdl_template.replace('__NJOBS__', str(njobs))) if not self.testing: self._submit(working_dir, njobs) else: print 'in testing mode, not submitting anything.' if pset_fn: diff_out, diff_ret = popen('diff -uN %s %s' % (self.pset_template_fn, pset_fn), return_exit_code=True) if diff_ret != 0: print '.py diff:\n---------' print diff_out raw_input('continue?') print
def filelist(self, sample, working_dir):
    """Write the per-job bookkeeping files for `sample` into `working_dir`
    and return the number of jobs to submit.

    Produces: cs.json (copied from sample.json when set, else an empty
    placeholder), cs_filelist.py (file groups pickled, zlib-compressed,
    base64-encoded into the template), and the cs_* metadata files the
    batch scripts read (njobs, jobmap, primary dataset, sample name,
    timestamp).
    """
    # JMTBAD are there performance problems by not matching the json to the files per job?
    json_fn = os.path.join(working_dir, 'cs.json')
    if hasattr(sample, 'json') and sample.json:
        shutil.copy2(sample.json, json_fn)
    else:
        touch(json_fn)  # no json for this sample: leave an empty placeholder

    filenames = sample.filenames
    if not self.is_cmsRun:
        filenames = self.normalize_fns(filenames)

    if sample.split_by == 'events':
        # One group holding every file; each job selects its own slice of events.
        per = sample.events_per
        assert sample.nevents_orig > 0
        njobs = int_ceil(sample.nevents_orig, per)
        fn_groups = [filenames]
    else:
        # Split by files. files_per < 0 means "trust the requested njobs"
        # even if trailing groups come out empty.
        use_njobs = sample.files_per < 0
        per = abs(sample.files_per)
        njobs = getattr(sample, 'njobs', int_ceil(len(filenames), per))
        fn_groups = [x for x in (filenames[i*per:(i+1)*per] for i in xrange(njobs)) if x]
        if not use_njobs:
            njobs = len(fn_groups)  # let it fail downward

    # Submitter-level override: only submit the first self._njobs jobs.
    if self._njobs is not None:
        assert self._njobs <= njobs
        njobs = self._njobs

    encoded_filelist = base64.b64encode(zlib.compress(pickle.dumps(fn_groups, -1)))
    files_to_write = [
        ('cs_outputfiles', self.output_files),
        ('cs_stageoutfiles', self.stageout_files),
        ('cs_filelist.py', self.filelist_py_template.replace('__FILELIST__', encoded_filelist)),
        ('cs_njobs', str(njobs)),
        ('cs_jobmap', '\n'.join(str(i) for i in xrange(njobs)) + '\n'),  # will be more complicated for resubmits
        ('cs_primaryds', sample.primary_dataset),
        ('cs_samplename', sample.name),
        ('cs_timestamp', (self.timestamp + timedelta(seconds=self.nsubmits)).strftime('%y%m%d_%H%M%S')),
    ]
    for fn, content in files_to_write:
        # context manager so each file is flushed/closed deterministically,
        # instead of relying on refcount-triggered close
        with open(os.path.join(working_dir, fn), 'wt') as f:
            f.write(content)
    return njobs
def submit(self, sample): self.nsubmits += 1 print 'batch', self.batch_name, 'sample', sample.name, if self.dataset: try: sample.set_curr_dataset(self.dataset) except KeyError: print "\033[1m warning: \033[0m sample %s not submitted, doesn't have dataset %s" % (sample.name, self.dataset) return if sample.split_by == 'events' and not sample.is_mc: print "\033[1m warning: \033[0m sample %s not submitted because can't split by events on data sample" return working_dir = os.path.join(self.batch_dir, 'condor_%s' % sample.name) if os.path.exists(working_dir): print "\033[1m warning: \033[0m sample %s not submitted, working dir %s already exists" % (sample.name, working_dir) return os.mkdir(working_dir) touch(os.path.join(working_dir, 'cs_dir')) open(os.path.join(working_dir, 'cs_ex'), 'wt').write(self.ex_str) njobs = self.filelist(sample, working_dir) pset_fn = self.pset(sample, working_dir) jdl_fn = os.path.join(working_dir, 'cs_submit.jdl') open(jdl_fn, 'wt').write(self.jdl_template.replace('__NJOBS__', str(njobs))) if not self.testing: self._submit(working_dir, njobs) else: print 'in testing mode, not submitting anything.' if pset_fn: diff_out, diff_ret = popen('diff -uN %s %s' % (self.pset_template_fn, pset_fn), return_exit_code=True) if diff_ret != 0: print '.py diff:\n---------' print diff_out raw_input('continue?') print
def set_cs_done(wd):
    """Mark the working directory `wd` as finished by touching its done-marker file."""
    marker_fn = cs_done_fn(wd)
    return touch(marker_fn)