def populate_cimager_parset(parset):
    input_parset = Parset(parset)
    patch_dictionary = {
        'Cimager.dataset': dataset,
        'Cimager.restore': restore
    }
    image_names = []
    for image_name in input_parset.getStringVector('Cimager.Images.Names'):
        image_names.append("%s_%s" % (image_name, name))
        subset = input_parset.makeSubset(
            "Cimager.Images.%s" % image_name,
            "Cimager.Images.%s" % image_names[-1]
        )
        patch_dictionary[
            "Cimager.Images.%s.frequency" % image_names[-1]
        ] = frequency
        patch_dictionary[
            "Cimager.Images.%s.direction" % image_names[-1]
        ] = "[ %s,%s,%s ]" % (ms_dir_ra, ms_dir_dec, ms_dir_type)
        for key in subset:
            patch_dictionary[key] = subset[key].get()
    input_parset.subtractSubset('Cimager.Images.image')
    for key in input_parset:
        patch_dictionary[key] = input_parset[key].get()
    patch_dictionary['Cimager.Images.Names'] = "[ %s ]" % ", ".join(image_names)
    return patch_parset(
        None, patch_dictionary,
        self.config.get("layout", "job_directory")
    )
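# A minimal sketch (plain dicts instead of the LOFAR Parset) of the key rewriting
# performed above. 'dataset', 'restore', 'name', 'frequency' and the ms_dir_*
# values are free variables supplied by the enclosing recipe; the values used in
# the commented call below are hypothetical.
def sketch_populate(images, name, frequency, ms_dir_ra, ms_dir_dec, ms_dir_type):
    patch = {}
    new_names = []
    for image_name in images:                      # e.g. ['image1']
        new_name = "%s_%s" % (image_name, name)    # e.g. 'image1_L12345_SB000'
        new_names.append(new_name)
        patch["Cimager.Images.%s.frequency" % new_name] = frequency
        patch["Cimager.Images.%s.direction" % new_name] = \
            "[ %s,%s,%s ]" % (ms_dir_ra, ms_dir_dec, ms_dir_type)
    patch['Cimager.Images.Names'] = "[ %s ]" % ", ".join(new_names)
    return patch

# sketch_populate(['image1'], 'L12345_SB000', '[59e6]', '14.5h', '+53d', 'J2000')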
def show_task(self, task):
    task_parset = Parset()
    if self.task_definitions.has_option(task,'parset'):
        task_parset.adoptFile(self.task_definitions.get(task,'parset'))
        print 'possible arguments: key = value'
        for k in task_parset.keywords():
            print '   ', k, ' ', '=', ' ', task_parset[k]
def run(self, infile, executable, args, kwargs, work_dir='/tmp',
        parsetasfile=False, args_format='', environment=''):
    """
    This method contains all the needed functionality
    """
    # Debugging info
    self.logger.debug("infile = %s" % infile)
    self.logger.debug("executable = %s" % executable)
    self.logger.debug("working directory = %s" % work_dir)
    self.logger.debug("arguments = %s" % args)
    self.logger.debug("arg dictionary = %s" % kwargs)
    self.logger.debug("environment = %s" % environment)

    self.environment.update(environment)

    # Time execution of this job
    with log_time(self.logger):
        #if os.path.exists(infile):
        self.logger.info("Processing %s" % infile)

        # Check if script is present
        if not os.path.isfile(executable):
            self.logger.error("Script %s not found" % executable)
            return 1

        # hurray! race condition when running with more than one process on one filesystem
        if not os.path.isdir(work_dir):
            try:
                os.mkdir(work_dir, )
            except OSError as exc:  # Python >2.5
                if exc.errno == errno.EEXIST and os.path.isdir(work_dir):
                    pass
                else:
                    raise

        if parsetasfile:
            nodeparset = Parset()
            parsetname = os.path.join(work_dir, os.path.basename(infile) + '.parset')
            for k, v in kwargs.items():
                nodeparset.add(k, v)
            nodeparset.writeFile(parsetname)
            args.insert(0, parsetname)

        try:
            # ****************************************************************
            # Run
            # Change to working directory for the script
            pipedir = os.getcwd()
            os.chdir(work_dir)
            outdict = {}
            plugin = imp.load_source('main', executable)
            outdict = plugin.main(*args, **kwargs)
            os.chdir(pipedir)
        except CalledProcessError, err:
            # CalledProcessError isn't properly propagated by IPython
            self.logger.error(str(err))
            return 1
        except Exception, err:
            self.logger.error(str(err))
            return 1
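# The node above loads the user script with imp.load_source and calls its
# module-level main(*args, **kwargs). A hypothetical minimal plugin script
# (e.g. "myplugin.py"; name and return values are illustrative) therefore only
# needs to look like this:
def main(*args, **kwargs):
    # When parsetasfile=True the node prepends the generated parset path to
    # args; kwargs carries the step's key=value arguments.
    return {}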
def __init__(self):
    control.__init__(self)
    self.parset = Parset()
    self.input_data = {}
    self.output_data = {}
    self.parset_feedback_file = None
    #self.logger = None#logging.RootLogger('DEBUG')
    self.name = ''
def show_task(self, task):
    task_parset = Parset()
    if self.task_definitions.has_option(task, 'parset'):
        task_parset.adoptFile(self.task_definitions.get(task, 'parset'))
        print 'possible arguments: key = value'
        for k in task_parset.keys:
            print '   ', k, ' ', '=', ' ', task_parset[k]
def _construct_steps(self, step_name_list, step_control_dict,
                     step_parset_files, step_parset_obj, parset_dir):
    step_list_copy = (copy.deepcopy(step_name_list))
    counter = 0
    while step_list_copy:
        counter -= 1
        stepname = step_list_copy.pop(-1)
        fullparset = self.parset.makeSubset(
            self.parset.fullModuleName(str(stepname)) + '.')
        subparset = fullparset.makeSubset(
            fullparset.fullModuleName('control') + '.')
        number = 0
        for item in step_list_copy:
            if item == stepname:
                number += 1
        if number != 0:
            stepname += str(number)
        step_name_list[counter] = stepname
        step_control_dict[stepname] = subparset
        if fullparset.fullModuleName('argument'):
            stepparset = fullparset.makeSubset(
                fullparset.fullModuleName('argument') + '.')
            # *********************************************************************
            # save parsets
            # either a filename is given in the main parset
            # or files will be created from subsets with stepnames.parset as filenames
            # for name, parset in step_parset_dict.iteritems():
            try:
                file_parset = Parset(stepparset.getString('parset'))
                for k in file_parset.keywords():
                    if not k in stepparset.keys():
                        stepparset.add(k, str(file_parset[k]))
                stepparset.remove('parset')
            except:
                pass
            # parset from task.cfg
            try:
                file_parset = Parset(
                    self.task_definitions.get(str(subparset['type']), 'parset'))
                for k in file_parset.keywords():
                    if not k in stepparset.keys():
                        stepparset.add(k, str(file_parset[k]))
            except:
                pass
            # for parset in control section
            try:
                file_parset = Parset(subparset.getString('parset'))
                for k in file_parset.keywords():
                    if not k in stepparset.keys():
                        stepparset.add(k, str(file_parset[k]))
                subparset.remove('parset')
            except:
                pass
            step_parset = os.path.join(parset_dir, stepname + '.parset')
            stepparset.writeFile(step_parset)
            step_parset_files[stepname] = step_parset
            step_parset_obj[stepname] = stepparset
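# Note on precedence in _construct_steps above: keys set directly under
# <step>.argument.* win, because each of the three try-blocks only adds keys
# that are not yet present. Missing keys are then filled, in order, from
# (1) a file named by <step>.argument.parset, (2) the default parset listed for
# the step type in the task definitions (tasks.cfg), and (3) a file named by
# <step>.control.parset. A hypothetical step fragment:
#
#   ndppp1.control.type             = ndppp            # step type, illustrative
#   ndppp1.argument.parset          = defaults.parset  # only fills keys not set below
#   ndppp1.argument.msin.datacolumn = DATA             # set directly, always wins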
def _create_mapfile_from_parset(parset, identifier):
    pars = Parset()
    pars.adoptFile(parset)
    dps = pars.makeSubset(
        pars.fullModuleName('DataProducts') + '.'
    )
    datamap = DataMap([
        tuple(os.path.join(location, filename).split(':')) + (skip,)
        for location, filename, skip in zip(
            dps.getStringVector(identifier + '.locations'),
            dps.getStringVector(identifier + '.filenames'),
            dps.getBoolVector(identifier + '.skip'))
    ])
    return datamap
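# Sketch of what _create_mapfile_from_parset consumes: under the DataProducts
# module, '<identifier>.locations' holds 'host:directory' pairs that are joined
# with the matching filename and split on ':' again, yielding (host, path, skip)
# tuples for the DataMap. The values below are hypothetical and only illustrate
# the zip/split logic from the comprehension above.
import os
locations = ["node001:/data/run1", "node002:/data/run1"]
filenames = ["L1_SB000.MS", "L1_SB001.MS"]
skip = [False, False]
entries = [tuple(os.path.join(l, f).split(':')) + (s,)
           for l, f, s in zip(locations, filenames, skip)]
# -> [('node001', '/data/run1/L1_SB000.MS', False),
#     ('node002', '/data/run1/L1_SB001.MS', False)]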
def _create_mask(self, npix, cell_size, output_image,
                 concatenated_measurement_set, executable,
                 working_directory, log4_cplus_name, sourcedb_path,
                 mask_patch_size, image_path_directory):
    """
    (3) create a casa image containing a mask blocking out the
    sources in the provided sourcedb.

    It expects:

    a. the ms for which the mask will be created, it is used to determine
       some image details: (eg. pointing)
    b. parameters for running within the catchsegfault framework
    c. and the size of the mask_patch.

    To create a mask, first an empty measurement set is created using
    awimager: ready to be filled with mask data

    This function is a wrapper around some functionality written by:
    [email protected]

    steps:
    1. Create a parset with image parameters used by:
    2. awimager run. Creating an empty casa image.
    3. Fill the casa image with mask data
    """
    # ********************************************************************
    # 1. Create the parset used to make a mask
    mask_file_path = output_image + ".mask"

    mask_patch_dictionary = {
        "npix": str(npix),
        "cellsize": str(cell_size),
        "image": str(mask_file_path),
        "ms": str(concatenated_measurement_set),
        "operation": "empty",
        "stokes": "'I'"
    }
    mask_parset = Parset.fromDict(mask_patch_dictionary)
    mask_parset_path = os.path.join(image_path_directory, "mask.par")
    mask_parset.writeFile(mask_parset_path)
    self.logger.debug(
        "Write parset for awimager mask creation: {0}".format(
            mask_parset_path))

    # *********************************************************************
    # 2. Create an empty mask using awimager
    cmd = [executable, mask_parset_path]
    self.logger.info(" ".join(cmd))
    try:
        with CatchLog4CPlus(
                working_directory,
                self.logger.name + "." + os.path.basename(log4_cplus_name),
                os.path.basename(executable)) as logger:
            catch_segfaults(cmd, working_directory, self.environment, logger)
    # Thrown by catch_segfault
    except CalledProcessError, exception:
        self.logger.error(str(exception))
        return 1
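# For illustration, the mask.par written above contains the key/value pairs from
# mask_patch_dictionary; the concrete values below are hypothetical:
#
#   npix=4096
#   cellsize=15arcsec
#   image=/path/to/output.mask
#   ms=/path/to/concat.MS
#   operation=empty
#   stokes='I'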
def gvds_iterator(gvds_file, nproc=4):
    """
    Reads a GVDS file.

    Provides a generator, which successively returns the contents of the GVDS
    file in the form (host, filename, vds), in chunks suitable for processing
    across the cluster. Ie, no more than nproc files per host at a time.
    """
    parset = Parset(gvds_file)

    data = defaultdict(list)
    for part in range(parset.getInt('NParts')):
        host = parset.getString("Part%d.FileSys" % part).split(":")[0]
        file = parset.getString("Part%d.FileName" % part)
        vds = parset.getString("Part%d.Name" % part)
        data[host].append((file, vds))

    for host, values in data.iteritems():
        data[host] = utilities.group_iterable(values, nproc)

    while True:
        yieldable = []
        for host, values in data.iteritems():
            try:
                for filename, vds in values.next():
                    yieldable.append((host, filename, vds))
            except StopIteration:
                pass
        if len(yieldable) == 0:
            raise StopIteration
        else:
            yield yieldable
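# Sketch of the GVDS keys the iterator above reads (values are hypothetical):
#
#   NParts         = 2
#   Part0.FileSys  = lse001:/data
#   Part0.FileName = /data/L1_SB000.MS
#   Part0.Name     = L1_SB000.vds
#   Part1.FileSys  = lse002:/data
#   ...
#
# and how a caller might consume it, nproc files per host per chunk:
#
#   for chunk in gvds_iterator("observation.gvds", nproc=4):
#       for host, filename, vds in chunk:
#           print host, filename, vds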
def go(self):
    self.logger.info("Starting storagemapper run")
    super(storagemapper, self).go()

    # We read the storage node name out of the path
    # and append the local filename (ie, on the storage node) to the map
    # ----------------------------------------------------------------------
    data = defaultdict(list)
    for filename in self.inputs['args']:
        host = filename.split(os.path.sep)[3]
        data[host].append(filename.split(host)[-1])

    # Dump the generated mapping to a parset
    # ----------------------------------------------------------------------
    parset = Parset()
    for host, filenames in data.iteritems():
        parset.addStringVector(host, filenames)

    create_directory(os.path.dirname(self.inputs['mapfile']))
    parset.writeFile(self.inputs['mapfile'])
    self.outputs['mapfile'] = self.inputs['mapfile']

    return 0
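# Example of the path convention assumed by storagemapper above: path component 3
# is taken as the storage host, and the remainder is the node-local filename
# (the path is hypothetical).
import os
filename = "/net/sub3/lse007/data/L2011_12345/L2011_12345_SB000.MS"
host = filename.split(os.path.sep)[3]    # -> 'lse007'
local = filename.split(host)[-1]         # -> '/data/L2011_12345/L2011_12345_SB000.MS'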
def get_current_op_step_names(direction):
    """
    Returns list of step names for current operation
    """
    current_op = get_current_op(direction)
    parset_file = os.path.join(direction.working_dir, 'results', current_op,
                               direction.name, 'pipeline.parset')
    parset = Parset()
    parset.adoptFile(parset_file)
    pipeline_args = parset.makeSubset(parset.fullModuleName('pipeline') + '.')
    step_name_list = pipeline_args.getStringVector('steps')

    # Filter out plugin steps
    filter_step_name_list = []
    for stepname in step_name_list:
        fullparset = parset.makeSubset(
            parset.fullModuleName(str(stepname)) + '.')
        subparset = fullparset.makeSubset(
            fullparset.fullModuleName('control') + '.')
        try:
            kind_of_step = subparset.getString('kind')
        except:
            kind_of_step = 'recipe'
        if kind_of_step != 'plugin':
            if kind_of_step == 'loop':
                loopsteps = subparset.getStringVector('loopsteps')
                for loopstep in loopsteps:
                    fullparset_loop = parset.makeSubset(
                        parset.fullModuleName(str(loopstep)) + '.')
                    subparset_loop = fullparset_loop.makeSubset(
                        fullparset_loop.fullModuleName('control') + '.')
                    try:
                        kind_of_loop_step = subparset_loop.getString('kind')
                    except:
                        kind_of_loop_step = 'recipe'
                    if kind_of_loop_step != 'plugin':
                        filter_step_name_list.append(loopstep)
            else:
                filter_step_name_list.append(stepname)

    return filter_step_name_list
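# Sketch of the pipeline.parset fragment this function walks (step and key names
# are illustrative): plugin steps are dropped, and loop steps are expanded via
# their control.loopsteps list.
#
#   pipeline.steps                 = [createmap, selfcal_loop, make_plots]
#   createmap.control.kind         = plugin
#   selfcal_loop.control.kind      = loop
#   selfcal_loop.control.loopsteps = [calibrate, image]
#   calibrate.control.kind         = recipe
#   image.control.kind             = recipe
#   make_plots.control.kind        = recipe
#
# -> filter_step_name_list == ['calibrate', 'image', 'make_plots']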
def go(self):
    self.logger.info("Starting datamapper run")
    super(datamapper, self).go()

    # We build lists of compute-nodes per cluster and data-per-cluster,
    # then match them up to schedule jobs in a round-robin fashion.
    # ----------------------------------------------------------------------
    clusterdesc = ClusterDesc(self.config.get('cluster', "clusterdesc"))
    if clusterdesc.subclusters:
        available_nodes = dict(
            (cl.name, cycle(get_compute_nodes(cl)))
            for cl in clusterdesc.subclusters
        )
    else:
        available_nodes = {
            clusterdesc.name: cycle(get_compute_nodes(clusterdesc))
        }

    data = defaultdict(list)
    for filename in self.inputs['args']:
        subcluster = filename.split(os.path.sep)[2]
        try:
            host = next(available_nodes[subcluster])
        except KeyError as key:
            self.logger.error("%s is not a known cluster" % str(key))
            raise
        data[host].append(filename)

    # Dump the generated mapping to a parset
    # ----------------------------------------------------------------------
    parset = Parset()
    for host, filenames in data.items():
        parset.addStringVector(host, filenames)

    parset.writeFile(self.inputs['mapfile'])
    self.outputs['mapfile'] = self.inputs['mapfile']

    return 0
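# Minimal stdlib illustration of the round-robin assignment used above: the
# subcluster name is path component 2, and compute nodes are cycled per
# subcluster (host and path names are hypothetical).
import os
from itertools import cycle
from collections import defaultdict

available_nodes = {'sub3': cycle(['lce010', 'lce011'])}
data = defaultdict(list)
for filename in ["/net/sub3/L1_SB000.MS",
                 "/net/sub3/L1_SB001.MS",
                 "/net/sub3/L1_SB002.MS"]:
    subcluster = filename.split(os.path.sep)[2]   # -> 'sub3'
    data[next(available_nodes[subcluster])].append(filename)
# -> {'lce010': ['...SB000.MS', '...SB002.MS'], 'lce011': ['...SB001.MS']}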
def run(self, infile, executable, args, kwargs, work_dir='/tmp', parsetasfile=True, args_format='', environment=''): """ This method contains all the needed functionality """ # Debugging info self.logger.debug("infile = %s" % infile) self.logger.debug("executable = %s" % executable) self.logger.debug("working directory = %s" % work_dir) self.logger.debug("arguments = %s" % args) self.logger.debug("arg dictionary = %s" % kwargs) self.logger.debug("environment = %s" % environment) self.environment.update(environment) self.work_dir = work_dir self.infile = infile self.executable = executable self.msout_original = kwargs['msout'].rstrip('/') kwargs.pop('msout') self.msout_destination_dir = os.path.dirname(self.msout_original) self.scratch_dir = tempfile.mkdtemp(dir=kwargs['local_scratch_dir']) kwargs.pop('local_scratch_dir') self.logger.info('Using {} as scratch directory'.format( self.scratch_dir)) # Set up scratch paths self.msout_scratch = os.path.join( self.scratch_dir, os.path.basename(self.msout_original)) args.append('msout=' + self.msout_scratch) # Time execution of this job with log_time(self.logger): #if os.path.exists(infile): self.logger.info("Processing %s" % infile) # Check if script is present if not os.path.isfile(executable): self.logger.error("Executable %s not found" % executable) return 1 # hurray! race condition when running with more than one process on one filesystem if not os.path.isdir(work_dir): try: os.mkdir(work_dir, ) except OSError as exc: # Python >2.5 if exc.errno == errno.EEXIST and os.path.isdir(work_dir): pass else: raise argsformat = args_format['args_format'] if not parsetasfile: if argsformat == 'gnu': for k, v in kwargs.items(): args.append('--' + k + '=' + v) if argsformat == 'lofar': for k, v in kwargs.items(): args.append(k + '=' + v) if argsformat == 'argparse': for k, v in kwargs.items(): args.append('--' + k + ' ' + v) if argsformat == 'wsclean': for k, v in kwargs.items(): multargs = v.split(' ') if multargs: multargs.reverse() for item in multargs: args.insert(0, item) else: args.insert(0, v) args.insert(0, '-' + k) else: nodeparset = Parset() parsetname = os.path.join(work_dir, os.path.basename(infile) + '.parset') for k, v in kwargs.items(): nodeparset.add(k, v) nodeparset.writeFile(parsetname) args.insert(0, parsetname) try: # **************************************************************** #Run cmd = [executable] + args with CatchLog4CPlus( work_dir, self.logger.name + "." + os.path.basename(infile), os.path.basename(executable), ) as logger: # Catch segfaults and retry catch_segfaults(cmd, work_dir, self.environment, logger) except CalledProcessError, err: # CalledProcessError isn't properly propagated by IPython self.logger.error(str(err)) self.cleanup() return 1 except Exception, err: self.logger.error(str(err)) self.cleanup() return 1
def run(self, infile, executable, args, kwargs, work_dir='/tmp', parsetasfile=False, args_format='', environment=''): """ This function contains all the needed functionality """ # Debugging info self.logger.debug("infile = %s" % infile) self.logger.debug("executable = %s" % executable) self.logger.debug("working directory = %s" % work_dir) self.logger.debug("arguments = %s" % args) self.logger.debug("arg dictionary = %s" % kwargs) self.logger.debug("environment = %s" % environment) self.environment.update(environment) # Time execution of this job with log_time(self.logger): if infile[0] == '[': infiles = [ms.strip(" []\'\"") for ms in infile.split(',')] reffile = infiles[0] else: reffile = infile if os.path.exists(reffile): self.logger.info("Processing %s" % reffile) else: self.logger.error("Dataset %s does not exist" % reffile) return 1 # Check if executable is present if not os.access(executable, os.X_OK): self.logger.error("Executable %s not found" % executable) return 1 # race condition when running with more than one process on one filesystem if not os.path.isdir(work_dir): try: os.mkdir(work_dir, ) except OSError as exc: # Python >2.5 if exc.errno == errno.EEXIST and os.path.isdir(work_dir): pass else: raise if not parsetasfile: self.logger.error("Nodescript \"executable_casa.py\" requires \"parsetasfile\" to be True!") return 1 else: nodeparset = Parset() sublist = [] for k, v in kwargs.items(): nodeparset.add(k, v) if str(k).find('.'): if not str(k).split('.')[0] in sublist: sublist.append(str(k).split('.')[0]) #quick hacks below. for proof of concept. casastring = '' for sub in sublist: subpar = nodeparset.makeSubset(nodeparset.fullModuleName(sub) + '.') casastring = sub + '(' for k in subpar.keys(): if str(subpar[k]).find('/') == 0: casastring += str(k) + '=' + "'" + str(subpar[k]) + "'" + ',' elif str(subpar[k]).find('casastr/') == 0: casastring += str(k) + '=' + "'" + str(subpar[k]).strip('casastr/') + "'" + ',' elif str(subpar[k]).lower() == 'false' or str(subpar[k]).lower() == 'true': casastring += str(k) + '=' + str(subpar[k]) + ',' else: # Test if int/float or list of int/float try: self.logger.info('value: {}'.format(subpar[k])) test = float(str(subpar[k])) is_int_float = True except: is_int_float = False if is_int_float: casastring += str(k) + '=' + str(subpar[k]) + ',' else: if '[' in str(subpar[k]) or '(' in str(subpar[k]): # Check if list of int/float or strings list_vals = [f.strip() for f in str(subpar[k]).strip('[]()').split(',')] is_int_float = True for list_val in list_vals: try: test = float(list_val) except: is_int_float = False break if is_int_float: casastring += str(k) + '=' + str(subpar[k]) + ',' else: casastring += str(k) + '=' + '[{}]'.format(','.join(["'"+list_val+"'" for list_val in list_vals])) + ',' else: # Simple string casastring += str(k) + '=' + "'" + str(subpar[k]) + "'" + ',' casastring = casastring.rstrip(',') casastring += ')\n' # 1) return code of a casapy is not properly recognized by the pipeline # wrapping in shellscript works for succesful runs. # failed runs seem to hang the pipeline... # 2) casapy can not have two instances running from the same directory. 
# create tmp dirs casapydir = tempfile.mkdtemp(dir=work_dir) if casastring != '': casafilename = os.path.join(work_dir, os.path.basename(reffile) + '.casacommand.py') casacommandfile = open(casafilename, 'w') casacommandfile.write(casastring) casacommandfile.close() args.append(casafilename) somename = os.path.join(work_dir, os.path.basename(reffile) + '.casashell.sh') commandstring = '' commandstring += executable for item in args: if str(item).find(' ') > -1 or str(item).find('[') > -1: commandstring += ' "' + item + '"' else: commandstring += ' ' + item crap = open(somename, 'w') crap.write('#!/bin/bash \n') crap.write('echo "Trying CASAPY command" \n') crap.write(commandstring + ' >& casa.log\n') crap.close() # file permissions st = os.stat(somename) os.chmod(somename, st.st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH) try: # **************************************************************** # Run cmd = [somename] with CatchLog4CPlus( casapydir, self.logger.name + "." + os.path.basename(reffile), os.path.basename(executable), ) as logger: # Catch segfaults and retry catch_segfaults( cmd, casapydir, self.environment, logger ) except CalledProcessError, err: # CalledProcessError isn't properly propagated by IPython self.logger.error(str(err)) return 1 except Exception, err: self.logger.error(str(err)) return 1
def run(self, infile, executable, args, kwargs, work_dir='/tmp', parsetasfile=True, args_format='', environment=''): """ This method contains all the needed functionality """ # Debugging info self.logger.debug("infile = %s" % infile) self.logger.debug("executable = %s" % executable) self.logger.debug("working directory = %s" % work_dir) self.logger.debug("arguments = %s" % args) self.logger.debug("arg dictionary = %s" % kwargs) self.logger.debug("environment = %s" % environment) self.environment.update(environment) self.work_dir = work_dir self.infile = infile self.executable = executable if 'replace-sourcedb' in kwargs: self.replace_sourcedb = kwargs['replace-sourcedb'] kwargs.pop('replace-sourcedb') if 'replace-parmdb' in kwargs: self.replace_parmdb = kwargs['replace-parmdb'] kwargs.pop('replace-parmdb') if 'dry-run' in kwargs: self.dry_run = kwargs['dry-run'] kwargs.pop('dry-run') if 'sourcedb' in kwargs: self.sourcedb = kwargs['sourcedb'] kwargs.pop('sourcedb') if 'parmdb' in kwargs: self.parmdb = kwargs['parmdb'] kwargs.pop('parmdb') if 'sourcedb-name' in kwargs: self.sourcedb_basename = kwargs['sourcedb-name'] self.replace_sourcedb = True kwargs.pop('sourcedb-name') if 'parmdb-name' in kwargs: self.parmdb_basename = kwargs['parmdb-name'] self.replace_parmdb = True kwargs.pop('parmdb-name') if 'force' in kwargs: self.replace_parmdb = True self.replace_sourcedb = True kwargs.pop('force') numthreads = 1 if 'numthreads' in kwargs: numthreads = kwargs['numthreads'] kwargs.pop('numthreads') args.append('--numthreads='+str(numthreads)) if 'observation' in kwargs: self.observation = kwargs.pop('observation') if 'catalog' in kwargs: self.catalog = kwargs.pop('catalog') self.createsourcedb() self.createparmdb() if not 'no-columns' in kwargs: #if not kwargs['no-columns']: self.addcolumns() else: kwargs.pop('no-columns') args.append('--sourcedb=' + self.sourcedb_path) args.append('--parmdb=' + self.parmdb_path) args.append(self.observation) #catalog = None # Time execution of this job with log_time(self.logger): #if os.path.exists(infile): self.logger.info("Processing %s" % infile) # Check if script is present if not os.path.isfile(executable): self.logger.error("Executable %s not found" % executable) return 1 # hurray! race condition when running with more than one process on one filesystem if not os.path.isdir(work_dir): try: os.mkdir(work_dir, ) except OSError as exc: # Python >2.5 if exc.errno == errno.EEXIST and os.path.isdir(work_dir): pass else: raise if parsetasfile: nodeparset = Parset() parsetname = os.path.join(work_dir, os.path.basename(infile) + '.parset') for k, v in kwargs.items(): nodeparset.add(k, v) nodeparset.writeFile(parsetname) #args.insert(0, parsetname) args.append(parsetname) #if catalog is not None: # args.append(catalog) try: # **************************************************************** #Run cmd = [executable] + args with CatchLog4CPlus( work_dir, self.logger.name + "." + os.path.basename(infile), os.path.basename(executable), ) as logger: # Catch segfaults and retry catch_segfaults( cmd, work_dir, self.environment, logger ) except CalledProcessError, err: # CalledProcessError isn't properly propagated by IPython self.logger.error(str(err)) return 1 except Exception, err: self.logger.error(str(err)) return 1
def plugin_main(*args, **kwargs):
    parset = Parset(kwargs['first_parset'])
    parset.adoptFile(kwargs['second_parset'])
    parset.writeFile(kwargs['result_parset'] + '_feedback_file')
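# The plugin above merges two parsets: the keys of 'second_parset' are adopted
# on top of those read from 'first_parset', and the result is written to
# '<result_parset>_feedback_file'. A hypothetical invocation from a pipeline
# step (file names are illustrative):
#
#   plugin_main(first_parset='prefactor.parset',
#               second_parset='target.parset',
#               result_parset='Observation12345')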
def pipeline_logic(self): try: parset_file = os.path.abspath(self.inputs['args'][0]) except IndexError: return self.usage() try: if self.parset.keys == []: self.parset.adoptFile(parset_file) self.parset_feedback_file = parset_file + "_feedback" except RuntimeError: print >> sys.stderr, "Error: Parset file not found!" return self.usage() self._replace_values() # just a reminder that this has to be implemented validator = GenericPipelineParsetValidation(self.parset) if not validator.validate_pipeline(): self.usage() exit(1) if not validator.validate_steps(): self.usage() exit(1) #set up directories job_dir = self.config.get("layout", "job_directory") parset_dir = os.path.join(job_dir, "parsets") mapfile_dir = os.path.join(job_dir, "mapfiles") # Create directories for temporary parset- and map files create_directory(parset_dir) create_directory(mapfile_dir) # ********************************************************************* # maybe we dont need a subset but just a steplist # at the moment only a list with stepnames is given for the pipeline.steps parameter # pipeline.steps=[vdsmaker,vdsreader,setupparmdb1,setupsourcedb1,ndppp1,....] # the names will be the prefix for parset subsets pipeline_args = self.parset.makeSubset( self.parset.fullModuleName('pipeline') + '.') pipeline_steps = self.parset.makeSubset( self.parset.fullModuleName('steps') + '.') # ********************************************************************* # forward declaration of things. just for better overview and understanding whats in here. # some of this might be removed in upcoming iterations, or stuff gets added. step_name_list = pipeline_args.getStringVector('steps') # construct the step name list if there were pipeline.steps.<subset> for item in pipeline_steps.keys(): if item in step_name_list: loc = step_name_list.index(item) step_name_list[loc:loc] = pipeline_steps.getStringVector(item) step_name_list.remove(item) step_control_dict = {} step_parset_files = {} step_parset_obj = {} activeloop = [''] # construct the list of step names and controls self._construct_steps(step_name_list, step_control_dict, step_parset_files, step_parset_obj, parset_dir) # initial parameters to be saved in resultsdict so that recipes have access to this step0 # double init values. 'input' should be considered deprecated # self.name would be consistent to use in subpipelines input_dictionary = { 'parset': parset_file, 'parsetobj': self.parset, 'parset_dir': parset_dir, 'mapfile_dir': mapfile_dir} resultdicts = {} for section in self.config.sections(): tmp_dict = {} for entry in self.config.items(section): input_dictionary[entry[0]] = entry[1] tmp_dict[entry[0]] = entry[1] resultdicts.update({section: copy.deepcopy(tmp_dict)}) resultdicts.update({'input': input_dictionary}) resultdicts.update({self.name: input_dictionary}) if 'pipeline.mapfile' in self.parset.keywords(): resultdicts['input']['mapfile'] = str(self.parset['pipeline.mapfile']) resultdicts[self.name]['mapfile'] = str(self.parset['pipeline.mapfile']) # ********************************************************************* # main loop # there is a distinction between recipes and plugins for user scripts. # plugins are not used at the moment and might better be replaced with master recipes while step_name_list: stepname = step_name_list.pop(0) self.logger.info("Beginning step %s" % (stepname,)) step = step_control_dict[stepname] #step_parset = step_parset_obj[stepname] inputdict = {} inputargs = [] resultdict = {} # default kind_of_step to recipe. 
try: kind_of_step = step.getString('kind') except: kind_of_step = 'recipe' try: typeval = step.getString('type') except: typeval = '' adds = None if stepname in step_parset_obj: adds = self._construct_step_parset(inputdict, step_parset_obj[stepname], resultdicts, step_parset_files[stepname], stepname) # stepname not a valid input for old recipes if kind_of_step == 'recipe': if self.task_definitions.get(typeval, 'recipe') == 'executable_args': inputdict['stepname'] = stepname if adds: inputdict.update(adds) self._construct_cmdline(inputargs, step, resultdicts) if stepname in step_parset_files: inputdict['parset'] = step_parset_files[stepname] self._construct_input(inputdict, step, resultdicts) # hack, popping 'type' is necessary, why? because you deleted kind already in parsets try: inputdict.pop('type') except: pass try: inputdict.pop('kind') except: pass # \hack # more hacks. Frameworks DictField not properly implemented. Construct your own dict from input. # python buildin functions cant handle the string returned from parset class. if 'environment' in inputdict.keys(): val = inputdict['environment'].rstrip('}').lstrip('{').replace(' ', '') splitval = str(val).split(',') valdict = {} for item in splitval: valdict[item.split(':')[0]] = item.split(':')[1] inputdict['environment'] = valdict # subpipeline. goal is to specify a pipeline within a pipeline. # load other existing pipeline parset and add them to your own. if kind_of_step == 'pipeline': subpipeline_parset = Parset() subpipeline_parset.adoptFile(typeval) submapfile = '' subpipeline_steplist = subpipeline_parset.getStringVector('pipeline.steps') if 'pipeline.mapfile' in subpipeline_parset.keywords(): submapfile = subpipeline_parset['pipeline.mapfile'] subpipeline_parset.remove('pipeline.mapfile') if 'mapfile_in' in inputdict.keys(): submapfile = inputdict.pop('mapfile_in') resultdicts.update({os.path.splitext(os.path.basename(typeval))[0]: { 'parset': typeval, 'mapfile': submapfile, }}) #todo: take care of pluginpathes and everything other then individual steps # make a pipeline parse methods that returns everything needed. # maybe as dicts to combine them to one subpipeline_parset.remove('pipeline.steps') if 'pipeline.pluginpath' in subpipeline_parset.keywords(): subpipeline_parset.remove('pipeline.pluginpath') checklist = copy.deepcopy(subpipeline_steplist) for k in self._keys(subpipeline_parset): if 'loopsteps' in k: for item in subpipeline_parset.getStringVector(k): checklist.append(item) # ********************************************************************* # master parset did not handle formatting and comments in the parset. # proper format only after use of parset.makesubset. then it is a different object # from a different super class :(. this also explains use of parset.keys and parset.keys() # take the parset from subpipeline and add it to the master parset. # UPDATE: do not use .keys on master parset. use .keywords(), then comments are filtered. # ********************************************************************* # replace names of steps with the subpipeline stepname to create a unique identifier. # replacement values starting with ! will be taken from the master parset and overwrite # the ones in the subpipeline. only works if the ! 
value is already in the subpipeline for k in self._keys(subpipeline_parset): val = subpipeline_parset[k] if not str(k).startswith('!') and not str(k).startswith('pipeline.replace.'): for item in checklist: if item+".output" in str(val): val = str(val).replace(item, stepname + '-' + item) self.parset.add(stepname + '-' + k, str(val)) else: # remove replacements strings to prevent loading the same key twice if k in self._keys(self.parset): self.parset.remove(k) self.parset.add(k, str(val)) for i, item in enumerate(subpipeline_steplist): subpipeline_steplist[i] = stepname + '-' + item for item in step_parset_obj[stepname].keys(): for k in self._keys(self.parset): if str(k).startswith('!') and item == str(k).strip("! ") or str(k).startswith('pipeline.replace.') and item == str(k)[17:].strip(): self.parset.remove(k) self.parset.add('! ' + item, str(step_parset_obj[stepname][item])) self._replace_values() self._construct_steps(subpipeline_steplist, step_control_dict, step_parset_files, step_parset_obj, parset_dir) for j in reversed(subpipeline_steplist): name = j step_control_dict[name] = step_control_dict[j] step_name_list.insert(0, name) # loop if kind_of_step == 'loop': # remember what loop is running to stop it from a conditional step if activeloop[0] is not stepname: activeloop.insert(0, stepname) # prepare counter = 0 breakloop = False if stepname in resultdicts: counter = int(resultdicts[stepname]['counter']) + 1 breakloop = resultdicts[stepname]['break'] loopsteps = step.getStringVector('loopsteps') # break at max iteration or when other step sets break variable if counter is step.getInt('loopcount'): breakloop = True if not breakloop: # add loop steps to the pipeline including the loop itself step_name_list.insert(0, stepname) self._construct_steps(loopsteps, step_control_dict, step_parset_files, step_parset_obj, parset_dir) for j in reversed(loopsteps): name = j step_control_dict[name] = step_control_dict[j] step_name_list.insert(0, name) # results for other steps to check and write states resultdict = {'counter': counter, 'break': breakloop} else: # reset values for second use of the loop (but why would you do that?) resultdict = {'counter': -1, 'break': False} activeloop.pop(0) # recipes if kind_of_step == 'recipe': with duration(self, stepname): resultdict = self.run_task( typeval, inputargs, **inputdict ) # plugins if kind_of_step == 'plugin': bla = str(self.config.get('DEFAULT', 'recipe_directories')) pluginpath = bla.rstrip(']').lstrip('[').split(',') for i, item in enumerate(pluginpath): pluginpath[i] = os.path.join(item, 'plugins') if 'pluginpath' in pipeline_args.keys(): pluginpath.append(pipeline_args.getString('pluginpath')) with duration(self, stepname): resultdict = loader.call_plugin(typeval, pluginpath, inputargs, **inputdict) resultdicts[stepname] = resultdict # breaking the loopstep # if the step has the keyword for loopbreaks assign the value if activeloop[0] in resultdicts and resultdict is not None and 'break' in resultdict: resultdicts[activeloop[0]]['break'] = resultdict['break']
def run(self, infile, executable, args, kwargs, work_dir='/tmp', parsetasfile=False, args_format='', environment=''): """ This method contains all the needed functionality """ # Debugging info self.logger.debug("infile = %s" % infile) self.logger.debug("executable = %s" % executable) self.logger.debug("working directory = %s" % work_dir) self.logger.debug("arguments = %s" % args) self.logger.debug("arg dictionary = %s" % kwargs) self.logger.debug("environment = %s" % environment) self.environment.update(environment) # Time execution of this job with log_time(self.logger): #if os.path.exists(infile): self.logger.info("Processing %s" % infile) # else: # self.logger.error("Dataset %s does not exist" % infile) # return 1 # Check if executable is present if not os.access(executable, os.X_OK): self.logger.error("Executable %s not found" % executable) return 1 # hurray! race condition when running with more than one process on one filesystem if not os.path.isdir(work_dir): try: os.mkdir(work_dir, ) except OSError as exc: # Python >2.5 if exc.errno == errno.EEXIST and os.path.isdir(work_dir): pass else: raise argsformat = args_format['args_format'] # deal with multiple input files for wsclean if argsformat == 'wsclean': for i in reversed(xrange(len(args))): if str(args[i]).startswith('[') and str(args[i]).endswith(']'): tmplist = args.pop(i).lstrip('[').rstrip(']').split(',') for val in reversed(tmplist): args.insert(i, val.strip(' \'\"')) if not parsetasfile: if argsformat == 'gnu': for k, v in kwargs.items(): args.append('--' + k + '=' + v) if argsformat == 'lofar': for k, v in kwargs.items(): args.append(k + '=' + v) if argsformat == 'argparse': for k, v in kwargs.items(): args.append('--' + k + ' ' + v) if argsformat == 'wsclean': for k, v in kwargs.items(): if str(v).startswith('[') and str(v).endswith(']'): v = v.lstrip('[').rstrip(']').replace(' ', '') multargs = v.split(',') else: multargs = v.split(' ') if multargs: multargs.reverse() for item in multargs: args.insert(0, item) else: args.insert(0, v) args.insert(0, '-'+ k) else: nodeparset = Parset() parsetname = os.path.join(work_dir, os.path.basename(infile) + '.parset') for k, v in kwargs.items(): nodeparset.add(k, v) nodeparset.writeFile(parsetname) if argsformat == 'losoto': args.append(parsetname) else: args.insert(0,parsetname) try: # **************************************************************** # Run cmd = [executable] + args with CatchLog4CPlus( work_dir, self.logger.name + "." + os.path.basename(infile), os.path.basename(executable), ) as logger: # Catch segfaults and retry catch_segfaults( cmd, work_dir, self.environment, logger ) except CalledProcessError, err: # CalledProcessError isn't properly propagated by IPython self.logger.error(str(err)) return 1 except Exception, err: self.logger.error(str(err)) return 1
def run(self, imager_exec, vds, parset, resultsdir, start_time, end_time): # imager_exec: path to cimager executable # vds: VDS file describing the data to be imaged # parset: imager configuration # resultsdir: place resulting images here # start_time: ) time range to be imaged # end_time: ) in seconds (may be None) # ---------------------------------------------------------------------- with log_time(self.logger): self.logger.info("Processing %s" % (vds, )) # Bail out if destination exists (can thus resume a partial run). # Should be configurable? # ------------------------------------------------------------------ parset_data = Parset(parset) image_names = parset_data.getStringVector("Cimager.Images.Names") for image_name in image_names: outputfile = os.path.join(resultsdir, image_name + ".restored") self.logger.info(outputfile) if os.path.exists(outputfile): self.logger.info("Image already exists: aborting.") return 0 try: working_dir = mkdtemp(suffix=".%s" % (os.path.basename(__file__), )) # If a time range has been specified, copy that section of the # input MS and only image that. # -------------------------------------------------------------- query = [] if start_time: self.logger.debug("Start time is %s" % start_time) start_time = quantity(float(start_time), 's') query.append("TIME > %f" % start_time.get('s').get_value()) if end_time: self.logger.debug("End time is %s" % end_time) end_time = quantity(float(end_time), 's') query.append("TIME < %f" % end_time.get('s').get_value()) query = " AND ".join(query) if query: # Select relevant section of MS. # ---------------------------------------------------------- self.logger.debug("Query is %s" % query) output = os.path.join(working_dir, "timeslice.MS") vds_parset = get_parset(vds) t = table(vds_parset.getString("FileName")) t.query(query, name=output) # Patch updated information into imager configuration. # ---------------------------------------------------------- parset = patch_parset(parset, {'Cimager.dataset': output}) else: self.logger.debug("No time range selected") self.logger.debug("Running cimager") with CatchLog4CXX( working_dir, self.logger.name + "." + os.path.basename(vds)): cimager_process = Popen([imager_exec, "-inputs", parset], stdout=PIPE, stderr=PIPE, cwd=working_dir) sout, serr = cimager_process.communicate() log_process_output("cimager", sout, serr, self.logger) if cimager_process.returncode != 0: raise CalledProcessError(cimager_process.returncode, imager_exec) # Dump the resulting images in the pipeline results area. # I'm not aware of a foolproof way to predict the image names # that will be produced, so we read them from the # parset and add standard cimager prefixes. # -------------------------------------------------------------- parset_data = Parset(parset) image_names = parset_data.getStringVector( "Cimager.Images.Names") prefixes = [ "image", "psf", "residual", "weights", "sensitivity" ] self.logger.debug("Copying images to %s" % resultsdir) for image_name in image_names: for prefix in prefixes: filename = image_name.replace("image", prefix, 1) shutil.move(os.path.join(working_dir, filename), os.path.join(resultsdir, filename)) if parset_data.getBool('Cimager.restore'): shutil.move( os.path.join(working_dir, image_name + ".restored"), os.path.join(resultsdir, image_name + ".restored")) except CalledProcessError, e: self.logger.error(str(e)) return 1 finally:
def go(self): if 'executable' in self.inputs: executable = self.inputs['executable'] if 'environment' in self.inputs: self.environment.update(self.inputs['environment']) self.logger.info("Starting %s run" % executable) super(executable_args, self).go() # args format stuff args_format = {'args_format': self.inputs['args_format'], 'args_format_argument': self.inputs['args_format_argument'], 'args_format_option': self.inputs['args_format_option'], 'args_formatlongoption': self.inputs['args_format_longoption'], 'args_format_option_argument': self.inputs['args_format_option_argument']} # ********************************************************************* # try loading input/output data file, validate output vs the input location if # output locations are provided try: inputmapfiles = [] inlist = [] if self.inputs['mapfile_in']: inlist.append(self.inputs['mapfile_in']) if self.inputs['mapfiles_in']: for item in self.inputs['mapfiles_in']: inlist.append(item) self.inputs['mapfile_in'] = self.inputs['mapfiles_in'][0] for item in inlist: inputmapfiles.append(DataMap.load(item)) except Exception: self.logger.error('Could not load input Mapfile %s' % inlist) return 1 outputmapfiles = [] prefix = os.path.join(self.inputs['working_directory'], self.inputs['job_name']) if self.inputs['mapfile_out']: try: outdata = DataMap.load(self.inputs['mapfile_out']) outputmapfiles.append(outdata) except Exception: self.logger.error('Could not load output Mapfile %s' % self.inputs['mapfile_out']) return 1 # sync skip fields in the mapfiles align_data_maps(inputmapfiles[0], outputmapfiles[0]) elif self.inputs['mapfiles_out']: for item in self.inputs['mapfiles_out']: outputmapfiles.append(DataMap.load(item)) self.inputs['mapfile_out'] = self.inputs['mapfiles_out'][0] else: # ouput will be directed in the working directory if no output mapfile is specified outdata = copy.deepcopy(inputmapfiles[0]) if not self.inputs['inplace']: for item in outdata: item.file = os.path.join( self.inputs['working_directory'], self.inputs['job_name'], #os.path.basename(item.file) + '.' + os.path.split(str(executable))[1] os.path.splitext(os.path.basename(item.file))[0] + '.' + self.inputs['stepname'] ) self.inputs['mapfile_out'] = os.path.join(prefix, self.inputs['stepname'] + '.' + 'mapfile') self.inputs['mapfiles_out'].append(self.inputs['mapfile_out']) else: self.inputs['mapfile_out'] = self.inputs['mapfile_in'] self.inputs['mapfiles_out'].append(self.inputs['mapfile_out']) outputmapfiles.append(outdata) if not validate_data_maps(inputmapfiles[0], outputmapfiles[0]): self.logger.error( "Validation of data mapfiles failed!" ) return 1 if self.inputs['outputsuffixes']: # Handle multiple outputfiles for name in self.inputs['outputsuffixes']: outputmapfiles.append(copy.deepcopy(inputmapfiles[0])) self.inputs['mapfiles_out'].append(os.path.join(prefix, self.inputs['stepname'] + name + '.' + 'mapfile')) for item in outputmapfiles[-1]: item.file = os.path.join( prefix, os.path.splitext(os.path.basename(item.file))[0] + '.' 
+ self.inputs['stepname'] + name ) self.inputs['mapfile_out'] = self.inputs['mapfiles_out'][0] # prepare arguments arglist = self.inputs['arguments'] parsetdict = {} if 'parset' in self.inputs: parset = Parset() parset.adoptFile(self.inputs['parset']) for k in parset.keys: parsetdict[k] = str(parset[k]) # construct multiple input data if not self.inputs['inputkeys'] and self.inputs['inputkey']: self.inputs['inputkeys'].append(self.inputs['inputkey']) if not self.inputs['outputkeys'] and self.inputs['outputkey']: self.inputs['outputkeys'].append(self.inputs['outputkey']) if not self.inputs['skip_infile'] and len(self.inputs['inputkeys']) is not len(inputmapfiles): self.logger.error("Number of input mapfiles %d and input keys %d have to match." % len(self.inputs['inputkeys']), len(inputmapfiles)) return 1 filedict = {} if self.inputs['inputkeys'] and not self.inputs['skip_infile']: for key, filemap in zip(self.inputs['inputkeys'], inputmapfiles): filedict[key] = [] for inp in filemap: filedict[key].append(inp.file) if self.inputs['outputkey']: filedict[self.inputs['outputkey']] = [] for item in outputmapfiles[0]: filedict[self.inputs['outputkey']].append(item.file) # ******************************************************************** # Call the node side of the recipe # Create and schedule the compute jobs command = "python %s" % (self.__file__.replace('master', 'nodes')).replace('executable_args', self.inputs['nodescript']) inputmapfiles[0].iterator = outputmapfiles[0].iterator = DataMap.SkipIterator jobs = [] for i, (outp, inp,) in enumerate(zip( outputmapfiles[0], inputmapfiles[0]) ): arglist_copy = copy.deepcopy(arglist) parsetdict_copy = copy.deepcopy(parsetdict) #if keylist: #for name, value in zip(keylist, inputlist): if filedict: for name, value in filedict.iteritems(): if arglist_copy and name in arglist_copy: ind = arglist_copy.index(name) arglist_copy[ind] = value[i] elif name in parsetdict_copy.values(): for k, v in parsetdict_copy.iteritems(): if v == name: parsetdict_copy[k] = value[i] else: parsetdict_copy[name] = value[i] jobs.append( ComputeJob( inp.host, command, arguments=[ inp.file, executable, arglist_copy, parsetdict_copy, prefix, self.inputs['parsetasfile'], args_format, #self.inputs['working_directory'], self.environment ] ) ) max_per_node = self.inputs['max_per_node'] self._schedule_jobs(jobs, max_per_node) jobresultdict = {} resultmap = {} for job, outp in zip(jobs, outputmapfiles[0]): if job.results['returncode'] != 0: outp.skip = True for k, v in job.results.items(): if not k in jobresultdict: jobresultdict[k] = [] jobresultdict[k].append(DataProduct(job.host, job.results[k], outp.skip)) if k == 'break': self.outputs.update({'break': v}) # temp solution. write all output dict entries to a mapfile mapfile_dir = os.path.join(self.config.get("layout", "job_directory"), "mapfiles") for k, v in jobresultdict.items(): dmap = DataMap(v) dmap.save(os.path.join(mapfile_dir, k + '.mapfile')) resultmap[k + '.mapfile'] = os.path.join(mapfile_dir, k + '.mapfile') self.outputs.update(resultmap) # ********************************************************************* # Check job results, and create output data map file if self.error.isSet(): # Abort if all jobs failed if all(job.results['returncode'] != 0 for job in jobs): self.logger.error("All jobs failed. 
Bailing out!") return 1 else: self.logger.warn( "Some jobs failed, continuing with succeeded runs" ) mapdict = {} for item, name in zip(outputmapfiles, self.inputs['mapfiles_out']): self.logger.debug("Writing data map file: %s" % name) item.save(name) mapdict[os.path.basename(name)] = name self.outputs['mapfile'] = self.inputs['mapfile_out'] if self.inputs['outputsuffixes']: self.outputs.update(mapdict) return 0
def run(self, infile, executable, args, kwargs, work_dir='/tmp', parsetasfile=False, args_format='', environment=''): """ This function contains all the needed functionality """ # Debugging info self.logger.debug("infile = %s" % infile) self.logger.debug("executable = %s" % executable) self.logger.debug("working directory = %s" % work_dir) self.logger.debug("arguments = %s" % args) self.logger.debug("arg dictionary = %s" % kwargs) self.logger.debug("environment = %s" % environment) self.environment.update(environment) # hack the planet #executable = 'casa' # Time execution of this job with log_time(self.logger): if os.path.exists(infile): self.logger.info("Processing %s" % infile) else: self.logger.error("Dataset %s does not exist" % infile) return 1 # Check if executable is present if not os.access(executable, os.X_OK): self.logger.error("Executable %s not found" % executable) return 1 # hurray! race condition when running with than one process on one filesystem if not os.path.isdir(work_dir): try: os.mkdir(work_dir, ) except OSError as exc: # Python >2.5 if exc.errno == errno.EEXIST and os.path.isdir(work_dir): pass else: raise #print 'KWARGS: ', kwargs if not parsetasfile: for k, v in kwargs.items(): args.append('--' + k + '=' + v) else: nodeparset = Parset() sublist = [] for k, v in kwargs.items(): nodeparset.add(k, v) if str(k).find('.'): #print 'DOTPOS: ',str(k).find('.') #print 'SPLIT: ', str(k).split('.')[0] #print 'SPLIT: ', str(k).split('.')[1] if not str(k).split('.')[0] in sublist: sublist.append(str(k).split('.')[0]) #print 'SUBPARSETLIST: ', sublist #subpar = Parset() #quick hacks below. for proof of concept. subparsetlist = [] casastring = '' for sub in sublist: subpar = nodeparset.makeSubset( nodeparset.fullModuleName(sub) + '.') #print 'SUBPAR: ',subpar.keys() casastring = sub + '(' for k in subpar.keys(): #print 'SUBPARSET: ',k ,' ',subpar[k] #args.append('--' + k + '=' + subpar[k]) if str(subpar[k]).find('/') == 0: casastring += str(k) + '=' + "'" + str( subpar[k]) + "'" + ',' elif str(subpar[k]).find('/casastr/') == 0: casastring += str(k) + '=' + "'" + str( subpar[k]).strip('/casastr/') + "'" + ',' else: casastring += str(k) + '=' + str(subpar[k]) + ',' casastring = casastring.rstrip(',') casastring += ')\n' #print 'CASASTRING:' #print casastring # 1) return code of a casapy is not properly recognized by the pipeline # wrapping in shellscript works for succesful runs. # failed runs seem to hang the pipeline... # 2) casapy can not have two instances running from the same directory. 
# create tmp dirs casapydir = tempfile.mkdtemp(dir=work_dir) if casastring != '': casafilename = os.path.join( work_dir, os.path.basename(infile) + '.casacommand.py') casacommandfile = open(casafilename, 'w') casacommandfile.write('try:\n') casacommandfile.write(' ' + casastring) casacommandfile.write('except SystemExit:\n') casacommandfile.write(' pass\n') casacommandfile.write('except:\n') casacommandfile.write(' import os\n') casacommandfile.write(' os._exit(1)\n') casacommandfile.close() args.append(casafilename) somename = os.path.join( work_dir, os.path.basename(infile) + '.casashell.sh') commandstring = '' commandstring += executable for item in args: commandstring += ' ' + item #print 'COMMANDSTRING: ',commandstring crap = open(somename, 'w') crap.write('#!/bin/bash \n') crap.write('echo "Trying CASAPY command" \n') #crap.write('/home/zam/sfroehli/casapy-42.1.29047-001-1-64b/bin/casa' + ' --nologger'+' -c ' + casafilename) crap.write(commandstring) # crap.write('\nexit 0') crap.close() import stat st = os.stat(somename) #os.chmod(casafilename, stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH) os.chmod( somename, st.st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH) try: # **************************************************************** # Run #cmd = [executable] + args cmd = [somename] with CatchLog4CPlus( casapydir, self.logger.name + "." + os.path.basename(infile), os.path.basename(executable), ) as logger: # Catch segfaults and retry catch_segfaults(cmd, casapydir, self.environment, logger) except CalledProcessError, err: # CalledProcessError isn't properly propagated by IPython self.logger.error(str(err)) return 1 except Exception, err: self.logger.error(str(err)) return 1
def run(self, infile, executable, args, kwargs, work_dir='/tmp', parsetasfile=True, args_format='', environment=''): """ This method contains all the needed functionality """ # Debugging info self.logger.debug("infile = %s" % infile) self.logger.debug("executable = %s" % executable) self.logger.debug("working directory = %s" % work_dir) self.logger.debug("arguments = %s" % args) self.logger.debug("arg dictionary = %s" % kwargs) self.logger.debug("environment = %s" % environment) self.environment.update(environment) self.work_dir = work_dir self.infile = infile self.executable = executable if 'replace-sourcedb' in kwargs: self.replace_sourcedb = kwargs['replace-sourcedb'] kwargs.pop('replace-sourcedb') if 'replace-parmdb' in kwargs: self.replace_parmdb = kwargs['replace-parmdb'] kwargs.pop('replace-parmdb') if 'dry-run' in kwargs: self.dry_run = kwargs['dry-run'] kwargs.pop('dry-run') if 'sourcedb' in kwargs: self.sourcedb = kwargs['sourcedb'] kwargs.pop('sourcedb') if 'parmdb' in kwargs: self.parmdb = kwargs['parmdb'] kwargs.pop('parmdb') if 'sourcedb-name' in kwargs: self.sourcedb_basename = kwargs['sourcedb-name'] self.replace_sourcedb = True kwargs.pop('sourcedb-name') if 'parmdb-name' in kwargs: self.parmdb_basename = kwargs['parmdb-name'] self.replace_parmdb = True kwargs.pop('parmdb-name') if 'force' in kwargs: self.replace_parmdb = True self.replace_sourcedb = True kwargs.pop('force') numthreads = 1 if 'numthreads' in kwargs: numthreads = kwargs['numthreads'] kwargs.pop('numthreads') args.append('--numthreads=' + str(numthreads)) if 'observation' in kwargs: self.observation = kwargs.pop('observation') if 'catalog' in kwargs: self.catalog = kwargs.pop('catalog') self.createsourcedb() self.createparmdb() if not 'no-columns' in kwargs: #if not kwargs['no-columns']: self.addcolumns() else: kwargs.pop('no-columns') args.append('--sourcedb=' + self.sourcedb_path) args.append('--parmdb=' + self.parmdb_path) args.append(self.observation) #catalog = None # Time execution of this job with log_time(self.logger): #if os.path.exists(infile): self.logger.info("Processing %s" % infile) # Check if script is present if not os.path.isfile(executable): self.logger.error("Executable %s not found" % executable) return 1 # hurray! race condition when running with more than one process on one filesystem if not os.path.isdir(work_dir): try: os.mkdir(work_dir, ) except OSError as exc: # Python >2.5 if exc.errno == errno.EEXIST and os.path.isdir(work_dir): pass else: raise if parsetasfile: nodeparset = Parset() parsetname = os.path.join(work_dir, os.path.basename(infile) + '.parset') for k, v in list(kwargs.items()): nodeparset.add(k, v) nodeparset.writeFile(parsetname) #args.insert(0, parsetname) args.append(parsetname) #if catalog is not None: # args.append(catalog) try: # **************************************************************** #Run cmd = [executable] + args with CatchLog4CPlus( work_dir, self.logger.name + "." + os.path.basename(infile), os.path.basename(executable), ) as logger: # Catch segfaults and retry catch_segfaults(cmd, work_dir, self.environment, logger) except CalledProcessError as err: # CalledProcessError isn't properly propagated by IPython self.logger.error(str(err)) return 1 except Exception as err: self.logger.error(str(err)) return 1 # We need some signal to the master script that the script ran ok. self.outputs['ok'] = True return 0
def pipeline_logic(self): try: parset_file = os.path.abspath(self.inputs['args'][0]) except IndexError: return self.usage() try: if self.parset.keys == []: self.parset.adoptFile(parset_file) self.parset_feedback_file = parset_file + "_feedback" except RuntimeError: print >> sys.stderr, "Error: Parset file not found!" return self.usage() self._replace_values() # just a reminder that this has to be implemented validator = GenericPipelineParsetValidation(self.parset) if not validator.validate_pipeline(): self.usage() exit(1) if not validator.validate_steps(): self.usage() exit(1) #set up directories job_dir = self.config.get("layout", "job_directory") parset_dir = os.path.join(job_dir, "parsets") mapfile_dir = os.path.join(job_dir, "mapfiles") # Create directories for temporary parset- and map files create_directory(parset_dir) create_directory(mapfile_dir) # ********************************************************************* # maybe we dont need a subset but just a steplist # at the moment only a list with stepnames is given for the pipeline.steps parameter # pipeline.steps=[vdsmaker,vdsreader,setupparmdb1,setupsourcedb1,ndppp1,....] # the names will be the prefix for parset subsets pipeline_args = self.parset.makeSubset( self.parset.fullModuleName('pipeline') + '.') # ********************************************************************* # forward declaration of things. just for better overview and understanding whats in here. # some of this might be removed in upcoming iterations, or stuff gets added. step_name_list = pipeline_args.getStringVector('steps') step_control_dict = {} step_parset_files = {} step_parset_obj = {} activeloop = [''] # construct the list of step names and controls self._construct_steps(step_name_list, step_control_dict, step_parset_files, step_parset_obj, parset_dir) # initial parameters to be saved in resultsdict so that recipes have access to this step0 # double init values. 'input' should be considered deprecated # self.name would be consistent to use in subpipelines resultdicts = { 'input': { 'parset': parset_file, 'parsetobj': self.parset, 'job_dir': job_dir, 'parset_dir': parset_dir, 'mapfile_dir': mapfile_dir } } resultdicts.update({ self.name: { 'parset': parset_file, 'parsetobj': self.parset, 'job_dir': job_dir, 'parset_dir': parset_dir, 'mapfile_dir': mapfile_dir } }) if 'pipeline.mapfile' in self.parset.keys: resultdicts['input']['mapfile'] = str( self.parset['pipeline.mapfile']) resultdicts[self.name]['mapfile'] = str( self.parset['pipeline.mapfile']) # ********************************************************************* # main loop # there is a distinction between recipes and plugins for user scripts. # plugins are not used at the moment and might better be replaced with master recipes while step_name_list: stepname = step_name_list.pop(0) step = step_control_dict[stepname] #step_parset = step_parset_obj[stepname] inputdict = {} inputargs = [] resultdict = {} # default kind_of_step to recipe. 
try: kind_of_step = step.getString('kind') except: kind_of_step = 'recipe' try: typeval = step.getString('type') except: typeval = '' #self._construct_cmdline(inputargs, step, resultdicts) additional_input = {} if stepname in step_parset_obj: additional_input = self._construct_step_parset( step_parset_obj[stepname], resultdicts, step_parset_files[stepname], stepname) # stepname not a valid input for old recipes if kind_of_step == 'recipe': if self.task_definitions.get(typeval, 'recipe') == 'executable_args': inputdict = {'stepname': stepname} inputdict.update(additional_input) self._construct_cmdline(inputargs, step, resultdicts) if stepname in step_parset_files: inputdict['parset'] = step_parset_files[stepname] self._construct_input(inputdict, step, resultdicts) # hack, popping 'type' is necessary, why? because you deleted kind already in parsets try: inputdict.pop('type') except: pass try: inputdict.pop('kind') except: pass # \hack # more hacks. Frameworks DictField not properly implemented. Construct your own dict from input. # python buildin functions cant handle the string returned from parset class. if 'environment' in inputdict.keys(): val = inputdict['environment'].rstrip('}').lstrip('{').replace( ' ', '') splitval = str(val).split(',') valdict = {} for item in splitval: valdict[item.split(':')[0]] = item.split(':')[1] inputdict['environment'] = valdict # subpipeline. goal is to specify a pipeline within a pipeline. # load other existing pipeline parset and add them to your own. if kind_of_step == 'pipeline': subpipeline_parset = Parset() subpipeline_parset.adoptFile(typeval) submapfile = '' subpipeline_steplist = subpipeline_parset.getStringVector( 'pipeline.steps') if 'pipeline.mapfile' in subpipeline_parset.keys: submapfile = subpipeline_parset['pipeline.mapfile'] subpipeline_parset.remove('pipeline.mapfile') if 'mapfile_in' in inputdict.keys(): submapfile = inputdict.pop('mapfile_in') resultdicts.update({ os.path.splitext(os.path.basename(typeval))[0]: { 'parset': typeval, 'mapfile': submapfile, } }) #todo: take care of pluginpathes and everything other then individual steps # make a pipeline parse methods that returns everything needed. # maybe as dicts to combine them to one subpipeline_parset.remove('pipeline.steps') if 'pipeline.pluginpath' in subpipeline_parset.keys: subpipeline_parset.remove('pipeline.pluginpath') checklist = copy.deepcopy(subpipeline_steplist) for k in subpipeline_parset.keys: if 'loopsteps' in k: for item in subpipeline_parset.getStringVector(k): checklist.append(item) # ********************************************************************* # master parset did not handle formatting and comments in the parset. # proper format only after use of parset.makesubset. then it is a different object # from a different super class :(. this also explains use of parset.keys and parset.keys() # take the parset from subpipeline and add it to the master parset. # ********************************************************************* # replace names of steps with the subpipeline stepname to create a unique identifier. # replacement values starting with ! will be taken from the master parset and overwrite # the ones in the subpipeline. only works if the ! 
value is already in the subpipeline for k in subpipeline_parset.keys: if not str(k).startswith('#'): val = subpipeline_parset[k] if not str(k).startswith('!'): for item in checklist: if item in str(val): val = str(val).replace( item, stepname + '-' + item) self.parset.add(stepname + '-' + k, str(val)) else: self.parset.add(k, str(val)) for i, item in enumerate(subpipeline_steplist): subpipeline_steplist[i] = stepname + '-' + item for item in step_parset_obj[stepname].keys(): for k in self.parset.keys: if str(k).startswith('!') and item in k: self.parset.remove(k) self.parset.add( '! ' + item, str(step_parset_obj[stepname][item])) self._replace_values() self._construct_steps(subpipeline_steplist, step_control_dict, step_parset_files, step_parset_obj, parset_dir) for j in reversed(subpipeline_steplist): name = j step_control_dict[name] = step_control_dict[j] step_name_list.insert(0, name) # remove replacements strings to prevent loading the same key twice for k in copy.deepcopy(self.parset.keys): if str(k).startswith('!'): self.parset.remove(k) # loop if kind_of_step == 'loop': # remember what loop is running to stop it from a conditional step if activeloop[0] is not stepname: activeloop.insert(0, stepname) # prepare counter = 0 breakloop = False if stepname in resultdicts: counter = int(resultdicts[stepname]['counter']) + 1 breakloop = resultdicts[stepname]['break'] loopsteps = step.getStringVector('loopsteps') # break at max iteration or when other step sets break variable if counter is step.getInt('loopcount'): breakloop = True if not breakloop: # add loop steps to the pipeline including the loop itself step_name_list.insert(0, stepname) self._construct_steps(loopsteps, step_control_dict, step_parset_files, step_parset_obj, parset_dir) for j in reversed(loopsteps): name = j step_control_dict[name] = step_control_dict[j] step_name_list.insert(0, name) # results for other steps to check and write states resultdict = {'counter': counter, 'break': breakloop} else: # reset values for second use of the loop (but why would you do that?) resultdict = {'counter': -1, 'break': False} activeloop.pop(0) # recipes if kind_of_step == 'recipe': with duration(self, stepname): resultdict = self.run_task(typeval, inputargs, **inputdict) # plugins if kind_of_step == 'plugin': with duration(self, stepname): resultdict = loader.call_plugin( typeval, pipeline_args.getString('pluginpath'), inputargs, **inputdict) resultdicts[stepname] = resultdict # breaking the loopstep # if the step has the keyword for loopbreaks assign the value if resultdict is not None and 'break' in resultdict: if resultdict['break']: resultdicts[activeloop[0]]['break'] = resultdict['break']
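# Hedged sketch (not from the source): the 'loop' handling above keeps a
# counter and a break flag in the step's result dict and re-queues the loop
# step until either limit is reached. A minimal standalone model of that
# bookkeeping; note it compares the counter with '==', which is what the
# integer comparison in the recipe is intended to express:
def advance_loop(previous_result, loopcount):
    """Return (counter, breakloop) for the next iteration of a loop step."""
    counter = 0
    breakloop = False
    if previous_result is not None:
        counter = int(previous_result['counter']) + 1
        breakloop = bool(previous_result['break'])
    if counter == loopcount:
        # stop at the configured maximum number of iterations
        breakloop = True
    return counter, breakloop

# Example (hypothetical): advance_loop({'counter': 2, 'break': False}, 3) -> (3, True)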
class GenericPipeline(control): inputs = { 'loglevel': ingredient.StringField('--loglevel', help="loglevel", default='INFO', optional=True) } def __init__(self): control.__init__(self) self.parset = Parset() self.input_data = {} self.output_data = {} self.parset_feedback_file = None #self.logger = None#logging.RootLogger('DEBUG') self.name = '' #if not overwrite: # self.inputs['job_name'] = 'generic-pipeline' # if not self.inputs.has_key("start_time"): # import datetime # self.inputs["start_time"] = datetime.datetime.utcnow().replace(microsecond=0).isoformat() # if not hasattr(self, "config"): # self.config = self._read_config() # #self._read_config() # # ...and task files, if applicable # if not self.inputs.has_key("task_files"): # try: # self.inputs["task_files"] = utilities.string_to_list( # self.config.get('DEFAULT', "task_files") # ) # except NoOptionError: # self.inputs["task_files"] = [] # self.task_definitions = ConfigParser(self.config.defaults()) # print >> sys.stderr, "Reading task definition file(s): %s" % \ # ",".join(self.inputs["task_files"]) # self.task_definitions.read(self.inputs["task_files"]) # self.go() def usage(self): """ Display usage """ print >> sys.stderr, "Usage: %s [options] <parset-file>" % sys.argv[0] print >> sys.stderr, "Parset structure should look like:\n" \ "NYI" #return 1 def go(self): #""" #Read the parset-file that was given as input argument, and set the #jobname before calling the base-class's `go()` method. #""" try: parset_file = os.path.abspath(self.inputs['args'][0]) except IndexError: #return self.usage() self.usage() # Set job-name to basename of parset-file w/o extension, if it's not # set on the command-line with '-j' or '--job-name' if not 'job_name' in self.inputs: self.inputs['job_name'] = (os.path.splitext( os.path.basename(parset_file))[0]) self.name = self.inputs['job_name'] try: self.logger except: self.logger = getSearchingLogger(self.name) self.logger.setLevel(self.inputs['loglevel']) # Call the base-class's `go()` method. return super(GenericPipeline, self).go() # def pipeline_logic(self): # print 'Dummy because of stupid wrapping inside the framework' # if overwrite: # self.execute_pipeline() #def execute_pipeline(self): def pipeline_logic(self): try: parset_file = os.path.abspath(self.inputs['args'][0]) except IndexError: return self.usage() try: if self.parset.keys == []: self.parset.adoptFile(parset_file) self.parset_feedback_file = parset_file + "_feedback" except RuntimeError: print >> sys.stderr, "Error: Parset file not found!" return self.usage() self._replace_values() # just a reminder that this has to be implemented validator = GenericPipelineParsetValidation(self.parset) if not validator.validate_pipeline(): self.usage() exit(1) if not validator.validate_steps(): self.usage() exit(1) #set up directories job_dir = self.config.get("layout", "job_directory") parset_dir = os.path.join(job_dir, "parsets") mapfile_dir = os.path.join(job_dir, "mapfiles") # Create directories for temporary parset- and map files create_directory(parset_dir) create_directory(mapfile_dir) # ********************************************************************* # maybe we dont need a subset but just a steplist # at the moment only a list with stepnames is given for the pipeline.steps parameter # pipeline.steps=[vdsmaker,vdsreader,setupparmdb1,setupsourcedb1,ndppp1,....] 
# the names will be the prefix for parset subsets pipeline_args = self.parset.makeSubset( self.parset.fullModuleName('pipeline') + '.') # ********************************************************************* # forward declaration of things. just for better overview and understanding whats in here. # some of this might be removed in upcoming iterations, or stuff gets added. step_name_list = pipeline_args.getStringVector('steps') step_control_dict = {} step_parset_files = {} step_parset_obj = {} activeloop = [''] # construct the list of step names and controls self._construct_steps(step_name_list, step_control_dict, step_parset_files, step_parset_obj, parset_dir) # initial parameters to be saved in resultsdict so that recipes have access to this step0 # double init values. 'input' should be considered deprecated # self.name would be consistent to use in subpipelines resultdicts = { 'input': { 'parset': parset_file, 'parsetobj': self.parset, 'job_dir': job_dir, 'parset_dir': parset_dir, 'mapfile_dir': mapfile_dir } } resultdicts.update({ self.name: { 'parset': parset_file, 'parsetobj': self.parset, 'job_dir': job_dir, 'parset_dir': parset_dir, 'mapfile_dir': mapfile_dir } }) if 'pipeline.mapfile' in self.parset.keys: resultdicts['input']['mapfile'] = str( self.parset['pipeline.mapfile']) resultdicts[self.name]['mapfile'] = str( self.parset['pipeline.mapfile']) # ********************************************************************* # main loop # there is a distinction between recipes and plugins for user scripts. # plugins are not used at the moment and might better be replaced with master recipes while step_name_list: stepname = step_name_list.pop(0) step = step_control_dict[stepname] #step_parset = step_parset_obj[stepname] inputdict = {} inputargs = [] resultdict = {} # default kind_of_step to recipe. try: kind_of_step = step.getString('kind') except: kind_of_step = 'recipe' try: typeval = step.getString('type') except: typeval = '' #self._construct_cmdline(inputargs, step, resultdicts) additional_input = {} if stepname in step_parset_obj: additional_input = self._construct_step_parset( step_parset_obj[stepname], resultdicts, step_parset_files[stepname], stepname) # stepname not a valid input for old recipes if kind_of_step == 'recipe': if self.task_definitions.get(typeval, 'recipe') == 'executable_args': inputdict = {'stepname': stepname} inputdict.update(additional_input) self._construct_cmdline(inputargs, step, resultdicts) if stepname in step_parset_files: inputdict['parset'] = step_parset_files[stepname] self._construct_input(inputdict, step, resultdicts) # hack, popping 'type' is necessary, why? because you deleted kind already in parsets try: inputdict.pop('type') except: pass try: inputdict.pop('kind') except: pass # \hack # more hacks. Frameworks DictField not properly implemented. Construct your own dict from input. # python buildin functions cant handle the string returned from parset class. if 'environment' in inputdict.keys(): val = inputdict['environment'].rstrip('}').lstrip('{').replace( ' ', '') splitval = str(val).split(',') valdict = {} for item in splitval: valdict[item.split(':')[0]] = item.split(':')[1] inputdict['environment'] = valdict # subpipeline. goal is to specify a pipeline within a pipeline. # load other existing pipeline parset and add them to your own. 
if kind_of_step == 'pipeline': subpipeline_parset = Parset() subpipeline_parset.adoptFile(typeval) submapfile = '' subpipeline_steplist = subpipeline_parset.getStringVector( 'pipeline.steps') if 'pipeline.mapfile' in subpipeline_parset.keys: submapfile = subpipeline_parset['pipeline.mapfile'] subpipeline_parset.remove('pipeline.mapfile') if 'mapfile_in' in inputdict.keys(): submapfile = inputdict.pop('mapfile_in') resultdicts.update({ os.path.splitext(os.path.basename(typeval))[0]: { 'parset': typeval, 'mapfile': submapfile, } }) #todo: take care of pluginpathes and everything other then individual steps # make a pipeline parse methods that returns everything needed. # maybe as dicts to combine them to one subpipeline_parset.remove('pipeline.steps') if 'pipeline.pluginpath' in subpipeline_parset.keys: subpipeline_parset.remove('pipeline.pluginpath') checklist = copy.deepcopy(subpipeline_steplist) for k in subpipeline_parset.keys: if 'loopsteps' in k: for item in subpipeline_parset.getStringVector(k): checklist.append(item) # ********************************************************************* # master parset did not handle formatting and comments in the parset. # proper format only after use of parset.makesubset. then it is a different object # from a different super class :(. this also explains use of parset.keys and parset.keys() # take the parset from subpipeline and add it to the master parset. # ********************************************************************* # replace names of steps with the subpipeline stepname to create a unique identifier. # replacement values starting with ! will be taken from the master parset and overwrite # the ones in the subpipeline. only works if the ! value is already in the subpipeline for k in subpipeline_parset.keys: if not str(k).startswith('#'): val = subpipeline_parset[k] if not str(k).startswith('!'): for item in checklist: if item in str(val): val = str(val).replace( item, stepname + '-' + item) self.parset.add(stepname + '-' + k, str(val)) else: self.parset.add(k, str(val)) for i, item in enumerate(subpipeline_steplist): subpipeline_steplist[i] = stepname + '-' + item for item in step_parset_obj[stepname].keys(): for k in self.parset.keys: if str(k).startswith('!') and item in k: self.parset.remove(k) self.parset.add( '! 
' + item, str(step_parset_obj[stepname][item])) self._replace_values() self._construct_steps(subpipeline_steplist, step_control_dict, step_parset_files, step_parset_obj, parset_dir) for j in reversed(subpipeline_steplist): name = j step_control_dict[name] = step_control_dict[j] step_name_list.insert(0, name) # remove replacements strings to prevent loading the same key twice for k in copy.deepcopy(self.parset.keys): if str(k).startswith('!'): self.parset.remove(k) # loop if kind_of_step == 'loop': # remember what loop is running to stop it from a conditional step if activeloop[0] is not stepname: activeloop.insert(0, stepname) # prepare counter = 0 breakloop = False if stepname in resultdicts: counter = int(resultdicts[stepname]['counter']) + 1 breakloop = resultdicts[stepname]['break'] loopsteps = step.getStringVector('loopsteps') # break at max iteration or when other step sets break variable if counter is step.getInt('loopcount'): breakloop = True if not breakloop: # add loop steps to the pipeline including the loop itself step_name_list.insert(0, stepname) self._construct_steps(loopsteps, step_control_dict, step_parset_files, step_parset_obj, parset_dir) for j in reversed(loopsteps): name = j step_control_dict[name] = step_control_dict[j] step_name_list.insert(0, name) # results for other steps to check and write states resultdict = {'counter': counter, 'break': breakloop} else: # reset values for second use of the loop (but why would you do that?) resultdict = {'counter': -1, 'break': False} activeloop.pop(0) # recipes if kind_of_step == 'recipe': with duration(self, stepname): resultdict = self.run_task(typeval, inputargs, **inputdict) # plugins if kind_of_step == 'plugin': with duration(self, stepname): resultdict = loader.call_plugin( typeval, pipeline_args.getString('pluginpath'), inputargs, **inputdict) resultdicts[stepname] = resultdict # breaking the loopstep # if the step has the keyword for loopbreaks assign the value if resultdict is not None and 'break' in resultdict: if resultdict['break']: resultdicts[activeloop[0]]['break'] = resultdict['break'] # ********************************************************************* # build the inputs for the master recipes. 
def _construct_input(self, inoutdict, controlparset, resdicts): # intermediate backward compatibility for opts subparset if controlparset.fullModuleName('opts'): argsparset = controlparset.makeSubset( controlparset.fullModuleName('opts') + '.') # hack elif 'loopcount' not in controlparset.keys(): argsparset = controlparset else: argsparset = controlparset.makeSubset( controlparset.fullModuleName('imaginary') + '.') # \hack self._replace_output_keyword(inoutdict, argsparset, resdicts) def _construct_cmdline(self, inoutargs, controlparset, resdicts): argsparset = controlparset.makeSubset( controlparset.fullModuleName('cmdline') + '.') for k in argsparset.keys(): if argsparset.getString(k).__contains__('.output.'): step, outvar = argsparset.getString(k).split('.output.') inoutargs.append(resdicts[step][outvar]) else: inoutargs.append(argsparset.getString(k)) try: controlparset.remove('cmdline.inmap') except: pass def _construct_steps(self, step_name_list, step_control_dict, step_parset_files, step_parset_obj, parset_dir): step_list_copy = (copy.deepcopy(step_name_list)) counter = 0 while step_list_copy: counter -= 1 stepname = step_list_copy.pop(-1) fullparset = self.parset.makeSubset( self.parset.fullModuleName(str(stepname)) + '.') subparset = fullparset.makeSubset( fullparset.fullModuleName('control') + '.') number = 0 for item in step_list_copy: if item == stepname: number += 1 if number != 0: stepname += str(number) step_name_list[counter] = stepname step_control_dict[stepname] = subparset # double implementation for intermediate backward compatibility if fullparset.fullModuleName( 'parsetarg') or fullparset.fullModuleName('argument'): if fullparset.fullModuleName('parsetarg'): stepparset = fullparset.makeSubset( fullparset.fullModuleName('parsetarg') + '.') if fullparset.fullModuleName('argument'): stepparset = fullparset.makeSubset( fullparset.fullModuleName('argument') + '.') # ********************************************************************* # save parsets # either a filename is given in the main parset # or files will be created from subsets with stepnames.parset as filenames # for name, parset in step_parset_dict.iteritems(): try: file_parset = Parset(stepparset.getString('parset')) for k in file_parset.keys: if not k in stepparset.keys(): stepparset.add(k, str(file_parset[k])) stepparset.remove('parset') except: pass # parset from task.cfg try: file_parset = Parset( self.task_definitions.get(str(subparset['type']), 'parset')) for k in file_parset.keys: if not k in stepparset.keys(): stepparset.add(k, str(file_parset[k])) except: pass # for parset in control section try: file_parset = Parset(subparset.getString('parset')) for k in file_parset.keys: if not k in stepparset.keys(): stepparset.add(k, str(file_parset[k])) subparset.remove('parset') except: pass step_parset = os.path.join(parset_dir, stepname + '.parset') stepparset.writeFile(step_parset) step_parset_files[stepname] = step_parset step_parset_obj[stepname] = stepparset def _replace_output_keyword(self, inoutdict, argsparset, resdicts): for k in argsparset.keys(): keystring = argsparset.getString(k) if keystring.__contains__('.output.'): if keystring.__contains__(','): keystring = keystring.rstrip(']') keystring = keystring.lstrip('[') vec = [] for item in keystring.split(','): if item.__contains__('.output.'): step, outvar = item.split('.output.') vec.append(resdicts[step][outvar]) else: vec.append(item) inoutdict[k] = vec else: step, outvar = argsparset.getString(k).split('.output.') if '+' in outvar: tmplist = 
str(outvar).split('+') inoutdict[k] = resdicts[step][tmplist[0]] + tmplist[1] else: inoutdict[k] = resdicts[step][outvar] else: inoutdict[k] = argsparset.getString(k) def _construct_step_parset(self, argsparset, resdicts, filename, stepname): addvals = {'inputkeys': [], 'mapfiles_in': [], 'arguments': []} # hack for original order of args tmp_keys = argsparset.keys() ordered_keys = [] for orig in self.parset.keys: for item in tmp_keys: if (stepname + '.') in orig and ( 'argument.' + item in orig and not 'argument.' + item + '.' in orig): ordered_keys.append(item) continue # \hack for k in ordered_keys: valuestring = argsparset.getString(k) if valuestring.__contains__('.output.'): if valuestring.__contains__(','): valuestring = valuestring.rstrip(']') valuestring = valuestring.lstrip('[') vec = [] for item in valuestring.split(','): if item.__contains__('.output.'): step, outvar = item.split('.output.') vec.append(resdicts[step][outvar]) if 'mapfile' in str(outvar): addvals['inputkeys'].append( resdicts[step][outvar]) addvals['mapfiles_in'].append( resdicts[step][outvar]) else: vec.append(item) argsparset.replace(k, str(vec)) if k == 'flags': addvals['arguments'] = vec argsparset.remove(k) else: step, outvar = argsparset.getString(k).split('.output.') #more ugly hacks... really needs clearly structured replacement method... if '+' in outvar: tmplist = str(outvar).split('+') argsparset.replace( k, str(resdicts[step][tmplist[0]]) + tmplist[1]) else: argsparset.replace(k, str(resdicts[step][outvar])) #if isinstance(resdicts[step][outvar], str): if 'mapfile' in str(outvar): addvals['inputkeys'].append(resdicts[step][outvar]) addvals['mapfiles_in'].append(resdicts[step][outvar]) if k == 'flags': addvals['arguments'] = str(argsparset[k]) argsparset.remove(k) else: if k == 'flags': addvals['arguments'] = str(argsparset[k]) argsparset.remove(k) #direct usage of outputkey if valuestring.__contains__('outputkey'): addvals['outputkey'] = 'outputkey' argsparset.writeFile(filename) return addvals def _get_parset_dicts(self): return {} def show_tasks(self): tasklist = [] tasklist = self.task_definitions.sections() for item in tasklist: print item #return tasklist def show_task(self, task): task_parset = Parset() if self.task_definitions.has_option(task, 'parset'): task_parset.adoptFile(self.task_definitions.get(task, 'parset')) print 'possible arguments: key = value' for k in task_parset.keys: print ' ', k, ' ', '=', ' ', task_parset[ k] def _add_step(self): steplist = [] def _replace_values(self): replacedict = {} try: import imp plugin = imp.load_source('main', str(self.parset['prepare'])) replacedict = plugin.main() except: pass for check in self.parset.keys: if str(check).startswith('!'): replacedict[str(check).lstrip('!').lstrip(' ')] = str( self.parset[check]) #print 'REPLACEDICT: ',replacedict for check in self.parset.keys: if not str(check).startswith('#'): for k, v in replacedict.iteritems(): if '{{ ' + k + ' }}' in str(self.parset[check]): replacestring = str(self.parset[check]).replace( '{{ ' + k + ' }}', v) self.parset.replace(check, replacestring)
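# Hedged sketch (illustration only): _replace_values() above substitutes
# "{{ key }}" placeholders in parset values with the values of keys starting
# with '!'. A simplified standalone version using a plain dict instead of a
# Parset object:
def replace_placeholders(entries):
    """Apply {{ key }} substitution driven by '!key' entries, in place."""
    replacements = dict(
        (k.lstrip('!').lstrip(' '), v) for k, v in entries.items()
        if k.startswith('!'))
    for key, value in entries.items():
        for name, repl in replacements.items():
            token = '{{ ' + name + ' }}'
            if token in value:
                value = value.replace(token, repl)
                entries[key] = value
    return entries

# Example (hypothetical keys):
# replace_placeholders({'! msin': 'example.MS', 'ndppp.argument.msin': '{{ msin }}'})
# -> {'! msin': 'example.MS', 'ndppp.argument.msin': 'example.MS'}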
def run(self, infile, executable, args, kwargs, work_dir='/tmp', parsetasfile=True, args_format='', environment=''): """ This method contains all the needed functionality """ # Debugging info self.logger.debug("infile = %s" % infile) self.logger.debug("executable = %s" % executable) self.logger.debug("working directory = %s" % work_dir) self.logger.debug("arguments = %s" % args) self.logger.debug("arg dictionary = %s" % kwargs) self.logger.debug("environment = %s" % environment) self.environment.update(environment) self.work_dir = work_dir self.infile = infile self.executable = executable self.msout_original = kwargs['msout'].rstrip('/') kwargs.pop('msout') self.msout_destination_dir = os.path.dirname(self.msout_original) # Set up scratch paths scratch_dir = kwargs['local_scratch_dir'] kwargs.pop('local_scratch_dir') try: os.mkdir(scratch_dir) except OSError: pass self.scratch_dir = tempfile.mkdtemp(dir=scratch_dir) self.logger.info('Using {} as scratch directory'.format(self.scratch_dir)) self.msout_scratch = os.path.join(self.scratch_dir, os.path.basename(self.msout_original)) args.append('msout=' + self.msout_scratch) # Time execution of this job with log_time(self.logger): #if os.path.exists(infile): self.logger.info("Processing %s" % infile) # Check if script is present if not os.path.isfile(executable): self.logger.error("Executable %s not found" % executable) return 1 # hurray! race condition when running with more than one process on one filesystem if not os.path.isdir(work_dir): try: os.mkdir(work_dir, ) except OSError as exc: # Python >2.5 if exc.errno == errno.EEXIST and os.path.isdir(work_dir): pass else: raise argsformat = args_format['args_format'] if not parsetasfile: if argsformat == 'gnu': for k, v in kwargs.items(): args.append('--' + k + '=' + v) if argsformat == 'lofar': for k, v in kwargs.items(): args.append(k + '=' + v) if argsformat == 'argparse': for k, v in kwargs.items(): args.append('--' + k + ' ' + v) if argsformat == 'wsclean': for k, v in kwargs.items(): multargs = v.split(' ') if multargs: multargs.reverse() for item in multargs: args.insert(0, item) else: args.insert(0, v) args.insert(0, '-'+ k) else: nodeparset = Parset() parsetname = os.path.join(work_dir, os.path.basename(infile) + '.parset') for k, v in kwargs.items(): nodeparset.add(k, v) nodeparset.writeFile(parsetname) args.insert(0, parsetname) try: # **************************************************************** #Run cmd = [executable] + args with CatchLog4CPlus( work_dir, self.logger.name + "." + os.path.basename(infile), os.path.basename(executable), ) as logger: # Catch segfaults and retry catch_segfaults( cmd, work_dir, self.environment, logger ) except CalledProcessError, err: # CalledProcessError isn't properly propagated by IPython self.logger.error(str(err)) self.cleanup() return 1 except Exception, err: self.logger.error(str(err)) self.cleanup() return 1
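# Hedged sketch (not from the source): how the kwargs-to-command-line mapping
# above behaves for one hypothetical keyword under the different args_format
# values. Only the 'gnu', 'lofar' and 'argparse' cases are modelled here; the
# wsclean case additionally splits multi-valued strings (see the later run()
# variant).
def format_kwarg(key, value, args_format):
    """Return the command-line token for one keyword under a given format."""
    if args_format == 'gnu':
        return '--%s=%s' % (key, value)
    if args_format == 'lofar':
        return '%s=%s' % (key, value)
    if args_format == 'argparse':
        return '--%s %s' % (key, value)
    raise ValueError('unhandled args_format: %s' % args_format)

# Example (hypothetical): format_kwarg('msin', 'example.MS', 'lofar') -> 'msin=example.MS'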
def run(self, imager_exec, vds, parset, resultsdir, start_time, end_time): # imager_exec: path to cimager executable # vds: VDS file describing the data to be imaged # parset: imager configuration # resultsdir: place resulting images here # start_time: ) time range to be imaged # end_time: ) in seconds (may be None) # ---------------------------------------------------------------------- with log_time(self.logger): self.logger.info("Processing %s" % (vds,)) # Bail out if destination exists (can thus resume a partial run). # Should be configurable? # ------------------------------------------------------------------ parset_data = Parset(parset) image_names = parset_data.getStringVector("Cimager.Images.Names") for image_name in image_names: outputfile = os.path.join(resultsdir, image_name + ".restored") self.logger.info(outputfile) if os.path.exists(outputfile): self.logger.info("Image already exists: aborting.") return 0 try: working_dir = mkdtemp(suffix=".%s" % (os.path.basename(__file__),)) # If a time range has been specified, copy that section of the # input MS and only image that. # -------------------------------------------------------------- query = [] if start_time: self.logger.debug("Start time is %s" % start_time) start_time = quantity(float(start_time), 's') query.append("TIME > %f" % start_time.get('s').get_value()) if end_time: self.logger.debug("End time is %s" % end_time) end_time = quantity(float(end_time), 's') query.append("TIME < %f" % end_time.get('s').get_value()) query = " AND ".join(query) if query: # Select relevant section of MS. # ---------------------------------------------------------- self.logger.debug("Query is %s" % query) output = os.path.join(working_dir, "timeslice.MS") vds_parset = get_parset(vds) t = table(vds_parset.getString("FileName")) t.query(query, name=output) # Patch updated information into imager configuration. # ---------------------------------------------------------- parset = patch_parset(parset, { 'Cimager.dataset': output } ) else: self.logger.debug("No time range selected") self.logger.debug("Running cimager") with CatchLog4CXX( working_dir, self.logger.name + "." + os.path.basename(vds) ): cimager_process = Popen( [imager_exec, "-inputs", parset], stdout=PIPE, stderr=PIPE, cwd=working_dir ) sout, serr = cimager_process.communicate() log_process_output("cimager", sout, serr, self.logger) if cimager_process.returncode != 0: raise CalledProcessError( cimager_process.returncode, imager_exec ) # Dump the resulting images in the pipeline results area. # I'm not aware of a foolproof way to predict the image names # that will be produced, so we read them from the # parset and add standard cimager prefixes. # -------------------------------------------------------------- parset_data = Parset(parset) image_names = parset_data.getStringVector("Cimager.Images.Names") prefixes = [ "image", "psf", "residual", "weights", "sensitivity" ] self.logger.debug("Copying images to %s" % resultsdir) for image_name in image_names: for prefix in prefixes: filename = image_name.replace("image", prefix, 1) shutil.move( os.path.join(working_dir, filename), os.path.join(resultsdir, filename) ) if parset_data.getBool('Cimager.restore'): shutil.move( os.path.join(working_dir, image_name + ".restored"), os.path.join(resultsdir, image_name + ".restored") ) except CalledProcessError, e: self.logger.error(str(e)) return 1 finally:
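# Hedged sketch (illustration only): the optional time-range selection above
# builds a TaQL query from start/end times given in seconds. A simplified
# standalone version of the query construction (the recipe goes through
# pyrap's quantity() first; plain floats are used here):
def build_time_query(start_time=None, end_time=None):
    """Return a TaQL selection string, or '' when no range is given."""
    query = []
    if start_time is not None:
        query.append("TIME > %f" % float(start_time))
    if end_time is not None:
        query.append("TIME < %f" % float(end_time))
    return " AND ".join(query)

# Example (hypothetical epochs in seconds):
# build_time_query(4500000000.0, 4500000600.0)
# -> 'TIME > 4500000000.000000 AND TIME < 4500000600.000000'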
def run( self, executable, initscript, infile, key, db_name, db_user, db_host ): # executable: path to KernelControl executable # initscript: path to lofarinit.sh # infile: MeasurementSet for processing # key, db_name, db_user, db_host: database connection parameters # ---------------------------------------------------------------------- with log_time(self.logger): if os.path.exists(infile): self.logger.info("Processing %s" % (infile)) else: self.logger.error("Dataset %s does not exist" % (infile)) return 1 # Build a configuration parset specifying database parameters # for the kernel # ------------------------------------------------------------------ self.logger.debug("Setting up kernel parset") filesystem = "%s:%s" % (os.uname()[1], get_mountpoint(infile)) fd, parset_filename = mkstemp() kernel_parset = Parset() for key, value in { "ObservationPart.Filesystem": filesystem, "ObservationPart.Path": infile, "BBDB.Key": key, "BBDB.Name": db_name, "BBDB.User": db_user, "BBDB.Host": db_host, "ParmLog": "", "ParmLoglevel": "", "ParmDB.Sky": infile + ".sky", "ParmDB.Instrument": infile + ".instrument" }.iteritems(): kernel_parset.add(key, value) kernel_parset.writeFile(parset_filename) os.close(fd) self.logger.debug("Parset written to %s" % (parset_filename,)) # Run the kernel # Catch & log output from the kernel logger and stdout # ------------------------------------------------------------------ working_dir = mkdtemp(suffix=".%s" % (os.path.basename(__file__),)) env = read_initscript(self.logger, initscript) try: cmd = [executable, parset_filename, "0"] self.logger.debug("Executing BBS kernel") with CatchLog4CPlus( working_dir, self.logger.name + "." + os.path.basename(infile), os.path.basename(executable), ): bbs_kernel_process = Popen( cmd, stdout=PIPE, stderr=PIPE, cwd=working_dir ) sout, serr = bbs_kernel_process.communicate() log_process_output("BBS kernel", sout, serr, self.logger) if bbs_kernel_process.returncode != 0: raise CalledProcessError( bbs_kernel_process.returncode, executable ) except CalledProcessError, e: self.logger.error(str(e)) return 1 finally:
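# Hedged sketch (not from the source): both BBS-kernel run() variants in this
# section follow the same pattern: start the process, capture and log
# stdout/stderr, and raise CalledProcessError on a non-zero exit. A minimal
# standalone version of that pattern:
from subprocess import PIPE, CalledProcessError, Popen

def run_and_check(cmd, cwd, logger=None):
    """Run cmd in cwd, return (stdout, stderr), raise on non-zero exit."""
    process = Popen(cmd, stdout=PIPE, stderr=PIPE, cwd=cwd)
    sout, serr = process.communicate()
    if logger is not None:
        logger.debug(sout)
        logger.debug(serr)
    if process.returncode != 0:
        raise CalledProcessError(process.returncode, cmd[0])
    return sout, serr

# Example (hypothetical command): run_and_check(['/bin/true'], '/tmp')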
def go(self): self.logger.info("Starting cimager run") super(cimager, self).go() self.outputs['images'] = [] # Build a GVDS file describing all the data to be processed # ---------------------------------------------------------------------- self.logger.debug("Building VDS file describing all data for cimager") gvds_file = os.path.join(self.config.get("layout", "job_directory"), "vds", "cimager.gvds") inputs = LOFARinput(self.inputs) inputs['args'] = self.inputs['args'] inputs['gvds'] = gvds_file inputs['unlink'] = False inputs['makevds'] = self.inputs['makevds'] inputs['combinevds'] = self.inputs['combinevds'] inputs['nproc'] = self.inputs['nproc'] inputs['directory'] = os.path.dirname(gvds_file) outputs = LOFARoutput(self.inputs) if self.cook_recipe('vdsmaker', inputs, outputs): self.logger.warn("vdsmaker reports failure") return 1 self.logger.debug("cimager GVDS is %s" % (gvds_file, )) # Read data for processing from the GVDS file # ---------------------------------------------------------------------- parset = Parset(gvds_file) data = [] for part in range(parset.getInt('NParts')): host = parset.getString("Part%d.FileSys" % part).split(":")[0] vds = parset.getString("Part%d.Name" % part) data.append((host, vds)) # Divide data into timesteps for imaging # timesteps is a list of (start, end, results directory) tuples # ---------------------------------------------------------------------- timesteps = [] results_dir = self.inputs['results_dir'] if self.inputs['timestep'] == 0: self.logger.info("No timestep specified; imaging all data") timesteps = [(None, None, results_dir)] else: self.logger.info("Using timestep of %s s" % self.inputs['timestep']) gvds = get_parset(gvds_file) start_time = quantity(gvds['StartTime'].get()).get('s').get_value() end_time = quantity(gvds['EndTime'].get()).get('s').get_value() step = float(self.inputs['timestep']) while start_time < end_time: timesteps.append((start_time, start_time + step, os.path.join(results_dir, str(start_time)))) start_time += step # Run each cimager process in a separate thread # ---------------------------------------------------------------------- command = "python %s" % (self.__file__.replace('master', 'nodes')) for label, timestep in enumerate(timesteps): self.logger.info("Processing timestep %d" % label) jobs = [] parsets = [] start_time, end_time, resultsdir = timestep for host, vds in data: vds_data = Parset(vds) frequency_range = [ vds_data.getDoubleVector("StartFreqs")[0], vds_data.getDoubleVector("EndFreqs")[-1] ] parsets.append( self.__get_parset( os.path.basename( vds_data.getString('FileName')).split('.')[0], vds_data.getString("FileName"), str(frequency_range), vds_data.getStringVector("Extra.FieldDirectionType") [0], vds_data.getStringVector("Extra.FieldDirectionRa")[0], vds_data.getStringVector("Extra.FieldDirectionDec")[0], 'True', # cimager bug: non-restored image unusable )) jobs.append( ComputeJob(host, command, arguments=[ self.inputs['imager_exec'], vds, parsets[-1], resultsdir, start_time, end_time ])) self._schedule_jobs(jobs, max_per_node=self.inputs['nproc']) for parset in parsets: parset = Parset(parset) image_names = parset.getStringVector("Cimager.Images.Names") self.outputs['images'].extend(image_names) [os.unlink(parset) for parset in parsets] # Check if we recorded a failing process before returning # ---------------------------------------------------------------------- if self.error.isSet(): self.logger.warn("Failed imager process detected") return 1 else: return 0
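# Hedged sketch (illustration only): the timestep division above walks from
# the observation start to its end in fixed steps and assigns each chunk its
# own results directory. A minimal standalone version (like the recipe, the
# final chunk may extend past end_time):
import os

def make_timesteps(start_time, end_time, step, results_dir):
    """Return a list of (start, end, results_subdir) tuples covering the range."""
    if step == 0:
        return [(None, None, results_dir)]  # image all data in one go
    timesteps = []
    while start_time < end_time:
        timesteps.append((start_time, start_time + step,
                          os.path.join(results_dir, str(start_time))))
        start_time += step
    return timesteps

# Example (hypothetical values): make_timesteps(0.0, 25.0, 10.0, '/results')
# -> [(0.0, 10.0, '/results/0.0'), (10.0, 20.0, '/results/10.0'),
#     (20.0, 30.0, '/results/20.0')]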
def go(self): self.logger.info("Starting cimager run") super(cimager, self).go() self.outputs['images' ] = [] # Build a GVDS file describing all the data to be processed # ---------------------------------------------------------------------- self.logger.debug("Building VDS file describing all data for cimager") gvds_file = os.path.join( self.config.get("layout", "job_directory"), "vds", "cimager.gvds" ) inputs = LOFARinput(self.inputs) inputs['args'] = self.inputs['args'] inputs['gvds'] = gvds_file inputs['unlink'] = False inputs['makevds'] = self.inputs['makevds'] inputs['combinevds'] = self.inputs['combinevds'] inputs['nproc'] = self.inputs['nproc'] inputs['directory'] = os.path.dirname(gvds_file) outputs = LOFARoutput(self.inputs) if self.cook_recipe('vdsmaker', inputs, outputs): self.logger.warn("vdsmaker reports failure") return 1 self.logger.debug("cimager GVDS is %s" % (gvds_file,)) # Read data for processing from the GVDS file # ---------------------------------------------------------------------- parset = Parset(gvds_file) data = [] for part in range(parset.getInt('NParts')): host = parset.getString("Part%d.FileSys" % part).split(":")[0] vds = parset.getString("Part%d.Name" % part) data.append((host, vds)) # Divide data into timesteps for imaging # timesteps is a list of (start, end, results directory) tuples # ---------------------------------------------------------------------- timesteps = [] results_dir = self.inputs['results_dir'] if self.inputs['timestep'] == 0: self.logger.info("No timestep specified; imaging all data") timesteps = [(None, None, results_dir)] else: self.logger.info("Using timestep of %s s" % self.inputs['timestep']) gvds = get_parset(gvds_file) start_time = quantity(gvds['StartTime'].get()).get('s').get_value() end_time = quantity(gvds['EndTime'].get()).get('s').get_value() step = float(self.inputs['timestep']) while start_time < end_time: timesteps.append( ( start_time, start_time+step, os.path.join(results_dir, str(start_time)) ) ) start_time += step # Run each cimager process in a separate thread # ---------------------------------------------------------------------- command = "python %s" % (self.__file__.replace('master', 'nodes')) for label, timestep in enumerate(timesteps): self.logger.info("Processing timestep %d" % label) jobs = [] parsets = [] start_time, end_time, resultsdir = timestep for host, vds in data: vds_data = Parset(vds) frequency_range = [ vds_data.getDoubleVector("StartFreqs")[0], vds_data.getDoubleVector("EndFreqs")[-1] ] parsets.append( self.__get_parset( os.path.basename(vds_data.getString('FileName')).split('.')[0], vds_data.getString("FileName"), str(frequency_range), vds_data.getStringVector("Extra.FieldDirectionType")[0], vds_data.getStringVector("Extra.FieldDirectionRa")[0], vds_data.getStringVector("Extra.FieldDirectionDec")[0], 'True', # cimager bug: non-restored image unusable ) ) jobs.append( ComputeJob( host, command, arguments=[ self.inputs['imager_exec'], vds, parsets[-1], resultsdir, start_time, end_time ] ) ) self._schedule_jobs(jobs, max_per_node=self.inputs['nproc']) for parset in parsets: parset = Parset(parset) image_names = parset.getStringVector("Cimager.Images.Names") self.outputs['images'].extend(image_names) [os.unlink(parset) for parset in parsets] # Check if we recorded a failing process before returning # ---------------------------------------------------------------------- if self.error.isSet(): self.logger.warn("Failed imager process detected") return 1 else: return 0
def run(self, executable, initscript, infile, key, db_name, db_user, db_host): # executable: path to KernelControl executable # initscript: path to lofarinit.sh # infile: MeasurementSet for processing # key, db_name, db_user, db_host: database connection parameters # ---------------------------------------------------------------------- with log_time(self.logger): if os.path.exists(infile): self.logger.info("Processing %s" % (infile)) else: self.logger.error("Dataset %s does not exist" % (infile)) return 1 # Build a configuration parset specifying database parameters # for the kernel # ------------------------------------------------------------------ self.logger.debug("Setting up kernel parset") filesystem = "%s:%s" % (os.uname()[1], get_mountpoint(infile)) fd, parset_filename = mkstemp() kernel_parset = Parset() for key, value in { "ObservationPart.Filesystem": filesystem, "ObservationPart.Path": infile, "BBDB.Key": key, "BBDB.Name": db_name, "BBDB.User": db_user, "BBDB.Host": db_host, "ParmLog": "", "ParmLoglevel": "", "ParmDB.Sky": infile + ".sky", "ParmDB.Instrument": infile + ".instrument" }.iteritems(): kernel_parset.add(key, value) kernel_parset.writeFile(parset_filename) os.close(fd) self.logger.debug("Parset written to %s" % (parset_filename, )) # Run the kernel # Catch & log output from the kernel logger and stdout # ------------------------------------------------------------------ working_dir = mkdtemp() env = read_initscript(self.logger, initscript) try: cmd = [executable, parset_filename, "0"] self.logger.debug("Executing BBS kernel") with CatchLog4CPlus( working_dir, self.logger.name + "." + os.path.basename(infile), os.path.basename(executable), ): bbs_kernel_process = Popen(cmd, stdout=PIPE, stderr=PIPE, cwd=working_dir) sout, serr = bbs_kernel_process.communicate() log_process_output("BBS kernel", sout, serr, self.logger) if bbs_kernel_process.returncode != 0: raise CalledProcessError(bbs_kernel_process.returncode, executable) except CalledProcessError, e: self.logger.error(str(e)) return 1 finally:
class GenericPipeline(control): inputs = { 'loglevel': ingredient.StringField( '--loglevel', help="loglevel", default='INFO', optional=True ) } def __init__(self): control.__init__(self) self.parset = Parset() self.input_data = {} self.output_data = {} self.parset_feedback_file = None #self.logger = None#logging.RootLogger('DEBUG') self.name = '' #if not overwrite: # self.inputs['job_name'] = 'generic-pipeline' # if not self.inputs.has_key("start_time"): # import datetime # self.inputs["start_time"] = datetime.datetime.utcnow().replace(microsecond=0).isoformat() # if not hasattr(self, "config"): # self.config = self._read_config() # #self._read_config() # # ...and task files, if applicable # if not self.inputs.has_key("task_files"): # try: # self.inputs["task_files"] = utilities.string_to_list( # self.config.get('DEFAULT', "task_files") # ) # except NoOptionError: # self.inputs["task_files"] = [] # self.task_definitions = ConfigParser(self.config.defaults()) # print >> sys.stderr, "Reading task definition file(s): %s" % \ # ",".join(self.inputs["task_files"]) # self.task_definitions.read(self.inputs["task_files"]) # self.go() def usage(self): """ Display usage """ print >> sys.stderr, "Usage: %s [options] <parset-file>" % sys.argv[0] print >> sys.stderr, "Parset structure should look like:\n" \ "NYI" #return 1 def go(self): #""" #Read the parset-file that was given as input argument, and set the #jobname before calling the base-class's `go()` method. #""" try: parset_file = os.path.abspath(self.inputs['args'][0]) except IndexError: #return self.usage() self.usage() # Set job-name to basename of parset-file w/o extension, if it's not # set on the command-line with '-j' or '--job-name' if not 'job_name' in self.inputs: self.inputs['job_name'] = ( os.path.splitext(os.path.basename(parset_file))[0]) self.name = self.inputs['job_name'] try: self.logger except: self.logger = getSearchingLogger(self.name) self.logger.setLevel(self.inputs['loglevel']) # Call the base-class's `go()` method. return super(GenericPipeline, self).go() # def pipeline_logic(self): # print 'Dummy because of wrapping inside the framework' # if overwrite: # self.execute_pipeline() #def execute_pipeline(self): def pipeline_logic(self): try: parset_file = os.path.abspath(self.inputs['args'][0]) except IndexError: return self.usage() try: if self.parset.keys == []: self.parset.adoptFile(parset_file) self.parset_feedback_file = parset_file + "_feedback" except RuntimeError: print >> sys.stderr, "Error: Parset file not found!" return self.usage() self._replace_values() # just a reminder that this has to be implemented validator = GenericPipelineParsetValidation(self.parset) if not validator.validate_pipeline(): self.usage() exit(1) if not validator.validate_steps(): self.usage() exit(1) #set up directories job_dir = self.config.get("layout", "job_directory") parset_dir = os.path.join(job_dir, "parsets") mapfile_dir = os.path.join(job_dir, "mapfiles") # Create directories for temporary parset- and map files create_directory(parset_dir) create_directory(mapfile_dir) # ********************************************************************* # maybe we dont need a subset but just a steplist # at the moment only a list with stepnames is given for the pipeline.steps parameter # pipeline.steps=[vdsmaker,vdsreader,setupparmdb1,setupsourcedb1,ndppp1,....] 
# the names will be the prefix for parset subsets pipeline_args = self.parset.makeSubset( self.parset.fullModuleName('pipeline') + '.') pipeline_steps = self.parset.makeSubset( self.parset.fullModuleName('steps') + '.') # ********************************************************************* # forward declaration of things. just for better overview and understanding whats in here. # some of this might be removed in upcoming iterations, or stuff gets added. step_name_list = pipeline_args.getStringVector('steps') # construct the step name list if there were pipeline.steps.<subset> for item in pipeline_steps.keys(): if item in step_name_list: loc = step_name_list.index(item) step_name_list[loc:loc] = pipeline_steps.getStringVector(item) step_name_list.remove(item) step_control_dict = {} step_parset_files = {} step_parset_obj = {} activeloop = [''] # construct the list of step names and controls self._construct_steps(step_name_list, step_control_dict, step_parset_files, step_parset_obj, parset_dir) # initial parameters to be saved in resultsdict so that recipes have access to this step0 # double init values. 'input' should be considered deprecated # self.name would be consistent to use in subpipelines input_dictionary = { 'parset': parset_file, 'parsetobj': self.parset, 'parset_dir': parset_dir, 'mapfile_dir': mapfile_dir} resultdicts = {} for section in self.config.sections(): tmp_dict = {} for entry in self.config.items(section): input_dictionary[entry[0]] = entry[1] tmp_dict[entry[0]] = entry[1] resultdicts.update({section: copy.deepcopy(tmp_dict)}) resultdicts.update({'input': input_dictionary}) resultdicts.update({self.name: input_dictionary}) if 'pipeline.mapfile' in self.parset.keywords(): resultdicts['input']['mapfile'] = str(self.parset['pipeline.mapfile']) resultdicts[self.name]['mapfile'] = str(self.parset['pipeline.mapfile']) # ********************************************************************* # main loop # there is a distinction between recipes and plugins for user scripts. # plugins are not used at the moment and might better be replaced with master recipes while step_name_list: stepname = step_name_list.pop(0) self.logger.info("Beginning step %s" % (stepname,)) step = step_control_dict[stepname] #step_parset = step_parset_obj[stepname] inputdict = {} inputargs = [] resultdict = {} # default kind_of_step to recipe. try: kind_of_step = step.getString('kind') except: kind_of_step = 'recipe' try: typeval = step.getString('type') except: typeval = '' adds = None if stepname in step_parset_obj: adds = self._construct_step_parset(inputdict, step_parset_obj[stepname], resultdicts, step_parset_files[stepname], stepname) # stepname not a valid input for old recipes if kind_of_step == 'recipe': if self.task_definitions.get(typeval, 'recipe') == 'executable_args': inputdict['stepname'] = stepname if adds: inputdict.update(adds) self._construct_cmdline(inputargs, step, resultdicts) if stepname in step_parset_files: inputdict['parset'] = step_parset_files[stepname] self._construct_input(inputdict, step, resultdicts) # hack, popping 'type' is necessary, why? because you deleted kind already in parsets try: inputdict.pop('type') except: pass try: inputdict.pop('kind') except: pass # \hack # more hacks. Frameworks DictField not properly implemented. Construct your own dict from input. # python buildin functions cant handle the string returned from parset class. 
if 'environment' in inputdict.keys(): val = inputdict['environment'].rstrip('}').lstrip('{').replace(' ', '') splitval = str(val).split(',') valdict = {} for item in splitval: valdict[item.split(':')[0]] = item.split(':')[1] inputdict['environment'] = valdict # subpipeline. goal is to specify a pipeline within a pipeline. # load other existing pipeline parset and add them to your own. if kind_of_step == 'pipeline': subpipeline_parset = Parset() subpipeline_parset.adoptFile(typeval) submapfile = '' subpipeline_steplist = subpipeline_parset.getStringVector('pipeline.steps') if 'pipeline.mapfile' in subpipeline_parset.keywords(): submapfile = subpipeline_parset['pipeline.mapfile'] subpipeline_parset.remove('pipeline.mapfile') if 'mapfile_in' in inputdict.keys(): submapfile = inputdict.pop('mapfile_in') resultdicts.update({os.path.splitext(os.path.basename(typeval))[0]: { 'parset': typeval, 'mapfile': submapfile, }}) #todo: take care of pluginpathes and everything other then individual steps # make a pipeline parse methods that returns everything needed. # maybe as dicts to combine them to one subpipeline_parset.remove('pipeline.steps') if 'pipeline.pluginpath' in subpipeline_parset.keywords(): subpipeline_parset.remove('pipeline.pluginpath') checklist = copy.deepcopy(subpipeline_steplist) for k in self._keys(subpipeline_parset): if 'loopsteps' in k: for item in subpipeline_parset.getStringVector(k): checklist.append(item) # ********************************************************************* # master parset did not handle formatting and comments in the parset. # proper format only after use of parset.makesubset. then it is a different object # from a different super class :(. this also explains use of parset.keys and parset.keys() # take the parset from subpipeline and add it to the master parset. # UPDATE: do not use .keys on master parset. use .keywords(), then comments are filtered. # ********************************************************************* # replace names of steps with the subpipeline stepname to create a unique identifier. # replacement values starting with ! will be taken from the master parset and overwrite # the ones in the subpipeline. only works if the ! value is already in the subpipeline for k in self._keys(subpipeline_parset): val = subpipeline_parset[k] if not str(k).startswith('!') and not str(k).startswith('pipeline.replace.'): for item in checklist: if item+".output" in str(val): val = str(val).replace(item, stepname + '-' + item) self.parset.add(stepname + '-' + k, str(val)) else: # remove replacements strings to prevent loading the same key twice if k in self._keys(self.parset): self.parset.remove(k) self.parset.add(k, str(val)) for i, item in enumerate(subpipeline_steplist): subpipeline_steplist[i] = stepname + '-' + item for item in step_parset_obj[stepname].keys(): for k in self._keys(self.parset): if str(k).startswith('!') and item == str(k).strip("! ") or str(k).startswith('pipeline.replace.') and item == str(k)[17:].strip(): self.parset.remove(k) self.parset.add('! 
' + item, str(step_parset_obj[stepname][item])) self._replace_values() self._construct_steps(subpipeline_steplist, step_control_dict, step_parset_files, step_parset_obj, parset_dir) for j in reversed(subpipeline_steplist): name = j step_control_dict[name] = step_control_dict[j] step_name_list.insert(0, name) # loop if kind_of_step == 'loop': # remember what loop is running to stop it from a conditional step if activeloop[0] is not stepname: activeloop.insert(0, stepname) # prepare counter = 0 breakloop = False if stepname in resultdicts: counter = int(resultdicts[stepname]['counter']) + 1 breakloop = resultdicts[stepname]['break'] loopsteps = step.getStringVector('loopsteps') # break at max iteration or when other step sets break variable if counter is step.getInt('loopcount'): breakloop = True if not breakloop: # add loop steps to the pipeline including the loop itself step_name_list.insert(0, stepname) self._construct_steps(loopsteps, step_control_dict, step_parset_files, step_parset_obj, parset_dir) for j in reversed(loopsteps): name = j step_control_dict[name] = step_control_dict[j] step_name_list.insert(0, name) # results for other steps to check and write states resultdict = {'counter': counter, 'break': breakloop} else: # reset values for second use of the loop (but why would you do that?) resultdict = {'counter': -1, 'break': False} activeloop.pop(0) # recipes if kind_of_step == 'recipe': with duration(self, stepname): resultdict = self.run_task( typeval, inputargs, **inputdict ) # plugins if kind_of_step == 'plugin': bla = str(self.config.get('DEFAULT', 'recipe_directories')) pluginpath = bla.rstrip(']').lstrip('[').split(',') for i, item in enumerate(pluginpath): pluginpath[i] = os.path.join(item, 'plugins') if 'pluginpath' in pipeline_args.keys(): pluginpath.append(pipeline_args.getString('pluginpath')) with duration(self, stepname): resultdict = loader.call_plugin(typeval, pluginpath, inputargs, **inputdict) resultdicts[stepname] = resultdict # breaking the loopstep # if the step has the keyword for loopbreaks assign the value if activeloop[0] in resultdicts and resultdict is not None and 'break' in resultdict: resultdicts[activeloop[0]]['break'] = resultdict['break'] # ********************************************************************* # build the inputs for the master recipes. 
def _construct_input(self, inoutdict, controlparset, resdicts): # intermediate backward compatibility for opts subparset if controlparset.fullModuleName('opts'): argsparset = controlparset.makeSubset(controlparset.fullModuleName('opts') + '.') # hack elif 'loopcount' not in controlparset.keys(): argsparset = controlparset else: argsparset = controlparset.makeSubset(controlparset.fullModuleName('imaginary') + '.') # \hack self._replace_output_keyword(inoutdict, argsparset, argsparset.keys(), resdicts) def _construct_cmdline(self, inoutargs, controlparset, resdicts): inoutdict = {} argsparset = controlparset.makeSubset(controlparset.fullModuleName('cmdline') + '.') self._replace_output_keyword(inoutdict, argsparset, argsparset.keys(), resdicts) for k in inoutdict.keys(): inoutargs.append(inoutdict[k]) for k in controlparset.keys(): if 'cmdline' in k: controlparset.remove(k) def _construct_steps(self, step_name_list, step_control_dict, step_parset_files, step_parset_obj, parset_dir): step_list_copy = (copy.deepcopy(step_name_list)) counter = 0 while step_list_copy: counter -= 1 stepname = step_list_copy.pop(-1) fullparset = self.parset.makeSubset(self.parset.fullModuleName(str(stepname)) + '.') subparset = fullparset.makeSubset(fullparset.fullModuleName('control') + '.') number = 0 for item in step_list_copy: if item == stepname: number += 1 if number != 0: stepname += str(number) step_name_list[counter] = stepname step_control_dict[stepname] = subparset if fullparset.fullModuleName('argument'): stepparset = fullparset.makeSubset(fullparset.fullModuleName('argument') + '.') # ********************************************************************* # save parsets # either a filename is given in the main parset # or files will be created from subsets with stepnames.parset as filenames # for name, parset in step_parset_dict.iteritems(): try: file_parset = Parset(stepparset.getString('parset')) for k in file_parset.keywords(): if not k in stepparset.keys(): stepparset.add(k, str(file_parset[k])) stepparset.remove('parset') except: pass # parset from task.cfg try: file_parset = Parset(self.task_definitions.get(str(subparset['type']), 'parset')) for k in file_parset.keywords(): if not k in stepparset.keys(): stepparset.add(k, str(file_parset[k])) except: pass # for parset in control section try: file_parset = Parset(subparset.getString('parset')) for k in file_parset.keywords(): if not k in stepparset.keys(): stepparset.add(k, str(file_parset[k])) subparset.remove('parset') except: pass step_parset = os.path.join(parset_dir, stepname + '.parset') stepparset.writeFile(step_parset) step_parset_files[stepname] = step_parset step_parset_obj[stepname] = stepparset def _replace_output_keyword(self, inoutdict, argsparset, keyorder, resdicts): addvals = {'inputkeys': [], 'mapfiles_in': [], 'arguments': []} regobj = re.compile('([\w\+_-]+)\.output\.([\w\+._-]+)') for k in keyorder: keystring = argsparset.getString(k) hitlist = regobj.findall(keystring) if hitlist: for hit in hitlist: keystring = regobj.sub(str(resdicts[hit[0]][hit[1]]), keystring, 1) if 'mapfile' in hit[1] and not 'mapfile' in k: addvals['inputkeys'].append(resdicts[hit[0]][hit[1]]) addvals['mapfiles_in'].append(resdicts[hit[0]][hit[1]]) inoutdict[k] = keystring else: inoutdict[k] = argsparset.getString(k) if k == 'flags': addvals['arguments'] = keystring if 'outputkey' in keystring: addvals['outputkey'] = 'outputkey' return addvals def _construct_step_parset(self, inoutdict, argsparset, resdicts, filename, stepname): tmp_keys = 
argsparset.keys()
        ordered_keys = []
        parsetdict = {}
        for orig in self._keys(self.parset):
            for item in tmp_keys:
                if (stepname + '.') in orig and ('argument.' + item in orig
                                                 and not 'argument.' + item + '.' in orig):
                    ordered_keys.append(item)
                    continue
        # add keys from parset files that were not in the original list
        for item in argsparset.keys():
            if not item in ordered_keys:
                ordered_keys.append(item)
        additional = self._replace_output_keyword(parsetdict, argsparset,
                                                  ordered_keys, resdicts)
        for k in argsparset.keys():
            argsparset.replace(k, parsetdict[k])
            if k == 'flags':
                argsparset.remove(k)
        argsparset.writeFile(filename)
        return additional
        #inoutdict.update(additional)

    def _keys(self, inparset):
        outlist = []
        for k in inparset.keys():
            for l in inparset.keywords():
                if k == l:
                    outlist.append(l)
        return outlist

    def _get_parset_dicts(self):
        return {}

    def show_tasks(self):
        tasklist = self.task_definitions.sections()
        for item in tasklist:
            print item
        #return tasklist

    def show_task(self, task):
        task_parset = Parset()
        if self.task_definitions.has_option(task, 'parset'):
            task_parset.adoptFile(self.task_definitions.get(task, 'parset'))
            print 'possible arguments: key = value'
            for k in task_parset.keywords():
                print ' ', k, ' ', '=', ' ', task_parset[k]

    def _add_step(self):
        steplist = []

    def _replace_values(self):
        replacedict = OrderedDict()
        for check in self._keys(self.parset):
            if str(check).startswith('!'):
                replacedict[str(check).lstrip('!').lstrip(' ')] = str(self.parset[check])
            if str(check).startswith('pipeline.replace.'):
                replacedict[str(check).replace('pipeline.replace.', '').lstrip(' ')] = \
                    str(self.parset[check])
        # expand environment variables
        for k, v in replacedict.items():
            replacedict[k] = os.path.expandvars(v)
        for check in self._keys(self.parset):
            for k, v in reversed(replacedict.items()):
                if '{{ ' + k + ' }}' in str(self.parset[check]):
                    replacestring = str(self.parset[check]).replace('{{ ' + k + ' }}', v)
                    self.parset.replace(check, replacestring)
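import re

# ---------------------------------------------------------------------------
# Illustration only (hypothetical helper, not part of the framework): the core
# of the "<stepname>.output.<key>" substitution that _replace_output_keyword()
# performs above. References to results of earlier steps are resolved against
# a dict of per-step result dicts.
# ---------------------------------------------------------------------------
OUTPUT_REF = re.compile(r'([\w\+_-]+)\.output\.([\w\+._-]+)')

def resolve_output_refs(value, resultdicts):
    """Replace every '<step>.output.<key>' in value with resultdicts[step][key]."""
    for step, key in OUTPUT_REF.findall(value):
        value = OUTPUT_REF.sub(str(resultdicts[step][key]), value, 1)
    return value

# Example:
# resolve_output_refs('createmap.output.mapfile',
#                     {'createmap': {'mapfile': '/maps/input.mapfile'}})
# returns '/maps/input.mapfile'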
def run(self, infile, executable, args, kwargs, work_dir='/tmp', parsetasfile=False, args_format='', environment=''): """ This method contains all the needed functionality """ # Debugging info self.logger.debug("infile = %s" % infile) self.logger.debug("executable = %s" % executable) self.logger.debug("working directory = %s" % work_dir) self.logger.debug("arguments = %s" % args) self.logger.debug("arg dictionary = %s" % kwargs) self.logger.debug("environment = %s" % environment) self.environment.update(environment) # Time execution of this job with log_time(self.logger): #if os.path.exists(infile): self.logger.info("Processing %s" % infile) # else: # self.logger.error("Dataset %s does not exist" % infile) # return 1 # Check if executable is present if not os.access(executable, os.X_OK): self.logger.error("Executable %s not found" % executable) return 1 # hurray! race condition when running with more than one process on one filesystem if not os.path.isdir(work_dir): try: os.mkdir(work_dir, ) except OSError as exc: # Python >2.5 if exc.errno == errno.EEXIST and os.path.isdir(work_dir): pass else: raise argsformat = args_format['args_format'] # deal with multiple input files for wsclean if argsformat == 'wsclean': for i in reversed(xrange(len(args))): if str(args[i]).startswith('[') and str( args[i]).endswith(']'): tmplist = args.pop(i).lstrip('[').rstrip(']').split( ',') for val in reversed(tmplist): args.insert(i, val.strip(' \'\"')) if not parsetasfile: if argsformat == 'gnu': for k, v in kwargs.items(): args.append('--' + k + '=' + v) if argsformat == 'lofar': for k, v in kwargs.items(): args.append(k + '=' + v) if argsformat == 'argparse': for k, v in kwargs.items(): args.append('--' + k + ' ' + v) if argsformat == 'wsclean': for k, v in kwargs.items(): if str(v).startswith('[') and str(v).endswith(']'): v = v.lstrip('[').rstrip(']').replace(' ', '') multargs = v.split(',') else: multargs = v.split(' ') if multargs: multargs.reverse() for item in multargs: args.insert(0, item) else: args.insert(0, v) args.insert(0, '-' + k) else: nodeparset = Parset() parsetname = os.path.join(work_dir, os.path.basename(infile) + '.parset') for k, v in kwargs.items(): nodeparset.add(k, v) nodeparset.writeFile(parsetname) if argsformat == 'losoto': args.append(parsetname) else: args.insert(0, parsetname) try: # **************************************************************** # Run cmd = [executable] + args with CatchLog4CPlus( work_dir, self.logger.name + "." + os.path.basename(infile), os.path.basename(executable), ) as logger: # Catch segfaults and retry catch_segfaults(cmd, work_dir, self.environment, logger) except CalledProcessError, err: # CalledProcessError isn't properly propagated by IPython self.logger.error(str(err)) return 1 except Exception, err: self.logger.error(str(err)) return 1
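# ---------------------------------------------------------------------------
# Illustration only: a sketch of the kwargs-to-argument translation used above
# for the simpler argument styles; the wsclean-specific list expansion is left
# out. build_args is a hypothetical helper, not part of the node script.
# ---------------------------------------------------------------------------
def build_args(args, kwargs, args_format):
    args = list(args)
    if args_format == 'gnu':            # --key=value
        args += ['--%s=%s' % (k, v) for k, v in kwargs.items()]
    elif args_format == 'lofar':        # key=value (parset style)
        args += ['%s=%s' % (k, v) for k, v in kwargs.items()]
    elif args_format == 'argparse':     # --key value
        args += ['--%s %s' % (k, v) for k, v in kwargs.items()]
    return args

# Example: build_args(['in.MS'], {'column': 'DATA'}, 'gnu')
# returns ['in.MS', '--column=DATA']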
def run(self, infile, executable, args, kwargs, work_dir='/tmp', parsetasfile=False, args_format='', environment=''): """ This function contains all the needed functionality """ # Debugging info self.logger.debug("infile = %s" % infile) self.logger.debug("executable = %s" % executable) self.logger.debug("working directory = %s" % work_dir) self.logger.debug("arguments = %s" % args) self.logger.debug("arg dictionary = %s" % kwargs) self.logger.debug("environment = %s" % environment) self.environment.update(environment) # Time execution of this job with log_time(self.logger): if infile[0] == '[': infiles = [ms.strip(" []\'\"") for ms in infile.split(',')] reffile = infiles[0] else: reffile = infile if os.path.exists(reffile): self.logger.info("Processing %s" % reffile) else: self.logger.error("Dataset %s does not exist" % reffile) return 1 # Check if executable is present if not os.access(executable, os.X_OK): self.logger.error("Executable %s not found" % executable) return 1 # race condition when running with more than one process on one filesystem if not os.path.isdir(work_dir): try: os.mkdir(work_dir, ) except OSError as exc: # Python >2.5 if exc.errno == errno.EEXIST and os.path.isdir(work_dir): pass else: raise if not parsetasfile: self.logger.error( "Nodescript \"executable_casa.py\" requires \"parsetasfile\" to be True!" ) return 1 else: nodeparset = Parset() sublist = [] for k, v in kwargs.items(): nodeparset.add(k, v) if str(k).find('.'): if not str(k).split('.')[0] in sublist: sublist.append(str(k).split('.')[0]) #quick hacks below. for proof of concept. casastring = '' for sub in sublist: subpar = nodeparset.makeSubset( nodeparset.fullModuleName(sub) + '.') casastring = sub + '(' for k in subpar.keys(): if str(subpar[k]).find('/') == 0: casastring += str(k) + '=' + "'" + str( subpar[k]) + "'" + ',' elif str(subpar[k]).find('casastr/') == 0: casastring += str(k) + '=' + "'" + str( subpar[k]).strip('casastr/') + "'" + ',' elif str(subpar[k]).lower() == 'false' or str( subpar[k]).lower() == 'true': casastring += str(k) + '=' + str(subpar[k]) + ',' else: # Test if int/float or list of int/float try: self.logger.info('value: {}'.format(subpar[k])) test = float(str(subpar[k])) is_int_float = True except: is_int_float = False if is_int_float: casastring += str(k) + '=' + str( subpar[k]) + ',' else: if '[' in str(subpar[k]) or '(' in str( subpar[k]): # Check if list of int/float or strings list_vals = [ f.strip() for f in str( subpar[k]).strip('[]()').split(',') ] is_int_float = True for list_val in list_vals: try: test = float(list_val) except: is_int_float = False break if is_int_float: casastring += str(k) + '=' + str( subpar[k]) + ',' else: casastring += str( k) + '=' + '[{}]'.format(','.join([ "'" + list_val + "'" for list_val in list_vals ])) + ',' else: # Simple string casastring += str(k) + '=' + "'" + str( subpar[k]) + "'" + ',' casastring = casastring.rstrip(',') casastring += ')\n' # 1) return code of a casapy is not properly recognized by the pipeline # wrapping in shellscript works for succesful runs. # failed runs seem to hang the pipeline... # 2) casapy can not have two instances running from the same directory. 
            # create a private temporary directory for each casapy run
            casapydir = tempfile.mkdtemp(dir=work_dir)
            if casastring != '':
                casafilename = os.path.join(
                    work_dir, os.path.basename(reffile) + '.casacommand.py')
                casacommandfile = open(casafilename, 'w')
                casacommandfile.write(casastring)
                casacommandfile.close()
                args.append(casafilename)

            # wrap the casapy call in a small shell script so its log and exit
            # state end up in a predictable place
            shellname = os.path.join(
                work_dir, os.path.basename(reffile) + '.casashell.sh')
            commandstring = executable
            for item in args:
                if str(item).find(' ') > -1 or str(item).find('[') > -1:
                    commandstring += ' "' + item + '"'
                else:
                    commandstring += ' ' + item
            shellfile = open(shellname, 'w')
            shellfile.write('#!/bin/bash \n')
            shellfile.write('echo "Trying CASAPY command" \n')
            shellfile.write(commandstring + ' >& casa.log\n')
            shellfile.close()

            # make the wrapper script executable
            st = os.stat(shellname)
            os.chmod(shellname,
                     st.st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)

            try:
                # ****************************************************************
                # Run
                cmd = [shellname]
                with CatchLog4CPlus(
                    casapydir,
                    self.logger.name + "." + os.path.basename(reffile),
                    os.path.basename(executable),
                ) as logger:
                    # Catch segfaults and retry
                    catch_segfaults(cmd, casapydir, self.environment, logger)
            except CalledProcessError, err:
                # CalledProcessError isn't properly propagated by IPython
                self.logger.error(str(err))
                return 1
            except Exception, err:
                self.logger.error(str(err))
                return 1
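# ---------------------------------------------------------------------------
# Illustration only: a reduced sketch of how the keyword arguments above are
# rendered into a casapy task call such as "clean(vis='in.MS', niter=1000)".
# build_casa_call is a hypothetical helper; type handling is simplified to
# "booleans and numbers unquoted, everything else quoted".
# ---------------------------------------------------------------------------
def build_casa_call(taskname, params):
    rendered = []
    for key, value in params.items():
        text = str(value)
        if text.lower() in ('true', 'false'):
            rendered.append('%s=%s' % (key, text.capitalize()))
            continue
        try:
            float(text)                       # ints/floats stay unquoted
            rendered.append('%s=%s' % (key, text))
        except ValueError:
            rendered.append("%s='%s'" % (key, text))
    return '%s(%s)\n' % (taskname, ', '.join(rendered))

# Example: build_casa_call('clean', {'vis': 'in.MS', 'niter': 1000})
# returns "clean(vis='in.MS', niter=1000)\n"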
def go(self): if 'executable' in self.inputs: executable = self.inputs['executable'] if self.inputs['nthreads']: self.environment["OMP_NUM_THREADS"] = str(self.inputs['nthreads']) if 'environment' in self.inputs: self.environment.update(self.inputs['environment']) self.logger.info("Starting %s run" % executable) super(executable_args, self).go() # args format stuff args_format = {'args_format': self.inputs['args_format'], 'args_format_argument': self.inputs['args_format_argument'], 'args_format_option': self.inputs['args_format_option'], 'args_formatlongoption': self.inputs['args_format_longoption'], 'args_format_option_argument': self.inputs['args_format_option_argument']} mapfile_dir = os.path.join(self.config.get("layout", "job_directory"), "mapfiles") work_dir = os.path.join(self.inputs['working_directory'], self.inputs['job_name']) # ********************************************************************* # try loading input/output data file, validate output vs the input location if # output locations are provided try: inputmapfiles = [] inlist = [] if self.inputs['mapfile_in']: inlist.append(self.inputs['mapfile_in']) if self.inputs['mapfiles_in']: for item in self.inputs['mapfiles_in']: inlist.append(item) self.inputs['mapfile_in'] = self.inputs['mapfiles_in'][0] for item in inlist: inputmapfiles.append(DataMap.load(item)) except Exception: self.logger.error('Could not load input Mapfile %s' % inlist) return 1 outputmapfiles = [] if self.inputs['mapfile_out']: try: outdata = DataMap.load(self.inputs['mapfile_out']) outputmapfiles.append(outdata) except Exception: self.logger.error('Could not load output Mapfile %s' % self.inputs['mapfile_out']) return 1 # sync skip fields in the mapfiles align_data_maps(inputmapfiles[0], outputmapfiles[0]) elif self.inputs['mapfiles_out']: for item in self.inputs['mapfiles_out']: outputmapfiles.append(DataMap.load(item)) self.inputs['mapfile_out'] = self.inputs['mapfiles_out'][0] else: # ouput will be directed in the working directory if no output mapfile is specified outdata = copy.deepcopy(inputmapfiles[0]) if not self.inputs['inplace']: for item in outdata: item.file = os.path.join( self.inputs['working_directory'], self.inputs['job_name'], #os.path.basename(item.file) + '.' + os.path.split(str(executable))[1] os.path.splitext(os.path.basename(item.file))[0] + '.' + self.inputs['stepname'] ) self.inputs['mapfile_out'] = os.path.join(mapfile_dir, self.inputs['stepname'] + '.' + 'mapfile') self.inputs['mapfiles_out'].append(self.inputs['mapfile_out']) else: self.inputs['mapfile_out'] = self.inputs['mapfile_in'] self.inputs['mapfiles_out'].append(self.inputs['mapfile_out']) outputmapfiles.append(outdata) if not validate_data_maps(inputmapfiles[0], outputmapfiles[0]): self.logger.error( "Validation of data mapfiles failed!" ) return 1 if self.inputs['outputsuffixes']: # Handle multiple outputfiles for name in self.inputs['outputsuffixes']: outputmapfiles.append(copy.deepcopy(inputmapfiles[0])) self.inputs['mapfiles_out'].append(os.path.join(mapfile_dir, self.inputs['stepname'] + name + '.' + 'mapfile')) for item in outputmapfiles[-1]: item.file = os.path.join( work_dir, os.path.splitext(os.path.basename(item.file))[0] + '.' 
+ self.inputs['stepname'] + name ) self.inputs['mapfile_out'] = self.inputs['mapfiles_out'][0] # prepare arguments arglist = self.inputs['arguments'] parsetdict = {} if 'parset' in self.inputs: parset = Parset() parset.adoptFile(self.inputs['parset']) for k in parset.keys: parsetdict[k] = str(parset[k]) # construct multiple input data if self.inputs['inputkey'] and not self.inputs['inputkey'] in self.inputs['inputkeys']: self.inputs['inputkeys'].insert(0, self.inputs['inputkey']) if not self.inputs['outputkeys'] and self.inputs['outputkey']: self.inputs['outputkeys'].append(self.inputs['outputkey']) if not self.inputs['skip_infile'] and len(self.inputs['inputkeys']) is not len(inputmapfiles): self.logger.error("Number of input mapfiles %d and input keys %d have to match." % (len(inputmapfiles), len(self.inputs['inputkeys']))) return 1 filedict = {} if self.inputs['inputkeys'] and not self.inputs['skip_infile']: for key, filemap, mapname in zip(self.inputs['inputkeys'], inputmapfiles, inlist): if not mapname in self.inputs['mapfiles_as_string']: filedict[key] = [] for inp in filemap: filedict[key].append(inp.file) else: if key != mapname: filedict[key] = [] for inp in filemap: filedict[key].append(mapname) if self.inputs['outputkey']: filedict[self.inputs['outputkey']] = [] for item in outputmapfiles[0]: filedict[self.inputs['outputkey']].append(item.file) # ******************************************************************** # Call the node side of the recipe # Create and schedule the compute jobs #command = "python3 %s" % (self.__file__.replace('master', 'nodes')).replace('executable_args', self.inputs['nodescript']) recipe_dir_str = str(self.config.get('DEFAULT', 'recipe_directories')) recipe_directories = recipe_dir_str.rstrip(']').lstrip('[').split(',') pylist = os.getenv('PYTHONPATH').split(':') command = None for pl in pylist: if os.path.isfile(os.path.join(pl,'lofarpipe/recipes/nodes/'+self.inputs['nodescript']+'.py')): command = "python3 %s" % os.path.join(pl,'lofarpipe/recipes/nodes/'+self.inputs['nodescript']+'.py') for pl in recipe_directories: if os.path.isfile(os.path.join(pl,'nodes/'+self.inputs['nodescript']+'.py')): command = "python3 %s" % os.path.join(pl,'nodes/'+self.inputs['nodescript']+'.py') inputmapfiles[0].iterator = outputmapfiles[0].iterator = DataMap.SkipIterator jobs = [] for i, (outp, inp,) in enumerate(zip( outputmapfiles[0], inputmapfiles[0]) ): arglist_copy = copy.deepcopy(arglist) parsetdict_copy = copy.deepcopy(parsetdict) if filedict: for name, value in filedict.items(): replaced = False if arglist_copy: for arg in arglist: if name == arg: ind = arglist_copy.index(arg) arglist_copy[ind] = arglist_copy[ind].replace(name, value[i]) replaced = True if parsetdict_copy: if name in list(parsetdict_copy.values()): for k, v in parsetdict_copy.items(): if v == name: parsetdict_copy[k] = value[i] else: if not replaced: parsetdict_copy[name] = value[i] jobs.append( ComputeJob( inp.host, command, arguments=[ inp.file, executable, arglist_copy, parsetdict_copy, work_dir, self.inputs['parsetasfile'], args_format, self.environment ], resources={ "cores": self.inputs['nthreads'] } ) ) max_per_node = self.inputs['max_per_node'] self._schedule_jobs(jobs, max_per_node) jobresultdict = {} resultmap = {} for job, outp in zip(jobs, outputmapfiles[0]): if job.results['returncode'] != 0: outp.skip = True if not self.inputs['error_tolerance']: self.logger.error("A job has failed with returncode %d and error_tolerance is not set. Bailing out!" 
% job.results['returncode']) return 1 for k, v in list(job.results.items()): if not k in jobresultdict: jobresultdict[k] = [] jobresultdict[k].append(DataProduct(job.host, job.results[k], outp.skip)) if k == 'break': self.outputs.update({'break': v}) # temp solution. write all output dict entries to a mapfile #mapfile_dir = os.path.join(self.config.get("layout", "job_directory"), "mapfiles") #check directory for stand alone mode if not os.path.isdir(mapfile_dir): try: os.mkdir(mapfile_dir, ) except OSError as exc: # Python >2.5 if exc.errno == errno.EEXIST and os.path.isdir(mapfile_dir): pass else: raise for k, v in list(jobresultdict.items()): dmap = DataMap(v) dmap.save(os.path.join(mapfile_dir, self.inputs['stepname'] + '.' + k + '.mapfile')) resultmap[k + '.mapfile'] = os.path.join(mapfile_dir, self.inputs['stepname'] + '.' + k + '.mapfile') self.outputs.update(resultmap) # ********************************************************************* # Check job results, and create output data map file if self.error.isSet(): # Abort if all jobs failed if all(job.results['returncode'] != 0 for job in jobs): self.logger.error("All jobs failed. Bailing out!") return 1 else: self.logger.warn( "Some jobs failed, continuing with succeeded runs" ) mapdict = {} for item, name in zip(outputmapfiles, self.inputs['mapfiles_out']): self.logger.debug("Writing data map file: %s" % name) item.save(name) mapdict[os.path.basename(name)] = name self.outputs['mapfile'] = self.inputs['mapfile_out'] if self.inputs['outputsuffixes']: self.outputs.update(mapdict) return 0
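# ---------------------------------------------------------------------------
# Illustration only: the result aggregation above, reduced to plain dicts and
# a hypothetical collect_job_results helper. Every key a node job returns is
# collected into a list with one entry per job (tagged with host and skip
# state); the recipe then writes each list out as '<stepname>.<key>.mapfile'.
# ---------------------------------------------------------------------------
def collect_job_results(jobs):
    """jobs: iterable of (host, results_dict, skip) tuples (simplified)."""
    collected = {}
    for host, results, skip in jobs:
        for key, value in results.items():
            collected.setdefault(key, []).append(
                {'host': host, 'file': value, 'skip': skip})
    return collected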
def go(self): if 'executable' in self.inputs: executable = self.inputs['executable'] if self.inputs['nthreads']: self.environment["OMP_NUM_THREADS"] = str(self.inputs['nthreads']) if 'environment' in self.inputs: self.environment.update(self.inputs['environment']) self.logger.info("Starting %s run" % executable) super(executable_args, self).go() # args format stuff args_format = {'args_format': self.inputs['args_format'], 'args_format_argument': self.inputs['args_format_argument'], 'args_format_option': self.inputs['args_format_option'], 'args_formatlongoption': self.inputs['args_format_longoption'], 'args_format_option_argument': self.inputs['args_format_option_argument']} mapfile_dir = os.path.join(self.config.get("layout", "job_directory"), "mapfiles") work_dir = os.path.join(self.inputs['working_directory'], self.inputs['job_name']) # ********************************************************************* # try loading input/output data file, validate output vs the input location if # output locations are provided try: inputmapfiles = [] inlist = [] if self.inputs['mapfile_in']: inlist.append(self.inputs['mapfile_in']) if self.inputs['mapfiles_in']: for item in self.inputs['mapfiles_in']: inlist.append(item) self.inputs['mapfile_in'] = self.inputs['mapfiles_in'][0] for item in inlist: inputmapfiles.append(DataMap.load(item)) except Exception: self.logger.error('Could not load input Mapfile %s' % inlist) return 1 outputmapfiles = [] if self.inputs['mapfile_out']: try: outdata = DataMap.load(self.inputs['mapfile_out']) outputmapfiles.append(outdata) except Exception: self.logger.error('Could not load output Mapfile %s' % self.inputs['mapfile_out']) return 1 # sync skip fields in the mapfiles align_data_maps(inputmapfiles[0], outputmapfiles[0]) elif self.inputs['mapfiles_out']: for item in self.inputs['mapfiles_out']: outputmapfiles.append(DataMap.load(item)) self.inputs['mapfile_out'] = self.inputs['mapfiles_out'][0] else: # ouput will be directed in the working directory if no output mapfile is specified outdata = copy.deepcopy(inputmapfiles[0]) if not self.inputs['inplace']: for item in outdata: item.file = os.path.join( self.inputs['working_directory'], self.inputs['job_name'], #os.path.basename(item.file) + '.' + os.path.split(str(executable))[1] os.path.splitext(os.path.basename(item.file))[0] + '.' + self.inputs['stepname'] ) self.inputs['mapfile_out'] = os.path.join(mapfile_dir, self.inputs['stepname'] + '.' + 'mapfile') self.inputs['mapfiles_out'].append(self.inputs['mapfile_out']) else: self.inputs['mapfile_out'] = self.inputs['mapfile_in'] self.inputs['mapfiles_out'].append(self.inputs['mapfile_out']) outputmapfiles.append(outdata) if not validate_data_maps(inputmapfiles[0], outputmapfiles[0]): self.logger.error( "Validation of data mapfiles failed!" ) return 1 if self.inputs['outputsuffixes']: # Handle multiple outputfiles for name in self.inputs['outputsuffixes']: outputmapfiles.append(copy.deepcopy(inputmapfiles[0])) self.inputs['mapfiles_out'].append(os.path.join(mapfile_dir, self.inputs['stepname'] + name + '.' + 'mapfile')) for item in outputmapfiles[-1]: item.file = os.path.join( work_dir, os.path.splitext(os.path.basename(item.file))[0] + '.' 
+ self.inputs['stepname'] + name ) self.inputs['mapfile_out'] = self.inputs['mapfiles_out'][0] # prepare arguments arglist = self.inputs['arguments'] parsetdict = {} if 'parset' in self.inputs: parset = Parset() parset.adoptFile(self.inputs['parset']) for k in parset.keys: parsetdict[k] = str(parset[k]) # construct multiple input data if self.inputs['inputkey'] and not self.inputs['inputkey'] in self.inputs['inputkeys']: self.inputs['inputkeys'].insert(0, self.inputs['inputkey']) if not self.inputs['outputkeys'] and self.inputs['outputkey']: self.inputs['outputkeys'].append(self.inputs['outputkey']) if not self.inputs['skip_infile'] and len(self.inputs['inputkeys']) is not len(inputmapfiles): self.logger.error("Number of input mapfiles %d and input keys %d have to match." % (len(inputmapfiles), len(self.inputs['inputkeys']))) return 1 filedict = {} if self.inputs['inputkeys'] and not self.inputs['skip_infile']: for key, filemap, mapname in zip(self.inputs['inputkeys'], inputmapfiles, inlist): if not mapname in self.inputs['mapfiles_as_string']: filedict[key] = [] for inp in filemap: filedict[key].append(inp.file) else: if key != mapname: filedict[key] = [] for inp in filemap: filedict[key].append(mapname) if self.inputs['outputkey']: filedict[self.inputs['outputkey']] = [] for item in outputmapfiles[0]: filedict[self.inputs['outputkey']].append(item.file) # ******************************************************************** # Call the node side of the recipe # Create and schedule the compute jobs #command = "python %s" % (self.__file__.replace('master', 'nodes')).replace('executable_args', self.inputs['nodescript']) recipe_dir_str = str(self.config.get('DEFAULT', 'recipe_directories')) recipe_directories = recipe_dir_str.rstrip(']').lstrip('[').split(',') pylist = os.getenv('PYTHONPATH').split(':') command = None for pl in pylist: if os.path.isfile(os.path.join(pl,'lofarpipe/recipes/nodes/'+self.inputs['nodescript']+'.py')): command = "python %s" % os.path.join(pl,'lofarpipe/recipes/nodes/'+self.inputs['nodescript']+'.py') for pl in recipe_directories: if os.path.isfile(os.path.join(pl,'nodes/'+self.inputs['nodescript']+'.py')): command = "python %s" % os.path.join(pl,'nodes/'+self.inputs['nodescript']+'.py') inputmapfiles[0].iterator = outputmapfiles[0].iterator = DataMap.SkipIterator jobs = [] for i, (outp, inp,) in enumerate(zip( outputmapfiles[0], inputmapfiles[0]) ): arglist_copy = copy.deepcopy(arglist) parsetdict_copy = copy.deepcopy(parsetdict) if filedict: for name, value in filedict.iteritems(): replaced = False if arglist_copy: for arg in arglist: if name == arg: ind = arglist_copy.index(arg) arglist_copy[ind] = arglist_copy[ind].replace(name, value[i]) replaced = True if parsetdict_copy: if name in parsetdict_copy.values(): for k, v in parsetdict_copy.iteritems(): if v == name: parsetdict_copy[k] = value[i] else: if not replaced: parsetdict_copy[name] = value[i] jobs.append( ComputeJob( inp.host, command, arguments=[ inp.file, executable, arglist_copy, parsetdict_copy, work_dir, self.inputs['parsetasfile'], args_format, self.environment ], resources={ "cores": self.inputs['nthreads'] } ) ) max_per_node = self.inputs['max_per_node'] self._schedule_jobs(jobs, max_per_node) jobresultdict = {} resultmap = {} for job, outp in zip(jobs, outputmapfiles[0]): if job.results['returncode'] != 0: outp.skip = True if not self.inputs['error_tolerance']: self.logger.error("A job has failed with returncode %d and error_tolerance is not set. Bailing out!" 
% job.results['returncode']) return 1 for k, v in job.results.items(): if not k in jobresultdict: jobresultdict[k] = [] jobresultdict[k].append(DataProduct(job.host, job.results[k], outp.skip)) if k == 'break': self.outputs.update({'break': v}) # temp solution. write all output dict entries to a mapfile #mapfile_dir = os.path.join(self.config.get("layout", "job_directory"), "mapfiles") #check directory for stand alone mode if not os.path.isdir(mapfile_dir): try: os.mkdir(mapfile_dir, ) except OSError as exc: # Python >2.5 if exc.errno == errno.EEXIST and os.path.isdir(mapfile_dir): pass else: raise for k, v in jobresultdict.items(): dmap = DataMap(v) dmap.save(os.path.join(mapfile_dir, self.inputs['stepname'] + '.' + k + '.mapfile')) resultmap[k + '.mapfile'] = os.path.join(mapfile_dir, self.inputs['stepname'] + '.' + k + '.mapfile') self.outputs.update(resultmap) # ********************************************************************* # Check job results, and create output data map file if self.error.isSet(): # Abort if all jobs failed if all(job.results['returncode'] != 0 for job in jobs): self.logger.error("All jobs failed. Bailing out!") return 1 else: self.logger.warn( "Some jobs failed, continuing with succeeded runs" ) mapdict = {} for item, name in zip(outputmapfiles, self.inputs['mapfiles_out']): self.logger.debug("Writing data map file: %s" % name) item.save(name) mapdict[os.path.basename(name)] = name self.outputs['mapfile'] = self.inputs['mapfile_out'] if self.inputs['outputsuffixes']: self.outputs.update(mapdict) return 0
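import os

# ---------------------------------------------------------------------------
# Illustration only (hypothetical helper): how the recipe above derives default
# output names when no output mapfile is given. The input file name is kept,
# its extension dropped, and the step name appended, with the result placed
# inside the job's working directory.
# ---------------------------------------------------------------------------
def default_output_path(input_path, working_directory, job_name, stepname):
    base = os.path.splitext(os.path.basename(input_path))[0]
    return os.path.join(working_directory, job_name, base + '.' + stepname)

# Example: default_output_path('/data/L123_SB000.MS', '/scratch', 'myjob', 'prep')
# returns '/scratch/myjob/L123_SB000.prep'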
class datamapper(BaseRecipe):
    """
    Parses a list of filenames and attempts to map them to appropriate compute
    nodes (i.e., nodes which can access the files) on the LOFAR CEP cluster.
    Mapping by filename in this way is fragile, but is the best we can do for
    now.

    **Arguments**

    None.
    """
    inputs = {
        'mapfile': ingredient.StringField(
            '--mapfile',
            help="Full path (including filename) of mapfile to produce "
                 "(clobbered if exists)"
        )
    }

    outputs = {
        'mapfile': ingredient.FileField(
            help="Full path (including filename) of generated mapfile"
        )
    }

    def go(self):
        self.logger.info("Starting datamapper run")
        super(datamapper, self).go()

        # We build lists of compute nodes per cluster and data per cluster,
        # then match them up to schedule jobs in a round-robin fashion.
        # ----------------------------------------------------------------------
        clusterdesc = ClusterDesc(self.config.get('cluster', "clusterdesc"))
        if clusterdesc.subclusters:
            available_nodes = dict(
                (cl.name, cycle(get_compute_nodes(cl)))
                for cl in clusterdesc.subclusters
            )
        else:
            available_nodes = {
                clusterdesc.name: cycle(get_compute_nodes(clusterdesc))
            }

        data = defaultdict(list)
        for filename in self.inputs['args']:
            subcluster = filename.split(os.path.sep)[2]
            try:
                host = available_nodes[subcluster].next()
            except KeyError, key:
                self.logger.error("%s is not a known cluster" % str(key))
                raise
            data[host].append(filename)

        # Dump the generated mapping to a parset
        # ----------------------------------------------------------------------
        parset = Parset()
        for host, filenames in data.iteritems():
            parset.addStringVector(host, filenames)

        parset.writeFile(self.inputs['mapfile'])
        self.outputs['mapfile'] = self.inputs['mapfile']

        return 0
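from collections import defaultdict
from itertools import cycle

# ---------------------------------------------------------------------------
# Illustration only: the round-robin mapping performed by datamapper, reduced
# to a stand-alone function. Files are grouped by the sub-cluster encoded in
# the third component of their path and handed to that sub-cluster's nodes in
# turn. map_files_to_nodes is a hypothetical helper, not part of the recipe.
# ---------------------------------------------------------------------------
def map_files_to_nodes(filenames, nodes_per_subcluster):
    available = dict((name, cycle(nodes))
                     for name, nodes in nodes_per_subcluster.items())
    mapping = defaultdict(list)
    for filename in filenames:
        subcluster = filename.split('/')[2]
        host = next(available[subcluster])    # KeyError -> unknown cluster
        mapping[host].append(filename)
    return dict(mapping)

# Example:
# map_files_to_nodes(['/data/sub1/L1.MS', '/data/sub1/L2.MS'],
#                    {'sub1': ['node01', 'node02']})
# returns {'node01': ['/data/sub1/L1.MS'], 'node02': ['/data/sub1/L2.MS']}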