Example #1
class vdsmaker(BaseRecipe, RemoteCommandRecipeMixIn):
    """
    Generate a GVDS file (and, optionally, individual VDS files per subband;
    see the ``unlink`` input parameter) describing a collection of
    MeasurementSets.

    1. Load data from disk and create the output VDS paths
    2. Call the vdsmaker node script to generate the VDS files
    3. Combine the VDS files into a single GVDS file (master-side operation)

    **Command line arguments**

    A mapfile describing the MeasurementSets to be processed.
    """
    inputs = {
        'gvds':
        ingredient.StringField('-g',
                               '--gvds',
                               help="File name for output GVDS file"),
        'directory':
        ingredient.DirectoryField('--directory',
                                  help="Directory for output GVDS file"),
        'makevds':
        ingredient.ExecField('--makevds',
                             help="Full path to makevds executable"),
        'combinevds':
        ingredient.ExecField('--combinevds',
                             help="Full path to combinevds executable"),
        'unlink':
        ingredient.BoolField('--unlink',
                             help="Unlink VDS files after combining",
                             default=True),
        'nproc':
        ingredient.IntField(
            '--nproc',
            help="Maximum number of simultaneous processes per compute node",
            default=8)
    }

    outputs = {'gvds': ingredient.FileField()}

    def go(self):
        """
        Run the vdsmaker: generate one VDS file per MeasurementSet and
        combine them into a single GVDS file.
        """
        super(vdsmaker, self).go()
        # *********************************************************************
        # 1. Load data from disk and create output file names
        args = self.inputs['args']
        self.logger.debug("Loading input-data mapfile: %s" % args[0])
        data = DataMap.load(args[0])

        # Skip items in `data` that have 'skip' set to True
        data.iterator = DataMap.SkipIterator

        # Create output vds names
        vdsnames = [
            os.path.join(self.inputs['directory'],
                         os.path.basename(item.file) + '.vds') for item in data
        ]

        # *********************************************************************
        # 2. Call vdsmaker
        command = "python %s" % (self.__file__.replace('master', 'nodes'))
        jobs = []
        for inp, vdsfile in zip(data, vdsnames):
            jobs.append(
                ComputeJob(inp.host,
                           command,
                           arguments=[
                               inp.file,
                               self.config.get('cluster', 'clusterdesc'),
                               vdsfile, self.inputs['makevds']
                           ]))
        self._schedule_jobs(jobs, max_per_node=self.inputs['nproc'])
        vdsnames = [
            vds for vds, job in zip(vdsnames, jobs)
            if job.results['returncode'] == 0
        ]
        if not vdsnames:
            self.logger.error("All makevds processes failed. Bailing out!")
            return 1

        # *********************************************************************
        # 3. Combine VDS files to produce GDS
        failure = False
        self.logger.info("Combining VDS files")
        executable = self.inputs['combinevds']
        gvds_out = self.inputs['gvds']
        # Create the directory for the output GVDS file; combinevds needs it
        create_directory(os.path.dirname(gvds_out))

        try:
            command = [executable, gvds_out] + vdsnames
            combineproc = subprocess.Popen(command,
                                           close_fds=True,
                                           stdout=subprocess.PIPE,
                                           stderr=subprocess.PIPE)
            sout, serr = combineproc.communicate()
            log_process_output(executable, sout, serr, self.logger)
            if combineproc.returncode != 0:
                raise subprocess.CalledProcessError(combineproc.returncode,
                                                    command)
            self.outputs['gvds'] = gvds_out
            self.logger.info("Wrote combined VDS file: %s" % gvds_out)
        except subprocess.CalledProcessError as cpe:
            self.logger.exception("combinevds failed with status %d: %s" %
                                  (cpe.returncode, serr))
            failure = True
        except OSError as err:
            self.logger.error("Failed to spawn combinevds (%s)" % str(err))
            failure = True

        # Clean up the individual VDS files if requested (see the ``unlink``
        # input parameter), then report success or failure
        if self.inputs['unlink']:
            self.logger.debug("Unlinking temporary VDS files")
            for name in vdsnames:
                os.unlink(name)

        if failure:
            return 1
        return 0
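
For reference, a parent recipe normally drives vdsmaker through ``cook_recipe`` rather than instantiating the class directly; the cimager recipe in Example #2 below does exactly this. The following is a minimal sketch of that calling pattern, where ``self`` is assumed to be a running ``BaseRecipe`` and all paths are illustrative placeholders:

# Minimal sketch of driving vdsmaker from another recipe (cf. Example #2).
# All paths below are illustrative placeholders, not recipe defaults.
inputs = LOFARinput(self.inputs)
inputs['args'] = ['input_data.mapfile']       # mapfile of MeasurementSets
inputs['gvds'] = 'combined.gvds'              # name of the combined GVDS file
inputs['directory'] = 'vds_output_directory'  # where per-subband VDS files go
inputs['makevds'] = '/path/to/makevds'
inputs['combinevds'] = '/path/to/combinevds'
inputs['unlink'] = True                       # delete VDS files after combining
outputs = LOFARoutput(self.inputs)
if self.cook_recipe('vdsmaker', inputs, outputs):
    self.logger.warn("vdsmaker reports failure")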
Example #2
class cimager(BaseRecipe, RemoteCommandRecipeMixIn):
    """
    Provides a convenient, pipeline-based mechanism of running the cimager on
    a dataset.

    Can ingest either an MWimager-style parset, converting to cimager format
    as required, or a cimager parset directly.

    **Arguments**

    A mapfile describing the data to be processed.
    """
    inputs = {
        'imager_exec':
        ingredient.ExecField('--imager-exec', help="cimager executable"),
        'convert_exec':
        ingredient.ExecField('--convert-exec',
                             help="convertimagerparset executable"),
        'parset':
        ingredient.FileField(
            '--parset',
            help="Imager configuration parset (mwimager or cimager format)"),
        'nproc':
        ingredient.IntField(
            '--nproc',
            help="Maximum number of simultaneous processes per compute node",
            default=8),
        'timestep':
        ingredient.FloatField(
            '--timestep',
            help="If non-zero, multiple images will be made, each using "
            "timestep seconds of data",
            default=0.0),
        'results_dir':
        ingredient.DirectoryField(
            '--results-dir',
            help="Directory in which resulting images will be placed",
        ),
        'parset_type':
        ParsetTypeField('--parset-type',
                        default="mwimager",
                        help="cimager or mwimager"),
        'makevds':
        ingredient.ExecField('--makevds',
                             help="makevds executable",
                             default="/opt/LofIm/daily/lofar/bin/makevds"),
        'combinevds':
        ingredient.ExecField('--combinevds',
                             help="combinevds executable",
                             default="/opt/LofIm/daily/lofar/bin/combinevds")
    }

    outputs = {'images': ingredient.ListField()}

    def go(self):
        self.logger.info("Starting cimager run")
        super(cimager, self).go()
        self.outputs['images'] = []

        #              Build a GVDS file describing all the data to be processed
        # ----------------------------------------------------------------------
        self.logger.debug("Building VDS file describing all data for cimager")
        gvds_file = os.path.join(self.config.get("layout", "job_directory"),
                                 "vds", "cimager.gvds")
        inputs = LOFARinput(self.inputs)
        inputs['args'] = self.inputs['args']
        inputs['gvds'] = gvds_file
        inputs['unlink'] = False
        inputs['makevds'] = self.inputs['makevds']
        inputs['combinevds'] = self.inputs['combinevds']
        inputs['nproc'] = self.inputs['nproc']
        inputs['directory'] = os.path.dirname(gvds_file)
        outputs = LOFARoutput(self.inputs)
        if self.cook_recipe('vdsmaker', inputs, outputs):
            self.logger.warn("vdsmaker reports failure")
            return 1
        self.logger.debug("cimager GVDS is %s" % (gvds_file, ))

        #                            Read data for processing from the GVDS file
        # ----------------------------------------------------------------------
        parset = Parset(gvds_file)

        data = []
        for part in range(parset.getInt('NParts')):
            host = parset.getString("Part%d.FileSys" % part).split(":")[0]
            vds = parset.getString("Part%d.Name" % part)
            data.append((host, vds))

        #                                 Divide data into timesteps for imaging
        #          timesteps is a list of (start, end, results directory) tuples
        # ----------------------------------------------------------------------
        timesteps = []
        results_dir = self.inputs['results_dir']
        if self.inputs['timestep'] == 0:
            self.logger.info("No timestep specified; imaging all data")
            timesteps = [(None, None, results_dir)]
        else:
            self.logger.info("Using timestep of %s s" %
                             self.inputs['timestep'])
            gvds = get_parset(gvds_file)
            start_time = quantity(gvds['StartTime'].get()).get('s').get_value()
            end_time = quantity(gvds['EndTime'].get()).get('s').get_value()
            step = float(self.inputs['timestep'])
            while start_time < end_time:
                timesteps.append((start_time, start_time + step,
                                  os.path.join(results_dir, str(start_time))))
                start_time += step

        #                          Run each cimager process in a separate thread
        # ----------------------------------------------------------------------
        command = "python %s" % (self.__file__.replace('master', 'nodes'))
        for label, timestep in enumerate(timesteps):
            self.logger.info("Processing timestep %d" % label)
            jobs = []
            parsets = []
            start_time, end_time, resultsdir = timestep
            for host, vds in data:
                vds_data = Parset(vds)
                frequency_range = [
                    vds_data.getDoubleVector("StartFreqs")[0],
                    vds_data.getDoubleVector("EndFreqs")[-1]
                ]
                parsets.append(
                    self.__get_parset(
                        os.path.basename(
                            vds_data.getString('FileName')).split('.')[0],
                        vds_data.getString("FileName"),
                        str(frequency_range),
                        vds_data.getStringVector("Extra.FieldDirectionType")
                        [0],
                        vds_data.getStringVector("Extra.FieldDirectionRa")[0],
                        vds_data.getStringVector("Extra.FieldDirectionDec")[0],
                        'True',  # cimager bug: non-restored image unusable
                    ))
                jobs.append(
                    ComputeJob(host,
                               command,
                               arguments=[
                                   self.inputs['imager_exec'], vds,
                                   parsets[-1], resultsdir, start_time,
                                   end_time
                               ]))
            self._schedule_jobs(jobs, max_per_node=self.inputs['nproc'])
            for parset_file in parsets:
                parset = Parset(parset_file)
                image_names = parset.getStringVector("Cimager.Images.Names")
                self.outputs['images'].extend(image_names)
                os.unlink(parset_file)

        #                Check if we recorded a failing process before returning
        # ----------------------------------------------------------------------
        if self.error.isSet():
            self.logger.warn("Failed imager process detected")
            return 1
        else:
            return 0

    def __get_parset(self, name, dataset, frequency, ms_dir_type, ms_dir_ra,
                     ms_dir_dec, restore):
        def convert_mwimager_parset(parset):
            try:
                with patched_parset(
                        parset,
                        {
                            'dataset': dataset,
                            'Images.frequency': frequency,
                            'msDirType': ms_dir_type,
                            'msDirRa': ms_dir_ra,
                            'msDirDec': ms_dir_dec,
                            # cimager bug: non-restored image unusable
                            'restore': restore
                        }) as cimager_parset:
                    fd, converted_parset = tempfile.mkstemp(
                        dir=self.config.get("layout", "job_directory"))
                    convert_process = spawn_process([
                        self.inputs['convert_exec'], cimager_parset,
                        converted_parset
                    ], self.logger)
                    os.close(fd)
                    sout, serr = convert_process.communicate()
                    log_process_output(self.inputs['convert_exec'], sout, serr,
                                       self.logger)
                    if convert_process.returncode != 0:
                        raise subprocess.CalledProcessError(
                            convert_process.returncode,
                            self.inputs['convert_exec'])
                    return converted_parset
            except OSError as e:
                self.logger.error("Failed to spawn convertimagerparset (%s)" %
                                  str(e))
                raise
            except subprocess.CalledProcessError as e:
                self.logger.error(str(e))
                raise

        def populate_cimager_parset(parset):
            input_parset = Parset(parset)
            patch_dictionary = {
                'Cimager.dataset': dataset,
                'Cimager.restore': restore
            }
            image_names = []
            for image_name in input_parset.getStringVector(
                    'Cimager.Images.Names'):
                image_names.append("%s_%s" % (image_name, name))
                subset = input_parset.makeSubset(
                    "Cimager.Images.%s" % image_name,
                    "Cimager.Images.%s" % image_names[-1])
                patch_dictionary["Cimager.Images.%s.frequency" %
                                 image_names[-1]] = frequency
                patch_dictionary["Cimager.Images.%s.direction" %
                                 image_names[-1]] = "[ %s,%s,%s ]" % (
                                     ms_dir_ra, ms_dir_dec, ms_dir_type)
                for key in subset:
                    patch_dictionary[key] = subset[key].get()
            input_parset.subtractSubset('Cimager.Images.image')
            for key in input_parset:
                patch_dictionary[key] = input_parset[key].get()
            patch_dictionary['Cimager.Images.Names'] = "[ %s ]" % ", ".join(
                image_names)
            return patch_parset(None, patch_dictionary,
                                self.config.get("layout", "job_directory"))

        try:
            if self.inputs['parset_type'] == "mwimager":
                cimager_parset = convert_mwimager_parset(self.inputs['parset'])
            elif self.inputs['parset_type'] == "cimager":
                cimager_parset = populate_cimager_parset(self.inputs['parset'])
            else:
                raise ValueError("Unknown parset_type: %s" %
                                 self.inputs['parset_type'])
        except Exception:
            self.logger.exception("Failed to generate imager parset")
            raise

        return cimager_parset
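
The timestep chunking in ``cimager.go()`` above is easy to check in isolation: given start and end times in seconds, the loop yields one ``(start, end, results_dir)`` tuple per ``timestep`` seconds of data, with the last chunk nominally extending past the end of the observation. A standalone sketch follows (the helper name ``split_timesteps`` is illustrative, not part of the recipe):

import os

def split_timesteps(start_time, end_time, step, results_dir):
    """Restate the chunking loop from cimager.go() for illustration."""
    timesteps = []
    while start_time < end_time:
        timesteps.append((start_time, start_time + step,
                          os.path.join(results_dir, str(start_time))))
        start_time += step
    return timesteps

# 100 s of data in 30 s steps -> four chunks; the last one runs to 120 s.
print(split_timesteps(0.0, 100.0, 30.0, '/data/results'))
# [(0.0, 30.0, '/data/results/0.0'), (30.0, 60.0, '/data/results/30.0'),
#  (60.0, 90.0, '/data/results/60.0'), (90.0, 120.0, '/data/results/90.0')]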
Example #3
class demixing(BaseRecipe, RemoteCommandRecipeMixIn):
    """
    Run the demixer on the MS files on the compute nodes.
    """
    inputs = {
        'mapfile':
        ingredient.StringField(
            '--mapfile',
            help="Name of the output mapfile containing the names of the "
            "MS-files produced by the demixing recipe"),
        'working_directory':
        ingredient.StringField('-w',
                               '--working-directory',
                               help="Working directory used on output nodes. "
                               "Results will be written here"),
        'initscript':
        ingredient.FileField(
            '--initscript',
            help="The full path to an (Bourne) shell script which will "
            "intialise the environment (ie, ``lofarinit.sh``)"),
        'demix_parset_dir':
        ingredient.DirectoryField(
            '--demix-parset-dir',
            dest='demixdir',
            help="Directory containing the demixing parset-files",
        ),
        'db_host':
        ingredient.StringField(
            '--db-host',
            dest="db_host",
            help="Database host with optional port (e.g. ldb001)"),
        'skymodel':
        ingredient.FileField(
            '--skymodel',
            help="File containing the sky model to use",
        ),
        'demix_sources':
        ingredient.ListField(
            '--demix-sources',
            dest='remove',
            help="List of sources to remove e.g. 'CygA, CasA'; "
            "will be determined automatically if not specified.",
            default=[]),
        'ms_target':
        ingredient.StringField(
            '--ms-target',
            dest='target',
            help="Substring in the output MS name that replaces the "
            "substring 'uv' (default: 'target')",
            default="target"),
        'timestep':
        ingredient.IntField('--timestep',
                            help="Time step for averaging",
                            default=10),
        'freqstep':
        ingredient.IntField('--freqstep',
                            help="Frequency step for averaging",
                            default=60),
        'half_window':
        ingredient.IntField('--half-window',
                            help="Window size of median filter",
                            default=20),
        'threshold':
        ingredient.FloatField(
            '--threshold',
            help="Solutions above/below threshold*rms are smoothed",
            default=2.5),
        'nproc':
        ingredient.IntField(
            '--nproc',
            help="Maximum number of simultaneous processes per compute node",
            default=1)
    }

    outputs = {'mapfile': ingredient.FileField()}

    def go(self):
        self.logger.info("Starting demixing run")
        super(demixing, self).go()

        job_dir = os.path.join(self.inputs['working_directory'],
                               self.inputs['job_name'])

        #                       Load file <-> compute node mapping from disk
        # ------------------------------------------------------------------
        args = self.inputs['args']
        self.logger.debug("Loading input-data mapfile: %s" % args[0])
        indata = load_data_map(args[0])
        if len(args) > 1:
            self.logger.debug("Loading output-data mapfile: %s" % args[1])
            outdata = load_data_map(args[1])
            if not validate_data_maps(indata, outdata):
                self.logger.error(
                    "Validation of input/output data mapfiles failed")
                return 1
        else:
            # This is a bit of a kludge. The input MS-filenames are supposed
            # to contain the string "_uv". The demixing node script produces
            # output MS-files whose names have the string "_uv" replaced by
            # "_" + self.inputs['ms_target'] + "_sub" (see the sketch after
            # this class).
            outdata = [(host,
                        os.path.join(
                            job_dir,
                            os.path.basename(infile).replace(
                                '_uv',
                                '_' + self.inputs['ms_target'] + '_sub')))
                       for host, infile in indata]

        command = "python %s" % (self.__file__.replace('master', 'nodes'))
        jobs = []
        for host, infile in indata:
            jobs.append(
                ComputeJob(
                    host,
                    command,
                    arguments=[
                        infile, job_dir, self.inputs['initscript'],
                        self.inputs['demix_sources'], self.inputs['ms_target'],
                        self.config.get('cluster', 'clusterdesc'),
                        self.inputs['timestep'], self.inputs['freqstep'],
                        self.inputs['half_window'], self.inputs['threshold'],
                        self.inputs['demix_parset_dir'],
                        self.inputs['skymodel'], self.inputs['db_host']
                    ]))
        self._schedule_jobs(jobs, max_per_node=self.inputs['nproc'])

        if self.error.isSet():
            return 1
        else:
            self.logger.debug("Writing mapfile %s" % self.inputs['mapfile'])
            store_data_map(self.inputs['mapfile'], outdata)
            self.outputs['mapfile'] = self.inputs['mapfile']
            return 0
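
The naming kludge flagged in ``demixing.go()`` is simple to demonstrate on its own: output MS names are derived from the inputs by substituting the ``_uv`` substring. A standalone sketch follows (the helper name and the file names are illustrative, not part of the recipe):

import os

def demixed_name(infile, job_dir, ms_target='target'):
    """Restate the output-name substitution from demixing.go()."""
    return os.path.join(
        job_dir,
        os.path.basename(infile).replace('_uv', '_' + ms_target + '_sub'))

# An input MS named L12345_SAP000_SB000_uv.MS maps to
# /data/scratch/jobname/L12345_SAP000_SB000_target_sub.MS
print(demixed_name('/data/L12345_SAP000_SB000_uv.MS', '/data/scratch/jobname'))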