Exemplo n.º 1
0
    def _get_warnings_text_and_table(self):
        """
        Return the warnings section of the report as a single string.

        The returned text is made of up to three parts, joined by newlines:

        * a note about output to ``slurm.err`` if at least one run lists
          ``"output-to-slurm.err"`` among its ``unexplained_errors``,
        * the string form of a :py:class:`Table <lab.reports.Table>`
          containing one line for each run where an unexplained error
          occurred (only added if the table is truthy), and
        * a warning if the node names of the runs overlap with both the
          infai_1 and the infai_2 node lists.

        Each unexplained-errors message is additionally logged with
        ``logging.error``. If no part applies, the empty string is returned.
        """
        if not self.ERROR_ATTRIBUTES:
            logging.critical("The list of error attributes must not be empty.")

        table = reports.Table(title="Unexplained errors")
        table.set_column_order(self.ERROR_ATTRIBUTES)

        wrote_to_slurm_err = any(
            "output-to-slurm.err" in run.get("unexplained_errors", [])
            for run in self.runs.values())

        for run in self.runs.values():
            error_message = tools.get_unexplained_errors_message(run)
            if error_message:
                logging.error(error_message)
                run_dir = run["run_dir"]
                for attr in self.ERROR_ATTRIBUTES:
                    # Attributes missing from the run are shown as "?".
                    value = run.get(attr, "?")
                    if attr == "unexplained_errors":
                        value = self._format_unexplained_errors(value)
                        # Use formatted value as-is.
                        table.cell_formatters[run_dir][
                            attr] = reports.CellFormatter()
                    table.add_cell(run_dir, attr, value)

        errors = []

        if wrote_to_slurm_err:
            # NOTE(review): this slice assumes that eval_dir ends with
            # "-eval"; for other names it cuts off the last five characters
            # of the path -- confirm that callers guarantee the suffix.
            src_dir = self.eval_dir.rstrip("/")[:-len("-eval")]
            slurm_err_file = src_dir + "-grid-steps/slurm.err"
            try:
                slurm_err_content = tools.get_slurm_err_content(src_dir)
            except FileNotFoundError:
                # Fall back to a generic file name in the messages below.
                slurm_err_file = "*-grid-steps/slurm.err"
                errors.append(
                    f"There was output to {slurm_err_file}, but the file was missing "
                    "when this report was made.")
            else:
                slurm_err_content = tools.filter_slurm_err_content(
                    slurm_err_content)
                # The trailing space before the closing quote keeps
                # "without" and '"memory cg"' from running together.
                errors.append(
                    f"There was output to {slurm_err_file}. Below is the output without "
                    f'"memory cg" errors:\n```\n{slurm_err_content}\n```')
            logging.error(f"There was output to {slurm_err_file}.")

        if table:
            errors.append(str(table))

        infai_1_nodes = {f"ase{i:02d}.cluster.bc2.ch" for i in range(1, 25)}
        infai_2_nodes = {f"ase{i:02d}.cluster.bc2.ch" for i in range(31, 55)}
        nodes = self._get_node_names()
        if nodes & infai_1_nodes and nodes & infai_2_nodes:
            errors.append(
                "Report combines runs from infai_1 and infai_2 partitions.")

        return "\n".join(errors)
Exemplo n.º 2
0
    def __call__(self,
                 src_dir,
                 eval_dir=None,
                 merge=None,
                 filter=None,
                 **kwargs):
        """
        Copy properties from an exp-dir or eval-dir into an eval-dir.

        *src_dir* can either be an exp-dir or an eval-dir: it is read as
        an exp-dir if it contains a ``runs-00001-00100`` entry and as an
        eval-dir otherwise. *eval_dir* can be a new or an existing
        directory and defaults to *src_dir* with ``-eval`` appended.

        Properties already present in *eval_dir* are loaded first and then
        updated with the fetched ones. *merge* selects what happens before
        that: ``None`` calls ``_check_eval_dir(eval_dir)``, a true value
        leaves *eval_dir* untouched so the data is merged, and any other
        false value removes *eval_dir*.

        *filter* and the additional keyword arguments are passed on to
        ``tools.RunFilter``, which is applied to the fetched properties.

        We recommend using lab.Experiment.add_fetcher() to add fetchers
        to an experiment. See the method's documentation for a
        description of the parameters.

        """
        if not os.path.isdir(src_dir):
            logging.critical(
                "{} is missing or not a directory".format(src_dir))
        run_filter = tools.RunFilter(filter, **kwargs)

        if not eval_dir:
            eval_dir = src_dir.rstrip("/") + "-eval"
        logging.info(
            "Fetching properties from {} to {}".format(src_dir, eval_dir))

        if merge is None:
            _check_eval_dir(eval_dir)
        elif not merge:
            tools.remove_path(eval_dir)
        # For a true *merge* nothing has to be prepared: the existing data
        # is loaded below and merged with the fetched properties.

        # Start from the properties that the eval_dir already holds, if any.
        properties_path = os.path.join(eval_dir, "properties")
        combined_props = tools.Properties(properties_path)

        first_runs_dir = os.path.join(src_dir, "runs-00001-00100")
        if os.path.exists(first_runs_dir):
            # src_dir is an exp-dir: collect the properties run by run.
            slurm_err_content = tools.get_slurm_err_content(src_dir)
            if slurm_err_content:
                logging.error("There was output to *-grid-steps/slurm.err")

            new_props = tools.Properties()
            run_dirs = sorted(glob(os.path.join(src_dir, "runs-*-*", "*")))
            total_dirs = len(run_dirs)
            logging.info(
                "Scanning properties from {:d} run directories".format(
                    total_dirs))
            for position, run_dir in enumerate(run_dirs, start=1):
                # Report progress at INFO level only for every 100th run.
                if position % 100 == 0:
                    level = logging.INFO
                else:
                    level = logging.DEBUG
                progress = "Scanning: {:6d}/{:d}".format(position, total_dirs)
                logging.log(level, progress)
                run_props = self.fetch_dir(run_dir)
                if slurm_err_content:
                    run_props.add_unexplained_error("output-to-slurm.err")
                new_props["-".join(run_props["id"])] = run_props
            run_filter.apply(new_props)
            combined_props.update(new_props)
        else:
            # src_dir is an eval-dir: read its properties file directly.
            src_props = tools.Properties(
                filename=os.path.join(src_dir, "properties"))
            run_filter.apply(src_props)
            combined_props.update(src_props)
            logging.info(
                "Fetched properties of {} runs.".format(len(src_props)))

        # Log every unexplained-errors message and count the affected runs.
        unexplained_errors = 0
        for run_props in combined_props.values():
            message = tools.get_unexplained_errors_message(run_props)
            if not message:
                continue
            logging.error(message)
            unexplained_errors += 1

        tools.makedirs(eval_dir)
        combined_props.write()
        logging.info(
            "Wrote properties file (contains {unexplained_errors} "
            "runs with unexplained errors).".format(
                unexplained_errors=unexplained_errors))
Exemplo n.º 3
0
    def _get_warnings_text_and_table(self):
        """
        Return the warnings section of the report as a single string.

        The text consists of up to three newline-joined parts: the
        contents of the ``slurm.err`` file (if any run lists
        ``'output-to-slurm.err'`` among its ``unexplained_errors``), the
        string form of a :py:class:`Table <lab.reports.Table>` containing
        one line for each run where an unexplained error occurred (only
        added if the table is truthy), and a warning if the node names of
        the runs overlap with both the infai_1 and infai_2 node lists.

        The individual error messages and the number of affected runs are
        also logged with ``logging.error``.
        """
        if not self.ERROR_ATTRIBUTES:
            logging.critical('The list of error attributes must not be empty.')

        table = reports.Table(title='Unexplained errors')
        table.set_column_order(self.ERROR_ATTRIBUTES)

        # Stop at the first run that reports output to slurm.err.
        slurm_err_reported = False
        for props in self.runs.values():
            if 'output-to-slurm.err' in props.get('unexplained_errors', []):
                slurm_err_reported = True
                break

        failed_runs = 0
        for props in self.runs.values():
            message = tools.get_unexplained_errors_message(props)
            if not message:
                continue
            logging.error(message)
            failed_runs += 1
            for column in self.ERROR_ATTRIBUTES:
                # Attributes missing from the run are shown as '?'.
                table.add_cell(
                    props['run_dir'], column, props.get(column, '?'))

        if failed_runs:
            logging.error(
                'There were {num_unexplained_errors} runs with unexplained'
                ' errors.'.format(num_unexplained_errors=failed_runs))

        parts = []

        if slurm_err_reported:
            # Drop the trailing '-eval' to obtain the experiment directory.
            exp_dir = self.eval_dir.rstrip('/')[:-len('-eval')]
            slurm_err_file = exp_dir + '-grid-steps/slurm.err'
            try:
                raw_content = tools.get_slurm_err_content(exp_dir)
            except IOError:
                # Show a placeholder text instead of the file contents.
                slurm_err_content = (
                    'The slurm.err file was missing while creating the report.'
                )
            else:
                slurm_err_content = tools.filter_slurm_err_content(
                    raw_content)

            logging.error(
                'There was output to {slurm_err_file}.'.format(
                    slurm_err_file=slurm_err_file))

            parts.append(
                ' Contents of {slurm_err_file} without "memory cg"'
                ' errors:\n```\n{slurm_err_content}\n```'.format(
                    slurm_err_file=slurm_err_file,
                    slurm_err_content=slurm_err_content))

        if table:
            parts.append(str(table))

        infai_1_nodes = {
            'ase{:02d}.cluster.bc2.ch'.format(i) for i in range(1, 25)}
        infai_2_nodes = {
            'ase{:02d}.cluster.bc2.ch'.format(i) for i in range(31, 55)}
        nodes = self._get_node_names()
        if (nodes & infai_1_nodes) and (nodes & infai_2_nodes):
            parts.append(
                'Report combines runs from infai_1 and infai_2 partitions.')

        return '\n'.join(parts)