Exemplo n.º 1
0
class ParallelMaster(GenericMaster):
    """
    MasterJob that handles the creation and analysis of several parallel jobs (including master and
    continuation jobs), Examples are Murnaghan or Phonon calculations

    Args:
        project (ProjectHDFio): ProjectHDFio instance which points to the HDF5 file the job is stored in
        job_name (str): name of the job, which has to be unique within the project

    Attributes:

        .. attribute:: job_name

            name of the job, which has to be unique within the project

        .. attribute:: status

            execution status of the job, can be one of the following [initialized, appended, created, submitted,
                                                                      running, aborted, collect, suspended, refresh,
                                                                      busy, finished]

        .. attribute:: job_id

            unique id to identify the job in the pyiron database

        .. attribute:: parent_id

            job id of the predecessor job - the job which was executed before the current one in the current job series

        .. attribute:: master_id

            job id of the master job - a meta job which groups a series of jobs, which are executed either in parallel
            or in serial.

        .. attribute:: child_ids

            list of child job ids - only meta jobs have child jobs - jobs which list the meta job as their master

        .. attribute:: project

            Project instance the jobs is located in

        .. attribute:: project_hdf5

            ProjectHDFio instance which points to the HDF5 file the job is stored in

        .. attribute:: job_info_str

            short string to describe the job by it is job_name and job ID - mainly used for logging

        .. attribute:: working_directory

            working directory of the job is executed in - outside the HDF5 file

        .. attribute:: path

            path to the job as a combination of absolute file system path and path within the HDF5 file.

        .. attribute:: version

            Version of the hamiltonian, which is also the version of the executable unless a custom executable is used.

        .. attribute:: executable

            Executable used to run the job - usually the path to an external executable.

        .. attribute:: library_activated

            For job types which offer a Python library pyiron can use the python library instead of an external
            executable.

        .. attribute:: server

            Server object to handle the execution environment for the job.

        .. attribute:: queue_id

            the ID returned from the queuing system - it is most likely not the same as the job ID.

        .. attribute:: logger

            logger object to monitor the external execution and internal pyiron warnings.

        .. attribute:: restart_file_list

            list of files which are used to restart the calculation from these files.

        .. attribute:: job_type

            Job type object with all the available job types: ['ExampleJob', 'SerialMaster', 'ParallelMaster',
                                                               'ScriptJob', 'ListMaster']

        .. attribute:: child_names

            Dictionary matching the child ID to the child job name.

        .. attribute:: ref_job

            Reference job template from which all jobs within the ParallelMaster are generated.

        .. attribute:: number_jobs_total

            Total number of jobs
    """
    def __init__(self, project, job_name):
        self.input = GenericParameters("parameters")
        super(ParallelMaster, self).__init__(project, job_name=job_name)
        self.__name__ = "ParallelMaster"
        self.__version__ = "0.3"
        self._ref_job = None
        self._output = GenericOutput()
        self._job_generator = None
        self.submission_status = SubmissionStatus(db=project.db,
                                                  job_id=self.job_id)
        self.refresh_submission_status()

    @property
    def ref_job(self):
        """
        Get the reference job template from which all jobs within the ParallelMaster are generated.

        Returns:
            GenericJob: reference job
        """
        if self._ref_job:
            return self._ref_job
        try:
            ref_job = self[0]
            if isinstance(ref_job, GenericJob):
                self._ref_job = ref_job
                self._ref_job._job_id = None
                self._ref_job._status = JobStatus(db=self.project.db)
                return self._ref_job
            else:
                return None
        except IndexError:
            return None

    @ref_job.setter
    def ref_job(self, ref_job):
        """
        Set the reference job template from which all jobs within the ParallelMaster are generated.

        Args:
            ref_job (GenericJob): reference job
        """
        self.append(ref_job)

    @property
    def number_jobs_total(self):
        """
        Get number of total jobs

        Returns:
            int: number of total jobs
        """
        return self.submission_status.total_jobs

    @number_jobs_total.setter
    def number_jobs_total(self, num_jobs):
        """
        Set number of total jobs (optional: default = None)

        Args:
            num_jobs (int): number of submitted jobs
        """
        self.submission_status.total_jobs = num_jobs

    def set_input_to_read_only(self):
        """
        This function enforces read-only mode for the input classes, but it has to be implement in the individual
        classes.
        """
        self.input.read_only = True

    def reset_job_id(self, job_id=None):
        """
        Reset the job id sets the job_id to None as well as all connected modules like JobStatus and SubmissionStatus.
        """
        super(ParallelMaster, self).reset_job_id(job_id=job_id)
        if job_id is not None:
            self.submission_status = SubmissionStatus(db=self.project.db,
                                                      job_id=job_id)
        else:
            self.submission_status = SubmissionStatus(db=self.project.db,
                                                      job_id=self.job_id)

    def to_hdf(self, hdf=None, group_name=None):
        """
        Store the ParallelMaster in an HDF5 file

        Args:
            hdf (ProjectHDFio): HDF5 group object - optional
            group_name (str): HDF5 subgroup name - optional
        """
        super(ParallelMaster, self).to_hdf(hdf=hdf, group_name=group_name)
        with self.project_hdf5.open("input") as hdf5_input:
            self.input.to_hdf(hdf5_input)

    def from_hdf(self, hdf=None, group_name=None):
        """
        Restore the ParallelMaster from an HDF5 file

        Args:
            hdf (ProjectHDFio): HDF5 group object - optional
            group_name (str): HDF5 subgroup name - optional
        """
        super(ParallelMaster, self).from_hdf(hdf=hdf, group_name=group_name)
        with self.project_hdf5.open("input") as hdf5_input:
            self.input.from_hdf(hdf5_input)

    def write_input(self):
        """
        Write the input files - this contains the GenericInput of the ParallelMaster as well as reseting the submission
        status.
        """
        self.submission_status.submitted_jobs = 0
        self.input.write_file(file_name="input.inp",
                              cwd=self.working_directory)

    def collect_output(self):
        """
        Collect the output files of the external executable and store the information in the HDF5 file. This method has
        to be implemented in the individual meta jobs derived from the ParallelMaster.
        """
        raise NotImplementedError("Implement in derived class")

    def collect_logfiles(self):
        """
        Collect the log files of the external executable and store the information in the HDF5 file. This method is
        currently not implemented for the ParallelMaster.
        """
        pass

    def output_to_pandas(self, sort_by=None, h5_path="output"):
        """
        Convert output of all child jobs to a pandas Dataframe object.

        Args:
            sort_by (str): sort the output using pandas.DataFrame.sort_values(by=sort_by)
            h5_path (str): select child output to include - default='output'

        Returns:
            pandas.Dataframe: output as dataframe
        """
        # TODO: The output to pandas function should no longer be required
        with self.project_hdf5.open(h5_path) as hdf:
            for key in hdf.list_nodes():
                self._output[key] = hdf[key]
        df = pandas.DataFrame(self._output)
        if sort_by is not None:
            df = df.sort_values(by=sort_by)
        return df

    # TODO: make it more general and move it then into genericJob
    def show_hdf(self):
        """
        Display the output of the child jobs in a human readable print out
        """
        try:
            display = getattr(importlib.import_module("IPython"), "display")
        except ModuleNotFoundError:
            print("show_hdf() requires IPython to be installed.")
        else:
            for nn in self.project_hdf5.list_groups():
                with self.project_hdf5.open(nn) as hdf_dir:
                    display.display(nn)
                    if nn.strip() == "output":
                        display.display(self.output_to_pandas(h5_path=nn))
                        continue
                    for n in hdf_dir.list_groups():
                        display.display("-->" + n)
                        try:
                            display.display(hdf_dir.get_pandas(n))
                        except Exception as e:
                            print(e)
                            print("Not a pandas object")

    def save(self):
        """
        Save the object, by writing the content to the HDF5 file and storing an entry in the database.

        Returns:
            (int): Job ID stored in the database
        """
        job_id = super(ParallelMaster, self).save()
        self.refresh_submission_status()
        return job_id

    def refresh_submission_status(self):
        """
        Refresh the submission status - if a job ID job_id is set then the submission status is loaded from the
        database.
        """
        if self.job_id:
            self.submission_status = SubmissionStatus(db=self.project.db,
                                                      job_id=self.job_id)
            self.submission_status.refresh()

    def interactive_ref_job_initialize(self):
        """
        To execute the reference job in interactive mode it is necessary to initialize it.
        """
        if len(self._job_name_lst) > 0:
            self._ref_job = self.pop(-1)
            self._ref_job.job_name = self.job_name + "_" + self._ref_job.job_name
            if self._job_id is not None and self._ref_job._master_id is None:
                self._ref_job.master_id = self.job_id

    def copy(self):
        """
        Copy the GenericJob object which links to the job and its HDF5 file

        Returns:
            GenericJob: New GenericJob object pointing to the same job
        """
        new_job = super(ParallelMaster, self).copy()
        new_job.ref_job = self.ref_job
        return new_job

    def copy_to(self,
                project=None,
                new_job_name=None,
                input_only=False,
                new_database_entry=True):
        """
        Copy the content of the job including the HDF5 file to a new location

        Args:
            project (ProjectHDFio): project to copy the job to
            new_job_name (str): to duplicate the job within the same porject it is necessary to modify the job name
                                - optional
            input_only (bool): [True/False] to copy only the input - default False
            new_database_entry (bool): [True/False] to create a new database entry - default True

        Returns:
            GenericJob: GenericJob object pointing to the new location.
        """
        new_generic_job = super(ParallelMaster, self).copy_to(
            project=project,
            new_job_name=new_job_name,
            input_only=input_only,
            new_database_entry=new_database_entry,
        )
        new_generic_job.submission_status = SubmissionStatus(
            db=new_generic_job._hdf5.project.db, job_id=new_generic_job.job_id)
        return new_generic_job

    def is_finished(self):
        """
        Check if the ParallelMaster job is finished - by checking the job status and the submission status.

        Returns:
            bool: [True/False]
        """
        if self.status.finished:
            return True
        if len(self.child_ids) < len(self._job_generator):
            return False
        return set([
            self.project.db.get_job_status(child_id)
            for child_id in self.child_ids
        ]) < {"finished", "busy", "refresh", "aborted", "not_converged"}

    def iter_jobs(self, convert_to_object=True):
        """
        Iterate over the jobs within the ListMaster

        Args:
            convert_to_object (bool): load the full GenericJob object (default) or just the HDF5 / JobCore object

        Returns:
            yield: Yield of GenericJob or JobCore
        """
        for job_id in self._get_jobs_sorted():
            yield self.project.load(job_id,
                                    convert_to_object=convert_to_object)

    def _get_jobs_sorted(self):
        job_names = self.child_names.values()
        return [
            j for j in [
                self._job_generator.job_name(p)
                for p in self._job_generator.parameter_list
            ] if j in job_names
        ]

    def __getitem__(self, item):
        """
        Get/ read data from the HDF5 file

        Args:
            item (str, slice): path to the data or key of the data object

        Returns:
            dict, list, float, int: data or data object
        """
        child_id_lst = self.child_ids
        child_name_lst = [
            self.project.db.get_item_by_id(child_id)["job"]
            for child_id in self.child_ids
        ]
        if isinstance(item, int):
            total_lst = self._job_name_lst + child_name_lst
            item = total_lst[item]
        return self._get_item_when_str(item=item,
                                       child_id_lst=child_id_lst,
                                       child_name_lst=child_name_lst)

    def __len__(self):
        """
        Length of the ListMaster equal the number of childs appended.

        Returns:
            int: length of the ListMaster
        """
        return len(self.child_ids)

    def run_if_refresh(self):
        """
        Internal helper function the run if refresh function is called when the job status is 'refresh'. If the job was
        suspended previously, the job is going to be started again, to be continued.
        """
        self._logger.info("{}, status: {}, finished: {} parallel master "
                          "refresh".format(self.job_info_str, self.status,
                                           self.is_finished()))
        if self.is_finished():
            self.status.collect = True
            self.run()  # self.run_if_collect()
        elif (self.server.run_mode.non_modal or self.server.run_mode.queue
              ) and not self.submission_status.finished:
            self.run_static()
        else:
            self.refresh_job_status()
            if self.status.refresh:
                self.status.suspended = True
            if self.status.busy:
                self.status.refresh = True
                self.run_if_refresh()

    def _run_if_collect(self):
        """
        Internal helper function the run if collect function is called when the job status is 'collect'. It collects
        the simulation output using the standardized functions collect_output() and collect_logfiles(). Afterwards the
        status is set to 'finished'.
        """
        self._logger.info("{}, status: {}, finished".format(
            self.job_info_str, self.status))
        self.collect_output()

        job_id = self.get_job_id()
        db_dict = {}
        start_time = self.project.db.get_item_by_id(job_id)["timestart"]
        db_dict["timestop"] = datetime.now()
        db_dict["totalcputime"] = (db_dict["timestop"] - start_time).seconds
        self.project.db.item_update(db_dict, job_id)
        self.status.finished = True
        self._hdf5["status"] = self.status.string
        self._logger.info("{}, status: {}, parallel master".format(
            self.job_info_str, self.status))
        self.update_master()
        # self.send_to_database()

    def _validate_cores(self, job, cores_for_session):
        """
        Check if enough cores are available to start the next child job.

        Args:
            job (GenericJob): child job to be started
            cores_for_session (list): list of currently active cores - list of integers

        Returns:
            bool: [True/False]
        """
        return (self.get_child_cores() + job.server.cores +
                sum(cores_for_session) > self.server.cores)

    def _next_job_series(self, job):
        """
        Generate a list of child jobs to be executed in the next iteration.

        Args:
            job (GenericJob): child job to be started

        Returns:
            list: list of GenericJob objects
        """
        job_to_be_run_lst, cores_for_session = [], []
        while job is not None:
            self._logger.debug("create job: %s %s", job.job_info_str,
                               job.master_id)
            if not job.status.finished:
                self.submission_status.submit_next()
                job_to_be_run_lst.append(job)
                cores_for_session.append(job.server.cores)
                self._logger.info("{}: finished job {}".format(
                    self.job_name, job.job_name))
            job = next(self._job_generator, None)
            if job is not None and self._validate_cores(
                    job, cores_for_session):
                job = None
        return job_to_be_run_lst

    def _run_if_child_queue(self, job):
        """
        run function which is executed when the child jobs are submitted to the queue. In this case all child jobs are
        submitted at the same time without considering the number of cores specified for the Parallelmaster.

        Args:
            job (GenericJob): child job to be started
        """
        while job is not None:
            self._logger.debug("create job: %s %s", job.job_info_str,
                               job.master_id)
            if not job.status.finished:
                job.run()
                self._logger.info("{}: submitted job {}".format(
                    self.job_name, job.job_name))
            job = next(self._job_generator, None)
        self.submission_status.submitted_jobs = self.submission_status.total_jobs
        self.status.suspended = True
        if self.is_finished():
            self.status.collect = True
            self.run()

    def _run_if_master_non_modal_child_non_modal(self, job):
        """
        run function which is executed when the Parallelmaster as well as its childs are running in non modal mode.

        Args:
            job (GenericJob): child job to be started
        """
        job_to_be_run_lst = self._next_job_series(job)
        if self.project.db.get_job_status(job_id=self.job_id) != "busy":
            self.status.suspended = True
            for job in job_to_be_run_lst:
                job.run()
            if self.master_id:
                del self
        else:
            self.run_static()

    def _run_if_master_modal_child_modal(self, job):
        """
        run function which is executed when the Parallelmaster as well as its childs are running in modal mode.

        Args:
            job (GenericJob): child job to be started
        """
        while job is not None:
            self._logger.debug("create job: %s %s", job.job_info_str,
                               job.master_id)
            if not job.status.finished:
                self.submission_status.submit_next()
                job.run()
                self._logger.info("{}: finished job {}".format(
                    self.job_name, job.job_name))
            job = next(self._job_generator, None)
        if self.is_finished():
            self.status.collect = True
            self.run()
        elif self.status.busy:
            self.status.refresh = True
            self.run_if_refresh()
        else:
            self.status.suspended = True

    def _run_if_master_modal_child_non_modal(self, job):
        """
        run function which is executed when the Parallelmaster is running in modal mode and its childs are running in
        non modal mode.

        Args:
            job (GenericJob): child job to be started
        """
        pool = multiprocessing.Pool(self.server.cores)
        job_lst = []
        for i, p in enumerate(self._job_generator.parameter_list):
            if hasattr(self._job_generator, "job_name"):
                job = self.create_child_job(
                    self._job_generator.job_name(parameter=p))
            else:
                job = self.create_child_job(self.ref_job.job_name + "_" +
                                            str(i))
            job = self._job_generator.modify_job(job=job, parameter=p)
            job.server.run_mode.modal = True
            job.save()
            job.project_hdf5.create_working_directory()
            job.write_input()
            if s.database_is_disabled or (s.queue_adapter is not None
                                          and s.queue_adapter.remote_flag):
                job_lst.append(
                    (job.project.path, None,
                     job.project_hdf5.file_name + job.project_hdf5.h5_path,
                     False, False))
            else:
                job_lst.append(
                    (job.project.path, job.job_id, None, False, False))
        pool.map(job_wrap_function, job_lst)
        if s.database_is_disabled:
            self.project.db.update()
        self.status.collect = True
        self.run()  # self.run_if_collect()

    def run_static(self):
        """
        The run_static function is executed within the GenericJob class and depending on the run_mode of the
        Parallelmaster and its child jobs a more specific run function is selected.
        """
        self._logger.info("{} run parallel master (modal)".format(
            self.job_info_str))
        self.status.running = True
        self.submission_status.total_jobs = len(self._job_generator)
        self.submission_status.submitted_jobs = 0
        if self.job_id and not self.is_finished():
            self._logger.debug("{} child project {}".format(
                self.job_name, self.project.__str__()))
            job = next(self._job_generator, None)
            if (self.server.run_mode.non_modal or self.server.run_mode.queue) \
                    and job.server.run_mode.interactive:
                self.run_if_interactive()
            elif self.server.run_mode.queue:
                self._run_if_master_modal_child_non_modal(job=job)
            elif job.server.run_mode.queue:
                self._run_if_child_queue(job)
            elif self.server.run_mode.non_modal and job.server.run_mode.non_modal:
                self._run_if_master_non_modal_child_non_modal(job)
            elif (self.server.run_mode.modal and job.server.run_mode.modal
                  ) or (self.server.run_mode.interactive
                        and job.server.run_mode.interactive):
                self._run_if_master_modal_child_modal(job)
            elif self.server.run_mode.modal and job.server.run_mode.non_modal:
                self._run_if_master_modal_child_non_modal(job)
            else:
                raise TypeError()
        else:
            self.status.collect = True
            self.run()

    def run_if_interactive(self):
        if not (self.ref_job.server.run_mode.interactive
                or self.ref_job.server.run_mode.interactive_non_modal):
            raise ValueError(
                "The child job has to be run_mode interactive or interactive_non_modal."
            )
        if isinstance(self.ref_job, GenericMaster):
            self.run_static()
        elif self.server.cores == 1:
            self.interactive_ref_job_initialize()
            for parameter in self._job_generator.parameter_list:
                self._job_generator.modify_job(job=self.ref_job,
                                               parameter=parameter)
                self.ref_job.run()
            self.ref_job.interactive_close()
        else:
            if self.server.cores > len(self._job_generator.parameter_list):
                number_of_jobs = len(self._job_generator.parameter_list)
            else:
                number_of_jobs = self.server.cores
            max_tasks_per_job = (int(
                len(self._job_generator.parameter_list) // number_of_jobs) + 1)
            parameters_sub_lst = [
                self._job_generator.parameter_list[i:i + max_tasks_per_job]
                for i in range(0, len(self._job_generator.parameter_list),
                               max_tasks_per_job)
            ]
            list_of_sub_jobs = [
                self.create_child_job("job_" + str(i))
                for i in range(number_of_jobs)
            ]
            primary_job = list_of_sub_jobs[0]
            if not primary_job.server.run_mode.interactive_non_modal:
                raise ValueError(
                    "The child job has to be run_mode interactive_non_modal.")
            if primary_job.server.cores != 1:
                raise ValueError("The child job can only use a single core.")
            for iteration in range(len(parameters_sub_lst[0])):
                for job_ind, job in enumerate(list_of_sub_jobs):
                    if iteration < len(parameters_sub_lst[job_ind]):
                        self._job_generator.modify_job(
                            job=job,
                            parameter=parameters_sub_lst[job_ind][iteration])
                        job.run()
                for job_ind, job in enumerate(list_of_sub_jobs):
                    if iteration < len(parameters_sub_lst[job_ind]):
                        job.interactive_fetch()
            for job in list_of_sub_jobs:
                job.interactive_close()
            self.interactive_ref_job_initialize()
            self.ref_job.run()
            for key in primary_job.interactive_cache.keys():
                output_sum = []
                for job in list_of_sub_jobs:
                    output = job["output/interactive/" + key]
                    if isinstance(output, np.ndarray):
                        output = output.tolist()
                    if isinstance(output, list):
                        output_sum += output
                    else:
                        raise TypeError(
                            "output should be list or numpy.ndarray but it is ",
                            type(output),
                        )
                self.ref_job.interactive_cache[key] = output_sum
            interactive_cache_backup = self.ref_job.interactive_cache.copy()
            self.ref_job.interactive_flush(path="generic",
                                           include_last_step=True)
            self.ref_job.interactive_cache = interactive_cache_backup
            self.ref_job.interactive_close()
        self.status.collect = True
        self.run()

    def create_child_job(self, job_name):
        """
        Internal helper function to create the next child job from the reference job template - usually this is called
        as part of the create_jobs() function.

        Args:
            job_name (str): name of the next job

        Returns:
            GenericJob: next job
        """
        if not self.server.new_hdf:
            project = self.project
            where_dict = {
                "job": str(job_name),
                "project": str(self.project_hdf5.project_path),
                "subjob": str(self.project_hdf5.h5_path + "/" + job_name),
            }
            response = self.project.db.get_items_dict(where_dict,
                                                      return_all_columns=False)
            if len(response) > 0:
                job_id = response[-1]["id"]
            else:
                job_id = None
        else:
            project = self.project.open(self.job_name + "_hdf5")
            job_id = project.get_job_id(job_specifier=job_name)
        if job_id is not None:
            ham = project.load(job_id)
            self._logger.debug("job {} found, status: {}".format(
                job_name, ham.status))
            if ham.server.run_mode.queue:
                self.project.refresh_job_status_based_on_job_id(job_id,
                                                                que_mode=True)
            else:
                self.project.refresh_job_status_based_on_job_id(job_id,
                                                                que_mode=False)
            if ham.status.aborted:
                ham.status.created = True

            self._logger.debug("job - status: {}".format(ham.status))
            return ham

        job = self.ref_job.copy()
        job = self._load_all_child_jobs(job_to_load=job)
        if self.server.new_hdf:
            job.project_hdf5 = self.project_hdf5.create_hdf(
                path=self.project.open(self.job_name + "_hdf5").path,
                job_name=job_name)
        else:
            job.project_hdf5 = self.project_hdf5.open(job_name)
        if isinstance(job, GenericMaster):
            for sub_job in job._job_object_dict.values():
                self._child_job_update_hdf(parent_job=job, child_job=sub_job)
        self._logger.debug("create_job:: {} {} {} {}".format(
            self.project_hdf5.path,
            self._name,
            self.project_hdf5.h5_path,
            str(self.get_job_id()),
        ))
        job._name = job_name
        job.master_id = self.get_job_id()
        job.status.initialized = True
        if self.server.run_mode.non_modal and job.server.run_mode.modal:
            job.server.run_mode.non_modal = True
        elif self.server.run_mode.queue:
            job.server.run_mode.thread = True
        self._logger.info("{}: run job {}".format(self.job_name, job.job_name))
        return job

    def _db_server_entry(self):
        """
        connect all the info regarding the server into a single word that can be used e.g. as entry in a database

        Returns:
            (str): server info as single word

        """
        db_entry = super(ParallelMaster, self)._db_server_entry()
        if self.submission_status.total_jobs:
            return (db_entry + "#" +
                    str(self.submission_status.submitted_jobs) + "/" +
                    str(self.submission_status.total_jobs))
        else:
            return db_entry + "#" + str(self.submission_status.submitted_jobs)
Exemplo n.º 2
0
class SerialMasterBase(GenericMaster):
    """
    The serial master class is a metajob consisting of a dynamic list of jobs which are executed in serial mode. The job
    is derived from the GenericMaster.

    Args:
        project (ProjectHDFio): ProjectHDFio instance which points to the HDF5 file the job is stored in
        job_name (str): name of the job, which has to be unique within the project

    Attributes:

        .. attribute:: job_name

            name of the job, which has to be unique within the project

        .. attribute:: status

            execution status of the job, can be one of the following [initialized, appended, created, submitted,
                                                                      running, aborted, collect, suspended, refresh,
                                                                      busy, finished]

        .. attribute:: job_id

            unique id to identify the job in the pyiron database

        .. attribute:: parent_id

            job id of the predecessor job - the job which was executed before the current one in the current job series

        .. attribute:: master_id

            job id of the master job - a meta job which groups a series of jobs, which are executed either in parallel
            or in serial.

        .. attribute:: child_ids

            list of child job ids - only meta jobs have child jobs - jobs which list the meta job as their master

        .. attribute:: project

            Project instance the jobs is located in

        .. attribute:: project_hdf5

            ProjectHDFio instance which points to the HDF5 file the job is stored in

        .. attribute:: job_info_str

            short string to describe the job by it is job_name and job ID - mainly used for logging

        .. attribute:: working_directory

            working directory of the job is executed in - outside the HDF5 file

        .. attribute:: path

            path to the job as a combination of absolute file system path and path within the HDF5 file.

        .. attribute:: version

            Version of the hamiltonian, which is also the version of the executable unless a custom executable is used.

        .. attribute:: executable

            Executable used to run the job - usually the path to an external executable.

        .. attribute:: library_activated

            For job types which offer a Python library pyiron can use the python library instead of an external
            executable.

        .. attribute:: server

            Server object to handle the execution environment for the job.

        .. attribute:: queue_id

            the ID returned from the queuing system - it is most likely not the same as the job ID.

        .. attribute:: logger

            logger object to monitor the external execution and internal pyiron warnings.

        .. attribute:: restart_file_list

            list of files which are used to restart the calculation from these files.

        .. attribute:: job_type

            Job type object with all the available job types: ['ExampleJob', 'SerialMaster', 'ParallelMaster',
                                                               'ScriptJob', 'ListMaster']

        .. attribute:: child_names

            Dictionary matching the child ID to the child job name.

        .. attribute:: start_job

            The first job of the series.

        .. attribute:: input

            The input of the start job - the first job of the series.
    """

    def __init__(self, project, job_name):
        self._input = GenericParameters("parameters")  # e.g. convergence goal

        super(SerialMasterBase, self).__init__(project, job_name=job_name)
        self.__version__ = "0.3"

        self._output = GenericOutput()
        self._max_iterations = 100
        self._start_job = None
        self._convergence_goal = None
        self._convergence_goal_qwargs = {}
        self._convergence_goal_str = None

    @property
    def start_job(self):
        """
        Get the first job of the series.

        Returns:
            GenericJob: start job
        """
        if self._start_job:
            return self._start_job
        elif len(self) > 0:
            self._start_job = self[-1]
            return self._start_job
        else:
            return None

    @start_job.setter
    def start_job(self, job):
        """
        Set the first job of the series - that is the same like appending the job.

        Args:
            job (GenericJob): start job
        """
        self.append(job)

    @property
    def ref_job(self):
        return self.start_job

    @ref_job.setter
    def ref_job(self, job):
        self.append(job)

    @property
    def input(self):
        """
        Get the input of the start job - the first job of the series.

        Returns:
            GenericParameters: input of the start job
        """
        if self.start_job:
            return self._start_job.input
        else:
            return None

    @input.setter
    def input(self, value):
        """
        Set the input of the start job - the first job of the series.

        Args:
            value (GenericParameters): input of the start job
        """
        if self.start_job:
            self._start_job.input = value
        else:
            raise ValueError(
                "Input can only be set after a start job has been assinged."
            )

    def set_input_to_read_only(self):
        """
        This function enforces read-only mode for the input classes, but it has to be implement in the individual
        classes.
        """
        self._input.read_only = True

    def get_initial_child_name(self):
        """
        Get name of the initial child.

        Returns:
            str: name of the initial child
        """
        return self.project.db.get_item_by_id(self.child_ids[0])["job"]

    def create_next(self, job_name=None):
        """
        Create the next job in the series by duplicating the previous job.

        Args:
            job_name (str): name of the new job - optional - default='job_<index>'

        Returns:
            GenericJob: next job
        """
        if len(self) == 0:
            raise ValueError("No job available in job list, please append a job first.")
        if len(self._job_name_lst) > len(self.child_ids):
            return self.pop(-1)
        ham_old = self.project.load(self.child_ids[-1], convert_to_object=True)

        if ham_old.status.aborted:
            ham_old.status.created = True
            return ham_old
        elif not ham_old.status.finished:
            return None
        if job_name is None:
            job_name = "_".join(
                ham_old.job_name.split("_")[:-1] + [str(len(self.child_ids))]
            )
        new_job = ham_old.restart(job_name=job_name)
        new_job.server.cores = self.server.cores
        return new_job

    def collect_output(self):
        """
        Collect the output files of the individual jobs and set the output of the last job to be the output of the
        SerialMaster - so the SerialMaster contains the same output as its last child.
        """
        ham_lst = [self.project_hdf5.inspect(child_id) for child_id in self.child_ids]
        if (
            "output" in ham_lst[0].list_groups()
            and "generic" in ham_lst[0]["output"].list_groups()
        ):
            nodes = ham_lst[0]["output/generic"].list_nodes()
            with self.project_hdf5.open("output/generic") as hh:
                for node in nodes:
                    if ham_lst[0]["output/generic/{}".format(node)] is not None:
                        hh[node] = np.concatenate(
                            [ham["output/generic/{}".format(node)] for ham in ham_lst],
                            axis=0,
                        )
                    else:
                        hh[node] = None

    def collect_logfiles(self):
        """
        The collect logfiles function is required by the GenericJob class, therefore we use an empty template here.
        """
        pass

    def copy(self):
        """
        Copy the GenericJob object which links to the job and its HDF5 file

        Returns:
            GenericJob: New GenericJob object pointing to the same job
        """
        new_job = super(SerialMasterBase, self).copy()
        new_job.start_job = self.start_job
        return new_job

    def from_hdf(self, hdf=None, group_name=None):
        """
        Restore the SerialMaster from an HDF5 file

        Args:
            hdf (ProjectHDFio): HDF5 group object - optional
            group_name (str): HDF5 subgroup name - optional
        """
        super(SerialMasterBase, self).from_hdf(hdf=hdf, group_name=group_name)
        with self.project_hdf5.open("input") as hdf5_input:
            self._input.from_hdf(hdf5_input)
            convergence_goal_str = hdf5_input["convergence_goal"]
            if convergence_goal_str == "None":
                self._convergence_goal = None
            else:
                self._convergence_goal_str = convergence_goal_str
                self._convergence_goal = get_function_from_string(convergence_goal_str)
                self._convergence_goal_qwargs = hdf5_input["convergence_goal_qwargs"]

    def get_from_childs(self, path):
        """
        Extract the output from all child jobs and appending it to a list

        Args:
            path (str): path inside the HDF5 files of the individual jobs like 'output/generic/volume'

        Returns:
            list: list of output from the child jobs
        """
        var_lst = []
        for child_id in self.child_ids:
            ham = self.project.load(child_id, convert_to_object=False)
            var = ham.__getitem__(path)
            var_lst.append(var)
        return np.array(var_lst)

    def iter_jobs(self, convert_to_object=True):
        """
        Iterate over the jobs within the SerialMaster

        Args:
            convert_to_object (bool): load the full GenericJob object (default) or just the HDF5 / JobCore object

        Returns:
            yield: Yield of GenericJob or JobCore
        """
        for job_id in self.child_ids:
            yield self.project.load(job_id, convert_to_object=convert_to_object)

    def run_if_interactive(self):
        pass

    def _get_job_template(self):
        self._logger.info("run serial master {}".format(self.job_info_str))
        job = self.pop(-1)
        job._master_id = self.job_id
        job._hdf5 = self.child_hdf(job.job_name)
        self._logger.info("SerialMaster: run job {}".format(job.job_name))
        return job

    @staticmethod
    def _run_child_job(job):
        job.run()

    def _run_if_master_queue(self, job):
        job.server.run_mode.modal = True
        job.run()
        self.run_if_refresh()

    def _run_if_master_non_modal_child_non_modal(self, job):
        job.run()
        if self.master_id:
            del self

    def _run_if_master_modal_child_modal(self, job):
        job.run()
        self.run_if_refresh()

    def _run_if_master_modal_child_non_modal(self, job):
        job.run()
        while not job.status.finished and not job.status.aborted:
            job.refresh_job_status()
            time.sleep(5)
        self.run_if_refresh()

    def run_static(self, **qwargs):
        self.status.running = True
        if len(self) > len(self.child_ids):
            job = self._get_job_template()
            self.status.suspended = True
            if self.server.run_mode.queue:
                self._run_if_master_queue(job)
            elif self.server.run_mode.non_modal and job.server.run_mode.non_modal:
                self._run_if_master_non_modal_child_non_modal(job)
            elif self.server.run_mode.modal and job.server.run_mode.modal:
                self._run_if_master_modal_child_modal(job)
            elif self.server.run_mode.modal and job.server.run_mode.non_modal:
                self._run_if_master_modal_child_non_modal(job)
            else:
                raise TypeError()
        else:
            self.status.collect = True
            self.run()

    def set_goal(self, convergence_goal, **qwargs):
        """
        Set a convergence goal for the SerialMaster - this is necessary to stop the series.

        Args:
            convergence_goal (Function): the convergence goal can be any Python function, but if external packages are
                                         used like numpy they have to be imported within the function.
            **qwargs: arguments of the convergence goal function.
        """
        self._convergence_goal = convergence_goal
        self._convergence_goal_qwargs = qwargs
        self._convergence_goal_str = inspect.getsource(convergence_goal)
        if self.project_hdf5.file_exists:
            self.to_hdf()

    def show(self):
        """
        list all jobs in the SerialMaster

        Returns:
            list: list of jobs ['job', <index>, <GenericJob>]
        """
        return [["job", str(i), str(job)] for i, job in enumerate(self)]

    def to_hdf(self, hdf=None, group_name=None):
        """
        Store the SerialMaster in an HDF5 file

        Args:
            hdf (ProjectHDFio): HDF5 group object - optional
            group_name (str): HDF5 subgroup name - optional
        """
        super(SerialMasterBase, self).to_hdf(hdf=hdf, group_name=group_name)
        with self.project_hdf5.open("input") as hdf5_input:
            self._input.to_hdf(hdf5_input)
            if self._convergence_goal is not None:
                try:
                    hdf5_input["convergence_goal"] = inspect.getsource(
                        self._convergence_goal
                    )
                except IOError:
                    hdf5_input["convergence_goal"] = self._convergence_goal_str

                hdf5_input["convergence_goal_qwargs"] = self._convergence_goal_qwargs
            else:
                hdf5_input["convergence_goal"] = "None"

    def write_input(self):
        """
        Write the input files - for the SerialMaster this only contains convergence goal.
        """
        self._input.write_file(file_name="input.inp", cwd=self.working_directory)

    def __len__(self):
        """
        Length of the SerialMaster equal the number of childs appended.

        Returns:
            int: length of the SerialMaster
        """
        return len(self.child_ids + self._job_name_lst)

    def __getitem__(self, item):
        """
        Get/ read data from the GenericMaster

        Args:
            item (str, slice): path to the data or key of the data object

        Returns:
            dict, list, float, int: data or data object
        """
        child_id_lst = self.child_ids
        child_name_lst = [
            self.project.db.get_item_by_id(child_id)["job"]
            for child_id in self.child_ids
        ]
        if isinstance(item, int):
            total_lst = child_name_lst + self._job_name_lst
            item = total_lst[item]
        return self._get_item_when_str(
            item=item, child_id_lst=child_id_lst, child_name_lst=child_name_lst
        )

    def run_if_refresh(self):
        """
        Internal helper function the run if refresh function is called when the job status is 'refresh'. If the job was
        suspended previously, the job is going to be started again, to be continued.
        """
        conv_goal_exists = bool(self._convergence_goal)
        self._logger.info("Does the convergence goal exit: {}".format(conv_goal_exists))
        if not conv_goal_exists:
            self.status.collect = True
            self.run()
        else:
            subjobs_statuses = set(
                [
                    self.project.db.get_job_status(job_id=child_id)
                    for child_id in self.child_ids
                ]
            )
            if len(subjobs_statuses) == 0 or subjobs_statuses == {"finished"}:
                ham = self._convergence_goal(self, **self._convergence_goal_qwargs)
                if ham is not True:
                    self.append(ham)
                    self.to_hdf()
                    self.run_static()
                else:
                    self.status.collect = True
                    self.run()

    def _init_child_job(self, parent):
        """
        Update our reference job and copy their run mode.

        Args:
            parent (:class:`.GenericJob`): job instance that this job was created from
        """
        super()._init_child_job(parent)
        if parent.server.run_mode.non_modal:
            self.server.run_mode.non_modal = True
        elif (
            parent.server.run_mode.interactive
            or parent.server.run_mode.interactive_non_modal
        ):
            self.server.run_mode.interactive = True
Exemplo n.º 3
0
class ListMaster(GenericMaster):
    """
    The ListMaster is the most simple MetaJob derived from the GenericMaster. It behaves like a Python list object. Jobs
    can be append to the ListMaster just like elements are added to a list and then all jobs can be executed together.
    This also works for already executed jobs, unless they are already linked to a different MetaJob - meaning they
    already have a master ID assigned to them.

    Args:
        project (ProjectHDFio): ProjectHDFio instance which points to the HDF5 file the job is stored in
        job_name (str): name of the job, which has to be unique within the project

    Attributes:

        .. attribute:: job_name

            name of the job, which has to be unique within the project

        .. attribute:: status

            execution status of the job, can be one of the following [initialized, appended, created, submitted,
                                                                      running, aborted, collect, suspended, refresh,
                                                                      busy, finished]

        .. attribute:: job_id

            unique id to identify the job in the pyiron database

        .. attribute:: parent_id

            job id of the predecessor job - the job which was executed before the current one in the current job series

        .. attribute:: master_id

            job id of the master job - a meta job which groups a series of jobs, which are executed either in parallel
            or in serial.

        .. attribute:: child_ids

            list of child job ids - only meta jobs have child jobs - jobs which list the meta job as their master

        .. attribute:: project

            Project instance the jobs is located in

        .. attribute:: project_hdf5

            ProjectHDFio instance which points to the HDF5 file the job is stored in

        .. attribute:: job_info_str

            short string to describe the job by it is job_name and job ID - mainly used for logging

        .. attribute:: working_directory

            working directory of the job is executed in - outside the HDF5 file

        .. attribute:: path

            path to the job as a combination of absolute file system path and path within the HDF5 file.

        .. attribute:: version

            Version of the hamiltonian, which is also the version of the executable unless a custom executable is used.

        .. attribute:: executable

            Executable used to run the job - usually the path to an external executable.

        .. attribute:: library_activated

            For job types which offer a Python library pyiron can use the python library instead of an external
            executable.

        .. attribute:: server

            Server object to handle the execution environment for the job.

        .. attribute:: queue_id

            the ID returned from the queuing system - it is most likely not the same as the job ID.

        .. attribute:: logger

            logger object to monitor the external execution and internal pyiron warnings.

        .. attribute:: restart_file_list

            list of files which are used to restart the calculation from these files.

        .. attribute:: job_type

            Job type object with all the available job types: ['ExampleJob', 'SerialMaster', 'ParallelMaster',
                                                               'ScriptJob', 'ListMaster']

        .. attribute:: child_names

            Dictionary matching the child ID to the child job name.

        .. attribute:: submission_status

            Monitors how many jobs have been submitted and how many have to be submitted in future.
    """
    def __init__(self, project, job_name):
        self._input = GenericParameters("parameters")
        super(ListMaster, self).__init__(project, job_name=job_name)
        self.__version__ = "0.1"
        self._input["mode"] = "parallel"
        self.submission_status = SubmissionStatus(db=project.db,
                                                  job_id=self.job_id)
        self.refresh_submission_status()

    def set_input_to_read_only(self):
        """
        This function enforces read-only mode for the input classes, but it has to be implement in the individual
        classes.
        """
        self._input.read_only = True

    def reset_job_id(self, job_id=None):
        """
        Reset the job id sets the job_id to None as well as all connected modules like JobStatus and SubmissionStatus.
        """
        super(ListMaster, self).reset_job_id(job_id=job_id)
        self.submission_status = SubmissionStatus(db=self.project.db,
                                                  job_id=job_id)

    def refresh_submission_status(self):
        """
        Refresh the submission status - if a job ID job_id is set then the submission status is loaded from the
        database.
        """
        if self.job_id:
            self.submission_status = SubmissionStatus(db=self._hdf5.db,
                                                      job_id=self.job_id)
            self.submission_status.refresh()

    def save(self):
        """
        Save the object, by writing the content to the HDF5 file and storing an entry in the database.

        Returns:
            (int): Job ID stored in the database
        """
        job_id = super(ListMaster, self).save()
        self.refresh_submission_status()
        return job_id

    def append(self, job):
        """
        Append a job to the ListMaster - just like you would append an element to a list.

        Args:
            job (JobCore, GenericJob, int): job to append
        """
        if isinstance(job, JobCore) and job.job_id:
            job = job.job_id
        if isinstance(job, int):
            job = self._hdf5.project.load(job)
        if isinstance(job, GenericJob):
            if job.status.created or job.status.initialized:
                super(ListMaster, self).append(job=job)
            elif job.job_id and job.status.finished:
                if job.master_id is None:
                    if self.job_id is None:
                        self._job_id = self.save()
                    child_db_entry = self.project.db.get_item_by_id(job.job_id)
                    self.project.db.delete_item(job.job_id)
                    job._job_id = None
                    job.master_id = self._job_id
                    job.save()
                    del child_db_entry["id"]
                    del child_db_entry["masterid"]
                    self.project.db.item_update(child_db_entry, job.job_id)
                    self.submission_status.submit_next()
                    if len(self._job_name_lst) == 0:
                        self.status.finished = True
                        self.project.db.item_update(self._runtime(),
                                                    self.job_id)
                else:
                    raise ValueError(
                        "This job ",
                        job.job_name,
                        " is already connected to a master ",
                        job.master_id,
                        " and can not be appended here.",
                    )
        else:
            raise TypeError(
                "job has to be either GenericJob, JobCore or int, but it - ",
                job,
                " is ",
                type(job),
            )

    def is_finished(self):
        """
        Check if the ListMaster job is finished - by checking the job status and the submission status.

        Returns:
            bool: [True/False]
        """
        if self.status.finished:
            return True
        # self.status.busy = True
        self.submission_status.refresh()
        if not self.submission_status.finished:
            return False
        else:
            status_set = set([
                self._hdf5.db.get_item_by_id(child_id)["status"]
                for child_id in self.child_ids
            ])
            # status_set = set([job.get_status() for job in self.iter_jobs(convert_to_object=False)])
            if "finished" in status_set:
                return len(status_set) == 1
            else:
                return False

    def run_static(self):
        """
        The run static function is called by run to execute the simulation. For the
        ListMaster this means executing all the childs appened in parallel.
        """
        self._input["num_points"] = len(self)
        self._logger.info("{} run parallel master (modal)".format(
            self.job_info_str))
        self.status.running = True
        if len(self._job_name_lst) > 0:
            job_lst = []
            for i in range(len(self._job_name_lst)):
                ham = self.pop(i=0)
                if (ham.server.run_mode.non_modal
                        and self.get_child_cores() + ham.server.cores >
                        self.server.cores):
                    break
                self.submission_status.submit_next()
                if not ham.status.finished:
                    ham.run()
                self._logger.info("ListMaster: finished job {}".format(
                    ham.job_name))
                if ham.server.run_mode.thread:
                    job_lst.append(ham._process)
                else:
                    self.refresh_job_status()
            _ = [process.communicate() for process in job_lst if process]
            self.status.suspended = True
        if self.server.run_mode.modal or (
            (self.server.run_mode.non_modal or self.server.run_mode.queue)
                and self.is_finished()):
            self.status.finished = True

    def write_input(self):
        """
        Write the input files - for the ListMaster this only contains the execution mode, which is 'parallel' by
        default.
        """
        self._input.write_file(file_name="input.inp",
                               cwd=self.working_directory)

    def copy(self):
        """
        Copy the ListMaster object which links to the job and its HDF5 file

        Returns:
            ListMaster: New ListMaster object pointing to the same job
        """
        new_job = super(ListMaster, self).copy()
        new_job._child_ids = self.child_ids[:]
        return new_job

    def iter_jobs(self, convert_to_object=True):
        """
        Iterate over the jobs within the ListMaster

        Args:
            convert_to_object (bool): load the full GenericJob object (default) or just the HDF5 / JobCore object

        Returns:
            yield: Yield of GenericJob or JobCore
        """
        for job_id in self.child_ids:
            yield self._hdf5.load(job_id, convert_to_object=convert_to_object)

    def run_if_refresh(self):
        """
        Internal helper function the run if refresh function is called when the job status is 'refresh'. If the job was
        suspended previously, the job is going to be started again, to be continued.
        """
        self._logger.info("{}, status: {}, finished: {} parallel master "
                          "refresh".format(self.job_info_str, self.status,
                                           self.is_finished()))
        if self.is_finished() and not self.server.run_mode.modal:
            self.status.finished = True
        elif (self.server.run_mode.non_modal or self.server.run_mode.queue
              ) and not self.submission_status.finished:
            self.run_static()
        else:
            self.refresh_job_status()
            if self.status.refresh:
                self.status.suspended = True

    def collect_output(self):
        """
        Collect output is not implemented for ListMaster jobs
        """
        pass

    def run_if_interactive(self):
        """
        run_if_interactive() is not implemented for ListMaster jobs
        """
        pass

    def __getitem__(self, item):
        """
        Get/ read data from the HDF5 file

        Args:
            item (str, slice): path to the data or key of the data object

        Returns:
            dict, list, float, int: data or data object
        """
        child_id_lst = self.child_ids
        child_name_lst = [
            self.project.db.get_item_by_id(child_id)["job"]
            for child_id in self.child_ids
        ]
        if isinstance(item, int):
            total_lst = child_name_lst + self._job_name_lst
            item = total_lst[item]
        return self._get_item_when_str(item=item,
                                       child_id_lst=child_id_lst,
                                       child_name_lst=child_name_lst)

    def __len__(self):
        """
        Length of the ListMaster equal the number of childs appended.

        Returns:
            int: length of the ListMaster
        """
        return len(self.child_ids + self._job_name_lst)