Example No. 1
# Assumed imports for this example; with_metaclass comes from six, while
# Singleton, filter_function, get_job_status_from_file, get_hamilton_from_file
# and get_hamilton_version_from_file are defined in the surrounding pyiron module.
import datetime
import os

import h5io
import numpy as np
import pandas
from pyfileindex import PyFileIndex
from six import with_metaclass


class FileTable(with_metaclass(Singleton)):
    def __init__(self, project):
        self._fileindex = None
        self._job_table = None
        self._project = os.path.abspath(project)
        self._columns = ['id', 'status', 'chemicalformula', 'job', 'subjob', 'projectpath', 'project', 'timestart',
                         'timestop', 'totalcputime', 'computer', 'hamilton', 'hamversion', 'parentid', 'masterid',
                         'username']
        self.force_reset()

    def force_reset(self):
        self._fileindex = PyFileIndex(path=self._project, filter_function=filter_function)
        df = pandas.DataFrame(self.init_table(fileindex=self._fileindex.dataframe))
        if len(df) != 0:
            self._job_table = df[np.array(self._columns)]
        else:
            self._job_table = pandas.DataFrame({k: [] for k in self._columns})

    def init_table(self, fileindex, working_dir_lst=None):
        if working_dir_lst is None:
            working_dir_lst = []
        fileindex = fileindex[~fileindex.is_directory]
        fileindex = fileindex.iloc[fileindex.path.values.argsort()]
        job_lst = []
        for path, mtime in zip(fileindex.path, fileindex.mtime):
            job_dict = self.get_extract(path, mtime)
            # ids are assigned in path order, one per file seen so far
            job_dict['id'] = len(working_dir_lst) + 1
            working_dir_lst.append(job_dict['project'][:-1] + job_dict['subjob'] + '_hdf5/')
            # a job stored inside another job's _hdf5 working directory is its child
            if job_dict['project'] in working_dir_lst:
                job_dict['masterid'] = working_dir_lst.index(job_dict['project']) + 1
            else:
                job_dict['masterid'] = None
            job_lst.append(job_dict)
        return job_lst

    @staticmethod
    def get_extract(path, mtime):
        basename = os.path.basename(path)
        job = os.path.splitext(basename)[0]
        time = datetime.datetime.fromtimestamp(mtime)
        return {'status': get_job_status_from_file(hdf5_file=path, job_name=job),
                'chemicalformula': None,
                'job': job,
                'subjob': '/' + job,
                'projectpath': None,
                'project': os.path.dirname(path) + '/',
                'timestart': time,
                'timestop': time,
                'totalcputime': 0.0,
                'computer': None,
                'username': None,
                'parentid': None,
                'hamilton': get_hamilton_from_file(hdf5_file=path, job_name=job),
                'hamversion': get_hamilton_version_from_file(hdf5_file=path, job_name=job)}

    def add_item_dict(self, par_dict):
        par_dict = dict((key.lower(), value) for key, value in par_dict.items())
        if len(self._job_table) != 0:
            job_id = np.max(self._job_table.id.values) + 1
        else:
            job_id = 1
        default_values = {'id': job_id,
                          'status': 'initialized',
                          'chemicalformula': None,
                          'timestart': datetime.datetime.now(),
                          'computer': None,
                          'parentid': None,
                          'username': None,
                          'timestop': None,
                          'totalcputime': None,
                          'masterid': None}
        for k, v in default_values.items():
            if k not in par_dict.keys():
                par_dict[k] = v
        self._job_table = pandas.concat([self._job_table,
                                         pandas.DataFrame([par_dict])[self._columns]]).reset_index(drop=True)
        return int(par_dict['id'])

    def item_update(self, par_dict, item_id):
        if isinstance(item_id, list):
            item_id = item_id[0]
        if isinstance(item_id, str):
            item_id = float(item_id)
        for k, v in par_dict.items():
            self._job_table.loc[self._job_table.id == int(item_id), k] = v

    def delete_item(self, item_id):
        item_id = int(item_id)
        if item_id in [int(v) for v in self._job_table.id.values]:
            self._job_table = self._job_table[self._job_table.id != item_id].reset_index(drop=True)
        else:
            raise ValueError("Unknown job id: {}".format(item_id))

    def get_item_by_id(self, item_id):
        item_id = int(item_id)
        return {k: list(v.values())[0] for k, v in self._job_table[self._job_table.id == item_id].to_dict().items()}

    def get_items_dict(self, item_dict, return_all_columns=True):
        df = self._job_table
        if not isinstance(item_dict, dict):
            raise TypeError("item_dict must be a dict, not {}".format(type(item_dict)))
        for k, v in item_dict.items():
            if k in ['id', 'parentid', 'masterid']:
                df = df[df[k] == int(v)]
            elif "%" not in str(v):
                df = df[df[k] == v]
            else:
                df = df[df[k].str.contains(v.replace('%', ''))]
        df_dict = df.to_dict()
        if return_all_columns:
            return [{k: v[i] for k, v in df_dict.items()} for i in df_dict['id'].keys()]
        else:
            return [{'id': i} for i in df_dict['id'].values()]

    def update(self):
        self._fileindex.update()
        if len(self._job_table) != 0:
            files_lst, working_dir_lst = zip(*[[project + subjob[1:] + '.h5', project + subjob[1:] + '_hdf5']
                                               for project, subjob in zip(self._job_table.project.values,
                                                                          self._job_table.subjob.values)])
            df_new = self._fileindex.dataframe[
                ~self._fileindex.dataframe.is_directory & ~self._fileindex.dataframe.path.isin(files_lst)]
        else:
            files_lst, working_dir_lst = [], []
            df_new = self._fileindex.dataframe[~self._fileindex.dataframe.is_directory]
        if len(df_new) > 0:
            job_lst = self.init_table(fileindex=df_new, working_dir_lst=list(working_dir_lst))
            df = pandas.DataFrame(job_lst)[self._columns]
            if len(files_lst) != 0 and len(working_dir_lst) != 0:
                self._job_table = pandas.concat([self._job_table, df]).reset_index(drop=True)
            else:
                self._job_table = df

    def get_db_columns(self):
        return self.get_table_headings()

    def get_table_headings(self):
        return self._job_table.columns.values

    def job_table(self, project=None, recursive=True, columns=None, all_columns=False, sort_by="id", max_colwidth=200,
                  job_name_contains=''):
        if project is None:
            project = self._project
        if columns is None:
            columns = ["job", "project", "chemicalformula"]
        if all_columns:
            columns = self._columns
        if len(self._job_table) != 0:
            if recursive:
                df = self._job_table[self._job_table.project.str.contains(project)]
            else:
                df = self._job_table[self._job_table.project == project]
        else:
            df = self._job_table
        pandas.set_option("display.max_colwidth", max_colwidth)
        if len(df) == 0:
            return df
        if job_name_contains != '':
            df = df[df.job.str.contains(job_name_contains)]
        if sort_by in columns:
            return df[columns].sort_values(by=sort_by)
        return df[columns]

    def get_jobs(self, project=None, recursive=True, columns=None):
        if project is None:
            project = self._project
        if columns is None:
            columns = ["id", "project"]
        df = self.job_table(project=project, recursive=recursive, columns=columns)
        if len(df) == 0:
            return {key: [] for key in columns}
        # tolist() and to_list() are equivalent pandas spellings of the same method
        return {key: df[key].tolist() for key in df.keys()}

    def get_job_ids(self, project=None, recursive=True):
        return self.get_jobs(project=project, recursive=recursive, columns=['id'])["id"]

    def get_job_id(self, job_specifier, project=None):
        if project is None:
            project = self._project
        if isinstance(job_specifier, (int, np.integer)):
            return job_specifier  # is id

        job_specifier.replace(".", "_")
        # if job_specifier[0] is not '/':
        #     sub_job_name = '/' + job_specifier
        # else:
        #     sub_job_name = job_specifier
        # job_dict = _job_dict(database, sql_query, user, project_path, recursive=False,  # job=job_specifier,
        #                      sub_job_name=sub_job_name)
        # if len(job_dict) == 0:
        #     job_dict = _job_dict(database, sql_query, user, project_path, recursive=True,  # job=job_specifier,
        #                          sub_job_name=sub_job_name)
        job_id_lst = self._job_table[
            (self._job_table.project == project) & (self._job_table.job == job_specifier)].id.values
        if len(job_id_lst) == 0:
            job_id_lst = self._job_table[
                self._job_table.project.str.contains(project) & (self._job_table.job == job_specifier)].id.values
        if len(job_id_lst) == 0:
            return None
        elif len(job_id_lst) == 1:
            return int(job_id_lst[0])
        else:
            raise ValueError(
                "job name '{0}' in this project is not unique".format(job_specifier)
            )

    def get_child_ids(self, job_specifier, project=None, status=None):
        """
        Get the childs for a specific job

        Args:
            database (DatabaseAccess): Database object
            sql_query (str): SQL query to enter a more specific request
            user (str): username of the user whoes user space should be searched
            project_path (str): root_path - this is in contrast to the project_path in GenericPath
            job_specifier (str): name of the master job or the master jobs job ID
            status (str): filter childs which match a specific status - None by default

        Returns:
            list: list of child IDs
        """
        if project is None:
            project = self._project
        id_master = self.get_job_id(project=project, job_specifier=job_specifier)
        if id_master is None:
            return []
        else:
            if status is not None:
                id_lst = self._job_table[
                    (self._job_table.masterid == id_master) & (self._job_table.status == status)].id.values
            else:
                id_lst = self._job_table[(self._job_table.masterid == id_master)].id.values
            return sorted(id_lst)

    def set_job_status(self, job_specifier, status, project=None):
        """
        Set the status of a particular job

        Args:
            database (DatabaseAccess): Database object
            sql_query (str): SQL query to enter a more specific request
            user (str): username of the user whoes user space should be searched
            project_path (str): root_path - this is in contrast to the project_path in GenericPath
            job_specifier (str): name of the job or job ID
            status (str): job status can be one of the following ['initialized', 'appended', 'created', 'submitted',
                         'running', 'aborted', 'collect', 'suspended', 'refresh', 'busy', 'finished']

        """
        if project is None:
            project = self._project
        job_id = self.get_job_id(project=project, job_specifier=job_specifier)
        self._job_table.loc[self._job_table.id == job_id, 'status'] = status
        db_entry = self.get_item_by_id(item_id=job_id)
        h5io.write_hdf5(db_entry["project"] + db_entry["subjob"] + '.h5',
                        status,
                        title=db_entry["subjob"][1:] + '/status',
                        overwrite="update")

    def get_job_status(self, job_specifier, project=None):
        """
        Get the status of a particular job

        Args:
            database (DatabaseAccess): Database object
            sql_query (str): SQL query to enter a more specific request
            user (str): username of the user whoes user space should be searched
            project_path (str): root_path - this is in contrast to the project_path in GenericPath
            job_specifier (str): name of the job or job ID

        Returns:
            str: job status can be one of the following ['initialized', 'appended', 'created', 'submitted', 'running',
                 'aborted', 'collect', 'suspended', 'refresh', 'busy', 'finished']
        """
        if project is None:
            project = self._project
        try:
            return self._job_table[
                self._job_table.id == self.get_job_id(project=project, job_specifier=job_specifier)].status.values[0]
        except KeyError:
            return None

    def get_job_working_directory(self, job_specifier, project=None):
        """
        Get the working directory of a particular job

        Args:
            database (DatabaseAccess): Database object
            sql_query (str): SQL query to enter a more specific request
            user (str): username of the user whoes user space should be searched
            project_path (str): root_path - this is in contrast to the project_path in GenericPath
            job_specifier (str): name of the job or job ID

        Returns:
            str: working directory as absolute path
        """
        if project is None:
            project = self._project
        try:
            db_entry = self.get_item_by_id(item_id=self.get_job_id(project=project, job_specifier=job_specifier))
            if db_entry and len(db_entry) > 0:
                job_name = db_entry["subjob"][1:]
                return os.path.join(
                    db_entry["project"],
                    job_name + "_hdf5",
                    job_name,
                )
            else:
                return None
        except KeyError:
            return None
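
A minimal usage sketch for the class above, assuming the project directory already contains pyiron-style HDF5 job files; the job name, Hamiltonian and version values below are hypothetical:

# FileTable is a singleton, so repeated construction returns the same table
file_table = FileTable(project=".")

# tabular overview of all jobs found on disk
print(file_table.job_table(all_columns=True))

# register an entry by hand; columns without defaults
# (job, subjob, projectpath, project, hamilton, hamversion) must be supplied
job_id = file_table.add_item_dict({
    "job": "my_job",
    "subjob": "/my_job",
    "projectpath": None,
    "project": file_table._project + "/",
    "hamilton": "GenericJob",
    "hamversion": "0.1",
})
print(file_table.get_item_by_id(job_id))
file_table.delete_item(job_id)
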
Example No. 2
# Assumed imports for this example; SciSweeperJob, run_parallel and
# filter_function are defined in the surrounding scisweeper module.
import os
import sys
from multiprocessing.pool import ThreadPool

import numpy as np
import pandas
from pyfileindex import PyFileIndex
from pysqa import QueueAdapter
from tqdm import tqdm


class SciSweeper(object):
    def __init__(self,
                 working_directory=".",
                 job_class=None,
                 cores=1,
                 pysqa_config=None):
        self.working_directory = os.path.abspath(working_directory)
        if sys.version_info[0] >= 3:
            os.makedirs(self.working_directory, exist_ok=True)
        else:
            if not os.path.exists(self.working_directory):
                os.makedirs(self.working_directory)
        self._fileindex = PyFileIndex(path=self.working_directory,
                                      filter_function=filter_function)
        self._job_class = job_class
        self._results_df = None
        self._broken_jobs = []
        self._cores = cores
        self._job_name_function = None
        self.job = SciSweeperJob
        self._pysqa = None
        self.pysqa = pysqa_config
        self._job_id_lst = []

    @property
    def pysqa(self):
        return self._pysqa

    @pysqa.setter
    def pysqa(self, pysqa_config):
        if isinstance(pysqa_config, str):
            self._pysqa = QueueAdapter(pysqa_config)
        else:
            self._pysqa = pysqa_config

    @property
    def cores(self):
        return self._cores

    @cores.setter
    def cores(self, cores):
        self._cores = cores

    @property
    def job_name_function(self):
        return self._job_name_function

    @job_name_function.setter
    def job_name_function(self, job_name_function):
        self._job_name_function = job_name_function

    @property
    def job_class(self):
        return self._job_class

    @job_class.setter
    def job_class(self, job_class):
        self._job_class = job_class

    @property
    def results(self):
        return self._results_df

    @property
    def broken_jobs(self):
        return self._broken_jobs

    def collect(self):
        """
        Check status of the calculations and update the results table.
        """
        self._fileindex.update()
        dict_lst, broken_jobs = self._check_jobs()
        self._results_df = pandas.DataFrame(dict_lst)
        self._broken_jobs = np.array([
            self._fileindex.dataframe[
                ~self._fileindex.dataframe.is_directory
                & self._fileindex.dataframe.path.str.contains("/" + s + "/")
            ].dirname.values
            for s in broken_jobs
        ]).flatten().tolist()

    def delete_jobs_from_queue(self):
        """
        Delete jobs from queuing system
        """
        if self._pysqa is not None:
            _ = [
                self.pysqa.delete_job(process_id=j[0])
                for j in self._job_id_lst
            ]

    def get_job_status(self):
        """
        Get job status from queuing system

        Returns:
            pandas.DataFrame/None: status table, or None if no queuing system is configured
        """
        if self._pysqa is not None:
            status_lst = self.pysqa.get_status_of_jobs(
                process_id_lst=[j[0] for j in self._job_id_lst])
            return pandas.DataFrame([{
                "queue_id": j[0],
                "job_name": j[1],
                "status": s
            } for s, j in zip(status_lst, self._job_id_lst)])

    def run_jobs_in_parallel(self,
                             input_dict_lst,
                             cores=None,
                             job_name_function=None):
        """
        Execute multiple SciSweeperJobs in parallel using multiprocessing.pool.ThreadPool

        Args:
            input_dict_lst (list): list of dictionaries with input parameters
            cores (int/ None): number of cores to use = number of parallel threads.
            job_name_function (function/ None): Function which takes the input_dict and a counter as input to return the
                                                job_name as string. This can be defined by the user to have recognizable
                                                job names.
        """
        if cores is None:
            cores = self._cores
        if job_name_function is None:
            job_name_function = self.job_name_function
        if self._pysqa is None:
            tp = ThreadPool(cores)
        else:
            tp = None
        for counter, input_dict in enumerate(tqdm(input_dict_lst)):
            if job_name_function is not None:
                job_name = job_name_function(input_dict=input_dict,
                                             counter=counter)
                working_directory = os.path.abspath(
                    os.path.join(self.working_directory, job_name))
            else:
                working_directory = os.path.abspath(
                    os.path.join(self.working_directory,
                                 "job_" + str(counter)))
            if self._pysqa is None:
                tp.apply_async(run_parallel,
                               (self, working_directory, input_dict))
            else:
                self._job_id_lst.append([
                    self.job_class(
                        working_directory=working_directory,
                        input_dict=input_dict,
                        pysqa_config=self.pysqa,
                        cores=cores,
                    ).run(),
                    os.path.basename(working_directory),
                ])
        if self._pysqa is None:
            tp.close()
            tp.join()

    def run_job(self, job_working_directory, input_dict):
        """
        Run individual calculation.

        Args:
            job_working_directory (str): path to working directory
            input_dict (dict): dictionary with input parameters

        Returns:
            int/ None: If the job is submitted to a queuing system the queue id is returned, else it is None.
        """
        return self._job_class(
            working_directory=job_working_directory,
            input_dict=input_dict,
            pysqa_config=self.pysqa,
        ).run()

    def run_collect_output(self):
        """
        For each job in this directory and all sub directories collect the output again. Use this function after
        updating the collect_output function.
        """
        for path in tqdm(self._fileindex.dataframe[
                ~self._fileindex.dataframe.is_directory].dirname.values):
            self._job_class(working_directory=path).run_collect_output()
        self.collect()

    def _check_jobs(self):
        """
        Internal helper function to check the jobs and build the results table.
        """
        dict_lst, all_keys_lst, broken_jobs = [], [], []
        for path in tqdm(self._fileindex.dataframe[
                ~self._fileindex.dataframe.is_directory].dirname.values):
            job_dict = {}
            job_dict["dir"] = os.path.basename(path)
            job = self._job_class(working_directory=path)
            job.from_hdf()
            for k, v in job.input_dict.items():
                job_dict[k] = v
            for k, v in job.output_dict.items():
                job_dict[k] = v
            for k in job_dict.keys():
                all_keys_lst.append(k)
            dict_lst.append(job_dict)
        final_keys = list(set(all_keys_lst))
        for d in dict_lst:
            for k in final_keys:
                if k not in d.keys():
                    # pad missing output columns and flag the job as broken (once)
                    d[k] = np.nan
                    if d["dir"] not in broken_jobs:
                        broken_jobs.append(d["dir"])
        return dict_lst, broken_jobs
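
A minimal usage sketch for the sweeper above; MyJob stands in for a hypothetical user-defined SciSweeperJob subclass that implements the write_input/collect_output steps of the actual calculation:

# sweep over five input dictionaries using two worker threads
sweeper = SciSweeper(working_directory="sweep", job_class=MyJob, cores=2)
sweeper.run_jobs_in_parallel(input_dict_lst=[{"x": x} for x in range(5)])

# aggregate all outputs into a pandas DataFrame and inspect failures
sweeper.collect()
print(sweeper.results)
print(sweeper.broken_jobs)  # directories whose output is missing keys
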
Example No. 3
# Assumed imports for this example; IsDatabase, Singleton and the helper functions
# (filter_function, get_job_status_from_file, get_hamilton_from_file,
# get_hamilton_version_from_file) are defined in the surrounding pyiron_base module.
import datetime
import os

import h5io
import numpy as np
import pandas
from pyfileindex import PyFileIndex


class FileTable(IsDatabase, metaclass=Singleton):
    def __init__(self, project):
        self._fileindex = None
        self._job_table = None
        self._project = os.path.abspath(project)
        self._columns = [
            "id",
            "status",
            "chemicalformula",
            "job",
            "subjob",
            "projectpath",
            "project",
            "timestart",
            "timestop",
            "totalcputime",
            "computer",
            "hamilton",
            "hamversion",
            "parentid",
            "masterid",
            "username",
        ]
        self.force_reset()

    def _get_view_mode(self):
        return False

    def force_reset(self):
        self._fileindex = PyFileIndex(path=self._project,
                                      filter_function=filter_function)
        df = pandas.DataFrame(
            self.init_table(fileindex=self._fileindex.dataframe))
        if len(df) != 0:
            df.id = df.id.astype(int)
            self._job_table = df[np.array(self._columns)]
        else:
            self._job_table = pandas.DataFrame({k: [] for k in self._columns})

    def init_table(self, fileindex, working_dir_lst=None):
        if working_dir_lst is None:
            working_dir_lst = []
        fileindex = fileindex[~fileindex.is_directory]
        fileindex = fileindex.iloc[fileindex.path.values.argsort()]
        job_lst = []
        for path, mtime in zip(fileindex.path, fileindex.mtime):
            job_dict = self.get_extract(path, mtime)
            job_dict["id"] = len(working_dir_lst) + 1
            working_dir_lst.append(job_dict["project"][:-1] +
                                   job_dict["subjob"] + "_hdf5/")
            if job_dict["project"] in working_dir_lst:
                job_dict["masterid"] = working_dir_lst.index(
                    job_dict["project"]) + 1
            else:
                job_dict["masterid"] = None
            job_lst.append(job_dict)
        return job_lst

    def add_item_dict(self, par_dict):
        par_dict = dict(
            (key.lower(), value) for key, value in par_dict.items())
        if len(self._job_table) != 0:
            job_id = np.max(self._job_table.id.values) + 1
        else:
            job_id = 1
        default_values = {
            "id": job_id,
            "status": "initialized",
            "chemicalformula": None,
            "timestart": datetime.datetime.now(),
            "computer": None,
            "parentid": None,
            "username": None,
            "timestop": None,
            "totalcputime": None,
            "masterid": None,
        }
        for k, v in default_values.items():
            if k not in par_dict.keys():
                par_dict[k] = v
        self._job_table = pandas.concat([
            self._job_table,
            pandas.DataFrame([par_dict])[self._columns]
        ]).reset_index(drop=True)
        return int(par_dict["id"])

    def item_update(self, par_dict, item_id):
        if isinstance(item_id, list):
            item_id = item_id[0]
        if isinstance(item_id, str):
            item_id = float(item_id)
        for k, v in par_dict.items():
            self._job_table.loc[self._job_table.id == int(item_id), k] = v

    def delete_item(self, item_id):
        item_id = int(item_id)
        if item_id in [int(v) for v in self._job_table.id.values]:
            self._job_table = self._job_table[
                self._job_table.id != item_id].reset_index(drop=True)
        else:
            raise ValueError("Unknown job id: {}".format(item_id))

    def get_item_by_id(self, item_id):
        item_id = int(item_id)
        return {
            k: list(v.values())[0]
            for k, v in self._job_table[self._job_table.id == item_id].to_dict().items()
        }

    def get_items_dict(self, item_dict, return_all_columns=True):
        df = self._job_table
        if not isinstance(item_dict, dict):
            raise TypeError("item_dict must be a dict, not {}".format(type(item_dict)))
        for k, v in item_dict.items():
            if k in ["id", "parentid", "masterid"]:
                df = df[df[k] == int(v)]
            elif "%" not in str(v):
                df = df[df[k] == v]
            else:
                df = df[df[k].str.contains(v.replace("%", ""))]
        df_dict = df.to_dict()
        if return_all_columns:
            return [{k: v[i] for k, v in df_dict.items()} for i in df_dict["id"].keys()]
        else:
            return [{"id": i} for i in df_dict["id"].values()]

    def update(self):
        self._job_table.status = [
            self._get_job_status_from_hdf5(job_id)
            for job_id in self._job_table.id.values
        ]
        self._fileindex.update()
        if len(self._job_table) != 0:
            files_lst, working_dir_lst = zip(*[[
                project + subjob[1:] + ".h5", project + subjob[1:] + "_hdf5"
            ] for project, subjob in zip(self._job_table.project.values,
                                         self._job_table.subjob.values)])
            df_new = self._fileindex.dataframe[
                ~self._fileindex.dataframe.is_directory
                & ~self._fileindex.dataframe.path.isin(files_lst)]
        else:
            files_lst, working_dir_lst = [], []
            df_new = self._fileindex.dataframe[~self._fileindex.dataframe.is_directory]
        if len(df_new) > 0:
            job_lst = self.init_table(fileindex=df_new,
                                      working_dir_lst=list(working_dir_lst))
            df = pandas.DataFrame(job_lst)[self._columns]
            if len(files_lst) != 0 and len(working_dir_lst) != 0:
                self._job_table = pandas.concat([self._job_table,
                                                 df]).reset_index(drop=True)
            else:
                self._job_table = df

    def _get_table_headings(self, table_name=None):
        return self._job_table.columns.values

    def _get_job_table(
        self,
        sql_query,
        user,
        project_path=None,
        recursive=True,
        columns=None,
        element_lst=None,
    ):
        self.update()
        if project_path is None:
            project_path = self._project
        if len(self._job_table) != 0:
            if recursive:
                return self._job_table[self._job_table.project.str.contains(project_path)]
            else:
                return self._job_table[self._job_table.project == project_path]
        else:
            return self._job_table

    def get_jobs(self, project=None, recursive=True, columns=None):
        if project is None:
            project = self._project
        if columns is None:
            columns = ["id", "project"]
        df = self.job_table(
            sql_query=None,
            user=None,
            project_path=project,
            recursive=recursive,
            columns=columns,
        )
        if len(df) == 0:
            return {key: [] for key in columns}
        # tolist() and to_list() are equivalent pandas spellings of the same method
        return {key: df[key].tolist() for key in df.keys()}

    def get_job_ids(self, project=None, recursive=True):
        return self.get_jobs(project=project,
                             recursive=recursive,
                             columns=["id"])["id"]

    def get_job_id(self, job_specifier, project=None):
        if project is None:
            project = self._project
        if isinstance(job_specifier, (int, np.integer)):
            return job_specifier  # is id

        job_specifier.replace(".", "_")
        job_id_lst = self._job_table[
            (self._job_table.project == project)
            & (self._job_table.job == job_specifier)].id.values
        if len(job_id_lst) == 0:
            job_id_lst = self._job_table[
                self._job_table.project.str.contains(project)
                & (self._job_table.job == job_specifier)].id.values
        if len(job_id_lst) == 0:
            return None
        elif len(job_id_lst) == 1:
            return int(job_id_lst[0])
        else:
            raise ValueError(
                "job name '{0}' in this project is not unique".format(
                    job_specifier))

    def get_child_ids(self, job_specifier, project=None, status=None):
        """
        Get the childs for a specific job

        Args:
            database (DatabaseAccess): Database object
            sql_query (str): SQL query to enter a more specific request
            user (str): username of the user whoes user space should be searched
            project_path (str): root_path - this is in contrast to the project_path in GenericPath
            job_specifier (str): name of the master job or the master jobs job ID
            status (str): filter childs which match a specific status - None by default

        Returns:
            list: list of child IDs
        """
        if project is None:
            project = self._project
        id_master = self.get_job_id(project=project,
                                    job_specifier=job_specifier)
        if id_master is None:
            return []
        else:
            if status is not None:
                id_lst = self._job_table[
                    (self._job_table.masterid == id_master)
                    & (self._job_table.status == status)].id.values
            else:
                id_lst = self._job_table[(
                    self._job_table.masterid == id_master)].id.values
            return sorted(id_lst)

    def get_job_working_directory(self, job_id):
        """
        Get the working directory of a particular job

        Args:
            job_id (int): job ID

        Returns:
            str: working directory as absolute path
        """
        try:
            db_entry = self.get_item_by_id(job_id)
            if db_entry and len(db_entry) > 0:
                job_name = db_entry["subjob"][1:]
                return os.path.join(
                    db_entry["project"],
                    job_name + "_hdf5",
                    job_name,
                )
            else:
                return None
        except KeyError:
            return None

    def _get_job_status_from_hdf5(self, job_id):
        db_entry = self.get_item_by_id(job_id)
        job_name = db_entry["subjob"][1:]
        return get_job_status_from_file(
            hdf5_file=os.path.join(db_entry["project"], job_name + ".h5"),
            job_name=job_name,
        )

    def get_job_status(self, job_id):
        return self._job_table[self._job_table.id == job_id].status.values[0]

    def set_job_status(self, job_id, status):
        db_entry = self.get_item_by_id(item_id=job_id)
        self._job_table.loc[self._job_table.id == job_id, "status"] = status
        h5io.write_hdf5(
            db_entry["project"] + db_entry["subjob"] + ".h5",
            status,
            title=db_entry["subjob"][1:] + "/status",
            overwrite="update",
        )

    @staticmethod
    def get_extract(path, mtime):
        basename = os.path.basename(path)
        job = os.path.splitext(basename)[0]
        time = datetime.datetime.fromtimestamp(mtime)
        return {
            "status": get_job_status_from_file(hdf5_file=path, job_name=job),
            "chemicalformula": None,
            "job": job,
            "subjob": "/" + job,
            "projectpath": None,
            "project": os.path.dirname(path) + "/",
            "timestart": time,
            "timestop": time,
            "totalcputime": 0.0,
            "computer": None,
            "username": None,
            "parentid": None,
            "hamilton": get_hamilton_from_file(hdf5_file=path, job_name=job),
            "hamversion": get_hamilton_version_from_file(hdf5_file=path, job_name=job),
        }
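
Usage mirrors Example No. 1, but through pyiron's IsDatabase interface; a minimal sketch with a hypothetical project path and job name:

file_table = FileTable(project="/path/to/project")
file_table.update()  # re-reads each job's status from its HDF5 file

job_id = file_table.get_job_id(job_specifier="my_job")
if job_id is not None:
    print(file_table.get_job_status(job_id))
    print(file_table.get_job_working_directory(job_id))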