Example #1
 def test_13(self):
     scripts1 = [pytest.helpers.get_py_script(i, 1) for i in range(2)]
     sc = ScriptCollector(scripts1)
     assert sc.scripts == scripts1
     scripts2 = [pytest.helpers.get_py_script(i, 1) for i in range(2)]
     sc.scripts = scripts2
     assert sc.scripts == scripts2
Example #2
 def test_15(self):
     scripts = [pytest.helpers.get_py_script(i, 1) for i in range(2)]
     sc = ScriptCollector(scripts)
     assert sc.scripts == scripts
     sc.dump()
     paths = [s.path for s in scripts]
     assert all(os.path.isfile(p) for p in paths)
     pytest.helpers.unlink(paths)
Example #3
 def test_6(self):
     scripts = [pytest.helpers.get_py_script(i, 1) for i in range(2)]
     for s in scripts:
         s.write()
     sc = ScriptCollector([s.path for s in scripts])
     assert len(sc.scripts) == 2
     assert all(isinstance(s, Script) for s in sc)
     pytest.helpers.unlink([s.path for s in scripts])
Example #4
 def test_12(self):
     container = ScriptCollector(None)
     task = MockTask(container)
     assert task.script_collector is container
     assert len(task.script_collector) == 0
     task.add_script(pytest.helpers.get_py_script(0, 1))
     assert len(task.script_collector) == 1
Example #5
 def test_4(self):
     script = pytest.helpers.get_py_script(0, 1)
     script.write()
     sc = ScriptCollector(script.path)
     assert len(sc.scripts) == 1
     assert isinstance(sc.scripts[0], Script)
     pytest.helpers.unlink([script.path])
Example #6
def write_mrbump_files(ensemble_pdbs,
                       amoptd,
                       job_time=MRBUMP_RUNTIME,
                       ensemble_options=None,
                       directory=None):
    """Write the MRBUMP job files for all the ensembles.

    Arguments:
    ensemble_pdbs -- list of the ensembles, each a single pdb file.
    amoptd -- dictionary with job options.
    job_time -- maximum permissible runtime (mainly used for batch queueing systems).
    ensemble_options -- dictionary with ensemble-specific keywords e.g. ensemble_options[ensemble_name] = {'ncopies' : ncopies}
    directory -- working directory to write files to.
    """
    if not directory:
        directory = os.getcwd()

    collector = ScriptCollector(None)
    for ensemble_pdb in ensemble_pdbs:
        keyword_options = {}  # reset per ensemble so options don't carry over between ensembles
        name = os.path.splitext(
            os.path.basename(ensemble_pdb))[0]  # Get name from pdb path

        # Get any options specific to this ensemble
        if ensemble_options and name in ensemble_options:
            keyword_options = ensemble_options[name]

        # Generate dictionary with all the options for this job and write to keyword file
        keyword_dict = mrbump_cmd.keyword_dict(ensemble_pdb, name, amoptd,
                                               keyword_options)
        keyword_file = os.path.join(directory, name + '.mrbump')
        keyword_str = mrbump_cmd.mrbump_keyword_file(keyword_dict)
        with open(keyword_file, 'w') as f:
            f.write(keyword_str)

        script = write_jobscript(name,
                                 keyword_file,
                                 amoptd,
                                 directory=directory,
                                 job_time=job_time)
        collector.add(script)

    if not len(collector.scripts):
        raise RuntimeError("No job scripts created!")

    return collector
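A minimal sketch of the ensemble_options shape described in the docstring above; the ensemble name is hypothetical and must match the stem of its pdb path:

    # Hypothetical per-ensemble keyword overrides, keyed by ensemble name
    ensemble_options = {"ensemble_1": {"ncopies": 2}}
    collector = write_mrbump_files(ensemble_pdbs, amoptd,
                                   ensemble_options=ensemble_options)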
Example #7
 def test_14(self):
     container = ScriptCollector(pytest.helpers.get_py_script(10, 1))
     task = MockTask(container)
     assert len(task.script_collector) == 1
     task.lock()
     assert task.locked
     with pytest.raises(PyJobError):
         task.add_script(pytest.helpers.get_py_script(1, 1))
     assert len(task.script_collector) == 1
Example #8
    def _create_scripts(self, rosetta_dir, **kwargs):
        """Create scripts and set path to working directory"""
        collector = ScriptCollector(None)
        owd = os.getcwd()
        for name in self.test_dict.keys():
            os.chdir(self.run_dir)
            work_dir = os.path.join(self.run_dir, name)
            args = self.test_dict[name]['args']

            # Rosetta is the only thing likely to change between platforms so we update the entry
            if rosetta_dir and self._is_in_args('-rosetta_dir', args):
                args = self._update_args(args, [['-rosetta_dir', rosetta_dir]])
            # Additional arguments for submitting to a cluster
            args = self._update_cluster_args(args, **kwargs)
            if EXTRA_ARGS:
                args = self._update_args(args, EXTRA_ARGS)

            # We track different modules using the name of the test case
            if name.startswith(ENSEMBLER):
                testcase_type = ENSEMBLER
            elif name.startswith(MODELLING):
                testcase_type = MODELLING
            else:
                testcase_type = 'ample'
            if testcase_type != 'ample' and sys.platform.startswith('win'):
                logger.critical(
                    "Cannot run module testcases on windows due to multiprocessing bug"
                )
                continue
            script = self.write_script(self.run_dir, name,
                                       args + [['-work_dir', work_dir]],
                                       testcase_type)
            collector.add(script)
            # Set path to the directory the case is run so we can pass it to the unittest
            self.test_dict[name]['work_dir'] = work_dir

            # Run the setup function if one is provided
            if 'setup' in self.test_dict[name] and callable(
                    self.test_dict[name]['setup']):
                self.test_dict[name]['setup'](self.run_dir)

            os.chdir(owd)  # Back to where we started
        return collector
Example #9
    def __init__(self, script, *args, **kwargs):
        """Instantiate a new :obj:`~pyjob.task.Task`

        Parameters
        ----------
        script : :obj:`~pyjob.script.ScriptCollector`, :obj:`~pyjob.script.Script`, str, list, tuple
           A :obj:`str`, :obj:`list` or :obj:`tuple` of one or more script paths

        """
        self.pid = None
        self.locked = False
        if isinstance(script, ScriptCollector):
            self.script_collector = script
        else:
            self.script_collector = ScriptCollector(script)

        self.directory = os.path.abspath(
            kwargs.get("directory") or config.get("directory") or "."
        )
        self.nprocesses = kwargs.get("processes") or config.get("processes") or 1
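Per the branch above, an existing ScriptCollector is adopted as-is while anything else is wrapped in a new one; a small sketch of that behaviour, grounded in the tests elsewhere in this section (the Script arguments are illustrative):

    from pyjob.script import Script, ScriptCollector

    script = Script(directory=".", prefix="demo_", stem="hello")
    script.append("echo hello")
    assert ScriptCollector(script).scripts == [script]  # a single Script is wrapped
    assert ScriptCollector(None).scripts == []          # None yields an empty collector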
Example #10
def create_ensemble_db(database,
                       pdb_db,
                       nproc=2,
                       submit_qtype=None,
                       submit_queue=False,
                       chunk_size=5000):
    """Create the MoRDa search database

    Parameters
    ----------
    database : str
       The path to the database folder
    pdb_db : str
        The path to a local copy of the Protein Data Bank
    nproc : int, optional
       The number of processors [default: 2]
    submit_qtype : str
       The cluster submission queue type - currently support SGE and LSF
    submit_queue : str
       The queue to submit to on the cluster
    chunk_size : int, optional
       The number of jobs to submit at the same time [default: 5000]

    Raises
    ------
    RuntimeError
       Windows is currently not supported

    """
    if CUSTOM_PLATFORM == "windows":
        msg = "Windows is currently not supported"
        raise RuntimeError(msg)

    if not is_valid_db_location(database):
        raise RuntimeError("Permission denied! Cannot write to {}!".format(
            os.path.dirname(database)))

    if "MRD_DB" in os.environ:
        morda_installed_through_ccp4 = True
    else:
        download_morda()
        morda_installed_through_ccp4 = False

    morda_dat_path = os.path.join(os.environ['MRD_DB'], 'home', 'ca_DOM',
                                  '*.dat')
    simbad_dat_path = os.path.join(database, '**', '*.dat')
    morda_dat_files = set(
        [os.path.basename(f) for f in glob.glob(morda_dat_path)])
    simbad_dat_files = set(
        [os.path.basename(f) for f in glob.glob(simbad_dat_path)])
    erroneous_files = {
        "1bbzA_0.dat", "1gt0D_0.dat", "1h3oA_0.dat", "1kskA_1.dat",
        "1l0sA_0.dat"
    }

    def delete_erroneous_files(erroneous_paths):
        for f in erroneous_paths:
            if os.path.isfile(f):
                logger.warning(
                    "File flagged to be erroneous ... " +
                    "removing from database: %s", f)
                os.remove(f)

    erroneous_paths = [
        os.path.join(database, name[1:3], name) for name in erroneous_files
    ]
    delete_erroneous_files(erroneous_paths)

    dat_files = list(morda_dat_files - simbad_dat_files - erroneous_files)
    if len(dat_files) < 1:
        logger.info('SIMBAD ensemble database up-to-date')
        if not morda_installed_through_ccp4:
            shutil.rmtree(os.environ["MRD_DB"])
        leave_timestamp(os.path.join(database, 'simbad_morda.txt'))
        return
    else:
        logger.info(
            "%d new entries were found in the MoRDa database, " +
            "updating SIMBAD ensemble database", len(dat_files))

    exe = os.path.join(os.environ["MRD_PROG"], "get_model")

    mrbump_stdin = """
    MDLS True
    MDLC False
    MDLD False
    MDLP False
    MDLM False
    MDLU False
    CHECK False
    UPDATE False
    PICKLE False
    MRNUM 5
    SCOP False
    DEBUG False
    RLEVEL 100
    GESAMT_MERGE False
    USEE True
    GESE True
    GEST True
    AMPT False
    DOPHMMER True
    DOHHPRED False
    PDBLOCAL {}
    END
    """.format(pdb_db)

    run_dir = tmp_dir(directory=os.getcwd())

    # Generate the sub directories in advance
    sub_dir_names = set(
        [os.path.basename(f).rsplit('.', 1)[0][1:3] for f in dat_files])
    for sub_dir_name in sub_dir_names:
        sub_dir = os.path.join(database, sub_dir_name)
        if os.path.isdir(sub_dir):
            continue
        os.makedirs(sub_dir)

    # Submit in chunks so we don't take up too much disk space
    # and can terminate without losing the processed data
    total_chunk_cycles = len(dat_files) // chunk_size + (len(dat_files) % chunk_size > 0)
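    # For example (illustrative numbers): 12003 dat files with chunk_size=5000
    # yield 12003 // 5000 + (12003 % 5000 > 0) = 2 + 1 = 3 chunk cycles.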
    for cycle, i in enumerate(range(0, len(dat_files), chunk_size)):
        logger.info("Working on chunk %d out of %d", cycle + 1,
                    total_chunk_cycles)
        chunk_dat_files = dat_files[i:i + chunk_size]

        # Create the database files
        files = []
        collector = ScriptCollector(None)
        for f in chunk_dat_files:
            code = os.path.basename(f).rsplit('.', 1)[0]
            final_file = os.path.join(database, code[1:3], code + ".dat")
            # We need a temporary directory within because "get_model" uses non-unique file names
            tmp_d = tmp_dir(directory=run_dir)
            get_model_output = os.path.join(tmp_d, code + ".pdb")
            get_seq_output = os.path.join(tmp_d, code + ".seq")
            mrbump_directory = os.path.join(tmp_d, 'search_mrbump_1')
            cmd = [["export CCP4_SCR=".format(tmp_d)],
                   ["export MRD_DB=".format(os.environ['MRD_DB'])],
                   ["cd", tmp_d], [exe, "-c", code, "-m", "d"],
                   [
                       'ccp4-python', '-c', "'import simbad.util; "
                       "simbad.util.get_sequence(\"{0}\", \"{1}\")'".format(
                           get_model_output, get_seq_output)
                   ], ['mrbump', 'seqin', get_seq_output, '<< eof'],
                   [mrbump_stdin], ['eof'],
                   [
                       'ccp4-python', '-c', "'import simbad.util; "
                       "simbad.util.get_mrbump_ensemble(\"{0}\", \"{1}\")'".
                       format(mrbump_directory, final_file)
                   ]]

            script = Script(directory=tmp_d)
            for c in cmd:
                script.append(' '.join(map(str, c)))
            collector.add(script)
            log = script.path.rsplit('.', 1)[0] + '.log'
            files += [(script.path, log, tmp_d)]

        scripts, _, tmps = zip(*files)

        submit_chunk(collector=collector,
                     run_dir=os.getcwd(),
                     nproc=nproc,
                     job_name='ensemble_db',
                     submit_qtype=submit_qtype,
                     submit_queue=submit_queue,
                     permit_nonzero=True,
                     monitor=None,
                     success_func=None)

        for d in tmps:
            shutil.rmtree(d)

    shutil.rmtree(run_dir)
    if not morda_installed_through_ccp4:
        shutil.rmtree(os.environ["MRD_DB"])

    validate_compressed_database(database)
    leave_timestamp(os.path.join(database, 'simbad_ensemble.txt'))
Example #11
 def test_11(self):
     container = ScriptCollector(None)
     task = MockTask(container)
     assert task.script_collector is container
Example #12
    def run(self,
            models_dir,
            nproc=2,
            shres=3.0,
            pklim=0.5,
            npic=50,
            rotastep=1.0,
            min_solvent_content=20,
            submit_nproc=None,
            submit_qtype=None,
            submit_queue=None,
            monitor=None,
            chunk_size=0,
            **kwargs):
        """Run amore rotation function on a directory of models

        Parameters
        ----------
        models_dir : str
            The directory containing the models to run the rotation search on
        nproc : int, optional
            The number of processors to run the job on
        shres : int, float, optional
            Spherical harmonic resolution [default 3.0]
        pklim : int, float, optional
            Peak limit, output all peaks above <float> [default: 0.5]
        npic : int, optional
            Number of peaks to output from the translation function map for each orientation [default: 50]
        rotastep : int, float, optional
            Size of rotation step [default : 1.0]
        min_solvent_content : int, float, optional
            The minimum solvent content present in the unit cell with the input model [default: 20]
        submit_nproc : int
            The number of processors to use on the head node when creating submission scripts on a cluster [default: 1]
        submit_qtype : str
            The cluster submission queue type - currently support SGE and LSF
        submit_queue : str
            The queue to submit to on the cluster
        monitor : callable, optional
            A callable that is regularly invoked while the jobs run
        chunk_size : int, optional
            The number of jobs to submit at the same time

        Returns
        -------
        file
            log file for each model in the models_dir

        """
        self.shres = shres
        self.pklim = pklim
        self.npic = npic
        self.rotastep = rotastep

        self.submit_qtype = submit_qtype
        self.submit_queue = submit_queue

        self.simbad_dat_files = simbad.db.find_simbad_dat_files(models_dir)

        mtz_labels = simbad.util.mtz_util.GetLabels(self.mtz)

        i = InputMR_DAT()
        i.setHKLI(self.mtz)
        i.setLABI_F_SIGF(mtz_labels.f, mtz_labels.sigf)
        i.setMUTE(True)
        run_mr_data = runMR_DAT(i)

        sg = run_mr_data.getSpaceGroupName().replace(" ", "")
        cell = " ".join(map(str, run_mr_data.getUnitCell()))

        sol_calc = simbad.util.matthews_prob.SolventContent(cell, sg)

        dir_name = "simbad-tmp-" + str(uuid.uuid1())
        self.script_log_dir = os.path.join(self.work_dir, dir_name)
        os.mkdir(self.script_log_dir)

        self.hklpck0 = self._generate_hklpck0()

        self.ccp4_scr = os.environ["CCP4_SCR"]
        default_tmp_dir = os.path.join(self.work_dir, 'tmp')
        if self.tmp_dir:
            self.template_tmp_dir = os.path.join(self.tmp_dir,
                                                 dir_name + "-{0}")
        else:
            self.template_tmp_dir = os.path.join(default_tmp_dir,
                                                 dir_name + "-{0}")

        predicted_molecular_weight = 0
        if run_mr_data.Success():
            i = InputCCA()
            i.setSPAC_HALL(run_mr_data.getSpaceGroupHall())
            i.setCELL6(run_mr_data.getUnitCell())
            i.setMUTE(True)
            run_cca = runCCA(i)

            if run_cca.Success():
                predicted_molecular_weight = run_cca.getAssemblyMW()

        dat_models = []
        for dat_model in self.simbad_dat_files:
            name = os.path.basename(dat_model.replace(".dat", ""))
            pdb_struct = simbad.util.pdb_util.PdbStructure()
            pdb_struct.from_file(dat_model)
            try:
                solvent_content = sol_calc.calculate_from_struct(pdb_struct)
                if solvent_content < min_solvent_content:
                    msg = "Skipping %s: solvent content is predicted to be less than %.2f"
                    logger.debug(msg, name, min_solvent_content)
                    continue
            except ValueError:
                msg = "Skipping %s: Error calculating solvent content"
                logger.debug(msg, name)
                continue
            except IndexError:
                msg = "Skipping %s: Problem with dat file"
                logger.debug(msg, name)
                continue

            x, y, z, intrad = pdb_struct.integration_box
            model_molecular_weight = pdb_struct.molecular_weight
            mw_diff = abs(predicted_molecular_weight - model_molecular_weight)

            info = simbad.core.dat_score.DatModelScore(name, dat_model,
                                                       mw_diff, x, y, z,
                                                       intrad, solvent_content,
                                                       None)
            dat_models.append(info)

        sorted_dat_models = sorted(dat_models,
                                   key=lambda x: float(x.mw_diff),
                                   reverse=False)
        n_files = len(sorted_dat_models)
        chunk_size = simbad.rotsearch.get_chunk_size(n_files, chunk_size)
        total_chunk_cycles = simbad.rotsearch.get_total_chunk_cycles(
            n_files, chunk_size)

        if submit_qtype == 'local':
            processes = nproc
        else:
            processes = submit_nproc

        results = []
        iteration_range = range(0, n_files, chunk_size)
        for cycle, i in enumerate(iteration_range):
            logger.info("Working on chunk %d out of %d", cycle + 1,
                        total_chunk_cycles)

            if self.solution:
                logger.info(
                    "Early termination criteria met, skipping chunk %d",
                    cycle + 1)
                continue

            collector = ScriptCollector(None)
            amore_files = []
            with pool.Pool(processes=processes) as p:
                for result in p.map(self, sorted_dat_models[i:i + chunk_size]):
                    if result is not None:
                        collector.add(result[0])
                        amore_files.append(result[1])

            if len(collector.scripts) > 0:
                logger.info("Running AMORE tab/rot functions")
                amore_logs, dat_models = zip(*amore_files)
                simbad.util.submit_chunk(collector, self.script_log_dir, nproc,
                                         'simbad_amore', submit_qtype,
                                         submit_queue, True, monitor,
                                         self.rot_succeeded_log)

                for dat_model, amore_log in zip(dat_models, amore_logs):
                    base = os.path.basename(amore_log)
                    pdb_code = base.replace("amore_", "").replace(".log", "")
                    try:
                        rotsearch_parser = simbad.parsers.rotsearch_parser.AmoreRotsearchParser(
                            amore_log)
                        score = simbad.core.amore_score.AmoreRotationScore(
                            pdb_code, dat_model, rotsearch_parser.alpha,
                            rotsearch_parser.beta, rotsearch_parser.gamma,
                            rotsearch_parser.cc_f, rotsearch_parser.rf_f,
                            rotsearch_parser.cc_i, rotsearch_parser.cc_p,
                            rotsearch_parser.icp,
                            rotsearch_parser.cc_f_z_score,
                            rotsearch_parser.cc_p_z_score,
                            rotsearch_parser.num_of_rot)
                        if rotsearch_parser.cc_f_z_score:
                            results += [score]
                    except IOError:
                        pass

            else:
                logger.critical("No structures to be trialled")

        self._search_results = results
        shutil.rmtree(self.script_log_dir)

        if os.path.isdir(default_tmp_dir):
            shutil.rmtree(default_tmp_dir)
Example #13
    def comparison(self, models, structures):
        """
        Compare a list of model structures to a second list of reference structures

        Parameters
        ----------
        models : list
           List containing the paths to the model structure files
        structures : list
           List containing the paths to the reference structure files

        Returns
        -------
        entries : list
           List of TMscore data entries on a per-model basis

        """

        if len(models) < 1 or len(structures) < 1:
            msg = 'No model structures provided' if len(models) < 1 else 'No reference structures provided'
            logger.critical(msg)
            raise RuntimeError(msg)

        elif len(structures) == 1:
            logger.info('Using single structure provided for all model comparisons')
            structures = [structures[0] for _ in range(len(models))]

        elif len(models) != len(structures):
            msg = "Unequal number of models and structures!"
            logger.critical(msg)
            raise RuntimeError(msg)

        if self.method == "tmalign":
            pt = tm_parser.TMalignLogParser()
        elif self.method == "tmscore":
            pt = tm_parser.TMscoreLogParser()
        else:
            msg = "Invalid method selected: %s", self.method
            logger.critical(msg)
            raise RuntimeError(msg)

        logger.info('Using algorithm: {0}'.format(self.method))
        logger.info('------- Evaluating decoys -------')
        data_entries, log_files = [], []
        collector = ScriptCollector(None)
        for model_pdb, structure_pdb in zip(models, structures):
            model_name = os.path.splitext(os.path.basename(model_pdb))[0]
            structure_name = os.path.splitext(os.path.basename(structure_pdb))[0]
            stem = "_".join([model_name, structure_name, self.method])

            if os.path.isfile(model_pdb) and os.path.isfile(structure_pdb):
                data_entries.append([model_name, structure_name, model_pdb, structure_pdb])
                script = Script(directory=self.tmp_dir, prefix="tmscore_", stem=stem)
                script.append(" ".join([self.executable, model_pdb, structure_pdb]))
                collector.add(script)
                log_files.append(os.path.splitext(script.path)[0] + ".log")
            else:
                if not os.path.isfile(model_pdb):
                    logger.warning("Cannot find: %s", model_pdb)
                if not os.path.isfile(structure_pdb):
                    logger.warning("Cannot find: %s", structure_pdb)
                continue

        logger.info('Executing TManalysis scripts')

        with TaskFactory(
                self._qtype,
                collector,
                name="tmscore",
                nprocesses=self._nproc,
                max_array_size=self._max_array_jobs,
                queue=self._queue,
                shell="/bin/bash",
        ) as task:
            task.run()
            task.wait(interval=1)

        self.entries = []
        for entry, log, script in zip(data_entries, log_files, [s.path for s in collector]):
            try:
                pt.reset()
                pt.parse(log)
            except Exception:
                logger.critical("Error processing the %s log file: %s", self.method, log)
                log = "None"
            model_name, structure_name, model_pdb, structure_pdb = entry
            _entry = self._store(model_name, structure_name, model_pdb, structure_pdb, log, pt)
            self.entries.append(_entry)
            os.unlink(script)

        return self.entries
Example #14
 def test_12(self):
     scripts = [pytest.helpers.get_py_script(i, 1) for i in range(2)]
     sc = ScriptCollector(scripts)
     sc.add([])
     assert sc.scripts == scripts
Example #15
class Task(abc.ABC):
    """Abstract base class for executable tasks"""

    def __init__(self, script, *args, **kwargs):
        """Instantiate a new :obj:`~pyjob.task.Task`

        Parameters
        ----------
        script : :obj:`~pyjob.script.ScriptCollector`, :obj:`~pyjob.script.Script`, str, list, tuple
           A :obj:`str`, :obj:`list` or :obj:`tuple` of one or more script paths

        """
        self.pid = None
        self.locked = False
        if isinstance(script, ScriptCollector):
            self.script_collector = script
        else:
            self.script_collector = ScriptCollector(script)

        self.directory = os.path.abspath(
            kwargs.get("directory") or config.get("directory") or "."
        )
        self.nprocesses = kwargs.get("processes") or config.get("processes") or 1

    def __del__(self):
        """Exit function at instance deletion"""
        if not self.locked:
            self.lock()
        self.close()

    def __enter__(self):
        """Contextmanager entry function

        Note
        ----
        For further details see `PEP 343 <https://www.python.org/dev/peps/pep-0343/>`_.

        """
        return self

    def __exit__(self, *exc):
        """Contextmanager exit function

        Note
        ----
        For further details see `PEP 343 <https://www.python.org/dev/peps/pep-0343/>`_.

        """
        if not self.locked:
            self.lock()
        self.close()

    def __repr__(self):
        """Representation of the :obj:`~pyjob.task.Task`"""
        return f"{self.__class__.__qualname__}(pid={self.pid})"

    # ------------------ Abstract methods and properties ------------------

    @property
    @abc.abstractmethod
    def info(self):  # pragma: no cover
        """Abstract property to provide info about the :obj:`~pyjob.task.Task`"""

    @abc.abstractmethod
    def close(self):  # pragma: no cover
        """Abstract method to end :obj:`~pyjob.task.Task`"""

    @abc.abstractmethod
    def kill(self):  # pragma: no cover
        """Abstract method to forcefully terminate :obj:`~pyjob.task.Task`"""

    @abc.abstractmethod
    def _run(self):  # pragma: no cover
        """Abstract property to start execution of the :obj:`~pyjob.task.Task`"""

    # ------------------ Other task-specific general methods ------------------

    @property
    def completed(self):
        """Boolean to indicate :obj:`~pyjob.task.Task` completion"""
        return self.locked and not bool(self.info)

    @property
    def log(self):
        """The log file path"""
        return [script.log for script in self.script_collector]

    @property
    def script(self):
        """The script file path"""
        return [script.path for script in self.script_collector]
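    # Illustrative: for a collector holding /tmp/a.sh and /tmp/b.sh, task.script
    # is ["/tmp/a.sh", "/tmp/b.sh"] and task.log the matching ["/tmp/a.log",
    # "/tmp/b.log"] (hypothetical paths).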

    @staticmethod
    def get_time(minutes):
        """Return runtime string with format hh:mm:ss to be used in :obj:`~pyjob.task.Task`

        Parameters
        ----------
        minutes : int
           Integer with the number of minutes to allocate to runtime

        Raises
        ------
        :exc:`~pyjob.exception.PyJobError`
           Argument is not a positive integer
        """
        if isinstance(minutes, int) and minutes > 0:
            h, m = divmod(minutes, 60)
            return f"{h:02d}:{m:02d}:00"
        else:
            raise PyJobError("Task runtime has to be a positive integer!")

    def add_script(self, script):
        """Add further scripts to this :obj:`~pyjob.task.Task`

        Parameters
        ----------
        script : :obj:`~pyjob.script.Script`, str, list, tuple
           Something representing one or more scripts

        """
        if self.locked:
            raise PyJobTaskLockedError("This task is locked!")
        self.script_collector.add(script)

    def lock(self):
        """Lock this :obj:`~pyjob.task.Task`"""
        self.locked = True
        logger.debug("Locked %s [%d]", self.__class__.__qualname__, self.pid)

    def run(self):
        """Start the execution of this :obj:`~pyjob.task.Task`

        Raises
        ------
        :exc:`~pyjob.exception.PyJobError`
           One or more executable scripts required prior to execution
        :exc:`~pyjob.exception.PyJobTaskLockedError`
           Locked task, cannot restart or rerun

        """
        if self.locked:
            raise PyJobTaskLockedError("This task is locked!")
        if len(self.script_collector) < 1:
            raise PyJobError(
                "One or more executable scripts required prior to execution"
            )
        self.script_collector.dump()
        self._run()
        logger.debug(
            "Started execution of %s [%d]", self.__class__.__qualname__, self.pid
        )
        self.lock()

    def wait(self, interval=30, monitor_f=None, success_f=None):
        """Method to wait for the completion of the current :obj:`~pyjob.task.Task`

        Parameters
        ----------
        interval : int, optional
           The interval to wait between checking (in seconds)
        monitor_f : callable, optional
           A :obj:`callable` that is regularly invoked
        success_f : callable, optional
           A :obj:`callable` to check for early termination of :obj:`~pyjob.task.Task`

        Note
        ----
        The `success_f` argument needs to accept a log file as input and return
        a :obj:`bool`.

        """

        def is_successful_run(log):
            return os.path.isfile(log) and success_f(log)

        def is_callable_fn(fn):
            return bool(fn and callable(fn))

        check_success = is_callable_fn(success_f)
        callback = monitor_f if is_callable_fn(monitor_f) else lambda: None

        if check_success:
            msg = "Checking for %s %d success with function %s"
            logger.debug(msg, self.__class__.__qualname__, self.pid, success_f.__name__)

        while not self.completed:
            if check_success:
                for log in self.log:
                    if is_successful_run(log):
                        logger.debug(
                            "%s %d succeeded, run log: %s",
                            self.__class__.__qualname__,
                            self.pid,
                            log,
                        )
                        self.kill()
            callback()
            time.sleep(interval)
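The MockTask used by the tests in this section is not shown; a minimal concrete subclass might look like the following sketch (the stub bodies are illustrative, not pyjob's own implementations):

    class NoOpTask(Task):
        """Illustrative concrete Task with every abstract member stubbed."""

        @property
        def info(self):
            return {}  # empty info while locked reads as completed

        def close(self):
            pass  # nothing to clean up in this sketch

        def kill(self):
            pass  # nothing to terminate in this sketch

        def _run(self):
            self.pid = 0  # a real Task records the spawned process or job id here

    task = NoOpTask(None)  # any non-collector argument is normalised into a ScriptCollector
    assert task.script_collector.scripts == []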
Example #16
 def test_8(self):
     with pytest.raises(ValueError):
         ScriptCollector(["test"])
Example #17
 def test_7(self):
     with pytest.raises(PyJobError):
         ScriptCollector([1])
Example #18
    def run(self,
            models_dir,
            nproc=2,
            min_solvent_content=20,
            submit_nproc=None,
            submit_qtype=None,
            submit_queue=None,
            monitor=None,
            chunk_size=0,
            **kwargs):
        """Run phaser rotation function on a directory of models
        Parameters
        ----------
        models_dir : str
            The directory containing the models to run the rotation search on
        nproc : int, optional
            The number of processors to run the job on
        min_solvent_content : int, float, optional
            The minimum solvent content present in the unit cell with the input model [default: 20]
        submit_nproc : int
            The number of processors to use on the head node when creating submission scripts on a cluster [default: 1]
        submit_qtype : str
            The cluster submission queue type - currently support SGE and LSF
        submit_queue : str
            The queue to submit to on the cluster
        monitor : callable, optional
            A callable that is regularly invoked while the jobs run
        chunk_size : int, optional
            The number of jobs to submit at the same time

        Returns
        -------
        file
            log file for each model in the models_dir
        """
        self.submit_qtype = submit_qtype
        self.submit_queue = submit_queue
        self.mtz_labels = simbad.util.mtz_util.GetLabels(self.mtz)

        self.simbad_dat_files = simbad.db.find_simbad_dat_files(models_dir)

        i = InputMR_DAT()
        i.setHKLI(self.mtz)
        i.setLABI_F_SIGF(self.mtz_labels.f, self.mtz_labels.sigf)
        i.setMUTE(True)
        run_mr_data = runMR_DAT(i)

        sg = run_mr_data.getSpaceGroupName().replace(" ", "")
        cell = " ".join(map(str, run_mr_data.getUnitCell()))

        mat_coef = simbad.util.matthews_prob.MatthewsProbability(cell, sg)

        dir_name = "simbad-tmp-" + str(uuid.uuid1())
        self.script_log_dir = os.path.join(self.work_dir, dir_name)
        os.mkdir(self.script_log_dir)

        self.ccp4_scr = os.environ["CCP4_SCR"]
        default_tmp_dir = os.path.join(self.work_dir, 'tmp')
        if self.tmp_dir:
            self.template_tmp_dir = os.path.join(self.tmp_dir,
                                                 dir_name + "-{0}")
        else:
            self.template_tmp_dir = os.path.join(default_tmp_dir,
                                                 dir_name + "-{0}")

        predicted_molecular_weight = 0
        if run_mr_data.Success():
            i = InputCCA()
            i.setSPAC_HALL(run_mr_data.getSpaceGroupHall())
            i.setCELL6(run_mr_data.getUnitCell())
            i.setMUTE(True)
            run_cca = runCCA(i)

            if run_cca.Success():
                predicted_molecular_weight = run_cca.getAssemblyMW()

        dat_models = []
        for dat_model in self.simbad_dat_files:
            name = os.path.basename(dat_model.replace(".dat", ""))
            pdb_struct = simbad.util.pdb_util.PdbStructure()
            pdb_struct.from_file(dat_model)
            solvent_fraction, n_copies = mat_coef.calculate_content_ncopies_from_struct(
                pdb_struct)
            solvent_content = solvent_fraction * 100
            if solvent_content < min_solvent_content:
                msg = "Skipping %s: solvent content is predicted to be less than %.2f"
                logger.debug(msg, name, min_solvent_content)
                continue
            mw_diff = abs(predicted_molecular_weight -
                          pdb_struct.molecular_weight)

            info = simbad.core.dat_score.DatModelScore(name, dat_model,
                                                       mw_diff, None, None,
                                                       None, None,
                                                       solvent_fraction,
                                                       n_copies)
            dat_models.append(info)

        sorted_dat_models = sorted(dat_models,
                                   key=lambda x: float(x.mw_diff),
                                   reverse=False)
        n_files = len(sorted_dat_models)
        chunk_size = simbad.rotsearch.get_chunk_size(n_files, chunk_size)
        total_chunk_cycles = simbad.rotsearch.get_total_chunk_cycles(
            n_files, chunk_size)

        results = []
        iteration_range = range(0, n_files, chunk_size)
        for cycle, i in enumerate(iteration_range):
            logger.info("Working on chunk %d out of %d", cycle + 1,
                        total_chunk_cycles)

            if self.solution:
                logger.info(
                    "Early termination criteria met, skipping chunk %d",
                    cycle + 1)
                continue

            self.template_model = os.path.join("$CCP4_SCR", "{0}.pdb")

            if submit_qtype == 'local':
                processes = nproc
            else:
                processes = submit_nproc

            collector = ScriptCollector(None)
            phaser_files = []
            with pool.Pool(processes=processes) as p:
                for result in p.map(self, sorted_dat_models[i:i + chunk_size]):
                    if result is not None:
                        collector.add(result[0])
                        phaser_files.append(result[1])

            if len(phaser_files) > 0:
                logger.info("Running PHASER rotation functions")
                phaser_logs, dat_models = zip(*phaser_files)
                simbad.util.submit_chunk(collector, self.script_log_dir, nproc,
                                         'simbad_phaser', submit_qtype,
                                         submit_queue, True, monitor,
                                         self.rot_succeeded_log)

                for dat_model, phaser_log in zip(dat_models, phaser_logs):
                    base = os.path.basename(phaser_log)
                    pdb_code = base.replace("phaser_", "").replace(".log", "")
                    try:
                        phaser_rotation_parser = simbad.parsers.rotsearch_parser.PhaserRotsearchParser(
                            phaser_log)
                        if phaser_rotation_parser.rfact:
                            phaser_rotation_parser.llg = 100
                            phaser_rotation_parser.rfz = 10
                        score = simbad.core.phaser_score.PhaserRotationScore(
                            pdb_code, dat_model, phaser_rotation_parser.llg,
                            phaser_rotation_parser.rfz)

                        if phaser_rotation_parser.rfz:
                            results += [score]
                    except IOError:
                        pass

            else:
                logger.critical("No structures to be trialled")

        self._search_results = results
        shutil.rmtree(self.script_log_dir)

        if os.path.isdir(default_tmp_dir):
            shutil.rmtree(default_tmp_dir)
Example #19
def create_morda_db(database,
                    nproc=2,
                    submit_qtype=None,
                    submit_queue=False,
                    chunk_size=5000):
    """Create the MoRDa search database

    Parameters
    ----------
    database : str
       The path to the database folder
    nproc : int, optional
       The number of processors [default: 2]
    submit_qtype : str
       The cluster submission queue type - currently support SGE and LSF
    submit_queue : str
       The queue to submit to on the cluster
    chunk_size : int, optional
       The number of jobs to submit at the same time [default: 5000]
    
    Raises
    ------
    RuntimeError
       Windows is currently not supported

    """
    if CUSTOM_PLATFORM == "windows":
        msg = "Windows is currently not supported"
        raise RuntimeError(msg)

    if not is_valid_db_location(database):
        raise RuntimeError("Permission denied! Cannot write to {}!".format(
            os.path.dirname(database)))

    if "MRD_DB" in os.environ:
        morda_installed_through_ccp4 = True
    else:
        download_morda()
        morda_installed_through_ccp4 = False

    morda_dat_path = os.path.join(os.environ['MRD_DB'], 'home', 'ca_DOM',
                                  '*.dat')
    simbad_dat_path = os.path.join(database, '**', '*.dat')
    morda_dat_files = set(
        [os.path.basename(f) for f in glob.glob(morda_dat_path)])
    simbad_dat_files = set(
        [os.path.basename(f) for f in glob.glob(simbad_dat_path)])
    erroneous_files = {
        "1bbzA_0.dat", "1gt0D_0.dat", "1h3oA_0.dat", "1kskA_1.dat",
        "1l0sA_0.dat"
    }

    def delete_erroneous_files(erroneous_paths):
        for f in erroneous_paths:
            if os.path.isfile(f):
                logger.warning(
                    "File flagged to be erroneous ... " +
                    "removing from database: %s", f)
                os.remove(f)

    erroneous_paths = [
        os.path.join(database, name[1:3], name) for name in erroneous_files
    ]
    delete_erroneous_files(erroneous_paths)

    dat_files = list(morda_dat_files - simbad_dat_files - erroneous_files)
    if len(dat_files) < 1:
        logger.info('SIMBAD database up-to-date')
        if not morda_installed_through_ccp4:
            shutil.rmtree(os.environ["MRD_DB"])
        leave_timestamp(os.path.join(database, 'simbad_morda.txt'))
        return
    else:
        logger.info(
            "%d new entries were found in the MoRDa database, " +
            "updating SIMBAD database", len(dat_files))

    exe = os.path.join(os.environ["MRD_PROG"], "get_model")

    run_dir = tmp_dir(directory=os.getcwd())

    # Submit in chunks so we don't take up too much disk space
    # and can terminate without losing the processed data
    total_chunk_cycles = len(dat_files) // chunk_size + (len(dat_files) % chunk_size > 0)
    for cycle, i in enumerate(range(0, len(dat_files), chunk_size)):
        logger.info("Working on chunk %d out of %d", cycle + 1,
                    total_chunk_cycles)
        chunk_dat_files = dat_files[i:i + chunk_size]

        # Create the database files
        files = []
        collector = ScriptCollector(None)
        for f in chunk_dat_files:
            code = os.path.basename(f).rsplit('.', 1)[0]
            final_file = os.path.join(database, code[1:3], code + ".dat")
            # We need a temporary directory within because "get_model" uses non-unique file names
            tmp_d = tmp_dir(directory=run_dir)
            get_model_output = os.path.join(tmp_d, code + ".pdb")
            cmd = [["export CCP4_SCR=" + tmp_d],
                   ["export MRD_DB=" + os.environ['MRD_DB']], ["cd", tmp_d],
                   [exe, "-c", code, "-m", "d"]]
            script = Script(directory=tmp_d)
            for c in cmd:
                script.append(' '.join(map(str, c)))
            collector.add(script)
            log = script.path.rsplit('.', 1)[0] + '.log'
            files += [(script.path, log, tmp_d,
                       (get_model_output, final_file))]

        scripts, _, tmps, files = zip(*files)

        submit_chunk(collector=collector,
                     run_dir=os.getcwd(),
                     nproc=nproc,
                     job_name='morda_db',
                     submit_qtype=submit_qtype,
                     submit_queue=submit_queue,
                     permit_nonzero=True,
                     monitor=None,
                     success_func=None)

        sub_dir_names = set([
            os.path.basename(f).rsplit('.', 1)[0][1:3] for f in chunk_dat_files
        ])
        for sub_dir_name in sub_dir_names:
            sub_dir = os.path.join(database, sub_dir_name)
            if os.path.isdir(sub_dir):
                continue
            os.makedirs(sub_dir)

        for output, final in files:
            if os.path.isfile(output):
                simbad.db.convert_pdb_to_dat(output, final)
            else:
                logger.critical("File missing: {}".format(output))

        for d in tmps:
            shutil.rmtree(d)

    shutil.rmtree(run_dir)
    if not morda_installed_through_ccp4:
        shutil.rmtree(os.environ["MRD_DB"])

    validate_compressed_database(database)
    leave_timestamp(os.path.join(database, 'simbad_morda.txt'))
Example #20
 def test_3(self):
     script = pytest.helpers.get_py_script(0, 1)
     sc = ScriptCollector(script)
     assert sc.scripts == [script]
Example #21
 def test_2(self):
     sc = ScriptCollector(None)
     assert sc.scripts == []
Example #22
 def test_1(self):
     sc = ScriptCollector([])
     assert sc.scripts == []
Example #23
 def test_13(self):
     container = ScriptCollector(pytest.helpers.get_py_script(10, 1))
     task = MockTask(container)
     task.add_script([pytest.helpers.get_py_script(i, 1) for i in range(5)])
     assert len(task.script_collector) == 6
Example #24
def create_contaminant_db(database,
                          add_morda_domains,
                          nproc=2,
                          submit_qtype=None,
                          submit_queue=False):
    """Create a contaminant database

    Parameters
    ----------
    database : str
        The path to the database folder
    add_morda_domains : bool
        Retrospectively add morda domains to a contaminant database updated when morda was not installed
    nproc : int, optional
        The number of processors [default: 2]
    submit_qtype : str
        The cluster submission queue type - currently support SGE and LSF
    submit_queue : str
        The queue to submit to on the cluster

    Raises
    ------
    RuntimeError
        dimple.contaminants.prepare module not available
    RuntimeError
       Windows is currently not supported
    """
    if not is_valid_db_location(database):
        raise RuntimeError("Permission denied! Cannot write to {}!".format(
            os.path.dirname(database)))

    import dimple.main
    logger.info('DIMPLE version: %s', dimple.main.__version__)

    if StrictVersion(dimple.main.__version__) < StrictVersion('2.5.7'):
        msg = "This feature will be available with dimple version 2.5.7"
        raise RuntimeError(msg)

    if CUSTOM_PLATFORM == "windows":
        msg = "Windows is currently not supported"
        raise RuntimeError(msg)

    import dimple.contaminants.prepare

    dimple.contaminants.prepare.main(verbose=False)

    simbad_dat_path = os.path.join(database, '*', '*', '*', '*.dat')
    existing_dat_files = [
        os.path.basename(f).split('.')[0].lower()
        for f in glob.iglob(simbad_dat_path)
    ]
    erroneous_files = ['4v43']
    dimple_files = ['cached', 'data.json', 'data.py']

    with open("data.json") as data_file:
        data = json.load(data_file)

    results = []
    for child in data["children"]:
        try:
            for child_2 in child["children"]:
                space_group = child_2["name"].replace(" ", "")
                for child_3 in child_2["children"]:
                    pdb_code = child_3["name"].split()[0].lower()
                    if (pdb_code in existing_dat_files or pdb_code
                            in erroneous_files) and not add_morda_domains:
                        continue
                    uniprot_name = child["name"]
                    uniprot_mnemonic = uniprot_name.split('_')[1]
                    score = ContaminantSearchResult(pdb_code, space_group,
                                                    uniprot_name,
                                                    uniprot_mnemonic)
                    results.append(score)
        except KeyError:
            pass

    if len(results) == 0:
        logger.info("Contaminant database up to date")
    else:
        if add_morda_domains:
            logger.info("Adding morda domains to contaminant database")
        else:
            logger.info(
                "%d new entries were found in the contaminant database, " +
                "updating SIMBAD database", len(results))

        if "MRD_DB" in os.environ:
            morda_installed_through_ccp4 = True
        else:
            morda_installed_through_ccp4 = False

        if add_morda_domains and not morda_installed_through_ccp4:
            logger.critical(
                "Morda not installed locally, unable to add morda domains to contaminant database"
            )

        if morda_installed_through_ccp4:
            morda_dat_path = os.path.join(os.environ['MRD_DB'], 'home',
                                          'ca_DOM', '*.dat')
            morda_dat_files = set(
                [os.path.basename(f) for f in glob.iglob(morda_dat_path)])
            exe = os.path.join(os.environ['MRD_PROG'], "get_model")
        else:
            logger.info(
                "Morda not installed locally, therefore morda domains will not be added to contaminant database"
            )

        files = []
        collector = ScriptCollector(None)
        for result in results:
            stem = os.path.join(os.getcwd(), database, result.uniprot_mnemonic,
                                result.uniprot_name, result.space_group)
            if not os.path.exists(stem):
                os.makedirs(stem)

            content = PdbStructure.get_pdb_content(result.pdb_code)
            if content is None:
                logger.debug(
                    "Encountered a problem downloading PDB %s - skipping entry",
                    result.pdb_code)
            else:
                dat_content = simbad.db._str_to_dat(content)
                with open(os.path.join(stem, result.pdb_code + ".dat"),
                          "w") as f_out:
                    f_out.write(dat_content)

                if not simbad.db.is_valid_dat(
                        os.path.join(stem, result.pdb_code + ".dat")):
                    logger.debug("Unable to convert %s to dat file",
                                 result.pdb_code)

            if morda_installed_through_ccp4:
                for dat_file in morda_dat_files:
                    if result.pdb_code.lower() == dat_file[0:4]:
                        stem = os.path.join(database, result.uniprot_mnemonic,
                                            result.uniprot_name,
                                            result.space_group, "morda")
                        if not os.path.exists(stem):
                            os.makedirs(stem)
                        code = dat_file.rsplit('.', 1)[0]
                        final_file = os.path.join(stem, dat_file)
                        tmp_d = tmp_dir(directory=os.getcwd())
                        get_model_output = os.path.join(tmp_d, code + ".pdb")
                        cmd = [["export CCP4_SCR=", tmp_d], ["cd", tmp_d],
                               [exe, "-c", code, "-m", "d"]]
                        script = Script(directory=tmp_d)
                        for c in cmd:
                            script.append(' '.join(map(str, c)))
                        collector.add(script)
                        log = script.path.rsplit('.', 1)[0] + '.log'
                        files += [(script.path, log, tmp_d, (get_model_output,
                                                             final_file))]

        if len(files) > 0:
            scripts, _, tmps, files = zip(*files)

            submit_chunk(collector=collector,
                         run_dir=os.getcwd(),
                         nproc=nproc,
                         job_name='cont_db',
                         submit_qtype=submit_qtype,
                         submit_queue=submit_queue,
                         permit_nonzero=True,
                         monitor=None,
                         success_func=None)

            for output, final in files:
                if os.path.isfile(output):
                    simbad.db.convert_pdb_to_dat(output, final)
                else:
                    print "File missing: {}".format(output)

            for d in tmps:
                shutil.rmtree(d)

            for f in dimple_files:
                if os.path.isdir(f):
                    shutil.rmtree(f)
                elif os.path.isfile(f):
                    os.remove(f)

    validate_compressed_database(database)
Example #25
 def test_9(self):
     with pytest.raises(IOError):
         ScriptCollector(["test.sh"])