Example #1
class EM4GMM(Component):
    """
    Use `em4gmm <https://github.com/juandavm/em4gmm>`_ for clustering via the
    Expectation Maximisation (EM) algorithm using Gaussian Mixture Models
    (GMMs).

    This component is used to cluster arbitrary lists in the pipeline state
    using GMMs. The list to be clustered is selected via the JMESPath query
    `select_expr`, and the dimensions are chosen via the JMESPath query
    `dimensions_expr`.

    For each sample, the fields ``cluster`` and ``lprob`` are added. The
    ``cluster`` field corresponds to the ``class`` field from the log file
    generated by ``gmmclass``, and the ``lprob`` field is taken verbatim.
    The ``class`` field was renamed to avoid clashing with the Python keyword.

    :param str select_expr: JMESPath expression selecting the list of samples
        to cluster.

    :param str dimensions_expr: JMESPath expression evaluated relative to each
        item in the list returned by `select_expr`, giving the values of each
        dimension.

    The remaining parameters for the component mirror the command-line options
    of `gmmtrain` and `gmmclass`:

    :param str mixture_model: Name of the file used to save the trained
        mixture model (``-m`` option of ``gmmtrain``).

    :param str model_details: Log file name containing details of the model
        (``-r`` option of ``gmmtrain``).

    :param str sample_details: File name used to save details of sample
        classifications (``-r`` option of ``gmmclass``).

    :param int num_components: Optional number of components of the mixture
        (``-n`` option of ``gmmtrain``).

    :param float merge: Optional merge threshold based on similarity (``-u``
        option of ``gmmtrain``).

    :param float stop: Optional stop criterion based on likelihood (``-s``
        option of ``gmmtrain``).

    :param int iterations: Optional maximum number of EM iterations (``-i``
        option of ``gmmtrain``).

    :param int threads: Optional maximum number of threads used (``-t`` option
        of ``gmmtrain`` and ``gmmclass``). Default is 1.

    :param str world_model: Optional world model used for smoothing (``-w``
        option of ``gmmclass``).

    :param str bin_dir: Directory containing the executables ``gmmtrain`` and
        ``gmmclass``. By default, the executables are looked up on the system
        path.
    """
    GMMTRAIN = ExternalTool({
        "samples": "d",
        "mixture_model": "m",
        "model_details": "r",
        "num_components": "n",
        "merge": "u",
        "stop": "s",
        "iterations": "i",
        "threads": "t",
    })

    GMMCLASS = ExternalTool({
        "samples": "d",
        "mixture_model": "m",
        "sample_details": "r",
        "world_model": "w",
        "threads": "t",
    })

    ADDS = ["clusters"]
    REQUIRED = []
    REMOVES = []

    def __init__(self,
                 select_expr,
                 dimensions_expr,
                 mixture_model="model.gmm",
                 model_details="train.log.json",
                 sample_details="classify.log.json",
                 bin_dir=None,
                 **kwargs):
        self.select_expr = select_expr
        self.dimensions_expr = dimensions_expr
        self.bin_dir = bin_dir

        self.gmmtrain_opts = {
            "mixture_model": mixture_model,
            "model_details": model_details,
        }
        for opt in self.GMMTRAIN.flag_map:
            if opt in kwargs:
                self.gmmtrain_opts[opt] = kwargs[opt]

        self.gmmclass_opts = {
            "mixture_model": mixture_model,
            "sample_details": sample_details,
        }
        for opt in self.GMMCLASS.flag_map:
            if opt in kwargs:
                self.gmmclass_opts[opt] = kwargs[opt]

    def run(self, data, config=None, pipeline=None):
        """Run em4gmm for automatic clustering."""

        # Select sample
        jmes_opts = jmespath.Options(custom_functions=JMESExtensions(data))
        sample_list = jmespath.search(self.select_expr, data, jmes_opts)

        # Extract data points
        data_points = [
            jmespath.search(self.dimensions_expr, sample, jmes_opts)
            for sample in sample_list
        ]

        num_samples = len(data_points)
        num_dims = len(data_points[0])

        # Write samples to file
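        # The data file written below is what gmmtrain consumes: a header
        # line giving the number of dimensions and the number of samples,
        # then one whitespace-separated feature vector per line, e.g.
        #
        #   3 2
        #   0.1 0.2 0.3
        #   1.1 1.2 1.3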
        with tempfile.NamedTemporaryFile("w") as sample_file:
            print("{} {}".format(num_dims, num_samples), file=sample_file)
            for sample in data_points:
                print(" ".join([str(i) for i in sample]), file=sample_file)
            sample_file.flush()

            # Run trainer
            gmmtrain_opts = {"samples": sample_file.name}
            gmmtrain_opts.update(self.gmmtrain_opts)
            gmmtrain = self.GMMTRAIN((self.bin_dir, "gmmtrain"),
                                     options=gmmtrain_opts)
            self.logger.debug("Running %s", gmmtrain)
            subprocess.run(gmmtrain, check=True)

            # Run classifier
            gmmclass_opts = {"samples": sample_file.name}
            gmmclass_opts.update(self.gmmclass_opts)
            gmmclass = self.GMMCLASS((self.bin_dir, "gmmclass"),
                                     options=gmmclass_opts)
            self.logger.debug("Running %s", gmmclass)
            subprocess.run(gmmclass, check=True)

        # Parse cluster definitions from trainer log file
        with open(self.gmmtrain_opts["model_details"], "r") as model_in:
            model = json.load(model_in)
            data["clusters"] = model

        # Parse sample data, adding to the samples
        with open(self.gmmclass_opts["sample_details"], "r") as samples_in:
            sample_details = json.load(samples_in)["samples_results"]

            for details in sample_details:
                i = details["sample"]
                sample_list[i]["cluster"] = details["class"]
                sample_list[i]["lprob"] = details["lprob"]
        return data
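
A minimal usage sketch (the pipeline state layout and option values below are illustrative assumptions; running it requires the gmmtrain and gmmclass executables):

# Hypothetical usage: cluster templates by two numeric fields.
clusterer = EM4GMM(
    select_expr="templates",       # JMESPath list of samples to cluster
    dimensions_expr="[x, y]",      # JMESPath multi-select list per sample
    num_components=4,              # forwarded to gmmtrain as "-n 4"
)
state = {"templates": [{"x": 0.1, "y": 0.9}, {"x": 5.2, "y": 4.8}]}
state = clusterer.run(state)
# Each template gains "cluster" and "lprob"; the trained model JSON is
# stored under state["clusters"].
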
Example #2
class ModRefiner(Component):
    """
    Run `ModRefiner <https://zhanglab.ccmb.med.umich.edu/ModRefiner/>`_ to
    refine the backbone and side-chains of a protein model.

    The `reference` model is the model towards which the `initial` model is
    pulled. In I-TASSER, the `reference` model is a cluster *centroid* (the
    average of the structures in that cluster) and the `initial` model is the
    cluster *medoid* (the decoy closest to the centroid). This causes the
    decoy, which has sensible geometry, to be pulled towards the reference
    structure, which has a geometry that is overall more correct but may have
    broken fine structure.

    The parameters `reference` and `initial` are JMESPath expressions that
    should evaluate to the file names of the corresponding structures. The
    refined model is stored in the ``model`` field in the pipeline state.

    :param str reference: JMESPath expression giving the file name of the
        reference model.
    :param str initial: JMESPath expression giving the file name of the
        starting model.
    :param int strength: Strength of the attraction towards the reference
        structure, in the range 0--100.
    :param int seed: Optional random seed.
    :param bool overwrite: If `True`, always run ModRefiner; otherwise,
        any existing output files will be used as-is.
    :param str bin_dir: Directory containing the ModRefiner binaries.
    :param str data_dir: Directory containing the ModRefiner data files.
    """
    REQUIRED = []
    ADDS = ["model"]
    REMOVES = []

    MCREFINEMENT = ExternalTool()
    EMREFINEMENT = ExternalTool()

    CONFIG_SECTION = "modrefiner"

    class ModRefinerException(Exception):
        """
        Raised when initial and reference structures are in different
        directories.
        """
        pass

    def __init__(self,
                 data_dir,
                 reference="model",
                 initial="model",
                 strength=50,
                 seed=None,
                 bin_dir=None,
                 overwrite=False):
        self.reference = reference
        self.initial = initial
        self.strength = strength
        self.seed = seed
        self.bin_dir = bin_dir
        self.data_dir = data_dir
        self.overwrite = overwrite

    def mcrefinement(self, working_dir, initial, reference):
        """Run ``mcrefinement`` to refine main chain."""
        # Called like:
        # ./mcrefinement data_dir bin_dir ini_name ref_name ran_num

        seed = (self.seed if self.seed is not None
                else random.randint(0, 1000000))
        mc_cmd_line = self.MCREFINEMENT(
            (self.bin_dir, "mcrefinement"),
            positional=(working_dir, self.data_dir, initial, reference, seed))
        output_file = Path(working_dir, "mc" + str(initial))
        if self.overwrite or not output_file.exists():
            self.logger.debug("Running %s", mc_cmd_line)

            # Can't just use check=True because mcrefinement will return an
            # exit status of 1 even on success.
            result = subprocess.run(mc_cmd_line)
            if result.returncode not in (0, 1):
                raise subprocess.CalledProcessError(result.returncode,
                                                    mc_cmd_line)
        return output_file

    def emrefinement(self, working_dir, start, reference):
        """Run ``emrefinement``, starting from `start`."""
        # Called like:
        # ./emrefinement data_dir bin_dir ini_name ref_name str_val ran_num

        seed = (self.seed if self.seed is not None
                else random.randint(0, 1000000))
        em_cmd_line = self.EMREFINEMENT(
            (self.bin_dir, "emrefinement"),
            positional=(working_dir, self.data_dir, start, reference,
                        self.strength, seed))
        output_file = Path(working_dir, "em" + str(start))
        if self.overwrite or not output_file.exists():
            self.logger.debug("Running %s", em_cmd_line)
            # Again, manually check for a "valid" return code.
            result = subprocess.run(em_cmd_line)
            if result.returncode not in (0, 1):
                raise subprocess.CalledProcessError(result.returncode,
                                                    em_cmd_line)
        return output_file

    @staticmethod
    def normalise_chains(initial, refined):
        """
        ModRefiner sets the chain of the refined model to " ", which is
        confusing. We reset it to the chain ID of the initial model (assuming
        there is only one chain).
        """
        parser = Bio.PDB.PDBParser(QUIET=True)
        initial_chain = list(
            parser.get_structure("initial", initial)[0].get_chains())[0]

        refined_struc = parser.get_structure("refined", refined)
        refined_struc[0][" "].id = initial_chain.get_id()
        pdb_io = Bio.PDB.PDBIO()
        pdb_io.set_structure(refined_struc)
        pdb_io.save(str(refined))

    def run(self, data, config=None, pipeline=None):
        """Run ModRefiner to refine model."""
        jmes_ext = phyre_engine.tools.jmespath.JMESExtensions(data)
        jmes_opts = jmespath.Options(custom_functions=jmes_ext)

        initial = Path(jmespath.search(self.initial, data, jmes_opts))
        reference = Path(jmespath.search(self.reference, data, jmes_opts))

        # We will call the output files "<basename>.mc.pdb" and
        # "<basename>.em.pdb".  The basename is the basename of the initial
        # structure.
        output_files = (initial.with_suffix(".mc.pdb"),
                        initial.with_suffix(".em.pdb"))

        # Use output files if they exist
        if not self.overwrite and all([i.exists() for i in output_files]):
            data["model"] = str(output_files[1])
            return data

        # Modrefiner expects the initial and reference structures to be in the
        # same directory. This is going to be frustrating for us, so we will
        # symlink them in a temporary directory. This also allows us to
        # normalise the names of the files, which is handy because modrefiner
        # bases the output file name on the input file.
        with tempfile.TemporaryDirectory("-modrefiner",
                                         "phyreengine-") as work_dir:

            initial_link = Path(work_dir, "initial.pdb")
            reference_link = Path(work_dir, "reference.pdb")
            initial_link.symlink_to(initial.resolve())
            reference_link.symlink_to(reference.resolve())

            bb_refined = self.mcrefinement(work_dir, initial_link.name,
                                           reference_link.name)
            sc_refined = self.emrefinement(work_dir, bb_refined.name,
                                           reference_link.name)

            # Copy the output files to the current directory
            shutil.copy2(bb_refined, output_files[0])
            shutil.copy2(sc_refined, output_files[1])

            # Normalise chain IDs for each output file
            self.normalise_chains(initial, output_files[0])
            self.normalise_chains(initial, output_files[1])
            data["model"] = output_files[1]
        return data
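
A sketch of typical usage, assuming the pipeline state stores the centroid and medoid file names under hypothetical keys (any JMESPath expressions evaluating to file names would work):

# Hypothetical usage: pull the cluster medoid towards the centroid.
refiner = ModRefiner(
    data_dir="/opt/modrefiner/data",   # assumed install location
    bin_dir="/opt/modrefiner/bin",
    reference="centroid",              # JMESPath: reference model file
    initial="medoid",                  # JMESPath: starting model file
    strength=50,                       # pull strength, 0-100
)
state = {"centroid": "closc_1.pdb", "medoid": "combo_1.pdb"}
state = refiner.run(state)
# state["model"] now names the side-chain-refined "combo_1.em.pdb".
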
Example #3
    def run(self, data, config=None, pipeline=None):
        """Generating PSSM from MSA."""
        hhconsensus = ExternalTool({
            "input": "i",
            "seqfile": "s",
            "verbose": "v",
        }, long_prefix="-")
        hhfilter = ExternalTool({
            "input": "i",
            "oa3m": "o",
            "verbose": "v",
        }, long_prefix="-")
        reformat = ExternalTool({"no_lower": "r"}, long_prefix="-")
        blastpgp = ExternalTool({
            "database": "d",
            "input": "i",
            "iterations": "j",
            "db_seq_alns": "b",
            "evalue_threshold": "h",
            "input_alignment": "B",
            "checkpoint": "C",
            "ascii_pssm": "Q"
        })
        makemat = ExternalTool({"profile_db": "P"})

        tmpdir = tempfile.mkdtemp("-pssm", "phyreengine-")
        try:
            name, a3m = self.get_vals(data)
            query_seq = self.read_query_seq(a3m, name)

            tmp_a3m = Path(tmpdir, "msa.a3m")
            tmp_seq = Path(tmpdir, "seq.fasta")
            tmp_psi = Path(tmpdir, "msa.psi")
            env = os.environ.copy()
            env["HHLIB"] = self.HHLIB

            if "-" in query_seq:
                # Generate consensus sequence
                command_line = hhconsensus(executable=(self.hhsuite_dir,
                                                       "hhconsensus"),
                                           options={
                                               "input": a3m,
                                               "seqfile": tmp_seq,
                                               "oa3m": tmp_a3m
                                           })
                subprocess.run(command_line, check=True, env=env)
            else:
                # If there are no gaps, just copy the query a3m and write the
                # query sequence.
                shutil.copy2(a3m, str(tmp_a3m))
                with tmp_seq.open("w") as tmp_seq_out:
                    tmp_seq_out.write(">{name}\n{query}\n".format(
                        name=name, query=query_seq))

            # Filter query a3m to desired diversity
            command_line = hhfilter(executable=(self.hhsuite_dir, "hhfilter"),
                                    options={
                                        "neff": 7,
                                        "input": tmp_a3m,
                                        "oa3m": tmp_a3m
                                    })
            subprocess.run(command_line, check=True, env=env)

            # Reformat to PSI-BLAST format
            command_line = reformat(
                (str(Path(self.HHLIB, "scripts")), "reformat.pl"),
                positional=["a3m", "psi", tmp_a3m, tmp_psi],
                flags=["no_lower", "noss"])
            subprocess.run(command_line, check=True, env=env)

            # Generate PSSM using blastpgp
            chk_file = "profile.chk"
            mtx_file = "profile.mtx"
            pssm_file = "profile.pssm"

            dummy_db = Path(self.HHLIB, "data/do_not_delete")
            command_line = blastpgp(executable=(self.blast_dir, "blastpgp"),
                                    options={
                                        "db_seq_alns": 1,
                                        "iterations": 1,
                                        "evalue_threshold": 0.001,
                                        "database": dummy_db,
                                        "input": tmp_seq,
                                        "input_alignment": tmp_psi,
                                        "checkpoint": chk_file,
                                        "ascii_pssm": pssm_file
                                    })
            subprocess.run(command_line, check=True)

            # Build mtx file using makemat

            # First build the profile "databases" for use with makemat. These
            # are just two files with the same prefix and the suffixes ".pn"
            # and ".sn", which contain a list of checkpoint files and the
            # corresponding list of sequences. File names are resolved relative
            # to the directory containing the *.sn and *.pn files, so we will
            # symlink the checkpoint.
            tmp_sn_file = Path(tmpdir, "makemat.sn")
            tmp_pn_file = Path(tmpdir, "makemat.pn")

            with tmp_sn_file.open("w") as sn_out:
                print(str(Path(tmp_seq.name)), file=sn_out)
            with tmp_pn_file.open("w") as pn_out:
                chk_path = Path(chk_file)
                chk_link = Path(tmpdir, "makemat.chk")
                chk_link.symlink_to(chk_path.resolve())
                print(str(chk_link.name), file=pn_out)

            command_line = makemat(
                executable=(self.blast_dir, "makemat"),
                options={"profile_db": str(Path(tmpdir, "makemat"))})
            subprocess.run(command_line, check=True)
            shutil.copy2(str(Path(tmpdir, "makemat.mtx")), mtx_file)

            data["pssm"] = {
                "mtx": mtx_file,
                "chk": chk_file,
                "ascii": pssm_file
            }
            return data

        finally:
            shutil.rmtree(tmpdir)
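
The friendly option names above are translated by each ``ExternalTool``'s ``flag_map``; names missing from the map (such as ``neff`` for ``hhfilter`` or ``noss`` for ``reformat.pl``) appear to be emitted verbatim under the tool's ``long_prefix``. A rough sketch of that assumed translation, not the real ``ExternalTool`` class:

# Sketch of the assumed flag translation done by ExternalTool: mapped
# names become short flags, unmapped names keep the long prefix.
def render_options(flag_map, options, long_prefix="--"):
    argv = []
    for name, value in options.items():
        flag = ("-" + flag_map[name] if name in flag_map
                else long_prefix + name)
        argv += [flag, str(value)]
    return argv

# The hhfilter call above: "input"/"oa3m" are mapped, "neff" is not.
print(render_options({"input": "i", "oa3m": "o"},
                     {"neff": 7, "input": "msa.a3m", "oa3m": "msa.a3m"},
                     long_prefix="-"))
# ['-neff', '7', '-i', 'msa.a3m', '-o', 'msa.a3m']
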
Example #4
class LoopModel(Component):
    """
    Use Alex Herbert's loop modeler to fill in as many gaps as possible.

    .. versionchanged:: 0.1a1

        This component no longer operates on every template at the top
        level of the pipeline state. If you wish to apply the loop modeller
        to a list of templates, call it from within a
        :py:class:`~phyre_engine.component.component.Map` component.

        You may use :py:class:`~phyre_engine.component.jmespath.Update` to
        copy the ``pssm`` and ``query_sequence`` keys from the top level of
        the pipeline state into each template.

    :param str bin_dir: Location of the loop modelling executable.
    :param str config: Loop modeller configuration file.
    :param str executable: Name of the executable to run, under `bin_dir`.
    """

    REQUIRED = ["pssm", "query_sequence", "model"]
    ADDS = []
    REMOVES = []

    LOOP_MODELLER = ExternalTool(
        {
            "config": "c",
            "query": "f",
            "out_dir": "d",
            "model_list": "l",
        },
        long_prefix="-")

    CONFIG_SECTION = "sbg_loop"

    def __init__(self, bin_dir, config, executable="assembler.loop"):
        self.bin_dir = bin_dir
        self.config = config
        self.executable = executable

    def convert_ascii_pssm(self, pssm, output):
        """
        Convert the PSSM generated by blastpgp into the loop modeller format.

        This means chopping off the header and the column with residue IDs.
        """
        with open(pssm, "r") as pssm_in:
            for line in pssm_in:
                if re.match(r"^\s+\d", line):
                    cols = line.split()
                    residue = cols[1]
                    counts = cols[2:22]
                    output.write(residue + " ")
                    for count in counts:
                        output.write("{:4d}".format(int(count)))
                    output.write("\n")
        output.flush()

    def run(self, data, config=None, pipeline=None):
        """Fill short gaps with loop modeller."""
        pssm, sequence, model = self.get_vals(data)

        tmpdir = tempfile.mkdtemp("-loop", "phyreengine-")
        try:
            self.logger.debug("Loop modelling using tmpdir: %s", tmpdir)

            out_dir = Path(model).with_suffix(".loop")

            # Attempt to use existing models if the output directory exists.
            if not out_dir.exists():
                loop_pssm = Path(tmpdir, "loop.pssm")
                query_fasta = Path(tmpdir, "query.fasta")
                model_list = Path(tmpdir, "model.list")

                with model_list.open("w") as model_list_out:
                    print(str(Path(model).resolve()), file=model_list_out)
                with loop_pssm.open("w") as loop_out:
                    self.convert_ascii_pssm(pssm["ascii"], loop_out)
                with query_fasta.open("w") as query_out:
                    fasta_seq = ">model\n{}\n".format(sequence)
                    print(fasta_seq, file=query_out)

                command_line = self.LOOP_MODELLER(executable=(self.bin_dir,
                                                              self.executable),
                                                  options={
                                                      "config": self.config,
                                                      "pssm": loop_pssm,
                                                      "query": query_fasta,
                                                      "model_list": model_list,
                                                      "out_dir": out_dir,
                                                  })
                self.logger.debug("Running %s", command_line)
                subprocess.run(command_line, check=True)

            # Replace "model" field with the first loop model.
            model_path = (out_dir / "model.1" / "model.1.pdb")
            if not model_path.exists():
                err_msg = "Loop-modelled file '{}' does not exist"
                raise FileNotFoundError(err_msg.format(model_path))
            data["model"] = str(model_path)
        finally:
            shutil.rmtree(tmpdir)
        return data
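
``convert_ascii_pssm`` keeps only the residue letter and the 20 log-odds columns of each data row in the blastpgp ASCII PSSM. A small self-contained check (the input row is a hand-made approximation of the blastpgp format, not real output; the constructor paths are assumptions):

import io
import tempfile

# Hand-made approximation of a blastpgp ASCII PSSM: a header line that
# the parser skips, then one data row (index, residue, 20 log-odds).
lm = LoopModel(bin_dir="/opt/sbg", config="loop.config")  # paths assumed
with tempfile.NamedTemporaryFile("w", suffix=".pssm", delete=False) as fh:
    fh.write("Last position-specific scoring matrix computed\n")
    fh.write("    1 M " + " ".join(["-1"] * 20) + "\n")
    pssm_path = fh.name
converted = io.StringIO()
lm.convert_ascii_pssm(pssm_path, converted)
print(converted.getvalue())   # "M " followed by twenty 4-wide integers
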
Example #5
class DSSP(Component):
    """
    Calculate secondary structure state using
    `DSSP <http://swift.cmbi.ru.nl/gv/dssp/>`_.

    This component requires the ``structure`` field to be set, otherwise it has
    no source of tertiary structure from which to calculate the secondary
    structure.

    :param str bin_dir: Directory containing the ``mkdssp`` executable, if it is
        not in the system ``$PATH``.
    """
    CONFIG_SECTION = "dssp"

    REQUIRED = ["structure"]
    ADDS = [SECONDARY_STRUCTURE_KEY]
    REMOVES = []

    TOOL_NAME = "dssp"
    MKDSSP = ExternalTool()

    def __init__(self, bin_dir=None):
        self.bin_dir = bin_dir

    def run(self, data, config=None, pipeline=None):
        """Calculate ``secondary_structure`` key."""
        structure = self.get_vals(data)

        # Create secondary_structure key if it is not present
        if SECONDARY_STRUCTURE_KEY not in data:
            data[SECONDARY_STRUCTURE_KEY] = {}

        # Run DSSP on the structure file and read the output
        mkdssp_cmd_line = self.MKDSSP(
            (self.bin_dir, "mkdssp"),
            options={"input": structure})
        dssp_proc = subprocess.run(
            mkdssp_cmd_line, universal_newlines=True,
            check=True, stdout=subprocess.PIPE)
        dssp_mapping = self.parse_dssp(dssp_proc.stdout.split("\n"))

        data[SECONDARY_STRUCTURE_KEY][self.TOOL_NAME] = dssp_mapping
        return data

    @staticmethod
    def parse_dssp(dssp_lines):
        """
        Parse lines of output from DSSP.

        :return: List of tuples containing the residue ID and secondary
            structure state.
        :rtype: list[tuple(int, str)]
        """
        residue_section = False
        dssp_mapping = []
        for line in dssp_lines:
            if line.startswith("  #  RESIDUE AA"):
                residue_section = True
            elif residue_section and len(line) > 17:
                res_id = line[5:10]
                aa_type = line[13]
                sec_struc = line[16]

                # Ignore missing residues.
                if aa_type == '!':
                    continue
                # Use "C" for coils.
                if sec_struc == ' ':
                    sec_struc = 'C'

                residue_ss = {
                    "assigned": sec_struc,
                    "confidence": {},
                    "res_id": int(res_id)}
                for state in EightStateSS:
                    confidence = 1.0 if state.value == sec_struc else 0.0
                    residue_ss["confidence"][state.value] = confidence
                dssp_mapping.append(residue_ss)
        return dssp_mapping
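
``parse_dssp`` keys on fixed columns of the DSSP output (residue number in columns 5-9, amino acid at column 13, secondary structure at column 16). A tiny synthetic fragment, hand-made rather than real ``mkdssp`` output, shows the behaviour:

# Synthetic DSSP fragment: header marker, a helix residue, and a residue
# with a blank state that is normalised to "C" (coil).
dssp_lines = [
    "  #  RESIDUE AA STRUCTURE BP1 BP2  ACC",
    "    1    1 A M  H              0   0  100",
    "    2    2 A K                 0   0  100",
]
for entry in DSSP.parse_dssp(dssp_lines):
    print(entry["res_id"], entry["assigned"])
# 1 H
# 2 C
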
Example #6
class Scwrl4(Component):
    """
    Run `SCWRL4 <http://dunbrack.fccc.edu/scwrl4/>`_ to reconstruct
    the side-chains of a model.

    :param str bin_dir: Directory containing the ``scwrl4`` executable.
    :param bool overwrite: If `True`, always run ``scwrl4``; otherwise,
        an existing output file will be used as-is.
    """
    ADDS = []
    REMOVES = []
    REQUIRED = ["model"]

    SCWRL4 = ExternalTool(
        flag_map={
            "input": "i",
            "output": "o",
            "sequence": "s",
            "parameters": "p",
            "frame": "f",
            "graph": "g",
            "workspace": "w",
            "symmetry": "%",
            "crystal": "#",
            "disable_subrotamers": "v",
            "omit_hydrogens": "h",
            "disable_terminal_capping": "t",
        })

    CONFIG_SECTION = "scwrl4"

    def __init__(self, bin_dir=None, overwrite=False):
        self.bin_dir = bin_dir
        self.overwrite = overwrite

    def run(self, data, config=None, pipeline=None):
        """Run SCWRL4 to reconstruct side-chains."""
        model = self.get_vals(data)
        outfile = Path(model).with_suffix(".scwrl4.pdb")

        if not outfile.exists() or self.overwrite:
            command_line = self.SCWRL4((self.bin_dir, "Scwrl4"),
                                       options={
                                           "input": model,
                                           "output": outfile
                                       })
            program = subprocess.run(command_line,
                                     check=True,
                                     universal_newlines=True,
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE)

            # Scwrl doesn't always set a sensible exit value, so if we see
            # "^Err$" on standard output, treat it as an error.
            if re.search("^Err$", program.stdout, re.MULTILINE):
                self.logger.error(("Error running SCWRL4. Command line: %s\n"
                                   "Standard output: %s\n"
                                   "Standard error: %s\n"), command_line,
                                  program.stdout, program.stderr)
                raise subprocess.CalledProcessError(program.returncode,
                                                    command_line,
                                                    program.stdout,
                                                    program.stderr)
        data["model"] = str(outfile)
        return data
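
Usage is a one-liner once the pipeline state holds a ``model`` path; the file name below is hypothetical:

# Hypothetical usage: rebuild side-chains on an existing model.
scwrl = Scwrl4(bin_dir="/opt/scwrl4")
state = scwrl.run({"model": "model.1.pdb"})
print(state["model"])   # "model.1.scwrl4.pdb" (reused on later runs)
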
Example #7
class MobiDBLite(Component):
    """
    `MobiDB lite <http://protein.bio.unipd.it/mobidblite/>`_ is a meta-predictor
    of disorder. It combines nine fast predictors to quickly produce a consensus
    prediction.

    :param str bin_dir: Directory containing the ``mobidb-lite.py`` script.
    :param str supporting_bin_dir: Root directory of the binaries used by MobiDB
        lite (i.e. the directory passed via the ``--binDirectory`` option).
    """
    REQUIRED = ["sequence"]
    ADDS = [DISORDER_KEY]
    REMOVES = []

    TOOL_NAME = "mobidb-lite"
    EXECUTABLE_NAME = "mobidb-lite.py"

    MOBIDB_TOOL = ExternalTool()
    MOBIDB_DEFAULT_ARGS = {"threads": 1}
    MOBIDB_DEFAULT_FLAGS = {"longOutput"}

    def __init__(self, bin_dir, supporting_bin_dir):
        self.bin_dir = bin_dir
        self.supporting_bin_dir = supporting_bin_dir

    @staticmethod
    def parse_results(mobidb_output):
        """Parse long MobiDB lite result string."""
        mdb_results = json.loads(mobidb_output)
        disorder = []
        for state, prob in zip(mdb_results["consensus"], mdb_results["p"]):
            disorder.append({
                "assigned": state,
                "confidence": {
                    DisorderStates.DISORDERED.value: prob,
                    DisorderStates.STRUCTURED.value: 1 - prob
                }
            })
        return disorder

    def run(self, data, config=None, pipeline=None):
        """Run MobiDB lite to predict disorder."""
        sequence = self.get_vals(data)

        # Add disorder key if it's not present
        if DISORDER_KEY not in data:
            data[DISORDER_KEY] = {}

        # Write sequence to temp file and run mobidb-lite
        with tempfile.NamedTemporaryFile("w") as seq_file:
            print(">query", file=seq_file)
            print(sequence, file=seq_file)
            seq_file.flush()

            mobidb_options = self.MOBIDB_DEFAULT_ARGS.copy()
            mobidb_options["binDirectory"] = self.supporting_bin_dir
            command_line = self.MOBIDB_TOOL(
                (self.bin_dir, self.EXECUTABLE_NAME),
                flags=self.MOBIDB_DEFAULT_FLAGS,
                options=mobidb_options,
                positional=[seq_file.name])

            self.logger.info("Running '%s'", command_line)
            mobidb_proc = subprocess.run(command_line,
                                         stdout=subprocess.PIPE,
                                         check=True,
                                         universal_newlines=True)

            # MobiDB will rudely return no output when no disordered regions
            # are found. In those cases, we just don't add the mobidb-lite
            # key to the disorder predictions.
            if mobidb_proc.stdout.strip():
                disorder = self.parse_results(mobidb_proc.stdout)
                data[DISORDER_KEY][self.TOOL_NAME] = disorder
        return data
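
``parse_results`` reads the long-output JSON, pairing the consensus string with per-residue probabilities. A hand-made record (the field names ``consensus`` and ``p`` are taken from the parser above, not from MobiDB documentation):

import json

# Hand-made long-output record with the two fields the parser reads.
fake_output = json.dumps({"consensus": "DS", "p": [0.9, 0.2]})
for residue in MobiDBLite.parse_results(fake_output):
    print(residue["assigned"], residue["confidence"])
# D {'D': 0.9, 'S': 0.1}   (keys are DisorderStates values; floats approx.)
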
Example #8
class Disopred(Component):
    """
    Run `DISOPRED <http://bioinf.cs.ucl.ac.uk/psipred/?disopred=1>`_ to predict
    disordered regions of protein structure.

    This component takes a shortcut compared to the stock ``run_disopred.pl``
    script supplied with DISOPRED: it will use an existing ``mtx`` file
    generated, for example, by :py:class:`phyre_engine.component.hhsuite.PSSM`.
    This component also does not predict binding sites using ProtBind.

    :param str data_dir: Directory containing DISOPRED data files.
    :param str dso_lib_dir: Directory containing DISOPRED library files.
    :param str bin_dir: Directory containing the DISOPRED executables.
    :param bool overwrite: If `True`, always overwrite existing DISOPRED results
        with a new run. Otherwise, existing results will be used as-is.
    """

    REQUIRED = ["pssm"]
    ADDS = ["disorder"]
    REMOVES = []

    CONFIG_SECTION = "disopred"

    #: Adjustable DISOPRED2 false positive rate, from 1 to 10.
    DISOPRED2_FPR = 5

    DISOPRED2 = ExternalTool()
    DISO_NEU_NET = ExternalTool()
    DISO_NEIGHB = ExternalTool()
    COMBINE = ExternalTool()

    def __init__(self, data_dir, dso_lib_dir, bin_dir=None, overwrite=False):
        self.data_dir = data_dir
        self.dso_lib_dir = dso_lib_dir
        self.bin_dir = bin_dir
        self.overwrite = overwrite

    @staticmethod
    def parse_results(diso_in):
        """
        Parse disopred output file into the format described in
        :py:mod:`.disorder`.

        The disopred format looks like this:

        .. code-block:: none

            #         ----- DISOPRED version 3.1 -----
            # Disordered residues are marked with asterisks (*)
            #    Ordered residues are marked with dots (.)
                1 M * 0.78
                2 K * 0.62
                3 T . 0.45
                4 A . 0.37
                5 Y . 0.20

        We parse this into the following list:

        .. code-block:: python

            [
                {"assigned": "D", "confidence": {"S": 0.22, "D": 0.78}},
                {"assigned": "D", "confidence": {"S": 0.38, "D": 0.62}},
                {"assigned": "S", "confidence": {"S": 0.55, "D": 0.45}},
                {"assigned": "S", "confidence": {"S": 0.63, "D": 0.37}},
                {"assigned": "S", "confidence": {"S": 0.80, "D": 0.20}},
            ]

        :param file diso_in: File handle pointing to DISOPRED output.
        """
        disorder = []
        for line in diso_in:
            line = line.strip()
            if line.startswith("#"):
                continue
            _index, _aa, state, score = line.split()

            if state == "*":
                state = DisorderStates.DISORDERED
            else:
                state = DisorderStates.STRUCTURED
            score = float(score)

            disorder.append({
                "assigned": state.value,
                "confidence": {
                    DisorderStates.DISORDERED.value: score,
                    DisorderStates.STRUCTURED.value: 1 - score
                }
            })
        return disorder

    def _run_disopred(self, mtx_file, disopred_results):
        """Run each tool in the disopred pipeline."""
        with tempfile.TemporaryDirectory() as tmpdir:
            tmpdir = pathlib.Path(tmpdir)

            # $args = join ' ', "$EXE_DIR/disopred2", join('/', $out_dir, $base), $mtx_fn, "$DATA_DIR/", $DISO2_FPR, "\n";
            # system($args) == 0 or die "[$0] ERROR: $args failed. Please report error to psipred\@cs.ucl.ac.uk\n";
            disopred2_results = tmpdir / "disopred2"
            self.logger.info("Predicting disorder with DISOPRED2.")
            disopred2_cmd = self.DISOPRED2(
                (self.bin_dir, "disopred2"),
                positional=(disopred2_results, mtx_file, self.data_dir + "/",
                            self.DISOPRED2_FPR))
            self.logger.debug("Running %s", disopred2_cmd)
            subprocess.run(disopred2_cmd, check=True)

            # $args = join ' ', "$EXE_DIR/diso_neu_net", "$DATA_DIR/weights.dat.nmr_nonpdb", $mtx_fn, ">", $nndiso_fn, "\n";
            # system($args) == 0 or die "[$0] ERROR: $args failed. Please report error to psipred\@cs.ucl.ac.uk";
            diso_neu_net_results = tmpdir / "diso_neu_net"
            with diso_neu_net_results.open("wb") as neu_net_out:
                self.logger.info("Running neural network classifier.")
                diso_neu_net_cmd = self.DISO_NEU_NET(
                    (self.bin_dir, "diso_neu_net"),
                    positional=(pathlib.Path(self.data_dir,
                                             "weights.dat.nmr_nonpdb"),
                                mtx_file))
                self.logger.debug("Running %s > %s", diso_neu_net_cmd,
                                  diso_neu_net_results)
                subprocess.run(diso_neu_net_cmd,
                               stdout=neu_net_out,
                               check=True)

            # $args = join ' ', "$EXE_DIR/diso_neighb", $mtx_fn, "$DATA_DIR/dso.lst", ">", $dnb_fn, "\n";
            # system($args) == 0 or die "[$0] ERROR: $args failed. Please report error to psipred\@cs.ucl.ac.uk\n";
            diso_neighb_results = tmpdir / "diso_neighb"
            with diso_neighb_results.open("wb") as neighb_out:
                self.logger.info("Running nearest neighbour classifier.")
                diso_neighb_cmd = self.DISO_NEIGHB(
                    (self.bin_dir, "diso_neighb"),
                    positional=(mtx_file, pathlib.Path(self.data_dir,
                                                       "dso.lst")))
                self.logger.debug("Running %s > %s", diso_neighb_cmd,
                                  diso_neighb_results)
                environment = dict(os.environ)
                environment["DSO_LIB_PATH"] = str(self.dso_lib_dir) + "/"
                subprocess.run(diso_neighb_cmd,
                               stdout=neighb_out,
                               check=True,
                               env=environment)

            # $args = join ' ', "$EXE_DIR/combine", "$DATA_DIR/weights_comb.dat", $diso2_fn, $nndiso_fn, $dnb_fn, ">", $diso3_fn, "\n";
            # system($args) == 0 or die "[$0] ERROR: $args failed. Please report error to psipred\@cs.ucl.ac.uk";
            with disopred_results.open("wb") as diso_out:
                self.logger.info("Combining disordered residue predictions.")
                combine_cmd = self.COMBINE(
                    (self.bin_dir, "combine"),
                    positional=(
                        pathlib.Path(self.data_dir, "weights_comb.dat"),
                        # Add ".diso" suffix to disopred2 output
                        str(disopred2_results) + ".diso",
                        diso_neu_net_results,
                        diso_neighb_results))
                self.logger.debug("Running %s > %s", combine_cmd,
                                  disopred_results)
                subprocess.run(combine_cmd, stdout=diso_out, check=True)

    def run(self, data, config=None, pipeline=None):
        """Run DISOPRED to predict disorder."""
        pssms = self.get_vals(data)
        mtx_file = pssms["mtx"]
        disopred_results = pathlib.Path("disorder.diso")

        if not disopred_results.exists() or self.overwrite:
            self._run_disopred(mtx_file, disopred_results)

        with disopred_results.open("r") as diso_in:
            disorder = self.parse_results(diso_in)
        if "disorder" not in data:
            data["disorder"] = {}
        data["disorder"]["disopred"] = disorder
        return data
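
A sketch of how this component might be driven, assuming an ``mtx`` file already produced by an earlier PSSM step; all paths here are hypothetical, and the run writes ``disorder.diso`` in the working directory:

# Hypothetical usage: predict disorder from an existing .mtx file.
disopred = Disopred(
    data_dir="/opt/disopred/data",
    dso_lib_dir="/opt/disopred/dso_lib",
    bin_dir="/opt/disopred/bin",
)
state = disopred.run({"pssm": {"mtx": "profile.mtx"}})
# state["disorder"]["disopred"] holds per-residue records of the form
# {"assigned": "D"|"S", "confidence": {"D": p, "S": 1 - p}}.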