class EM4GMM(Component):
    """
    Use `em4gmm <https://github.com/juandavm/em4gmm>`_ for clustering via the
    Expectation Maximisation (EM) algorithm using Gaussian Mixture Models
    (GMMs).

    This component is used to cluster arbitrary lists in the pipeline state
    using GMMs. The list to be clustered is selected via the JMESPath query
    `select_expr`, and the dimensions are chosen via the JMESPath query
    `dimensions_expr`.

    For each sample, the fields ``cluster`` and ``lprob`` are added. The
    ``cluster`` field corresponds to the ``class`` field from the log file
    generated by ``gmmclass``, and the ``lprob`` field is taken verbatim. The
    ``class`` field was renamed to avoid clashing with the Python ``class``
    keyword.

    :param str select_expr: JMESPath expression selecting the list of samples
        to cluster.

    :param str dimensions_expr: JMESPath expression evaluated relative to
        each item in the list returned by `select_expr`, giving the values of
        each dimension.

    The remaining parameters for the component mirror the command line
    options of `gmmtrain` and `gmmclass`:

    :param str mixture_model: Name of the file used to save the trained
        mixture model (``-m`` option of ``gmmtrain``).

    :param str model_details: Log file name containing details of the model
        (``-r`` option of ``gmmtrain``).

    :param str sample_details: File name used to save details of sample
        classifications (``-r`` option of ``gmmclass``).

    :param int num_components: Optional number of components of the mixture
        (``-n`` option of ``gmmtrain``).

    :param float merge: Optional merge threshold based on similarity (``-u``
        option of ``gmmtrain``).

    :param float stop: Optional stop criterion based on likelihood (``-s``
        option of ``gmmtrain``).

    :param int iterations: Optional maximum number of EM iterations (``-i``
        option of ``gmmtrain``).

    :param int threads: Optional maximum number of threads used (``-t``
        option of ``gmmtrain`` and ``gmmclass``). Default is 1.

    :param str world_model: Optional world model used for smoothing (``-w``
        option of ``gmmclass``).

    :param str bin_dir: Directory containing the executables ``gmmtrain``
        and ``gmmclass``. By default, the executables are looked up on the
        system path.
    """
""" GMMTRAIN = ExternalTool({ "samples": "d", "mixture_model": "m", "model_details": "r", "num_components": "n", "merge": "u", "stop": "s", "iterations": "i", "threads": "t", }) GMMCLASS = ExternalTool({ "samples": "d", "mixture_model": "m", "sample_details": "r", "world_model": "w", "threads": "t", }) ADDS = ["clusters"] REQUIRED = [] REMOVES = [] def __init__(self, select_expr, dimensions_expr, mixture_model="model.gmm", model_details="train.log.json", sample_details="classify.log.json", bin_dir=None, **kwargs): self.select_expr = select_expr self.dimensions_expr = dimensions_expr self.bin_dir = bin_dir self.gmmtrain_opts = { "mixture_model": mixture_model, "model_details": model_details, } for opt in self.GMMTRAIN.flag_map: if opt in kwargs: self.gmmtrain_opts[opt] = kwargs[opt] self.gmmclass_opts = { "mixture_model": mixture_model, "sample_details": sample_details, } for opt in self.GMMCLASS.flag_map: if opt in kwargs: self.gmmclass_opts[opt] = kwargs[opt] def run(self, data, config=None, pipeline=None): """Run em4gmm for automatic clustering.""" # Select sample jmes_opts = jmespath.Options(custom_functions=JMESExtensions(data)) sample_list = jmespath.search(self.select_expr, data, jmes_opts) # Extract data points data_points = [ jmespath.search(self.dimensions_expr, sample, jmes_opts) for sample in sample_list ] num_samples = len(data_points) num_dims = len(data_points[0]) # Write samples to file with tempfile.NamedTemporaryFile("w") as sample_file: print("{} {}".format(num_dims, num_samples), file=sample_file) for sample in data_points: print(" ".join([str(i) for i in sample]), file=sample_file) sample_file.flush() # Run trainer gmmtrain_opts = {"samples": sample_file.name} gmmtrain_opts.update(self.gmmtrain_opts) gmmtrain = self.GMMTRAIN((self.bin_dir, "gmmtrain"), options=gmmtrain_opts) self.logger.debug("Running %s", gmmtrain) subprocess.run(gmmtrain, check=True) # Run classifier gmmclass_opts = {"samples": sample_file.name} gmmclass_opts.update(self.gmmclass_opts) gmmclass = self.GMMCLASS((self.bin_dir, "gmmclass"), options=gmmclass_opts) self.logger.debug("Running %s", gmmclass) subprocess.run(gmmclass, check=True) # Parse cluster definitions from trainer log file with open(self.gmmtrain_opts["model_details"], "r") as model_in: model = json.load(model_in) data["clusters"] = model # Parse sample data, adding to the samples with open(self.gmmclass_opts["sample_details"], "r") as samples_in: sample_details = json.load(samples_in)["samples_results"] for details in sample_details: i = details["sample"] sample_list[i]["cluster"] = details["class"] sample_list[i]["lprob"] = details["lprob"] return data
class ModRefiner(Component):
    """
    Run `ModRefiner <https://zhanglab.ccmb.med.umich.edu/ModRefiner/>`_ to
    refine the backbone and side-chains of a protein model.

    The `reference` model is the model towards which the `initial` model is
    pulled. In I-TASSER, the `reference` model is a cluster *centroid* (the
    average of the structures in that cluster) and the `initial` model is
    the cluster *medoid* (the decoy closest to the centroid). This causes
    the decoy, which has sensible geometry, to be pulled towards the
    reference structure, which has a geometry that is overall more correct
    but may have broken fine structure.

    The parameters `reference` and `initial` are JMESPath expressions that
    should evaluate to the file names of the corresponding structures. The
    refined model is stored in the ``model`` field in the pipeline state.

    :param str reference: JMESPath expression giving the file name of the
        reference model.

    :param str initial: JMESPath expression giving the file name of the
        starting model.

    :param int strength: Strength of the attraction towards the reference
        structure, in the range 0--100.

    :param int seed: Optional random seed.

    :param bool overwrite: If `True`, always run ModRefiner; otherwise, any
        existing output files will be used as-is.

    :param str bin_dir: Directory containing the ModRefiner binaries.

    :param str data_dir: Directory containing the ModRefiner data files.
    """
    REQUIRED = []
    ADDS = ["model"]
    REMOVES = []

    MCREFINEMENT = ExternalTool()
    EMREFINEMENT = ExternalTool()

    CONFIG_SECTION = "modrefiner"

    class ModRefinerException(Exception):
        """
        Raised when initial and reference structures are in different
        directories.
        """
        pass

    def __init__(self, data_dir, reference="model", initial="model",
                 strength=50, seed=None, bin_dir=None, overwrite=False):
        self.reference = reference
        self.initial = initial
        self.strength = strength
        self.seed = seed
        self.bin_dir = bin_dir
        self.data_dir = data_dir
        self.overwrite = overwrite

    def mcrefinement(self, working_dir, initial, reference):
        """Run ``mcrefinement`` to refine the main chain."""
        # Called like:
        # ./mcrefinement data_dir bin_dir ini_name ref_name ran_num
        mc_cmd_line = self.MCREFINEMENT(
            (self.bin_dir, "mcrefinement"),
            positional=(working_dir, self.data_dir, initial, reference,
                        self.seed if self.seed
                        else random.randint(0, 1000000)))
        output_file = Path(working_dir, "mc" + str(initial))
        if self.overwrite or not output_file.exists():
            self.logger.debug("Running %s", mc_cmd_line)
            # Can't just use check=True because mcrefinement will return an
            # exit status of 1 even on success.
            result = subprocess.run(mc_cmd_line)
            if result.returncode not in (0, 1):
                raise subprocess.CalledProcessError(result.returncode,
                                                    mc_cmd_line)
        return output_file

    def emrefinement(self, working_dir, start, reference):
        """Run ``emrefinement``, starting from `start`."""
        # Called like:
        # ./emrefinement data_dir bin_dir ini_name ref_name str_val ran_num
        em_cmd_line = self.EMREFINEMENT(
            (self.bin_dir, "emrefinement"),
            positional=(working_dir, self.data_dir, start, reference,
                        self.strength,
                        self.seed if self.seed
                        else random.randint(0, 1000000)))
        output_file = Path(working_dir, "em" + str(start))
        if self.overwrite or not output_file.exists():
            self.logger.debug("Running %s", em_cmd_line)
            # Again, manually check for a "valid" return code.
            result = subprocess.run(em_cmd_line)
            if result.returncode not in (0, 1):
                raise subprocess.CalledProcessError(result.returncode,
                                                    em_cmd_line)
        return output_file

    @staticmethod
    def normalise_chains(initial, refined):
        """
        ModRefiner sets the chain of the refined model to " ", which is
        confusing. We reset it to the chain ID of the initial model
        (assuming there is only one chain).
        """
        parser = Bio.PDB.PDBParser(QUIET=True)
        initial_chain = list(
            parser.get_structure("initial", initial)[0].get_chains())[0]
        refined_struc = parser.get_structure("refined", refined)
        refined_struc[0][" "].id = initial_chain.get_id()
        pdb_io = Bio.PDB.PDBIO()
        pdb_io.set_structure(refined_struc)
        pdb_io.save(str(refined))

    def run(self, data, config=None, pipeline=None):
        """Run ModRefiner to refine a model."""
        jmes_ext = phyre_engine.tools.jmespath.JMESExtensions(data)
        jmes_opts = jmespath.Options(custom_functions=jmes_ext)
        initial = Path(jmespath.search(self.initial, data, jmes_opts))
        reference = Path(jmespath.search(self.reference, data, jmes_opts))

        # We will call the output files "<basename>.mc.pdb" and
        # "<basename>.em.pdb". The basename is the basename of the initial
        # structure.
        output_files = (initial.with_suffix(".mc.pdb"),
                        initial.with_suffix(".em.pdb"))

        # Use existing output files if they exist
        if not self.overwrite and all([i.exists() for i in output_files]):
            data["model"] = str(output_files[1])
            return data

        # ModRefiner expects the initial and reference structures to be in
        # the same directory. This is going to be frustrating for us, so we
        # will symlink them in a temporary directory. This also allows us to
        # normalise the names of the files, which is handy because ModRefiner
        # bases the output file name on the input file.
        with tempfile.TemporaryDirectory("-modrefiner",
                                         "phyreengine-") as work_dir:
            initial_link = Path(work_dir, "initial.pdb")
            reference_link = Path(work_dir, "reference.pdb")
            initial_link.symlink_to(initial.resolve())
            reference_link.symlink_to(reference.resolve())

            bb_refined = self.mcrefinement(work_dir, initial_link.name,
                                           reference_link.name)
            sc_refined = self.emrefinement(work_dir, bb_refined.name,
                                           reference_link.name)

            # Copy the output files to the current directory
            shutil.copy2(bb_refined, output_files[0])
            shutil.copy2(sc_refined, output_files[1])

        # Normalise chain IDs for each output file
        self.normalise_chains(initial, output_files[0])
        self.normalise_chains(initial, output_files[1])

        data["model"] = str(output_files[1])
        return data
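# Illustrative sketch (comment only). Both `reference` and `initial` are
# JMESPath expressions, so literal file names must be quoted as JMESPath
# raw string literals; the paths below are hypothetical:
#
#     refiner = ModRefiner(
#         data_dir="/opt/modrefiner/data",   # hypothetical install path
#         reference="'centroid.pdb'",        # JMESPath raw string literal
#         initial="'medoid.pdb'",
#         strength=50)
#     state = refiner.run({})
#     state["model"]   # -> "medoid.em.pdb": the backbone- and
#                      #    side-chain-refined structure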
    def run(self, data, config=None, pipeline=None):
        """Generate a PSSM from an MSA."""
        hhconsensus = ExternalTool({
            "input": "i",
            "seqfile": "s",
            "verbose": "v",
        }, long_prefix="-")
        hhfilter = ExternalTool({
            "input": "i",
            "oa3m": "o",
            "verbose": "v",
        }, long_prefix="-")
        reformat = ExternalTool({"no_lower": "r"}, long_prefix="-")
        blastpgp = ExternalTool({
            "database": "d",
            "input": "i",
            "iterations": "j",
            "db_seq_alns": "b",
            "evalue_threshold": "h",
            "input_alignment": "B",
            "checkpoint": "C",
            "ascii_pssm": "Q",
        })
        makemat = ExternalTool({"profile_db": "P"})

        tmpdir = tempfile.mkdtemp("-pssm", "phyreengine-")
        try:
            name, a3m = self.get_vals(data)
            query_seq = self.read_query_seq(a3m, name)

            tmp_a3m = Path(tmpdir, "msa.a3m")
            tmp_seq = Path(tmpdir, "seq.fasta")
            tmp_psi = Path(tmpdir, "msa.psi")

            env = os.environ.copy()
            env["HHLIB"] = self.HHLIB

            if "-" in query_seq:
                # Generate consensus sequence
                command_line = hhconsensus(
                    executable=(self.hhsuite_dir, "hhconsensus"),
                    options={
                        "input": a3m,
                        "seqfile": tmp_seq,
                        "oa3m": tmp_a3m,
                    })
                subprocess.run(command_line, check=True, env=env)
            else:
                # If there are no gaps, just copy the query a3m and write
                # the query sequence.
                shutil.copy2(a3m, str(tmp_a3m))
                with tmp_seq.open("w") as tmp_seq_out:
                    tmp_seq_out.write(">{name}\n{query}\n".format(
                        name=name, query=query_seq))

            # Filter query a3m to desired diversity
            command_line = hhfilter(
                executable=(self.hhsuite_dir, "hhfilter"),
                options={
                    "neff": 7,
                    "input": tmp_a3m,
                    "oa3m": tmp_a3m,
                })
            subprocess.run(command_line, check=True, env=env)

            # Reformat to PSI-BLAST format
            command_line = reformat(
                (str(Path(self.HHLIB, "scripts")), "reformat.pl"),
                positional=["a3m", "psi", tmp_a3m, tmp_psi],
                flags=["no_lower", "noss"])
            subprocess.run(command_line, check=True, env=env)

            # Generate PSSM using blastpgp
            chk_file = "profile.chk"
            mtx_file = "profile.mtx"
            pssm_file = "profile.pssm"
            dummy_db = Path(self.HHLIB, "data/do_not_delete")
            command_line = blastpgp(
                executable=(self.blast_dir, "blastpgp"),
                options={
                    "db_seq_alns": 1,
                    "iterations": 1,
                    "evalue_threshold": 0.001,
                    "database": dummy_db,
                    "input": tmp_seq,
                    "input_alignment": tmp_psi,
                    "checkpoint": chk_file,
                    "ascii_pssm": pssm_file,
                })
            subprocess.run(command_line, check=True)

            # Build mtx file using makemat.
            # First build the profile "databases" for use with makemat.
            # These are just two files with the same prefix and the suffixes
            # ".pn" and ".sn", which contain a list of checkpoint files and
            # the corresponding list of sequences. File names are resolved
            # relative to the directory containing the *.sn and *.pn files,
            # so we will symlink the checkpoint.
            tmp_sn_file = Path(tmpdir, "makemat.sn")
            tmp_pn_file = Path(tmpdir, "makemat.pn")
            with tmp_sn_file.open("w") as sn_out:
                print(str(Path(tmp_seq.name)), file=sn_out)
            with tmp_pn_file.open("w") as pn_out:
                chk_path = Path(chk_file)
                chk_link = Path(tmpdir, "makemat.chk")
                chk_link.symlink_to(chk_path.resolve())
                print(str(chk_link.name), file=pn_out)

            command_line = makemat(
                executable=(self.blast_dir, "makemat"),
                options={"profile_db": str(Path(tmpdir, "makemat"))})
            subprocess.run(command_line, check=True)
            shutil.copy2(str(Path(tmpdir, "makemat.mtx")), mtx_file)

            data["pssm"] = {
                "mtx": mtx_file,
                "chk": chk_file,
                "ascii": pssm_file,
            }
            return data
        finally:
            shutil.rmtree(tmpdir)
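# The makemat "profile database" assembled above is nothing more than two
# sibling files sharing a prefix; for a (hypothetical) temporary directory
# /tmp/phyreengine-xyz-pssm it would look like:
#
#     /tmp/phyreengine-xyz-pssm/makemat.pn   containing "makemat.chk"
#     /tmp/phyreengine-xyz-pssm/makemat.sn   containing "seq.fasta"
#
# makemat resolves both entries relative to the directory holding the
# *.pn/*.sn files (hence the symlinked checkpoint) and writes makemat.mtx
# next to them, which is then copied out as profile.mtx.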
class LoopModel(Component):
    """
    Use Alex Herbert's loop modeller to fill in as many gaps as possible.

    .. versionchanged:: 0.1a1

        This component no longer operates on every template from the top
        level of the pipeline state. If you wish to apply the loop modeller
        to a list of templates, call it from within a
        :py:class:`~phyre_engine.component.component.Map` component. You may
        use :py:class:`~phyre_engine.component.jmespath.Update` to copy the
        ``pssm`` and ``query_sequence`` keys from the top level of the
        pipeline state into each template.

    :param str bin_dir: Location of the loop modelling executable.
    :param str config: Loop modeller configuration file.
    :param str executable: Name of the executable to run, under `bin_dir`.
    """
    REQUIRED = ["pssm", "query_sequence", "model"]
    ADDS = []
    REMOVES = []

    LOOP_MODELLER = ExternalTool({
        "config": "c",
        "query": "f",
        "out_dir": "d",
        "model_list": "l",
    }, long_prefix="-")

    CONFIG_SECTION = "sbg_loop"

    def __init__(self, bin_dir, config, executable="assembler.loop"):
        self.bin_dir = bin_dir
        self.config = config
        self.executable = executable

    def convert_ascii_pssm(self, pssm, output):
        """
        Convert the PSSM generated by blastpgp into the loop modeller
        format. This means chopping off the header and the column with
        residue IDs.
        """
        with open(pssm, "r") as pssm_in:
            for line in pssm_in:
                if re.match(r"^\s+\d", line):
                    cols = line.split()
                    residue = cols[1]
                    counts = cols[2:22]
                    output.write(residue + " ")
                    for count in counts:
                        output.write("{:4d}".format(int(count)))
                    output.write("\n")
        output.flush()

    def run(self, data, config=None, pipeline=None):
        """Fill short gaps with the loop modeller."""
        pssm, sequence, model = self.get_vals(data)
        tmpdir = tempfile.mkdtemp("-loop", "phyreengine-")
        try:
            self.logger.debug("Loop modelling using tmpdir: %s", tmpdir)
            out_dir = Path(model).with_suffix(".loop")

            # Attempt to use existing models if the output directory exists.
            if not out_dir.exists():
                loop_pssm = Path(tmpdir, "loop.pssm")
                query_fasta = Path(tmpdir, "query.fasta")
                model_list = Path(tmpdir, "model.list")

                with model_list.open("w") as model_list_out:
                    print(str(Path(model).resolve()), file=model_list_out)
                with loop_pssm.open("w") as loop_out:
                    self.convert_ascii_pssm(pssm["ascii"], loop_out)
                with query_fasta.open("w") as query_out:
                    fasta_seq = ">model\n{}\n".format(sequence)
                    print(fasta_seq, file=query_out)

                command_line = self.LOOP_MODELLER(
                    executable=(self.bin_dir, self.executable),
                    options={
                        "config": self.config,
                        "pssm": loop_pssm,
                        "query": query_fasta,
                        "model_list": model_list,
                        "out_dir": out_dir,
                    })
                self.logger.debug("Running %s", command_line)
                subprocess.run(command_line, check=True)

            # Replace "model" field with the first loop model.
            model_path = out_dir / "model.1" / "model.1.pdb"
            if not model_path.exists():
                err_msg = "Loop-modelled file '{}' does not exist"
                raise FileNotFoundError(err_msg.format(model_path))
            data["model"] = str(model_path)
        finally:
            shutil.rmtree(tmpdir)
        return data
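# Sketch of the loop-modeller PSSM emitted by convert_ascii_pssm() above
# (hand-made numbers): the blastpgp header and residue-ID column are
# dropped, leaving the residue letter followed by the 20 log-odds columns,
# each right-aligned to four characters:
#
#     M   -2  -1  -3  -4 ...
#     K   -1   2   0  -1 ...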
class DSSP(Component):
    """
    Calculate secondary structure state using `DSSP
    <http://swift.cmbi.ru.nl/gv/dssp/>`_.

    This component requires the ``structure`` field to be set, otherwise it
    has no source of tertiary structure from which to calculate the
    secondary structure.

    :param str bin_dir: Directory containing the ``mkdssp`` executable, if
        it is not in the system ``$PATH``.
    """
    CONFIG_SECTION = "dssp"

    REQUIRED = ["structure"]
    ADDS = [SECONDARY_STRUCTURE_KEY]
    REMOVES = []

    TOOL_NAME = "dssp"
    MKDSSP = ExternalTool()

    def __init__(self, bin_dir=None):
        self.bin_dir = bin_dir

    def run(self, data, config=None, pipeline=None):
        """Calculate ``secondary_structure`` key."""
        structure = self.get_vals(data)

        # Create secondary_structure key if it is not present
        if SECONDARY_STRUCTURE_KEY not in data:
            data[SECONDARY_STRUCTURE_KEY] = {}

        # Run DSSP on the structure file and read the output
        mkdssp_cmd_line = self.MKDSSP(
            (self.bin_dir, "mkdssp"),
            options={"input": structure})
        dssp_proc = subprocess.run(
            mkdssp_cmd_line, universal_newlines=True, check=True,
            stdout=subprocess.PIPE)
        dssp_mapping = self.parse_dssp(dssp_proc.stdout.split("\n"))
        data[SECONDARY_STRUCTURE_KEY][self.TOOL_NAME] = dssp_mapping
        return data

    @staticmethod
    def parse_dssp(dssp_lines):
        """
        Parse lines of output from DSSP.

        :return: List of dicts containing the residue ID, assigned
            secondary structure state and per-state confidence.
        :rtype: list[dict]
        """
        residue_section = False
        dssp_mapping = []
        for line in dssp_lines:
            if line.startswith("  # RESIDUE AA"):
                residue_section = True
            elif residue_section and len(line) > 17:
                res_id = line[5:10]
                aa_type = line[13]
                sec_struc = line[16]

                # Ignore missing residues.
                if aa_type == '!':
                    continue
                # Use "C" for coils.
                if sec_struc == ' ':
                    sec_struc = 'C'

                residue_ss = {
                    "assigned": sec_struc,
                    "confidence": {},
                    "res_id": int(res_id),
                }
                for state in EightStateSS:
                    confidence = 1.0 if state.value == sec_struc else 0.0
                    residue_ss["confidence"][state.value] = confidence
                dssp_mapping.append(residue_ss)
        return dssp_mapping
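# A hand-made sketch of the fixed-column layout parse_dssp() relies on
# (genuine mkdssp output has many more columns): characters 5-9 hold the
# residue number, character 13 the amino acid and character 16 the
# secondary structure state.
#
#     lines = [
#         "  # RESIDUE AA STRUCTURE",
#         "    1    1 A M  H  > S+",   # helix
#         "    2    2 A K     0   0",  # blank state, reported as coil "C"
#     ]
#     DSSP.parse_dssp(lines)
#     # -> [{"res_id": 1, "assigned": "H", ...},
#     #     {"res_id": 2, "assigned": "C", ...}]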
class Scwrl4(Component):
    """
    Run `SCWRL4 <http://dunbrack.fccc.edu/scwrl4/>`_ to reconstruct the
    side-chains of a model.

    :param str bin_dir: Directory containing the ``Scwrl4`` executable.
    :param bool overwrite: If `True`, always run ``Scwrl4``; otherwise, an
        existing output file will be used as-is.
    """
    ADDS = []
    REMOVES = []
    REQUIRED = ["model"]

    SCWRL4 = ExternalTool(flag_map={
        "input": "i",
        "output": "o",
        "sequence": "s",
        "parameters": "p",
        "frame": "f",
        "graph": "g",
        "workspace": "w",
        "symmetry": "%",
        "crystal": "#",
        "disable_subrotamers": "v",
        "omit_hydrogens": "h",
        "disable_terminal_capping": "t",
    })

    CONFIG_SECTION = "scwrl4"

    def __init__(self, bin_dir=None, overwrite=False):
        self.bin_dir = bin_dir
        self.overwrite = overwrite

    def run(self, data, config=None, pipeline=None):
        """Run SCWRL4 to reconstruct side-chains."""
        model = self.get_vals(data)
        outfile = Path(model).with_suffix(".scwrl4.pdb")
        if not outfile.exists() or self.overwrite:
            command_line = self.SCWRL4(
                (self.bin_dir, "Scwrl4"),
                options={"input": model, "output": outfile})
            program = subprocess.run(
                command_line, check=True, universal_newlines=True,
                stdout=subprocess.PIPE, stderr=subprocess.PIPE)

            # SCWRL4 doesn't always set a sensible exit status, so if we see
            # "^Err$" on standard output, treat it as an error.
            if re.search("^Err$", program.stdout, re.MULTILINE):
                self.logger.error(
                    ("Error running SCWRL4. Command line: %s\n"
                     "Standard output: %s\n"
                     "Standard error: %s\n"),
                    command_line, program.stdout, program.stderr)
                raise subprocess.CalledProcessError(
                    program.returncode, command_line,
                    program.stdout, program.stderr)
        data["model"] = str(outfile)
        return data
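# Illustrative sketch (comment only): with Scwrl4 on the system path and a
# pipeline state pointing at a (hypothetical) "model.pdb", run() writes the
# repacked structure next to the input and repoints the "model" field:
#
#     state = Scwrl4().run({"model": "model.pdb"})
#     state["model"]   # -> "model.scwrl4.pdb"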
class MobiDBLite(Component):
    """
    `MobiDB lite <http://protein.bio.unipd.it/mobidblite/>`_ is a
    meta-predictor of disorder. It combines nine fast predictors to quickly
    produce a consensus prediction.

    :param str bin_dir: Directory containing the ``mobidb-lite.py`` script.

    :param str supporting_bin_dir: Root directory of the binaries used by
        MobiDB lite (i.e. the directory passed via the ``--binDirectory``
        option).
    """
    REQUIRED = ["sequence"]
    ADDS = [DISORDER_KEY]
    REMOVES = []

    TOOL_NAME = "mobidb-lite"
    EXECUTABLE_NAME = "mobidb-lite.py"

    MOBIDB_TOOL = ExternalTool()
    MOBIDB_DEFAULT_ARGS = {"threads": 1}
    MOBIDB_DEFAULT_FLAGS = {"longOutput"}

    def __init__(self, bin_dir, supporting_bin_dir):
        self.bin_dir = bin_dir
        self.supporting_bin_dir = supporting_bin_dir

    @staticmethod
    def parse_results(mobidb_output):
        """Parse long MobiDB lite result string."""
        mdb_results = json.loads(mobidb_output)
        disorder = []
        for state, prob in zip(mdb_results["consensus"], mdb_results["p"]):
            disorder.append({
                "assigned": state,
                "confidence": {
                    DisorderStates.DISORDERED.value: prob,
                    DisorderStates.STRUCTURED.value: 1 - prob,
                }
            })
        return disorder

    def run(self, data, config=None, pipeline=None):
        """Run MobiDB lite to predict disorder."""
        sequence = self.get_vals(data)

        # Add disorder key if it's not present
        if DISORDER_KEY not in data:
            data[DISORDER_KEY] = {}

        # Write sequence to temp file and run mobidb-lite
        with tempfile.NamedTemporaryFile("w") as seq_file:
            print(">query", file=seq_file)
            print(sequence, file=seq_file)
            seq_file.flush()

            mobidb_options = self.MOBIDB_DEFAULT_ARGS.copy()
            mobidb_options["binDirectory"] = self.supporting_bin_dir
            command_line = self.MOBIDB_TOOL(
                (self.bin_dir, self.EXECUTABLE_NAME),
                flags=self.MOBIDB_DEFAULT_FLAGS,
                options=mobidb_options,
                positional=[seq_file.name])
            self.logger.info("Running '%s'", command_line)
            mobidb_proc = subprocess.run(
                command_line, stdout=subprocess.PIPE, check=True,
                universal_newlines=True)

        # MobiDB will rudely return no output when no disordered regions are
        # found. In those cases, we just don't add the mobidb-lite key to
        # the disorder predictions.
        if mobidb_proc.stdout.strip():
            disorder = self.parse_results(mobidb_proc.stdout)
            data[DISORDER_KEY][self.TOOL_NAME] = disorder
        return data
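# Sketch of the --longOutput JSON consumed by parse_results() (hand-made
# values; only the fields used here are shown). "consensus" is a
# per-residue state string using the DisorderStates letters ("S"/"D") and
# "p" holds the matching disorder probabilities:
#
#     {"consensus": "SSDD", "p": [0.05, 0.10, 0.80, 0.95]}
#
# parse_results() zips the two into per-residue dicts, e.g. the last
# residue becomes:
#
#     {"assigned": "D", "confidence": {"D": 0.95, "S": 0.05}}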
class Disopred(Component):
    """
    Run `DISOPRED <http://bioinf.cs.ucl.ac.uk/psipred/?disopred=1>`_ to
    predict disordered regions of protein structure.

    This component takes a shortcut compared to the stock
    ``run_disopred.pl`` script supplied with DISOPRED: it will use an
    existing ``mtx`` file generated, for example, by
    :py:class:`phyre_engine.component.hhsuite.PSSM`. This component also
    does not predict binding sites using ProtBind.

    :param str data_dir: Directory containing DISOPRED data files.
    :param str dso_lib_dir: Directory containing DISOPRED library files.
    :param str bin_dir: Directory containing the DISOPRED executables.

    :param bool overwrite: If `True`, always overwrite existing DISOPRED
        results with a new run. Otherwise, existing results will be used
        as-is.
    """
    REQUIRED = ["pssm"]
    ADDS = ["disorder"]
    REMOVES = []

    CONFIG_SECTION = "disopred"

    #: Adjustable DISOPRED2 false positive rate, from 1 to 10.
    DISOPRED2_FPR = 5

    DISOPRED2 = ExternalTool()
    DISO_NEU_NET = ExternalTool()
    DISO_NEIGHB = ExternalTool()
    COMBINE = ExternalTool()

    def __init__(self, data_dir, dso_lib_dir, bin_dir=None, overwrite=False):
        self.data_dir = data_dir
        self.dso_lib_dir = dso_lib_dir
        self.bin_dir = bin_dir
        self.overwrite = overwrite

    @staticmethod
    def parse_results(diso_in):
        """
        Parse disopred output file into the format described in
        :py:mod:`.disorder`.

        The disopred format looks like this:

        .. code-block:: none

            # ----- DISOPRED version 3.1 -----
            # Disordered residues are marked with asterisks (*)
            # Ordered residues are marked with dots (.)
                1 M * 0.78
                2 K * 0.62
                3 T . 0.45
                4 A . 0.37
                5 Y . 0.20

        We parse this into the following list:

        .. code-block:: python

            [
                {"assigned": "D", "confidence": {"S": 0.22, "D": 0.78}},
                {"assigned": "D", "confidence": {"S": 0.38, "D": 0.62}},
                {"assigned": "S", "confidence": {"S": 0.55, "D": 0.45}},
                {"assigned": "S", "confidence": {"S": 0.63, "D": 0.37}},
                {"assigned": "S", "confidence": {"S": 0.80, "D": 0.20}},
            ]

        :param file diso_in: File handle pointing to DISOPRED output.
        """
        disorder = []
        for line in diso_in:
            line = line.strip()
            if line.startswith("#"):
                continue
            _index, _aa, state, score = line.split()
            if state == "*":
                state = DisorderStates.DISORDERED
            else:
                state = DisorderStates.STRUCTURED
            score = float(score)
            disorder.append({
                "assigned": state.value,
                "confidence": {
                    DisorderStates.DISORDERED.value: score,
                    DisorderStates.STRUCTURED.value: 1 - score,
                }
            })
        return disorder

    def _run_disopred(self, mtx_file, disopred_results):
        """Run each tool in the disopred pipeline."""
        with tempfile.TemporaryDirectory() as tmpdir:
            tmpdir = pathlib.Path(tmpdir)

            # From run_disopred.pl:
            #   $args = join ' ', "$EXE_DIR/disopred2",
            #       join('/', $out_dir, $base), $mtx_fn, "$DATA_DIR/",
            #       $DISO2_FPR, "\n";
            disopred2_results = tmpdir / "disopred2"
            self.logger.info("Predicting disorder with DISOPRED2.")
            disopred2_cmd = self.DISOPRED2(
                (self.bin_dir, "disopred2"),
                positional=(disopred2_results, mtx_file,
                            self.data_dir + "/", self.DISOPRED2_FPR))
            self.logger.debug("Running %s", disopred2_cmd)
            subprocess.run(disopred2_cmd, check=True)

            # From run_disopred.pl:
            #   $args = join ' ', "$EXE_DIR/diso_neu_net",
            #       "$DATA_DIR/weights.dat.nmr_nonpdb", $mtx_fn,
            #       ">", $nndiso_fn, "\n";
            diso_neu_net_results = tmpdir / "diso_neu_net"
            with diso_neu_net_results.open("wb") as neu_net_out:
                self.logger.info("Running neural network classifier.")
                diso_neu_net_cmd = self.DISO_NEU_NET(
                    (self.bin_dir, "diso_neu_net"),
                    positional=(pathlib.Path(self.data_dir,
                                             "weights.dat.nmr_nonpdb"),
                                mtx_file))
                self.logger.debug("Running %s > %s", diso_neu_net_cmd,
                                  diso_neu_net_results)
                subprocess.run(diso_neu_net_cmd, stdout=neu_net_out,
                               check=True)

            # From run_disopred.pl:
            #   $args = join ' ', "$EXE_DIR/diso_neighb", $mtx_fn,
            #       "$DATA_DIR/dso.lst", ">", $dnb_fn, "\n";
            diso_neighb_results = tmpdir / "diso_neighb"
            with diso_neighb_results.open("wb") as neighb_out:
                self.logger.info("Running nearest neighbour classifier.")
                diso_neighb_cmd = self.DISO_NEIGHB(
                    (self.bin_dir, "diso_neighb"),
                    positional=(mtx_file,
                                pathlib.Path(self.data_dir, "dso.lst")))
                self.logger.debug("Running %s > %s", diso_neighb_cmd,
                                  diso_neighb_results)
                environment = dict(os.environ)
                environment["DSO_LIB_PATH"] = str(self.dso_lib_dir) + "/"
                subprocess.run(diso_neighb_cmd, stdout=neighb_out,
                               check=True, env=environment)

            # From run_disopred.pl:
            #   $args = join ' ', "$EXE_DIR/combine",
            #       "$DATA_DIR/weights_comb.dat", $diso2_fn, $nndiso_fn,
            #       $dnb_fn, ">", $diso3_fn, "\n";
            with disopred_results.open("wb") as diso_out:
                self.logger.info("Combining disordered residue predictions.")
                combine_cmd = self.COMBINE(
                    (self.bin_dir, "combine"),
                    positional=(
                        pathlib.Path(self.data_dir, "weights_comb.dat"),
                        # Add ".diso" suffix to disopred2 output
                        str(disopred2_results) + ".diso",
                        diso_neu_net_results,
                        diso_neighb_results))
                self.logger.debug("Running %s > %s", combine_cmd,
                                  disopred_results)
                subprocess.run(combine_cmd, stdout=diso_out, check=True)

    def run(self, data, config=None, pipeline=None):
        """Run DISOPRED to predict disorder."""
        pssms = self.get_vals(data)
        mtx_file = pssms["mtx"]
        disopred_results = pathlib.Path("disorder.diso")
        if not disopred_results.exists() or self.overwrite:
            self._run_disopred(mtx_file, disopred_results)
        with disopred_results.open("r") as diso_in:
            disorder = self.parse_results(diso_in)
        if "disorder" not in data:
            data["disorder"] = {}
        data["disorder"]["disopred"] = disorder
        return data
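# Illustrative sketch (comment only; install paths hypothetical):
# predicting disorder from an mtx file produced earlier by a PSSM
# component:
#
#     disopred = Disopred(
#         data_dir="/opt/disopred/data",
#         dso_lib_dir="/opt/disopred/dso_lib",
#         bin_dir="/opt/disopred/bin")
#     state = disopred.run({"pssm": {"mtx": "profile.mtx"}})
#     state["disorder"]["disopred"][0]
#     # -> {"assigned": "D", "confidence": {"D": 0.78, "S": 0.22}}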