def test_getseqs(self):
    """Check the number of designs returned by ``get_sequence_with``."""
    sc_des = {"sequence": "B"}

    # Start test
    df = ri.parse_rosetta_file(self.silent1, sc_des)
    assert df.shape[0] == 6
    # This comparison used to be a bare expression statement, so its result
    # was discarded and nothing was verified; it is now a real assertion.
    assert df.get_sequence_with('B', [(1, 'T')]).shape[0] == 3
def test_split_values(self):
    """Check row/column counts and column names produced by ``ru.split_values``."""
    # Start test
    df = ri.parse_rosetta_file(self.silent1)

    # Split description with one category column next to the value column.
    one_category = {
        'split': [
            ('GRMSD2Target', 'grmsdTr'),
            ('GRMSD2Template', 'grmsdTp'),
            ('LRMSD2Target', 'lrmsdTp'),
            ('LRMSDH2Target', 'lrmsdh2'),
            ('LRMSDLH2Target', 'lrmsdlh2'),
        ],
        'names': ['rmsd', 'rmsd_type'],
    }
    dfs1 = ru.split_values(df, one_category)

    # Split description with two category columns next to the value column.
    two_categories = {
        'split': [
            ('GRMSD2Target', 'global', 'target'),
            ('GRMSD2Template', 'global', 'template'),
            ('LRMSD2Target', 'local', 'target'),
            ('LRMSDH2Target', 'local', 'helix2'),
            ('LRMSDLH2Target', 'local', 'lhelix2'),
        ],
        'names': ['rmsd', 'rmsd_type', 'rmsd_target'],
    }
    dfs2 = ru.split_values(df, two_categories)

    # Five source columns are split, so the row count is multiplied by 5.
    assert df.shape[0] == 6
    assert dfs1.shape[0] == 6 * 5
    assert dfs1.shape[0] == dfs2.shape[0]
    # The second split declares one extra name, hence one extra column.
    assert dfs1.shape[1] == dfs2.shape[1] - 1

    for shared in ('rmsd', 'rmsd_type'):
        assert shared in dfs1.columns
        assert shared in dfs2.columns
    assert 'rmsd_target' not in dfs1.columns
    assert 'rmsd_target' in dfs2.columns
def test_select_scores(self):
    """Select only some scores of interest."""
    # pick some: the loaded columns must match the requested list, in order
    wanted = ["score", "rama", "omega", "packstat", "rmsd_drift"]
    sc_des = {"scores": wanted}
    df = ri.parse_rosetta_file(self.silent1, sc_des)
    loaded = list(df.columns.values)
    assert loaded == sc_des["scores"]
    assert df["packstat"].mean() == pytest.approx(0.59, 0.02)

    # pick all: the wildcard must yield the default header
    df = ri.parse_rosetta_file(self.silent1, {"scores": "*"})
    loaded = list(df.columns.values)
    assert loaded == self.defhead
def test_read_labels(self):
    """Check how labels are read and loaded."""
    sc_des = {"labels": ["MOTIF", "CONTACT", "CONTEXT"]}
    # Expected string form of each label for the first decoy.
    motif_repr = "A:#(0),B:#(22)"
    contact_repr = "A:#(19),B:#(58)"
    context_repr = "A:#(157),B:#(0)"

    df = ri.parse_rosetta_file(self.silent1, sc_des)
    expected_head = self.defhead + ["lbl_MOTIF", "lbl_CONTACT", "lbl_CONTEXT"]
    assert list(df.columns.values) == expected_head

    # Check the label values and types
    first = df.iloc[0]
    assert str(first["lbl_MOTIF"]) == motif_repr
    assert str(first["lbl_CONTACT"]) == contact_repr
    assert str(first["lbl_CONTEXT"]) == context_repr
    assert isinstance(first["lbl_MOTIF"], rc.SelectionContainer)

    # Check the internal values
    contact = first["lbl_CONTACT"]
    assert contact["B"] == "9-26,28-29,31-32,35,37-40,67-68,70-71,89,91-116"
    assert contact["A"] == "1-19"

    # Check that data is not repeated when it should not be
    # NOTE(review): ``cmp`` is not a Python 3 builtin; presumably it is
    # provided elsewhere in this module -- confirm.
    second = df.iloc[1]
    assert cmp(first["lbl_MOTIF"], second["lbl_MOTIF"]) == 0
    assert cmp(first["lbl_CONTACT"], second["lbl_CONTACT"]) != 0
    assert cmp(first["lbl_CONTEXT"], second["lbl_CONTEXT"]) == 0
def test_structure_similarities(self):
    """Check the structure-based analysis functions against a reference."""
    sse_ref = "LEEEEEEELLLEEEEEEELLLLHHHHHHHHHHHHLLLLLLLLLLLEEEELLLEEEELL"
    diff1 = "LEEEEEEELLEEEEEEEELLLLHHHHHHHHHHHHLLLLLLLLLLEEEEELLLEEEEEL"
    sc_des = {"scores": ["score"], "structure": "C"}
    n_positions = len(sse_ref)

    # Start test
    df = ri.parse_rosetta_file(self.silent3, sc_des)
    df.add_reference_structure("C", sse_ref)

    # secondary structure distribution
    dfsse = ra.positional_structural_count(df, 'C')
    assert set(dfsse.columns.values) == {'H', 'E', 'L'}
    assert dfsse.shape[0] == n_positions
    assert dfsse.H.mean() == pytest.approx(0.2033, rel=1e-3)
    assert dfsse.E.mean() == pytest.approx(0.4038, rel=1e-3)
    assert dfsse.L.mean() == pytest.approx(0.3927, rel=1e-3)

    # secondary structure match
    dfsm = ra.positional_structural_identity(df, 'C')
    assert set(dfsm.columns.values) == {'identity_perc', 'sse', 'max_sse'}
    assert dfsm.shape[0] == n_positions
    assert "".join(list(dfsm.sse.values)) == sse_ref
    assert "".join(list(dfsm.max_sse.values)) == diff1
    assert dfsm.identity_perc.mean() == pytest.approx(0.8121, rel=1e-3)

    # percentages
    dfpc = ra.secondary_structure_percentage(df, 'C')
    for column in ('structure_C_H', 'structure_C_E', 'structure_C_L'):
        assert column in dfpc.columns
    assert dfpc['structure_C_H'].max() == pytest.approx(0.2413, rel=1e-3)
    assert dfpc['structure_C_E'].mean() == pytest.approx(0.4038, rel=1e-3)
    assert dfpc['structure_C_L'].min() == pytest.approx(0.3275, rel=1e-3)
def test_sequence_data(self):
    """Load data from sequence, structure or psipred."""
    # Requesting chain "A" from this file is expected to raise.
    bad_des = {"sequence": "A", "structure": "A"}
    with pytest.raises(ValueError):
        df = ri.parse_rosetta_file(self.silent2, bad_des)

    # Sequence and structure of chain "C" on top of the scores.
    df = ri.parse_rosetta_file(self.silent2,
                               {"sequence": "C", "structure": "C"})
    assert len(df.columns.values) == 24
    assert len(df["sequence_C"]) == len(df["structure_C"])

    # Ignoring every score leaves only the two requested columns.
    only_seq = {"scores_ignore": "*", "sequence": "C", "structure": "C"}
    df = ri.parse_rosetta_file(self.silent2, only_seq)
    assert len(df.columns.values) == 2
    assert len(df["sequence_C"]) == len(df["structure_C"])
def test_logo_plot_noref(self):
    """Draw a sequence logo for chain "B" without a reference sequence.

    Returns the generated figure.
    """
    # Start test
    df = ri.parse_rosetta_file(self.silent1, {"sequence": "B"})
    logo = rp.logo_plot(df, "B", refseq=False, line_break=50)
    fig, _ = logo
    plt.tight_layout()
    return fig
def test_naming(self):
    """Generate new data columns from the design's description."""
    # Empty entries in "naming" do not become columns: three names are given,
    # and three extra columns are expected.
    naming = ["", "source", "", "status", "dcount"]
    df = ri.parse_rosetta_file(self.silent1, {"naming": naming})

    n_columns = len(df.columns.values)
    assert n_columns == len(self.defhead) + 3
    first = df.iloc[0]
    assert first["status"] == "labeled"
    assert df["dcount"].mean() == 3.5
def test_sse_logo(self):
    """Draw a logo of secondary-structure content with a custom color map.

    Returns the generated figure.
    """
    # One color per secondary structure letter.
    custom = {'E': '#0000FF', 'H': '#00FF00', 'L': '#FF0000'}

    silent = os.path.join(self.dirpath, 'input_3ssepred.minisilent.gz')
    df = ri.parse_rosetta_file(silent, {'structure': 'A'})
    bits = df.structure_bits('A')

    fig, _ = rp.logo_plot(bits, "A", refseq=False, line_break=50,
                          font_size=10, hight_prop=2, colors=custom)
    return fig
def test_read_default(self):
    """What do we pick when nothing is defined."""
    df = ri.parse_rosetta_file(self.silent1)

    # Without a description the default header is loaded for all 6 decoys.
    header = list(df.columns.values)
    assert header == self.defhead
    assert list(df.shape) == [6, len(self.defhead)]

    score_mean = df["score"].mean()
    packstat_mean = df["packstat"].mean()
    assert score_mean == pytest.approx(-207.9, 0.2)
    assert packstat_mean == pytest.approx(0.59, 0.02)
def test_symmetry(self):
    """Check on sequence capture with symmetry silent files."""
    df = ri.parse_rosetta_file(self.silent3, {"sequence": "AB"})

    expected = set(self.symhead + ["sequence_A", "sequence_B"])
    assert set(df.columns.values) == expected

    # Both chains of the first decoy must carry the same sequence.
    first = df.iloc[0]
    assert first["sequence_A"] == first["sequence_B"]
def test_plot_dssp_vs_psipred(self):
    """Plot structure versus psipred data for the first decoy, chain "A".

    Returns the generated figure.
    """
    # Start test
    sa_des = {"scores": ["score"], "psipred": "*", "structure": "*"}
    df = ri.parse_rosetta_file(self.silent4, sa_des)
    first = df.iloc[0]

    fig = plt.figure(figsize=(15, 10))
    axis = plt.gca()
    rp.plot_dssp_vs_psipred(first, "A", axis)
    plt.tight_layout()
    return fig
def test_global_preview(self):
    """Plot the distribution of eight scores on a 2x4 grid.

    Returns the generated figure.
    """
    df = ri.parse_rosetta_file(self.silent1)
    values = ["score", "hbond_sr_bb", "B_ni_rmsd", "hbond_bb_sc",
              "cav_vol", "design_score", "packstat", "rmsd_drift"]
    grid = (2, 4)

    fig = plt.figure(figsize=(25, 10))
    rp.multiple_distributions(df, fig, grid, values)
    plt.tight_layout()
    return fig
def test_ramachandran_plot(self):
    """Draw a Ramachandran plot for the first decoy, chain "A".

    Passing the whole frame instead of a single decoy is expected to raise
    ``ValueError``. Returns the figure drawn for the single decoy.
    """
    # Start test
    sa_des = {"scores": ["score"], "sequence": "*", "dihedrals": "*"}
    df = ri.parse_rosetta_file(self.silent4, sa_des)

    fig = plt.figure(figsize=(15, 10))
    rejected_fig = plt.figure(figsize=(15, 10))
    with pytest.raises(ValueError):
        rp.plot_ramachandran(df, "A", rejected_fig)

    rp.plot_ramachandran(df.iloc[0], "A", fig)
    plt.tight_layout()
    return fig
def test_sequence_similarities(self):
    """Check global, positional and binary sequence similarity to a reference."""
    refseq = ("GSISDIRKDAEVRMDKAVEAFKNKLDKFKAAVRKVFPTEERIDMRPEIWIAQELRRIGDE"
              "FNAYRDANDKAAALGKDKEINWFDISQSLWDVQKLTDAAIKKIEAALADMEAWLTQ")
    # Expected alignment string of the first decoy.
    diff1 = ("....+.R+.A....+.A+.....+.++.....++.....E..DM.PE..IA..LR.IG+."
             "FNA......+.....K+.......+.+...+..K+...........+........+")
    # Expected binary identity string of the first decoy.
    diff2 = ("000000100100000010000000000000000000000100110110011001101100"
             "11100000000000010000000000000000010000000000000000000000")
    # Expected binary overlap over all decoys.
    diff3 = ("000000100110110110100000000000001100111100110110011101101100"
             "11110010011001010010010000000000011000101011010001100000")
    sc_des = {"scores": ["score"], "sequence": "B"}
    new_cols = [
        "blosum62_B_raw", "blosum62_B_perc", "blosum62_B_identity",
        "blosum62_B_positive", "blosum62_B_negative", "blosum62_B_ali",
        "blosum62_B_per_res",
    ]

    # Start test
    df = ri.parse_rosetta_file(self.silent1, sc_des)
    df.add_reference_sequence("B", refseq)

    # global sequence similarity
    dfss = ra.sequence_similarity(df, "B")
    added = set(dfss.columns).difference(set(df.columns))
    assert len(dfss.columns) == len(df.columns) + 7
    assert len(added) == len(new_cols)
    assert df.shape[0] == dfss.shape[0]
    assert dfss.blosum62_B_raw.mean() == 41.0
    assert dfss.blosum62_B_perc.mean() == pytest.approx(0.0692, rel=1e-3)
    assert dfss.blosum62_B_identity.mean() == pytest.approx(24.333, rel=1e-3)
    assert dfss.blosum62_B_positive.mean() == pytest.approx(46.166, rel=1e-3)
    assert dfss.blosum62_B_negative.mean() == pytest.approx(69.833, rel=1e-3)
    assert dfss.blosum62_B_ali.values[0] == diff1

    # local sequence similarity
    dfps = ra.positional_sequence_similarity(df, "B")
    assert dfps.shape == (len(refseq), 2)
    assert list(dfps.index.values) == list(range(1, len(refseq) + 1))
    assert dfps.identity_perc.mean() < dfps.positive_perc.mean()
    assert dfps.identity_perc.mean() == pytest.approx(0.2097, rel=1e-3)
    assert dfps.positive_perc.mean() == pytest.approx(0.3979, rel=1e-3)

    # binary similarity
    df01 = ra.binary_similarity(df, "B")
    assert len(df01.columns) == len(df.columns) + 1
    assert df01.identity_B_binary.values[0] == diff2

    # binary overlap
    overlap = "".join([str(_) for _ in ra.binary_overlap(df01, "B")])
    assert overlap == diff3
def test_summary_plot(self):
    """Plot the distribution of nine scores on a 3x3 grid.

    Returns the generated figure.
    """
    # Start test
    df = ri.parse_rosetta_file(self.silent1)
    scores = [
        'score', 'GRMSD2Target', 'GRMSD2Template',
        'LRMSD2Target', 'LRMSDH2Target', 'LRMSDLH2Target',
        'design_score', 'packstat', 'rmsd_drift',
    ]

    fig = plt.figure(figsize=(30, 30))
    rp.multiple_distributions(df, fig, (3, 3), scores)
    plt.tight_layout()
    return fig
def test_logo_plot(self):
    """Draw a sequence logo for chain "B" including its reference sequence.

    Returns the generated figure.
    """
    refseq = ("GSISDIRKDAEVRMDKAVEAFKNKLDKFKAAVRKVFPTEERIDMRPEIWIAQELRRIGDE"
              "FNAYRDANDKAAALGKDKEINWFDISQSLWDVQKLTDAAIKKIEAALADMEAWLTQ")

    # Start test
    df = ri.parse_rosetta_file(self.silent1, {"sequence": "B"})
    df.add_reference_sequence("B", refseq)

    logo = rp.logo_plot(df, "B", refseq=True, line_break=50)
    fig, _ = logo
    plt.tight_layout()
    return fig
def test_scores_by_residue(self):
    """Pick score_by_residue data."""
    # when non-requested, do not pick per-residue data
    df = ri.parse_rosetta_file(self.silent3)
    assert set(df.columns.values) == set(self.symhead)

    # one can request individual positions, though
    df = ri.parse_rosetta_file(self.silent3, {"scores": ["residue_ddg_66"]})
    assert len(df.columns.values) == 1
    first = df.iloc[0]
    assert first["residue_ddg_66"] == pytest.approx(-3.47, 0.02)

    # get all per-residue values
    per_residue = {"scores": "-", "scores_by_residue": ["residue_ddg_"]}
    df = ri.parse_rosetta_file(self.silent3, per_residue)
    assert len(df.columns.values) == 1

    # request wrong per-residue value
    wrong = {"scores_by_residue": ["residue_score_"]}
    with pytest.raises(AttributeError):
        df = ri.parse_rosetta_file(self.silent3, wrong)
def test_rename_scores(self):
    """Rename scores into something different."""
    description = os.path.join(self.dirpath, 'description_rename.json')

    # Expected header: default header with two names swapped in place.
    renamed = self.defhead[:]
    renamed[renamed.index("packstat")] = "inscore"
    renamed[renamed.index("rama")] = "dingong"

    df = ri.parse_rosetta_file(self.silent1, description)
    assert len(df.columns.values) == len(self.defhead)
    assert list(df.columns.values) == renamed

    # The old name is gone; the data lives under the new name.
    with pytest.raises(KeyError):
        assert df["packstat"].mean() == pytest.approx(0.59, 0.02)
    assert df["inscore"].mean() == pytest.approx(0.59, 0.02)
def test_ignore_scores(self):
    """Use the ignore_scores parameter."""
    # ignore point columns
    expected = self.defhead[:]
    sc_des = {"scores_ignore": ["dslf_fa13", "rama", "omega", "fa_dun"]}
    # The description actually passed to the parser is this file.
    fl_des = os.path.join(self.dirpath, 'description_ignore.yaml')
    for ignored in sc_des["scores_ignore"]:
        expected.remove(ignored)
    df = ri.parse_rosetta_file(self.silent1, fl_des)
    assert list(df.columns.values) == expected
    assert df["packstat"].mean() == pytest.approx(0.59, 0.02)
    with pytest.raises(KeyError):
        df["dslf_fa13"]

    # ignore by wildcard
    expected = [name for name in self.defhead if not name.startswith("fa_")]
    df = ri.parse_rosetta_file(self.silent1, {"scores_ignore": ["fa_*"]})
    assert list(df.columns.values) == expected
    with pytest.raises(KeyError):
        df["fa_dun"]
def main(options):
    """Write alignment files and plots for a set of sequences.

    Sequences are read from ``options.ifasta`` when given; otherwise from
    ``options.ifile`` / ``options.ifiles`` through ``parse_rosetta_file``.
    When ``options.ffile`` is given, its sequence "A" is used as reference
    and the mutant alignment file and the alignment image are also written.

    :param options: Object carrying the attributes ``ifasta``, ``ifile``,
        ``ifiles``, ``ffile``, ``seqID``, ``ofile``, ``iformat`` and
        ``ifont`` read below.

    :return: Tuple of the logo figure and the alignment figure; the latter
        is :data:`None` when no reference file was given.
    """
    seq_id = options.seqID
    with_reference = options.ffile is not None

    # Load the sequences
    if options.ifasta is not None:
        df = read_fasta(options.ifasta)
    else:
        infile = options.ifiles if options.ifile is None else options.ifile
        defs = {"sequence": seq_id}
        df = parse_rosetta_file(infile, defs, multi=options.ifiles is not None)

    if with_reference:
        refseq = read_fasta(options.ffile).get_sequence("A").values[0]
        df.add_reference_sequence(seq_id, refseq)

    # Alignment file
    write_clustalw(df, seq_id, options.ofile + ".clw")

    # Mutation list file
    if with_reference:
        write_mutant_alignments(df, seq_id, options.ofile + "_mutants.clw")

    # Logo Plot
    logof = options.ofile + "_logo" + "." + options.iformat
    lfig, _ = logo_plot(df, seq_id, refseq=with_reference,
                        line_break=50, font_size=int(options.ifont))
    plt.tight_layout()
    plt.savefig(logof)

    # Alignment plot
    if not with_reference:
        return lfig, None

    alimgf = options.ofile + "_ali" + "." + options.iformat
    # One axis per chunk of 50 residues, matching ``line_break`` below.
    seq_length = len(df.get_sequence(seq_id).values[0])
    chunks = int(math.ceil(float(seq_length) / 50))
    high_correct = math.ceil(df.shape[0] / 7.0)
    afig = plt.figure(figsize=(chunks * high_correct * 10, 10))
    grid = (chunks, 1)
    ax = [plt.subplot2grid(grid, (i, 0), fig=afig) for i in range(chunks)]
    plot_alignment(df, seq_id, ax, line_break=50, matrix=None)
    plt.savefig(alimgf)
    return lfig, afig
def test_sse_profile_plot(self):
    """Plot positional structural similarity against a reference structure.

    Returns the generated figure.
    """
    sse_ref = "LEEEEEEELLLEEEEEEELLLLHHHHHHHHHHHHLLLLLLLLLLLEEEELLLEEEELL"
    sc_des = {"scores": ["score"], "structure": "C"}

    # Start test
    df = ri.parse_rosetta_file(self.silent3, sc_des)
    df.add_reference_structure("C", sse_ref)

    counts = ra.positional_structural_count(df, 'C')
    identity = ra.positional_structural_identity(df, 'C')
    # Both tables are placed side by side before plotting.
    merged = pd.concat([counts, identity], axis=1)

    fig = plt.figure(figsize=(35, 10))
    ax00 = plt.subplot2grid((1, 1), (0, 0))
    rp.positional_structural_similarity_plot(merged, ax00)
    plt.tight_layout()
    return fig
def test_per_res_matrix_score(self):
    """Plot per-residue matrix scores with and without highlighted selections.

    Returns the generated figure.
    """
    sc_des = {"scores": ["score"], "sequence": "B"}
    df = ri.parse_rosetta_file(self.silent1, sc_des)
    # The first decoy acts as the reference; numbering is shifted to 10.
    df.add_reference_sequence('B', df.iloc[0]['sequence_B'])
    df.add_reference_shift('B', 10)

    seles = [('15-25', 'red'), ('45B-60B', 'green')]
    layout = (2, 1)

    fig = plt.figure(figsize=(25, 10))
    # Top panel: plain plot
    top = plt.subplot2grid(layout, (0, 0))
    rp.per_residue_matrix_score_plot(df.iloc[1], "B", top)
    # Bottom panel: same decoy with selections
    bottom = plt.subplot2grid(layout, (1, 0))
    rp.per_residue_matrix_score_plot(df.iloc[1], "B", bottom, selections=seles)
    plt.tight_layout()
    return fig
def test_pymol(self):
    """Check the PyMOL selection strings built for identified mutants."""
    silent = os.path.join(self.dirpath, 'input_2seq.minisilent.gz')
    df = parse_rosetta_file(silent, {'sequence': 'B'})
    # The first decoy is its own reference, so it carries no mutations.
    df.add_reference_sequence('B', df.iloc[0].get_sequence('B'))
    df = df.identify_mutants('B').head()

    pick1 = ""
    pick2 = (
        "sele test_3lhp_binder_labeled_00002_mut, test_3lhp_binder_labeled_00002 and "
        "((c. B and (i. 1-2 or i. 7-9 or i. 11-12 or i. 14-17 or i. 19 or i. 21-23 or "
        "i. 25-27 or i. 31-33 or i. 35-39 or i. 42 or i. 45 or i. 48 or i. 52 or "
        "i. 64-68 or i. 70-75 or i. 77 or i. 79-82 or i. 84-86 or i. 88-89 or "
        "i. 91-102 or i. 104-111 or i. 113-116)))"
    )

    sel = pymol_mutant_selector(df)
    first, second = sel[0], sel[1]
    assert len(first) == 0
    assert first == pick1
    assert len(second) != 0
    assert second == pick2
def main(options):
    """Scatter-plot two columns of the parsed input against each other.

    :param options: Object carrying the attributes ``ifile``, ``ifiles``,
        ``fsize``, ``x``, ``y``, ``color``, ``xlim``, ``ylim``, ``xlab``,
        ``ylab``, ``title``, ``ofile`` and ``silent`` read below.

    :return: The figure holding the plot.
    """
    # Load data
    multi = options.ifiles is not None
    infile = options.ifiles if options.ifile is None else options.ifile
    df = parse_rosetta_file(infile, multi=multi)

    # Plot
    if options.fsize[0] is not None:
        fig = plt.figure(figsize=[float(x) for x in options.fsize])
    else:
        fig = plt.figure()
    ax = plt.subplot2grid((1, 1), (0, 0), fig=fig)
    point_color = sns.color_palette()[int(options.color)]
    sns.regplot(x=options.x, y=options.y, data=df, fit_reg=False, ax=ax,
                color=point_color)

    # Axis limits are only applied when their first value is given.
    if options.ylim[0] is not None:
        ax.set_ylim(bottom=float(options.ylim[0]), top=float(options.ylim[1]))
    if options.xlim[0] is not None:
        ax.set_xlim(left=float(options.xlim[0]), right=float(options.xlim[1]))

    # Axis labels
    if options.ylab is not None:
        ax.set_ylabel(options.ylab)
    if options.xlab is not None:
        ax.set_xlabel(options.xlab)
    add_top_title(ax, options.title)
    plt.tight_layout()

    # Write to file
    if options.ofile is not None:
        plt.savefig(options.ofile)

    # Show on screen
    if not options.silent:
        plt.show()

    return fig
def main(options):
    """Rewrite a silent file replacing every decoy description with a new name.

    Descriptions are read from ``options.ifile``; the new name of each one is
    obtained from ``new_names`` with its 1-based position and
    ``options.prefix``. Every occurrence of a description in the file content
    is then replaced and the result is written to ``options.ofile``. Input
    and output are treated as gzip files when their names end in ``.gz``.

    :param options: Object carrying the attributes ``ifile``, ``ofile`` and
        ``prefix`` read below.
    """
    # Get names and make new ones
    names = parse_rosetta_file(options.ifile, {"scores": ["description"]})
    names["count"] = names.index + 1
    names["new"] = names.apply(
        lambda row: new_names(row["count"], options.prefix), axis=1)

    # Load the silentfile and change names.
    # Context managers guarantee the handles are closed even on failure.
    if options.ifile.endswith(".gz"):
        with gzip.open(options.ifile) as fd:
            data = fd.read().decode('utf8')
    else:
        with open(options.ifile) as fd:
            data = fd.read()

    for _, row in names.iterrows():
        data = data.replace(row["description"], row["new"])

    # Save and write
    if options.ofile.endswith(".gz"):
        # gzip is opened in binary mode, hence the explicit encoding.
        with gzip.open(options.ofile, "w") as fd:
            fd.write(data.encode('utf-8'))
    else:
        with open(options.ofile, "w") as fd:
            fd.write(data)
def test_labels(self):
    """Check selection and label coverage percentages per chain."""
    sc_des = {
        "scores": ["score"],
        "labels": ["MOTIF", "CONTACT", "CONTEXT"],
        "sequence": "AB",
    }
    df = ri.parse_rosetta_file(self.silent1, sc_des)

    # Percentage covered by a fixed selection: identical for every decoy.
    df = ra.selector_percentage(df, "A", "10-25", "test")
    df = ra.selector_percentage(df, "B", "12-20", "test")
    assert set(df.columns) == {
        'score', 'lbl_MOTIF', 'lbl_CONTACT', 'lbl_CONTEXT',
        'sequence_A', 'sequence_B', 'test_A_perc', 'test_B_perc',
    }
    assert len(df['test_A_perc'].unique()) == 1
    assert len(df['test_B_perc'].unique()) == 1
    assert df['test_A_perc'].values[0] == pytest.approx(0.1019, rel=1e-3)
    assert df['test_B_perc'].values[0] == pytest.approx(0.07758, rel=1e-3)

    # Percentage covered by each label, same call order as always.
    label_calls = [
        ("A", "CONTEXT"), ("A", "CONTACT"), ("A", "MOTIF"),
        ("B", "CONTACT"), ("B", "MOTIF"), ("B", "CONTEXT"),
    ]
    for chain, label in label_calls:
        df = ra.label_percentage(df, chain, label)

    # Chain A
    assert len(df['CONTEXT_A_perc'].unique()) == 1
    assert df['CONTEXT_A_perc'].values[0] == 1
    assert len(df['MOTIF_A_perc'].unique()) == 1
    assert df['MOTIF_A_perc'].values[0] == 0
    assert len(df['CONTACT_A_perc'].unique()) > 1
    assert df['CONTACT_A_perc'].mean() == pytest.approx(0.0552, rel=1e-3)

    # Chain B
    assert len(df['CONTEXT_B_perc'].unique()) == 1
    assert df['CONTEXT_B_perc'].values[0] == 0
    assert len(df['MOTIF_B_perc'].unique()) == 1
    assert df['MOTIF_B_perc'].values[0] == pytest.approx(0.1896, rel=1e-3)
    assert len(df['CONTACT_B_perc'].unique()) > 1
    assert df['CONTACT_B_perc'].mean() == pytest.approx(0.4669, rel=1e-3)
def apply_resfile(self, seqID, filename, rscript=None, keep_input_scores=False):  # pragma: no cover
    """Apply a generated Rosetta `resfile
    <https://www.rosettacommons.org/docs/latest/rosetta_basics/file_types/resfiles>`_
    to the decoy.

    This function needs to be called after the appropriate mutant variants have
    been created and their corresponding **resfiles** have been written.

    .. note::
        Depends on :ref:`rosetta.path <options>` and :ref:`rosetta.compilation <options>`,
        if the ``filename`` does not exist.

    .. attention::
        This function **REQUIRES** a local installation of **Rosetta**.

    To execute this function it is important that the ``source_file`` assigned to the
    :class:`.DesignFrame` is an original silent file and **not a minisilent**, as the
    original structure of the decoy needs to be used in order to generate the variants.
    If that is not the case, use :class:`.DesignFrame.replace_source_files`.

    :param str seqID: |seqID_param|
    :param str filename: Name of the final silent file that will contain all the variant's
        data. If the file exists, it is assumed that the data was already created and
        data will be directly loaded from that file.
    :param str rscript: By default, the script executed will be the one generated by
        :func:`.mutations`. One can provide its own script (either as the file name of the
        script or as a string of the content itself) **as long as it fulfills two conditions**:
        (1) It must contain the **AddJobPairData Mover** and (2) it should accept the script
        variable ``resfile``. An example on how to use these two conditions can be extrapolated
        from :func:`.mutations`.
    :param bool keep_input_scores: When :data:`True` (default :data:`False`), it will keep the
        score terms present in the source decoy (as they appear in the original silent file)
        for the variants.

    :return: :class:`.DesignFrame` with the scores for the mutants.

    :raise:
        :SystemError: If all variants fail to be generated or if they cannot be merged.
        :IOError: If Rosetta path cannot be found.
        :AttributeError: If the resfiles for the variants were not previously created.

    .. seealso::
        :meth:`.DesignFrame.generate_mutant_variants`
        :meth:`.DesignFrame.generate_mutants_from_matrix`
        :meth:`.DesignFrame.generate_wt_reversions`
        :meth:`.DesignFrame.make_resfile`
        :meth:`.DesignSeries.generate_mutant_variants`
        :meth:`.DesignSeries.generate_mutants_from_matrix`
        :meth:`.DesignSeries.generate_wt_reversions`
        :meth:`.DesignSeries.make_resfile`

    .. rubric:: Example

    .. ipython::

        In [1]: from rstoolbox.io import parse_rosetta_file
           ...: import pandas as pd
           ...: pd.set_option('display.width', 1000)
           ...: df = parse_rosetta_file("../rstoolbox/tests/data/input_2seq.minisilent.gz",
           ...:                         {'scores': ['score', 'description'], 'sequence': 'B'})
           ...: df.add_reference_sequence('B', df.get_sequence('B').values[0])
           ...: dfwt = df.iloc[0].generate_mutant_variants('B', [(1, "TGP"), (6, "ERG"),
           ...:                                                  (14, "MAT")])
           ...: # Call in test-mode
           ...: dfwt = dfwt.make_resfile("B", "NATAA", "mutants.resfile", write=False )
           ...: dfwt2 = dfwt.iloc[:3].apply_resfile("B",
           ...:                                     "../rstoolbox/tests/data/variants.silent.gz")
           ...: dfwt2
    """
    from rstoolbox.components import DesignSeries, DesignFrame
    from rstoolbox.io import parse_rosetta_file
    from rstoolbox.utils import mutations

    # A single decoy is promoted to a one-row frame so that the rest of the
    # function only deals with frames.
    if isinstance(self, DesignSeries):
        self = DesignFrame(self).T

    resfile = 'resfile_{}'.format(seqID)
    # Rosetta is only executed when the final silent file is not there yet.
    if not os.path.isfile(filename):
        wdir = tempfile.mkdtemp()
        exe = make_rosetta_app_path('rosetta_scripts')
        if resfile not in self.columns:
            raise AttributeError("Resfiles are needed to execute this function.")

        if rscript is None:
            rscript = mutations(seqID)
        # Anything that is not an existing file is taken as script content
        # and dumped into the working directory.
        if not os.path.isfile(rscript):
            script_path = os.path.join(wdir, 'script.xml')
            # Context manager: the handle is closed even if writing fails.
            with open(script_path, 'w') as fd:
                fd.write(rscript)
            rscript = script_path

        command = ['{0}', '-parser:protocol {1}', '-in:file:silent {2}', '-in:file:tags {3}',
                   '-out:file:silent {4}', '-parser:script_vars resfile={5}']
        if not keep_input_scores:
            command.append('-keep_input_scores false')
        command = ' '.join(command)
        outfiles = []
        errors = 0
        sys.stdout.write("Running Rosetta\n")
        for _, row in self.iterrows():
            # Descriptions ending in ``_v`` plus four digits have that last
            # field removed to obtain the tag looked up in the source file.
            if re.search(r'_v\d{4}$', row['description']):
                origin = "_".join(row['description'].split('_')[:-1])
            else:
                origin = row['description']
            outfiles.append(os.path.join(wdir, row['description'] + '.silent'))
            cmd = command.format(exe, rscript, " ".join(self.get_source_files()),
                                 origin, outfiles[-1], row[resfile])
            sys.stdout.write(cmd + "\n")
            error = execute_process(cmd)
            if bool(error):
                errors += 1
                sys.stdout.write("Execution for variant {} has failed\n".format(row['description']))

        # Merge the individual outputs unless every single run has failed.
        if errors < self.shape[0]:
            exe = make_rosetta_app_path('combine_silent')
            command = ['{0}', '-in:file:silent {1}', '-out:file:silent {2}']
            command = ' '.join(command)
            cmd = command.format(exe, " ".join(outfiles), filename)
            sys.stdout.write("Merging all silent files\n")
            sys.stdout.write(cmd + "\n")
            error = execute_process(cmd)
            if bool(error):
                raise SystemError("A file with the new variants could not be created.")
        else:
            raise SystemError("All variants failed to be generated.")

    # The resfile column is the key that links each variant to its new scores.
    df = parse_rosetta_file(filename)
    df = df.drop(columns=['description'])
    return self.merge(df, on=resfile, how='left')
def test_mutants(self):
    """Check mutant identification, variant generation, resfiles and plotting.

    Returns the figure with the alignment plot.
    """
    # Static data
    refseq = ("GSISDIRKDAEVRMDKAVEAFKNKLDKFKAAVRKVFPTEERIDMRPEIWIAQELRRIGDE"
              "FNAYRDANDKAAALGKDKEINWFDISQSLWDVQKLTDAAIKKIEAALADMEAWLTQ")
    columns = ["mutants_B", "mutant_count_B", "mutant_positions_B"]
    mut_number = [97, 91, 88, 90, 92, 92]
    mut_type = [
        "G1T,S2R,I3P,S4E,D5E,I6A,K8E,D9R,E11W,V12R,R13L,M14A,D15E,K16I,V18M,E19R,A20K,F21G,"
        "K22W,N23E,K24E,L25H,D26E,K27R,F28E,K29W,A30E,A31W,V32W,R33K,K34R,V35A,F36S,P37K,"
        "T38G,E39R,R41E,I42R,R45L,I48R,W49M,Q52A,E53A,R56A,D59E,E60I,Y64E,R65W,D66Q,A67M,N68R"
        ",D69L,K70E,A71M,A72E,A73K,L74E,G75R,D77N,K78P,E79N,I80A,N81G,W82E,F83E,D84K,I85M,S86K,"
        "Q87E,S88Q,L89K,W90K,D91E,V92A,Q93W,L95I,T96A,D97Y,A98Y,A99W,I100G,K101L,K102M,I103A,"
        "E104A,A105Y,A106W,L107I,A108K,D109Q,M110H,E111R,A112E,W113K,L114E,T115R,Q116K",
        "G1P,S2K,I3P,S4E,D5E,I6A,R7M,K8R,D9E,E11Y,V12K,R13L,M14I,D15K,A17Y,V18M,E19L,A20K,F21A,"
        "K22Q,N23K,K24E,L25A,D26Q,K27E,F28E,K29W,A30E,A31R,V32M,K34R,V35T,F36D,P37G,E39K,R41E,"
        "I42K,R45F,I48K,W49M,E53A,R56A,D59E,E60I,R65Y,D66W,N68F,D69L,A71L,A72Q,A73E,L74F,G75K,"
        "D77Y,K78P,E79S,I80V,N81R,F83E,D84E,I85Q,S86E,Q87E,S88A,L89R,W90K,D91R,V92L,Q93K,K94I,"
        "L95M,T96M,D97K,A98I,A99G,I100A,K101E,K102W,I103A,E104R,A105E,A106I,L107A,A108R,D109E,"
        "E111K,A112E,W113R,L114I,T115K,Q116R",
        "G1T,S2K,I3P,S4E,D5E,I6M,R7A,K8R,D9E,E11Y,V12K,D15L,V18L,E19K,A20Q,F21G,K22E,N23E,K24E,"
        "L25M,D26K,K27R,F28M,K29Y,A30E,A31Q,V32M,R33K,V35G,F36V,P37D,T38S,E39K,R41E,I42R,R45E,"
        "I48K,W49M,Q52I,E53A,R56A,D59E,E60L,Y64W,R65M,D66K,N68L,D69R,K70H,A71M,A72K,A73E,G75R,"
        "D77L,K78G,E79T,I80S,N81G,W82P,F83K,D84E,I85E,S86E,Q87K,S88H,L89W,W90R,D91W,V92I,Q93F,"
        "K94E,T96H,D97R,A98W,I100G,K101E,K102E,E104Q,A105R,L107A,A108E,D109I,M110Q,A112R,W113K,"
        "L114A,T115R,Q116W",
        "G1T,S2K,I3P,S4E,D5E,I6W,R7A,K8R,D9W,E11Y,V12K,R13E,M14H,D15L,A17M,V18A,A20K,F21H,K22R,"
        "N23K,K24E,L25M,D26E,K27I,F28E,K29W,A30E,A31E,V32L,R33K,K34R,V35R,F36D,P37G,T38K,R41E,"
        "I42K,R45W,I48R,W49M,Q52M,E53A,R56A,D59E,E60L,A63H,Y64H,R65M,D66Y,N68E,D69M,K70R,A72K,"
        "A73E,L74E,G75K,D77K,K78P,I80A,N81K,W82T,F83E,D84E,I85A,S86R,Q87R,S88A,L89R,W90R,D91E,"
        "V92I,Q93M,L95Y,T96H,D97H,A98E,I100G,K101R,K102L,A105E,L107M,A108R,D109R,M110L,E111M,"
        "A112E,W113R,L114H,T115K,Q116K",
        "G1K,S2K,I3W,S4E,D5E,I6M,R7M,K8R,D9E,V12R,R13Q,M14G,D15K,K16E,A17Y,V18A,E19Q,A20K,F21A,"
        "K22W,N23K,K24E,L25A,D26L,K27L,F28E,K29W,A30K,A31W,V32M,V35R,F36P,P37V,R41M,I42K,R45A,"
        "I48W,W49M,Q52A,E53A,R56A,D59E,E60H,A63I,R65W,D66Q,A67Q,N68K,D69L,K70E,A71H,A72E,A73K,"
        "G75R,D77I,K78P,E79N,I80V,N81P,W82E,F83E,D84E,I85L,S86E,Q87K,S88G,L89K,W90E,D91E,V92L,"
        "Q93K,K94R,L95I,T96E,D97E,A98E,I100A,K101R,K102M,I103A,A105K,A106Y,L107M,A108Q,D109E,"
        "M110L,E111R,A112K,W113K,L114M,T115E,Q116S",
        "G1P,S2R,I3P,S4E,D5E,I6M,R7A,K8R,D9F,E11K,V12E,R13E,D15H,A17H,V18E,A20K,F21A,K22Y,N23R"
        ",K24E,L25F,D26L,K27L,F28E,K29Y,A30E,A31L,V32A,R33I,K34R,V35K,F36N,R41P,I42K,R45Q,I48W"
        ",W49A,Q52A,E53A,R56A,D59E,E60I,A63Q,Y64W,R65M,D66Y,A67H,N68L,D69L,K70E,A71I,A72R,A73K"
        ",L74E,G75N,K76G,D77S,K78S,E79H,I80T,N81R,W82Y,F83E,D84E,I85R,S86E,Q87K,S88Y,L89R,W90K"
        ",D91L,V92A,Q93K,K94R,T96H,D97E,A98E,I100A,K102E,E104W,A105K,A106F,L107M,A108H,D109E,"
        "M110A,E111M,A112R,W113R,L114F,T115E,Q116S"
    ]
    # Positions are the mutation codes without their first and last letter.
    mut_pos = [",".join(code[1:-1] for code in muts.split(","))
               for muts in mut_type]
    sc_des = {"labels": ["MOTIF", "CONTACT"], "sequence": "B"}

    # Start test
    df = ri.parse_rosetta_file(self.silent1, sc_des)
    df.add_reference_sequence("B", refseq)
    df = df.identify_mutants("B")

    for col in columns:
        assert col in df

    sr = df.iloc[0]
    assert df.get_reference_sequence("B") == sr.get_reference_sequence("B")
    assert df.get_identified_mutants() == ["B"]

    for i, row in df.iterrows():
        # Check number of mutations
        assert row.get_mutation_count("B") == mut_number[i]
        # Check type of mutations
        assert row.get_mutations("B") == mut_type[i]
        # Check position of mutations
        assert row.get_mutation_positions("B") == mut_pos[i]

    # Make new variants
    variant_spec = [(1, "TGAP"), (14, "MAPT")]
    dfm2 = df.iloc[0].generate_mutant_variants('B', variant_spec)
    assert dfm2.shape[0] == 16
    # NOTE(review): unlike the similar check further down, this one does not
    # go through ``.values`` -- confirm which membership is intended.
    assert 0 in dfm2.get_mutation_count('B')

    # Revert to WT
    dfwt = df.iloc[0:2].generate_wt_reversions('B', [1, 14])
    assert dfwt.shape[0] == 8

    dfwt = rc.DesignFrame({"description": ["reference"],
                           "sequence_B": [refseq]})
    dfwt.add_reference_sequence('B', refseq)
    dfwt = dfwt.generate_mutant_variants(
        'B', [(1, "TGP"), (6, "ERG"), (14, "MAT")])
    assert dfwt.shape[0] == 28
    dfwt = dfwt.generate_wt_reversions('B').identify_mutants('B')
    assert dfwt.shape[0] == 36
    assert 0 in dfwt.get_mutation_count('B').values
    assert refseq in dfwt.get_sequence('B').values

    # Make mutants from Matrix
    dfwt = rc.DesignFrame({"description": ["reference"],
                           "sequence_B": [refseq]})
    dfwt.add_reference_sequence('B', refseq)
    matrix = random_frequency_matrix(len(df.get_reference_sequence('B')), 0)
    key_res = [3, 5, 8, 12, 15, 19, 25, 27]
    mutants = dfwt.generate_mutants_from_matrix('B', matrix, 5, key_res)
    assert isinstance(mutants, list)
    assert len(mutants) == 1
    mutants = mutants[0].identify_mutants('B')
    assert mutants.shape[0] == 5
    assert mutants.pssm_score_B.mean() != 0

    # write to resfiles
    df.make_resfile("B", "NATAA",
                    os.path.join(self.tmpdir, "mutanttest.resfile"))
    for i, row in df.iterrows():
        # One resfile per design, numbered with the zero-padded row index.
        newfile = os.path.join(
            self.tmpdir, "mutanttest" + "_{:>04d}".format(i) + ".resfile")
        assert row["resfile_B"] == newfile
        assert os.path.isfile(newfile)

    # write alignment
    alignment = os.path.join(self.tmpdir, "mutanttest.clw")
    ri.write_mutant_alignments(df, "B", alignment)
    assert os.path.isfile(os.path.join(self.tmpdir, "mutanttest.clw"))

    # plot mutant
    fig = plt.figure(figsize=(30, 10))
    ax = plt.subplot2grid((1, 1), (0, 0), fig=fig)
    rp.plot_alignment(df, "B", ax, matrix="BLOSUM62")
    return fig
def test_getters(self):
    """Test usage of the getter functions."""
    # Assert types. Rows are DesignSeries, columns are not
    label_ids = ["MOTIF", "CONTACT", "CONTEXT"]
    sc_des = {"labels": label_ids, "sequence": "AB"}
    df = ri.parse_rosetta_file(self.silent1, sc_des)
    assert isinstance(df, rc.DesignFrame)
    sr = df.iloc[0]
    assert isinstance(sr, rc.DesignSeries)
    description = df["description"]
    assert not isinstance(description, rc.DesignSeries)
    assert isinstance(df["description"], pd.Series)

    # Check working with sequence getters.
    # Everything is checked for both DesignSeries and DesignFrame:
    # the frame returns a Series, the series returns the actual data.
    assert sorted(df.get_available_sequences()) == ["A", "B"]
    assert sorted(sr.get_available_sequences()) == ["A", "B"]
    assert len(df.get_sequence("A")) == 6
    assert len(sr.get_sequence("A")) == 157
    assert df.get_sequence("B")[0] == sr.get_sequence("B")

    # Check working with label getters.
    # Everything is checked for both DesignSeries and DesignFrame:
    # the frame returns a Series, the series returns the actual data.
    assert sorted(df.get_available_labels()) == sorted(sc_des["labels"])
    assert sorted(sr.get_available_labels()) == sorted(sc_des["labels"])
    # Asking for a label without naming the chain raises.
    with pytest.raises(KeyError):
        sr.get_label("MOTIF")
    assert isinstance(df.get_label("MOTIF", "A")[0], rc.Selection)
    assert isinstance(sr.get_label("MOTIF", "A"), rc.Selection)
    assert str(df.get_label("CONTEXT", "B")[0]) == ""
    assert str(sr.get_label("CONTEXT", "A")) == "1-157"
    assert str(sr.get_label("CONTEXT", "B")) != str(sr.get_label("CONTEXT", "A"))

    # Check working with structure getters.
    # Everything is checked for both DesignSeries and DesignFrame:
    # the frame returns a Series, the series returns the actual data.
    df = ri.parse_rosetta_file(self.silent2,
                               {"sequence": "C", "structure": "C"})
    sr = df.iloc[0]
    assert df.get_available_structures() == ["C"]
    assert sr.get_available_structures() == ["C"]
    # Chain "B" was not loaded, so asking for it raises.
    with pytest.raises(KeyError):
        assert len(df.get_structure("B")) == 6
    with pytest.raises(KeyError):
        assert len(sr.get_structure("B")) == 157
    assert df.get_structure("C")[0] == sr.get_structure("C")

    # Check working with structure prediction getters.
    # Everything is checked for both DesignSeries and DesignFrame:
    # the frame returns a Series, the series returns the actual data.
    assert df.get_available_structure_predictions() == []
    with pytest.raises(KeyError):
        assert len(df.get_structure_prediction("C")) == 6

    sc_des = {
        'sequence': 'A',
        'structure': 'A',
        'psipred': 'A',
        'dihedrals': 'A',
    }
    df = ri.parse_rosetta_file(self.silent4, sc_des)
    sr = df.iloc[0]
    assert df.get_available_structure_predictions() == ['A']
    assert df.get_structure_prediction('A')[0] == sr.get_structure_prediction('A')
    assert len(df.get_structure_prediction('A')[0]) == 88

    # Dihedrals: a DataFrame for the frame, a list of arrays for the series.
    assert isinstance(df.get_dihedrals("A"), pd.DataFrame)
    assert isinstance(sr.get_dihedrals("A"), list)
    for angles in sr.get_dihedrals("A"):
        assert isinstance(angles, np.ndarray)
    assert np.array_equal(
        df.get_dihedrals("A").iloc[0][0], sr.get_dihedrals("A")[0])

    # these are the ranges of the rosetta angles.
    assert sr.get_phi("A").max() <= 180
    assert sr.get_phi("A").min() >= -180
    assert sr.get_psi("A").max() <= 180
    assert sr.get_psi("A").min() >= -180