def test_color_scheme_custom(self):
    """Run ``logo_plot`` with an explicit letter-to-hex-colour dictionary.

    The sequences are read from ``logo_plot_sequence.csv`` inside
    ``self.dirpath`` and the resulting figure is returned.
    """
    custom = {
        'A': '#e6194b', 'C': '#3cb44b', 'D': '#ffe119', 'E': '#ffe119',
        'F': '#f58231', 'G': '#911eb4', 'H': '#46f0f0', 'I': '#f032e6',
        'K': '#d2f53c', 'L': '#d2f53c', 'M': '#008080', 'N': '#e6beff',
        'P': '#aa6e28', 'Q': '#fffac8', 'R': '#800000', 'S': '#aaffc3',
        'T': '#808000', 'V': '#ffd8b1', 'W': '#000080', 'Y': '#808080'
    }

    # The CSV has no header; its single column holds the chain A sequences.
    csv_path = os.path.join(self.dirpath, 'logo_plot_sequence.csv')
    raw_table = pd.read_csv(csv_path, header=None)
    df = rc.DesignFrame(raw_table.rename(columns={0: 'sequence_A'}))

    fig, _ = rp.logo_plot(df, "A", refseq=False, line_break=50,
                          font_size=10, hight_prop=2, colors=custom)
    return fig
def read_fastq(filename, seqID='A'):
    """Reads a FASTQ file and stores the ID together with the sequence.

    The default generated :class:`.DesignFrame` will contain two columns:

    ====================  ===================================================
    Column Name           Data Content
    ====================  ===================================================
    **description**       Sequence identifier.
    **sequence_<chain>**  Sequence content.
    ====================  ===================================================

    Files whose name ends in ``gz`` are read through :mod:`gzip`. Lines that
    contain ``@``, ``+`` or any lowercase character are not treated as
    sequence; blank lines are ignored.

    :param str filename: FASTQ filename.
    :param str seqID: |seqID_param|

    :return: :class:`.DesignFrame`

    .. rubric:: Example

    .. ipython::

        In [1]: from rstoolbox.io import read_fastq
           ...: import pandas as pd
           ...: pd.set_option('display.width', 1000)
           ...: df = read_fastq("../rstoolbox/tests/data/cdk2_rand_001.fasq.gz")
           ...: df.head(8)
    """
    identifiers = []
    sequences = []

    is_gz = filename.endswith('gz')
    # The context manager releases the handle even if parsing fails midway.
    with (gzip.open(filename) if is_gz else open(filename)) as fastq_file:
        for line in fastq_file:
            line = line.decode('utf8') if is_gz else line
            # Strip first: otherwise a blank line still carries its newline,
            # is never recognised as empty, and ends up stored as an empty
            # sequence (desynchronising identifiers and sequences).
            line = line.strip()
            if not line:
                continue
            if line.startswith('@'):
                identifiers.append(str(line.split(':')[0].split(';')[0][1:]))
            # Header, separator and (heuristically) quality lines are skipped.
            if '@' in line or '+' in line or any(c.islower() for c in line):
                continue
            sequences.append(str(line))

    return rc.DesignFrame({
        'description': identifiers,
        'sequence_{}'.format(seqID): sequences
    })
def parse_rosetta_json(filename):
    """Read a json formatted rosetta score file.

    Only reads back scores, as those are the only content present in a
    ``JSON`` file. Each non-empty line of the file is parsed as an
    independent JSON object; every key becomes a column.

    :param str filename: File containing the Rosetta score file.

    :return: :class:`.DesignFrame`.

    .. note::
        To be coherent with the silent files, the decoy id column name
        ``decoy`` is changed to ``description``.

    :raises:
        :IOError: if ``filename`` cannot be found.

    .. rubric:: Example

    .. ipython::

        In [1]: from rstoolbox.io import parse_rosetta_json
           ...: import pandas as pd
           ...: pd.set_option('display.width', 1000)
           ...: pd.set_option('display.max_columns', 500)
           ...: df = parse_rosetta_json("../rstoolbox/tests/data/score.json.gz")
           ...: df.head(2)
    """
    is_gz = filename.endswith(".gz")
    data = {}
    # The context manager guarantees the handle is closed on any exit path.
    with (gzip.open(filename) if is_gz else open(filename)) as fd:
        for line in fd:
            if is_gz:
                line = line.decode('utf8')
            line = line.strip()
            if not line:
                # A blank (e.g. trailing) line is not a JSON document.
                continue
            dt = json.loads(line)
            for k in dt:
                data.setdefault(k, []).append(dt[k])

    df = rc.DesignFrame(data)
    # ``rename`` returns a new object unless ``inplace`` is requested; without
    # it the documented decoy -> description renaming was silently discarded.
    df.rename(columns={'decoy': 'description'}, inplace=True)
    df.add_source_file(filename)
    return df
def read_fasta(filename, expand=False, multi=False, defchain='A'):
    """Reads one or more **FASTA** files and returns the appropriate object
    containing the requested data: the :class:`.DesignFrame`.

    The default generated :class:`.DesignFrame` will contain two columns:

    ====================  ===================================================
    Column Name           Data Content
    ====================  ===================================================
    **description**       Sequence identifier.
    **sequence_<chain>**  Sequence content.
    ====================  ===================================================

    The sequence column assigned as ``sequence_A`` is an arbitrary decision
    that has to do with compatibility issues with the rest of functions and
    methods of :class:`.DesignFrame`.

    .. ipython::

        In [1]: from rstoolbox.io import read_fasta
           ...: import pandas as pd
           ...: pd.set_option('display.width', 1000)
           ...: df = read_fasta("../rstoolbox/tests/data/*fa$", multi=True)
           ...: df

    If the **FASTA** comes or is formatted as **PDB FASTA** (as in the example
    above), it is possible to better assign the column names to the actual
    sequence ID. To force that behaviour, activate the ``expand`` option.

    .. ipython::

        In [1]: from rstoolbox.io import read_fasta
           ...: df = read_fasta("../rstoolbox/tests/data/*fa", expand=True, multi=True)
           ...: df

    .. note::
        Notice everything from the original ``description`` after the ``|``
        symbol is lost after that process.

    :param str filename: file name or file pattern to search.
    :param bool expand: Try to better associate sequence ID if format is
        **PDB FASTA**.
    :param bool multi: When :data:`True`, indicates that data is read from
        multiple files.
    :param str defchain: Default chain to use. If not provided that is 'A'.

    :return: :class:`.DesignFrame`.

    :raises:
        :IOError: if ``filename`` cannot be found.

    .. seealso::
        :func:`~.write_fasta`
    """
    seqcol = "sequence_{}".format(defchain)
    files = _gather_file_list(filename, multi)
    data = {"description": [], seqcol: []}

    for f in files:
        is_gz = f.endswith(".gz")
        # Close every input file deterministically, even on a parsing error.
        with (gzip.open(f) if is_gz else open(f)) as fd:
            for line in fd:
                line = line.decode('utf8') if is_gz else line
                line = line.strip()
                if line.startswith(">"):
                    # New record: store the identifier and open an empty
                    # sequence that the following lines will extend.
                    data["description"].append(line.strip(">"))
                    data[seqcol].append("")
                elif len(line) > 0:
                    data[seqcol][-1] += line

    df = cp.DesignFrame(data)
    # PDB FASTA identifiers look like ``1ABC:A|...``; only then can the chain
    # be recovered from the description. The emptiness guard avoids an
    # IndexError on ``iloc[0]`` when no record was read.
    if (expand and df.shape[0] > 0 and
            bool(re.search(r"^\S{4}\:\S{1}", df.iloc[0]["description"]))):
        df["description"] = df["description"].apply(
            lambda col: col.split("|")[0])
        df[['description', 'seq']] = df['description'].str.split(
            ':', expand=True)
        # Keyword arguments: positional ``pivot`` arguments are not accepted
        # by recent pandas releases.
        df = df.pivot(index='description', columns='seq', values=seqcol)
        df = df.add_prefix("sequence_").rename_axis(None, axis=1).reset_index()
        df = cp.DesignFrame(df)

    df.add_source_files(files)
    return df
def write_fasta(df, seqID, separator=None, filename=None, split=False):
    """Writes fasta files of the selected decoys.

    It assumes that the provided data is contained in a :class:`.DesignFrame`
    or a :class:`~pandas.DataFrame`.

    Mandatory columns are:

    ====================  ===================================================
    Column Name           Data Content
    ====================  ===================================================
    **description**       Sequence identifier.
    **sequence_<seqID>**  Sequence content.
    ====================  ===================================================

    .. ipython::

        In [1]: from rstoolbox.io import read_fasta, write_fasta
           ...: df = read_fasta("../rstoolbox/tests/data/*fa", multi=True)
           ...: print(write_fasta(df, "A"))

    When working with multiple ``seqID``, one can select which ones to be
    printed; empty sequences will be skipped.

    .. ipython::

        In [1]: from rstoolbox.io import read_fasta, write_fasta
           ...: df = read_fasta("../rstoolbox/tests/data/*fa", expand=True, multi=True)
           ...: print(write_fasta(df, "AC"))

    :param df: Data content.
    :type df: Union[:class:`.DesignFrame`, :class:`~pandas.DataFrame`]
    :param str seqID: |seqID_param|.
    :param str separator: Add ``seqID`` to sequence identifier through a
        particular string separator. If multiple ``seqID`` are provided, it
        defaults to ``:``.
    :param str filename: Output file name. Names ending in ``.gz`` are
        written gzip-compressed.
    :param bool split: Split each fasta in a different file. ``filename``
        first part of the filename is used as `prefix`, with a following
        enumeration.

    :return: :class:`str` - **FASTA** formatted string.

    :raises:
        :IOError: If ``filename`` exists and global option
            :ref:`system.overwrite <options>` is not :data:`True`.
        :AttributeError: |seqID_error|.

    .. note::
        Depends on :ref:`system.overwrite <options>` and
        :ref:`system.output <options>`.

    .. seealso::
        :func:`~.read_fasta`
    """
    def nomenclator(row, seqID, separator):
        # Build one FASTA record; "" flags a missing/empty sequence
        # (NaN cells reach here as floats).
        sequence = row.get_sequence(seqID)
        if sequence is None or isinstance(sequence, float) or len(sequence) == 0:
            return ""
        name = ">" + row.get_id()
        if separator is not None:
            name = name + separator + seqID
        return name + "\n" + sequence

    def _dump(fname, content):
        # gzip handles are binary: the text has to be encoded before writing,
        # otherwise Python 3 raises TypeError.
        if fname.endswith(".gz"):
            with gzip.open(fname, "wb") as fd:
                fd.write(content.encode("utf8"))
        else:
            with open(fname, "w") as fd:
                fd.write(content)

    if filename is not None:
        if os.path.isfile(filename) and not core.get_option(
                "system", "overwrite"):
            raise IOError("File {} already exists".format(filename))

    if not isinstance(df, cp.DesignFrame):
        df = cp.DesignFrame(df)

    # NOTE(review): the docstring says the default separator applies when
    # *multiple* seqID are given, yet this condition also triggers for a
    # single one. Kept unchanged to preserve the current output - confirm.
    if len(seqID) > 0 and separator is None:
        separator = ":"

    data = []
    for chain in seqID:
        eachfa = df.apply(lambda row: nomenclator(row, chain, separator),
                          axis=1)
        # Skip empty records so they neither leave blank lines in the output
        # nor produce split files holding just a newline.
        data.extend(entry for entry in eachfa.values if entry)

    fasta = "\n".join(data).strip() + "\n"

    if filename is not None:
        if not split:
            _dump(filename, fasta)
        else:
            suffix = "_f{0:04d}"
            cplxname = os.path.splitext(filename)
            for i, sequence in enumerate(data):
                fname = cplxname[0] + suffix.format(i + 1) + cplxname[1]
                _dump(fname, sequence + "\n")

    return fasta
def test_color_scheme_charge(self):
    """Run ``logo_plot`` with the ``"CHARGE"`` colour scheme and return the figure."""
    # The CSV has no header; its single column holds the chain A sequences.
    source = os.path.join(self.dirpath, 'logo_plot_sequence.csv')
    table = pd.read_csv(source, header=None).rename(columns={0: 'sequence_A'})
    df = rc.DesignFrame(table)

    plot_options = {
        'refseq': False,
        'line_break': 50,
        'font_size': 10,
        'hight_prop': 2,
        'colors': "CHARGE",
    }
    fig, _ = rp.logo_plot(df, "A", **plot_options)
    return fig
def test_color_scheme_hydrophobicity(self):
    """Run ``logo_plot`` with the ``'HYDROPHOBICITY'`` colour scheme and return the figure."""
    # The CSV has no header; its single column holds the chain A sequences.
    source = os.path.join(self.dirpath, 'logo_plot_sequence.csv')
    table = pd.read_csv(source, header=None).rename(columns={0: 'sequence_A'})
    df = rc.DesignFrame(table)

    # No ``line_break`` here: the plotting default is used for it.
    plot_options = {
        'refseq': False,
        'font_size': 10,
        'hight_prop': 2,
        'colors': 'HYDROPHOBICITY',
    }
    fig, _ = rp.logo_plot(df, "A", **plot_options)
    return fig
def parse_rosetta_file(filename, description=None, multi=False):
    """Read a Rosetta score or silent file and returns the design population
    in a :class:`.DesignFrame`.

    By default, it will pick the data contained in **all the score columns**
    with the exception of positional scores (such as *per-residue ddg*).
    The user can specify scores to be ignored.

    When working with *silent files*, extra information can be picked, such
    as *sequence* and *secondary structure* data, *residue labels* or
    positional scores. The fine control of these options is explained in
    detail in :ref:`tutorial: reading Rosetta <readrosetta>`.

    Some basic usage cases::

        # (1) The default scenario, just read scores from a single file.
        df = rstoolbox.io.parse_rosetta_file("silentfile")

        # (2) Reading from multiple files. Assumes all files start with
        # the particular prefix.
        df = rstoolbox.io.parse_rosetta_file("silentfile", multi=True)

        # (3) Getting all scores and the sequence of each design.
        description = {'sequence': 'A'}
        df = rstoolbox.io.parse_rosetta_file("silentfile", description)

        # (4) Get only total_score and RMSD, and rename total_score to score.
        description = {'scores': ['RMSD'],
                       'scores_rename': {'total_score': 'score'}}
        df = rstoolbox.io.parse_rosetta_file("silentfile", description)

    :param filename: file name, file pattern to search or list of files.
    :type filename: Union[:class:`str`, :func:`list`]
    :param description: Parsing rules. It can be a dictionary describing
        the rules or the name of a file containing such dictionary. The
        dictionary definition is explained in
        :ref:`tutorial: reading Rosetta <readrosetta>`.
    :type description: Union[:class:`str`, :class:`dict`]
    :param bool multi: When :data:`True`, indicates that data is read from
        multiple files.

    :return: :class:`.DesignFrame`.

    :raises:
        :IOError: if ``filename`` cannot be found.
        :IOError: if ``filename`` pattern (``multi=True``) generates no files.

    .. rubric:: Example

    .. ipython::

        In [1]: from rstoolbox.io import parse_rosetta_file
           ...: import pandas as pd
           ...: pd.set_option('display.width', 1000)
           ...: df = parse_rosetta_file("../rstoolbox/tests/data/input_2seq.minisilent.gz")
           ...: df.head(2)
    """
    manager = rc.Description(**_file_vs_json(description))
    header = []
    data = OrderedDict()

    for line, is_header, _, symm in open_rosetta_file(filename, multi):
        if is_header:
            header = line.strip().split()[1:]
            continue

        if line.startswith("SCORE"):
            # A SCORE line opens a new decoy: per-decoy accumulators are
            # reset here and filled by the lines that follow it.
            per_res = {}
            chains = {
                "id": [], "seq": "", "dssp": "",
                "psipred": "", "phi": [], "psi": []
            }

            # General scores
            for cv, value in enumerate(line.strip().split()[1:]):
                hcv = header[cv]
                if manager.wanted_per_residue_score(hcv):
                    # Group by the score name without its trailing residue
                    # number; the number becomes the key inside the group.
                    hcvn = re.sub(r"\d+$", "", hcv)
                    per_res.setdefault(hcvn, {})
                    per_res[hcvn][int(re.findall(r'\d+$', hcv)[0])] = _check_type(value)
                    continue
                if manager.wanted_score(hcv):
                    data.setdefault(manager.score_name(hcv),
                                    []).append(_check_type(value))

            # Namings from the description
            manager.check_naming(header)
            for namingID, namingVL in manager.get_naming_pairs(
                    line.strip().split()[-1]):
                data.setdefault(namingID, []).append(_check_type(namingVL))

            # Fix per-residue: values ordered by residue number. ``list`` is
            # required because on Python 3 ``.values()`` is a view object,
            # not an indexable list.
            for k in per_res:
                data.setdefault(k, []).append(
                    list(OrderedDict(sorted(per_res[k].items())).values()))

            # Setup labels
            data = manager.setup_labels(data)
            continue

        if line.startswith("RES_NUM"):
            # In multichains and not starting in A1.
            for x in line.split()[1:-1]:
                chain, numbers = x.split(":")
                nums = numbers.split("-")
                if len(nums) == 1 or nums[0] == "":
                    nums = 1
                else:
                    nums = (int(nums[1]) - int(nums[0])) + 1
                chains["id"].extend([chain, ] * nums)
            continue

        if line.startswith("SYMMETRY_INFO"):
            # When working with symmetry, RES_NUM is not there...
            chain = "".join(string.ascii_uppercase[:int(line.split()[2])])
            for c in chain:
                chains["id"].extend([c, ] * int(line.split()[4]))

            data = _add_sequences(manager, data, chains)
            continue

        if line.startswith("ANNOTATED_SEQUENCE"):
            # Drop the bracketed residue annotations, keep one-letter codes.
            chains["seq"] = list(
                re.sub(r'\[[^]]*\]', '', line.strip().split()[1]))
            if not symm:
                # When info is chain A starting in 1, it is not printed in
                # the silent file
                if len(chains["id"]) == 0:
                    chains["id"].extend(["A", ] * len(chains["seq"]))

                data = _add_sequences(manager, data, chains)
            else:
                chains["seq"] = list("".join(chains["seq"]).rstrip("X"))
            continue

        if line.startswith("REMARK DSSP"):
            chains["dssp"] = list(line.split()[2].strip())
            continue

        # Must be tested before "REMARK PSI", which is a prefix of it.
        if line.startswith("REMARK PSIPRED"):
            chains["psipred"] = list(line.split()[2].strip())
            continue

        if line.startswith("REMARK LABELS"):
            for label in line.split()[2].split(";"):
                labinfo = label.split(":")
                if "lbl_" + labinfo[0].upper() in data:
                    data["lbl_" + labinfo[0].upper()][-1] = labinfo[1]
            continue

        if line.startswith("REMARK PHI"):
            chains["phi"] = [
                float(x) for x in line.split()[2].strip().split(",")
            ]
            continue

        if line.startswith("REMARK PSI"):
            chains["psi"] = [
                float(x) for x in line.split()[2].strip().split(",")
            ]
            continue

    df = rc.DesignFrame(data)
    df.add_source_files(_gather_file_list(filename, multi))
    return df
def parse_rosetta_pdb(filename, keep_weights=False, per_residue=False, dropna=True):
    """Read the ``POSE_ENERGIES_TABLE`` from a Rosetta output PDB file.

    The ``POSE_ENERGIES_TABLE`` only contains the score terms contained inside
    the executed score function. It will not add other score terms added
    through filters.

    :param str filename: Name of the PDB file.
    :param bool keep_weights: If :data:`True`, keep the weights row.
    :param bool per_residue: If :data:`True`, keep a row of data for each
        residue. Otherwise, compress the sequence into ``sequence_{}``
        columns.
    :param bool dropna: If :data:`True`, non-standard residues are dropped
        when making the sequence. Otherwise, it appears as ``X``. Consider
        that modifications of residues that are known by Rosetta such as
        ``LYS:CtermProteinFull`` or ``HIS_D`` are considered standard in this
        context.

    :return: :class:`.DesignFrame`
    """
    def chain_ids(infile):
        # Yield, for every "ATOM" match, the word character found 17
        # characters after the tag (the chain identifier column of a PDB
        # ATOM record).
        with open(infile) as fp:
            for result in re.findall(r'ATOM.{17}(\w)', fp.read(), re.S):
                yield result

    def data_between(infile):
        # Return the first energies table found, markers included.
        # NOTE(review): when the file has no table the loop body never runs
        # and None is returned implicitly; the ``idata.split`` below would
        # then fail with AttributeError - confirm if a clearer error is wanted.
        with open(infile) as fp:
            for result in re.findall(
                    r'(#BEGIN_POSE_ENERGIES_TABLE.*?#END_POSE_ENERGIES_TABLE)',
                    fp.read(), re.S):
                return result

    # Three-letter to one-letter residue codes.
    d = {
        'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K',
        'ILE': 'I', 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N',
        'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', 'TRP': 'W',
        'ALA': 'A', 'VAL': 'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M'
    }

    # Distinct chain identifiers, in order of first appearance.
    chains = list(pd.Series(chain_ids(filename)).unique())
    idata = data_between(filename)
    # Decoy name: last whitespace-separated token of the table's first line,
    # with any '.pdb' removed.
    name = idata.split('\n')[0].strip().split()[-1].replace('.pdb', '')
    # Lines starting with '#' (the BEGIN/END markers) are skipped as comments.
    df = pd.read_csv(six.StringIO(idata), comment='#', sep=r'\s+')
    # Tag every row with the decoy name and drop rows labelled 'VRT_*'.
    df = df.assign(description=[name, ] * df.shape[0])[~df['label'].str.startswith('VRT_')]

    # ``chcol`` is later aligned to ``df`` by index through a pandas Series.
    # Presumably the two leading blanks stand for the 'weights' and 'pose'
    # rows, which belong to no chain - TODO confirm.
    chcol = ['', '']
    pick = ['pose', ]
    if not keep_weights:
        df = df[df['label'] != 'weights']
    else:
        pick.append('weights')

    if len(chains) == 1:
        chcol.extend([chains[0], ] * (df.shape[0] - len(chcol) + 1))
    else:
        # Each chain is assumed to start at a row whose label contains
        # 'NtermProteinFull'; the end sentinel is the last index plus one.
        chain_chng = list(
            df[df['label'].str.contains('NtermProteinFull')].index)
        chain_chng.append(int(df.iloc[-1].name) + 1)
        for i in range(0, len(chain_chng) - 1):
            chcol.extend([chains[i], ] * (int(chain_chng[i + 1]) - int(chain_chng[i])))
    df = df.assign(chain=pd.Series(chcol), )

    if not per_residue:
        # Collapse the residue rows into one ``sequence_<chain>`` string per
        # chain and keep only the global rows listed in ``pick``.
        sdata = {'description': [name, ]}
        for g, gdf in df[df['chain'] != ''].groupby('chain'):
            # The residue name is the label part before the first ':' or
            # '_'; names missing from ``d`` become 'X'.
            sdata.setdefault('sequence_{}'.format(g), [
                ''.join(
                    gdf['label'].str.split('[:_]').str[0].map(d).fillna('X'))
            ])
            if dropna:
                sdata['sequence_{}'.format(g)][-1] = sdata[
                    'sequence_{}'.format(g)][-1].replace('X', '')
        df = df[df['label'].isin(pick)].merge(pd.DataFrame(sdata),
                                              on='description')
        # NOTE(review): nesting of the two drops below inside this branch is
        # assumed (per-residue output needs 'chain' and 'label') - confirm.
        df = df.drop(columns=['chain'])
        if not keep_weights:
            df = df.drop(columns=['label'])
    return rc.DesignFrame(df)
def test_mutants(self):
    """Exercise mutant identification, variant generation and mutant outputs.

    Parses ``self.silent1`` keeping chain ``B`` sequences, compares the
    identified mutations against the static expectations below, generates
    variants and wild-type reversions, writes resfiles and an alignment file
    into ``self.tmpdir`` and finally returns the alignment figure.
    """
    # Static data
    refseq = "GSISDIRKDAEVRMDKAVEAFKNKLDKFKAAVRKVFPTEERIDMRPEIWIAQELRRIGDE" \
             "FNAYRDANDKAAALGKDKEINWFDISQSLWDVQKLTDAAIKKIEAALADMEAWLTQ"
    columns = ["mutants_B", "mutant_count_B", "mutant_positions_B"]
    # Expected number of mutations per decoy, in row order.
    mut_number = [97, 91, 88, 90, 92, 92]
    # Expected mutations per decoy, in row order.
    mut_type = [
        "G1T,S2R,I3P,S4E,D5E,I6A,K8E,D9R,E11W,V12R,R13L,M14A,D15E,K16I,V18M,E19R,A20K,F21G,"
        "K22W,N23E,K24E,L25H,D26E,K27R,F28E,K29W,A30E,A31W,V32W,R33K,K34R,V35A,F36S,P37K,"
        "T38G,E39R,R41E,I42R,R45L,I48R,W49M,Q52A,E53A,R56A,D59E,E60I,Y64E,R65W,D66Q,A67M,N68R"
        ",D69L,K70E,A71M,A72E,A73K,L74E,G75R,D77N,K78P,E79N,I80A,N81G,W82E,F83E,D84K,I85M,S86K,"
        "Q87E,S88Q,L89K,W90K,D91E,V92A,Q93W,L95I,T96A,D97Y,A98Y,A99W,I100G,K101L,K102M,I103A,"
        "E104A,A105Y,A106W,L107I,A108K,D109Q,M110H,E111R,A112E,W113K,L114E,T115R,Q116K",
        "G1P,S2K,I3P,S4E,D5E,I6A,R7M,K8R,D9E,E11Y,V12K,R13L,M14I,D15K,A17Y,V18M,E19L,A20K,F21A,"
        "K22Q,N23K,K24E,L25A,D26Q,K27E,F28E,K29W,A30E,A31R,V32M,K34R,V35T,F36D,P37G,E39K,R41E,"
        "I42K,R45F,I48K,W49M,E53A,R56A,D59E,E60I,R65Y,D66W,N68F,D69L,A71L,A72Q,A73E,L74F,G75K,"
        "D77Y,K78P,E79S,I80V,N81R,F83E,D84E,I85Q,S86E,Q87E,S88A,L89R,W90K,D91R,V92L,Q93K,K94I,"
        "L95M,T96M,D97K,A98I,A99G,I100A,K101E,K102W,I103A,E104R,A105E,A106I,L107A,A108R,D109E,"
        "E111K,A112E,W113R,L114I,T115K,Q116R",
        "G1T,S2K,I3P,S4E,D5E,I6M,R7A,K8R,D9E,E11Y,V12K,D15L,V18L,E19K,A20Q,F21G,K22E,N23E,K24E,"
        "L25M,D26K,K27R,F28M,K29Y,A30E,A31Q,V32M,R33K,V35G,F36V,P37D,T38S,E39K,R41E,I42R,R45E,"
        "I48K,W49M,Q52I,E53A,R56A,D59E,E60L,Y64W,R65M,D66K,N68L,D69R,K70H,A71M,A72K,A73E,G75R,"
        "D77L,K78G,E79T,I80S,N81G,W82P,F83K,D84E,I85E,S86E,Q87K,S88H,L89W,W90R,D91W,V92I,Q93F,"
        "K94E,T96H,D97R,A98W,I100G,K101E,K102E,E104Q,A105R,L107A,A108E,D109I,M110Q,A112R,W113K,"
        "L114A,T115R,Q116W",
        "G1T,S2K,I3P,S4E,D5E,I6W,R7A,K8R,D9W,E11Y,V12K,R13E,M14H,D15L,A17M,V18A,A20K,F21H,K22R,"
        "N23K,K24E,L25M,D26E,K27I,F28E,K29W,A30E,A31E,V32L,R33K,K34R,V35R,F36D,P37G,T38K,R41E,"
        "I42K,R45W,I48R,W49M,Q52M,E53A,R56A,D59E,E60L,A63H,Y64H,R65M,D66Y,N68E,D69M,K70R,A72K,"
        "A73E,L74E,G75K,D77K,K78P,I80A,N81K,W82T,F83E,D84E,I85A,S86R,Q87R,S88A,L89R,W90R,D91E,"
        "V92I,Q93M,L95Y,T96H,D97H,A98E,I100G,K101R,K102L,A105E,L107M,A108R,D109R,M110L,E111M,"
        "A112E,W113R,L114H,T115K,Q116K",
        "G1K,S2K,I3W,S4E,D5E,I6M,R7M,K8R,D9E,V12R,R13Q,M14G,D15K,K16E,A17Y,V18A,E19Q,A20K,F21A,"
        "K22W,N23K,K24E,L25A,D26L,K27L,F28E,K29W,A30K,A31W,V32M,V35R,F36P,P37V,R41M,I42K,R45A,"
        "I48W,W49M,Q52A,E53A,R56A,D59E,E60H,A63I,R65W,D66Q,A67Q,N68K,D69L,K70E,A71H,A72E,A73K,"
        "G75R,D77I,K78P,E79N,I80V,N81P,W82E,F83E,D84E,I85L,S86E,Q87K,S88G,L89K,W90E,D91E,V92L,"
        "Q93K,K94R,L95I,T96E,D97E,A98E,I100A,K101R,K102M,I103A,A105K,A106Y,L107M,A108Q,D109E,"
        "M110L,E111R,A112K,W113K,L114M,T115E,Q116S",
        "G1P,S2R,I3P,S4E,D5E,I6M,R7A,K8R,D9F,E11K,V12E,R13E,D15H,A17H,V18E,A20K,F21A,K22Y,N23R"
        ",K24E,L25F,D26L,K27L,F28E,K29Y,A30E,A31L,V32A,R33I,K34R,V35K,F36N,R41P,I42K,R45Q,I48W"
        ",W49A,Q52A,E53A,R56A,D59E,E60I,A63Q,Y64W,R65M,D66Y,A67H,N68L,D69L,K70E,A71I,A72R,A73K"
        ",L74E,G75N,K76G,D77S,K78S,E79H,I80T,N81R,W82Y,F83E,D84E,I85R,S86E,Q87K,S88Y,L89R,W90K"
        ",D91L,V92A,Q93K,K94R,T96H,D97E,A98E,I100A,K102E,E104W,A105K,A106F,L107M,A108H,D109E,"
        "M110A,E111M,A112R,W113R,L114F,T115E,Q116S"
    ]
    # Positions are each mutation stripped of its first and last letter
    # ("G1T" -> "1").
    mut_pos = [",".join([_[1:-1] for _ in m.split(",")]) for m in mut_type]
    sc_des = {"labels": ["MOTIF", "CONTACT"], "sequence": "B"}

    # Start test
    df = ri.parse_rosetta_file(self.silent1, sc_des)
    df.add_reference_sequence("B", refseq)
    df = df.identify_mutants("B")
    for col in columns:
        assert col in df
    sr = df.iloc[0]
    # A row taken from the frame must report the same reference sequence.
    assert df.get_reference_sequence("B") == sr.get_reference_sequence("B")
    assert df.get_identified_mutants() == ["B", ]
    for i, row in df.iterrows():
        # Check number of mutations
        assert row.get_mutation_count("B") == mut_number[i]
        # Check type of mutations
        assert row.get_mutations("B") == mut_type[i]
        # Check position of mutations
        assert row.get_mutation_positions("B") == mut_pos[i]

    # Make new variants
    dfm2 = df.iloc[0].generate_mutant_variants('B', [(1, "TGAP"), (14, "MAPT")])
    assert dfm2.shape[0] == 16
    # NOTE(review): if get_mutation_count returns a pandas Series, ``in``
    # tests the index rather than the values (the check further below uses
    # ``.values``) - confirm which one is intended here.
    assert 0 in dfm2.get_mutation_count('B')

    # Revert to WT
    dfwt = df.iloc[0:2].generate_wt_reversions('B', [1, 14])
    assert dfwt.shape[0] == 8

    # Start again from a frame holding only the reference sequence.
    dfwt = rc.DesignFrame({
        "description": ["reference"],
        "sequence_B": [refseq]
    })
    dfwt.add_reference_sequence('B', refseq)
    dfwt = dfwt.generate_mutant_variants('B', [(1, "TGP"), (6, "ERG"), (14, "MAT")])
    assert dfwt.shape[0] == 28
    dfwt = dfwt.generate_wt_reversions('B').identify_mutants('B')
    assert dfwt.shape[0] == 36
    assert 0 in dfwt.get_mutation_count('B').values
    assert refseq in dfwt.get_sequence('B').values

    # Make mutants from Matrix
    dfwt = rc.DesignFrame({
        "description": ["reference"],
        "sequence_B": [refseq]
    })
    dfwt.add_reference_sequence('B', refseq)
    matrix = random_frequency_matrix(len(df.get_reference_sequence('B')), 0)
    key_res = [3, 5, 8, 12, 15, 19, 25, 27]
    mutants = dfwt.generate_mutants_from_matrix('B', matrix, 5, key_res)
    assert isinstance(mutants, list)
    assert len(mutants) == 1
    mutants = mutants[0].identify_mutants('B')
    assert mutants.shape[0] == 5
    assert mutants.pssm_score_B.mean() != 0

    # write to resfiles
    df.make_resfile("B", "NATAA", os.path.join(self.tmpdir, "mutanttest.resfile"))
    for i, row in df.iterrows():
        # One resfile per row is expected, named with a zero-padded 4-digit
        # row index inserted before the extension.
        newfile = os.path.join(
            self.tmpdir, "mutanttest" + "_{:>04d}".format(i) + ".resfile")
        assert row["resfile_B"] == newfile
        assert os.path.isfile(newfile)

    # write alignment
    ri.write_mutant_alignments(df, "B", os.path.join(self.tmpdir, "mutanttest.clw"))
    assert os.path.isfile(os.path.join(self.tmpdir, "mutanttest.clw"))

    # plot mutant
    fig = plt.figure(figsize=(30, 10))
    ax = plt.subplot2grid((1, 1), (0, 0), fig=fig)
    rp.plot_alignment(df, "B", ax, matrix="BLOSUM62")
    return fig