Example #1
    def test_color_scheme_custom( self ):
        custom = {
            'A': '#e6194b', 'C': '#3cb44b', 'D': '#ffe119', 'E': '#ffe119',
            'F': '#f58231', 'G': '#911eb4', 'H': '#46f0f0', 'I': '#f032e6',
            'K': '#d2f53c', 'L': '#d2f53c', 'M': '#008080', 'N': '#e6beff',
            'P': '#aa6e28', 'Q': '#fffac8', 'R': '#800000', 'S': '#aaffc3',
            'T': '#808000', 'V': '#ffd8b1', 'W': '#000080', 'Y': '#808080'
        }
        df = rc.DesignFrame(pd.read_csv(os.path.join(self.dirpath, 'logo_plot_sequence.csv'),
                                        header=None).rename(columns={0: 'sequence_A'}))
        fig, axs = rp.logo_plot(df, "A", refseq=False, line_break=50, font_size=10, hight_prop=2,
                                colors=custom)
        return fig
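A similar logo plot can be drawn outside the test harness. A minimal sketch, assuming a hypothetical CSV of sequences and reusing the logo_plot arguments from the test above (the file path and the reduced color dictionary are placeholders; the dictionary should cover every residue type present in the sequences):

# Minimal sketch: logo_plot with a custom per-residue color mapping (hypothetical input file).
import pandas as pd
import rstoolbox.components as rc
import rstoolbox.plot as rp

custom = {'A': '#e6194b', 'C': '#3cb44b', 'D': '#ffe119'}  # placeholder; extend to all residue types
df = rc.DesignFrame(pd.read_csv('logo_plot_sequence.csv', header=None)  # placeholder input file
                      .rename(columns={0: 'sequence_A'}))
fig, axs = rp.logo_plot(df, 'A', refseq=False, line_break=50, colors=custom)
fig.savefig('logo_custom_colors.png')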
Example #2
def read_fastq(filename, seqID='A'):
    """Reads a FASTQ file and stores the ID together with the sequence.

    The default generated :class:`.DesignFrame` will contain two columns:

    ====================  ===================================================
    Column Name            Data Content
    ====================  ===================================================
    **description**        Sequence identifier.
    **sequence_<chain>**   Sequence content.
    ====================  ===================================================

    :param str filename: FASTQ filename.
    :param str seqID: |seqID_param|

    :return: :class:`.DesignFrame`

    .. rubric:: Example

    .. ipython::

        In [1]: from rstoolbox.io import read_fastq
           ...: import pandas as pd
           ...: pd.set_option('display.width', 1000)
           ...: df = read_fastq("../rstoolbox/tests/data/cdk2_rand_001.fasq.gz")
           ...: df.head(8)
    """
    # Empty array to store tuples of ID & sequence information
    fastq = []
    idq = []

    # Create a file handle for parsing
    is_gz = filename.endswith('gz')
    fastq_file = gzip.open(filename) if is_gz else open(filename)
    for line in fastq_file:
        line = line.decode('utf8') if is_gz else line
        if line.startswith('@'):
            idq.append(str(line.split(':')[0].split(';')[0][1:]))
        if '@' in line or '+' in line or any(c.islower() for c in line):
            continue
        if len(line) == 0:
            continue
        fastq.append(str(line.strip()))
    return rc.DesignFrame({
        'description': idq,
        'sequence_{}'.format(seqID): fastq
    })
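A minimal usage sketch for read_fastq, assuming a hypothetical gzipped FASTQ file (the path is a placeholder; the returned DesignFrame behaves like a regular pandas DataFrame):

# Minimal sketch: parse a (possibly gzipped) FASTQ file and inspect the stored sequences.
from rstoolbox.io import read_fastq

df = read_fastq('reads.fastq.gz', seqID='A')          # placeholder input file
print(df.shape[0])                                    # number of reads parsed
print(df[['description', 'sequence_A']].head())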
Example #3
def parse_rosetta_json(filename):
    """Read a json formated rosetta score file.

    Only reads back scores, as those are the only content present in a ``JSON`` file.

    :param str filename: File containing the Rosetta score file.

    :return: :class:`.DesignFrame`.

    .. note::
        To be coherent with the silent files, the decoy id column name ``decoy`` is
        changed to ``description``.

    :raises:
        :IOError: if ``filename`` cannot be found.

    .. rubric:: Example

    .. ipython::

        In [1]: from rstoolbox.io import parse_rosetta_json
           ...: import pandas as pd
           ...: pd.set_option('display.width', 1000)
           ...: pd.set_option('display.max_columns', 500)
           ...: df = parse_rosetta_json("../rstoolbox/tests/data/score.json.gz")
           ...: df.head(2)
    """
    is_gz = filename.endswith(".gz")
    fd = gzip.open(filename) if is_gz else open(filename)
    data = {}
    for line in fd:
        if is_gz:
            dt = json.loads(line.decode('utf8').strip())
        else:
            dt = json.loads(line.strip())
        for k in dt:
            data.setdefault(k, []).append(dt[k])
    df = rc.DesignFrame(data)
    df = df.rename(columns={'decoy': 'description'})
    df.add_source_file(filename)
    return df
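A usage sketch under the same assumptions (the score file path is a placeholder, and the presence of a total_score column depends on the score function used):

# Minimal sketch: load a JSON score file and rank decoys by total_score.
from rstoolbox.io import parse_rosetta_json

df = parse_rosetta_json('scores.json.gz')             # placeholder input file
best = df.sort_values('total_score').head(5)          # assumes total_score is among the scores
print(best[['description', 'total_score']])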
Example #4
def read_fasta(filename, expand=False, multi=False, defchain='A'):
    """Reads one or more **FASTA** files and returns the appropiate object
    containing the requested data: the :class:`.DesignFrame`.

    The default generated :class:`.DesignFrame` will contain two columns:

    ====================  ===================================================
    Column Name            Data Content
    ====================  ===================================================
    **description**        Sequence identifier.
    **sequence_<chain>**   Sequence content.
    ====================  ===================================================

    Naming the sequence column ``sequence_A`` is an arbitrary decision made for
    compatibility with the rest of the functions and methods of
    :class:`.DesignFrame`.

    .. ipython::

        In [1]: from rstoolbox.io import read_fasta
           ...: import pandas as pd
           ...: pd.set_option('display.width', 1000)
           ...: df = read_fasta("../rstoolbox/tests/data/*fa$", multi=True)
           ...: df

    If the **FASTA** file comes from or is formatted as **PDB FASTA** (as in the example above),
    the column names can be better assigned to the actual sequence IDs. To force
    that behaviour, activate the ``expand`` option.

    .. ipython::

        In [1]: from rstoolbox.io import read_fasta
           ...: df = read_fasta("../rstoolbox/tests/data/*fa", expand=True, multi=True)
           ...: df

    .. note::
        Notice that everything in the original ``description`` after the ``|`` symbol
        is lost in that process.

    :param str filename: file name or file pattern to search.
    :param bool expand: Try to better associate sequence ID if format is **PDB FASTA**.
    :param bool multi: When :data:`True`, indicates that data is read from
        multiple files.
    :param str defchain: Default chain identifier to use; defaults to ``'A'``.

    :return: :class:`.DesignFrame`.

    :raises:
        :IOError: if ``filename`` cannot be found.

    .. seealso::
        :func:`~.write_fasta`
    """
    seqcol = "sequence_{}".format(defchain)
    files = _gather_file_list(filename, multi)
    data = {"description": [], seqcol: []}
    for _, f in enumerate(files):
        fd = gzip.open(f) if f.endswith(".gz") else open(f)
        for line in fd:
            line = line.decode('utf8') if f.endswith(".gz") else line
            line = line.strip()
            if line.startswith(">"):
                line = line.strip(">")
                data["description"].append(line)
                data[seqcol].append("")
            elif len(line) > 0:
                data[seqcol][-1] += line

    df = cp.DesignFrame(data)
    if expand and bool(re.search(r"^\S{4}\:\S{1}", df.iloc[0]["description"])):
        df["description"] = df["description"].apply(
            lambda col: col.split("|")[0])
        df[['description', 'seq']] = df['description'].str.split(':',
                                                                 expand=True)
        df = df.pivot('description', 'seq',
                      seqcol).add_prefix("sequence_").rename_axis(
                          None, axis=1).reset_index()
        df = cp.DesignFrame(df)
    df.add_source_files(files)
    return df
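A sketch of the expand behaviour described in the docstring, assuming hypothetical PDB-style FASTA files whose headers follow the >1abc:A|PDBID|CHAIN|SEQUENCE convention (paths are placeholders):

# Minimal sketch: compare plain parsing with expand=True, which pivots chains into sequence_<chain> columns.
from rstoolbox.io import read_fasta

plain = read_fasta('structures/*.fa', multi=True)                 # one row per FASTA entry
wide = read_fasta('structures/*.fa', expand=True, multi=True)     # one row per PDB id
print(plain.columns.tolist())                                     # ['description', 'sequence_A']
print(wide.columns.tolist())                                      # ['description', 'sequence_A', 'sequence_B', ...]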
Example #5
def write_fasta(df, seqID, separator=None, filename=None, split=False):
    """Writes fasta files of the selected decoys.

    It assumes that the provided data is contained in a :class:`.DesignFrame`
    or a :class:`~pandas.DataFrame`.

    Mandatory columns are:

    ====================  ===================================================
    Column Name           Data Content
    ====================  ===================================================
    **description**       Sequence identifier.
    **sequence_<seqID>**  Sequence content.
    ====================  ===================================================

    .. ipython::

        In [1]: from rstoolbox.io import read_fasta, write_fasta
           ...: df = read_fasta("../rstoolbox/tests/data/*fa", multi=True)
           ...: print(write_fasta(df, "A"))

    When working with multiple ``seqID``, one can select which ones to print;
    empty sequences will be skipped.

    .. ipython::

        In [1]: from rstoolbox.io import read_fasta, write_fasta
           ...: df = read_fasta("../rstoolbox/tests/data/*fa", expand=True, multi=True)
           ...: print(write_fasta(df, "AC"))

    :param df: Data content.
    :type df: Union[:class:`.DesignFrame`, :class:`~pandas.DataFrame`]
    :param str seqID: |seqID_param|.
    :param str separator: Append ``seqID`` to the sequence identifier using this
        string separator. If multiple ``seqID`` are provided, it defaults to ``:``.
    :param str filename: Output file name.
    :param bool split: Write each sequence to a different file. The first part of
        ``filename`` is used as prefix, followed by an enumeration.

    :return: :class:`str` - **FASTA** formatted string.

    :raises:
        :IOError: If ``filename`` exists and global option :ref:`system.overwrite <options>`
            is not :data:`True`.
        :AttributeError: |seqID_error|.

    .. note::
        Depends on :ref:`system.overwrite <options>` and :ref:`system.output <options>`.

    .. seealso::
        :func:`~.read_fasta`
    """
    def nomenclator(row, seqID, separator):
        sequence = row.get_sequence(seqID)
        if sequence is None or isinstance(sequence,
                                          float) or len(sequence) == 0:
            return ""
        name = ">" + row.get_id()
        if separator is not None:
            name = name + separator + seqID
        return name + "\n" + row.get_sequence(seqID)

    if filename is not None:
        if os.path.isfile(filename) and not core.get_option(
                "system", "overwrite"):
            raise IOError("File {} already exists".format(filename))
    if not isinstance(df, cp.DesignFrame):
        df = cp.DesignFrame(df)
    # Default to ":" only when multiple seqIDs are requested, per the docstring.
    if len(seqID) > 1 and separator is None:
        separator = ":"

    data = []
    for chain in seqID:
        eachfa = df.apply(lambda row: nomenclator(row, chain, separator),
                          axis=1)
        data.extend(eachfa.values)

    if filename is not None:
        if not split:
            # Open gzip in text mode so the joined string data can be written directly.
            fd = open(filename, "w") if not filename.endswith(".gz") \
                else gzip.open(filename, "wt")
            fd.write("\n".join(data).strip() + "\n")
            fd.close()
        else:
            suffix = "_f{0:04d}"
            cplxname = os.path.splitext(filename)
            for i, sequence in enumerate(data):
                fname = cplxname[0] + suffix.format(i + 1) + cplxname[1]
                fd = open(fname, "w") if not fname.endswith(".gz") \
                    else gzip.open(fname, "wt")
                fd.write(sequence + "\n")
                fd.close()

    return "\n".join(data).strip() + "\n"
Example #6
    def test_color_scheme_charge( self ):
        df = rc.DesignFrame(pd.read_csv(os.path.join(self.dirpath, 'logo_plot_sequence.csv'),
                                        header=None).rename(columns={0: 'sequence_A'}))
        fig, axs = rp.logo_plot(df, "A", refseq=False, line_break=50, font_size=10, hight_prop=2,
                                colors="CHARGE")
        return fig
Example #7
    def test_color_scheme_hydrophobicity( self ):
        df = rc.DesignFrame(pd.read_csv(os.path.join(self.dirpath, 'logo_plot_sequence.csv'),
                                        header=None).rename(columns={0: 'sequence_A'}))
        fig, axs = rp.logo_plot(df, "A", refseq=False, font_size=10, hight_prop=2,
                                colors='HYDROPHOBICITY')
        return fig
Example #8
def parse_rosetta_file(filename, description=None, multi=False):
    """Read a Rosetta score or silent file and returns the design population
    in a :class:`.DesignFrame`.

    By default, it will pick the data contained in **all the score columns**
    with the exception of positional scores (such as *per-residue ddg*). The
    user can specify scores to be ignored.

    When working with *silent files*, extra information can be picked, such as
    *sequence* and *secondary structure* data, *residue labels* or positional
    scores. The fine control of these options is explained in detail in
    :ref:`tutorial: reading Rosetta <readrosetta>`.

    Some basic usage cases::

        # (1) The default scenario, just read scores from a single file.
        df = rstoolbox.io.parse_rosetta_file("silentfile")

        # (2) Reading from multiple files. Assumes all files start with
        # the particular prefix.
        df = rstoolbox.io.parse_rosetta_file("silentfile", multi=True)

        # (3) Getting all scores and the sequence of each design.
        description = {'sequence': 'A'}
        df = rstoolbox.io.parse_rosetta_file("silentfile", description)

        # (4) Get only total_score and RMSD, and rename total_score to score.
        description = {'scores': ['RMSD'], 'scores_rename': {'total_score': 'score'}}
        df = rstoolbox.io.parse_rosetta_file("silentfile", description)

    :param filename: file name, file pattern to search or list of files.
    :type filename: Union[:class:`str`, :func:`list`]
    :param description: Parsing rules. It can be a dictionary describing
        the rules or the name of a file containing such dictionary. The
        dictionary definition is explained in :ref:`tutorial: reading Rosetta <readrosetta>`.
    :type description: Union[:class:`str`, :class:`dict`]
    :param bool multi: When :data:`True`, indicates that data is read from multiple files.

    :return: :class:`.DesignFrame`.

    :raises:
        :IOError: if ``filename`` cannot be found.
        :IOError: if ``filename`` pattern (``multi=True``) generates no files.

    .. rubric:: Example

    .. ipython::

        In [1]: from rstoolbox.io import parse_rosetta_file
           ...: import pandas as pd
           ...: pd.set_option('display.width', 1000)
           ...: df = parse_rosetta_file("../rstoolbox/tests/data/input_2seq.minisilent.gz")
           ...: df.head(2)
    """

    manager = rc.Description(**_file_vs_json(description))
    header = []
    data = OrderedDict()

    for line, is_header, _, symm in open_rosetta_file(filename, multi):
        if is_header:
            header = line.strip().split()[1:]
            continue

        if line.startswith("SCORE"):
            per_res = {}
            chains = {
                "id": [],
                "seq": "",
                "dssp": "",
                "psipred": "",
                "phi": [],
                "psi": []
            }

            # General scores
            for cv, value in enumerate(line.strip().split()[1:]):
                hcv = header[cv]
                if manager.wanted_per_residue_score(hcv):
                    hcvn = re.sub("\d+$", "", hcv)
                    per_res.setdefault(hcvn, {})
                    per_res[hcvn][int(re.findall(r'\d+$',
                                                 hcv)[0])] = _check_type(value)
                    continue
                if manager.wanted_score(hcv):
                    data.setdefault(manager.score_name(hcv),
                                    []).append(_check_type(value))

            # Namings from the description
            manager.check_naming(header)
            for namingID, namingVL in manager.get_naming_pairs(
                    line.strip().split()[-1]):
                data.setdefault(namingID, []).append(_check_type(namingVL))

            # Fix per-residue
            for k in per_res:
                data.setdefault(k, []).append(
                    OrderedDict(sorted(per_res[k].items())).values())

            # Setup labels
            data = manager.setup_labels(data)
            continue

        if line.startswith("RES_NUM"):  # Present for multi-chain poses or those not starting at A1.
            for x in line.split()[1:-1]:
                chain, numbers = x.split(":")
                nums = numbers.split("-")
                if len(nums) == 1 or nums[0] == "":
                    nums = 1
                else:
                    nums = (int(nums[1]) - int(nums[0])) + 1
                chains["id"].extend([
                    chain,
                ] * nums)
            continue

        if line.startswith("SYMMETRY_INFO"):  # When working with symmetry, RES_NUM is not there.
            chain = "".join(string.ascii_uppercase[:int(line.split()[2])])
            for c in chain:
                chains["id"].extend([
                    c,
                ] * int(line.split()[4]))

            data = _add_sequences(manager, data, chains)
            continue

        if line.startswith("ANNOTATED_SEQUENCE"):
            chains["seq"] = list(
                re.sub(r'\[[^]]*\]', '',
                       line.strip().split()[1]))
            if not symm:
                # When info is chain A starting in 1, it is not printed in the silent file
                if len(chains["id"]) == 0:
                    chains["id"].extend([
                        "A",
                    ] * len(chains["seq"]))

                data = _add_sequences(manager, data, chains)
            else:
                chains["seq"] = list("".join(chains["seq"]).rstrip("X"))

            continue

        if line.startswith("REMARK DSSP"):
            chains["dssp"] = list(line.split()[2].strip())
            continue
        if line.startswith("REMARK PSIPRED"):
            chains["psipred"] = list(line.split()[2].strip())
            continue
        if line.startswith("REMARK LABELS"):
            for label in line.split()[2].split(";"):
                labinfo = label.split(":")
                if "lbl_" + labinfo[0].upper() in data:
                    data["lbl_" + labinfo[0].upper()][-1] = labinfo[1]
            continue
        if line.startswith("REMARK PHI"):
            chains["phi"] = [
                float(x) for x in line.split()[2].strip().split(",")
            ]
            continue
        if line.startswith("REMARK PSI"):
            chains["psi"] = [
                float(x) for x in line.split()[2].strip().split(",")
            ]
            continue

    df = rc.DesignFrame(data)
    df.add_source_files(_gather_file_list(filename, multi))
    return df
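A sketch combining the parsing rules from usage cases (3) and (4) in the docstring (the silent file path is a placeholder, and passing both keys in a single description dictionary is assumed to be supported):

# Minimal sketch: read scores plus the chain A sequence, renaming total_score to score.
from rstoolbox.io import parse_rosetta_file

description = {'sequence': 'A', 'scores_rename': {'total_score': 'score'}}
df = parse_rosetta_file('design.silent', description)     # placeholder input file
print(df[['description', 'score', 'sequence_A']].head(2))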
Example #9
def parse_rosetta_pdb(filename,
                      keep_weights=False,
                      per_residue=False,
                      dropna=True):
    """Read the ``POSE_ENERGIES_TABLE`` from a Rosetta output PDB file.

    The ``POSE_ENERGIES_TABLE`` only contains the score terms evaluated by
    the executed score function. It will not include other score terms added through
    filters.

    :param str filename: Name of the PDB file.
    :param bool keep_weights: If :data:`True`, keep the weights row.
    :param bool per_residue: If :data:`True`, keep a row of data for each residue.
        Otherwise, compress the sequence into ``sequence_{}`` columns.
    :param bool dropna: If :data:`True`, non-standard residues are dropped when making
        the sequence. Otherwise, they appear as ``X``. Note that modifications of
        residues known to Rosetta, such as ``LYS:CtermProteinFull`` or ``HIS_D``,
        are considered standard in this context.

    :return: :class:`.DesignFrame`
    """
    def chain_ids(infile):
        with open(infile) as fp:
            for result in re.findall(r'ATOM.{17}(\w)', fp.read(), re.S):
                yield result

    def data_between(infile):
        with open(infile) as fp:
            for result in re.findall(
                    r'(#BEGIN_POSE_ENERGIES_TABLE.*?#END_POSE_ENERGIES_TABLE)',
                    fp.read(), re.S):
                return result

    d = {
        'CYS': 'C',
        'ASP': 'D',
        'SER': 'S',
        'GLN': 'Q',
        'LYS': 'K',
        'ILE': 'I',
        'PRO': 'P',
        'THR': 'T',
        'PHE': 'F',
        'ASN': 'N',
        'GLY': 'G',
        'HIS': 'H',
        'LEU': 'L',
        'ARG': 'R',
        'TRP': 'W',
        'ALA': 'A',
        'VAL': 'V',
        'GLU': 'E',
        'TYR': 'Y',
        'MET': 'M'
    }

    chains = list(pd.Series(chain_ids(filename)).unique())
    idata = data_between(filename)
    name = idata.split('\n')[0].strip().split()[-1].replace('.pdb', '')
    df = pd.read_csv(six.StringIO(idata), comment='#', sep=r'\s+')
    df = df.assign(description=[
        name,
    ] * df.shape[0])[~df['label'].str.startswith('VRT_')]

    chcol = ['', '']
    pick = [
        'pose',
    ]
    if not keep_weights:
        df = df[df['label'] != 'weights']
    else:
        pick.append('weights')
    if len(chains) == 1:
        chcol.extend([
            chains[0],
        ] * (df.shape[0] - len(chcol) + 1))
    else:
        chain_chng = list(
            df[df['label'].str.contains('NtermProteinFull')].index)
        chain_chng.append(int(df.iloc[-1].name) + 1)
        for i in range(0, len(chain_chng) - 1):
            chcol.extend([
                chains[i],
            ] * (int(chain_chng[i + 1]) - int(chain_chng[i])))

    df = df.assign(chain=pd.Series(chcol), )
    if not per_residue:
        sdata = {
            'description': [
                name,
            ]
        }
        for g, gdf in df[df['chain'] != ''].groupby('chain'):
            sdata.setdefault('sequence_{}'.format(g), [
                ''.join(
                    gdf['label'].str.split('[:_]').str[0].map(d).fillna('X'))
            ])
            if dropna:
                sdata['sequence_{}'.format(g)][-1] = sdata[
                    'sequence_{}'.format(g)][-1].replace('X', '')
        df = df[df['label'].isin(pick)].merge(pd.DataFrame(sdata),
                                              on='description')
        df = df.drop(columns=['chain'])
        if not keep_weights:
            df = df.drop(columns=['label'])

    return rc.DesignFrame(df)
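A usage sketch for parse_rosetta_pdb (the PDB path is a placeholder for a file produced by a scored Rosetta run):

# Minimal sketch: recover pose-level scores and per-chain sequences from a scored PDB.
from rstoolbox.io import parse_rosetta_pdb

df = parse_rosetta_pdb('design_0001.pdb')                         # placeholder input file
print(df['description'].iloc[0])                                  # decoy name taken from the table header
seq_cols = [c for c in df.columns if c.startswith('sequence_')]
print(df[seq_cols].iloc[0])                                       # one sequence_<chain> column per chain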
Example #10
    def test_mutants(self):
        # Static data
        refseq = "GSISDIRKDAEVRMDKAVEAFKNKLDKFKAAVRKVFPTEERIDMRPEIWIAQELRRIGDE" \
                 "FNAYRDANDKAAALGKDKEINWFDISQSLWDVQKLTDAAIKKIEAALADMEAWLTQ"
        columns = ["mutants_B", "mutant_count_B", "mutant_positions_B"]
        mut_number = [97, 91, 88, 90, 92, 92]
        mut_type = [
            "G1T,S2R,I3P,S4E,D5E,I6A,K8E,D9R,E11W,V12R,R13L,M14A,D15E,K16I,V18M,E19R,A20K,F21G,"
            "K22W,N23E,K24E,L25H,D26E,K27R,F28E,K29W,A30E,A31W,V32W,R33K,K34R,V35A,F36S,P37K,"
            "T38G,E39R,R41E,I42R,R45L,I48R,W49M,Q52A,E53A,R56A,D59E,E60I,Y64E,R65W,D66Q,A67M,N68R"
            ",D69L,K70E,A71M,A72E,A73K,L74E,G75R,D77N,K78P,E79N,I80A,N81G,W82E,F83E,D84K,I85M,S86K,"
            "Q87E,S88Q,L89K,W90K,D91E,V92A,Q93W,L95I,T96A,D97Y,A98Y,A99W,I100G,K101L,K102M,I103A,"
            "E104A,A105Y,A106W,L107I,A108K,D109Q,M110H,E111R,A112E,W113K,L114E,T115R,Q116K",
            "G1P,S2K,I3P,S4E,D5E,I6A,R7M,K8R,D9E,E11Y,V12K,R13L,M14I,D15K,A17Y,V18M,E19L,A20K,F21A,"
            "K22Q,N23K,K24E,L25A,D26Q,K27E,F28E,K29W,A30E,A31R,V32M,K34R,V35T,F36D,P37G,E39K,R41E,"
            "I42K,R45F,I48K,W49M,E53A,R56A,D59E,E60I,R65Y,D66W,N68F,D69L,A71L,A72Q,A73E,L74F,G75K,"
            "D77Y,K78P,E79S,I80V,N81R,F83E,D84E,I85Q,S86E,Q87E,S88A,L89R,W90K,D91R,V92L,Q93K,K94I,"
            "L95M,T96M,D97K,A98I,A99G,I100A,K101E,K102W,I103A,E104R,A105E,A106I,L107A,A108R,D109E,"
            "E111K,A112E,W113R,L114I,T115K,Q116R",
            "G1T,S2K,I3P,S4E,D5E,I6M,R7A,K8R,D9E,E11Y,V12K,D15L,V18L,E19K,A20Q,F21G,K22E,N23E,K24E,"
            "L25M,D26K,K27R,F28M,K29Y,A30E,A31Q,V32M,R33K,V35G,F36V,P37D,T38S,E39K,R41E,I42R,R45E,"
            "I48K,W49M,Q52I,E53A,R56A,D59E,E60L,Y64W,R65M,D66K,N68L,D69R,K70H,A71M,A72K,A73E,G75R,"
            "D77L,K78G,E79T,I80S,N81G,W82P,F83K,D84E,I85E,S86E,Q87K,S88H,L89W,W90R,D91W,V92I,Q93F,"
            "K94E,T96H,D97R,A98W,I100G,K101E,K102E,E104Q,A105R,L107A,A108E,D109I,M110Q,A112R,W113K,"
            "L114A,T115R,Q116W",
            "G1T,S2K,I3P,S4E,D5E,I6W,R7A,K8R,D9W,E11Y,V12K,R13E,M14H,D15L,A17M,V18A,A20K,F21H,K22R,"
            "N23K,K24E,L25M,D26E,K27I,F28E,K29W,A30E,A31E,V32L,R33K,K34R,V35R,F36D,P37G,T38K,R41E,"
            "I42K,R45W,I48R,W49M,Q52M,E53A,R56A,D59E,E60L,A63H,Y64H,R65M,D66Y,N68E,D69M,K70R,A72K,"
            "A73E,L74E,G75K,D77K,K78P,I80A,N81K,W82T,F83E,D84E,I85A,S86R,Q87R,S88A,L89R,W90R,D91E,"
            "V92I,Q93M,L95Y,T96H,D97H,A98E,I100G,K101R,K102L,A105E,L107M,A108R,D109R,M110L,E111M,"
            "A112E,W113R,L114H,T115K,Q116K",
            "G1K,S2K,I3W,S4E,D5E,I6M,R7M,K8R,D9E,V12R,R13Q,M14G,D15K,K16E,A17Y,V18A,E19Q,A20K,F21A,"
            "K22W,N23K,K24E,L25A,D26L,K27L,F28E,K29W,A30K,A31W,V32M,V35R,F36P,P37V,R41M,I42K,R45A,"
            "I48W,W49M,Q52A,E53A,R56A,D59E,E60H,A63I,R65W,D66Q,A67Q,N68K,D69L,K70E,A71H,A72E,A73K,"
            "G75R,D77I,K78P,E79N,I80V,N81P,W82E,F83E,D84E,I85L,S86E,Q87K,S88G,L89K,W90E,D91E,V92L,"
            "Q93K,K94R,L95I,T96E,D97E,A98E,I100A,K101R,K102M,I103A,A105K,A106Y,L107M,A108Q,D109E,"
            "M110L,E111R,A112K,W113K,L114M,T115E,Q116S",
            "G1P,S2R,I3P,S4E,D5E,I6M,R7A,K8R,D9F,E11K,V12E,R13E,D15H,A17H,V18E,A20K,F21A,K22Y,N23R"
            ",K24E,L25F,D26L,K27L,F28E,K29Y,A30E,A31L,V32A,R33I,K34R,V35K,F36N,R41P,I42K,R45Q,I48W"
            ",W49A,Q52A,E53A,R56A,D59E,E60I,A63Q,Y64W,R65M,D66Y,A67H,N68L,D69L,K70E,A71I,A72R,A73K"
            ",L74E,G75N,K76G,D77S,K78S,E79H,I80T,N81R,W82Y,F83E,D84E,I85R,S86E,Q87K,S88Y,L89R,W90K"
            ",D91L,V92A,Q93K,K94R,T96H,D97E,A98E,I100A,K102E,E104W,A105K,A106F,L107M,A108H,D109E,"
            "M110A,E111M,A112R,W113R,L114F,T115E,Q116S"
        ]
        mut_pos = [",".join([_[1:-1] for _ in m.split(",")]) for m in mut_type]
        sc_des = {"labels": ["MOTIF", "CONTACT"], "sequence": "B"}

        # Start test
        df = ri.parse_rosetta_file(self.silent1, sc_des)
        df.add_reference_sequence("B", refseq)

        df = df.identify_mutants("B")
        for col in columns:
            assert col in df

        sr = df.iloc[0]
        assert df.get_reference_sequence("B") == sr.get_reference_sequence("B")
        assert df.get_identified_mutants() == [
            "B",
        ]

        for i, row in df.iterrows():
            # Check number of mutations
            assert row.get_mutation_count("B") == mut_number[i]
            # Check type of mutations
            assert row.get_mutations("B") == mut_type[i]
            # Check position of mutations
            assert row.get_mutation_positions("B") == mut_pos[i]

        # Make new variants
        dfm2 = df.iloc[0].generate_mutant_variants('B', [(1, "TGAP"),
                                                         (14, "MAPT")])
        assert dfm2.shape[0] == 16
        assert 0 in dfm2.get_mutation_count('B')

        # Revert to WT
        dfwt = df.iloc[0:2].generate_wt_reversions('B', [1, 14])
        assert dfwt.shape[0] == 8

        dfwt = rc.DesignFrame({
            "description": ["reference"],
            "sequence_B": [refseq]
        })
        dfwt.add_reference_sequence('B', refseq)
        dfwt = dfwt.generate_mutant_variants('B', [(1, "TGP"), (6, "ERG"),
                                                   (14, "MAT")])
        assert dfwt.shape[0] == 28
        dfwt = dfwt.generate_wt_reversions('B').identify_mutants('B')
        assert dfwt.shape[0] == 36
        assert 0 in dfwt.get_mutation_count('B').values
        assert refseq in dfwt.get_sequence('B').values

        # Make mutants from Matrix
        dfwt = rc.DesignFrame({
            "description": ["reference"],
            "sequence_B": [refseq]
        })
        dfwt.add_reference_sequence('B', refseq)
        matrix = random_frequency_matrix(len(df.get_reference_sequence('B')),
                                         0)
        key_res = [3, 5, 8, 12, 15, 19, 25, 27]
        mutants = dfwt.generate_mutants_from_matrix('B', matrix, 5, key_res)
        assert isinstance(mutants, list)
        assert len(mutants) == 1
        mutants = mutants[0].identify_mutants('B')
        assert mutants.shape[0] == 5
        assert mutants.pssm_score_B.mean() != 0

        # write to resfiles
        df.make_resfile("B", "NATAA",
                        os.path.join(self.tmpdir, "mutanttest.resfile"))
        for i, row in df.iterrows():
            newfile = os.path.join(
                self.tmpdir, "mutanttest" + "_{:>04d}".format(i) + ".resfile")
            assert row["resfile_B"] == newfile
            assert os.path.isfile(newfile)

        # write alignment
        ri.write_mutant_alignments(df, "B",
                                   os.path.join(self.tmpdir, "mutanttest.clw"))
        assert os.path.isfile(os.path.join(self.tmpdir, "mutanttest.clw"))

        # plot mutant
        fig = plt.figure(figsize=(30, 10))
        ax = plt.subplot2grid((1, 1), (0, 0), fig=fig)
        rp.plot_alignment(df, "B", ax, matrix="BLOSUM62")
        return fig
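Example #10 exercises the full mutant workflow; outside the test it can be sketched with the same DesignFrame methods (the reference sequence below is a placeholder):

# Minimal sketch: enumerate point-mutant combinations and revert them back to wild type.
import rstoolbox.components as rc

refseq = 'MKTAYIAKQR'                                             # placeholder reference sequence
wt = rc.DesignFrame({'description': ['reference'], 'sequence_B': [refseq]})
wt.add_reference_sequence('B', refseq)
variants = wt.generate_mutant_variants('B', [(1, 'AG'), (4, 'YF')])   # mutate positions 1 and 4
variants = variants.generate_wt_reversions('B').identify_mutants('B')
print(variants.shape[0], sorted(variants.get_mutation_count('B').unique()))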