def multiplex(row, seqID, muts):
    """Expand one design into every combination of the requested mutations.

    Each entry of ``muts`` is read as ``(position, residues)``: the sequence
    position selected by ``position`` is replaced by the string ``residues``
    and the cartesian product over all positions of the sequence produces
    the variants. The unmodified sequence is always placed first.

    ``keep_scores``, ``get_selection`` and ``_check_column`` are taken from
    the enclosing scope; none of them is defined in this block.

    :param row: Design to expand; must offer ``get_id``,
        ``get_reference_shift``, ``get_available_sequences`` and
        ``get_sequence``.
    :param str seqID: Identifier of the sequence to mutate.
    :param muts: Iterable of ``(position, residues)`` entries.
    :return: Container created through ``row._constructor_expanddim`` with
        one row per variant.
    """
    seqNM = _check_column(row, "sequence", seqID)
    idNM = "description"
    original = row[seqNM]
    options = list(original)
    for p in muts:
        # -1 because we are going to access string positions.
        shift = get_selection(p[0], seqID, row.get_reference_shift(seqID))[0] - 1
        options[shift] = p[1]

    data = {seqNM: ["".join(x) for x in itertools.product(*options)]}
    data[seqNM].insert(0, original)

    name = row.get_id()
    total = len(data[seqNM])
    # Raw string: the pattern contains a regex escape, not a string escape.
    if re.search(r"_v\d+$", name) is None:
        data[idNM] = [name + "_v{0:04d}".format(x) for x in range(total)]
        # The untouched sequence keeps the identifier of the source design.
        data[idNM][0] = name
    else:
        data[idNM] = [name + "_v{0:04d}".format(x) for x in range(1, total + 1)]

    if keep_scores:
        for col in row.index:
            if col not in [seqNM, idNM]:
                data[col] = [row[col]] * len(data[idNM])
    else:
        # A distinct loop name avoids shadowing the mutated sequence above.
        for other in row.get_available_sequences():
            if other != seqID:
                data[_check_column(row, "sequence", other)] = row.get_sequence(other)
    return row._constructor_expanddim(data)
def _get_key_reference(obj, ctype, seqID, key_residues):
    """Return the stored reference of ``seqID``, optionally trimmed.

    :param obj: Container holding the reference data.
    :param str ctype: Kind of reference content to retrieve.
    :param str seqID: Identifier of the sequence of interest.
    :param key_residues: Residues to keep; :data:`None` keeps everything.
    :return: :class:`str` - full reference when ``key_residues`` is
        :data:`None`, otherwise only the selected positions.
    """
    from rstoolbox.components import get_selection

    reference = _get_reference(obj, ctype, seqID)
    ref_shift = _get_reference(obj, "sft", seqID)
    if key_residues is not None:
        positions = get_selection(key_residues, seqID, ref_shift, len(reference))
        letters = np.array(list(reference))
        # Subtract one to translate the selection into string indices.
        return "".join(letters[positions - 1])
    return reference
def mutations(reference, row, seqID):
    """List the positions at which a design differs from ``reference``.

    The comparison is case-insensitive. ``get_selection`` is taken from the
    enclosing scope and is only called for positions that differ.

    :param str reference: Sequence to compare against.
    :param row: Design offering ``get_sequence`` and ``get_reference_shift``.
    :param str seqID: Identifier of the sequence of interest.
    :return: :class:`tuple` with the comma-joined mutation labels
        (reference residue, position, new residue), the comma-joined
        positions and the number of mutations.
    :raises:
        :ValueError: if both sequences differ in length.
    """
    sequence = row.get_sequence(seqID)
    if len(reference) != len(sequence):
        raise ValueError("Sequence lengths do not match")

    labels = []
    positions = []
    for count, (expected, found) in enumerate(zip(reference, sequence), start=1):
        expected, found = expected.upper(), found.upper()
        if expected == found:
            continue
        number = str(get_selection(count, seqID, row.get_reference_shift(seqID))[0])
        labels.append(expected + number + found)
        positions.append(number)
    return ",".join(labels), ",".join(positions), len(labels)
def format_mutations(row, seqID, key_residues):
    """Turn the mutation string of a design into ``(position, residues)``.

    Every comma-separated token returned by ``row.get_mutations`` is reduced
    to its first run of digits (the position) and to its first and last
    characters. Tokens whose position is not among the selected key
    residues are dropped; blank tokens are ignored.

    ``get_selection`` is taken from the enclosing scope.

    :param row: Design offering ``get_reference_shift``, ``get_sequence``
        and ``get_mutations``.
    :param str seqID: Identifier of the sequence of interest.
    :param key_residues: Residues of interest.
    :return: :func:`list` of :class:`tuple` - position and the two residue
        letters joined in one string.
    """
    shift = row.get_reference_shift(seqID)
    seq = row.get_sequence(seqID)
    kr = get_selection(key_residues, seqID, shift, len(seq))

    muts = []
    for token in row.get_mutations(seqID).split(","):
        token = token.strip()
        if not token:
            # No mutations at all, or a stray separator: nothing to parse.
            continue
        # Raw string: the pattern contains a regex escape, not a string escape.
        pos = int(re.search(r"(\d+)", token).group(1))
        if pos in kr:
            muts.append((pos, token[0] + token[-1]))
    return muts
def _get_key_sequence(obj, ctype, seqID, key_residues):
    """Extract the selected residues from the content of ``seqID``.

    Works on a single design (:class:`~pandas.Series`) or on a collection
    of designs; in the second case the extraction is applied to each entry
    and the length of the first entry is used to resolve the selection.

    :param obj: Container holding the sequence data.
    :param str ctype: Kind of content to retrieve.
    :param str seqID: Identifier of the sequence of interest.
    :param key_residues: Residues of interest.
    :return: :class:`str` for a single design, otherwise the per-design
        result of ``apply``.
    """
    from rstoolbox.components import get_selection
    from .reference import get_reference_shift

    content = obj[_check_column(obj, ctype, seqID)]
    sft = get_reference_shift(obj, seqID)
    single = isinstance(obj, pd.Series)
    length = len(content) if single else len(content.iloc[0])
    kr = get_selection(key_residues, seqID, sft, length)

    def _pick(text):
        # -1 because we access string positions
        return "".join(np.array(list(text))[kr - 1])

    # NOTE(review): a selection of exactly one residue yields an empty
    # result because of the strict ``> 1``; confirm this is intended.
    if len(kr) > 1:
        return _pick(content) if single else content.apply(_pick)
    return "" if single else content.apply(lambda text: "")
def generate_mutants_from_matrix(self, seqID, matrix, count, key_residues=None,
                                 limit_refseq=False):
    """From a provided positional frequency matrix, generates ``count`` random
    variants.

    It takes into account the individual frequency assigned to each residue
    type and position. It does **not** generate the highest possible scored
    sequence according to the matrix, but picks randomly at each position
    according to the frequencies for that position.

    For each :class:`.DesignSeries`, it will generate a :class:`.DesignFrame`
    in which the original sequence becomes the ``reference_sequence``,
    inheriting the ``reference_shift``. The original sequence itself is never
    part of the generated variants.

    .. warning::
        This is a **computationally expensive** function. Take this in
        consideration when trying to run it.

    Each :class:`.DesignFrame` will have the following structure:

    ======================  ============================================
    Column                  Data Content
    ======================  ============================================
    **description**         Identifier of the mutant
    **sequence_<seqID>**    Sequence content
    **pssm_score_<seqID>**  Score obtained by applying ``matrix``
    ======================  ============================================

    :param str seqID: |seqID_param|
    :param matrix: Positional frequency matrix. **column:** residue type;
        **index:** sequence position.
    :type matrix: :class:`~pandas.DataFrame`
    :param int count: Expected number of **unique** generated combinations.
        If the number is bigger than the possible options, it will default
        to the total amount of options.
    :param key_residues: |keyres_param|.
    :type key_residues: |keyres_types|
    :param bool limit_refseq: When :data:`True`, pick only residue types with
        probabilities equal or higher to the source sequence.

    :return: :func:`list` of :class:`.DesignFrame` - New set of design
        sequences.

    :raises:
        :ValueError: if matrix rows do not match sequence length.

    .. seealso::
        :meth:`.DesignFrame.generate_mutant_variants`
        :meth:`.DesignFrame.score_by_pssm`
        :meth:`.DesignSeries.generate_mutant_variants`
        :meth:`.DesignSeries.score_by_pssm`

    .. rubric:: Example

    .. ipython::

        In [1]: from rstoolbox.io import parse_rosetta_file
           ...: from rstoolbox.tests.helper import random_frequency_matrix
           ...: import pandas as pd
           ...: pd.set_option('display.width', 1000)
           ...: df = parse_rosetta_file("../rstoolbox/tests/data/input_2seq.minisilent.gz",
           ...:                         {'scores': ['score', 'description'], 'sequence': 'B'})
           ...: df.add_reference_sequence('B', df.get_sequence('B').values[0])
           ...: matrix = random_frequency_matrix(len(df.get_reference_sequence('B')), 0)
           ...: key_res = [3,5,8,12,15,19,25,27]
           ...: mutants = df.iloc[1].generate_mutants_from_matrix('B', matrix, 5, key_res)
           ...: mutants[0].identify_mutants('B')
    """
    from rstoolbox.components import get_selection
    from rstoolbox.components import DesignFrame

    def max_options(matrix, seq, key_residues, limit_refseq):
        # Number of distinct sequences the sampling below can emit, counted
        # with Python integers so that it cannot overflow.
        total = 1
        reference_reachable = True
        for position in key_residues:
            freqs = matrix.loc[position]
            residue = seq[position - 1]
            ref_freq = freqs[residue] if residue in freqs.index else 0
            allowed = freqs > 0
            if limit_refseq:
                allowed = allowed & (freqs >= ref_freq)
            total *= int(allowed.sum())
            if not ref_freq > 0:
                reference_reachable = False
        # The unmodified sequence is rejected by the sampling loop, so it
        # must not count as an available option.
        return max(total - 1, 0) if reference_reachable else total

    data = []
    if isinstance(self, pd.DataFrame):
        for _, row in self.iterrows():
            data.extend(row.generate_mutants_from_matrix(seqID, matrix, count,
                                                         key_residues, limit_refseq))
        return data

    original = self.get_sequence(seqID)
    if matrix.shape[0] != len(original):
        raise ValueError("Matrix rows and sequence length should match.")

    # Make sure index and sequence shift match
    matrix = matrix.copy()
    shift = self.get_reference_shift(seqID)
    matrix.index = get_selection(None, seqID, shift, length=matrix.shape[0])

    if key_residues is not None:
        key_residues = get_selection(key_residues, seqID, shift, matrix.shape[0])
    else:
        key_residues = list(matrix.index.values)

    seqnm = "sequence_{}".format(seqID)
    name = self.get_id()
    target = min(count, max_options(matrix, original, key_residues, limit_refseq))

    names = []
    variants = []
    seen = set()
    while len(variants) < target:
        seqaa = list(original)
        for aap in key_residues:
            matI = matrix.loc[aap].copy()
            if limit_refseq:
                matI[matI < matI[seqaa[aap - 1]]] = 0
                matI = matI / matI.sum()
            seqaa[aap - 1] = np.random.choice(matI.index.values, 1, p=list(matI))[0]
        candidate = "".join(seqaa)
        # Only new sequences different from the source design are kept.
        if candidate == original or candidate in seen:
            continue
        seen.add(candidate)
        names.append(name + "_v{0:04d}".format(len(variants) + 1))
        variants.append(candidate)

    frame = DesignFrame({"description": names, seqnm: variants},
                        columns=["description", seqnm])
    frame.add_reference(seqID, original, shift=shift)
    data.append(frame.score_by_pssm(seqID, matrix))
    return data
def logo_plot(df, seqID, refseq=True, key_residues=None, line_break=None,
              font_size=35, colors="WEBLOGO"):
    """Generates classic **LOGO** plots.

    :param df: Data container.
    :type df: Union[:class:`.DesignFrame`, :class:`.SequenceFrame`]
    :param str seqID: |seqID_param|.
    :param bool refseq: if :data:`True` (default), mark the original residues
        according to the reference sequence.
    :param key_residues: |keyres_param|.
    :type key_residues: |keyres_types|
    :param int line_break: Force a line-change in the plot after n residues
        are plotted.
    :param float font_size: Expected size of the axis font.
    :param colors: Colors to assign; it can be the name of a available color
        set or a dictionary with a color for each type.
    :type colors: Union[:class:`str`, :class:`dict`]

    :return: :class:`~matplotlib.figure.Figure` and
        :func:`list` of :class:`~matplotlib.axes.Axes`

    :raises:
        :ValueError: if the input is not a :class:`~pandas.DataFrame`.

    .. rubric:: Example

    .. ipython::

        In [1]: from rstoolbox.io import parse_rosetta_file
           ...: from rstoolbox.plot import logo_plot
           ...: import matplotlib.pyplot as plt
           ...: df = parse_rosetta_file("../rstoolbox/tests/data/input_2seq.minisilent.gz",
           ...:                         {"sequence": "B"})
           ...: df.add_reference_sequence("B", df.get_sequence("B")[0])
           ...: fig, axes = logo_plot(df, "B", refseq=True, line_break=50)
           ...: plt.tight_layout()

        @savefig sequence_logo_plot_docs.png width=5in
        In [2]: plt.show()
    """
    def _letterAt(letter, x, y, yscale=1, ax=None, globscale=1.35,
                  LETTERS=None, COLOR_SCHEME=None):
        # Draw one letter as a patch scaled to ``yscale`` and moved to (x, y).
        text = LETTERS[letter]
        t = mpl.transforms.Affine2D().scale(1 * globscale, yscale * globscale) + \
            mpl.transforms.Affine2D().translate(x, y) + ax.transData
        p = PathPatch(text, lw=0, fc=COLOR_SCHEME[letter], transform=t)
        if ax is not None:
            ax.add_artist(p)
        return p

    def _dataframe2logo(data):
        # Per position: (residue, value) pairs with a positive value, sorted
        # by value and then by residue.
        aa = list(data)
        odata = []
        for _, pos in data.iterrows():
            pdata = [(k, float(pos[k])) for k in aa if pos[k] > 0.0000000]
            odata.append(sorted(pdata, key=operator.itemgetter(1, 0)))
        return odata

    def _chunks(l, n):
        # Consecutive slices of ``l`` holding at most ``n`` items each.
        for i in range(0, len(l), n):
            yield l[i:i + n]

    order = ["A", "V", "I", "L", "M", "F", "Y", "W", "S", "T",
             "N", "Q", "R", "H", "K", "D", "E", "C", "G", "P"]
    data = copy.deepcopy(df)

    mpl.rcParams['svg.fonttype'] = 'none'
    # Graphical Properties of resizable letters
    path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                        '../components/square.ttf')
    fp = FontProperties(fname=path, weight="bold")
    globscale = 1.22
    letters_shift = -0.5
    # Resolved once; it is needed for every single letter drawn below.
    palette = color_scheme(colors)
    LETTERS = {}
    for aa in palette:
        LETTERS[aa] = TextPath((letters_shift, 0), aa, size=1, prop=fp)

    # Data type management.
    if not isinstance(data, pd.DataFrame):
        raise ValueError("Input data must be in a DataFrame, DesignFrame or SequenceFrame")
    if not isinstance(data, (DesignFrame, SequenceFrame)):
        if len(set(data.columns.values).intersection(set(order))) == len(order):
            data = SequenceFrame(data)
        else:
            data = DesignFrame(data)
    if isinstance(data, DesignFrame):
        data = data.sequence_frequencies(seqID)

    # key_residues management.
    length = len(data.get_reference_sequence(seqID)) if refseq else None
    key_residues = get_selection(key_residues, seqID, list(data.index.values), length)

    # Plot
    if line_break is None:
        figsize = (len(data) * 2, 2.3 * 2)
        grid = (1, 1)
        fig = plt.figure(figsize=figsize)
        axs = [plt.subplot2grid(grid, (0, 0)), ]
        krs = [key_residues, ]
    else:
        # One row per chunk of selected residues, so that every axis has a
        # matching entry in ``krs`` even when only a subset is selected.
        rows = int(math.ceil(float(len(key_residues)) / line_break))
        figsize = (float(len(data) * 2) / rows, 2.3 * 2 * rows)
        grid = (rows, 1)
        fig = plt.figure(figsize=figsize)
        axs = [plt.subplot2grid(grid, (row, 0)) for row in range(rows)]
        krs = list(_chunks(key_residues, line_break))

    font = FontProperties()
    font.set_size(font_size)
    font.set_weight('bold')

    maxv = int(math.ceil(data.max_hight()))
    for idx, ax in enumerate(axs):
        # Refseq and key_residues management.
        ref_seq = data.get_reference_sequence(seqID, krs[idx]) if refseq else ""

        # data and key_residues management.
        _data = data.get_key_residues(krs[idx])

        ticks = len(_data)
        if line_break is not None and len(_data) < line_break:
            ticks = line_break
        ax.set_xticks(np.arange(0.5, ticks + 1))
        ax.set_yticks(range(0, maxv + 1))
        ax.set_xticklabels(_data.index.values)
        ax.set_yticklabels(np.arange(0, maxv + 1, 1))
        # NOTE(review): ``ref_seq`` is never None here (it is "" without
        # ``refseq``), so the secondary axis is always created; confirm
        # whether ``refseq=False`` was meant to skip it.
        if ref_seq is not None:
            ax2 = ax.twiny()
            ax2.set_xticks(ax.get_xticks())
            ax2.set_xticklabels(list(ref_seq))
        sns.despine(ax=ax, trim=True)
        ax.grid(False)
        if ref_seq is not None:
            sns.despine(ax=ax2, top=False, right=True, left=True, trim=True)
            ax2.grid(False)
        # Detach the lines one by one; the list of lines of an axis cannot
        # be reassigned on current Matplotlib releases.
        for line in list(ax.lines):
            line.remove()

        wdata = _dataframe2logo(_data)
        x = 0.5
        for scores in wdata:
            y = 0
            for base, score in scores:
                _letterAt(base, x, y, score, ax, globscale, LETTERS, palette)
                y += score
            x += 1

        for label in (ax.get_xticklabels() + ax.get_yticklabels()):
            label.set_fontproperties(font)
        if ref_seq is not None:
            for label in (ax2.get_xticklabels() + ax2.get_yticklabels()):
                label.set_fontproperties(font)

    return fig, axs
def per_residue_matrix_score_plot(df, seqID, ax, matrix="BLOSUM62",
                                  selections=None, **kwargs):
    """Draw the per-residue substitution matrix score of one decoy.

    The decoy is compared against its ``reference_sequence``; the scores are
    computed on the fly when the container does not carry them yet. Extra
    keyword arguments (``color``, ``linestyle``...) are handed over to the
    call that plots the score line.

    :param df: |df_param|
    :type df: :class:`.DesignSeries`
    :param str seqID: |seqID_param|
    :param ax: matplotlib axis to which we will plot.
    :type ax: :py:class:`~matplotlib.axes.Axes`
    :param str matrix: |matrix_param|
    :param selections: List of regions to highlight; each position should be
        a selector and a color.
    :type selections: :func:`list` of :class:`tuple` with |keyres_types| and
        a color (:class:`str` or :class:`int`)

    :raises:
        :ValueError: If the data container is not :class:`.DesignSeries` or
            it does not have a ``reference_sequence``.

    .. rubric:: Example

    .. ipython::

        In [1]: from rstoolbox.io import parse_rosetta_file
           ...: from rstoolbox.plot import per_residue_matrix_score_plot
           ...: import matplotlib.pyplot as plt
           ...: df = parse_rosetta_file("../rstoolbox/tests/data/input_2seq.minisilent.gz",
           ...:                         {"sequence": "B"})
           ...: df.add_reference_sequence('B', df.iloc[0]['sequence_B'])
           ...: df.add_reference_shift('B', 10)
           ...: seles = [('15-25', 'red'), ('45B-60B', 'green')]
           ...: fig = plt.figure(figsize=(25, 10))
           ...: ax0 = plt.subplot2grid((2, 1), (0, 0))
           ...: per_residue_matrix_score_plot(df.iloc[1], "B", ax0)
           ...: ax1 = plt.subplot2grid((2, 1), (1, 0))
           ...: per_residue_matrix_score_plot(df.iloc[1], "B", ax1, selections=seles)

        @savefig per_residue_matrix_score_plot_docs.png width=5in
        In [2]: plt.show()
    """
    if not (isinstance(df, DesignSeries) and df.has_reference_sequence(seqID)):
        raise ValueError(
            "Data must be a DesignSeries with reference for the requested seqID"
        )

    shift = df.get_reference_shift(seqID)
    refsq = df.get_reference_sequence(seqID)
    prefix = matrix.lower()
    column = '{0}_{1}_per_res'.format(prefix, seqID)
    if column not in df.index:
        df = sequence_similarity(df.to_frame().T, seqID, matrix=matrix).iloc[0]

    size = len(refsq)
    # Dashed baseline at zero underneath the actual score line.
    ax.plot(range(0, size), [0, ] * size, color='grey', linestyle='dashed')
    ax.plot(range(0, size), df[column], **kwargs)

    ax.set_xlim(0, size - 1)
    ax.set_xticks(range(0, size, 5))
    if isinstance(shift, int):
        labels = [tick + shift for tick in range(0, size, 5)]
    else:
        labels = shift[0::5]
    ax.set_xticklabels(labels)

    # Secondary axis showing the alignment string on top of the plot.
    alignment = df['{0}_{1}_ali'.format(prefix, seqID)].replace('.', ' ')
    axb = ax.twiny()
    axb.set_xticks(range(0, size))
    axb.set_xticklabels(list(alignment))
    axb.tick_params('x', top=False, pad=0)

    axlim = ax.get_ylim()
    low, high = axlim[0] - 1, axlim[1] + 1
    regions = [] if selections is None else selections
    for s in regions:
        try:
            offset = int(Selection(s[0]).is_shifted())
        except AttributeError:
            offset = 0
        s_ = get_selection(s[0], seqID, shift, len(refsq))
        start, end = s_[0] - offset, s_[-1] - offset
        ax.fill([start, end, end, start], [low, low, high, high],
                color=s[1], alpha=0.2, zorder=-100)
    # Put back the limits captured before the highlighted regions were drawn.
    ax.set_ylim(axlim[0], axlim[1])
    ax.set_ylabel(matrix.upper())
def positional_structural_count(df, seqID=None, key_residues=None):
    """Fraction of each secondary structure type at every sequence position,
    computed over all decoys.

    Only the minimized secondary structure dictionary is counted: ``H``,
    ``E`` and ``L``.

    :param df: |df_param|.
    :type df: Union[:py:class:`.DesignFrame`, :py:class:`.FragmentFrame`]
    :param str seqID: |seqID_param|. Required when input is
        :class:`.DesignFrame`.
    :param key_residues: |keyres_param|.
    :type key_residues: |keyres_types|

    :return: :class:`~pandas.DataFrame` - where rows are sequence positions
        and columns are the secondary structure identifiers ``H``, ``E``,
        ``L``.

    :raises:
        :AttributeError: if the data passed is not in
            Union[:class:`.DesignFrame`, :class:`.FragmentFrame`]. It will
            *not* try to cast a provided :class:`~pandas.DataFrame`, as it
            would not be possible to know into which of the two possible
            inputs it needs to be casted.
        :AttributeError: if input is :class:`.DesignFrame` and ``seqID`` is
            not provided.
        :KeyError: |sseID_error| when input is :class:`.DesignFrame`.

    .. rubric:: Example

    .. ipython::

        In [1]: from rstoolbox.io import parse_rosetta_file
           ...: from rstoolbox.analysis import positional_structural_count
           ...: import pandas as pd
           ...: pd.set_option('display.width', 1000)
           ...: df = parse_rosetta_file("../rstoolbox/tests/data/input_ssebig.minisilent.gz",
           ...:                         {'scores': ['score'], 'structure': 'C'})
           ...: df = positional_structural_count(df.iloc[1:], 'C')
           ...: df.head()
    """
    from rstoolbox.components import DesignFrame, FragmentFrame
    from rstoolbox.components import get_selection

    data = {"H": [], "E": [], "L": []}

    def _register(qseq):
        # Append the share of every tracked type found in one position.
        found = collections.Counter(qseq)
        size = float(len(qseq))
        for sse_type in ("H", "E", "L"):
            data[sse_type].append(float(found[sse_type]) / size)

    if isinstance(df, DesignFrame):
        if seqID is None:
            raise AttributeError("seqID needs to be provided")
        if "structure_{}".format(seqID) not in df:
            raise KeyError("Structure {} not found in decoys.".format(seqID))
        # One column per position, one row per decoy.
        per_position = df.get_sequential_data('structure', seqID)
        per_position = per_position.apply(lambda x: pd.Series(list(x)))
        for column in per_position.columns.values:
            _register("".join(per_position[column].tolist()))
    elif isinstance(df, FragmentFrame):
        for position in df["position"].drop_duplicates().values:
            _register("".join(df[df["position"] == position]["sse"].values).upper())
    else:
        raise AttributeError(
            "Input data has to be a DesignFrame or a FragmentFrame.")

    dfo = pd.DataFrame(data)
    # Get shift only from DesignFrame; FragmentFrame does not have one
    shft = df.get_reference_shift(seqID) if isinstance(df, DesignFrame) else 1
    # Shift the index so that index == PDB count
    if isinstance(shft, int):
        dfo.index = dfo.index + shft
    else:
        dfo.index = shft
    # NOTE(review): positional_sequence_similarity picks rows with
    # ``.iloc`` and a -1 offset for the same selection call, while labels
    # are used here; confirm which one is intended.
    wanted = list(get_selection(key_residues, seqID, list(dfo.index)))
    return dfo.loc[wanted]
def positional_structural_identity(df, seqID=None, ref_sse=None, key_residues=None):
    """Per position evaluation of how many times the provided data matches
    the expected ``reference_structure``.

    :param df: |df_param|.
    :type df: Union[:class:`.DesignFrame`, :class:`.FragmentFrame`]
    :param str seqID: |seqID_param|. Required when input is
        :class:`.DesignFrame`
    :param str ref_sse: Reference structure. Required when input is
        :class:`.FragmentFrame`. Will overwrite the reference structure of
        :class:`.DesignFrame` if provided.
    :param key_residues: |keyres_param|.
    :type key_residues: |keyres_types|

    :return: :class:`~pandas.DataFrame` - where rows are sequence positions
        and columns are ``sse`` (expected secondary structure), ``max_sse``
        (most represented secondary structure) and ``identity_perc``
        (percentage of matched secondary structure).

    :raises:
        :AttributeError: if the data passed is not in
            Union[:class:`.DesignFrame`, :class:`.FragmentFrame`]. It will
            *not* try to cast a provided :class:`~pandas.DataFrame`, as it
            would not be possible to know into which of the two possible
            inputs it needs to be casted.
        :AttributeError: if input is :class:`.DesignFrame` and ``seqID`` is
            not provided.
        :AttributeError: if input is :class:`.DesignFrame` and it has no
            reference structure for ``seqID``.
        :KeyError: |sseID_error| when input is :class:`.DesignFrame`.
        :AttributeError: if input is :class:`.FragmentFrame` and ``ref_sse``
            is not provided.

    .. rubric:: Example

    .. ipython::

        In [1]: from rstoolbox.io import parse_rosetta_file
           ...: from rstoolbox.analysis import positional_structural_identity
           ...: import pandas as pd
           ...: pd.set_option('display.width', 1000)
           ...: df = parse_rosetta_file("../rstoolbox/tests/data/input_ssebig.minisilent.gz",
           ...:                         {'scores': ['score'], 'structure': 'C'})
           ...: df.add_reference_structure('C', df.get_structure('C').values[0])
           ...: df = positional_structural_identity(df.iloc[1:], 'C')
           ...: df.head()
    """
    from rstoolbox.components import DesignFrame, FragmentFrame
    from rstoolbox.components import get_selection

    data = {"sse": [], "max_sse": [], "identity_perc": []}

    def _register(qseq, expected):
        # Store the expected type, the dominant type and the fraction of
        # entries that match the expected one.
        found = collections.Counter(qseq)
        data["sse"].append(expected)
        data["max_sse"].append(found.most_common(1)[0][0])
        data["identity_perc"].append(float(found[expected]) / float(len(qseq)))

    if isinstance(df, DesignFrame):
        if seqID is None:
            raise AttributeError("seqID needs to be provided")
        if not df.has_reference_structure(seqID):
            raise AttributeError(
                "There is no reference structure for seqID {}".format(seqID))
        if "structure_{}".format(seqID) not in df:
            raise KeyError("Structure {} not found in decoys.".format(seqID))
        ref_sse = ref_sse if ref_sse is not None else df.get_reference_structure(seqID)
        seqdata = df.get_structure(seqID)
        seqdata = seqdata.apply(lambda x: pd.Series(list(x)))
        for i in seqdata.columns.values:
            # Columns come from splitting each string into characters, so
            # they already are 0-based string positions: the same index
            # serves the reported type and the identity count.
            _register("".join(seqdata[i].tolist()), ref_sse[i])
    elif isinstance(df, FragmentFrame):
        if ref_sse is None:
            raise AttributeError("ref_sse needs to be provided")
        for i in df["position"].drop_duplicates().values:
            qseq = "".join(df[df["position"] == i]["sse"].values).upper()
            # The position is turned into a string index with -1.
            _register(qseq, ref_sse[i - 1])
    else:
        raise AttributeError(
            "Input data has to be a DesignFrame with a reference sequence "
            "or a FragmentFrame.")

    dfo = pd.DataFrame(data)
    # Get shift only from DesignFrame; FragmentFrame does not have one
    shft = df.get_reference_shift(seqID) if isinstance(df, DesignFrame) else 1
    # Shift the index so that index == PDB count
    if isinstance(shft, int):
        dfo.index = dfo.index + shft
    else:
        dfo.index = shft
    return dfo.loc[list(get_selection(key_residues, seqID, list(dfo.index)))]
def logo_plot(df, seqID, refseq=True, key_residues=None, line_break=None,
              hight_prop=4, font_size=35, refplot=False, colors="WEBLOGO"):
    """Generates full figure classic **LOGO** plots.

    :param df: Data container.
    :type df: Union[:class:`.DesignFrame`, :class:`.SequenceFrame`]
    :param str seqID: |seqID_param|.
    :param bool refseq: if :data:`True` (default), mark the original residues
        according to the reference sequence.
    :param key_residues: |keyres_param|.
    :type key_residues: |keyres_types|
    :param int line_break: Force a line-change in the plot after n residues
        are plotted.
    :param int hight_prop: Hight proportion for each row of the plot.
    :param float font_size: Expected size of the axis font.
    :param bool refplot: When :data:`True`, it will reorder the residues in
        each position so that the reference residue will be on the bottom
        and setting a two-color scheme (provide a single color name in
        ``colors``) that allows to quickly identify the reference type in
        each position.
    :param colors: Colors to assign; it can be the name of a available color
        set or a dictionary with a color for each type. Available color
        schemes are: Weblogo (default), Hydrophobicity, Chemistry, and
        Charge.
    :type colors: Union[:class:`str`, :class:`dict`]

    :return: :class:`~matplotlib.figure.Figure` and
        :func:`list` of [:class:`~matplotlib.axes.Axes`,
        :class:`~matplotlib.axes.Axes`] - with primary and secondary axis of
        each subplot.

    :raises:
        :ValueError: if the input is not a :class:`~pandas.DataFrame` or if
            it is empty.

    .. seealso::
        :func:`.logo_plot_in_axis`

    .. rubric:: Example

    .. ipython::
        :okwarning:

        In [1]: from rstoolbox.io import parse_rosetta_file
           ...: from rstoolbox.plot import logo_plot
           ...: import matplotlib.pyplot as plt
           ...: df = parse_rosetta_file("../rstoolbox/tests/data/input_2seq.minisilent.gz",
           ...:                         {"sequence": "B"})
           ...: df.add_reference_sequence("B", df.get_sequence("B")[0])
           ...: fig, axes = logo_plot(df, "B", refseq=True, line_break=50)
           ...: plt.tight_layout()

        @savefig sequence_logo_plot_docs.png width=5in
        In [2]: plt.show()

        In [3]: plt.close()
    """
    order = ["A", "V", "I", "L", "M", "F", "Y", "W", "S", "T",
             "N", "Q", "R", "H", "K", "D", "E", "C", "G", "P"]

    # Data type management. The type is verified before any attribute of the
    # input is read, so that unsupported inputs get the intended error.
    if not isinstance(df, pd.DataFrame):
        raise ValueError("Input data must be in a DataFrame, DesignFrame or SequenceFrame")
    if df.empty:
        raise ValueError("Provided data container is empty. Nothing to plot.")

    data = copy.deepcopy(df)
    if not isinstance(data, (DesignFrame, SequenceFrame)):
        if len(set(data.columns.values).intersection(set(order))) == len(order):
            data = SequenceFrame(data)
        else:
            data = DesignFrame(data)
    if isinstance(data, DesignFrame):
        data = data.sequence_frequencies(seqID)

    # key_residues management.
    length = len(data.get_reference_sequence(seqID)) if refseq else None
    key_residues = get_selection(key_residues, seqID, list(data.index.values), length)

    # Plot
    if line_break is None:
        figsize = (len(data) * 2, 2.3 * hight_prop)
        grid = (1, 1)
        fig = plt.figure(figsize=figsize)
        axs = [[plt.subplot2grid(grid, (0, 0)), None], ]
        krs = [key_residues, ]
    else:
        rows = int(math.ceil(float(len(key_residues)) / line_break))
        figsize = (float(len(data) * 2) / rows, 2.3 * hight_prop * rows)
        grid = (rows, 1)
        fig = plt.figure(figsize=figsize)
        axs = [[plt.subplot2grid(grid, (row, 0)), None] for row in range(rows)]
        krs = list(_chunks(key_residues, line_break))

    # NOTE(review): ``font_size`` is not handed over to logo_plot_in_axis;
    # the font object that used to be built here was never used. Confirm
    # whether the helper accepts it before wiring it through.
    for idx, pair in enumerate(axs):
        # The helper hands back the secondary axis of the subplot.
        pair[1] = logo_plot_in_axis(data, seqID, pair[0], refseq=refseq,
                                    key_residues=krs[idx], refplot=refplot,
                                    colors=colors, line_break=line_break)
    return fig, axs
def positional_sequence_similarity(df, seqID=None, ref_seq=None,
                                   key_residues=None, matrix="BLOSUM62"):
    """Identity and similarity against a ``reference_sequence``, evaluated
    per position.

    For a data container holding a set of sequences, computes the share of
    identities and of similarities that the whole set shows against the
    ``reference_sequence``. The evaluation runs over sequence positions and
    not over individual sequences; in a way, this generates an extreme
    simplification from a :class:`.SequenceFrame`.

    :param df: |df_param|.
    :type df: Union[:class:`.DesignFrame`, :class:`.FragmentFrame`]
    :param str seqID: |seqID_param|. Required when input is
        :class:`.DesignFrame`.
    :param str ref_seq: Reference sequence. Required when input is
        :class:`.FragmentFrame`. Will overwrite the reference sequence of
        :class:`.DesignFrame` if provided.
    :param key_residues: |keyres_param|.
    :type key_residues: |keyres_types|
    :param str matrix: |matrix_param|. Default is ``BLOSUM62``.

    :return: :class:`~pandas.DataFrame` - where rows are sequence positions
        and columns are ``identity_perc`` and ``positive_perc``.

    :raises:
        :AttributeError: if the data passed is not in
            Union[:class:`.DesignFrame`, :class:`.FragmentFrame`]. It will
            *not* try to cast a provided :class:`~pandas.DataFrame`, as it
            would not be possible to know into which of the two possible
            inputs it needs to be casted.
        :AttributeError: if input is :class:`.DesignFrame` and ``seqID`` is
            not provided.
        :KeyError: |seqID_error| when input is :class:`.DesignFrame`.
        :AttributeError: |reference_error| when input is
            :class:`.DesignFrame`.
        :AttributeError: if input is :class:`.FragmentFrame` and ``ref_seq``
            is not provided.

    .. rubric:: Example

    .. ipython::

        In [1]: from rstoolbox.io import parse_rosetta_file
           ...: from rstoolbox.analysis import positional_sequence_similarity
           ...: import pandas as pd
           ...: pd.set_option('display.width', 1000)
           ...: pd.set_option('display.max_columns', 500)
           ...: df = parse_rosetta_file("../rstoolbox/tests/data/input_2seq.minisilent.gz",
           ...:                         {'scores': ['score'], 'sequence': 'B'})
           ...: df.add_reference_sequence('B', df.get_sequence('B').values[0])
           ...: df = positional_sequence_similarity(df.iloc[1:], 'B')
           ...: df.head()
    """
    from rstoolbox.components import DesignFrame, FragmentFrame
    from rstoolbox.components import get_selection

    data = {"identity_perc": [], "positive_perc": []}
    # Get matrix data
    mat = SM.get_matrix(matrix)

    def _register(qseq, ref_aa):
        # Score every residue seen in one position against the reference.
        result = _positional_similarity(qseq, ref_aa, mat)
        size = float(len(qseq))
        data["identity_perc"].append(float(result[1]) / size)
        data["positive_perc"].append(float(result[2]) / size)

    if isinstance(df, DesignFrame):
        if seqID is None:
            raise AttributeError("seqID needs to be provided")
        if not df.has_reference_sequence(seqID):
            raise AttributeError(
                "There is no reference sequence for seqID {}".format(seqID))
        if "sequence_{}".format(seqID) not in df:
            raise KeyError("Sequence {} not found in decoys.".format(seqID))
        if ref_seq is None:
            ref_seq = df.get_reference_sequence(seqID)
        # One column per position, one row per decoy.
        per_position = df.get_sequence(seqID).apply(lambda x: pd.Series(list(x)))
        for order, column in enumerate(per_position.columns.values):
            _register("".join(per_position[column].tolist()), ref_seq[order])
    elif isinstance(df, FragmentFrame):
        if ref_seq is None:
            raise AttributeError("ref_seq needs to be provided")
        for position in df["position"].drop_duplicates().values:
            residues = "".join(df[df["position"] == position]["aa"].values)
            _register(residues, ref_seq[position - 1])
    else:
        raise AttributeError("Input data has to be a DesignFrame with a "
                             "reference sequence or a FragmentFrame.")

    dfo = pd.DataFrame(data)
    # Get shift only from DesignFrame; FragmentFrame does not have one
    shft = df.get_reference_shift(seqID) if isinstance(df, DesignFrame) else 1
    # Shift the index so that index == PDB count
    if isinstance(shft, int):
        dfo.index = dfo.index + shft
    else:
        dfo.index = shft
    # -1 for array like count
    rows = [x - 1 for x in get_selection(key_residues, seqID, list(dfo.index))]
    return dfo.iloc[rows]