Exemplo n.º 1
0
def _charge(seq, ph=7.0, amide=False):
    """Compute the net charge of a single sequence at a given pH.

    The method used is first described by Bjellqvist. Every ionizable group contributes a fractional charge of
    ``r / (r + 1)`` with ``r = 10**(pKa - pH)`` for positive groups and ``r = 10**(pH - pKa)`` for negative groups,
    weighted by the number of occurrences of that group. The pKa scale is extracted from:
    http://www.hbcpnetbase.com/ (CRC Handbook of Chemistry and Physics, 96th ed).

    **pos_pks** = {'Nterm': 9.38, 'K': 10.67, 'R': 12.10, 'H': 6.04}

    **neg_pks** = {'Cterm': 2.15, 'D': 3.71, 'E': 4.15, 'C': 8.14, 'Y': 10.10}

    In the case of amidation, the 'Cterm' pKa in **neg_pks** is 15 instead of 2.15.

    :param seq: {str} sequence to calculate the charge for (passed to ``count_aas`` with ``scale='absolute'``).
    :param ph: {float} pH at which to calculate peptide charge.
    :param amide: {boolean} whether the sequence has an amidated C-terminus.
    :return: {float} positive minus negative charge, rounded to 3 decimals.
    """
    # only the C-terminal pKa depends on amidation; all other groups are shared
    cterm_pk = 15. if amide else 2.15
    pos_pks = {'Nterm': 9.38, 'K': 10.67, 'R': 12.10, 'H': 6.04}
    neg_pks = {'Cterm': cterm_pk, 'D': 3.71, 'E': 4.15, 'C': 8.14, 'Y': 10.10}

    # residue counts plus exactly one of each terminus
    counts = count_aas(seq, scale='absolute')
    counts['Nterm'] = 1.0
    counts['Cterm'] = 1.0

    def _weighted_fraction(group, exponent):
        """Occurrences of ``group`` times its charged fraction r / (r + 1), r = 10**exponent."""
        ratio = 10**exponent
        return counts[group] * (ratio / (ratio + 1.0))

    # accumulate in plain loops (not sum()) to keep the floating point summation order unchanged
    positive = 0.0
    for group, pk in pos_pks.items():
        positive += _weighted_fraction(group, pk - ph)

    negative = 0.0
    for group, pk in neg_pks.items():
        negative += _weighted_fraction(group, ph - pk)

    return round(positive - negative, 3)
Exemplo n.º 2
0
    def calc_aa_freq(self, plot=True, color='#83AF9B', filename=None):
        """Method to get the amino acid frequencies of the library, one row per sub-library.

        For every row of ``self.library`` all sequences are concatenated, counted with ``count_aas`` (default
        scale) and the values are stored in ``self.aafreq`` in the order given by ``self.AAs``. If ``plot`` is
        set, one bar plot per sub-library is created.

        :param plot: {bool} whether the amino acid frequencies should be plotted in a histogram.
        :param color: {str} color of the plot
        :param filename: {str} filename to save the plot to, if None, the plot is shown. Every sub-library plot is
            saved to this same path.
        :return: {numpy.ndarray} amino acid frequencies in the attribute :py:attr:`aafreq`. The values are ordered
            like ``self.AAs``.
        :Example:

        >>> g = GlobalAnalysis(sequences)  # sequences being a list / array of amino acid sequences
        >>> g.calc_aa_freq()
        >>> g.aafreq
            array([[ 0.08250071,  0.        ,  0.02083928,  0.0159863 ,  0.1464459 ,
                     0.04795889,  0.06622895,  0.0262632 ,  0.12988867,  0.        ,
                     0.09192121,  0.03111619,  0.01712818,  0.04852983,  0.05937768,
                     0.07079646,  0.04396232,  0.0225521 ,  0.05994862,  0.01855552]])

        .. image:: ../docs/static/AA_dist.png
            :height: 300px
        """
        n_libs = self.library.shape[0]
        for lib in range(n_libs):
            aa_counts = count_aas(''.join(self.library[lib]))
            self.aafreq[lib] = [aa_counts[aa] for aa in self.AAs]

            if not plot:
                continue  # frequencies only, nothing to draw for this sub-library

            fig, ax = plt.subplots()

            # one bar per amino acid position
            for pos in range(20):
                ax.bar(pos, self.aafreq[lib, pos], 0.9, color=color)

            ax.set_xlim([-0.75, 19.75])
            ax.set_ylim([0, max(self.aafreq[lib, :]) + 0.05])
            ax.set_xticks(range(20))
            ax.set_xticklabels(aa_counts.keys(), fontweight='bold')
            ax.set_ylabel('Amino Acid Frequency', fontweight='bold')
            ax.set_title('Amino Acid Distribution', fontsize=16, fontweight='bold')

            # only left and bottom axes, no box
            for side in ('right', 'top'):
                ax.spines[side].set_visible(False)
            ax.xaxis.set_ticks_position('bottom')
            ax.yaxis.set_ticks_position('left')

            if filename:
                plt.savefig(filename)
            else:
                plt.show()
Exemplo n.º 3
0
def plot_aa_distr(sequences, color='#83AF9B', filename=None):
    """Method to plot the amino acid distribution of a given list of sequences

    All sequences are concatenated and counted with ``count_aas(..., scale='relative')``; the first 20 values are
    drawn as a bar plot, labelled with the keys returned by ``count_aas``.

    :param sequences: {list} list of sequences to calculate the amino acid distribution for
    :param color: {str} color to be used (matplotlib style / hex)
    :param filename: {str} location / filename where to save the plot to. *default = None* --> show the plot
    :return: None; the plot is saved (300 dpi) if ``filename`` is given, otherwise shown.
    :Example:

    >>> plot_aa_distr(['KLLKLLKKLLKLLK', 'WWRRWWRAARWWRRWWRR', 'ACDEFGHKLCMNPQRSTVWY', 'GGGGGIIKLWGGGGGGGGGGGGG'])

    .. image:: ../docs/static/AA_dist.png
        :height: 300px

    .. versionadded:: v2.2.5
    """
    concatseq = ''.join(sequences)
    aa = count_aas(concatseq, scale='relative')

    # dict views cannot be indexed on Python 3, so materialize keys and values once
    labels = list(aa.keys())
    freqs = list(aa.values())

    fig, ax = plt.subplots()

    for pos in range(20):
        ax.bar(pos, freqs[pos], 0.9, color=color)

    ax.set_xlim([-0.75, 19.75])
    ax.set_ylim([0, max(freqs) + 0.05])
    ax.set_xticks(range(20))
    ax.set_xticklabels(labels, fontweight='bold')
    ax.set_ylabel('Amino Acid Frequency', fontweight='bold')
    ax.set_title('Amino Acid Distribution', fontsize=16, fontweight='bold')

    # only left and bottom axes, no box
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.xaxis.set_ticks_position('bottom')
    ax.yaxis.set_ticks_position('left')

    if filename:
        plt.savefig(filename, dpi=300)
    else:
        plt.show()
Exemplo n.º 4
0
    def plot_summary(self, filename=None, colors=None, plot=True):
        """Method to generate a visual summary of different characteristics of the given library. The class methods
        are used with their standard options, except for the charge which is calculated with ``ph=7.4`` and
        ``amide=True``.

        The figure holds six panels: amino acid bar plot, charge histogram, sequence length box plot (top row),
        hydrophobicity violin plot, hydrophobic moment violin plot and a 3D scatter of H / charge / uH (bottom row).

        :param filename: {str} path to save the generated plot to. If None, the plot is shown.
        :param colors: {str / list} color or list of colors to use for plotting. e.g. '#4E395D', 'red', 'k'. A
            single color string is used for all sub-libraries. If not given, six default colors are used.
        :param plot: {boolean} whether the plot should be created or just the features are calculated
        :return: visual summary (plot) of the library characteristics (if ``plot=True``).
        :Example:

        >>> g = GlobalAnalysis([seqs1, seqs2, seqs3])  # seqs being lists / arrays of sequences
        >>> g.plot_summary()

        .. image:: ../docs/static/summary.png
            :height: 600px
        """
        # calculate all global properties
        self.calc_len()
        self.calc_aa_freq(plot=False)
        self.calc_charge(ph=7.4, amide=True)
        self.calc_H()
        self.calc_uH()

        if not plot:
            return

        # plot settings
        fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(25, 15))
        ((ax2, ax5, ax1), (ax3, ax4, ax6)) = axes
        plt.suptitle('Summary', fontweight='bold', fontsize=16.)
        labels = self.libnames
        num = len(labels)
        if not colors:
            colors = [
                '#FA6900', '#69D2E7', '#542437', '#53777A', '#CCFC8E',
                '#9CC4E4'
            ]
        elif isinstance(colors, str):
            # a single color string must not be indexed per character; repeat it for every sub-library
            colors = [colors] * num

        def _violins(ax, groups, ylabel):
            """Draw one violin per sub-library on ``ax`` and label ticks and y axis."""
            for i, values in enumerate(groups):
                vplot = ax.violinplot(values,
                                      positions=[i + 1],
                                      widths=0.5,
                                      showmeans=True,
                                      showmedians=False)
                # adapt the line elements of the violin dictionary
                for part in ('cbars', 'cmins', 'cmeans', 'cmaxes'):
                    vplot[part].set_edgecolor('black')
                vplot['cmeans'].set_linestyle('--')
                for pc in vplot['bodies']:
                    pc.set_facecolor(colors[i])
                    pc.set_edgecolor('black')
                    pc.set_linewidth(1.5)
                    pc.set_alpha(0.7)
                    pc.set_label(labels[i])
            ax.set_xticks([x + 1 for x in range(len(labels))])
            ax.set_xticklabels(labels, fontweight='bold')
            ax.set_ylabel(ylabel, fontweight='bold', fontsize=14.)

        for a in [ax1, ax2, ax3, ax4, ax5, ax6]:
            # only left and bottom axes, no box
            a.spines['right'].set_visible(False)
            a.spines['top'].set_visible(False)
            a.xaxis.set_ticks_position('bottom')
            a.yaxis.set_ticks_position('left')

        # 1 length box plot
        box = ax1.boxplot(self.len, notch=1, vert=1, patch_artist=True)
        plt.setp(box['whiskers'], color='black')
        plt.setp(box['medians'],
                 linestyle='-',
                 linewidth=1.5,
                 color='black')
        for p, patch in enumerate(box['boxes']):
            patch.set(facecolor=colors[p], edgecolor='black', alpha=0.8)
        ax1.set_ylabel('Sequence Length', fontweight='bold', fontsize=14.)
        ax1.set_xticks([x + 1 for x in range(len(labels))])
        ax1.set_xticklabels(labels, fontweight='bold')

        # 2 AA bar plot
        d_aa = count_aas('')  # only the keys are used, as tick labels
        hands = [
            mpatches.Patch(label=labels[i], facecolor=colors[i], alpha=0.8)
            for i in range(len(labels))
        ]
        w = .9 / num  # bar width
        offsets = np.arange(start=-w, step=w,
                            stop=num * w)  # bar offsets if many libs
        for i, l in enumerate(self.aafreq):
            for a in range(20):
                ax2.bar(a - offsets[i],
                        l[a],
                        w,
                        color=colors[i],
                        alpha=0.8)
        ax2.set_xlim([-1., 20.])
        ax2.set_ylim([0, 1.05 * np.max(self.aafreq)])
        ax2.set_xticks(range(20))
        ax2.set_xticklabels(list(d_aa.keys()), fontweight='bold')
        ax2.set_ylabel('Fraction', fontweight='bold', fontsize=14.)
        ax2.set_xlabel('Amino Acids', fontweight='bold', fontsize=14.)
        ax2.legend(handles=hands, labels=labels)

        # 3 hydrophobicity violin plot
        _violins(ax3, self.H, 'Global Hydrophobicity')

        # 4 hydrophobic moment violin plot
        _violins(ax4, self.uH, 'Global Hydrophobic Moment')

        # 5 charge histogram
        # ``density=True`` replaces the removed ``normed`` argument of numpy / matplotlib histograms
        if self.shapes:  # if the library consists of different sized sub libraries
            bwidth = 1. / len(self.shapes)
            for i, c in enumerate(self.charge):
                counts, bins = np.histogram(c,
                                            range=[-5, 20],
                                            bins=25,
                                            density=True)
                ax5.bar(bins[1:] + i * bwidth,
                        counts,
                        bwidth,
                        color=colors[i],
                        label=labels[i],
                        alpha=0.8)
        else:
            ax5.hist(self.charge,
                     25,
                     density=True,
                     alpha=0.8,
                     align='left',
                     rwidth=0.95,
                     histtype='bar',
                     label=labels,
                     color=colors[:num])
        ax5.set_xlabel('Global Charge', fontweight='bold', fontsize=14.)
        ax5.set_ylabel('Fraction', fontweight='bold', fontsize=14.)
        ax5.set_xlim(-6, 21)
        # str (not bytes) literals, otherwise Python 3 renders the b'...' repr in the figure
        ax5.text(0.95,
                 0.8,
                 'amide: $true$',
                 verticalalignment='center',
                 horizontalalignment='right',
                 transform=ax5.transAxes,
                 fontsize=15)
        ax5.text(0.95,
                 0.75,
                 'pH:  $7.4$',
                 verticalalignment='center',
                 horizontalalignment='right',
                 transform=ax5.transAxes,
                 fontsize=15)
        ax5.legend()

        # 6 3D plot: blank the 2D placeholder axes, then put a 3D axes on the same grid position
        ax6.spines['left'].set_visible(False)
        ax6.spines['bottom'].set_visible(False)
        ax6.set_xticks([])
        ax6.set_yticks([])
        ax6 = fig.add_subplot(2, 3, 6, projection='3d')
        for i in range(num):
            xt = self.H[i]  # hydrophobicity values of sub-library i
            yt = self.charge[i]  # charge values of sub-library i
            zt = self.uH[i]  # hydrophobic moment values of sub-library i
            ax6.scatter(xt,
                        yt,
                        zt,
                        c=colors[i],
                        alpha=.8,
                        s=25,
                        label=labels[i])

        ax6.set_xlabel('H', fontweight='bold', fontsize=14.)
        ax6.set_ylabel('Charge', fontweight='bold', fontsize=14.)
        ax6.set_zlabel('uH', fontweight='bold', fontsize=14.)
        data_c = [item for sublist in self.charge
                  for item in sublist]  # flatten charge data into one list
        data_H = [item for sublist in self.H
                  for item in sublist]  # flatten H data into one list
        data_uH = [item for sublist in self.uH
                   for item in sublist]  # flatten uH data into one list
        ax6.set_xlim([np.min(data_H), np.max(data_H)])
        ax6.set_ylim([np.min(data_c), np.max(data_c)])
        ax6.set_zlim([np.min(data_uH), np.max(data_uH)])
        ax6.legend(loc='best')

        if filename:
            plt.savefig(filename, dpi=200)
        else:
            plt.show()
Exemplo n.º 5
0
    def analyze_generated(self, num, fname='analysis.txt', plot=False):
        """ Method to analyze the generated sequences located in `self.generated`.

        The report written to ``fname`` contains duplicate / overlap statistics, the length distribution of the
        generated sequences, euclidean distances to the training data in 'pepcats' autocorrelation space and in
        scaled global descriptor space, and 'eisenberg' hydrophobic moments. Each measure is also computed for two
        comparison sets of the same size as ``self.generated``: random sequences (stored in ``self.ran``) and
        amphipathic helices (stored in ``self.hel``).

        :param num: {int} wanted number of sequences to sample; only used to report the number of sequences
            that were too short (``num`` minus the number of sequences in ``self.generated``)
        :param fname: {str} filename to save analysis info to
        :param plot: {bool} whether to plot an overview of descriptors; the figure is saved next to ``fname`` with
            the extension replaced by ``.png``
        :return: file with analysis info (distances)
        """
        with open(fname, 'w') as f:
            print("Analyzing...")
            f.write("ANALYSIS OF SAMPLED SEQUENCES\n==============================\n\n")
            f.write("Nr. of duplicates in generated sequences: %i\n" % (len(self.generated) - len(set(self.generated))))
            # NOTE(review): the training sequences are trimmed with ``s[1:].rstrip()`` before descriptors are
            # calculated below, but are compared untrimmed here - confirm that both lists share one format.
            count = len(set(self.generated) & set(self.sequences))  # get shared entries in both lists
            f.write("%.1f percent of generated sequences are present in the training data.\n" %
                    ((count / len(self.generated)) * 100))
            d = GlobalDescriptor(self.generated)
            len1 = len(d.sequences)
            d.filter_aa('B')
            len2 = len(d.sequences)
            d.length()
            f.write("\n\nLENGTH DISTRIBUTION OF GENERATED DATA:\n\n")
            f.write("Number of sequences too short:\t%i\n" % (num - len1))
            f.write("Number of invalid (with 'B'):\t%i\n" % (len1 - len2))
            f.write("Number of valid unique seqs:\t%i\n" % len2)
            f.write("Mean sequence length:     \t\t%.1f ± %.1f\n" % (np.mean(d.descriptor), np.std(d.descriptor)))
            f.write("Median sequence length:   \t\t%i\n" % np.median(d.descriptor))
            f.write("Minimal sequence length:  \t\t%i\n" % np.min(d.descriptor))
            f.write("Maximal sequence length:  \t\t%i\n" % np.max(d.descriptor))

            descriptor = 'pepcats'
            # training sequences without their first character and trailing whitespace
            seq_desc = PeptideDescriptor([s[1:].rstrip() for s in self.sequences], descriptor)
            seq_desc.calculate_autocorr(7)
            gen_desc = PeptideDescriptor(d.sequences, descriptor)
            gen_desc.calculate_autocorr(7)

            # random comparison set
            self.ran = Random(len(self.generated), np.min(d.descriptor), np.max(d.descriptor))  # generate rand seqs
            # get the aa distribution of training seqs; materialized as a list because a dict view is not a
            # sequence and cannot be used as a probability vector
            probas = list(count_aas(''.join(seq_desc.sequences)).values())
            self.ran.generate_sequences(proba=probas)
            ran_desc = PeptideDescriptor(self.ran.sequences, descriptor)
            ran_desc.calculate_autocorr(7)

            # amphipathic helices comparison set
            self.hel = Helices(len(self.generated), np.min(d.descriptor), np.max(d.descriptor))
            self.hel.generate_sequences()
            hel_desc = PeptideDescriptor(self.hel.sequences, descriptor)
            hel_desc.calculate_autocorr(7)

            # distance calculation
            f.write("\n\nDISTANCE CALCULATION IN '%s' DESCRIPTOR SPACE\n\n" % descriptor.upper())
            desc_dist = distance.cdist(gen_desc.descriptor, seq_desc.descriptor, metric='euclidean')
            f.write("Average euclidean distance of sampled to training data:\t%.3f +/- %.3f\n" %
                    (np.mean(desc_dist), np.std(desc_dist)))
            ran_dist = distance.cdist(ran_desc.descriptor, seq_desc.descriptor, metric='euclidean')
            f.write("Average euclidean distance if randomly sampled seqs:\t%.3f +/- %.3f\n" %
                    (np.mean(ran_dist), np.std(ran_dist)))
            hel_dist = distance.cdist(hel_desc.descriptor, seq_desc.descriptor, metric='euclidean')
            f.write("Average euclidean distance if amphipathic helical seqs:\t%.3f +/- %.3f\n" %
                    (np.mean(hel_dist), np.std(hel_dist)))

            # more simple descriptors
            g_seq = GlobalDescriptor(seq_desc.sequences)
            g_gen = GlobalDescriptor(gen_desc.sequences)
            g_ran = GlobalDescriptor(ran_desc.sequences)
            g_hel = GlobalDescriptor(hel_desc.sequences)
            g_seq.calculate_all()
            g_gen.calculate_all()
            g_ran.calculate_all()
            g_hel.calculate_all()
            # the scaler is fitted on the training data only and then applied to every set
            sclr = StandardScaler()
            sclr.fit(g_seq.descriptor)
            f.write("\n\nDISTANCE CALCULATION FOR SCALED GLOBAL DESCRIPTORS\n\n")
            desc_dist = distance.cdist(sclr.transform(g_gen.descriptor), sclr.transform(g_seq.descriptor),
                                       metric='euclidean')
            f.write("Average euclidean distance of sampled to training data:\t%.2f +/- %.2f\n" %
                    (np.mean(desc_dist), np.std(desc_dist)))
            ran_dist = distance.cdist(sclr.transform(g_ran.descriptor), sclr.transform(g_seq.descriptor),
                                      metric='euclidean')
            f.write("Average euclidean distance if randomly sampled seqs:\t%.2f +/- %.2f\n" %
                    (np.mean(ran_dist), np.std(ran_dist)))
            hel_dist = distance.cdist(sclr.transform(g_hel.descriptor), sclr.transform(g_seq.descriptor),
                                      metric='euclidean')
            f.write("Average euclidean distance if amphipathic helical seqs:\t%.2f +/- %.2f\n" %
                    (np.mean(hel_dist), np.std(hel_dist)))

            # hydrophobic moments
            uh_seq = PeptideDescriptor(seq_desc.sequences, 'eisenberg')
            uh_seq.calculate_moment()
            uh_gen = PeptideDescriptor(gen_desc.sequences, 'eisenberg')
            uh_gen.calculate_moment()
            uh_ran = PeptideDescriptor(ran_desc.sequences, 'eisenberg')
            uh_ran.calculate_moment()
            uh_hel = PeptideDescriptor(hel_desc.sequences, 'eisenberg')
            uh_hel.calculate_moment()
            f.write("\n\nHYDROPHOBIC MOMENTS\n\n")
            f.write("Hydrophobic moment of training seqs:\t%.3f +/- %.3f\n" %
                    (np.mean(uh_seq.descriptor), np.std(uh_seq.descriptor)))
            f.write("Hydrophobic moment of sampled seqs:\t\t%.3f +/- %.3f\n" %
                    (np.mean(uh_gen.descriptor), np.std(uh_gen.descriptor)))
            f.write("Hydrophobic moment of random seqs:\t\t%.3f +/- %.3f\n" %
                    (np.mean(uh_ran.descriptor), np.std(uh_ran.descriptor)))
            f.write("Hydrophobic moment of amphipathic seqs:\t%.3f +/- %.3f\n" %
                    (np.mean(uh_hel.descriptor), np.std(uh_hel.descriptor)))

        if plot:
            import os  # local import: only needed to derive the figure name

            if self.refs:
                a = GlobalAnalysis([uh_seq.sequences, uh_gen.sequences, uh_hel.sequences, uh_ran.sequences],
                                   ['training', 'sampled', 'hel', 'ran'])
            else:
                a = GlobalAnalysis([uh_seq.sequences, uh_gen.sequences], ['training', 'sampled'])
            # replace whatever extension ``fname`` has (not just 3-character ones) by '.png'
            a.plot_summary(filename=os.path.splitext(fname)[0] + '.png')