def _charge(seq, ph=7.0, amide=False):
    """Compute the net charge of one sequence at a given pH.

    The approach is the one first described by Bjellqvist. Every ionizable
    group contributes a fractional charge derived from its pKa and the pH;
    the positive and negative contributions are summed separately and the
    difference is returned. The pKa scale is taken from
    http://www.hbcpnetbase.com/ (CRC Handbook of Chemistry and Physics, 96th ed).

    **positive groups** = {'Nterm': 9.38, 'K': 10.67, 'R': 12.10, 'H': 6.04}

    **negative groups** = {'Cterm': 2.15, 'D': 3.71, 'E': 4.15, 'C': 8.14, 'Y': 10.10}

    With ``amide=True`` the 'Cterm' pKa in the negative table is replaced by 15.

    :param seq: sequence handed to ``count_aas`` to obtain absolute residue counts.
    :param ph: {float} pH at which to calculate peptide charge.
    :param amide: {boolean} whether the sequence has an amidated C-terminus.
    :return: {float} net charge (positive minus negative), rounded to 3 decimals.
    """
    # The two original tables only differ in the C-terminal pKa.
    cterm_pk = 15. if amide else 2.15
    basic_pks = {'Nterm': 9.38, 'K': 10.67, 'R': 12.10, 'H': 6.04}
    acidic_pks = {'Cterm': cterm_pk, 'D': 3.71, 'E': 4.15, 'C': 8.14, 'Y': 10.10}

    # Residue counts plus one pseudo-entry for each terminus.
    counts = count_aas(seq, scale='absolute')
    counts['Nterm'] = 1.0
    counts['Cterm'] = 1.0

    positive = 0.0
    for group, pk in basic_pks.items():
        ratio = 10 ** (pk - ph)
        positive += counts[group] * (ratio / (ratio + 1.0))

    negative = 0.0
    for group, pk in acidic_pks.items():
        ratio = 10 ** (ph - pk)
        negative += counts[group] * (ratio / (ratio + 1.0))

    return round(positive - negative, 3)
def calc_aa_freq(self, plot=True, color='#83AF9B', filename=None):
    """Compute the amino acid frequencies of every (sub-)library.

    Each row of ``self.library`` is joined into one string, counted with
    ``count_aas`` and stored in the matching row of :py:attr:`aafreq`, ordered
    like ``self.AAs``. With ``plot=True`` a bar chart is drawn for every row
    right after it has been computed.

    :param plot: {bool} whether the amino acid frequencies should be plotted as a bar chart.
    :param color: {str} color of the bars
    :param filename: {str} filename to save the plot to; if None, the plot is shown
    :return: {numpy.ndarray} amino acid frequencies in the attribute :py:attr:`aafreq`.
    :Example:

    >>> g = GlobalAnalysis(sequences)  # sequences being a list / array of amino acid sequences
    >>> g.calc_aa_freq()
    >>> g.aafreq
    array([[ 0.08250071,  0.        ,  0.02083928,  0.0159863 ,  0.1464459 ,
             0.04795889,  0.06622895,  0.0262632 ,  0.12988867,  0.        ,
             0.09192121,  0.03111619,  0.01712818,  0.04852983,  0.05937768,
             0.07079646,  0.04396232,  0.0225521 ,  0.05994862,  0.01855552]])

    .. image:: ../docs/static/AA_dist.png
        :height: 300px
    """
    n_libs = self.library.shape[0]
    for lib in range(n_libs):
        counted = count_aas(''.join(self.library[lib]))
        self.aafreq[lib] = [counted[res] for res in self.AAs]

        if not plot:
            continue

        fig, ax = plt.subplots()
        for pos in range(20):
            plt.bar(pos, self.aafreq[lib, pos], 0.9, color=color)

        plt.xlim([-0.75, 19.75])
        plt.ylim([0, max(self.aafreq[lib, :]) + 0.05])
        plt.xticks(range(20), counted.keys(), fontweight='bold')
        plt.ylabel('Amino Acid Frequency', fontweight='bold')
        plt.title('Amino Acid Distribution', fontsize=16, fontweight='bold')

        # keep only the left and bottom axes, no surrounding box
        for side in ('right', 'top'):
            ax.spines[side].set_visible(False)
        ax.xaxis.set_ticks_position('bottom')
        ax.yaxis.set_ticks_position('left')

        if filename:
            plt.savefig(filename)
        else:
            plt.show()
def plot_aa_distr(sequences, color='#83AF9B', filename=None):
    """Plot the amino acid distribution of a given list of sequences.

    All sequences are concatenated, the relative residue frequencies are
    obtained from ``count_aas(..., scale='relative')`` and drawn as one bar
    per residue.

    :param sequences: {list} list of sequences to calculate the amino acid distribution for
    :param color: {str} color to be used (matplotlib style / hex)
    :param filename: {str} location / filename where to save the plot to. *default = None* --> show the plot
    :return: None; the figure is either saved (dpi=300) or shown.
    :Example:

    >>> plot_aa_distr(['KLLKLLKKLLKLLK', 'WWRRWWRAARWWRRWWRR', 'ACDEFGHKLCMNPQRSTVWY', 'GGGGGIIKLWGGGGGGGGGGGGG'])

    .. image:: ../docs/static/AA_dist.png
        :height: 300px

    .. versionadded:: v2.2.5
    """
    concatseq = ''.join(sequences)
    aa = count_aas(concatseq, scale='relative')

    # Dict views cannot be indexed on Python 3 (``aa.values()[a]`` raises
    # TypeError), so materialize keys and values once, in matching order.
    residues = list(aa.keys())
    freqs = list(aa.values())
    positions = range(len(freqs))

    fig, ax = plt.subplots()
    for pos, freq in zip(positions, freqs):
        plt.bar(pos, freq, 0.9, color=color)

    # Same limits as before for the usual 20 residues: [-0.75, 19.75].
    plt.xlim([-0.75, len(freqs) - 0.25])
    plt.ylim([0, max(freqs) + 0.05])
    plt.xticks(positions, residues, fontweight='bold')
    plt.ylabel('Amino Acid Frequency', fontweight='bold')
    plt.title('Amino Acid Distribution', fontsize=16, fontweight='bold')

    # only left and bottom axes, no box
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.xaxis.set_ticks_position('bottom')
    ax.yaxis.set_ticks_position('left')

    if filename:
        plt.savefig(filename, dpi=300)
    else:
        plt.show()
def plot_summary(self, filename=None, colors=None, plot=True):
    """Generate a visual summary of different characteristics of the given library.

    The class methods ``calc_len``, ``calc_aa_freq``, ``calc_charge`` (pH 7.4,
    amidated), ``calc_H`` and ``calc_uH`` are always run. With ``plot=True`` a
    2x3 figure is drawn: amino acid fractions, charge histogram, length box
    plot, hydrophobicity violins, hydrophobic moment violins and a 3D scatter
    of H / charge / uH.

    :param filename: {str} path to save the generated plot to; if None, the plot is shown.
    :param colors: {str / list} color or list of colors to use for plotting. e.g. '#4E395D', 'red', 'k'.
        A single color string is used for every library; a list shorter than the number of
        libraries is cycled.
    :param plot: {boolean} whether the plot should be created or just the features are calculated
    :return: visual summary (plot) of the library characteristics (if ``plot=True``).
    :Example:

    >>> g = GlobalAnalysis([seqs1, seqs2, seqs3])  # seqs being lists / arrays of sequences
    >>> g.plot_summary()

    .. image:: ../docs/static/summary.png
        :height: 600px
    """
    # calculate all global properties
    self.calc_len()
    self.calc_aa_freq(plot=False)
    self.calc_charge(ph=7.4, amide=True)
    self.calc_H()
    self.calc_uH()

    if not plot:
        return

    # plot settings
    fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(25, 15))
    ((ax2, ax5, ax1), (ax3, ax4, ax6)) = axes
    plt.suptitle('Summary', fontweight='bold', fontsize=16.)
    labels = self.libnames
    num = len(labels)
    positions = [x + 1 for x in range(num)]

    # Normalize ``colors`` to a list with at least one entry per library.
    # A bare string used to be indexed character by character.
    if not colors:
        colors = ['#FA6900', '#69D2E7', '#542437', '#53777A', '#CCFC8E', '#9CC4E4']
    elif isinstance(colors, str):
        colors = [colors]
    else:
        colors = list(colors)
    if len(colors) < num:
        colors = [colors[i % len(colors)] for i in range(num)]

    def _violins(ax, groups, ylabel):
        """Draw one black-edged violin per library on ``ax`` and label the axis."""
        for i, values in enumerate(groups):
            vplot = ax.violinplot(values, positions=[i + 1], widths=0.5, showmeans=True, showmedians=False)
            for part in ('cbars', 'cmins', 'cmeans', 'cmaxes'):
                vplot[part].set_edgecolor('black')
            vplot['cmeans'].set_linestyle('--')
            for pc in vplot['bodies']:
                pc.set_facecolor(colors[i % len(colors)])
                pc.set_edgecolor('black')
                pc.set_linewidth(1.5)
                pc.set_alpha(0.7)  # final alpha of the original (0.8 was overwritten)
                pc.set_label(labels[i])
        ax.set_xticks(positions)
        ax.set_xticklabels(labels, fontweight='bold')
        ax.set_ylabel(ylabel, fontweight='bold', fontsize=14.)

    for a in [ax1, ax2, ax3, ax4, ax5, ax6]:
        # only left and bottom axes, no box
        a.spines['right'].set_visible(False)
        a.spines['top'].set_visible(False)
        a.xaxis.set_ticks_position('bottom')
        a.yaxis.set_ticks_position('left')

    # 1 length box plot
    box = ax1.boxplot(self.len, notch=1, vert=1, patch_artist=True)
    plt.setp(box['whiskers'], color='black')
    plt.setp(box['medians'], linestyle='-', linewidth=1.5, color='black')
    for p, patch in enumerate(box['boxes']):
        patch.set(facecolor=colors[p % len(colors)], edgecolor='black', alpha=0.8)
    ax1.set_ylabel('Sequence Length', fontweight='bold', fontsize=14.)
    ax1.set_xticks(positions)
    ax1.set_xticklabels(labels, fontweight='bold')

    # 2 AA bar plot
    d_aa = count_aas('')  # only used for its keys (the tick labels)
    hands = [mpatches.Patch(label=labels[i], facecolor=colors[i], alpha=0.8) for i in range(num)]
    w = .9 / num  # bar width
    offsets = np.arange(start=-w, step=w, stop=num * w)  # bar offsets if many libs
    for i, freqs in enumerate(self.aafreq):
        for a in range(20):
            ax2.bar(a - offsets[i], freqs[a], w, color=colors[i % len(colors)], alpha=0.8)
    ax2.set_xlim([-1., 20.])
    ax2.set_ylim([0, 1.05 * np.max(self.aafreq)])
    ax2.set_xticks(range(20))
    ax2.set_xticklabels(list(d_aa.keys()), fontweight='bold')  # list: dict views are not sequences
    ax2.set_ylabel('Fraction', fontweight='bold', fontsize=14.)
    ax2.set_xlabel('Amino Acids', fontweight='bold', fontsize=14.)
    ax2.legend(handles=hands, labels=labels)

    # 3 hydrophobicity violin plot
    _violins(ax3, self.H, 'Global Hydrophobicity')

    # 4 hydrophobic moment violin plot
    _violins(ax4, self.uH, 'Global Hydrophobic Moment')

    # 5 charge histogram
    # ``density`` replaces the removed ``normed`` keyword of numpy / matplotlib.
    if self.shapes:  # if the library consists of different sized sub libraries
        bwidth = 1. / len(self.shapes)
        for i, c in enumerate(self.charge):
            counts, bins = np.histogram(c, range=[-5, 20], bins=25, density=True)
            ax5.bar(bins[1:] + i * bwidth, counts, bwidth, color=colors[i % len(colors)],
                    label=labels[i], alpha=0.8)
    else:
        ax5.hist(self.charge, 25, density=True, alpha=0.8, align='left', rwidth=0.95, histtype='bar',
                 label=labels, color=colors[:num])
    ax5.set_xlabel('Global Charge', fontweight='bold', fontsize=14.)
    ax5.set_ylabel('Fraction', fontweight='bold', fontsize=14.)
    ax5.set_xlim(-6, 21)
    # Plain strings: bytes literals are rendered with a b'...' prefix on Python 3.
    ax5.text(0.95, 0.8, 'amide: $true$', verticalalignment='center', horizontalalignment='right',
             transform=ax5.transAxes, fontsize=15)
    ax5.text(0.95, 0.75, 'pH: $7.4$', verticalalignment='center', horizontalalignment='right',
             transform=ax5.transAxes, fontsize=15)
    ax5.legend()

    # 6 3D plot: blank out the 2D placeholder axes, then put a 3D axes in the same slot
    ax6.spines['left'].set_visible(False)
    ax6.spines['bottom'].set_visible(False)
    ax6.set_xticks([])
    ax6.set_yticks([])
    ax6 = fig.add_subplot(2, 3, 6, projection='3d')
    for i in range(num):
        ax6.scatter(self.H[i], self.charge[i], self.uH[i], c=colors[i], alpha=.8, s=25, label=labels[i])

    ax6.set_xlabel('H', fontweight='bold', fontsize=14.)
    ax6.set_ylabel('Charge', fontweight='bold', fontsize=14.)
    ax6.set_zlabel('uH', fontweight='bold', fontsize=14.)

    # flatten the per-library data to find common axis limits
    data_c = [item for sublist in self.charge for item in sublist]
    data_H = [item for sublist in self.H for item in sublist]
    data_uH = [item for sublist in self.uH for item in sublist]
    ax6.set_xlim([np.min(data_H), np.max(data_H)])
    ax6.set_ylim([np.min(data_c), np.max(data_c)])
    ax6.set_zlim([np.min(data_uH), np.max(data_uH)])
    ax6.legend(loc='best')

    if filename:
        plt.savefig(filename, dpi=200)
    else:
        plt.show()
def analyze_generated(self, num, fname='analysis.txt', plot=False):
    """Analyze the generated sequences located in `self.generated`.

    A text report is written to ``fname`` containing: the number of duplicates,
    the share of generated sequences also found in ``self.sequences``, length
    statistics, euclidean distances to the training data in 'pepcats'
    autocorrelation space and in scaled global-descriptor space, and
    hydrophobic moments. A random set (``self.ran``) and an amphipathic helix
    set (``self.hel``) are generated as comparison sets and stored on the
    instance.

    :param num: {int} wanted number of sequences to sample
    :param fname: {str} filename to save analysis info to
    :param plot: {bool} whether to plot an overview of descriptors; the figure is saved
        next to ``fname`` with the extension replaced by ``.png``
    :return: file with analysis info (distances)
    """
    import os  # local import: only needed to derive the plot filename

    def _mean_std(values):
        """Return ``(mean, std)`` of ``values`` ready for %-formatting."""
        return np.mean(values), np.std(values)

    with open(fname, 'w') as f:
        print("Analyzing...")
        f.write("ANALYSIS OF SAMPLED SEQUENCES\n==============================\n\n")
        unique_generated = set(self.generated)
        f.write("Nr. of duplicates in generated sequences: %i\n" % (len(self.generated) - len(unique_generated)))
        # NOTE(review): self.sequences is compared unprocessed here but has its first
        # character and trailing whitespace removed further down - confirm both lists
        # use the same formatting.
        count = len(unique_generated & set(self.sequences))  # get shared entries in both lists
        # float(): plain int / int truncates to 0 on Python 2
        f.write("%.1f percent of generated sequences are present in the training data.\n" %
                ((float(count) / len(self.generated)) * 100))

        d = GlobalDescriptor(self.generated)
        len1 = len(d.sequences)
        d.filter_aa('B')
        len2 = len(d.sequences)
        d.length()
        min_len = np.min(d.descriptor)
        max_len = np.max(d.descriptor)
        f.write("\n\nLENGTH DISTRIBUTION OF GENERATED DATA:\n\n")
        f.write("Number of sequences too short:\t%i\n" % (num - len1))
        f.write("Number of invalid (with 'B'):\t%i\n" % (len1 - len2))
        f.write("Number of valid unique seqs:\t%i\n" % len2)
        f.write("Mean sequence length:     \t\t%.1f ± %.1f\n" % _mean_std(d.descriptor))
        f.write("Median sequence length:   \t\t%i\n" % np.median(d.descriptor))
        f.write("Minimal sequence length:  \t\t%i\n" % min_len)
        f.write("Maximal sequence length:  \t\t%i\n" % max_len)

        descriptor = 'pepcats'
        # training sequences without their first character and trailing whitespace
        seq_desc = PeptideDescriptor([s[1:].rstrip() for s in self.sequences], descriptor)
        seq_desc.calculate_autocorr(7)
        gen_desc = PeptideDescriptor(d.sequences, descriptor)
        gen_desc.calculate_autocorr(7)

        # random comparison set
        self.ran = Random(len(self.generated), min_len, max_len)  # generate rand seqs
        # aa distribution of training seqs; list() because a dict view is not a sequence on Python 3
        probas = list(count_aas(''.join(seq_desc.sequences)).values())
        self.ran.generate_sequences(proba=probas)
        ran_desc = PeptideDescriptor(self.ran.sequences, descriptor)
        ran_desc.calculate_autocorr(7)

        # amphipathic helices comparison set
        self.hel = Helices(len(self.generated), min_len, max_len)
        self.hel.generate_sequences()
        hel_desc = PeptideDescriptor(self.hel.sequences, descriptor)
        hel_desc.calculate_autocorr(7)

        # distance calculation
        f.write("\n\nDISTANCE CALCULATION IN '%s' DESCRIPTOR SPACE\n\n" % descriptor.upper())
        desc_dist = distance.cdist(gen_desc.descriptor, seq_desc.descriptor, metric='euclidean')
        f.write("Average euclidean distance of sampled to training data:\t%.3f +/- %.3f\n" % _mean_std(desc_dist))
        ran_dist = distance.cdist(ran_desc.descriptor, seq_desc.descriptor, metric='euclidean')
        f.write("Average euclidean distance if randomly sampled seqs:\t%.3f +/- %.3f\n" % _mean_std(ran_dist))
        hel_dist = distance.cdist(hel_desc.descriptor, seq_desc.descriptor, metric='euclidean')
        f.write("Average euclidean distance if amphipathic helical seqs:\t%.3f +/- %.3f\n" % _mean_std(hel_dist))

        # more simple descriptors
        g_seq = GlobalDescriptor(seq_desc.sequences)
        g_gen = GlobalDescriptor(gen_desc.sequences)
        g_ran = GlobalDescriptor(ran_desc.sequences)
        g_hel = GlobalDescriptor(hel_desc.sequences)
        g_seq.calculate_all()
        g_gen.calculate_all()
        g_ran.calculate_all()
        g_hel.calculate_all()
        sclr = StandardScaler()
        sclr.fit(g_seq.descriptor)
        scaled_seq = sclr.transform(g_seq.descriptor)  # reference set, scaled once
        f.write("\n\nDISTANCE CALCULATION FOR SCALED GLOBAL DESCRIPTORS\n\n")
        desc_dist = distance.cdist(sclr.transform(g_gen.descriptor), scaled_seq, metric='euclidean')
        f.write("Average euclidean distance of sampled to training data:\t%.2f +/- %.2f\n" % _mean_std(desc_dist))
        ran_dist = distance.cdist(sclr.transform(g_ran.descriptor), scaled_seq, metric='euclidean')
        f.write("Average euclidean distance if randomly sampled seqs:\t%.2f +/- %.2f\n" % _mean_std(ran_dist))
        hel_dist = distance.cdist(sclr.transform(g_hel.descriptor), scaled_seq, metric='euclidean')
        f.write("Average euclidean distance if amphipathic helical seqs:\t%.2f +/- %.2f\n" % _mean_std(hel_dist))

        # hydrophobic moments
        uh_seq = PeptideDescriptor(seq_desc.sequences, 'eisenberg')
        uh_seq.calculate_moment()
        uh_gen = PeptideDescriptor(gen_desc.sequences, 'eisenberg')
        uh_gen.calculate_moment()
        uh_ran = PeptideDescriptor(ran_desc.sequences, 'eisenberg')
        uh_ran.calculate_moment()
        uh_hel = PeptideDescriptor(hel_desc.sequences, 'eisenberg')
        uh_hel.calculate_moment()
        f.write("\n\nHYDROPHOBIC MOMENTS\n\n")
        f.write("Hydrophobic moment of training seqs:\t%.3f +/- %.3f\n" % _mean_std(uh_seq.descriptor))
        f.write("Hydrophobic moment of sampled seqs:\t\t%.3f +/- %.3f\n" % _mean_std(uh_gen.descriptor))
        f.write("Hydrophobic moment of random seqs:\t\t%.3f +/- %.3f\n" % _mean_std(uh_ran.descriptor))
        f.write("Hydrophobic moment of amphipathic seqs:\t%.3f +/- %.3f\n" % _mean_std(uh_hel.descriptor))

    if plot:
        if self.refs:
            a = GlobalAnalysis([uh_seq.sequences, uh_gen.sequences, uh_hel.sequences, uh_ran.sequences],
                               ['training', 'sampled', 'hel', 'ran'])
        else:
            a = GlobalAnalysis([uh_seq.sequences, uh_gen.sequences], ['training', 'sampled'])
        # splitext instead of fname[:-4]: works for any (or no) extension
        a.plot_summary(filename=os.path.splitext(fname)[0] + '.png')