def __init__(self):
    """
    Read the circulating and vaccine strain alignments and store them on the
    instance together with the literature efficacy scores, one per year.
    """
    print("class being initialized")

    # Smoke test: make sure sequence_processing still works on the 2012 data;
    # the result itself is discarded.
    _, _ = trim("data/NA_2012_aligned_seq.fa")
    print("done trimming NA_2012 sequences")

    circulating = trim("data/aligned_circulating_strains.fa")
    self.circ_labels, self.circ_seqs = circulating
    vaccine = trim("data/aligned_vaccine_strains.fa")
    self.vax_labels, self.vax_seqs = vaccine

    # Year / efficacy pairs taken from
    # https://doi.org/10.1016/j.vaccine.2006.01.010
    years = [
        1971, 1972, 1973, 1975, 1984, 1985, 1987, 1989, 1992,
        1993, 1994, 1995, 1996, 1997, 1998, 1999, 2001, 2003,
    ]
    efficacies = [
        7, 15, 11, -3, -6, -2, 17, -5, 59,
        38, 25, 45, 28, -17, 34, 43, 55, 12,
    ]
    self.year = np.array(years)
    self.eff = np.array(efficacies)

    # Every data series must describe the same number of entries.
    lengths = (len(self.vax_seqs), len(self.circ_seqs), len(self.year),
               len(self.eff))
    assert len(set(lengths)) == 1
    self.size = lengths[0]
def test_mat(self, seqFile, numTest=1000):
    """
    Proof-of-concept version of "dist_mat()": computes pairwise distances for
    only the first `numTest` sequences of `seqFile` before scaling up to the
    full dataset.

    Parameters
    ----------
    seqFile : path of the sequence file handed to trim().
    numTest : maximum number of leading sequences to compare (default 1000).
        If the file holds fewer sequences, all of them are used.

    Returns
    -------
    Symmetric (k, k) numpy array of self.seq_dist() values, where
    k = min(numTest, number of sequences in seqFile).
    """
    labels, sequences = trim(seqFile)

    # Never index past the sequences that were actually loaded.
    numTest = max(0, min(numTest, len(sequences)))
    testMat = np.zeros((numTest, numTest))

    print('calculating the test distance matrix based on PAM250')
    for i in range(numTest):
        for j in range(i, numTest):
            # Compare the sequences read from seqFile (not instance state).
            testMat[i, j] = self.seq_dist(sequences[i], sequences[j])
            # plug in values for mirror images
            testMat[j, i] = testMat[i, j]
    return testMat
def dist_mat(self, seqFile, hybrid=False, hamming=False):
    """
    Build the symmetric matrix of pairwise distances between every sequence
    read from `seqFile`.

    With `hamming` true, Hamming_dist() is used for each pair; otherwise
    self.seq_dist() is used and `hybrid` is forwarded to it.
    """
    _, seqs = trim(seqFile)
    count = seqs.shape[0]
    matrix = np.zeros((count, count))

    print('calculating the full distance matrix')
    for row in range(count):
        # Only the upper triangle (diagonal included) is computed.
        for col in range(row, count):
            if hamming:
                distance = Hamming_dist(seqs[row], seqs[col])
            else:
                distance = self.seq_dist(seqs[row], seqs[col], hybrid=hybrid)
            matrix[row, col] = distance
            # The lower triangle mirrors the stored upper-triangle value.
            matrix[col, row] = matrix[row, col]
    return matrix
def generate_MDS(dist_mat, method, out_file):
    """
    Runs MDS on a precomputed distance matrix and saves a 2-D scatter plot.

    Parameters
    ----------
    dist_mat : square matrix of pairwise distances, passed to MDS as a
        precomputed dissimilarity.
    method : describes how dist_mat was generated; appended to the title.
    out_file : name of the figure file handed to plt.savefig().

    The embedding stress is printed. The point at row 1 of the embedding is
    highlighted and labelled with the second label read from
    "data/NA_2012_aligned_seq.fa". The figure is left open after saving.
    """
    # Only the labels are needed here (legend text of the highlighted point).
    labels_2012, _ = trim("data/NA_2012_aligned_seq.fa")

    mds = manifold.MDS(n_components=2,
                       max_iter=8000,
                       dissimilarity="precomputed",
                       n_jobs=1)
    results = mds.fit(dist_mat)
    pos = results.embedding_
    print('stress: ' + str(results.stress_))

    # One RGBA colour per embedded point, graded along the autumn colormap by
    # row order. More colormap options:
    # https://matplotlib.org/tutorials/colors/colormaps.html
    num_points = pos.shape[0]
    color_array = mpl.cm.autumn(np.arange(num_points) / float(num_points))

    plt.figure(figsize=(8, 5))  # width and height in inches
    # Colours are given explicitly, so no cmap argument is passed (matplotlib
    # ignores cmap when no data is supplied for colour mapping).
    plt.scatter(pos[:, 0], pos[:, 1], color=color_array, s=10, alpha=0.5)
    plt.scatter(pos[1, 0],
                pos[1, 1],
                color='b',
                marker='*',
                s=50,
                alpha=0.9,
                label=('vaccine target: ' + str(labels_2012[1])))
    plt.legend()
    plt.xlabel('Component 1')
    plt.ylabel('Component 2')
    plt.grid(alpha=0.3)
    plt.title('HA1 sequences of H3N2 virus: North America - 2012 - ' +
              str(method))
    plt.savefig(out_file)
def generate_tSNE(dist_mat, method, out_file):
    """
    Runs tSNE on a precomputed distance matrix and saves a 2-D scatter plot.

    Parameters
    ----------
    dist_mat : square matrix of pairwise distances, passed to TSNE with
        metric='precomputed'.
    method : describes how dist_mat was generated; appended to the title.
    out_file : name of the figure file handed to plt.savefig().

    The final KL divergence and the number of iterations are printed. The
    point at row 1 of the embedding is highlighted and labelled with the
    second label read from "data/NA_2012_aligned_seq.fa". The figure is left
    open after saving.
    """
    # Only the labels are needed here (legend text of the highlighted point).
    labels_2012, _ = trim("data/NA_2012_aligned_seq.fa")

    # init must be 'random' for a precomputed metric: PCA initialisation needs
    # feature vectors, and newer scikit-learn releases default to init='pca',
    # which is rejected together with metric='precomputed'.
    tsne = manifold.TSNE(n_components=2, metric='precomputed', init='random')
    tsne_results = tsne.fit(dist_mat)
    tsne_pos = tsne_results.embedding_
    tsne_divergence = tsne_results.kl_divergence_
    tsne_iter = tsne_results.n_iter_
    print('Kullback-Leibler divergence after optimization: ' +
          str(tsne_divergence))
    print('Number of iterations run: ' + str(tsne_iter))

    # One RGBA colour per embedded point, graded along the autumn colormap by
    # row order. More colormap options:
    # https://matplotlib.org/tutorials/colors/colormaps.html
    num_points = tsne_pos.shape[0]
    color_array = mpl.cm.autumn(np.arange(num_points) / float(num_points))

    plt.figure(figsize=(8, 5))  # width and height in inches
    # Colours are given explicitly, so no cmap argument is passed (matplotlib
    # ignores cmap when no data is supplied for colour mapping).
    plt.scatter(tsne_pos[:, 0],
                tsne_pos[:, 1],
                color=color_array,
                s=10,
                alpha=0.5)
    plt.scatter(tsne_pos[1, 0],
                tsne_pos[1, 1],
                color='b',
                marker='*',
                s=50,
                alpha=0.9,
                label=('vaccine target: ' + str(labels_2012[1])))
    plt.legend()
    plt.xlabel('Component 1')
    plt.ylabel('Component 2')
    plt.grid(alpha=0.3)
    plt.title('HA1 H3N2 North America 2012 - tSNE - ' + str(method) +
              ' - KL divergence: ' + str(round(tsne_divergence, 3)))
    plt.savefig(out_file)