示例#1
0
 def __init__(self):
     """Load strain alignments and the published vaccine efficacy figures.

     Populates ``circ_labels``/``circ_seqs`` and ``vax_labels``/``vax_seqs``
     from the aligned FASTA files via ``trim``, stores the study years and
     the matching efficacy values as numpy arrays (``year`` and ``eff``),
     and records the number of entries in ``size``.

     Raises:
         AssertionError: if the two sequence sets, ``year`` and ``eff`` do
             not all have the same length.
     """
     print("class being initialized")
     # Result intentionally discarded: this call only checks that
     # sequence_processing still works.
     _, _ = trim("data/NA_2012_aligned_seq.fa")
     print("done trimming NA_2012 sequences")

     circulating = trim("data/aligned_circulating_strains.fa")
     self.circ_labels, self.circ_seqs = circulating
     vaccine = trim("data/aligned_vaccine_strains.fa")
     self.vax_labels, self.vax_seqs = vaccine

     # (year, efficacy) pairs; efficacy values are taken from
     # https://doi.org/10.1016/j.vaccine.2006.01.010
     efficacy_by_year = [
         (1971, 7), (1972, 15), (1973, 11), (1975, -3), (1984, -6),
         (1985, -2), (1987, 17), (1989, -5), (1992, 59), (1993, 38),
         (1994, 25), (1995, 45), (1996, 28), (1997, -17), (1998, 34),
         (1999, 43), (2001, 55), (2003, 12),
     ]
     self.year = np.array([year for year, _ in efficacy_by_year])
     self.eff = np.array([efficacy for _, efficacy in efficacy_by_year])

     # All four collections must line up one-to-one.
     assert len({
         len(self.vax_seqs),
         len(self.circ_seqs),
         len(self.year),
         len(self.eff),
     }) == 1
     self.size = len(self.vax_seqs)
    def test_mat(self, seqFile, max_seqs=1000):
        """Build a reduced pairwise distance matrix as a quick proof of concept.

        Works like ``dist_mat()`` but only compares the leading ``max_seqs``
        sequences returned by ``trim(seqFile)``, so the rest of the pipeline
        can be exercised before scaling up to the full dataset.

        Args:
            seqFile: path of the sequence file handed to ``trim``.
            max_seqs: upper bound on the number of sequences compared.
                Defaults to 1000, the size that used to be hard-coded. When
                the file holds fewer sequences, all of them are used.

        Returns:
            A symmetric square numpy array whose entry ``[i, j]`` is
            ``self.seq_dist(sequences[i], sequences[j])``.
        """
        _, sequences = trim(seqFile)
        # Clamp so that short alignments do not index past the end.
        count = min(max_seqs, len(sequences))

        testMat = np.zeros((count, count))
        print('calculating the test distance matrix based on PAM250')
        for i in range(count):
            for j in range(i, count):
                # Use the sequences loaded from seqFile (not instance state)
                # and fill both triangles from a single computation.
                testMat[i, j] = self.seq_dist(sequences[i], sequences[j])
                testMat[j, i] = testMat[i, j]

        return testMat
    def dist_mat(self, seqFile, hybrid=False, hamming=False):
        """Compute the full pairwise distance matrix for one sequence file.

        Args:
            seqFile: path of the sequence file handed to ``trim``.
            hybrid: forwarded to ``self.seq_dist`` when ``hamming`` is false.
            hamming: when true, distances come from ``Hamming_dist`` instead
                of ``self.seq_dist``.

        Returns:
            A symmetric square numpy array with one row and column per
            sequence in the file.
        """
        _, seqs = trim(seqFile)
        total = seqs.shape[0]

        def pair_distance(first, second):
            # Hamming distance takes priority; otherwise fall back to the
            # substitution-matrix distance (optionally in hybrid mode).
            if hamming:
                return Hamming_dist(first, second)
            return self.seq_dist(first, second, hybrid=hybrid)

        result = np.zeros((total, total))
        print('calculating the full distance matrix')
        for row in range(total):
            # Only the upper triangle (diagonal included) is computed; the
            # lower triangle is filled in by mirroring.
            for col in range(row, total):
                result[row, col] = pair_distance(seqs[row], seqs[col])
                result[col, row] = result[row, col]

        return result
示例#4
0
def generate_MDS(dist_mat, method, out_file):
    """Run 2-component MDS on a precomputed distance matrix and save a plot.

    Args:
        dist_mat: precomputed pairwise dissimilarity matrix passed to
            ``manifold.MDS.fit``.
        method: description of how ``dist_mat`` was generated; appended to
            the figure title.
        out_file: file name the figure is saved under.

    The labels and sequence count used for colouring and for the legend are
    read from ``data/NA_2012_aligned_seq.fa``. The final stress of the
    embedding is printed.
    """
    labels_2012, sequences_2012 = trim("data/NA_2012_aligned_seq.fa")
    numSeq_2012 = sequences_2012.shape[0]

    mds = manifold.MDS(n_components=2,
                       max_iter=8000,
                       dissimilarity="precomputed",
                       n_jobs=1)
    results = mds.fit(dist_mat)
    pos = results.embedding_
    print('stress: ' + str(results.stress_))

    # One RGBA row per sequence, walking through the autumn colormap in file
    # order. More colormaps:
    # https://matplotlib.org/tutorials/colors/colormaps.html
    color_array = mpl.cm.autumn(np.arange(numSeq_2012) / float(numSeq_2012))

    # Width and height are in inches.
    plt.figure(figsize=(8, 5))
    # Explicit RGBA colours are supplied, so no ``cmap`` is passed: matplotlib
    # ignores it in that case and emits a warning.
    plt.scatter(pos[:, 0],
                pos[:, 1],
                color=color_array,
                s=10,
                alpha=0.5)
    # Highlight the entry at index 1, which is labelled as the vaccine target.
    plt.scatter(pos[1, 0],
                pos[1, 1],
                color='b',
                marker='*',
                s=50,
                alpha=0.9,
                label=('vaccine target: ' + str(labels_2012[1])))
    plt.legend()
    plt.xlabel('Component 1')
    plt.ylabel('Component 2')
    plt.grid(alpha=0.3)
    plt.title('HA1 sequences of H3N2 virus: North America - 2012 - ' +
              str(method))
    plt.savefig(out_file)
示例#5
0
def generate_tSNE(dist_mat, method, out_file):
    """Run 2-component t-SNE on a precomputed distance matrix and save a plot.

    Args:
        dist_mat: precomputed pairwise distance matrix passed to
            ``manifold.TSNE.fit``.
        method: description of how ``dist_mat`` was generated; appended to
            the figure title.
        out_file: file name the figure is saved under.

    The labels and sequence count used for colouring and for the legend are
    read from ``data/NA_2012_aligned_seq.fa``. The final KL divergence and
    the number of iterations run are printed.
    """
    labels_2012, sequences_2012 = trim("data/NA_2012_aligned_seq.fa")
    numSeq_2012 = sequences_2012.shape[0]

    # init='random' is spelled out because newer scikit-learn releases
    # default to PCA initialisation, which is rejected when
    # metric='precomputed'.
    tsne = manifold.TSNE(n_components=2, metric='precomputed', init='random')
    tsne_results = tsne.fit(dist_mat)
    tsne_pos = tsne_results.embedding_
    tsne_divergence = tsne_results.kl_divergence_
    print('Kullback-Leibler divergence after optimization: ' +
          str(tsne_divergence))
    print('Number of iterations run: ' + str(tsne_results.n_iter_))

    # One RGBA row per sequence, walking through the autumn colormap in file
    # order. More colormaps:
    # https://matplotlib.org/tutorials/colors/colormaps.html
    color_array = mpl.cm.autumn(np.arange(numSeq_2012) / float(numSeq_2012))

    # Width and height are in inches.
    plt.figure(figsize=(8, 5))
    # Explicit RGBA colours are supplied, so no ``cmap`` is passed: matplotlib
    # ignores it in that case and emits a warning.
    plt.scatter(tsne_pos[:, 0],
                tsne_pos[:, 1],
                color=color_array,
                s=10,
                alpha=0.5)
    # Highlight the entry at index 1, which is labelled as the vaccine target.
    plt.scatter(tsne_pos[1, 0],
                tsne_pos[1, 1],
                color='b',
                marker='*',
                s=50,
                alpha=0.9,
                label=('vaccine target: ' + str(labels_2012[1])))

    plt.legend()
    plt.xlabel('Component 1')
    plt.ylabel('Component 2')
    plt.grid(alpha=0.3)
    plt.title('HA1 H3N2 North America 2012 - tSNE - ' + str(method) +
              ' - KL divergence: ' + str(round(tsne_divergence, 3)))
    plt.savefig(out_file)