예제 #1
0
    def test_fit_parameters_single_cluster(self):
        # Compare the three values returned by model.fit_nonzero_parameters
        # with estimates derived by hand for the simplest case: every contig
        # belongs to the one and only cluster with weight 1.0.
        #
        # A sample with two contigs, each with three data points.
        fasta_path = os.path.join(
            data_path, "bambus2.scaffold.linear.fasta.one_contig")
        f = fileinput.input(fasta_path)
        try:
            c = list(SeqIO.parse(f, "fasta"))
        finally:
            # Release the handle even when parsing raises.
            f.close()

        dna_c = dna.DNA(id=c[0].id, seq=str(c[0].seq))
        dna_c.calculate_signature()

        x = np.log(np.array([[0.5, 3.0, 2.0], [3.0, 1.0, 1.0]]))
        cov_matrix = x
        # one cluster, two contigs
        exp_clust = np.array([[1.0], [1.0]])
        n_contigs = 2.0

        # With unit weights the expected mean is the plain column average.
        correct_mu = x.sum(axis=0) / n_contigs

        # Expected sigma: squared distance of every row to the mean, summed
        # over all rows and divided by the number of contigs.
        correct_sigma = ((x - correct_mu) ** 2).sum(axis=1).sum() / n_contigs

        # Expected multinomial parameters: the same contig is passed twice,
        # so each signature count is doubled, plus a pseudo-count of 1 at
        # every index present in the signature; then normalise to sum to 1.
        c_sig = self.CORRECT_SIGNATURES_ONE_CONTIG
        correct_parameters_mul = np.zeros(dna.DNA.kmer_hash_count)
        # .items() instead of .iteritems() so the test also runs on Python 3.
        for i, v in c_sig.items():
            correct_parameters_mul[i] += v * 2 + 1
        correct_parameters_mul /= np.sum(correct_parameters_mul)

        cal_prob_v, cal_mu, cal_sigma = model.fit_nonzero_parameters(
            [dna_c, dna_c], cov_matrix, expected_clustering=exp_clust)

        # Compare with an absolute tolerance: exact equality on floats breaks
        # on harmless differences in summation order.
        assert_equal(
            (np.abs(cal_prob_v - correct_parameters_mul) < 1e-5).all(), True)
        assert_equal((np.abs(cal_mu - correct_mu) < 1e-7).all(), True)
        assert_equal((np.abs(cal_sigma - correct_sigma) < 1e-7).all(), True)
예제 #2
0
    def test_fit_parameters_two_clusters(self):
        # Compare the three values returned by model.fit_nonzero_parameters
        # with weighted estimates derived by hand for a soft assignment of
        # contigs to two clusters.
        #
        # A sample with four contigs, each with three data points.
        fasta_path = os.path.join(
            data_path, "bambus2.scaffold.linear.fasta.one_contig")
        f = fileinput.input(fasta_path)
        try:
            c = list(SeqIO.parse(f, "fasta"))
        finally:
            # Release the handle even when parsing raises.
            f.close()

        dna_c = dna.DNA(id=c[0].id, seq=str(c[0].seq))
        dna_c.calculate_signature()

        x = np.log(np.array([[0.5, 3.0, 2.0], [3.0, 1.0, 1.0],
                             [2.5, 1.5, 1.0], [1.0, 1.0, 2.0]]))
        cov_matrix = x
        # two clusters, four contigs; row i holds the weights of contig i
        exp_clust = np.array([[0.7, 0.3],
                              [0.1, 0.9],
                              [0.5, 0.5],
                              [0.2, 0.8]])
        # Total weight per cluster (1.5 and 2.5 for the values above).
        cluster_weight = exp_clust.sum(axis=0)

        # Expected means: weighted column averages, one row per cluster.
        # correct_mu[k, j] = sum_i x[i, j] * exp_clust[i, k] / cluster_weight[k]
        correct_mu = exp_clust.T.dot(x) / cluster_weight[:, np.newaxis]

        # sq_dist[i, k] is the squared distance from contig i to the mean of
        # cluster k; the expected sigma of a cluster is the weighted average
        # of those distances.
        deviations = x[:, np.newaxis, :] - correct_mu[np.newaxis, :, :]
        sq_dist = (deviations ** 2).sum(axis=2)
        correct_sigma = (sq_dist * exp_clust).sum(axis=0) / cluster_weight

        # Expected multinomial parameters, one row per cluster: the same
        # contig is passed four times, so each signature count is scaled by
        # the cluster's total weight, plus a pseudo-count of 1 at every index
        # present in the signature; each row is then normalised to sum to 1.
        c_sig = self.CORRECT_SIGNATURES_ONE_CONTIG
        correct_parameters_mul = np.zeros((2, dna.DNA.kmer_hash_count))
        # .items() instead of .iteritems() so the test also runs on Python 3.
        for i, v in c_sig.items():
            correct_parameters_mul[:, i] += v * cluster_weight + 1
        row_totals = correct_parameters_mul.sum(axis=1)
        correct_parameters_mul /= row_totals[:, np.newaxis]

        cal_prob_v, cal_mu, cal_sigma = model.fit_nonzero_parameters(
            [dna_c, dna_c, dna_c, dna_c], cov_matrix,
            expected_clustering=exp_clust)

        assert_equal(
            (np.abs(cal_prob_v - correct_parameters_mul) < 1e-5).all(), True)
        assert_equal((np.abs(cal_mu - correct_mu) < 1e-7).all(), True)
        assert_equal((np.abs(cal_sigma - correct_sigma) < 1e-7).all(), True)