def test_fit_parameters_single_cluster(self):
    """Check fit_nonzero_parameters for one cluster holding two contigs.

    The same contig is passed twice, each with full membership (1.0) in
    the single cluster, together with a 2x3 log-coverage matrix. The
    returned k-mer probabilities, mean and sigma are compared with values
    computed by hand below.

    Comparisons use absolute tolerances (1e-5 for the k-mer probabilities,
    1e-7 for mean and sigma) rather than exact float equality, matching
    the two-cluster test.
    """
    # A sample with two contigs, each with three data points.
    f = fileinput.input(
        os.path.join(data_path, "bambus2.scaffold.linear.fasta.one_contig"))
    try:
        c = list(SeqIO.parse(f, "fasta"))
    finally:
        # Release the handle even if parsing fails.
        f.close()

    dna_c = dna.DNA(id=c[0].id, seq=str(c[0].seq))
    dna_c.calculate_signature()

    x = np.log(np.array([[0.5, 3.0, 2.0],
                         [3.0, 1.0, 1.0]]))
    cov_matrix = x

    # one cluster, two contigs
    exp_clust = np.array([[1.0], [1.0]])

    # Expected mean: per-column average of the log coverage.
    mu0 = np.log(np.array([0.5, 3.0])).sum() / 2.0
    mu1 = np.log([3.0, 1.0]).sum() / 2.0
    mu2 = np.log([2.0, 1.0]).sum() / 2.0
    correct_mu = np.array([mu0, mu1, mu2])

    # Expected sigma: squared distance to the mean, averaged over contigs.
    diff_vec = [(x[i, 0] - mu0)**2 + (x[i, 1] - mu1)**2 + (x[i, 2] - mu2)**2
                for i in range(2)]
    correct_sigma = np.array(diff_vec).sum()
    correct_sigma /= 2.0

    # Expected k-mer probabilities: each observed count is doubled (the
    # contig appears twice) and incremented by one before normalising.
    # NOTE(review): the "+ 1" is presumably a pseudocount - confirm
    # against model.fit_nonzero_parameters.
    c_sig = self.CORRECT_SIGNATURES_ONE_CONTIG
    correct_parameters_mul = np.zeros(dna.DNA.kmer_hash_count)
    # .items() works on both Python 2 and Python 3 (iteritems is 2-only).
    for i, v in c_sig.items():
        correct_parameters_mul[i] += v*2 + 1
    correct_parameters_mul /= np.sum(correct_parameters_mul)

    cal_prob_v, cal_mu, cal_sigma = model.fit_nonzero_parameters(
        [dna_c, dna_c], cov_matrix, expected_clustering=exp_clust)

    assert_equal(
        (np.abs(cal_prob_v - correct_parameters_mul) < 1e-5).all(), True)
    assert_equal((np.abs(cal_mu - correct_mu) < 1e-7).all(), True)
    assert_equal((np.abs(cal_sigma - correct_sigma) < 1e-7).all(), True)
def test_fit_parameters_two_clusters(self):
    """Check fit_nonzero_parameters for two clusters and four contigs.

    The same contig is passed four times with soft memberships given by
    ``exp_clust`` (one row per contig, one column per cluster), together
    with a 4x3 log-coverage matrix. The returned k-mer probabilities,
    means and sigmas are compared with hand-computed, membership-weighted
    values within absolute tolerances (1e-5 and 1e-7).
    """
    # A sample with four contigs, each with three data points.
    f = fileinput.input(
        os.path.join(data_path, "bambus2.scaffold.linear.fasta.one_contig"))
    try:
        c = list(SeqIO.parse(f, "fasta"))
    finally:
        # Release the handle even if parsing fails.
        f.close()

    dna_c = dna.DNA(id=c[0].id, seq=str(c[0].seq))
    dna_c.calculate_signature()

    x = np.log(np.array([[0.5, 3.0, 2.0],
                         [3.0, 1.0, 1.0],
                         [2.5, 1.5, 1.0],
                         [1.0, 1.0, 2.0]]))
    cov_matrix = x

    # two clusters, four contigs
    exp_clust = np.array([[0.7, 0.3],
                          [0.1, 0.9],
                          [0.5, 0.5],
                          [0.2, 0.8]])

    # Expected means: membership-weighted averages of the log coverage.
    # The divisors 1.5 and 2.5 are the column sums of exp_clust.
    mu00 = (np.log(0.5)*0.7 + np.log(3.0)*0.1
            + np.log(2.5)*0.5 + np.log(1.0)*0.2)/1.5
    mu10 = (np.log(3.0)*0.7 + np.log(1.0)*0.1
            + np.log(1.5)*0.5 + np.log(1.0)*0.2)/1.5
    mu20 = (np.log(2.0)*0.7 + np.log(1.0)*0.1
            + np.log(1.0)*0.5 + np.log(2.0)*0.2)/1.5
    mu01 = (np.log(0.5)*0.3 + np.log(3.0)*0.9
            + np.log(2.5)*0.5 + np.log(1.0)*0.8)/2.5
    mu11 = (np.log(3.0)*0.3 + np.log(1.0)*0.9
            + np.log(1.5)*0.5 + np.log(1.0)*0.8)/2.5
    mu21 = (np.log(2.0)*0.3 + np.log(1.0)*0.9
            + np.log(1.0)*0.5 + np.log(2.0)*0.8)/2.5
    correct_mu = np.array([[mu00, mu10, mu20],
                           [mu01, mu11, mu21]])
    mu = correct_mu

    # Expected sigmas: membership-weighted squared distance to each
    # cluster mean, normalised by that cluster's total membership.
    sigma_test0 = np.array(
        [((x[i, 0] - mu[0, 0])**2
          + (x[i, 1] - mu[0, 1])**2
          + (x[i, 2] - mu[0, 2])**2)*exp_clust[i, 0]
         for i in range(4)]).sum()
    sigma_test0 /= 1.5
    sigma_test1 = np.array(
        [((x[i, 0] - mu[1, 0])**2
          + (x[i, 1] - mu[1, 1])**2
          + (x[i, 2] - mu[1, 2])**2)*exp_clust[i, 1]
         for i in range(4)]).sum()
    sigma_test1 /= 2.5
    correct_sigma = np.array([sigma_test0, sigma_test1])

    # Expected k-mer probabilities per cluster: each observed count is
    # scaled by the cluster's total membership and incremented by one
    # before normalising.
    # NOTE(review): the "+ 1" is presumably a pseudocount - confirm
    # against model.fit_nonzero_parameters.
    c_sig = self.CORRECT_SIGNATURES_ONE_CONTIG
    correct_parameters_mul0 = np.zeros(dna.DNA.kmer_hash_count)
    correct_parameters_mul1 = np.zeros(dna.DNA.kmer_hash_count)
    # .items() works on both Python 2 and Python 3 (iteritems is 2-only).
    for i, v in c_sig.items():
        correct_parameters_mul0[i] += v*(0.7+0.1+0.5+0.2) + 1
        correct_parameters_mul1[i] += v*(0.3+0.9+0.5+0.8) + 1
    correct_parameters_mul0 /= np.sum(correct_parameters_mul0)
    correct_parameters_mul1 /= np.sum(correct_parameters_mul1)
    correct_parameters_mul = np.array([correct_parameters_mul0,
                                       correct_parameters_mul1])

    cal_prob_v, cal_mu, cal_sigma = model.fit_nonzero_parameters(
        [dna_c, dna_c, dna_c, dna_c], cov_matrix,
        expected_clustering=exp_clust)

    assert_equal(
        (np.abs(cal_prob_v - correct_parameters_mul) < 1e-5).all(), True)
    assert_equal((np.abs(cal_mu - correct_mu) < 1e-7).all(), True)
    assert_equal((np.abs(cal_sigma - correct_sigma) < 1e-7).all(), True)