def setUp(self):
    """Build the fixture shared by the clustering tests.

    Reads the contigs from ``generated_contigs_10000_test.fna`` under
    ``data_path``, computes a signature for each one, fits one reference
    centroid per cluster from hard-coded groups of contig indices, and
    stores the default arguments later handed to ``kmeans._clustering``.
    """
    # NOTE(review): presumably the reloads reset module/class-level state
    # (such as the k-mer hash) between tests -- confirm.
    reload(dna)
    reload(kmeans)
    dna.DNA.generate_kmer_hash(6)
    sys.stderr.write("%s\n" % (dna.DNA.kmer_hash_count,))

    self.cluster_count = 7

    fasta_path = os.path.join(data_path, "generated_contigs_10000_test.fna")
    fh = fileinput.input(fasta_path)
    try:
        records = list(SeqIO.parse(fh, "fasta"))
    finally:
        # fileinput.input() registers module-global state; closing it
        # releases the file and guarantees the next setUp can call
        # fileinput.input() again even if parsing stopped early.
        fh.close()

    # Build every contig first, then compute the signatures, in that order.
    self.contigs = [dna.DNA(rec.id, rec.seq.tostring()) for rec in records]
    for contig in self.contigs:
        contig.calculate_signature()

    # Cluster index -> indices (into self.contigs) of its member contigs.
    members_by_cluster = {
        0: [0, 1, 2],
        1: [3, 4, 5],
        2: [6, 7, 8],
        3: [9, 10, 11, 12, 13],
        4: [14, 15, 16, 17, 18],
        5: [19, 20, 21, 22, 23],
        6: [24, 25, 26, 27, 28],
    }
    correct_centroids = np.zeros(
        (self.cluster_count, dna.DNA.kmer_hash_count))
    for i in range(self.cluster_count):
        correct_centroids[i, :] = multinomial.fit_nonzero_parameters(
            [self.contigs[j] for j in members_by_cluster[i]])
    self.correct_centroids = correct_centroids
    self.correct_clusters = kmeans._expectation(
        self.contigs, multinomial.log_probabilities, self.correct_centroids)

    # Fixed seed for the random state passed to the centroid generator.
    self.rs = np.random.RandomState(seed=1)
    self.params = {"contigs": self.contigs}
    self.max_iter = 100
    self.run = 3
    self.epsilon = 0.001
    self.verbose = False
def test_cluster_perfect_center(self):
    """Clustering from given centroids must reproduce their own score.

    Only the first two centroid rows are fitted (contigs 0-1 and 2-3);
    the remaining rows stay all-zero. The score returned by
    ``kmeans._clustering`` is compared with the score of the assignment
    that ``kmeans._expectation`` derives from those same centroids.
    """
    log_prob = multinomial.log_probabilities
    fit = multinomial.fit_nonzero_parameters

    start_centroids = np.zeros((self.cluster_count, dna.DNA.kmer_hash_count))
    start_centroids[0, :] = fit([self.contigs[0], self.contigs[1]])
    start_centroids[1, :] = fit([self.contigs[2], self.contigs[3]])

    # Reference assignment, taken before the clustering run.
    reference_clusters = kmeans._expectation(
        self.contigs, log_prob, start_centroids)

    self.params["centroids"] = start_centroids
    (_clusters, observed_prob, _new_centroids) = kmeans._clustering(
        self.cluster_count,
        self.max_iter,
        self.run,
        self.epsilon,
        self.verbose,
        log_prob,
        fit,
        **self.params)

    # Evaluated after the run, exactly as in the original assertion.
    expected_prob = kmeans._evaluate_clustering(
        log_prob, reference_clusters, start_centroids)
    assert_equal(expected_prob, observed_prob)
def test_generate_kplusplus_centroids(self):
    """Check the generated centroids, then score a fixed-start clustering.

    First part: ``kmeans._generate_kplusplus`` must return
    ``cluster_count`` centroids of length ``dna.DNA.kmer_hash_count``
    whose rows each sum to 1.

    Second part: clustering is started from centroids where only rows 0
    and 1 are fitted (contigs 0-1 and 2-3), and the resulting score must
    match one of four previously recorded values.
    """
    centroids = kmeans._generate_kplusplus(
        self.contigs,
        multinomial.log_probabilities,
        multinomial.fit_nonzero_parameters,
        self.cluster_count,
        dna.DNA.kmer_hash_count,
        self.rs)
    assert_equal(len(centroids), self.cluster_count)
    assert_equal(len(centroids[0]), dna.DNA.kmer_hash_count)
    # The former check, ``np.sum(...).all() == 1``, only verified that no
    # row sum was zero; compare every row sum with 1 instead.
    np.testing.assert_allclose(np.sum(centroids, axis=1), 1.0)

    correct_centroids = np.zeros(
        (self.cluster_count, dna.DNA.kmer_hash_count))
    correct_centroids[0, :] = multinomial.fit_nonzero_parameters(
        [self.contigs[0], self.contigs[1]])
    correct_centroids[1, :] = multinomial.fit_nonzero_parameters(
        [self.contigs[2], self.contigs[3]])
    correct_clusters = kmeans._expectation(
        self.contigs, multinomial.log_probabilities, correct_centroids)
    # NOTE(review): computed but never asserted against -- confirm whether
    # it was meant to replace the hard-coded values below.
    correct_clust_prob = kmeans._evaluate_clustering(
        multinomial.log_probabilities, correct_clusters, correct_centroids)

    self.params["centroids"] = correct_centroids
    (clusters, clust_prob, new_centroids) = kmeans._clustering(
        self.cluster_count,
        self.max_iter,
        self.run,
        self.epsilon,
        self.verbose,
        multinomial.log_probabilities,
        multinomial.fit_nonzero_parameters,
        **self.params)
    print(clust_prob)

    # Passes when the score coincides with any one of the recorded values.
    recorded_scores = np.array([-1659.9510320847476, -1652.322663414292,
                                -1658.28785337, -1665.52431153])
    assert_almost_equal(0, min(np.abs(clust_prob - recorded_scores)))