def test_cluster_perfect_center(self):
    """Seed the clustering with hand-built centroids and compare scores.

    Centroid 0 is fitted on contigs 0 and 1, centroid 1 on contigs 2 and 3.
    The clustering score returned by ``kmeans._clustering`` when started
    from these centroids must equal the score that
    ``kmeans._evaluate_clustering`` gives for the assignment produced by
    ``kmeans._expectation`` on the same centroids.
    """
    seed_centroids = np.zeros((self.cluster_count, dna.DNA.kmer_hash_count))
    # Row index -> indices of the contigs used to fit that centroid.
    members_per_row = ((0, 1), (2, 3))
    for row, (first, second) in enumerate(members_per_row):
        seed_centroids[row, :] = multinomial.fit_nonzero_parameters(
            [self.contigs[first], self.contigs[second]])

    expected_clusters = kmeans._expectation(
        self.contigs, multinomial.log_probabilities, seed_centroids)

    self.params["centroids"] = seed_centroids
    _, clust_prob, _ = kmeans._clustering(
        self.cluster_count,
        self.max_iter,
        self.run,
        self.epsilon,
        self.verbose,
        multinomial.log_probabilities,
        multinomial.fit_nonzero_parameters,
        **self.params)

    # Evaluated after the clustering run, exactly as the assertion did before.
    expected_prob = kmeans._evaluate_clustering(
        multinomial.log_probabilities, expected_clusters, seed_centroids)
    assert_equal(expected_prob, clust_prob)
def test_generate_kplusplus_centroids(self):
    """Check k-means++ seeding and a clustering run from known centroids.

    First part: ``kmeans._generate_kplusplus`` must return
    ``self.cluster_count`` centroids, each of length
    ``dna.DNA.kmer_hash_count``, and every centroid must sum to 1.

    Second part: clustering is started from centroids fitted on contigs
    (0, 1) and (2, 3); the resulting clustering score must match one of
    the hard-coded reference values.
    """
    centroids = kmeans._generate_kplusplus(
        self.contigs,
        multinomial.log_probabilities,
        multinomial.fit_nonzero_parameters,
        self.cluster_count,
        dna.DNA.kmer_hash_count,
        self.rs)
    assert_equal(len(centroids), self.cluster_count)
    assert_equal(len(centroids[0]), dna.DNA.kmer_hash_count)
    # The previous check, ``np.sum(centroids, axis=1).all() == 1``, only
    # verified that no row sum was zero (True == 1).  Compare the row sums
    # against 1 with a floating point tolerance instead.
    assert_equal(np.allclose(np.sum(centroids, axis=1), 1), True)

    correct_centroids = np.zeros(
        (self.cluster_count, dna.DNA.kmer_hash_count))
    correct_centroids[0, :] = multinomial.fit_nonzero_parameters(
        [self.contigs[0], self.contigs[1]])
    correct_centroids[1, :] = multinomial.fit_nonzero_parameters(
        [self.contigs[2], self.contigs[3]])
    correct_clusters = kmeans._expectation(
        self.contigs, multinomial.log_probabilities, correct_centroids)
    # Computed for parity with the original test; not asserted on below.
    correct_clust_prob = kmeans._evaluate_clustering(
        multinomial.log_probabilities, correct_clusters, correct_centroids)

    self.params["centroids"] = correct_centroids
    (clusters, clust_prob, new_centroids) = kmeans._clustering(
        self.cluster_count,
        self.max_iter,
        self.run,
        self.epsilon,
        self.verbose,
        multinomial.log_probabilities,
        multinomial.fit_nonzero_parameters,
        **self.params)
    # Kept so the obtained score is visible when the assertion below fails.
    print(clust_prob)
    # The score must coincide with at least one of the reference values.
    reference_scores = np.array([-1659.9510320847476, -1652.322663414292,
                                 -1658.28785337, -1665.52431153])
    assert_almost_equal(0, min(np.abs(clust_prob - reference_scores)))
def setUp(self):
    """Build the fixture shared by the clustering tests.

    Reloads ``dna`` and ``kmeans``, generates the k-mer hash for k=6,
    reads every record of ``generated_contigs_10000_test.fna`` into a
    ``dna.DNA`` contig with a computed signature, and fits one reference
    centroid per cluster from a fixed contig-index grouping.

    Attributes set: ``cluster_count``, ``contigs``, ``correct_centroids``,
    ``correct_clusters``, ``rs``, ``params``, ``max_iter``, ``run``,
    ``epsilon`` and ``verbose``.
    """
    reload(dna)
    reload(kmeans)
    dna.DNA.generate_kmer_hash(6)
    sys.stderr.write(str(dna.DNA.kmer_hash_count) + "\n")
    self.cluster_count = 7

    fasta_path = os.path.join(data_path, "generated_contigs_10000_test.fna")
    fh = fileinput.input(fasta_path)
    try:
        records = list(SeqIO.parse(fh, "fasta"))
    finally:
        # Always release the handle: an unfinished fileinput stream would
        # otherwise stay active and make the next fileinput.input() call
        # raise RuntimeError.
        fh.close()

    self.contigs = [dna.DNA(rec.id, rec.seq.tostring()) for rec in records]
    for contig in self.contigs:
        contig.calculate_signature()

    # Cluster index -> indices into self.contigs that define that cluster.
    contigs_hash = {0: [0, 1, 2],
                    1: [3, 4, 5],
                    2: [6, 7, 8],
                    3: [9, 10, 11, 12, 13],
                    4: [14, 15, 16, 17, 18],
                    5: [19, 20, 21, 22, 23],
                    6: [24, 25, 26, 27, 28]}
    correct_centroids = np.zeros(
        (self.cluster_count, dna.DNA.kmer_hash_count))
    for i in range(self.cluster_count):
        members = [self.contigs[j] for j in contigs_hash[i]]
        correct_centroids[i, :] = multinomial.fit_nonzero_parameters(members)

    self.correct_centroids = correct_centroids
    self.correct_clusters = kmeans._expectation(
        self.contigs, multinomial.log_probabilities, self.correct_centroids)

    # Fixed seed so that randomised steps are reproducible between runs.
    self.rs = np.random.RandomState(seed=1)
    self.params = {"contigs": self.contigs}
    self.max_iter = 100
    self.run = 3
    self.epsilon = 0.001
    self.verbose = False
def test_fit_nonzero_parameters(self):
    """Fit parameters on a single ``AAAA`` contig and check the result.

    The expected distribution has 136 entries: weight 2 at index 0 and
    weight 1 everywhere else, normalised to sum to one.  The fitted
    distribution must match it element for element.
    """
    contig = dna.DNA(id="hej", seq="AAAA")
    contig.calculate_signature()
    fitted = ml.fit_nonzero_parameters([contig])

    expected = np.ones(136)
    expected[0] = 2
    expected /= np.sum(expected)

    all_equal = (expected == fitted).all()
    assert_equal(all_equal, True)
def test_fit_nonzero_parameters_multiple_contigs(self):
    """Fit parameters on two identical ``AAAA`` contigs and check the result.

    The expected distribution has 136 entries: weight 3 at index 0 and
    weight 1 everywhere else, normalised to sum to one.  The fitted
    distribution must match it element for element and have the same
    shape.
    """
    # Signatures are requested at construction time via calc_sign=True.
    contig_pair = [
        dna.DNA(id="hej", seq="AAAA", calc_sign=True),
        dna.DNA(id="ja", seq="AAAA", calc_sign=True),
    ]
    fitted = ml.fit_nonzero_parameters(contig_pair)

    expected = np.ones(136)
    expected[0] = 3
    expected /= np.sum(expected)

    all_equal = (expected == fitted).all()
    assert_equal(all_equal, True)
    assert_equal(expected.shape, fitted.shape)
def main(contigs_file, taxonomy_file, dir_path, kmer_length, dir_structure,
         taxonomy_info_in_contigs):
    """Score every contig against every genome and print a TSV table.

    Each genome gets multinomial parameters fitted on its own signature.
    Every contig is then scored with ``mn.log_probability`` against every
    genome, and one ``Score`` per (contig, genome) pair is written to
    standard output after a header line.

    Parameters:
        contigs_file: passed to ``read_contigs_file``.
        taxonomy_file: passed to ``genome_info_from_parsed_taxonomy_file``.
        dir_path: passed to ``read_FASTA_files_no_groups``.
        kmer_length: k-mer length handed to ``DNA.generate_kmer_hash``.
        dir_structure: forwarded to ``read_FASTA_files_no_groups``.
        taxonomy_info_in_contigs: forwarded to ``read_contigs_file`` and
            ``Score``; also selects which header line is printed.
    """
    DNA.generate_kmer_hash(kmer_length)

    contigs = read_contigs_file(contigs_file,
                                taxonomy_info=taxonomy_info_in_contigs)

    # Parse the genome metadata from the taxonomy file.
    meta_genomes = genome_info_from_parsed_taxonomy_file(taxonomy_file)

    # Fetch sequence for each genome
    genomes = read_FASTA_files_no_groups(meta_genomes, dir_path,
                                         dir_structure=dir_structure)

    for genome in genomes:
        genome.calculate_signature()
        genome.pseudo_par = mn.fit_nonzero_parameters([genome])

    scores = []
    for contig in contigs:
        contig.calculate_signature()
        for genome in genomes:
            if contig.id == genome.id:
                # Same id: refit on a copy of the genome with the contig's
                # signature subtracted (presumably so the contig is not
                # scored against parameters that include its own counts).
                temp_genome = deepcopy(genome)
                temp_genome.signature.subtract(contig.signature)
                temp_pseudo_par = mn.fit_nonzero_parameters([temp_genome])
                p_val = mn.log_probability(contig, temp_pseudo_par)
            else:
                p_val = mn.log_probability(contig, genome.pseudo_par)
            scores.append(Score(p_val, contig, genome, contig.contig_id,
                                taxonomy_info=taxonomy_info_in_contigs))

    # Lines are terminated with "\n" throughout: stdout is a text stream
    # and translates newlines itself, so os.linesep must not be written.
    if taxonomy_info_in_contigs:
        sys.stdout.write("p_value\tcontig_family\tcontig_genus\tcontig_species\tcontig_genome\tcompare_family\tcompare_genus\tcompare_species\tcompare_genome\tcontig_id" + "\n")
    else:
        # NOTE(review): this header has an empty second column ("\t\t");
        # confirm it matches what str(Score) emits in this mode.
        sys.stdout.write("p_value\t\tcontig_genome\tcompare_family\tcompare_genus\tcompare_species\tcompare_genome\tcontig_id" + "\n")
    for score in scores:
        sys.stdout.write(str(score) + "\n")
def fit_nonzero_parameters(dna_l, cov_matrix=None, expected_clustering=None,
                           **kwargs):
    """Fit the multinomial part and, if given data, the isotropic Gaussian part.

    Parameters:
        dna_l: sequence handed to ``multinomial.fit_nonzero_parameters``.
        cov_matrix: optional array-like with a ``shape`` attribute, handed
            to ``isotropic_gaussian.fit_nonzero_parameters``.  Its first
            dimension must equal ``len(dna_l)``.
        expected_clustering: forwarded to both fitting functions.
        **kwargs: accepted and ignored.

    Returns:
        A 3-tuple ``(par_mul, par_ig_0, par_ig_1)``: the multinomial
        parameters followed by the first two items of the isotropic
        Gaussian fit.  The last two are ``None`` when ``cov_matrix`` is
        ``None``.

    On a size mismatch between ``dna_l`` and ``cov_matrix`` an error is
    written to stderr and the process exits with status -1.
    """
    if cov_matrix is None:
        gaussian_fit = (None, None)
    else:
        if cov_matrix.shape[0] != len(dna_l):
            sys.stderr.write("ERROR! Different numbers of contigs in fit nonzero parameters in simple add model!\n")
            sys.exit(-1)
        gaussian_fit = isotropic_gaussian.fit_nonzero_parameters(
            cov_matrix, expected_clustering=expected_clustering)

    # The multinomial fit is needed in both cases and runs after the
    # Gaussian fit, as before.
    multinomial_fit = multinomial.fit_nonzero_parameters(
        dna_l, expected_clustering=expected_clustering)

    return (multinomial_fit, gaussian_fit[0], gaussian_fit[1])