Пример #1
0
 def test_cluster_perfect_center(self):
     """Check the score of a clustering run that is handed reference centroids.

     Two reference centroids are fitted from contig pairs (0, 1) and (2, 3),
     the contigs are assigned to them with ``kmeans._expectation``, and the
     score of that assignment is compared with the score returned by
     ``kmeans._clustering`` when the same centroids are passed in through
     ``self.params["centroids"]``.
     """
     # Rows beyond the first two stay all-zero; only two centroids are fitted.
     centroids = np.zeros((self.cluster_count, dna.DNA.kmer_hash_count))
     centroids[0, :] = multinomial.fit_nonzero_parameters([self.contigs[0], self.contigs[1]])
     centroids[1, :] = multinomial.fit_nonzero_parameters([self.contigs[2], self.contigs[3]])
     correct_clusters = kmeans._expectation(self.contigs, multinomial.log_probabilities, centroids)

     # Forwarded to _clustering via **self.params (presumably used as the
     # starting centroids -- confirm against kmeans._clustering).
     self.params["centroids"] = centroids
     (clusters, clust_prob, new_centroids) = kmeans._clustering(
         self.cluster_count,
         self.max_iter,
         self.run,
         self.epsilon,
         self.verbose,
         multinomial.log_probabilities,
         multinomial.fit_nonzero_parameters,
         **self.params
     )
     expected_prob = kmeans._evaluate_clustering(multinomial.log_probabilities, correct_clusters, centroids)
     assert_equal(expected_prob, clust_prob)
Пример #2
0
 def test_generate_kplusplus_centroids(self):
     """Check the k-means++ centroid generator and a clustering run seeded by hand.

     First part: ``kmeans._generate_kplusplus`` must return
     ``self.cluster_count`` centroids of length ``dna.DNA.kmer_hash_count``
     whose entries sum to 1 per centroid.

     Second part: a clustering run that is handed two hand-fitted centroids
     must end with a score matching one of the recorded reference values.
     """
     centroids = kmeans._generate_kplusplus(
         self.contigs,
         multinomial.log_probabilities,
         multinomial.fit_nonzero_parameters,
         self.cluster_count,
         dna.DNA.kmer_hash_count,
         self.rs,
     )
     assert_equal(len(centroids), self.cluster_count)
     assert_equal(len(centroids[0]), dna.DNA.kmer_hash_count)
     # Every centroid must be a normalized distribution. The previous check,
     # ``np.sum(...).all() == 1``, only verified that the row sums were
     # non-zero because ``True == 1``.
     assert_equal(np.allclose(np.sum(centroids, axis=1), 1), True)

     correct_centroids = np.zeros((self.cluster_count, dna.DNA.kmer_hash_count))
     correct_centroids[0, :] = multinomial.fit_nonzero_parameters([self.contigs[0], self.contigs[1]])
     correct_centroids[1, :] = multinomial.fit_nonzero_parameters([self.contigs[2], self.contigs[3]])
     correct_clusters = kmeans._expectation(self.contigs, multinomial.log_probabilities, correct_centroids)
     # NOTE(review): correct_clust_prob is computed but never asserted on.
     correct_clust_prob = kmeans._evaluate_clustering(
         multinomial.log_probabilities, correct_clusters, correct_centroids
     )

     self.params["centroids"] = correct_centroids
     (clusters, clust_prob, new_centroids) = kmeans._clustering(
         self.cluster_count,
         self.max_iter,
         self.run,
         self.epsilon,
         self.verbose,
         multinomial.log_probabilities,
         multinomial.fit_nonzero_parameters,
         **self.params
     )
     # Parenthesized form prints the same text on Python 2 and Python 3.
     print(clust_prob)
     # The run passes if its score is (almost) equal to any one of the
     # recorded values below.
     accepted_scores = np.array(
         [-1659.9510320847476, -1652.322663414292, -1658.28785337, -1665.52431153]
     )
     assert_almost_equal(0, min(np.abs(clust_prob - accepted_scores)))
    def setUp(self):
        """Load the test contigs and build the reference centroids and clusters.

        Reloads the ``dna`` and ``kmeans`` modules, generates the k-mer hash
        for k-mer length 6, parses ``generated_contigs_10000_test.fna`` into
        ``dna.DNA`` objects with computed signatures, fits one reference
        centroid per group of contig indices, and stores the clustering
        parameters used by the tests on ``self``.
        """
        # NOTE(review): bare ``reload`` is the Python 2 builtin; on Python 3
        # it has to come from ``importlib`` -- confirm the file's imports.
        reload(dna)
        reload(kmeans)
        dna.DNA.generate_kmer_hash(6)
        sys.stderr.write("%s\n" % (dna.DNA.kmer_hash_count,))
        self.cluster_count = 7

        fh = fileinput.input(os.path.join(data_path, "generated_contigs_10000_test.fna"))
        try:
            # Materialize the records before closing: the parser reads from fh.
            seqs = list(SeqIO.parse(fh, "fasta"))
        finally:
            # Release the file handle even if parsing fails.
            fh.close()

        self.contigs = [dna.DNA(seq.id, seq.seq.tostring()) for seq in seqs]
        for contig in self.contigs:
            contig.calculate_signature()

        # Reference grouping: centroid row index -> indices into self.contigs.
        contigs_hash = {
            0: [0, 1, 2],
            1: [3, 4, 5],
            2: [6, 7, 8],
            3: [9, 10, 11, 12, 13],
            4: [14, 15, 16, 17, 18],
            5: [19, 20, 21, 22, 23],
            6: [24, 25, 26, 27, 28],
        }

        correct_centroids = np.zeros((self.cluster_count, dna.DNA.kmer_hash_count))
        for i in range(self.cluster_count):
            members = [self.contigs[j] for j in contigs_hash[i]]
            correct_centroids[i, :] = multinomial.fit_nonzero_parameters(members)

        self.correct_centroids = correct_centroids
        self.correct_clusters = kmeans._expectation(
            self.contigs, multinomial.log_probabilities, self.correct_centroids
        )
        # Fixed seed so runs that draw from self.rs are reproducible.
        self.rs = np.random.RandomState(seed=1)
        self.params = {"contigs": self.contigs}
        self.max_iter = 100
        self.run = 3
        self.epsilon = 0.001
        self.verbose = False
Пример #4
0
 def test_fit_nonzero_parameters(self):
     """Fit a single "AAAA" contig and compare with the expected distribution.

     The expected distribution is 136 ones with index 0 raised to 2,
     normalized to sum to 1, and must match the fitted one exactly.
     """
     contig = dna.DNA(id="hej", seq="AAAA")
     contig.calculate_signature()
     fitted = ml.fit_nonzero_parameters([contig])

     # presumably 136 is the k-mer hash size used by this test class and
     # index 0 is the slot for "AAAA" -- confirm against the class setUp.
     expected = np.ones(136)
     expected[0] = 2
     expected /= np.sum(expected)

     values_match = (expected == fitted).all()
     assert_equal(values_match, True)
Пример #5
0
 def test_fit_nonzero_parameters_multiple_contigs(self):
     """Fit two identical "AAAA" contigs together and compare with the expected distribution.

     The expected distribution is 136 ones with index 0 raised to 3,
     normalized to sum to 1; both the values and the shape must match.
     """
     first = dna.DNA(id="hej", seq="AAAA", calc_sign=True)
     second = dna.DNA(id="ja", seq="AAAA", calc_sign=True)
     fitted = ml.fit_nonzero_parameters([first, second])

     # presumably index 0 is the slot both contigs contribute to -- confirm.
     expected = np.ones(136)
     expected[0] = 3
     expected /= np.sum(expected)

     values_match = (expected == fitted).all()
     assert_equal(values_match, True)
     assert_equal(expected.shape, fitted.shape)
def main(contigs_file, taxonomy_file, dir_path, kmer_length, dir_structure, taxonomy_info_in_contigs):
    """Score every contig against every genome and print the result table.

    For each (contig, genome) pair a ``Score`` is built from the multinomial
    log probability of the contig under the genome's fitted parameters. When
    the contig and genome ids are equal, the contig's signature is first
    subtracted from a copy of the genome's signature so the contig is not
    scored against parameters that include its own counts.

    Output goes to stdout: one tab-separated header line (its columns depend
    on ``taxonomy_info_in_contigs``) followed by one line per score.

    Args:
        contigs_file: passed to ``read_contigs_file``.
        taxonomy_file: passed to ``genome_info_from_parsed_taxonomy_file``.
        dir_path: passed to ``read_FASTA_files_no_groups``.
        kmer_length: k-mer length handed to ``DNA.generate_kmer_hash``.
        dir_structure: passed to ``read_FASTA_files_no_groups``.
        taxonomy_info_in_contigs: selects the header layout and is forwarded
            to ``read_contigs_file`` and ``Score``.
    """
    DNA.generate_kmer_hash(kmer_length)

    contigs = read_contigs_file(contigs_file, taxonomy_info=taxonomy_info_in_contigs)

    # Load genome metadata from the parsed taxonomy file
    meta_genomes = genome_info_from_parsed_taxonomy_file(taxonomy_file)

    # Fetch sequence for each genome
    genomes = read_FASTA_files_no_groups(meta_genomes, dir_path, dir_structure=dir_structure)

    for genome in genomes:
        genome.calculate_signature()
        genome.pseudo_par = mn.fit_nonzero_parameters([genome])

    scores = []
    for contig in contigs:
        contig.calculate_signature()
        for genome in genomes:
            if contig.id == genome.id:
                # Work on a copy so the genome's own signature stays intact
                # for the remaining contigs.
                temp_genome = deepcopy(genome)
                temp_genome.signature.subtract(contig.signature)
                temp_pseudo_par = mn.fit_nonzero_parameters([temp_genome])
                p_val = mn.log_probability(contig, temp_pseudo_par)
            else:
                p_val = mn.log_probability(contig, genome.pseudo_par)
            scores.append(Score(p_val, contig, genome, contig.contig_id, taxonomy_info=taxonomy_info_in_contigs))

    if taxonomy_info_in_contigs:
        header = "p_value\tcontig_family\tcontig_genus\tcontig_species\tcontig_genome\tcompare_family\tcompare_genus\tcompare_species\tcompare_genome\tcontig_id"
    else:
        header = "p_value\t\tcontig_genome\tcompare_family\tcompare_genus\tcompare_species\tcompare_genome\tcontig_id"

    # Use "\n" like the score rows below: a text-mode stream translates it to
    # the platform line ending itself, so writing os.linesep would produce a
    # doubled carriage return on Windows.
    sys.stdout.write(header + "\n")
    for score in scores:
        sys.stdout.write(str(score) + "\n")
Пример #7
0
def fit_nonzero_parameters(dna_l,cov_matrix=None,expected_clustering=None,**kwargs):
    """Fit the multinomial parameters and, when given coverage data, the Gaussian ones.

    Args:
        dna_l: contigs handed to ``multinomial.fit_nonzero_parameters``.
        cov_matrix: optional matrix handed to
            ``isotropic_gaussian.fit_nonzero_parameters``; its first
            dimension must equal ``len(dna_l)``.
        expected_clustering: forwarded unchanged to both fitting functions.
        **kwargs: accepted and ignored.

    Returns:
        A 3-tuple ``(par_mul, par_ig[0], par_ig[1])``: the multinomial fit
        followed by the first two items of the Gaussian fit, or
        ``(par_mul, None, None)`` when ``cov_matrix`` is None.

    Exits the process with status -1 (after writing an error to stderr) when
    ``cov_matrix`` is given and its row count differs from ``len(dna_l)``.
    """
    if cov_matrix is not None:
        if len(dna_l) != cov_matrix.shape[0]:
            sys.stderr.write("ERROR! Different numbers of contigs in fit nonzero parameters in simple add model!\n")
            sys.exit(-1)
        par_ig = isotropic_gaussian.fit_nonzero_parameters(
            cov_matrix,
            expected_clustering=expected_clustering)
    else:
        par_ig = (None, None)

    # The multinomial fit is the same in both cases; it runs after the
    # Gaussian fit, as before.
    par_mul = multinomial.fit_nonzero_parameters(
        dna_l,
        expected_clustering=expected_clustering)

    return (par_mul, par_ig[0], par_ig[1])