def get_MH_data(n, k, genome_file, rev_comp=False):
    '''
    Sketch a genome with ``MH.CountEstimator`` and return the normalized
    distribution of the positive entries of its ``_counts`` array.

    :param n: forwarded to ``MH.CountEstimator`` as ``n``
        (presumably the sketch size / number of hashes -- confirm against MH)
    :param k: kmer size
    :param genome_file: fasta format
    :param rev_comp: forwarded to ``MH.CountEstimator`` as ``rev_comp``
    :return: dict mapping every count value > 0 found in the sketch to the
        fraction of positive sketch entries that carry that value; empty if
        the sketch has no positive entry
    '''
    from collections import Counter

    estimator = MH.CountEstimator(n=n,
                                  ksize=k,
                                  save_kmers='n',
                                  input_file_name=genome_file,
                                  rev_comp=rev_comp)
    # Entries that are not > 0 are skipped so they do not dilute the
    # normalized distribution.
    count_dict = Counter(count for count in estimator._counts if count > 0)
    total_count = sum(count_dict.values())
    # An empty Counter yields an empty dict, so total_count == 0 never divides.
    return {
        count: occurrences / total_count
        for count, occurrences in count_dict.items()
    }
Пример #2
0
def make_minhash(genome, max_h, prime, ksize):
    '''
    Build a k-mer-saving ``MH.CountEstimator`` for a genome and attach an
    estimated k-mer count to it.

    :param genome: path handed to both the estimator (``input_file_name``)
        and the HLL counter (``consume_seqfile``)
    :param max_h: forwarded to the estimator as ``n``
    :param prime: forwarded to the estimator as ``max_prime``
    :param ksize: kmer size, used by both the estimator and the HLL counter
    :return: the estimator, with ``_true_num_kmers`` set to the HLL
        cardinality estimate and ``input_file_name`` set to ``genome``
    '''
    # rev_comp stays off: the query automatically takes care of rev_comp's.
    sketch = MH.CountEstimator(n=max_h,
                               max_prime=prime,
                               ksize=ksize,
                               save_kmers='y',
                               input_file_name=genome,
                               rev_comp=False)

    # Just use HLL to estimate the number of kmers, no need for an exact count.
    # NOTE(review): 0.01 is presumably the HLL error rate -- confirm in khmer.
    cardinality_counter = khmer.HLLCounter(0.01, ksize)
    cardinality_counter.consume_seqfile(genome)

    sketch._true_num_kmers = cardinality_counter.estimate_cardinality()
    sketch.input_file_name = genome
    return sketch
def quick_dump(k_list, n, input_file):
    '''
    Pickle the ``_counts`` array of an ``MH.CountEstimator`` once per kmer
    size.

    For every ``k`` in ``k_list`` the output file name
    ``'k' + str(k) + 'n' + str(n) + input_file + '.pickle'`` is printed, an
    estimator is built on ``input_file`` and its ``_counts`` are pickled to
    that file name.

    :param k_list: iterable of kmer sizes
    :param n: forwarded to ``MH.CountEstimator`` as ``n``
    :param input_file: path of the input file; also embedded in the output
        file name
    :return: None
    '''
    for ksize in k_list:
        out_name = ''.join(['k', str(ksize), 'n', str(n), input_file, '.pickle'])
        print(out_name)
        sketch = MH.CountEstimator(n=n,
                                   ksize=ksize,
                                   save_kmers='n',
                                   input_file_name=input_file,
                                   rev_comp=False)
        # Fetch the counts before opening so a failure leaves no empty file.
        sketch_counts = sketch._counts
        with open(out_name, 'wb') as handle:
            pickle.dump(sketch_counts, handle)
def quicker_dump(input_file):
    '''
    Pickle the MinHash ``_counts`` of ``input_file`` for the fixed settings
    n = 10000 and k in (25, 50, 75).

    This is ``quick_dump`` with preset arguments; delegating keeps the two
    from drifting apart. The output file names are unchanged:
    ``'k' + str(k) + 'n10000' + input_file + '.pickle'``.

    :param input_file: path of the input file; also embedded in the output
        file names
    :return: None
    '''
    quick_dump([25, 50, 75], 10000, input_file)
def _save_count_barplot(normed_dict, png_name):
    # Draw one kmer_count/percentage bar plot on its own figure and save it.
    frame = pd.DataFrame(list(normed_dict.items()),
                         columns=['kmer_count', 'percentage'])
    # A dedicated figure keeps this plot from being drawn on top of whatever
    # is already on pyplot's current axes.
    fig, ax = plt.subplots()
    sns.barplot(x='kmer_count', y='percentage', data=frame, ax=ax)
    fig.savefig(png_name)
    plt.close(fig)


def kmc_cmash_compare(k, n, input_file):
    '''
    Compare the k-mer count distribution returned by ``get_kmc_data`` with
    the one derived from an ``MH.CountEstimator`` sketch of the same file.

    Side effects: writes ``quicklook_real.png`` (``get_kmc_data``
    distribution) and ``quicklook.png`` (sketch distribution), then prints
    the sum of the sketch distribution, the raw sketch counts and the result
    of ``get_distance(..., 'wasserstein')`` between the two distributions.

    Each plot is drawn on a fresh figure; previously the second bar plot was
    drawn on the same axes as the first, so ``quicklook.png`` showed both.

    :param k: kmer size
    :param n: forwarded to ``MH.CountEstimator`` as ``n``
    :param input_file: path of the input file; ``input_file + '_out'`` and
        ``'out'`` are also passed to ``get_kmc_data``
    :return: None
    '''
    kmc_normed_dict = get_kmc_data(k, input_file, input_file + '_out', 'out')
    # minhash estimate
    estimator = MH.CountEstimator(n=n,
                                  ksize=k,
                                  save_kmers='n',
                                  input_file_name=input_file)
    _save_count_barplot(kmc_normed_dict, 'quicklook_real.png')

    counts = estimator._counts
    estimated_normed_dict = get_count_dict(counts)
    # quick look at distribution
    _save_count_barplot(estimated_normed_dict, 'quicklook.png')

    print(sum(estimated_normed_dict.values()))
    print(counts)
    print(get_distance(kmc_normed_dict, estimated_normed_dict, 'wasserstein'))
Пример #6
0
import khmer

# FIXME: could probably do all the data creation, module initialization, and method calling, and then have the tests
# FIXME: just test the data

# Test fixtures.
# First, the TST
# seq1 is a mixed sequence; seq2 and seq4 are single-base runs (T and A);
# seq3 alternates A and T and doubles as the query sequence.
seq1 = "ATCGTATGAGTATCGTCGATGCATGCATCGATGCATGCTACGTATCGCATGCATG"
seq2 = "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"
seq3 = "ATATATATATATATATATATATATATATATATATATATATATATATATATATATAT"
seq4 = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
seqs = [seq1, seq2, seq3, seq4]
query_seq = seq3
num_hashes = 5

# Four identically configured estimators. A generator expression is used so
# that no loop variable is left behind in the module namespace.
CE1, CE2, CE3, CE4 = (
    MH.CountEstimator(n=num_hashes,
                      max_prime=9999999999971,
                      ksize=5,
                      save_kmers='y')
    for _ in range(4)
)
CE1.add_sequence(seq1)
CE2.add_sequence(seq2)
Пример #7
0
def _cached_sketch_counts(mh_dir, sketch_prefix, n):
    # Return the first n counts of the smallest cached sketch of size >= n, or None.
    suffix = '.MHScounts.pickle'
    sizes = []
    for fname in os.listdir(mh_dir):
        # Exact prefix/suffix match so that sketches of other genomes whose
        # names merely contain this one are never considered.
        if fname.startswith(sketch_prefix) and fname.endswith(suffix):
            size = fname[len(sketch_prefix):-len(suffix)]
            if size.isdigit():
                sizes.append(int(size))
    usable = [size for size in sizes if size >= n]
    if not usable:
        return None
    path = os.path.join(mh_dir, sketch_prefix + str(min(usable)) + suffix)
    try:
        # NOTE: pickle is only safe here because these are cache files this
        # function wrote itself; never point the cache at untrusted data.
        with open(path, 'rb') as sketch_file:
            return pickle.load(sketch_file)[:n]
    except FileNotFoundError:
        # File vanished between listing and opening; caller recomputes.
        return None


def _abundance_histogram(counts):
    # Return (dist, dist_norm) where dist[i] is the number of sketch entries equal to i + 1.
    counts = np.asarray(counts, dtype=np.int64)
    # Entries equal to 0 carry no abundance; indexing dist[0 - 1] with them
    # would wrongly credit the highest-abundance bin.
    positive = counts[counts > 0]
    if positive.size == 0:
        return np.zeros(0), np.zeros(0)
    dist = np.bincount(positive)[1:].astype(float)
    return dist, dist / dist.sum()


def k_mer_sketch_histogram(n, k, genome, rev_comp=False):
    '''
    Return the abundance histogram of a MinHash count sketch of ``genome``,
    using on-disk caches next to this source file.

    Cache layout (under ``<dir of this file>/kmc_global_count/``):

    * ``<name>.ksize<k>.res.sketch<n>.pickle`` -- the final
      ``[dist, dist_norm]`` pair; returned directly when present.
    * ``MH_counts/<name>.ksize<k>.res.sketch<m>.MHScounts.pickle`` -- raw
      sketch counts. If one with ``m >= n`` exists, its first ``n`` counts
      are reused instead of building a new ``MH.CountEstimator``.

    The cache directories are created if missing.

    NOTE(review): ``rev_comp`` is not part of any cache file name, so a
    cached result is returned regardless of the ``rev_comp`` it was built
    with.

    :param n: sketch size (number of hashes); coerced with ``int``
    :param k: kmer size; coerced with ``int``
    :param genome: path of the genome file, fasta(.gz); only its last path
        component is used in cache file names
    :param rev_comp: forwarded to ``MH.CountEstimator`` when a sketch has to
        be computed
    :return: ``(dist, dist_norm)``; ``dist[i]`` is the number of sketch
        entries whose count is ``i + 1`` and ``dist_norm`` is ``dist``
        divided by its sum. Both are empty arrays if the sketch has no
        positive count.
    '''
    n = int(n)
    k = int(k)
    KMC_outname = os.path.basename(genome) + '.ksize' + str(k) + '.res'
    outpath = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                           'kmc_global_count')
    sketch_prefix = KMC_outname + '.sketch'
    histogram_path = os.path.join(outpath, sketch_prefix + str(n) + '.pickle')

    # Fast path: the histogram for this (genome, k, n) was already stored.
    if os.path.isfile(histogram_path):
        with open(histogram_path, 'rb') as config_sketch_file:
            dist, dist_norm = pickle.load(config_sketch_file)
        return dist, dist_norm

    mh_dir = os.path.join(outpath, 'MH_counts')
    os.makedirs(mh_dir, exist_ok=True)

    # Reuse a cached sketch of at least n hashes if there is one, otherwise
    # compute a sketch of exactly n hashes.
    counts = _cached_sketch_counts(mh_dir, sketch_prefix, n)
    if counts is None:
        MHS = MH.CountEstimator(n=n,
                                ksize=k,
                                save_kmers='n',
                                input_file_name=genome,
                                rev_comp=rev_comp)
        counts = MHS._counts

    # Store the size-n counts unless a file for exactly this n already exists.
    MHS_count_name = os.path.join(
        mh_dir, sketch_prefix + str(n) + '.MHScounts.pickle')
    if not os.path.isfile(MHS_count_name):
        with open(MHS_count_name, 'wb') as MHS_sketch_count_file:
            pickle.dump(counts, MHS_sketch_count_file)

    # Turn the array of k-mer counts into occurrences per count value.
    dist, dist_norm = _abundance_histogram(counts)
    with open(histogram_path, 'wb') as config_sketch_file:
        pickle.dump([dist, dist_norm], config_sketch_file)
    return dist, dist_norm  # np.array(list(dist))