Примеры использования MinHash.CountEstimator на Python

Язык программирования: Python

Пространство имен/Пакет: CMash

Класс/Тип: MinHash

Метод/Функция: CountEstimator

Примеров на hotexamples.com: 7

Python MinHash.CountEstimator — найдено 7 примеров. Это лучшие примеры кода на Python для CMash.MinHash.CountEstimator, взятые из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество подборки.

Основные методы

Показать / Скрыть

import_multiple_from_single_hdf5(8)

CountEstimator(7)

export_multiple_to_single_hdf5(5)

_yield_overlaps(3)

Пример #1

Показать файл

Файл: Kmer_estimate.py Проект: KoslickiLab/CmashKmerAbundance

def get_MH_data(n, k, genome_file, rev_comp=False):
    '''
    Sketch a genome file and summarise how often each count value occurs.

    A ``MH.CountEstimator`` is built for ``genome_file`` and its ``_counts``
    are tallied. Only strictly positive entries are considered; each distinct
    count value is mapped to the fraction of those entries that carry it.

    :param n: forwarded to ``CountEstimator`` as ``n``
    :param k: kmer size
    :param genome_file: fasta format
    :param rev_comp: forwarded unchanged to ``CountEstimator``
    :return: dict mapping count value -> share of the positive sketch entries
             (empty when no entry is positive)
    '''
    sketch = MH.CountEstimator(n=n,
                               ksize=k,
                               save_kmers='n',
                               input_file_name=genome_file,
                               rev_comp=rev_comp)
    # Tally how many sketch entries share each positive count value.
    occurrences = {}
    for value in sketch._counts:
        if not value > 0:
            continue
        occurrences[value] = occurrences.get(value, 0) + 1
    # Normalise the tallies so that they sum to one.
    total = sum(occurrences.values())
    return {value: hits / total for value, hits in occurrences.items()}

Пример #2

Показать файл

Файл: MakeStreamingDNADatabase.py Проект: slebras/CMash

def make_minhash(genome, max_h, prime, ksize):
    """Build a k-mer-saving MinHash sketch of ``genome`` and annotate it.

    :param genome: path of the sequence file to sketch
    :param max_h: forwarded to ``CountEstimator`` as ``n``
    :param prime: forwarded to ``CountEstimator`` as ``max_prime``
    :param ksize: k-mer size, used for both the sketch and the HLL counter
    :return: the ``CountEstimator`` with ``_true_num_kmers`` set to the
             HyperLogLog cardinality estimate and ``input_file_name`` set
             to ``genome``
    """
    # Reverse complements are deliberately off: the query side handles them.
    sketch = MH.CountEstimator(
        n=max_h,
        max_prime=prime,
        ksize=ksize,
        save_kmers='y',
        input_file_name=genome,
        rev_comp=False,
    )
    # An approximate k-mer total is sufficient, so HyperLogLog replaces an
    # exact count (0.01 is the first HLLCounter argument -- presumably the
    # error rate; confirm against the khmer docs).
    cardinality_counter = khmer.HLLCounter(0.01, ksize)
    cardinality_counter.consume_seqfile(genome)
    sketch._true_num_kmers = cardinality_counter.estimate_cardinality()
    sketch.input_file_name = genome
    return sketch

Пример #3

Показать файл

Файл: Kmer_estimate.py Проект: KoslickiLab/CmashKmerAbundance

def quick_dump(k_list, n, input_file):
    """Pickle the MinHash ``_counts`` of ``input_file`` for several k-mer sizes.

    For every k-mer size in ``k_list`` a ``MH.CountEstimator`` is built
    (``save_kmers='n'``, ``rev_comp=False``) and its ``_counts`` are written
    to ``k<k>n<n><input_file>.pickle``. Each file name is printed before the
    sketch is computed.

    :param k_list: iterable of k-mer sizes
    :param n: forwarded to ``CountEstimator`` as ``n``; also part of the file name
    :param input_file: sequence file to sketch; its name is embedded verbatim
                       in the output file name
    :return: None
    """
    for ksize in k_list:
        target = ''.join(('k', str(ksize), 'n', str(n))) + input_file + '.pickle'
        print(target)
        sketch = MH.CountEstimator(
            n=n,
            ksize=ksize,
            save_kmers='n',
            input_file_name=input_file,
            rev_comp=False,
        )
        sketch_counts = sketch._counts
        with open(target, 'wb') as handle:
            pickle.dump(sketch_counts, handle)

Пример #4

Показать файл

Файл: Kmer_estimate.py Проект: KoslickiLab/CmashKmerAbundance

def quicker_dump(input_file):
    """Pickle the MinHash ``_counts`` of ``input_file`` for k = 25, 50 and 75.

    Each sketch is built with ``n=10000``, ``save_kmers='n'`` and
    ``rev_comp=False``; its ``_counts`` are written to
    ``k<k>n10000<input_file>.pickle``. Each file name is printed before the
    sketch is computed.

    :param input_file: sequence file to sketch; its name is embedded verbatim
                       in the output file name
    :return: None
    """
    sketch_size = 10000
    for ksize in (25, 50, 75):
        target = 'k%dn10000' % ksize + input_file + '.pickle'
        print(target)
        sketch = MH.CountEstimator(
            n=sketch_size,
            ksize=ksize,
            save_kmers='n',
            input_file_name=input_file,
            rev_comp=False,
        )
        sketch_counts = sketch._counts
        with open(target, 'wb') as handle:
            pickle.dump(sketch_counts, handle)

Пример #5

Показать файл

Файл: Kmer_estimate.py Проект: KoslickiLab/CmashKmerAbundance

def kmc_cmash_compare(k, n, input_file):
    """Plot and compare the KMC count distribution with its MinHash estimate.

    The reference distribution comes from ``get_kmc_data`` and the estimate
    from the ``_counts`` of a ``MH.CountEstimator`` passed through
    ``get_count_dict``. Each distribution is drawn as a bar plot and saved
    (``quicklook_real.png`` for the reference, ``quicklook.png`` for the
    estimate). Finally the sum of the estimated distribution, the raw counts
    and the ``'wasserstein'`` result of ``get_distance`` are printed.

    :param k: k-mer size
    :param n: forwarded to ``CountEstimator`` as ``n``
    :param input_file: sequence file analysed by both methods
    :return: None
    """
    kmc_normed_dict = get_kmc_data(k, input_file, input_file + '_out', 'out')
    # minhash estimate
    estimator = MH.CountEstimator(n=n,
                                  ksize=k,
                                  save_kmers='n',
                                  input_file_name=input_file)
    real_dist = pd.DataFrame(list(kmc_normed_dict.items()),
                             columns=['kmer_count', 'percentage'])
    sns.barplot(x='kmer_count', y='percentage', data=real_dist)
    plt.savefig('quicklook_real.png')
    # Clear the current figure; otherwise the estimate below is drawn on top
    # of the reference bars and quicklook.png shows both distributions.
    plt.clf()
    counts = estimator._counts
    estimated_normed_dict = get_count_dict(counts)
    # quick look at distribution
    df = pd.DataFrame(list(estimated_normed_dict.items()),
                      columns=['kmer_count', 'percentage'])
    sns.barplot(x='kmer_count', y='percentage', data=df)
    plt.savefig('quicklook.png')
    #####
    print(sum(estimated_normed_dict.values()))
    print(counts)
    print(get_distance(kmc_normed_dict, estimated_normed_dict, 'wasserstein'))

Пример #6

Показать файл

import khmer

# FIXME: could probably do all the data creation, module initialization, and method calling, and then have the tests
# FIXME: just test the data

# Test fixtures: four 56-character sequences and one estimator per sequence.
# First, the TST
seq1 = "ATCGTATGAGTATCGTCGATGCATGCATCGATGCATGCTACGTATCGCATGCATG"
seq2 = "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"
seq3 = "ATATATATATATATATATATATATATATATATATATATATATATATATATATATAT"
seq4 = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
seqs = [seq1, seq2, seq3, seq4]
query_seq = seq3
num_hashes = 5
# All four estimators share one configuration and are created in order
# CE1..CE4, each as an independent instance.
CE1, CE2, CE3, CE4 = (
    MH.CountEstimator(n=num_hashes,
                      max_prime=9999999999971,
                      ksize=5,
                      save_kmers='y')
    for _ in range(4)
)
# Only the first two estimators are fed a sequence in this excerpt.
CE1.add_sequence(seq1)
CE2.add_sequence(seq2)

Пример #7

Показать файл

def _load_cached_counts(counts_dir, sketch_basename, n):
    """Return the first ``n`` cached MinHash counts for this genome/k, or None."""
    prefix = sketch_basename + '.sketch'
    suffix = '.MHScounts.pickle'
    usable_sizes = []
    for name in os.listdir(counts_dir):
        # Match on the full prefix (not a substring) so a cache belonging to
        # another genome whose name merely contains this one is never chosen.
        if not (name.startswith(prefix) and name.endswith(suffix)):
            continue
        size_text = name[len(prefix):-len(suffix)]
        if size_text.isdigit() and int(size_text) >= n:
            usable_sizes.append(int(size_text))
    if not usable_sizes:
        return None
    # The smallest sufficiently large sketch is the cheapest one to load.
    cached_path = counts_dir + prefix + str(min(usable_sizes)) + suffix
    try:
        # NOTE(review): pickle executes arbitrary code on load; the cache
        # directory must only ever contain files written by this module.
        with open(cached_path, 'rb') as cached_file:
            return pickle.load(cached_file)[:n]
    except FileNotFoundError:
        # The file vanished between listing and opening; recompute instead.
        return None


def _count_histogram(counts):
    """Turn per-k-mer counts into (occurrences per count value, normalised)."""
    # Zero entries carry no abundance information. Indexing them as
    # dist[0 - 1] would silently increment the last bin.
    positive_counts = [c for c in counts if c > 0]
    if not positive_counts:
        raise ValueError('sketch contains no positive k-mer counts')
    dist = np.zeros(max(positive_counts))
    for c in positive_counts:
        # Bin i holds the number of sketch entries whose count is i + 1.
        dist[c - 1] += 1
    return dist, dist / np.sum(dist)


def k_mer_sketch_histogram(n, k, genome, rev_comp=False):
    """Return the k-mer abundance histogram of a MinHash sketch, with caching.

    Results are cached as pickles under ``kmc_global_count/`` next to this
    file. Raw sketch counts are cached under ``kmc_global_count/MH_counts/``;
    a cached sketch of size >= ``n`` is reused by taking its first ``n``
    counts, otherwise a new ``MH.CountEstimator`` is computed. Both
    directories are created when missing.

    :param n: sketch size (number of hashes); converted with ``int``
    :param k: k-mer size; converted with ``int``
    :param genome: path to the genome file, fasta(.gz)
    :param rev_comp: forwarded to ``CountEstimator`` when a sketch is computed
    :return: ``(dist, dist_norm)`` -- ``dist[i]`` is the number of sketch
             entries with count ``i + 1`` and ``dist_norm`` is ``dist``
             divided by its sum
    :raises ValueError: if the sketch has no positive count
    """
    n = int(n)
    k = int(k)
    KMC_outname = genome.split('/')[-1] + '.ksize' + str(k) + '.res'
    outpath = os.path.dirname(
        os.path.realpath(__file__)) + '/kmc_global_count/'
    counts_dir = outpath + 'MH_counts/'
    histogram_path = outpath + KMC_outname + '.sketch' + str(n) + '.pickle'

    # Fast path: the histogram for exactly this (genome, k, n) is cached.
    if os.path.isfile(histogram_path):
        # NOTE(review): see the pickle caveat in _load_cached_counts.
        with open(histogram_path, 'rb') as config_sketch_file:
            dist, dist_norm = pickle.load(config_sketch_file)
        return dist, dist_norm

    # A first run on a fresh checkout has no cache directories yet.
    os.makedirs(counts_dir, exist_ok=True)

    counts = _load_cached_counts(counts_dir, KMC_outname, n)
    if counts is None:
        MHS = MH.CountEstimator(n=n,
                                ksize=k,
                                save_kmers='n',
                                input_file_name=genome,
                                rev_comp=rev_comp)
        counts = MHS._counts

    # Save the counts for this exact sketch size unless already present.
    MHS_count_name = counts_dir + KMC_outname + '.sketch' + str(
        n) + '.MHScounts.pickle'
    if not os.path.isfile(MHS_count_name):
        with open(MHS_count_name, 'wb') as MHS_sketch_count_file:
            pickle.dump(counts, MHS_sketch_count_file)

    dist, dist_norm = _count_histogram(counts)
    with open(histogram_path, 'wb') as config_sketch_file:
        pickle.dump([dist, dist_norm], config_sketch_file)
    return dist, dist_norm