def get_kmc_data(k, genome_file, out_file, out_dir, verbose=False):
    """Count k-mers with KMC and return the normalized abundance histogram.

    Runs the ``kmc`` executable on ``genome_file``, writes the database to
    ``<cwd>/<out_dir>/<out_file>``, lists every k-mer in it and tallies how
    many distinct k-mers share each occurrence count.

    Must be run from the CmashKmerAbundance directory (the output path is
    built from the current working directory, which is also used as KMC's
    temporary directory).

    Parameters
    ----------
    k : int
        k-mer length passed to KMC as ``-k<k>``.
    genome_file : str
        Input sequence file (read in multi-FASTA mode, ``-fm``).
    out_file : str
        Base name of the KMC database to create.
    out_dir : str
        Directory, relative to the current working directory, receiving the
        database.
    verbose : bool, default=False
        Currently unused; kept for interface compatibility.

    Returns
    -------
    dict
        Maps an occurrence count (int) to the fraction of distinct k-mers
        having that count. Empty if the database holds no k-mers.

    Raises
    ------
    subprocess.CalledProcessError
        If the ``kmc`` executable exits with a non-zero status.
    RuntimeError
        If the resulting database cannot be opened for listing.
    """
    # Function-scope import so the block does not rely on a file-level import.
    import subprocess

    abs_path = os.getcwd()
    out_path = abs_path + '/' + out_dir + '/' + out_file

    # Argument list instead of a shell string: paths containing spaces or
    # shell metacharacters are passed through verbatim.
    # -cs1000: counter cap, -b: no canonical form, -v: verbose,
    # -fm: multi-FASTA input, -ci2: drop k-mers seen fewer than 2 times.
    subprocess.run(
        ['kmc', '-k%d' % k, '-cs1000', '-b', '-v', '-fm', '-ci2',
         genome_file, out_path, '.'],
        check=True,
    )

    kmc_database = kmc.KMCFile()
    if not kmc_database.OpenForListing(out_path):
        raise RuntimeError('cannot open KMC database: %s' % out_path)
    try:
        kmer_obj = kmc.KmerAPI(kmc_database.Info().kmer_length)
        counter = kmc.Count()
        counter_dict = {}
        while kmc_database.ReadNextKmer(kmer_obj, counter):
            occurrences = int(counter.value)
            counter_dict[occurrences] = counter_dict.get(occurrences, 0) + 1
    finally:
        # Release the database files even if listing fails part-way.
        kmc_database.Close()

    print(len(counter_dict))
    print(counter_dict)

    total_count = sum(counter_dict.values())
    normed_dict = {
        occurrences: n_kmers / total_count
        for occurrences, n_kmers in counter_dict.items()
    }
    print(normed_dict)
    return normed_dict
def test_kmc_file_next_kmer(create_kmc_db):
    """Sequential listing must return exactly the counted k-mers.

    Walks the database with ``ReadNextKmer`` and compares the collected
    ``{k-mer string: count}`` mapping against the fixture's expected one.
    """
    expected = create_kmc_db['kmers']
    database = _open_for_listing()
    count = pka.Count()
    current = pka.KmerAPI(create_kmc_db['kmer_len'])

    observed = {}
    while True:
        if not database.ReadNextKmer(current, count):
            break
        observed[str(current)] = count.value

    assert observed == expected
def k_mer_global_histogram_KMC(
        k, genome, runKMC=False,
        kmc_bin='/storage/home/xbz5174/work/tools/KMC-3.1.1/bin/kmc'):
    """Return the k-mer abundance histogram of ``genome``, computed with KMC.

    Results are cached as ``<this file's dir>/kmc_global_count/
    <genome basename>.ksize<k>.res.global.pickle``. The cache is used when it
    exists and ``runKMC`` is false; otherwise the KMC database is (re)built if
    needed, listed, and the cache is rewritten.

    Parameters
    ----------
    k : int
        k-mer size.
    genome : str
        Path to a FASTA or FASTQ file (optionally compressed). The format is
        guessed from substrings of the path: ``.fastq``/``.fq`` select
        ``-fq``; ``.fasta``/``.fa``/``.fna`` select ``-fm``.
    runKMC : bool, default=False
        Force KMC to run and the cache to be recomputed.
    kmc_bin : str
        Path of the KMC executable. Defaults to the previously hard-coded
        location.

    Returns
    -------
    dist : numpy.ndarray
        ``dist[i]`` is the number of distinct k-mers occurring ``i + 1`` times.
    dist_norm : numpy.ndarray
        ``dist`` divided by its sum.

    Raises
    ------
    SystemExit
        With code 2 when the file type cannot be guessed from ``genome``.
    subprocess.CalledProcessError
        If KMC exits with a non-zero status.
    RuntimeError
        If the KMC database cannot be opened for listing.
    ValueError
        If the database contains no k-mers.
    """
    # Function-scope import so the block does not rely on a file-level import.
    import subprocess

    KMC_outname = genome.split('/')[-1] + '.ksize' + str(k) + '.res'
    outpath = os.path.dirname(os.path.realpath(__file__)) + '/kmc_global_count/'
    db_prefix = outpath + KMC_outname
    cache_file = db_prefix + '.global.pickle'

    # Fast path: reuse the stored result.
    if os.path.isfile(cache_file) and not runKMC:
        # NOTE: pickle executes arbitrary code on load; the cache directory
        # must only ever contain files written by this function.
        with open(cache_file, 'rb') as config_global_file:
            dist, dist_norm = pickle.load(config_global_file)
        return dist, dist_norm

    # Build the KMC database unless a usable one is already on disk.
    if runKMC or not os.path.isfile(db_prefix + '.kmc_pre'):
        if '.fastq' in genome or '.fq' in genome:
            input_flag = '-fq'
        elif '.fasta' in genome or '.fa' in genome or '.fna' in genome:
            input_flag = '-fm'
        else:
            print("Is file fa/fq? Check file and its name!", file=sys.stderr)
            sys.exit(2)

        os.makedirs(outpath, exist_ok=True)
        # -ci0 keeps every k-mer (no minimum count); -cs3000 caps counters.
        # An argument list (no shell) keeps unusual paths intact, and
        # check=True surfaces a failed KMC run instead of reading stale data.
        subprocess.run(
            [kmc_bin, input_flag, '-v', '-ci0', '-b', '-cs3000', '-k%d' % k,
             genome, db_prefix, outpath],
            check=True,
        )

    # Read the KMC database and tally k-mers per occurrence count.
    kmer_data_base = kmc.KMCFile()
    if not kmer_data_base.OpenForListing(db_prefix):
        raise RuntimeError('cannot open KMC database: %s' % db_prefix)
    try:
        kmer_object = kmc.KmerAPI(kmer_data_base.Info().kmer_length)
        counter = kmc.Count()
        counter_dict = {}
        while kmer_data_base.ReadNextKmer(kmer_object, counter):
            occurrences = int(counter.value)
            counter_dict[occurrences] = counter_dict.get(occurrences, 0) + 1
    finally:
        kmer_data_base.Close()

    if not counter_dict:
        raise ValueError('KMC database %s contains no k-mers' % db_prefix)

    # Dense histogram: index i holds the tally for occurrence count i + 1.
    dist = np.zeros(max(counter_dict))
    for occurrences, n_kmers in counter_dict.items():
        dist[occurrences - 1] = n_kmers
    dist_norm = dist / np.sum(dist)

    with open(cache_file, 'wb') as config_global_file:
        pickle.dump([dist, dist_norm], config_global_file)
    return dist, dist_norm
def test_check_kmer(create_kmc_db):
    """Exercise ``CheckKmer`` on a database opened for random access.

    Every k-mer counted from the input must be found with its expected
    count, and every k-mer listed as absent must not be found.
    """
    expected_counts = create_kmc_db['kmers']
    probe = pka.KmerAPI(create_kmc_db['kmer_len'])
    count = pka.Count()
    database = _open_for_ra()

    # Present k-mers: found, and reported with the right count.
    for sequence, expected in expected_counts.items():
        probe.from_string(sequence)
        found = database.CheckKmer(probe, count)
        assert found
        assert count.value == expected

    # Absent k-mers: lookup must fail.
    for sequence in create_kmc_db['absent_kmers']:
        probe.from_string(sequence)
        found = database.CheckKmer(probe, count)
        assert not found
def _py_kmer_api_from_string(kmer_str):
    """Return a ``pka.KmerAPI`` of length ``len(kmer_str)`` loaded from ``kmer_str``."""
    length = len(kmer_str)
    api_kmer = pka.KmerAPI(length)
    api_kmer.from_string(kmer_str)
    return api_kmer
default=0) group2.add_argument("-cx", "--cutoff_max", type=int, help="exclude k-mers occurring more of than CX times", default=0) args = parser.parse_args() kmer_data_base = pka.KMCFile() if not kmer_data_base.OpenForListing(args.kmc_database): print("Error: cannot open kmc database") sys.exit(1) info = kmer_data_base.Info() kmer_object = pka.KmerAPI(info.kmer_length) if args.cutoff_min > 0: if not kmer_data_base.SetMinCount(args.cutoff_min): print("Error: cannot set cutoff min") sys.exit(1) if args.cutoff_max > 0: if not kmer_data_base.SetMaxCount(args.cutoff_max): print("Error: cannot set cutoff max") sys.exit(1) output_file = open(args.output_file, 'w') counter = pka.Count() while kmer_data_base.ReadNextKmer(kmer_object, counter):
def make_kmc_genome_counter(path, lag, reverse=True, no_end=False):
    """
    Get a function that takes a batch of kmers and returns transition counts.

    The returned function looks up, for every kmer in the batch and every
    letter of the alphabet, the KMC count of the kmer extended by that letter.
    Its result has the shape of the input batch plus one trailing axis of
    length ``alphabet_size + 1``; the extra last column is the end-symbol
    count. With ``no_end=True`` that column is never written and stays 0
    (end symbol is 0 because ends in assemblies aren't reliable).

    Parameters
    ----------
    path : str
        Path to kmc file with counts. Unless ``no_end`` is set and the path
        already contains ``.res``, suffixes of the form
        ``_kmc_inter_0_{pre,suf,full}_<n>.res`` are appended to it.
    lag : int
        Length of the conditioning kmer; counts are looked up for kmers of
        length up to ``lag + 1``.
    reverse : bool, default=True
        Whether to include counts of the reverse complement of kmers as well.
    no_end : bool, default=False
        Don't load kmc files for starts and ends and assume kmers don't end.
        In this case you can enter the exact res file.

    Returns
    -------
    counter : function
        Takes kmer strings (an array exposing ``.flatten()``) and returns
        transition counts.
    """
    # Imported lazily and published as a module global so that helpers
    # defined elsewhere in the file can refer to ``kmc`` as well
    # (presumably load_kmc / get_kmc_count — confirm).
    global kmc
    import py_kmc_api as kmc
    # Drop the last symbol of the DNA alphabet
    # (presumably the end/stop symbol — verify against core.alphabets_en).
    alphabet = core.alphabets_en['dna'][:-1]
    alphabet_size = len(alphabet)

    # create tokens for calling kmc
    kmer_token = kmc.KmerAPI(lag+1)
    c = kmc.Count()
    if no_end:
        # Load kmc file into memory
        print("loading", path)
        if '.res' not in path:
            path = path + '_kmc_inter_0_full_{}.res'.format(lag+1)
        file = load_kmc(path)

        def counter(kmers):
            # One row per kmer, one column per letter plus the end column.
            final_shape = np.r_[np.shape(kmers), [alphabet_size+1]]
            counts = np.zeros([np.size(kmers), alphabet_size+1])
            for i, k in enumerate(kmers.flatten()):
                for j, b in enumerate(alphabet):
                    # Get kp1mer count
                    counts[i, j] = get_kmc_count(k + b, file, kmer_token, c)
                    # Get reverse count (assemblies only look at one strand).
                    if reverse:
                        counts[i, j] += get_kmc_count(Seq.reverse_complement(k + b), file, kmer_token, c)
            return counts.reshape(final_shape)
    else:
        # Load kmc file into memory
        print("loading", path)
        # files[n - 1] holds the "pre" database for length n (n = 1..lag) and
        # files[lag] the "full" database for length lag + 1;
        # files_suf[n - 1] holds the "suf" database for length n.
        files = []
        files_suf = []
        for l in np.arange(lag) + 1:
            files.append(load_kmc(path + '_kmc_inter_0_pre_{}.res'.format(l)))
            files_suf.append(load_kmc(path + '_kmc_inter_0_suf_{}.res'.format(l)))
        files.append(load_kmc(path + '_kmc_inter_0_full_{}.res'.format(lag+1)))

        def counter(kmers):
            # One row per kmer, one column per letter plus the end column.
            final_shape = np.r_[np.shape(kmers), [alphabet_size+1]]
            counts = np.zeros([np.size(kmers), alphabet_size+1])
            for i, k in enumerate(kmers.flatten()):
                # Strip '[' characters (presumably start-of-sequence padding —
                # confirm), leaving a kmer that may be shorter than lag.
                k = k.replace('[', '')
                for j, b in enumerate(alphabet):
                    # Get kp1mer count
                    counts[i, j] = get_kmc_count(k + b, files[len(k)], kmer_token, c)
                    # Get reverse count (assemblies only look at one strand).
                    if reverse:
                        if len(k) == lag:
                            counts[i, j] += get_kmc_count(Seq.reverse_complement(k + b), files[len(k)], kmer_token, c)
                        if len(k) < lag:
                            counts[i, j] += get_kmc_count(Seq.reverse_complement(k + b), files_suf[len(k)], kmer_token, c)
                # NOTE(review): indentation of the statements below was
                # reconstructed; they are assumed to run once per kmer with the
                # reverse lookup nested under the length check — confirm.
                # End-symbol count, only available for full-length kmers.
                if len(k) == lag:
                    counts[i, -1] = get_kmc_count(k, files_suf[len(k)-1], kmer_token, c)
                    if reverse:
                        counts[i, -1] += get_kmc_count(Seq.reverse_complement(k), files[len(k)-1], kmer_token, c)
            return counts.reshape(final_shape)
    return counter