def test_insertOneTime(self):
    """Insert one k-mer a single time and check that its count is stored.

    A (k-mer, count) pair is obtained from
    ``self.params.generate_singleKmer()`` and inserted into a freshly
    constructed ``kDataFrameMQF``.  The frame must report empty before the
    insertion, non-empty afterwards, and ``getCount`` must return the
    inserted count for that k-mer.
    """
    print(self._testMethodName)

    kmer_size = 31
    single_kmer, expected_count = self.params.generate_singleKmer()
    frame = kp.kDataFrameMQF(kmer_size)

    # Nothing has been inserted yet.
    self.assertTrue(frame.empty())

    frame.insert(single_kmer, expected_count)

    # The frame now holds the k-mer with exactly the count we supplied.
    self.assertFalse(frame.empty())
    self.assertEqual(frame.getCount(single_kmer), expected_count)
# Index the k-mers of a FASTA file with kProcessor and serialize the index.
#
# Usage: python <this_script> <fasta_file> <names_file>
#
# The index is written under the prefix "index/idx_gencode.v32.transcripts",
# relative to the current working directory.
import os
import sys

import kProcessor as kp

# Fail with a usage message instead of an IndexError when arguments are
# missing (same convention as the other kProcessor example scripts).
if len(sys.argv) < 3:
    sys.exit(f"run: python {os.path.basename(sys.argv[0])} <fasta_file> <names_file>")

fasta_file = sys.argv[1]
names_file = sys.argv[2]

output_prefix = "index/idx_gencode.v32.transcripts"

# Make sure the target directory exists before the (potentially long)
# indexing step, so a missing "index/" directory is not discovered only at
# save time.
os.makedirs(os.path.dirname(output_prefix), exist_ok=True)

# Arguments: kSize=21; third argument is the hashing mode.
# NOTE(review): the second argument (29) is presumably the MQF size
# parameter -- confirm against the kProcessor documentation.
kf = kp.kDataFrameMQF(21, 29, 1)

# Mode 1 activates the kmers mode
ckf = kp.index(kf, {"mode": 1}, fasta_file, 10000, names_file)

print("Serializing the index ...")
ckf.save(output_prefix)
# Count the k-mers of every genome file in a directory and save one
# kDataFrame per genome.
#
# Usage: python genomes_kmerCounting.py <genomes_dir> <kDataframes_output_dir>
#
# Each result is saved under "<kDataframes_output_dir>/idx_<genome file name>".
import os
import sys
from glob import glob

import kProcessor as kp

if len(sys.argv) < 3:
    sys.exit(
        "run: python genomes_kmerCounting.py <genomes_dir> <kDataframes_output_dir>"
    )

genomes_dir = sys.argv[1]
output_directory = sys.argv[2]

kSize = 21
chunk_size = 10000
# NOTE(review): this value is passed as the parsing parameter {"mode": ...}
# to countKmersFromFile, not to the kDataFrame constructor -- confirm that
# "hashing mode" is the right description for it.
hashing_mode = 1  # Integer hashing

# The output directory may not exist yet; create it up front so that saving
# the first genome does not fail after the counting work has been done.
os.makedirs(output_directory, exist_ok=True)

# Keep regular files only (the glob also matches sub-directories) and sort
# them so the processing order is deterministic across runs.
genomes = sorted(
    path for path in glob(f"{genomes_dir}/*") if os.path.isfile(path)
)

for genome_fasta in genomes:
    print(f"counting {genome_fasta} kmers ...")
    kf = kp.kDataFrameMQF(kSize)
    kp.countKmersFromFile(kf, {"mode": hashing_mode}, genome_fasta, chunk_size)
    print(f"finished and counted {kf.size()} kmers...")

    print("saving ...")
    output_prefix = os.path.join(
        output_directory, "idx_" + os.path.basename(genome_fasta)
    )
    kf.save(output_prefix)
# Index the k-mers of a FASTA file against a names file and save the index.
#
# Usage: python genes_indexing.py <fasta> <namesFiles> <kSize>
#
# The index is saved under the prefix "idx_<suffix>" in the current working
# directory, where <suffix> is derived from the FASTA file's base name.
import os
import sys

import kProcessor as kp

if len(sys.argv) < 4:
    sys.exit("run: python genes_indexing.py <fasta> <namesFiles> <kSize>")

fasta_file = sys.argv[1]
namesFile = sys.argv[2]

# Report a non-numeric k-mer size as a usage error rather than a traceback.
try:
    kSize = int(sys.argv[3])
except ValueError:
    sys.exit(f"kSize must be an integer, got: {sys.argv[3]!r}")

# NOTE(review): ".fa.names" is stripped from the FASTA base name, which only
# has an effect if the FASTA file name itself contains that text -- confirm
# whether the names file (or a ".fa" suffix) was intended here.
idx_suffix = os.path.basename(fasta_file).replace(".fa.names", "")

print(f"Indexing {idx_suffix} ...", file=sys.stderr)

kmers_mode = 1
hashing_mode = 1
chunk_size = 10000

# An MQF-backed kDataFrame is used; a PHMAP-backed one
# (kp.kDataFramePHMAP(kSize, hashing_mode)) was previously commented out here.
kf = kp.kDataFrameMQF(kSize, 29, hashing_mode)

ckf = kp.index(kf, {"mode": kmers_mode}, fasta_file, chunk_size, namesFile)
ckf.save(f"idx_{idx_suffix}")