def map_kmers_to_read(self, sequence, read, candidate_gene_names):
    """Map the k-mers of a single read onto the candidate genes.

    The read's k-mers are binned against ``self.fasta_kmers``, the largest
    block of hits is located and its boundaries adjusted through ``Blocks``,
    and the k-mers collected for that block are passed to
    ``apply_kmers_to_genes``.

    Args:
        sequence: Sequence of the read; its length is passed to the
            binning and block-end adjustment steps.
        read: Read object forwarded to ``append_read_to_fastq_file`` when
            ``self.filtered_reads_file`` is set.
        candidate_gene_names: Candidate genes forwarded unchanged to
            ``apply_kmers_to_genes``.

    Returns:
        The value returned by ``apply_kmers_to_genes``.
    """
    self.logger.info("Map k-mers to read")

    seq_length = len(sequence)
    end = seq_length - self.k

    kmers_obj = Kmers(sequence, self.k, self.homopolyer_compression)
    read_kmers = kmers_obj.get_all_kmers(max_kmer_count=self.max_kmer_count)

    # The second element of the returned triple is not needed here.
    sequence_hits, _hit_counter, read_kmer_hits = self.put_kmers_in_read_bins(
        seq_length, end, self.fasta_kmers, read_kmers)

    blocks_obj = Blocks(self.k, self.min_block_size, self.max_gap,
                        self.margin)
    block_start, block_end = blocks_obj.find_largest_block(sequence_hits)
    block_start = blocks_obj.adjust_block_start(block_start)
    block_end = blocks_obj.adjust_block_end(block_end, seq_length)

    block_kmers = self.create_kmers_for_block(block_start, block_end,
                                              read_kmer_hits)
    is_read_matching = self.apply_kmers_to_genes(self.fasta_obj, block_kmers,
                                                 candidate_gene_names)

    # The read is written out whenever a filtered-reads file is configured;
    # this does not depend on the match result.
    if self.filtered_reads_file:
        self.append_read_to_fastq_file(read, block_start, block_end)

    return is_read_matching
def does_read_contain_quick_pass_kmers(self, sequence):
    """Run the quick-pass k-mer check on a read and return candidate genes.

    Args:
        sequence: Sequence of the read to check.

    Returns:
        A dict built from the result of ``genes_containing_first_pass_kmers``,
        keeping only entries whose value exceeds
        ``self.min_kmers_for_onex_pass``. An empty dict is returned when the
        read is shorter than ``self.min_block_size`` or when the number of
        k-mers shared with ``self.fasta_obj.kmer_keys_set`` does not exceed
        ``self.min_kmers_for_onex_pass``.
    """
    self.logger.info("Perform quick pass k-mer check on read")

    # Guard: reads shorter than the minimum block size are rejected outright.
    if len(sequence) < self.min_block_size:
        self.logger.info("Read below minimum size")
        return {}

    read_onex_kmers = Kmers(
        sequence, self.k,
        self.homopolyer_compression).get_one_x_coverage_of_kmers()
    shared_kmers = self.fasta_obj.kmer_keys_set & set(read_onex_kmers)

    # Guard: not enough shared k-mers to be worth a closer look.
    has_enough_shared_kmers = len(shared_kmers) > self.min_kmers_for_onex_pass
    if not has_enough_shared_kmers:
        return {}

    gene_hits = self.genes_containing_first_pass_kmers(self.fasta_obj,
                                                       shared_kmers)
    candidate_gene_names = {}
    for gene_name, hit_count in gene_hits.items():
        if hit_count > self.min_kmers_for_onex_pass:
            candidate_gene_names[gene_name] = hit_count
    return candidate_gene_names
def test_four_kmers_compression(self):
    """Check the 4-mer counter of 'GCAAAAATTTTTGC' with the third flag on.

    The expected counter holds exactly GCAT, CATG and ATGC, each mapped to 0.
    """
    kmers = Kmers('GCAAAAATTTTTGC', 4, True)
    expected_counter = dict.fromkeys(('GCAT', 'CATG', 'ATGC'), 0)
    actual_counter = kmers.get_all_kmers_counter()
    self.assertEqual(actual_counter, expected_counter)
def test_four_kmers(self):
    """Check the 4-mer counter of 'AAAAATTTTT' with the third flag off.

    The expected counter holds the five distinct 4-mers of the sequence,
    each mapped to 0.
    """
    kmers = Kmers('AAAAATTTTT', 4, False)
    expected_counter = dict.fromkeys(
        ('AAAA', 'AAAT', 'AATT', 'ATTT', 'TTTT'), 0)
    actual_counter = kmers.get_all_kmers_counter()
    self.assertEqual(actual_counter, expected_counter)
def sequence_kmers(self):
    """Build a mapping from FASTA record id to that record's k-mer counter.

    Every record parsed from ``self.filename`` is turned into a ``Kmers``
    object, and the result of its ``get_all_kmers_counter`` call (limited by
    ``self.max_kmer_count``) is stored under the record's id.

    Returns:
        dict: record id -> value returned by ``get_all_kmers_counter``.
    """
    kmer_to_sequences = {}
    for record in SeqIO.parse(self.filename, "fasta"):
        kmers = Kmers(str(record.seq), self.k, self.homopolyer_compression)
        # We assume here that the sequence name is unique in the FASTA file;
        # a repeated id would overwrite the earlier entry.
        kmer_to_sequences[record.id] = kmers.get_all_kmers_counter(
            max_kmer_count=self.max_kmer_count)
    return kmer_to_sequences
def test_short_sequence(self):
    """Check that a 1-base sequence with k=10 gives an empty k-mer counter."""
    kmers = Kmers('A', 10, False)
    actual_counter = kmers.get_all_kmers_counter()
    expected_counter = {}
    self.assertEqual(actual_counter, expected_counter)
def test_four_kmers_compression_all_repeats(self):
    """Check the 4-mer counter of four 5-base runs with the third flag on.

    The expected counter holds the single k-mer ATGC mapped to 0.
    """
    kmers = Kmers('AAAAATTTTTGGGGGCCCCC', 4, True)
    expected_counter = dict.fromkeys(('ATGC',), 0)
    actual_counter = kmers.get_all_kmers_counter()
    self.assertEqual(actual_counter, expected_counter)