def map_kmers_to_read(self, sequence, read):
    """Match a read's k-mers against every entry in ``self.fasta_kmers``.

    For each FASTA entry the read's filtered k-mers are binned with
    ``put_kmers_in_read_bins``. Entries with fewer than
    ``self.min_fasta_hits`` hits, or for which ``find_largest_block``
    reports a block end of 0, are skipped. For the remaining entries the
    block boundaries are adjusted, the k-mers of that block are passed to
    ``apply_kmers_to_genes`` and, when ``self.filtered_reads_file`` is
    truthy, the sub-read is appended to the FASTQ output.

    Args:
        sequence: The read sequence (sliced and measured with ``len``).
        read: The read object handed to ``append_subread_to_fastq_file``.

    Returns:
        True if at least one FASTA entry produced a usable block,
        otherwise False.
    """
    read_length = len(sequence)
    kmer_end = read_length - self.k

    filtered_read_kmers = Kmers(sequence, self.k).get_all_kmers_filtered(
        self.max_kmers)

    matched = False
    for fasta_key, reference_kmers in self.fasta_kmers.items():
        hits_by_bin, num_hits = self.put_kmers_in_read_bins(
            read_length, kmer_end, reference_kmers, filtered_read_kmers)
        if num_hits < self.min_fasta_hits:
            continue

        # A fresh Blocks instance is built for every candidate entry.
        block_finder = Blocks(
            self.k, self.min_block_size, self.max_gap, self.margin)
        raw_start, raw_end = block_finder.find_largest_block(hits_by_bin)
        if raw_end == 0:
            continue

        block_start = block_finder.adjust_block_start(raw_start)
        block_end = block_finder.adjust_block_end(raw_end, read_length)

        self.apply_kmers_to_genes(
            fasta_key,
            self.create_kmers_for_block(block_start, block_end, sequence))
        matched = True

        # The block boundaries only exist inside the loop, so the sub-read
        # is written once per matching FASTA entry.
        if self.filtered_reads_file:
            self.append_subread_to_fastq_file(read, block_start, block_end)

    return matched
def test_four_kmers_all(self):
    """Check 4-mer frequencies for five A's followed by eight T's."""
    kmers = Kmers('AAAAATTTTTTTT', 4)
    observed = kmers.get_all_kmers_freq(max_kmer_count=5)
    expected_frequencies = {
        'AAAA': 2,
        'AAAT': 1,
        'AATT': 1,
        'ATTT': 1,
        'TTTT': 4,
    }
    self.assertEqual(observed, expected_frequencies)
def test_four_kmers(self):
    """Check that the counter holds every distinct 4-mer with a value of 0."""
    kmers = Kmers('AAAAATTTTT', 4)
    observed = kmers.get_all_kmers_counter(max_kmer_count=5)
    expected_counter = {
        'AAAA': 0,
        'AAAT': 0,
        'AATT': 0,
        'ATTT': 0,
        'TTTT': 0,
    }
    self.assertEqual(observed, expected_counter)
def create_kmers_for_block(self, block_start, block_end, sequence):
    """Return the k-mers of ``sequence[block_start:block_end]``.

    Args:
        block_start: Start index of the slice taken from ``sequence``.
        block_end: End index (exclusive) of the slice; 0 short-circuits.
        sequence: The sequence the block is cut from.

    Returns:
        The result of ``Kmers(...).get_all_kmers(self.max_kmers)`` for the
        block, or an empty dict when ``block_end`` is 0.
    """
    if block_end != 0:
        block_kmers = Kmers(sequence[block_start:block_end], self.k)
        return block_kmers.get_all_kmers(self.max_kmers)

    # A block end of 0 yields an empty result without building any k-mers.
    return {}
def sequence_kmers(self):
    """Build a k-mer counter for each usable record of the FASTA file.

    Parses ``self.filename`` as FASTA. When ``self.divisible_by_3`` is
    truthy, records whose sequence length is not a multiple of 3 are
    skipped and a warning is logged on ``self.logger``.

    Returns:
        A dict mapping each kept record's id to the value returned by
        ``Kmers(str(record.seq), self.k).get_all_kmers_counter()``.
    """
    kmer_to_sequences = {}
    for record in SeqIO.parse(self.filename, "fasta"):
        if self.divisible_by_3 and len(record.seq) % 3 != 0:
            # Lazy %-style argument: emits the same text as before but does
            # not fail if the id is not a plain str.
            self.logger.warning(
                "Excluding gene as it is not divisible by 3:%s", record.id)
            continue

        # We assume here that the sequence name is unique in the FASTA file;
        # a repeated id replaces the earlier entry.
        kmers = Kmers(str(record.seq), self.k)
        kmer_to_sequences[record.id] = kmers.get_all_kmers_counter()
    return kmer_to_sequences
def does_read_contain_quick_pass_kmers(self, sequence):
    """Quickly decide whether a read shares enough k-mers with any FASTA entry.

    Reads shorter than ``self.min_block_size`` are rejected immediately.
    Otherwise the k-mers from ``get_one_x_coverage_of_kmers`` are looked up
    in each value of ``self.fasta_kmers``.

    Args:
        sequence: The read sequence.

    Returns:
        True as soon as the number of shared k-mers for one FASTA entry
        exceeds ``self.min_kmers_for_onex_pass``; otherwise False.
    """
    if len(sequence) < self.min_block_size:
        return False

    onex_kmers = Kmers(sequence, self.k).get_one_x_coverage_of_kmers()

    for reference_kmers in self.fasta_kmers.values():
        shared = 0
        for kmer in onex_kmers:
            if kmer not in reference_kmers:
                continue
            shared += 1
            # Stop at the first entry that passes the (strict) threshold.
            if shared > self.min_kmers_for_onex_pass:
                return True
    return False
def test_short_sequence(self):
    """Check that a sequence shorter than k produces an empty counter."""
    kmers = Kmers('A', 10)
    observed = kmers.get_all_kmers_counter()
    self.assertEqual(observed, {})