예제 #1
0
    def map_kmers_to_read(self, sequence, read):
        """Match one read against every entry of ``self.fasta_kmers``.

        For each entry, the read's filtered k-mers are binned with
        ``put_kmers_in_read_bins``. Entries with fewer than
        ``self.min_fasta_hits`` hits, or for which
        ``Blocks.find_largest_block`` reports a block end of 0, are skipped.
        For the remaining entries the block boundaries are adjusted, the
        k-mers of that block are passed to ``apply_kmers_to_genes`` and, when
        ``self.filtered_reads_file`` is truthy, the block is handed to
        ``append_subread_to_fastq_file``.

        Args:
            sequence: The read's sequence; its length is taken with ``len``
                and it is passed to ``Kmers`` and ``create_kmers_for_block``.
            read: Forwarded unchanged to ``append_subread_to_fastq_file``.

        Returns:
            True if at least one entry produced a block, otherwise False.
        """
        read_length = len(sequence)
        last_start = read_length - self.k
        read_kmers = Kmers(sequence, self.k).get_all_kmers_filtered(
            self.max_kmers)

        matched = False
        for fasta_obj, reference_kmers in self.fasta_kmers.items():
            sequence_hits, hit_counter = self.put_kmers_in_read_bins(
                read_length, last_start, reference_kmers, read_kmers)
            if hit_counter < self.min_fasta_hits:
                # Too few shared k-mers with this entry to bother with blocks.
                continue

            # A new Blocks instance is built for every entry, as before.
            block_finder = Blocks(
                self.k, self.min_block_size, self.max_gap, self.margin)
            raw_start, raw_end = block_finder.find_largest_block(sequence_hits)
            if raw_end == 0:
                # An end of 0 is the "no block" result.
                continue

            start = block_finder.adjust_block_start(raw_start)
            stop = block_finder.adjust_block_end(raw_end, read_length)

            kmers_in_block = self.create_kmers_for_block(start, stop, sequence)
            self.apply_kmers_to_genes(fasta_obj, kmers_in_block)
            matched = True

            if self.filtered_reads_file:
                self.append_subread_to_fastq_file(read, start, stop)

        return matched
예제 #2
0
 def test_four_kmers_all(self):
     """Check ``get_all_kmers_freq`` for a 13-base sequence with k=4.

     The call is made with ``max_kmer_count=5`` and must return exactly the
     mapping below.
     """
     # NOTE(review): the input has five overlapping 'TTTT' windows but the
     # expected count is 4 -- presumably Kmers excludes the final window;
     # confirm against the Kmers implementation.
     expected_freqs = {
         'AAAA': 2,
         'AAAT': 1,
         'AATT': 1,
         'ATTT': 1,
         'TTTT': 4,
     }
     kmers = Kmers('AAAAATTTTTTTT', 4)
     observed_freqs = kmers.get_all_kmers_freq(max_kmer_count=5)
     self.assertEqual(observed_freqs, expected_freqs)
예제 #3
0
 def test_four_kmers(self):
     """Check ``get_all_kmers_counter`` for a 10-base sequence with k=4.

     The call is made with ``max_kmer_count=5``; every expected k-mer must
     be present with a value of 0.
     """
     expected_counter = {
         'AAAA': 0,
         'AAAT': 0,
         'AATT': 0,
         'ATTT': 0,
         'TTTT': 0,
     }
     kmers = Kmers('AAAAATTTTT', 4)
     observed_counter = kmers.get_all_kmers_counter(max_kmer_count=5)
     self.assertEqual(observed_counter, expected_counter)
예제 #4
0
    def create_kmers_for_block(self, block_start, block_end, sequence):
        """Return the k-mers of ``sequence[block_start:block_end]``.

        Args:
            block_start: Slice start of the block within ``sequence``.
            block_end: Slice end of the block; 0 means there is no block.
            sequence: The sequence the block is cut from.

        Returns:
            An empty dict when ``block_end`` is 0; otherwise the result of
            ``Kmers(...).get_all_kmers(self.max_kmers)`` for the block slice.
        """
        if block_end != 0:
            block_kmers = Kmers(sequence[block_start:block_end], self.k)
            return block_kmers.get_all_kmers(self.max_kmers)

        # No block was found, so there is nothing to build k-mers from.
        return {}
예제 #5
0
	def sequence_kmers(self):
		"""Build a mapping from FASTA record id to that record's k-mer counter.

		Every record of ``self.filename`` is parsed as FASTA. When
		``self.divisible_by_3`` is truthy, records whose sequence length is
		not a multiple of 3 are skipped and a warning is logged.

		Returns:
			A dict keyed by ``record.id`` whose values are the results of
			``Kmers(str(record.seq), self.k).get_all_kmers_counter()``. If two
			records share an id, the later one replaces the earlier one.
		"""
		kmer_to_sequences = {}
		for record in SeqIO.parse(self.filename, "fasta"):
			if self.divisible_by_3 and len(record.seq) % 3 != 0:
				self.logger.warning("Excluding gene as it is not divisible by 3:"+record.id)
				continue

			kmers = Kmers(str(record.seq), self.k)
			# We assume here that the sequence name is unique in the FASTA file
			kmer_to_sequences[record.id] = kmers.get_all_kmers_counter()

		return kmer_to_sequences
예제 #6
0
    def does_read_contain_quick_pass_kmers(self, sequence):
        """Quick pre-screen of a read against the entries of ``self.fasta_kmers``.

        Args:
            sequence: The read's sequence.

        Returns:
            False when ``sequence`` is shorter than ``self.min_block_size``.
            Otherwise True as soon as, for a single entry, the number of the
            read's one-x-coverage k-mers found in that entry exceeds
            ``self.min_kmers_for_onex_pass``; False if no entry gets there.
        """
        if len(sequence) < self.min_block_size:
            return False

        candidate_kmers = Kmers(sequence, self.k).get_one_x_coverage_of_kmers()

        for reference_kmers in self.fasta_kmers.values():
            # Number the hits from 1 so the running count is compared with
            # the threshold right after every hit.
            hits = (kmer for kmer in candidate_kmers if kmer in reference_kmers)
            for hit_count, _ in enumerate(hits, start=1):
                if hit_count > self.min_kmers_for_onex_pass:
                    return True

        return False
예제 #7
0
 def test_short_sequence(self):
     """A one-base sequence with k=10 must give an empty k-mer counter."""
     too_short = Kmers('A', 10)
     observed_counter = too_short.get_all_kmers_counter()
     self.assertEqual(observed_counter, {})