示例#1
0
    def map_kmers_to_read(self, sequence, read, candidate_gene_names):
        """Map a read's k-mers onto the reference and apply them to genes.

        The read's k-mers are binned against ``self.fasta_kmers`` via
        ``put_kmers_in_read_bins``; a ``Blocks`` helper then picks a block
        from the resulting hits and adjusts its start and end. The k-mers
        built for that block are passed to ``apply_kmers_to_genes``. When
        ``self.filtered_reads_file`` is truthy, the read is additionally
        handed to ``append_read_to_fastq_file`` with the block bounds.

        Args:
            sequence: Read sequence; measured with ``len`` and given to
                ``Kmers``.
            read: Passed through unchanged to ``append_read_to_fastq_file``.
            candidate_gene_names: Forwarded to ``apply_kmers_to_genes``.

        Returns:
            The value returned by ``apply_kmers_to_genes``.
        """
        self.logger.info("Map k-mers to read")

        read_length = len(sequence)
        end_offset = read_length - self.k

        kmer_source = Kmers(sequence, self.k, self.homopolyer_compression)
        kmers_in_read = kmer_source.get_all_kmers(
            max_kmer_count=self.max_kmer_count)

        # The second element of the returned triple is not used here.
        hits_by_position, _, kmer_hits = self.put_kmers_in_read_bins(
            read_length, end_offset, self.fasta_kmers, kmers_in_read)

        block_finder = Blocks(self.k, self.min_block_size, self.max_gap,
                              self.margin)
        raw_start, raw_end = block_finder.find_largest_block(hits_by_position)
        block_start = block_finder.adjust_block_start(raw_start)
        block_end = block_finder.adjust_block_end(raw_end, read_length)

        kmers_in_block = self.create_kmers_for_block(block_start, block_end,
                                                     kmer_hits)
        matched = self.apply_kmers_to_genes(self.fasta_obj, kmers_in_block,
                                            candidate_gene_names)

        # The read is written out whenever an output file is configured,
        # independent of the match result.
        if self.filtered_reads_file:
            self.append_read_to_fastq_file(read, block_start, block_end)

        return matched
示例#2
0
    def does_read_contain_quick_pass_kmers(self, sequence):
        """Run the quick-pass k-mer screen on a read.

        Args:
            sequence: Read sequence; measured with ``len`` and given to
                ``Kmers``.

        Returns:
            A dict holding the entries of ``genes_containing_first_pass_kmers``
            whose value is greater than ``self.min_kmers_for_onex_pass``.
            An empty dict is returned when the read is shorter than
            ``self.min_block_size``, or when the number of k-mers shared
            with ``self.fasta_obj.kmer_keys_set`` does not exceed that same
            threshold.
        """
        self.logger.info("Perform quick pass k-mer check on read")

        if len(sequence) < self.min_block_size:
            self.logger.info("Read below minimum size")
            return {}

        onex_kmers = Kmers(
            sequence, self.k,
            self.homopolyer_compression).get_one_x_coverage_of_kmers()
        shared_kmers = self.fasta_obj.kmer_keys_set & set(onex_kmers)

        # Too few shared k-mers: skip the per-gene lookup entirely.
        if len(shared_kmers) <= self.min_kmers_for_onex_pass:
            return {}

        hits_per_gene = self.genes_containing_first_pass_kmers(
            self.fasta_obj, shared_kmers)

        # Keep only the genes whose own count clears the threshold.
        candidate_gene_names = {}
        for gene, hits in hits_per_gene.items():
            if hits > self.min_kmers_for_onex_pass:
                candidate_gene_names[gene] = hits
        return candidate_gene_names
示例#3
0
 def test_four_kmers_compression(self):
     # Third Kmers argument is True here; the expected counter holds three
     # 4-mers, each mapped to 0.
     expected = {
         'GCAT': 0,
         'CATG': 0,
         'ATGC': 0,
     }
     kmers = Kmers('GCAAAAATTTTTGC', 4, True)
     observed = kmers.get_all_kmers_counter()
     self.assertEqual(observed, expected)
示例#4
0
 def test_four_kmers(self):
     # Third Kmers argument is False here; the expected counter holds five
     # distinct 4-mers, each mapped to 0.
     expected = {
         kmer: 0
         for kmer in ('AAAA', 'AAAT', 'AATT', 'ATTT', 'TTTT')
     }
     kmers = Kmers('AAAAATTTTT', 4, False)
     observed = kmers.get_all_kmers_counter()
     self.assertEqual(observed, expected)
示例#5
0
文件: Fasta.py 项目: tseemann/tiptoft
    def sequence_kmers(self):
        """Build a k-mer counter for every record in the FASTA file.

        Each record parsed from ``self.filename`` is turned into a ``Kmers``
        object and its ``get_all_kmers_counter`` result (limited by
        ``self.max_kmer_count``) is stored under the record id.

        Returns:
            dict mapping record id to that record's k-mer counter.
        """
        # Record ids are assumed to be unique within the FASTA file; a
        # repeated id would replace the earlier entry.
        return {
            record.id: Kmers(
                str(record.seq), self.k, self.homopolyer_compression
            ).get_all_kmers_counter(max_kmer_count=self.max_kmer_count)
            for record in SeqIO.parse(self.filename, "fasta")
        }
示例#6
0
 def test_short_sequence(self):
     # A one-base sequence with k=10 is expected to give an empty counter.
     too_short = Kmers('A', 10, False)
     observed = too_short.get_all_kmers_counter()
     self.assertEqual(observed, {})
示例#7
0
 def test_four_kmers_compression_all_repeats(self):
     # Third Kmers argument is True here; four single-base runs are expected
     # to give exactly one 4-mer.
     expected = {'ATGC': 0}
     compressed = Kmers('AAAAATTTTTGGGGGCCCCC', 4, True)
     observed = compressed.get_all_kmers_counter()
     self.assertEqual(observed, expected)