Exemplo n.º 1
0
 def test_four_kmers_all(self):
     """Check the 4-mer frequencies reported for a short A/T sequence.

     Builds a ``Kmers`` object with k=4 and compares the mapping returned
     by ``get_all_kmers_freq(max_kmer_count=5)`` against the pinned
     expected counts.
     """
     kmers = Kmers('AAAAATTTTTTTT', 4)
     # NOTE(review): the expected 'TTTT' count looks one lower than a plain
     # sliding-window count over the T run would give -- confirm this is
     # the intended Kmers behaviour rather than an off-by-one.
     expected_freqs = {
         'AAAA': 2,
         'AAAT': 1,
         'AATT': 1,
         'ATTT': 1,
         'TTTT': 4,
     }
     observed_freqs = kmers.get_all_kmers_freq(max_kmer_count=5)
     self.assertEqual(observed_freqs, expected_freqs)
Exemplo n.º 2
0
    def sequence_kmers_vals(self):
        """Map each FASTA record id to the k-mer frequencies of its sequence.

        Parses ``self.filename`` as FASTA. When ``self.divisible_by_3`` is
        truthy, records whose sequence length is not a multiple of 3 are
        skipped and a warning is logged via ``self.logger``.

        Returns:
            dict: keyed by ``record.id``; each value is whatever
            ``Kmers.get_all_kmers_freq(max_kmer_count=self.max_kmers)``
            returns for that record's sequence (built with ``self.k``).
        """
        kmer_to_sequences = {}
        for record in SeqIO.parse(self.filename, "fasta"):
            if self.divisible_by_3 and len(record.seq) % 3 != 0:
                self.logger.warning(
                    "Excluding gene as it is not divisible by 3:" + record.id)
                continue

            kmers = Kmers(str(record.seq), self.k)
            # We assume here that the sequence name is unique in the FASTA
            # file; a repeated id would overwrite the earlier entry.
            kmer_to_sequences[record.id] = kmers.get_all_kmers_freq(
                max_kmer_count=self.max_kmers)

        return kmer_to_sequences