def test_create_kmers_from_sequence(self):
    """
    Checks KmerHelper.create_kmers_from_sequence on amino acid sequences with k=3:
    a 7-letter sequence must produce its five overlapping 3-mers, and a sequence
    shorter than k must produce no k-mers at all.
    """
    kmers = KmerHelper.create_kmers_from_sequence(ReceptorSequence(amino_acid_sequence="ABCDEFG"), 3,
                                                  sequence_type=SequenceType.AMINO_ACID)

    # one assertion per k-mer so that a failure names the k-mer that is missing
    for expected_kmer in ("ABC", "BCD", "CDE", "DEF", "EFG"):
        self.assertIn(expected_kmer, kmers)
    self.assertEqual(5, len(kmers))

    # sequence shorter than k: nothing to extract
    kmers = KmerHelper.create_kmers_from_sequence(ReceptorSequence(amino_acid_sequence="AB"), 3,
                                                  sequence_type=SequenceType.AMINO_ACID)
    self.assertEqual(0, len(kmers))
def encode_sequence(sequence: ReceptorSequence, params: EncoderParams):
    """
    Splits a receptor sequence into k-mers.

    Args:
        sequence: ReceptorSequence object to be encoded
        params: EncoderParams object; params.model["k"] is the k-mer length and the optional
            params.model["sequence_type"] (None if absent) is passed on both to
            sequence.get_sequence and to KmerHelper.create_kmers_from_sequence

    Returns:
        the value returned by KmerHelper.create_kmers_from_sequence, or None if the sequence is
        shorter than k (in which case a warning is logged and the sequence is skipped)
    """
    k = params.model["k"]
    sequence_type = params.model.get('sequence_type', None)
    length = len(sequence.get_sequence(sequence_type))

    # regular case first: the sequence is long enough to hold at least one k-mer
    if length >= k:
        return KmerHelper.create_kmers_from_sequence(sequence=sequence, k=k, sequence_type=sequence_type)

    logging.warning(f'KmerSequenceEncoder: Sequence length {length} is less than {k}. Ignoring sequence...')
    return None
def _encode_repertoire(self, repertoire, vectors):
    """
    Encodes a repertoire as one vector: the sum, over all sequences in the repertoire,
    of the sum of the word vectors of the k-mers of each sequence.

    Args:
        repertoire: object whose `sequences` attribute is iterated; each item is passed to
            KmerHelper.create_kmers_from_sequence together with k=self.k
        vectors: object exposing `vector_size` and `get_vector(kmer)`
            (NOTE(review): looks like a gensim KeyedVectors instance - confirm)

    Returns:
        numpy array of length vectors.vector_size; all zeros if no k-mer had a vector
    """
    repertoire_vector = np.zeros(vectors.vector_size)

    for sequence in repertoire.sequences:
        kmers = KmerHelper.create_kmers_from_sequence(sequence=sequence, k=self.k)

        # summed per sequence first, then added to the repertoire total
        sequence_vector = np.zeros(vectors.vector_size)
        for kmer in kmers:
            try:
                word_vector = vectors.get_vector(kmer)
            except KeyError:
                # no vector available for this k-mer (presumably it is not in the
                # vocabulary of `vectors`): it deliberately contributes nothing
                continue
            sequence_vector = np.add(sequence_vector, word_vector)

        repertoire_vector = np.add(repertoire_vector, sequence_vector)

    return repertoire_vector
def encode_sequence(sequence: ReceptorSequence, params: EncoderParams):
    """
    splits a sequence into k-mers using KmerHelper.create_kmers_from_sequence; sequences
    shorter than k are skipped with a warning

    :param sequence: ReceptorSequence whose get_sequence() value is checked against k
    :param params: EncoderParams (where params.model["k"] is used as the k-mer length)
    :return: the value returned by KmerHelper.create_kmers_from_sequence(sequence, k),
        or None if the sequence is shorter than k
    """
    k = params.model["k"]

    # regular case first: the sequence is long enough to hold at least one k-mer
    if len(sequence.get_sequence()) >= k:
        return KmerHelper.create_kmers_from_sequence(sequence, k)

    logging.warning('KmerSequenceEncoder: Sequence length is less than k. Ignoring sequence...')
    return None