def encode_sequence(sequence: ReceptorSequence, params: EncoderParams):
    """
    Turn a sequence into k-mer feature strings for use in KmerFrequencyEncoder.

    The k-mers are produced by KmerHelper.create_IMGT_kmers_from_sequence; the
    components of every k-mer it yields are converted to str and glued together
    with Constants.FEATURE_DELIMITER.

    :param sequence: ReceptorSequence to be split into k-mers
    :param params: EncoderParams, same object as passed into KmerFrequencyEncoder;
        params.model["k"] is the k-mer length and the optional
        params.model["sequence_type"] (None when absent) is forwarded to
        sequence.get_sequence and to KmerHelper
    :return: list with one feature string per k-mer, or None (after logging a
        warning) when the sequence is shorter than k
    """
    k = params.model["k"]
    sequence_type = params.model.get('sequence_type', None)

    # Guard clause: a sequence shorter than k is skipped and reported via logging.
    if len(sequence.get_sequence(sequence_type)) < k:
        logging.warning('Sequence length is less than k. Ignoring sequence')
        return None

    raw_kmers = KmerHelper.create_IMGT_kmers_from_sequence(
        sequence=sequence,
        k=k,
        sequence_type=sequence_type,
    )

    features = []
    for raw_kmer in raw_kmers:
        # Each raw k-mer is an iterable of parts; stringify and join them.
        parts = [str(part) for part in raw_kmer]
        features.append(Constants.FEATURE_DELIMITER.join(parts))
    return features
def test_create_IMGT_kmers_from_sequence(self):
    """
    Check that KmerHelper.create_IMGT_kmers_from_sequence, called with k=3 on
    the amino acid sequence "CASSRYUF", yields every expected
    (k-mer, number) pair.
    """
    kmers = KmerHelper.create_IMGT_kmers_from_sequence(
        ReceptorSequence("CASSRYUF"),
        3,
        sequence_type=SequenceType.AMINO_ACID,
    )

    # NOTE(review): the second element is presumably the IMGT position of the
    # k-mer's first residue - confirm against KmerHelper.
    expected_kmers = [
        ("CAS", 105),
        ("ASS", 106),
        ("SSR", 107),
        ("SRY", 108),
        ("RYU", 114),
        ("YUF", 115),
    ]

    # assertIn reports the missing pair and the container on failure, whereas
    # assertTrue(x in y) only reports "False is not true".
    for expected in expected_kmers:
        self.assertIn(expected, kmers)