def _build_new_sequence(self, sequence: ReceptorSequence, position, signal: dict) -> ReceptorSequence:
    """
    Creates a new ReceptorSequence in which the motif instance carried by the signal
    overwrites part of the original sequence, starting at the given position.

    The motif instance string may contain a single "/" separating a left and a right part;
    the residues of the original sequence lying in the gap between them are kept. The
    resulting sequence has the same pieces of the original before the motif, inside the gap
    and after the motif.

    :param sequence: ReceptorSequence whose sequence string and metadata are read
    :param position: index in the sequence string at which the left motif part starts
    :param signal: dict with keys "motif_instance" (object exposing `gap` and `instance`),
        "signal_id" and "motif_id"
    :return: new ReceptorSequence with the implanted sequence, an annotation describing the
        implant and a deep copy of the original metadata
    """
    motif_instance = signal["motif_instance"]
    gap = motif_instance.gap
    instance = motif_instance.instance

    # an instance without "/" is treated as a left part only
    if "/" in instance:
        left, right = instance.split("/")
    else:
        left, right = instance, ""

    original = sequence.get_sequence()
    gap_start = position + len(left)
    gap_end = gap_start + gap

    implanted = "".join([
        original[:position],                # untouched prefix
        left,
        original[gap_start:gap_end],        # original residues kept inside the gap
        right,
        original[gap_end + len(right):],    # untouched suffix
    ])

    annotation = SequenceAnnotation()
    annotation.add_implant(ImplantAnnotation(signal_id=signal["signal_id"],
                                             motif_id=signal["motif_id"],
                                             motif_instance=signal["motif_instance"],
                                             position=position))

    result = ReceptorSequence()
    result.set_annotation(annotation)
    result.set_metadata(copy.deepcopy(sequence.metadata))
    result.set_sequence(implanted, EnvironmentSettings.get_sequence_type())
    return result
def create_IMGT_gapped_kmers_from_sequence(sequence: ReceptorSequence, k_left: int, max_gap: int,
                                           k_right: int = None, min_gap: int = 0):
    """
    Creates gapped k-mers from a sequence and pairs each of them with an IMGT position.

    Every residue is first paired with its IMGT position, and the gapped k-mers are built by
    KmerHelper.create_gapped_kmers_from_string from those pairs. For each k-mer the reported
    position is the smallest position among its (residue, position) elements, except when
    the integer part of that smallest position is 112: then the largest position whose
    integer part is 112 is reported instead.

    :param sequence: ReceptorSequence to split into gapped k-mers
    :param k_left: passed through to KmerHelper.create_gapped_kmers_from_string
    :param max_gap: passed through to KmerHelper.create_gapped_kmers_from_string
    :param k_right: passed through to KmerHelper.create_gapped_kmers_from_string
    :param min_gap: passed through to KmerHelper.create_gapped_kmers_from_string
    :return: list of (k-mer string, position) tuples, or None if the helper returned None
    """
    residues = sequence.get_sequence()
    imgt_positions = PositionHelper.gen_imgt_positions_from_length(len(residues))
    residues_with_positions = list(zip(residues, imgt_positions))

    gapped = KmerHelper.create_gapped_kmers_from_string(residues_with_positions, k_left=k_left, max_gap=max_gap,
                                                        k_right=k_right, min_gap=min_gap)
    if gapped is None:
        return None

    annotated = []
    for kmer in gapped:
        # elements that are not tuples (presumably gap placeholders) are copied verbatim into the string
        text = ''.join([item[0] if isinstance(item, tuple) else item for item in kmer])

        # non-tuple elements carry no position; 1000 keeps them from winning the minimum
        lowest = min([item[1] if isinstance(item, tuple) else 1000 for item in kmer])
        if int(lowest) != 112:
            anchor = lowest
        else:
            anchor = max([item[1] for item in kmer if isinstance(item, tuple) and int(item[1]) == 112])

        annotated.append((text, anchor))
    return annotated
def create_IMGT_kmers_from_sequence(sequence: ReceptorSequence, k: int):
    """
    Creates continuous k-mers from a sequence and pairs each of them with an IMGT position.

    Every residue is first paired with its IMGT position, and the k-mers are built by
    KmerHelper.create_kmers_from_string from those pairs. For each k-mer the reported
    position is the smallest position among its residues, except when the integer part of
    that smallest position is 112: then the largest position whose integer part is 112 is
    reported instead.

    :param sequence: ReceptorSequence to split into k-mers
    :param k: k-mer length passed to KmerHelper.create_kmers_from_string
    :return: list of (k-mer string, position) tuples
    """
    residues = sequence.get_sequence()
    imgt_positions = PositionHelper.gen_imgt_positions_from_length(len(residues))
    residues_with_positions = list(zip(residues, imgt_positions))

    positioned_kmers = []
    for kmer in KmerHelper.create_kmers_from_string(residues_with_positions, k):
        letters = ''.join([item[0] for item in kmer])

        lowest = min([item[1] for item in kmer])
        if int(lowest) != 112:
            anchor = lowest
        else:
            anchor = max([item[1] for item in kmer if int(item[1]) == 112])

        positioned_kmers.append((letters, anchor))
    return positioned_kmers
def matches_sequence(self, original_sequence: ReceptorSequence, reference_sequence: ReceptorSequence, max_distance):
    """
    Decides whether a reference sequence counts as a match for the original sequence.

    The checks are evaluated in order and stop at the first one that fails, so the edit
    distance is only computed when chain and both genes already match.

    :param original_sequence: ReceptorSequence being looked up
    :param reference_sequence: ReceptorSequence it is compared against
    :param max_distance: largest value of edit_distance between the two sequence strings
        that is still considered a match
    :return: truthy if the chains are equal, self.matches_gene accepts both the v_gene and
        the j_gene pair, and the edit distance is at most max_distance
    """
    reference_metadata = reference_sequence.metadata
    original_metadata = original_sequence.metadata

    return (
        reference_metadata.chain == original_metadata.chain
        and self.matches_gene(reference_metadata.v_gene, original_metadata.v_gene)
        and self.matches_gene(reference_metadata.j_gene, original_metadata.j_gene)
        and edit_distance(original_sequence.get_sequence(), reference_sequence.get_sequence()) <= max_distance
    )
def create_gapped_kmers_from_sequence(sequence: ReceptorSequence, k_left: int, max_gap: int,
                                      k_right: int = None, min_gap: int = 0):
    """
    Creates gapped k-mers from the sequence string of a ReceptorSequence.

    Thin wrapper that extracts the sequence string and delegates to
    KmerHelper.create_gapped_kmers_from_string with the remaining arguments unchanged.

    :param sequence: ReceptorSequence whose sequence string is used
    :param k_left: forwarded to KmerHelper.create_gapped_kmers_from_string
    :param max_gap: forwarded to KmerHelper.create_gapped_kmers_from_string
    :param k_right: forwarded to KmerHelper.create_gapped_kmers_from_string
    :param min_gap: forwarded to KmerHelper.create_gapped_kmers_from_string
    :return: whatever KmerHelper.create_gapped_kmers_from_string returns
    """
    sequence_string = sequence.get_sequence()
    return KmerHelper.create_gapped_kmers_from_string(sequence_string, k_left, max_gap, k_right, min_gap)
def encode_sequence(sequence: ReceptorSequence, params: EncoderParams):
    """
    creates all overlapping gapped k-mers and IMGT position pairs from a sequence as features for use in
    KmerFrequencyEncoder. this gap length goes from min_gap to max_gap inclusive, and there is a k-mer of
    length k_left on the left side of the gap and a k-mer of length k_right on the right side of the gap.

    :param sequence: ReceptorSequence
    :param params: EncoderParams (within the "model", the following keys are used: "k_left", "k_right"
        (defaults to k_left), "max_gap", "min_gap" (defaults to 0))
    :return: list of feature strings, each one the k-mer and its IMGT position joined by
        Constants.FEATURE_DELIMITER; None if the sequence is too short or no k-mers could be created
    """
    k_left = params.model.get('k_left')
    k_right = params.model.get('k_right', k_left)
    max_gap = params.model.get('max_gap')
    min_gap = params.model.get('min_gap', 0)

    if len(sequence.get_sequence()) < k_left + k_right + max_gap:
        warnings.warn('Sequence length is less than k_left + k_right + max_gap. Ignoring sequence')
        return None

    gapped_kmers = KmerHelper.create_IMGT_gapped_kmers_from_sequence(sequence, k_left=k_left, max_gap=max_gap,
                                                                     min_gap=min_gap, k_right=k_right)

    # the helper may return None; treat that like an ignored sequence instead of
    # failing with a TypeError while iterating over None
    if gapped_kmers is None:
        return None

    return [Constants.FEATURE_DELIMITER.join([str(mer) for mer in kmer]) for kmer in gapped_kmers]
def test_get_sequence(self):
    """get_sequence() returns the amino acid sequence when the sequence type is set to AMINO_ACID."""
    receptor_sequence = ReceptorSequence(amino_acid_sequence="CAS", nucleotide_sequence="TGTGCTTCC")

    EnvironmentSettings.set_sequence_type(SequenceType.AMINO_ACID)
    observed = receptor_sequence.get_sequence()

    self.assertEqual(observed, "CAS")
def match_sequence(self, sequence: ReceptorSequence, reference_sequences: list, max_distance: int) -> dict:
    """
    Collects the reference sequences that match the given sequence.

    :param sequence: ReceptorSequence to look up
    :param reference_sequences: list of ReceptorSequence objects to compare against
    :param max_distance: distance threshold forwarded to self.matches_sequence
    :return: dict with the sequence strings of all matching references under
        "matching_sequences", plus the sequence string, v_gene, j_gene and chain of the
        looked-up sequence
    """
    matches = []
    for reference in reference_sequences:
        if self.matches_sequence(sequence, reference, max_distance):
            matches.append(reference.get_sequence())

    summary = {"matching_sequences": matches}
    summary["sequence"] = sequence.get_sequence()
    summary["v_gene"] = sequence.metadata.v_gene
    summary["j_gene"] = sequence.metadata.j_gene
    summary["chain"] = sequence.metadata.chain
    return summary
def test_implant_in_sequence(self):
    """Implanting the motif "CCC" keeps the sequence length and makes "CCC" appear in the result."""
    implanting = HealthySequenceImplanting(GappedMotifImplanting(),
                                           implanting_computation=ImplantingComputation.ROUND)
    motif = Motif("m1", GappedKmerInstantiation(), "CCC")
    signal = Signal("1", [motif], implanting)

    original = ReceptorSequence(amino_acid_sequence="ACDFQ")
    implanted = implanting.implant_in_sequence(original, signal)

    self.assertEqual(len(original.get_sequence()), len(implanted.get_sequence()))
    self.assertTrue("CCC" in implanted.get_sequence())
def test_encode_sequence(self):
    """
    Checks IMGTKmerSequenceEncoder.encode_sequence for a long sequence, a short sequence
    and a k larger than the sequence length (for which None is expected).
    """

    def encode(receptor_sequence, k):
        # fresh EncoderParams per call, exactly as each call site needs
        encoder_params = EncoderParams(model={"k": k}, label_config=LabelConfiguration(), result_path="")
        return IMGTKmerSequenceEncoder.encode_sequence(receptor_sequence, encoder_params)

    sequence = ReceptorSequence("CASSPRERATYEQCASSPRERATYEQCASSPRERATYEQ", None, None)
    result = encode(sequence, 3)

    expected_long = {
        'CAS///105', 'ASS///106', 'SSP///107', 'SPR///108', 'PRE///109', 'RER///110', 'ERA///111',
        'RAT///111.001', 'ATY///111.002', 'TYE///111.003', 'YEQ///111.004', 'EQC///111.005',
        'QCA///111.006', 'CAS///111.007', 'ASS///111.008', 'SSP///111.009', 'SPR///111.01',
        'PRE///111.011', 'RER///111.012', 'ERA///111.013', 'RAT///112.013', 'ATY///112.012',
        'TYE///112.011', 'YEQ///112.01', 'EQC///112.009', 'QCA///112.008', 'CAS///112.007',
        'ASS///112.006', 'SSP///112.005', 'SPR///112.004', 'PRE///112.003', 'RER///112.002',
        'ERA///112.001', 'RAT///112', 'ATY///113', 'TYE///114', 'YEQ///115',
    }
    self.assertEqual(expected_long, set(result))
    self.assertEqual(len(result), len(sequence.get_sequence()) - 3 + 1)

    sequence = ReceptorSequence("AHCDE", None, None)
    result = encode(sequence, 3)

    self.assertEqual({'AHC///105', 'HCD///106', 'CDE///107'}, set(result))
    self.assertEqual(len(result), len(sequence.get_sequence()) - 3 + 1)

    # k exceeds the length of "AHCDE"
    self.assertEqual(encode(sequence, 25), None)
def encode_sequence(sequence: ReceptorSequence, params: EncoderParams):
    """
    Splits a sequence into continuous k-mers to be used as features by KmerFrequencyEncoder.

    :param sequence: ReceptorSequence
    :param params: EncoderParams (where params.model["k"] is used as the k-mer length)
    :return: result of KmerHelper.create_kmers_from_sequence for the sequence and k, or None
        (after logging a warning) if the sequence is shorter than k
    """
    k = params.model["k"]

    if len(sequence.get_sequence()) >= k:
        return KmerHelper.create_kmers_from_sequence(sequence, k)

    logging.warning('KmerSequenceEncoder: Sequence length is less than k. Ignoring sequence...')
    return None
def encode_sequence(sequence: ReceptorSequence, params: EncoderParams):
    """
    Encodes a ReceptorSequence as one feature built from the sequence string and/or selected
    metadata fields, so that frequencies can be computed for whole sequences under a flexible
    definition of what makes a sequence unique.

    :param sequence: ReceptorSequence
    :param params: EncoderParams; params.model["sequence"] (default True) controls whether the
        sequence string is included, params.model["metadata_fields_to_include"] (default empty)
        lists the metadata attributes to append; "unknown" is used for every listed field when
        the sequence has no metadata
    :return: list containing a single feature: the selected parts joined by Constants.FEATURE_DELIMITER
    """
    parts = []
    if params.model.get("sequence", True):
        parts.append(sequence.get_sequence())

    parts.extend(
        "unknown" if sequence.metadata is None else getattr(sequence.metadata, field)
        for field in params.model.get("metadata_fields_to_include", [])
    )

    return [Constants.FEATURE_DELIMITER.join(parts)]
def get_formatted_node_metadata(self, seq: ReceptorSequence):
    """
    Builds the list of node metadata values for one receptor chain.

    The first five entries are: sequence, chain + v_gene up to the first "-", chain + v_gene,
    chain + j_gene up to the first "-", chain + j_gene. They are followed by one entry per
    name in self.additional_node_attributes; an attribute whose lookup raises KeyError is
    reported with a warning and represented by None.

    :param seq: ReceptorSequence providing get_attribute() and get_sequence()
    :return: list of metadata values in the order described above
    """
    chain = seq.get_attribute('chain').value
    v_gene = seq.get_attribute('v_gene')
    j_gene = seq.get_attribute('j_gene')

    extras = []
    for attr in self.additional_node_attributes:
        try:
            value = seq.get_attribute(attr)
        except KeyError:
            value = None
            warnings.warn(
                f"CytoscapeNetworkExporter: additional metadata attribute {attr} was not found for some receptor chain(s), "
                f"value None was used instead.")
        extras.append(value)

    # sequence, v_gene_subgroup, v_gene, j_gene_subgroup, j_gene
    row = [seq.get_sequence()]
    for gene in (v_gene, j_gene):
        row.append(f"{chain}{gene.split('-')[0]}")
        row.append(f"{chain}{gene}")

    return row + extras
def encode_sequence(sequence: ReceptorSequence, params: EncoderParams):
    """
    Splits a sequence into continuous k-mers annotated with IMGT positions, to be used as
    features by KmerFrequencyEncoder.

    :param sequence: ReceptorSequence
    :param params: EncoderParams (where params.model["k"] is used as the k-mer length)
    :return: list of feature strings, each one the k-mer and its IMGT position joined by
        Constants.FEATURE_DELIMITER, or None (after logging a warning) if the sequence is
        shorter than k
    """
    k = params.model["k"]

    if len(sequence.get_sequence()) < k:
        logging.warning('Sequence length is less than k. Ignoring sequence')
        return None

    features = []
    for kmer in KmerHelper.create_IMGT_kmers_from_sequence(sequence, k):
        features.append(Constants.FEATURE_DELIMITER.join(map(str, kmer)))
    return features
def create_kmers_from_sequence(sequence: ReceptorSequence, k: int, overlap: bool = True):
    """
    Creates k-mers from the sequence string of a ReceptorSequence.

    Thin wrapper that extracts the sequence string and delegates to
    KmerHelper.create_kmers_from_string.

    :param sequence: ReceptorSequence whose sequence string is used
    :param k: forwarded to KmerHelper.create_kmers_from_string
    :param overlap: forwarded to KmerHelper.create_kmers_from_string
    :return: whatever KmerHelper.create_kmers_from_string returns
    """
    sequence_string = sequence.get_sequence()
    return KmerHelper.create_kmers_from_string(sequence_string, k, overlap)
def gen_imgt_positions_from_sequence(sequence: ReceptorSequence):
    """
    Generates IMGT positions for a ReceptorSequence based on the length of its sequence string.

    :param sequence: ReceptorSequence whose sequence length is used
    :return: whatever PositionHelper.gen_imgt_positions_from_length returns for that length
    """
    return PositionHelper.gen_imgt_positions_from_length(len(sequence.get_sequence()))
def _build_imgt_positions(self, sequence: ReceptorSequence, motif_instance: MotifInstance):
    """
    Generates the IMGT positions for a sequence after checking that the motif instance fits into it.

    The space needed by the motif instance is its gap length plus the number of residues in
    its instance string; "/" characters in the instance string only separate the parts of
    the motif and are therefore not counted as residues.

    :param sequence: ReceptorSequence the motif instance should be implanted into
    :param motif_instance: MotifInstance exposing `gap` and `instance`
    :return: IMGT positions generated by PositionHelper.gen_imgt_positions_from_length for
        the sequence length
    :raises AssertionError: if the sequence is shorter than the motif instance including its gap
    """
    sequence_length = len(sequence.get_sequence())

    instance = motif_instance.instance
    # subtract only the separators that are really present, so that an instance without "/"
    # is not allowed to be one residue longer than the sequence
    required_length = motif_instance.gap + len(instance) - instance.count("/")

    # raised explicitly (instead of using an assert statement) so that the check is still
    # performed when Python runs with optimisations enabled; the exception type is kept
    if sequence_length < required_length:
        raise AssertionError(
            "The motif instance is longer than receptor_sequence length. Remove the receptor_sequence from the repertoire or reduce max gap length "
            "to be able to proceed. ")

    return PositionHelper.gen_imgt_positions_from_length(sequence_length)