Пример #1
0
    def _build_new_sequence(self, sequence: ReceptorSequence, position, signal: dict) -> ReceptorSequence:
        """
        Builds a new ReceptorSequence in which the motif instance of the signal overwrites the residues of the
        original sequence starting at the given position; the residues under the gap are kept as they are.

        :param sequence: ReceptorSequence the motif is implanted into (only read here, its metadata is deep-copied)
        :param position: index in the sequence string where the left part of the motif starts
        :param signal: dict with keys "signal_id", "motif_id" and "motif_instance"; the motif instance provides
                        `instance` (motif string, optionally split in two parts by "/") and `gap` (gap length)
        :return: new ReceptorSequence with the implanted sequence string, copied metadata and an annotation
                 describing the implant
        """
        motif_instance = signal["motif_instance"]
        gap_length = motif_instance.gap

        # "/" separates the part of the motif placed before the gap from the part placed after it
        if "/" in motif_instance.instance:
            left_part, right_part = motif_instance.instance.split("/")
        else:
            left_part, right_part = motif_instance.instance, ""

        original = sequence.get_sequence()
        gap_start = position + len(left_part)
        gap_end = gap_start + gap_length

        implanted = "".join([
            original[:position],                   # untouched prefix
            left_part,
            original[gap_start:gap_end],           # original residues under the gap
            right_part,
            original[gap_end + len(right_part):],  # untouched suffix
        ])

        annotation = SequenceAnnotation()
        annotation.add_implant(ImplantAnnotation(signal_id=signal["signal_id"],
                                                 motif_id=signal["motif_id"],
                                                 motif_instance=signal["motif_instance"],
                                                 position=position))

        implanted_sequence = ReceptorSequence()
        implanted_sequence.set_annotation(annotation)
        implanted_sequence.set_metadata(copy.deepcopy(sequence.metadata))
        implanted_sequence.set_sequence(implanted, EnvironmentSettings.get_sequence_type())

        return implanted_sequence
Пример #2
0
 def create_IMGT_gapped_kmers_from_sequence(sequence: ReceptorSequence,
                                            k_left: int,
                                            max_gap: int,
                                            k_right: int = None,
                                            min_gap: int = 0):
     """
     Creates gapped k-mers from the sequence and pairs each of them with one IMGT position.

     Every residue is first paired with its position (as generated by PositionHelper), and the gapped k-mers
     are built from these pairs by KmerHelper.create_gapped_kmers_from_string. Elements of a k-mer that are not
     tuples (presumably gap placeholders - verify against KmerHelper) are kept in the k-mer string but ignored
     when choosing the position.

     :param sequence: ReceptorSequence
     :param k_left: length of the k-mer on the left side of the gap
     :param max_gap: maximum gap length
     :param k_right: length of the k-mer on the right side of the gap
     :param min_gap: minimum gap length
     :return: list of (k-mer string, position) tuples, or None if KmerHelper returned None
     """
     residues = sequence.get_sequence()
     imgt_positions = PositionHelper.gen_imgt_positions_from_length(len(residues))
     residues_with_positions = list(zip(residues, imgt_positions))

     gapped_kmers = KmerHelper.create_gapped_kmers_from_string(residues_with_positions,
                                                               k_left=k_left,
                                                               max_gap=max_gap,
                                                               k_right=k_right,
                                                               min_gap=min_gap)
     if gapped_kmers is None:
         return None

     annotated_kmers = []
     for kmer in gapped_kmers:
         kmer_string = ''.join([element[0] if isinstance(element, tuple) else element for element in kmer])

         # non-tuple elements get a sentinel (1000) that is above every position, so they never win the minimum
         smallest = min([element[1] if isinstance(element, tuple) else 1000 for element in kmer])

         if int(smallest) != 112:
             position = smallest
         else:
             # the k-mer starts within the 112.x positions: report the largest of those instead of the smallest
             position = max([element[1] if isinstance(element, tuple) else 0
                             for element in kmer
                             if int(element[1] if isinstance(element, tuple) else 0) == 112])

         annotated_kmers.append((kmer_string, position))

     return annotated_kmers
Пример #3
0
 def create_IMGT_kmers_from_sequence(sequence: ReceptorSequence, k: int):
     """
     Creates continuous k-mers from the sequence and pairs each of them with one IMGT position.

     :param sequence: ReceptorSequence
     :param k: k-mer length, passed on to KmerHelper.create_kmers_from_string
     :return: list of (k-mer string, position) tuples; the position is the smallest position within the k-mer,
              unless that position is one of the 112.x positions, in which case the largest 112.x position of
              the k-mer is used
     """
     residues = sequence.get_sequence()
     imgt_positions = PositionHelper.gen_imgt_positions_from_length(len(residues))
     residues_with_positions = list(zip(residues, imgt_positions))

     annotated_kmers = []
     for kmer in KmerHelper.create_kmers_from_string(residues_with_positions, k):
         kmer_string = ''.join([pair[0] for pair in kmer])
         kmer_positions = [pair[1] for pair in kmer]

         smallest = min(kmer_positions)
         if int(smallest) != 112:
             position = smallest
         else:
             position = max([candidate for candidate in kmer_positions if int(candidate) == 112])

         annotated_kmers.append((kmer_string, position))

     return annotated_kmers
Пример #4
0
 def matches_sequence(self, original_sequence: ReceptorSequence, reference_sequence: ReceptorSequence, max_distance):
     """
     Decides whether the original sequence matches the reference sequence.

     :param original_sequence: ReceptorSequence to be checked
     :param reference_sequence: ReceptorSequence it is compared to
     :param max_distance: largest edit distance (as computed by edit_distance) between the two sequence strings
                          that still counts as a match
     :return: truthy value if the chains are equal, self.matches_gene accepts both the v_gene and the j_gene pair,
              and the edit distance is at most max_distance; falsy value otherwise
     """
     reference_metadata = reference_sequence.metadata
     original_metadata = original_sequence.metadata

     # the checks short-circuit, so the edit distance is only computed when chain and genes already match
     return (
         reference_metadata.chain == original_metadata.chain
         and self.matches_gene(reference_metadata.v_gene, original_metadata.v_gene)
         and self.matches_gene(reference_metadata.j_gene, original_metadata.j_gene)
         and edit_distance(original_sequence.get_sequence(), reference_sequence.get_sequence()) <= max_distance
     )
Пример #5
0
 def create_gapped_kmers_from_sequence(sequence: ReceptorSequence,
                                       k_left: int,
                                       max_gap: int,
                                       k_right: int = None,
                                       min_gap: int = 0):
     """
     Creates gapped k-mers from the sequence string of the given ReceptorSequence.

     :param sequence: ReceptorSequence
     :param k_left: length of the k-mer on the left side of the gap
     :param max_gap: maximum gap length
     :param k_right: length of the k-mer on the right side of the gap
     :param min_gap: minimum gap length
     :return: whatever KmerHelper.create_gapped_kmers_from_string returns for the sequence string
     """
     sequence_string = sequence.get_sequence()
     return KmerHelper.create_gapped_kmers_from_string(sequence_string, k_left, max_gap, k_right, min_gap)
Пример #6
0
    def encode_sequence(sequence: ReceptorSequence, params: EncoderParams):
        """
        Creates all overlapping gapped k-mers and IMGT position pairs from a sequence as features for use in
        KmerFrequencyEncoder. The gap length goes from min_gap to max_gap inclusive, and there is a k-mer of
        length k_left on the left side of the gap and a k-mer of length k_right on the right side of the gap.

        :param sequence: ReceptorSequence
        :param params: EncoderParams (within the "model", the following keys are used: "k_left", "k_right", "max_gap",
                        "min_gap"); "k_right" falls back to "k_left" when it is missing or None, "min_gap" to 0
        :return: list of strings, one per gapped k-mer, each made of the k-mer and its position joined by
                 Constants.FEATURE_DELIMITER; None if the sequence is shorter than k_left + k_right + max_gap or
                 if no gapped k-mers could be created
        """
        k_left = params.model.get('k_left')
        k_right = params.model.get('k_right', k_left)
        if k_right is None:
            # an explicit None means "not set", same as a missing key; without this the length check below fails
            k_right = k_left
        max_gap = params.model.get('max_gap')
        min_gap = params.model.get('min_gap', 0)

        if len(sequence.get_sequence()) < k_left + k_right + max_gap:
            warnings.warn(
                'Sequence length is less than k_left + k_right + max_gap. Ignoring sequence'
            )
            return None

        gapped_kmers = KmerHelper.create_IMGT_gapped_kmers_from_sequence(
            sequence,
            k_left=k_left,
            max_gap=max_gap,
            min_gap=min_gap,
            k_right=k_right)

        if gapped_kmers is None:
            # the helper signals "no k-mers" with None; treat it like an ignored sequence instead of iterating it
            return None

        return [
            Constants.FEATURE_DELIMITER.join(str(part) for part in kmer)
            for kmer in gapped_kmers
        ]
Пример #7
0
    def test_get_sequence(self):
        """get_sequence() returns the amino acid sequence once the sequence type is set to AMINO_ACID."""
        receptor_sequence = ReceptorSequence(amino_acid_sequence="CAS", nucleotide_sequence="TGTGCTTCC")
        EnvironmentSettings.set_sequence_type(SequenceType.AMINO_ACID)

        retrieved = receptor_sequence.get_sequence()

        self.assertEqual(retrieved, "CAS")
Пример #8
0
    def match_sequence(self, sequence: ReceptorSequence, reference_sequences: list, max_distance: int) -> dict:
        """
        Collects the reference sequences that match the given sequence.

        :param sequence: ReceptorSequence to look up
        :param reference_sequences: list of reference ReceptorSequence objects
        :param max_distance: maximum distance passed on to self.matches_sequence
        :return: dict with the matching reference sequence strings ("matching_sequences"), the sequence string
                 ("sequence") and the v_gene, j_gene and chain of the sequence's metadata
        """
        matching = []
        for reference in reference_sequences:
            if self.matches_sequence(sequence, reference, max_distance):
                matching.append(reference.get_sequence())

        summary = {
            "matching_sequences": matching,
            "sequence": sequence.get_sequence(),
        }
        for field in ("v_gene", "j_gene", "chain"):
            summary[field] = getattr(sequence.metadata, field)

        return summary
Пример #9
0
    def test_implant_in_sequence(self):
        """Implanting the motif "CCC" keeps the sequence length and puts the motif into the new sequence."""
        strategy = HealthySequenceImplanting(GappedMotifImplanting(),
                                             implanting_computation=ImplantingComputation.ROUND)
        motif = Motif("m1", GappedKmerInstantiation(), "CCC")
        signal = Signal("1", [motif], strategy)

        original = ReceptorSequence(amino_acid_sequence="ACDFQ")
        implanted = strategy.implant_in_sequence(original, signal)

        original_length = len(original.get_sequence())
        implanted_length = len(implanted.get_sequence())
        self.assertEqual(original_length, implanted_length)
        self.assertTrue("CCC" in implanted.get_sequence())
    def test_encode_sequence(self):
        """Checks IMGT k-mer encoding of a long and a short sequence, and of a sequence shorter than k."""

        def encode(receptor_sequence, k):
            # every case uses the same encoder parameters apart from k
            return IMGTKmerSequenceEncoder.encode_sequence(
                receptor_sequence,
                EncoderParams(model={"k": k}, label_config=LabelConfiguration(), result_path=""))

        long_sequence = ReceptorSequence("CASSPRERATYEQCASSPRERATYEQCASSPRERATYEQ", None, None)
        long_result = encode(long_sequence, 3)

        expected_long = {
            'CAS///105', 'ASS///106', 'SSP///107', 'SPR///108', 'PRE///109', 'RER///110', 'ERA///111',
            'RAT///111.001', 'ATY///111.002', 'TYE///111.003', 'YEQ///111.004', 'EQC///111.005',
            'QCA///111.006', 'CAS///111.007', 'ASS///111.008', 'SSP///111.009', 'SPR///111.01',
            'PRE///111.011', 'RER///111.012', 'ERA///111.013', 'RAT///112.013', 'ATY///112.012',
            'TYE///112.011', 'YEQ///112.01', 'EQC///112.009', 'QCA///112.008', 'CAS///112.007',
            'ASS///112.006', 'SSP///112.005', 'SPR///112.004', 'PRE///112.003', 'RER///112.002',
            'ERA///112.001', 'RAT///112', 'ATY///113', 'TYE///114', 'YEQ///115',
        }
        self.assertEqual(expected_long, set(long_result))
        # one k-mer per start position
        self.assertEqual(len(long_result), len(long_sequence.get_sequence()) - 3 + 1)

        short_sequence = ReceptorSequence("AHCDE", None, None)
        short_result = encode(short_sequence, 3)

        expected_short = {'AHC///105', 'HCD///106', 'CDE///107'}
        self.assertEqual(expected_short, set(short_result))
        self.assertEqual(len(short_result), len(short_sequence.get_sequence()) - 3 + 1)

        # k larger than the sequence length: the encoder returns None
        self.assertIsNone(encode(short_sequence, 25))
Пример #11
0
    def encode_sequence(sequence: ReceptorSequence, params: EncoderParams):
        """
        Creates overlapping continuous k-mers from a sequence as features for use in KmerFrequencyEncoder.

        :param sequence: ReceptorSequence
        :param params: EncoderParams, same object as passed into KmerFrequencyEncoder (params.model["k"] is used)
        :return: k-mers as returned by KmerHelper.create_kmers_from_sequence, or None if the sequence is
                 shorter than k
        """
        k = params.model["k"]

        if len(sequence.get_sequence()) >= k:
            return KmerHelper.create_kmers_from_sequence(sequence, k)

        # too short to produce even one k-mer
        logging.warning(
            'KmerSequenceEncoder: Sequence length is less than k. Ignoring sequence...'
        )
        return None
Пример #12
0
    def encode_sequence(sequence: ReceptorSequence, params: EncoderParams):
        """
        Encodes a ReceptorSequence as one feature built from the sequence string and/or selected metadata fields,
        which allows a flexible definition of what a unique whole sequence is.

        :param sequence: ReceptorSequence
        :param params: EncoderParams; params.model["sequence"] (default True) decides whether the sequence string
                        is included, params.model["metadata_fields_to_include"] (default: no fields) lists the
                        metadata attributes that are appended
        :return: list with a single feature: the selected parts joined by Constants.FEATURE_DELIMITER
        """
        parts = [sequence.get_sequence()] if params.model.get("sequence", True) else []

        # without metadata every requested field is reported as "unknown"
        parts.extend(
            "unknown" if sequence.metadata is None else getattr(sequence.metadata, field_name)
            for field_name in params.model.get("metadata_fields_to_include", [])
        )

        feature = Constants.FEATURE_DELIMITER.join(parts)
        return [feature]
Пример #13
0
    def get_formatted_node_metadata(self, seq: ReceptorSequence):
        """
        Builds the list of node metadata values for one receptor chain.

        :param seq: ReceptorSequence providing the attributes 'chain', 'v_gene' and 'j_gene'
        :return: list with the sequence, the chain-prefixed part of v_gene before the first '-', the chain-prefixed
                 v_gene, the same two values for j_gene, followed by one value per attribute in
                 self.additional_node_attributes (None if the attribute was not found)
        """
        chain = seq.get_attribute('chain').value
        v_gene = seq.get_attribute('v_gene')
        j_gene = seq.get_attribute('j_gene')

        extra_values = []
        for attr in self.additional_node_attributes:
            try:
                value = seq.get_attribute(attr)
            except KeyError:
                # missing attributes are exported as None, with a warning
                value = None
                warnings.warn(
                    f"CytoscapeNetworkExporter: additional metadata attribute {attr} was not found for some receptor chain(s), "
                    f"value None was used instead.")
            extra_values.append(value)

        sequence_string = seq.get_sequence()
        v_subgroup = v_gene.split('-')[0]
        j_subgroup = j_gene.split('-')[0]

        # order: sequence, v_gene_subgroup, v_gene, j_gene_subgroup, j_gene
        node_metadata = [
            sequence_string,
            f"{chain}{v_subgroup}",
            f"{chain}{v_gene}",
            f"{chain}{j_subgroup}",
            f"{chain}{j_gene}",
        ]
        return node_metadata + extra_values
Пример #14
0
    def encode_sequence(sequence: ReceptorSequence, params: EncoderParams):
        """
        Creates overlapping continuous k-mers with their IMGT positions from a sequence as features for use in
        KmerFrequencyEncoder.

        :param sequence: ReceptorSequence
        :param params: EncoderParams, same object as passed into KmerFrequencyEncoder (params.model["k"] is used)
        :return: list of strings, one per k-mer, each made of the k-mer and its position joined by
                 Constants.FEATURE_DELIMITER; None if the sequence is shorter than k
        """
        k = params.model["k"]

        if len(sequence.get_sequence()) < k:
            logging.warning(
                'Sequence length is less than k. Ignoring sequence')
            return None

        features = []
        for kmer in KmerHelper.create_IMGT_kmers_from_sequence(sequence, k):
            # each k-mer is a (k-mer string, position) pair; the position is stringified before joining
            features.append(Constants.FEATURE_DELIMITER.join([str(part) for part in kmer]))

        return features
Пример #15
0
 def create_kmers_from_sequence(sequence: ReceptorSequence,
                                k: int,
                                overlap: bool = True):
     """
     Creates k-mers from the sequence string of the given ReceptorSequence.

     :param sequence: ReceptorSequence
     :param k: k-mer length
     :param overlap: passed on unchanged to KmerHelper.create_kmers_from_string
     :return: whatever KmerHelper.create_kmers_from_string returns for the sequence string
     """
     sequence_string = sequence.get_sequence()
     return KmerHelper.create_kmers_from_string(sequence_string, k, overlap)
Пример #16
0
 def gen_imgt_positions_from_sequence(sequence: ReceptorSequence):
     """
     Generates the IMGT positions for a sequence, based only on the length of its sequence string.

     :param sequence: ReceptorSequence
     :return: positions as returned by PositionHelper.gen_imgt_positions_from_length
     """
     return PositionHelper.gen_imgt_positions_from_length(len(sequence.get_sequence()))
Пример #17
0
 def _build_imgt_positions(self, sequence: ReceptorSequence, motif_instance: MotifInstance):
     """
     Generates the IMGT positions for the sequence after checking that the motif instance fits into it.

     :param sequence: ReceptorSequence the motif instance will be implanted into
     :param motif_instance: MotifInstance with `instance` (motif string, where "/" marks the place of the gap)
                            and `gap` (gap length)
     :return: positions as returned by PositionHelper.gen_imgt_positions_from_length for the sequence length
     :raises AssertionError: if the sequence is shorter than the motif residues plus the gap
     """
     length = len(sequence.get_sequence())

     # "/" only marks where the gap goes and takes no room in the sequence; an instance without a gap
     # separator has nothing to subtract, so count the separators instead of always subtracting one
     motif_residue_count = len(motif_instance.instance) - motif_instance.instance.count("/")

     # raised explicitly (instead of using an assert statement) so the check is not stripped under python -O,
     # while callers still get the same exception type
     if length < motif_instance.gap + motif_residue_count:
         raise AssertionError(
             "The motif instance is longer than receptor_sequence length. Remove the receptor_sequence from the repertoire or reduce max gap length "
             "to be able to proceed. ")

     return PositionHelper.gen_imgt_positions_from_length(length)