Exemplo n.º 1
0
    def parse_mhc_allele(self,
                         allele: str,
                         pattern=H2_ALLELE_PATTERN) -> MhcAllele:
        """Parse an H2 allele name into a normalized ``MhcAllele``.

        The input is first tried against ``H2_NETMHCPAN_ALLELE_PATTERN``; on a
        match it is rewritten as ``H2<gene><protein>`` before being matched
        against ``pattern``.

        :param allele: the allele name to parse
        :param pattern: compiled pattern whose groups 1 and 2 hold the gene
            and the protein; defaults to ``H2_ALLELE_PATTERN``
        :return: an ``MhcAllele`` with ``gene``, ``protein``, ``name`` and
            ``full_name`` set
        :raises NeofoxDataValidationException: if the allele does not match
            ``pattern`` (with a dedicated message for an empty allele)
        """
        netmhcpan_match = H2_NETMHCPAN_ALLELE_PATTERN.match(allele)
        if netmhcpan_match:
            # this ensures that netmhcpan output is normalized
            allele = "H2{gene}{protein}".format(
                gene=netmhcpan_match.group(1),
                protein=netmhcpan_match.group(2))

        # uses the pattern received as argument instead of silently ignoring it
        match = pattern.match(allele)
        if match is None:
            if allele != "":
                message = "Allele does not match H2 allele pattern {}".format(
                    allele)
            else:
                message = "Please check the format of provided alleles. An empty allele is provided"
            raise NeofoxDataValidationException(message)

        gene = match.group(1)
        protein = match.group(2)

        # controls for existence in the H2 database and warns the user;
        # an unknown allele is not an error
        mhc_allele = MhcAllele(gene=gene, protein=protein)
        if not self.mhc_database.exists(mhc_allele):
            logger.warning(
                "Allele {} does not exist in the H2 database".format(allele))

        # builds a normalized representation of the allele
        name = "{gene}{protein}".format(gene=gene, protein=protein)

        # full name is the same as name in this case as the pattern does not allow variability
        mhc_allele.name = name
        mhc_allele.full_name = name
        return mhc_allele
Exemplo n.º 2
0
 def _get_mhc2_isoforms(isoform_name: Mhc2Name,
                        genes: List[Mhc2Gene]) -> List[Mhc2Isoform]:
     """Build the MHC II isoforms of one molecule from its genes.

     :param isoform_name: the MHC II molecule to build isoforms for
     :param genes: the genes providing the alleles of that molecule
     :return: the list of isoforms; empty when ``isoform_name`` is none of
         DR, DP, DQ, H2A_molecule or H2E_molecule
     :raises AssertionError: if more genes are provided than the molecule
         allows (one for DR, two for every other molecule)
     """

     def alleles_of(gene_name):
         # flattens the alleles of every provided gene with the given name
         collected = []
         for gene in genes:
             if gene.name == gene_name:
                 collected.extend(gene.alleles)
         return collected

     def paired(alpha_gene_name, beta_gene_name):
         # every alpha allele is combined with every beta allele
         alphas = alleles_of(alpha_gene_name)
         betas = alleles_of(beta_gene_name)
         combined = []
         for alpha in alphas:
             for beta in betas:
                 combined.append(
                     Mhc2Isoform(name=get_mhc2_isoform_name(alpha, beta),
                                 alpha_chain=alpha,
                                 beta_chain=beta))
         return combined

     def unpaired(gene_name):
         # the single allele is stored as alpha chain, beta chain stays empty
         return [
             Mhc2Isoform(name=allele.name,
                         alpha_chain=allele,
                         beta_chain=MhcAllele())
             for allele in alleles_of(gene_name)
         ]

     if isoform_name == Mhc2Name.DR:
         assert len(genes) <= 1, "More than one gene provided for MHC II DR"
         # alpha chain of the MHC II DR is not modelled as it is constant
         dr_isoforms = []
         for gene in genes:
             for allele in gene.alleles:
                 dr_isoforms.append(
                     Mhc2Isoform(name=allele.name,
                                 alpha_chain=MhcAllele(),
                                 beta_chain=allele))
         return dr_isoforms

     if isoform_name == Mhc2Name.DP:
         assert len(
             genes) <= 2, "More than two genes provided for MHC II DP"
         return paired(Mhc2GeneName.DPA1, Mhc2GeneName.DPB1)

     if isoform_name == Mhc2Name.DQ:
         assert len(
             genes) <= 2, "More than two genes provided for MHC II DQ"
         return paired(Mhc2GeneName.DQA1, Mhc2GeneName.DQB1)

     # mouse MHC II molecules do not act as pairs
     if isoform_name == Mhc2Name.H2A_molecule:
         assert len(genes) <= 2, "More than two genes provided for H2A"
         return unpaired(Mhc2GeneName.H2A)

     if isoform_name == Mhc2Name.H2E_molecule:
         assert len(genes) <= 2, "More than two genes provided for H2E"
         return unpaired(Mhc2GeneName.H2E)

     return []
Exemplo n.º 3
0
 def test_immunogenicity(self):
     """Check the IEDB immunogenicity on both sides of the affinity threshold.

     With a threshold of 500 the test expects a positive value for an MHC
     score of 400 and ``None`` for an MHC score of 600.
     """
     predictor = IEDBimmunogenicity(affinity_threshold=500)

     def immunogenicity_for(mhc_score):
         # same peptide and allele for both calls, only the score varies
         return predictor.calculate_iedb_immunogenicity(
             peptide="ENPVVHFF",
             mhc_allele=MhcAllele(name="HLA-A*68:01"),
             mhc_score=mhc_score)

     self.assertGreater(immunogenicity_for(400), 0)
     self.assertIsNone(immunogenicity_for(600))
Exemplo n.º 4
0
    def parse_mhc_allele(self, allele: str) -> MhcAllele:
        """Parse an HLA allele name into a normalized ``MhcAllele``.

        ``name`` is set to ``HLA-<gene>*<group>:<protein>``; ``full_name``
        extends it with the further fields captured by the pattern
        (groups 4, 5 and 6) when they are present.

        :param allele: the allele name to parse
        :return: an ``MhcAllele`` with ``gene``, ``group``, ``protein``,
            ``name`` and ``full_name`` set
        :raises NeofoxDataValidationException: if the allele matches neither
            HLA pattern (with a dedicated message for an empty allele)
        """
        match = HLA_ALLELE_PATTERN_WITHOUT_SEPARATOR.match(allele)
        if match is None:
            # infers gene, group and protein from the name
            match = HLA_ALLELE_PATTERN.match(allele)
            if match is None:
                if allele != "":
                    message = "Allele does not match HLA allele pattern {}".format(
                        allele)
                else:
                    message = "Please check the format of provided alleles. An empty allele is provided"
                raise NeofoxDataValidationException(message)
            gene, group, protein = match.group(1, 2, 3)
        else:
            # allele without separator, controls for ambiguities
            gene, group, protein = match.group(1, 2, 3)
            candidate = MhcAllele(gene=gene, group=group, protein=protein)
            if not self.mhc_database.exists(candidate):
                # if default allele does not exist, tries alternative: the
                # last character of the group moves to the front of the protein
                group, protein = group[0:-1], group[-1:] + protein

        # controls for existence in the HLA database and warns the user
        mhc_allele = MhcAllele(gene=gene, group=group, protein=protein)
        if not self.mhc_database.exists(mhc_allele):
            logger.warning(
                "Allele {} does not exist in the HLA database".format(allele))

        # builds a normalized representation of the allele
        name = "HLA-{gene}*{serotype}:{protein}".format(gene=gene,
                                                        serotype=group,
                                                        protein=protein)

        # ensures that full name stores the complete allele as provided but
        # normalizes its representation; each extra field is only considered
        # when the previous one is present
        full_name = name
        six_digits_id = match.group(4)
        if six_digits_id:
            full_name = "{}:{}".format(full_name, six_digits_id)
            eight_digits_id = match.group(5)
            if eight_digits_id:
                full_name = "{}:{}".format(full_name, eight_digits_id)
                expression_change = match.group(6)
                if expression_change:
                    full_name += expression_change

        mhc_allele.name = name
        mhc_allele.full_name = full_name
        return mhc_allele
 def test_exists(self):
     """Check ``FakeHlaDatabase.exists`` on known, unknown and malformed alleles."""
     hla_database = FakeHlaDatabase()
     expectations = [
         (dict(gene="DPB1", group="104", protein="01"), True),
         (dict(gene="DPB1", group="10", protein="401"), False),
         (dict(gene="B", group="15", protein="228"), True),
         (dict(gene="B", group="152", protein="28"), False),
         # badly formed HLA alleles
         (dict(gene="B", group="15", protein=None), False),
         (dict(gene="B", group=None, protein="228"), False),
         (dict(gene=None, group="15", protein="228"), False),
         (dict(gene="Z", group="15", protein="228"), False),
     ]
     for fields, should_exist in expectations:
         found = hla_database.exists(MhcAllele(**fields))
         if should_exist:
             self.assertTrue(found)
         else:
             self.assertFalse(found)
Exemplo n.º 6
0
 def test_mhc_ii_allele_parsing(self):
     """Check that MHC II allele names are parsed to their normalized form."""
     # (expected normalized name, raw input handed to the parser)
     parsing_cases = [
         # adds the star
         ("HLA-DPB1*01:01", "HLA-DPB101:01"),
         # adds the HLA-
         ("HLA-DPB1*01:01", "DPB1*01:01"),
         # adds the colon to homogenise representation
         ("HLA-DPA1*01:01", "HLA-DPA101:01"),
         # does not remove the star
         ("HLA-DPA1*01:01", "HLA-DPA1*01:01"),
         # removes further information
         ("HLA-DPA1*01:01", "HLA-DPA101:01:02:03N"),
         ("HLA-DPA1*01:01", "HLA-DPA101:01:02N"),
         ("HLA-DPB1*01:01", "HLA-DPB101:01"),
     ]
     for expected_name, raw_allele in parsing_cases:
         parsed = self.mhc_parser.parse_mhc_allele(raw_allele)
         self._assert_allele_parsing(expected=expected_name, allele=parsed)

     # an allele built directly, without going through the parser
     prebuilt = MhcAllele(gene="DPA1",
                          group="01",
                          protein="01",
                          name="HLA-DPA1*01:01")
     self._assert_allele_parsing(expected="HLA-DPA1*01:01", allele=prebuilt)
Exemplo n.º 7
0
 def test_mhc_i_allele_parsing(self):
     """Check that MHC I allele names are parsed to their normalized form."""
     normalized = "HLA-A*01:01"
     raw_alleles = [
         # adds the star
         "HLA-A01:01",
         # adds the HLA-
         "A01:01",
         # adds the colon to homogenise representation
         "HLA-A01:01",
         # does not modify an originally good representation
         "HLA-A*01:01",
         # removes further information
         "HLA-A01:01:02:03N",
         "HLA-A01:01:02N",
         "HLA-A01:01N",
     ]
     for raw_allele in raw_alleles:
         parsed = self.mhc_parser.parse_mhc_allele(raw_allele)
         self._assert_allele_parsing(expected="HLA-A*01:01", allele=parsed)

     # an allele built directly, without going through the parser
     prebuilt = MhcAllele(gene="A",
                          group="01",
                          protein="01",
                          name="HLA-A*01:01")
     self._assert_allele_parsing(expected=normalized, allele=prebuilt)
Exemplo n.º 8
0
 def _get_empty_epitope():
     """Return a ``PredictedEpitope`` placeholder with every field unset.

     The ``hla`` field holds an ``MhcAllele`` whose name is ``None``; all
     other fields are ``None``.
     """
     unnamed_allele = MhcAllele(name=None)
     empty_epitope = PredictedEpitope(peptide=None,
                                      pos=None,
                                      hla=unnamed_allele,
                                      affinity_score=None,
                                      rank=None)
     return empty_epitope
Exemplo n.º 9
0
 def parse_mhc2_isoform(self, isoform: str) -> Mhc2Isoform:
     """Parse an MHC II isoform name into an ``Mhc2Isoform``.

     A leading ``HLA-`` is dropped. Names containing ``DQA`` or ``DPA`` are
     split on ``-`` into an alpha and a beta allele; any other name is parsed
     as a beta chain and the alpha chain is left as an empty ``MhcAllele``.

     :param isoform: the isoform name to parse
     :return: the isoform with its alpha chain, beta chain and name
     :raises IndexError: if a ``DQA``/``DPA`` name holds no ``-`` separator
     """
     # TODO: this method currently fails for netmhc2pan alleles which are like 'HLA-DQA10509-DQB10630'
     # infers gene, group and protein from the name

     # str.strip("HLA-") would treat its argument as a set of characters and
     # strip them from BOTH ends, eating a trailing 'A', 'L', 'H' or '-';
     # only the literal prefix must be removed
     prefix = "HLA-"
     if isoform.startswith(prefix):
         isoform = isoform[len(prefix):]

     if "DQA" in isoform or "DPA" in isoform:
         chain_names = isoform.split("-")
         alpha_chain = self.mhc_parser.parse_mhc_allele(chain_names[0])
         beta_chain = self.mhc_parser.parse_mhc_allele(chain_names[1])
     else:
         alpha_chain = MhcAllele()
         beta_chain = self.mhc_parser.parse_mhc_allele(isoform)
     # builds the final allele representation and validates it just in case
     name = get_mhc2_isoform_name(alpha_chain, beta_chain)
     return Mhc2Isoform(name=name, alpha_chain=alpha_chain, beta_chain=beta_chain)
 def test_exists(self):
     """Check ``FakeH2Database.exists`` on known, unknown and malformed alleles."""
     h2_database = FakeH2Database()
     expectations = [
         (dict(gene="H2K", protein="p"), True),
         (dict(gene="H2K", protein="x"), False),
         (dict(gene="H2L", protein="f"), True),
         # badly formed H2 alleles
         (dict(gene="H2K", group="p", protein=None), False),
         (dict(gene="H2K", group=None, protein=None), False),
         (dict(gene=None, protein="p"), False),
         (dict(gene="Z", group="15", protein="228"), False),
     ]
     for fields, should_exist in expectations:
         found = h2_database.exists(MhcAllele(**fields))
         if should_exist:
             self.assertTrue(found)
         else:
             self.assertFalse(found)
Exemplo n.º 11
0
 def parse_mhc2_isoform(self, isoform: str) -> Mhc2Isoform:
     """Parse an MHC II isoform name into an ``Mhc2Isoform``.

     The name is first tried against ``HLA_MOLECULE_PATTERN`` (groups 1 and 2
     give the alpha and beta alleles) and otherwise against
     ``HLA_DR_MOLECULE_PATTERN`` (group 1 gives the beta allele, the alpha
     chain is left as an empty ``MhcAllele``).

     :param isoform: the isoform name to parse
     :return: the isoform with its alpha chain, beta chain and name
     :raises AssertionError: if the name matches neither pattern
     """
     # TODO: this method currently fails for netmhc2pan alleles which are like 'HLA-DQA10509-DQB10630'
     # infers gene, group and protein from the name
     paired_match = HLA_MOLECULE_PATTERN.match(isoform)
     if not paired_match:
         dr_match = HLA_DR_MOLECULE_PATTERN.match(isoform)
         assert (
             dr_match is not None
         ), "Molecule does not match HLA isoform pattern {}".format(isoform)
         alpha_chain = MhcAllele()
         beta_chain = self.parse_mhc_allele(dr_match.group(1))
     else:
         alpha_chain = self.parse_mhc_allele(paired_match.group(1))
         beta_chain = self.parse_mhc_allele(paired_match.group(2))

     # builds the final allele representation and validates it just in case
     isoform_label = get_mhc2_isoform_name(alpha_chain, beta_chain)
     return Mhc2Isoform(name=isoform_label,
                        alpha_chain=alpha_chain,
                        beta_chain=beta_chain)
Exemplo n.º 12
0
    def validate_mhc_allele_representation(allele: MhcAllele, organism: str):
        """Validate the fields of an MHC allele for the given organism.

        Checks, in order: the allele name against the organism's allele
        pattern, the gene against the organism's MHC I and MHC II genes, the
        type of ``protein`` and the organism-specific rule on ``group``
        (a string for Homo sapiens, ``None`` or empty for Mus musculus).

        :param allele: the allele to validate
        :param organism: key into the per-organism pattern and gene tables
        :raises NeofoxDataValidationException: if the organism is not
            supported or if any check fails; on a failed check the allele is
            logged as JSON first
        """
        allele_pattern = ALLELE_PATTERN_BY_ORGANISM.get(organism)
        mhc_i_genes = MHC_I_GENES_BY_ORGANISM.get(organism)
        mhc_ii_genes = MHC_II_GENES_BY_ORGANISM.get(organism)
        if allele_pattern is None or mhc_i_genes is None or mhc_ii_genes is None:
            # an organism missing from the lookup tables must be reported as
            # such instead of failing later with a TypeError on None
            raise NeofoxDataValidationException("Not supported organism {}".format(organism))

        try:
            valid_genes = [g.name for g in mhc_i_genes + mhc_ii_genes]

            # AssertionError is raised explicitly rather than through assert
            # statements so that validation still runs under ``python -O``
            if allele_pattern.match(allele.name) is None:
                raise AssertionError(
                    "Allele name does not match expected pattern: {}".format(allele.name))
            if allele.gene not in valid_genes:
                raise AssertionError("MHC gene {} not from classic MHC for organism {}".format(
                    allele.gene, organism))
            if not isinstance(allele.protein, str):
                raise AssertionError(
                    "The field protein in MHC allele model has the value {} and wrong type but must be a character "
                    "instead of {}".format(allele.protein, type(allele.protein)))
            if organism == ORGANISM_HOMO_SAPIENS:
                if not isinstance(allele.group, str):
                    raise AssertionError(
                        "The field group in MHC allele model has the value {} and wrong type but must be a character "
                        "instead of {}".format(allele.group, type(allele.group)))
            elif organism == ORGANISM_MUS_MUSCULUS:
                if not (allele.group is None or allele.group == ""):
                    raise AssertionError("Provided group for H2 allele")
            else:
                raise NeofoxDataValidationException("Not supported organism {}".format(organism))
        except AssertionError as e:
            logger.error(allele.to_json(indent=3))
            raise NeofoxDataValidationException(e) from e