def parse_mhc_allele(self, allele: str, pattern=H2_ALLELE_PATTERN) -> MhcAllele:
    """
    Parses the name of an H2 allele into a normalized MhcAllele.

    Names matching H2_NETMHCPAN_ALLELE_PATTERN are first rewritten as "H2<gene><protein>";
    the resulting name must then match H2_ALLELE_PATTERN. An allele missing from
    self.mhc_database only triggers a warning, it is still returned.

    :param allele: the allele name to parse
    :param pattern: NOTE(review): accepted but never read in this method, matching always
        relies on H2_ALLELE_PATTERN -- confirm whether callers expect it to be honoured
    :return: the allele with gene, protein, name and full_name set
    :raises NeofoxDataValidationException: when the name does not match H2_ALLELE_PATTERN
    """
    netmhcpan_match = H2_NETMHCPAN_ALLELE_PATTERN.match(allele)
    if netmhcpan_match:
        # rewrites the netmhcpan flavoured name so that the regular pattern applies below
        allele = "H2{gene}{protein}".format(
            gene=netmhcpan_match.group(1), protein=netmhcpan_match.group(2))

    h2_match = H2_ALLELE_PATTERN.match(allele)
    if h2_match is None:
        if allele != "":
            message = "Allele does not match H2 allele pattern {}".format(allele)
        else:
            message = "Please check the format of provided alleles. An empty allele is provided"
        raise NeofoxDataValidationException(message)

    gene, protein = h2_match.group(1), h2_match.group(2)
    parsed_allele = MhcAllele(gene=gene, protein=protein)

    # an allele unknown to the H2 database is reported to the user but not rejected
    if not self.mhc_database.exists(parsed_allele):
        logger.warning("Allele {} does not exist in the H2 database".format(allele))

    # the pattern leaves no room for variability, hence name and full name are identical
    normalized_name = "{gene}{protein}".format(gene=gene, protein=protein)
    parsed_allele.name = normalized_name
    parsed_allele.full_name = normalized_name
    return parsed_allele
def _get_mhc2_isoforms(isoform_name: Mhc2Name, genes: List[Mhc2Gene]) -> List[Mhc2Isoform]:
    """
    Builds the isoforms of one MHC II molecule from the alleles of its genes.

    - DR: one isoform per allele, set as beta chain with an empty alpha chain
    - DP and DQ: one isoform per (alpha allele, beta allele) combination
    - H2A and H2E molecules: one isoform per allele, set as alpha chain with an empty beta chain

    :param isoform_name: the MHC II molecule for which isoforms are built
    :param genes: the genes providing the alleles
    :return: the list of isoforms, empty for any other molecule name
    :raises AssertionError: when more genes are provided than the molecule allows
    """

    def alleles_of(gene_name):
        # flattens the alleles of every provided gene carrying the given name, keeping input order
        return [allele for gene in genes if gene.name == gene_name for allele in gene.alleles]

    def alpha_beta_combinations(alpha_gene_name, beta_gene_name):
        # every alpha allele is combined with every beta allele, alpha being the outer loop
        alpha_alleles = alleles_of(alpha_gene_name)
        beta_alleles = alleles_of(beta_gene_name)
        combinations = []
        for alpha in alpha_alleles:
            for beta in beta_alleles:
                combinations.append(
                    Mhc2Isoform(name=get_mhc2_isoform_name(alpha, beta), alpha_chain=alpha, beta_chain=beta))
        return combinations

    if isoform_name == Mhc2Name.DR:
        assert len(genes) <= 1, "More than one gene provided for MHC II DR"
        # the alpha chain of the MHC II DR is constant and hence not modelled
        dr_isoforms = []
        for gene in genes:
            for beta in gene.alleles:
                dr_isoforms.append(Mhc2Isoform(name=beta.name, alpha_chain=MhcAllele(), beta_chain=beta))
        return dr_isoforms

    if isoform_name == Mhc2Name.DP:
        assert len(
            genes) <= 2, "More than two genes provided for MHC II DP"
        return alpha_beta_combinations(Mhc2GeneName.DPA1, Mhc2GeneName.DPB1)

    if isoform_name == Mhc2Name.DQ:
        assert len(
            genes) <= 2, "More than two genes provided for MHC II DQ"
        return alpha_beta_combinations(Mhc2GeneName.DQA1, Mhc2GeneName.DQB1)

    # mouse MHC II molecules are not modelled as pairs, only the alpha chain is filled in
    if isoform_name == Mhc2Name.H2A_molecule:
        assert len(genes) <= 2, "More than two genes provided for H2A"
        h2a_isoforms = []
        for gene in genes:
            if gene.name == Mhc2GeneName.H2A:
                for alpha in gene.alleles:
                    h2a_isoforms.append(
                        Mhc2Isoform(name=alpha.name, alpha_chain=alpha, beta_chain=MhcAllele()))
        return h2a_isoforms

    if isoform_name == Mhc2Name.H2E_molecule:
        assert len(genes) <= 2, "More than two genes provided for H2E"
        h2e_isoforms = []
        for gene in genes:
            if gene.name == Mhc2GeneName.H2E:
                for alpha in gene.alleles:
                    h2e_isoforms.append(
                        Mhc2Isoform(name=alpha.name, alpha_chain=alpha, beta_chain=MhcAllele()))
        return h2e_isoforms

    return []
def test_immunogenicity(self):
    """
    Checks IEDBimmunogenicity built with an affinity threshold of 500: an MHC score of 400
    must give a positive result while an MHC score of 600 must give None.
    """
    predictor = IEDBimmunogenicity(affinity_threshold=500)
    peptide = "ENPVVHFF"
    allele_name = "HLA-A*68:01"

    # MHC score under the threshold
    result_under_threshold = predictor.calculate_iedb_immunogenicity(
        peptide=peptide, mhc_allele=MhcAllele(name=allele_name), mhc_score=400)
    self.assertGreater(result_under_threshold, 0)

    # MHC score over the threshold
    result_over_threshold = predictor.calculate_iedb_immunogenicity(
        peptide=peptide, mhc_allele=MhcAllele(name=allele_name), mhc_score=600)
    self.assertIsNone(result_over_threshold)
def parse_mhc_allele(self, allele: str) -> MhcAllele:
    """
    Parses the name of an HLA allele into a normalized MhcAllele.

    The name is first matched against HLA_ALLELE_PATTERN_WITHOUT_SEPARATOR and, failing that,
    against HLA_ALLELE_PATTERN. The returned allele has name "HLA-<gene>*<group>:<protein>"
    and a full_name that adds the optional fields captured in groups 4 to 6 of the match.
    An allele missing from self.mhc_database only triggers a warning, it is still returned.

    :param allele: the allele name to parse
    :return: the allele with gene, group, protein, name and full_name set
    :raises NeofoxDataValidationException: when the name matches neither pattern
    """
    match = HLA_ALLELE_PATTERN_WITHOUT_SEPARATOR.match(allele)
    if match is None:
        # falls back to the regular pattern to infer gene, group and protein
        match = HLA_ALLELE_PATTERN.match(allele)
        if match is None:
            if allele != "":
                message = "Allele does not match HLA allele pattern {}".format(allele)
            else:
                message = "Please check the format of provided alleles. An empty allele is provided"
            raise NeofoxDataValidationException(message)
        gene, group, protein = match.group(1), match.group(2), match.group(3)
    else:
        # without a separator the boundary between group and protein is ambiguous
        gene, group, protein = match.group(1), match.group(2), match.group(3)
        default_split = MhcAllele(gene=gene, group=group, protein=protein)
        if not self.mhc_database.exists(default_split):
            # the default split is unknown, tries the alternative by moving the last
            # character of the group to the front of the protein
            group, protein = group[0:-1], group[-1:] + protein

    parsed_allele = MhcAllele(gene=gene, group=group, protein=protein)

    # an allele unknown to the HLA database is reported to the user but not rejected
    if not self.mhc_database.exists(parsed_allele):
        logger.warning("Allele {} does not exist in the HLA database".format(allele))

    normalized_name = "HLA-{gene}*{serotype}:{protein}".format(gene=gene, serotype=group, protein=protein)

    # the full name keeps whatever further fields were provided, in a normalized representation
    extra_fields = []
    for group_index in (4, 5):
        # groups 4 and 5 hold the optional colon separated fields
        field = match.group(group_index)
        if field:
            extra_fields.append(":{}".format(field))
    expression_change = match.group(6)
    if expression_change:
        extra_fields.append(expression_change)

    parsed_allele.name = normalized_name
    parsed_allele.full_name = normalized_name + "".join(extra_fields)
    return parsed_allele
def test_exists(self):
    """
    Checks FakeHlaDatabase.exists() against a table of alleles, each paired with the
    expected outcome.
    """
    hla_database = FakeHlaDatabase()
    expectations = [
        # the same digits split differently between group and protein
        (True, dict(gene="DPB1", group="104", protein="01")),
        (False, dict(gene="DPB1", group="10", protein="401")),
        (True, dict(gene="B", group="15", protein="228")),
        (False, dict(gene="B", group="152", protein="28")),
        # badly formed HLA alleles
        (False, dict(gene="B", group="15", protein=None)),
        (False, dict(gene="B", group=None, protein="228")),
        (False, dict(gene=None, group="15", protein="228")),
        (False, dict(gene="Z", group="15", protein="228")),
    ]
    for expected, allele_fields in expectations:
        check = self.assertTrue if expected else self.assertFalse
        check(hla_database.exists(MhcAllele(**allele_fields)))
def test_mhc_ii_allele_parsing(self):
    """
    Checks that different spellings of MHC II alleles are parsed into the expected
    normalized representation.
    """
    parsing_cases = [
        # the star is added
        ("HLA-DPB101:01", "HLA-DPB1*01:01"),
        # the HLA- prefix is added
        ("DPB1*01:01", "HLA-DPB1*01:01"),
        # same as the first case for the alpha gene
        ("HLA-DPA101:01", "HLA-DPA1*01:01"),
        # the star is not removed
        ("HLA-DPA1*01:01", "HLA-DPA1*01:01"),
        # further information is not part of the expected representation
        ("HLA-DPA101:01:02:03N", "HLA-DPA1*01:01"),
        ("HLA-DPA101:01:02N", "HLA-DPA1*01:01"),
        ("HLA-DPB101:01", "HLA-DPB1*01:01"),
    ]
    for raw_allele, expected in parsing_cases:
        self._assert_allele_parsing(
            expected=expected,
            allele=self.mhc_parser.parse_mhc_allele(raw_allele))

    # an allele built directly instead of parsed
    self._assert_allele_parsing(
        expected="HLA-DPA1*01:01",
        allele=MhcAllele(gene="DPA1", group="01", protein="01", name="HLA-DPA1*01:01"))
def test_mhc_i_allele_parsing(self):
    """
    Checks that different spellings of the MHC I allele A*01:01 are all parsed into the
    expected representation "HLA-A*01:01".
    """
    expected = "HLA-A*01:01"
    raw_alleles = [
        # the star is added
        "HLA-A01:01",
        # the HLA- prefix is added
        "A01:01",
        # repeated on purpose, as in the first case
        "HLA-A01:01",
        # an originally good representation is not modified
        "HLA-A*01:01",
        # further information is not part of the expected representation
        "HLA-A01:01:02:03N",
        "HLA-A01:01:02N",
        "HLA-A01:01N",
    ]
    for raw_allele in raw_alleles:
        self._assert_allele_parsing(
            expected=expected,
            allele=self.mhc_parser.parse_mhc_allele(raw_allele))

    # an allele built directly instead of parsed
    self._assert_allele_parsing(
        expected="HLA-A*01:01",
        allele=MhcAllele(gene="A", group="01", protein="01", name="HLA-A*01:01"))
def _get_empty_epitope():
    """
    Builds a placeholder PredictedEpitope: peptide, position, affinity score and rank are
    all None and the HLA is an MhcAllele created with name None.

    :return: the empty epitope
    """
    unnamed_allele = MhcAllele(name=None)
    return PredictedEpitope(peptide=None, pos=None, hla=unnamed_allele, affinity_score=None, rank=None)
def parse_mhc2_isoform(self, isoform: str) -> Mhc2Isoform:
    """
    Parses the name of an MHC II isoform into an Mhc2Isoform.

    A leading "HLA-" is removed. Names containing "DQA" or "DPA" are split on "-": the first
    part is parsed as the alpha chain and the second as the beta chain. Any other name is
    parsed as a whole as the beta chain, with an empty alpha chain.

    :param isoform: the isoform name to parse
    :return: the isoform with its name built by get_mhc2_isoform_name() from both chains
    :raises IndexError: when a "DQA"/"DPA" name holds no "-" separated beta chain
    """
    # TODO: this method currently fails for netmhc2pan alleles which are like 'HLA-DQA10509-DQB10630'
    hla_prefix = "HLA-"
    # only the literal prefix is removed; str.strip("HLA-") treats its argument as a set of
    # characters and strips both ends, hence it also dropped any trailing 'H', 'L', 'A'
    # or '-' from the allele name
    if isoform.startswith(hla_prefix):
        isoform = isoform[len(hla_prefix):]

    if "DQA" in isoform or "DPA" in isoform:
        # alpha and beta chain names are separated by a dash
        chain_names = isoform.split("-")
        alpha_chain = self.mhc_parser.parse_mhc_allele(chain_names[0])
        beta_chain = self.mhc_parser.parse_mhc_allele(chain_names[1])
    else:
        # only the beta chain is provided, the alpha chain is left empty
        alpha_chain = MhcAllele()
        beta_chain = self.mhc_parser.parse_mhc_allele(isoform)

    # builds the final isoform representation from both chains
    name = get_mhc2_isoform_name(alpha_chain, beta_chain)
    return Mhc2Isoform(name=name, alpha_chain=alpha_chain, beta_chain=beta_chain)
def test_exists(self):
    """
    Checks FakeH2Database.exists() against a table of alleles, each paired with the
    expected outcome.
    """
    h2_database = FakeH2Database()
    expectations = [
        (True, dict(gene="H2K", protein="p")),
        (False, dict(gene="H2K", protein="x")),
        (True, dict(gene="H2L", protein="f")),
        # badly formed H2 alleles
        (False, dict(gene="H2K", group="p", protein=None)),
        (False, dict(gene="H2K", group=None, protein=None)),
        (False, dict(gene=None, protein="p")),
        (False, dict(gene="Z", group="15", protein="228")),
    ]
    for expected, allele_fields in expectations:
        check = self.assertTrue if expected else self.assertFalse
        check(h2_database.exists(MhcAllele(**allele_fields)))
def parse_mhc2_isoform(self, isoform: str) -> Mhc2Isoform:
    """
    Parses the name of an HLA MHC II isoform into an Mhc2Isoform.

    A name matching HLA_MOLECULE_PATTERN provides the alpha chain (group 1) and the beta
    chain (group 2). Otherwise the name must match HLA_DR_MOLECULE_PATTERN, which provides
    only the beta chain (group 1); the alpha chain is then left empty.

    :param isoform: the isoform name to parse
    :return: the isoform with its name built by get_mhc2_isoform_name() from both chains
    :raises AssertionError: when the name matches neither pattern
    """
    # TODO: this method currently fails for netmhc2pan alleles which are like 'HLA-DQA10509-DQB10630'
    molecule_match = HLA_MOLECULE_PATTERN.match(isoform)
    if molecule_match:
        alpha_chain = self.parse_mhc_allele(molecule_match.group(1))
        beta_chain = self.parse_mhc_allele(molecule_match.group(2))
    else:
        dr_match = HLA_DR_MOLECULE_PATTERN.match(isoform)
        if dr_match is None:
            # raised explicitly instead of through an assert statement so that the check is
            # not stripped when Python runs with -O; the exception type stays AssertionError
            # for the sake of existing callers
            raise AssertionError("Molecule does not match HLA isoform pattern {}".format(isoform))
        alpha_chain = MhcAllele()
        beta_chain = self.parse_mhc_allele(dr_match.group(1))

    # builds the final isoform representation from both chains
    name = get_mhc2_isoform_name(alpha_chain, beta_chain)
    return Mhc2Isoform(name=name, alpha_chain=alpha_chain, beta_chain=beta_chain)
def validate_mhc_allele_representation(allele: MhcAllele, organism: str):
    """
    Validates the representation of an MHC allele for the given organism.

    The allele name must match the pattern of the organism, the gene must be one of the
    MHC I or MHC II genes of the organism and the protein must be a string. For
    ORGANISM_HOMO_SAPIENS the group must be a string as well, for ORGANISM_MUS_MUSCULUS
    the group must be None or empty.

    :param allele: the allele to validate
    :param organism: the organism the allele belongs to
    :raises NeofoxDataValidationException: when the organism is not supported or when any of
        the checks fails; in the latter case the allele is logged as JSON first
    """
    allele_pattern = ALLELE_PATTERN_BY_ORGANISM.get(organism)
    mhc1_genes = MHC_I_GENES_BY_ORGANISM.get(organism)
    mhc2_genes = MHC_II_GENES_BY_ORGANISM.get(organism)
    if allele_pattern is None or mhc1_genes is None or mhc2_genes is None:
        # an organism absent from the lookup tables used to fail below with a TypeError or
        # an AttributeError before the intended "not supported" error could be reached
        raise NeofoxDataValidationException("Not supported organism {}".format(organism))

    try:
        valid_genes = [g.name for g in mhc1_genes + mhc2_genes]

        # the checks raise AssertionError explicitly instead of through assert statements,
        # otherwise the whole validation would be skipped when Python runs with -O
        if allele_pattern.match(allele.name) is None:
            raise AssertionError(
                "Allele name does not match expected pattern: {}".format(allele.name))
        if allele.gene not in valid_genes:
            raise AssertionError(
                "MHC gene {} not from classic MHC for organism {}".format(allele.gene, organism))
        if not isinstance(allele.protein, str):
            raise AssertionError(
                "The field protein in MHC allele model has the value {} and wrong type but must be a character "
                "instead of {}".format(allele.protein, type(allele.protein)))

        if organism == ORGANISM_HOMO_SAPIENS:
            if not isinstance(allele.group, str):
                raise AssertionError(
                    "The field group in MHC allele model has the value {} and wrong type but must be a character "
                    "instead of {}".format(allele.group, type(allele.group)))
        elif organism == ORGANISM_MUS_MUSCULUS:
            if not (allele.group is None or allele.group == ""):
                raise AssertionError("Provided group for H2 allele")
        else:
            raise NeofoxDataValidationException("Not supported organism {}".format(organism))
    except AssertionError as e:
        # logs the offending allele before reporting the failed check as a validation error
        logger.error(allele.to_json(indent=3))
        raise NeofoxDataValidationException(e) from e