def validate_patient(patient: Patient, organism=ORGANISM_HOMO_SAPIENS):
    """Validate a patient model.

    The checks run in this order:

    1. ``ModelValidator.validate`` is called on the patient (format
       consistency).
    2. The identifier must not be ``None``, empty or made only of white
       space, and must not start or end with white space.
    3. Every entry of ``patient.mhc1`` and ``patient.mhc2`` (when present)
       is passed to ``ModelValidator._validate_mhc1`` /
       ``ModelValidator._validate_mhc2`` for the given ``organism``.

    Args:
        patient: the patient to validate.
        organism: organism forwarded to the MHC checks.

    Raises:
        NeofoxDataValidationException: if a check fails. When the failure is
            signalled through an ``AssertionError`` the patient is logged as
            JSON at error level before the exception is raised.
    """
    # checks format consistency first
    ModelValidator.validate(patient)

    try:
        # The identifier checks raise explicitly rather than using ``assert``
        # so that they are not silently skipped when Python runs with -O.
        identifier = patient.identifier

        # checks that patient id is not empty considering white spaces
        stripped_identifier = identifier.strip() if identifier else identifier
        if stripped_identifier is None or stripped_identifier == "":
            raise AssertionError("A patient identifier is missing")
        if identifier != stripped_identifier:
            raise AssertionError(
                "Patient identifier contains white spaces at start or end: {}".format(identifier)
            )

        # checks MHC I
        if patient.mhc1:
            for m in patient.mhc1:
                ModelValidator._validate_mhc1(m, organism=organism)

        # checks MHC II
        if patient.mhc2:
            for m in patient.mhc2:
                ModelValidator._validate_mhc2(m, organism=organism)
    except AssertionError as e:
        # the helpers called above may also report failures as AssertionError
        logger.error(patient.to_json(indent=3))
        raise NeofoxDataValidationException(e) from e
def build_mhc1_alleles(alleles: List[str], mhc_database: MhcDatabase) -> List[Mhc1]:
    """Build the MHC I models for a list of allele names.

    Empty strings in ``alleles`` are ignored. The remaining names are parsed
    with the parser obtained from ``mhc_database`` and each parsed allele is
    checked with ``ModelValidator.validate_mhc1_gene``. One ``Mhc1`` is then
    built per gene in ``mhc_database.mhc1_genes``.

    Args:
        alleles: allele names as provided by the user.
        mhc_database: database providing the parser and the MHC I genes.

    Returns:
        The ``Mhc1`` objects whose zygosity is not ``Zygosity.LOSS``.

    Raises:
        NeofoxDataValidationException: if any step raises ``AssertionError``.
    """
    mhc1_models = []
    try:
        parse = MhcParser.get_mhc_parser(mhc_database).parse_mhc_allele
        # NOTE: during the pandas parsing of empty columns empty lists become a list with one empty string
        parsed = [parse(raw_allele) for raw_allele in alleles if raw_allele != ""]
        for parsed_allele in parsed:
            ModelValidator.validate_mhc1_gene(parsed_allele)

        # do we need to validate genes anymore? add test creating MhcAllele with bad gene and see what happens
        for gene in mhc_database.mhc1_genes:
            matching = [candidate for candidate in parsed if candidate.gene == gene.name]
            zygosity = MhcFactory._get_zygosity_from_alleles(matching)
            if zygosity == Zygosity.HOMOZYGOUS:
                # we don't want repeated instances of the same allele
                matching = [matching[0]]
            mhc1_models.append(Mhc1(name=gene, zygosity=zygosity, alleles=matching))
    except AssertionError as e:
        raise NeofoxDataValidationException(e)

    return [mhc1 for mhc1 in mhc1_models if mhc1.zygosity != Zygosity.LOSS]
def parse_mhc_allele(self, allele: str, pattern=H2_ALLELE_PATTERN) -> MhcAllele:
    """Parse an H2 allele name into a normalized ``MhcAllele``.

    A name matching ``H2_NETMHCPAN_ALLELE_PATTERN`` is first rewritten as
    ``"H2" + gene + protein``. The result must match ``H2_ALLELE_PATTERN``.
    An allele that is not found in ``self.mhc_database`` only triggers a
    warning; it is still returned.

    Args:
        allele: the allele name to parse.
        pattern: NOTE(review): accepted but not read by this method, the
            module-level ``H2_ALLELE_PATTERN`` is always used - confirm
            whether callers expect it to be honoured.

    Returns:
        An ``MhcAllele`` whose ``name`` and ``full_name`` are both
        ``gene + protein``.

    Raises:
        NeofoxDataValidationException: if the allele does not match
            ``H2_ALLELE_PATTERN``.
    """
    netmhcpan_match = H2_NETMHCPAN_ALLELE_PATTERN.match(allele)
    if netmhcpan_match:
        # this ensures that netmhcpan output is normalized
        allele = "H2{gene}{protein}".format(
            gene=netmhcpan_match.group(1), protein=netmhcpan_match.group(2)
        )

    h2_match = H2_ALLELE_PATTERN.match(allele)
    if h2_match is None:
        if allele != "":
            message = "Allele does not match H2 allele pattern {}".format(allele)
        else:
            message = "Please check the format of provided alleles. An empty allele is provided"
        raise NeofoxDataValidationException(message)

    gene, protein = h2_match.group(1, 2)

    # controls for existence in the H2 database and warns the user
    mhc_allele = MhcAllele(gene=gene, protein=protein)
    if not self.mhc_database.exists(mhc_allele):
        logger.warning(
            "Allele {} does not exist in the H2 database".format(allele))

    # builds a normalized representation of the allele; full name is the same
    # as name in this case as the pattern does not allow variability
    normalized_name = "{gene}{protein}".format(gene=gene, protein=protein)
    mhc_allele.name = normalized_name
    mhc_allele.full_name = normalized_name
    return mhc_allele
def _validate_input_data(self):
    """Check that every neoantigen refers to a known patient.

    Reads ``self.neoantigens`` (their ``patient_identifier``) and the values
    of ``self.patients`` (their ``identifier``).

    Raises:
        NeofoxDataValidationException: if a neoantigen has an empty or
            ``None`` patient identifier, or refers to an identifier that no
            patient has.
    """
    referenced_ids = {neoantigen.patient_identifier for neoantigen in self.neoantigens}
    known_ids = {patient.identifier for patient in self.patients.values()}

    # checks that no neoantigen is referring to an empty patient
    if any(empty_value in referenced_ids for empty_value in ("", None)):
        raise NeofoxDataValidationException(
            "There are neoantigens missing a reference to a patient")

    # checks that there is no neoantigen referring to a non existing patient
    unknown_ids = referenced_ids - known_ids
    if unknown_ids:
        raise NeofoxDataValidationException(
            "There are neoantigens referring to missing patients: {}".format(unknown_ids))
def parse_mhc_allele(self, allele: str) -> MhcAllele:
    """Parse an HLA allele name into a normalized ``MhcAllele``.

    The name is first matched against ``HLA_ALLELE_PATTERN_WITHOUT_SEPARATOR``.
    In that case the group/protein split is ambiguous: if the allele built
    from the matched groups is not in ``self.mhc_database``, the last
    character of the group is moved to the front of the protein. Otherwise
    the name is matched against ``HLA_ALLELE_PATTERN``.

    An allele that is not found in the database only triggers a warning; it
    is still returned.

    Args:
        allele: the allele name to parse.

    Returns:
        An ``MhcAllele`` with ``name`` set to ``HLA-<gene>*<group>:<protein>``
        and ``full_name`` set to the name followed by whichever of match
        groups 4 and 5 (each prefixed with ``:``) and 6 are non-empty.

    Raises:
        NeofoxDataValidationException: if neither pattern matches.
    """
    match = HLA_ALLELE_PATTERN_WITHOUT_SEPARATOR.match(allele)
    without_separator = match is not None

    if not without_separator:
        # infers gene, group and protein from the name
        match = HLA_ALLELE_PATTERN.match(allele)
        if match is None:
            if allele != "":
                message = "Allele does not match HLA allele pattern {}".format(allele)
            else:
                message = "Please check the format of provided alleles. An empty allele is provided"
            raise NeofoxDataValidationException(message)

    gene, group, protein = match.group(1, 2, 3)

    if without_separator:
        # allele without separator, controls for ambiguities
        default_allele = MhcAllele(gene=gene, group=group, protein=protein)
        if not self.mhc_database.exists(default_allele):
            # if default allele does not exist, tries alternative
            protein = group[-1:] + protein
            group = group[0:-1]

    # controls for existence in the HLA database and warns the user
    mhc_allele = MhcAllele(gene=gene, group=group, protein=protein)
    if not self.mhc_database.exists(mhc_allele):
        logger.warning(
            "Allele {} does not exist in the HLA database".format(allele))

    # builds a normalized representation of the allele
    name = "HLA-{gene}*{serotype}:{protein}".format(gene=gene, serotype=group, protein=protein)

    # ensures that full name stores the complete allele as provided but
    # normalizes its representation
    full_name = name
    for group_index in (4, 5):
        # six and eight digits identifiers, in this order
        digits = match.group(group_index)
        if digits:
            full_name = full_name + ":{}".format(digits)
    expression_change = match.group(6)
    if expression_change:
        full_name = full_name + expression_change

    mhc_allele.name = name
    mhc_allele.full_name = full_name
    return mhc_allele
def validate_mhc2_isoform_representation(isoform: Mhc2Isoform, organism: str):
    """Validate the name and chains of an MHC II isoform.

    For ``ORGANISM_HOMO_SAPIENS`` the isoform name must match
    ``HLA_MOLECULE_PATTERN`` or ``HLA_DR_MOLECULE_PATTERN``. The beta chain is
    always validated; the alpha chain only when the name matches
    ``HLA_MOLECULE_PATTERN``.

    For ``ORGANISM_MUS_MUSCULUS`` the isoform name must match
    ``H2_MOLECULE_PATTERN`` and only the alpha chain is validated.

    Args:
        isoform: the MHC II isoform to validate.
        organism: the organism the isoform belongs to.

    Raises:
        NeofoxDataValidationException: if the name does not match the expected
            pattern, a chain fails validation, or the organism is not one of
            the two above. When the failure is signalled through an
            ``AssertionError`` the isoform is logged as JSON at error level
            before the exception is raised.
    """
    try:
        if organism == ORGANISM_HOMO_SAPIENS:
            match_molecule = HLA_MOLECULE_PATTERN.match(isoform.name)
            match_single_allele = HLA_DR_MOLECULE_PATTERN.match(isoform.name)
            # Raised explicitly rather than with ``assert`` so that the check
            # is not silently skipped when Python runs with -O.
            if not (match_molecule or match_single_allele):
                raise AssertionError("MHC II isoform not following molecule pattern")
            ModelValidator.validate_mhc_allele_representation(isoform.beta_chain, organism)
            if match_molecule:
                # the DR molecule does not have alpha chain
                ModelValidator.validate_mhc_allele_representation(isoform.alpha_chain, organism)
        elif organism == ORGANISM_MUS_MUSCULUS:
            if not H2_MOLECULE_PATTERN.match(isoform.name):
                raise NeofoxDataValidationException(
                    "Transformed MHC II molecule name does not match H2 isoform pattern {}".format(isoform.name))
            # NOTE: only the alpha chain is validated for H2 isoforms; the
            # beta chain check was already disabled in the previous version
            ModelValidator.validate_mhc_allele_representation(isoform.alpha_chain, organism)
        else:
            raise NeofoxDataValidationException("Not supported organism {}".format(organism))
    except AssertionError as e:
        logger.error(isoform.to_json(indent=3))
        raise NeofoxDataValidationException(e) from e
def validate_mhc_allele_representation(allele: MhcAllele, organism: str):
    """Validate the fields of an MHC allele for the given organism.

    Checks, in order, that:

    * ``allele.name`` matches the pattern registered for the organism in
      ``ALLELE_PATTERN_BY_ORGANISM``;
    * ``allele.gene`` is one of the gene names registered for the organism in
      ``MHC_I_GENES_BY_ORGANISM`` and ``MHC_II_GENES_BY_ORGANISM``;
    * ``allele.protein`` is a ``str``;
    * ``allele.group`` is a ``str`` for ``ORGANISM_HOMO_SAPIENS``, or is
      ``None``/empty for ``ORGANISM_MUS_MUSCULUS``.

    Args:
        allele: the allele to validate.
        organism: the organism the allele belongs to.

    Raises:
        NeofoxDataValidationException: if the organism is not supported or any
            check fails. For failed checks the allele is logged as JSON at
            error level before the exception is raised.
    """
    try:
        allele_pattern = ALLELE_PATTERN_BY_ORGANISM.get(organism)
        mhc1_genes = MHC_I_GENES_BY_ORGANISM.get(organism)
        mhc2_genes = MHC_II_GENES_BY_ORGANISM.get(organism)
        # An organism missing from the lookup tables used to fail with a
        # TypeError on ``None + None`` before reaching the "not supported"
        # branch below; report it with the intended exception instead.
        if allele_pattern is None or mhc1_genes is None or mhc2_genes is None:
            raise NeofoxDataValidationException("Not supported organism {}".format(organism))

        valid_genes = [g.name for g in mhc1_genes + mhc2_genes]

        # The checks raise explicitly rather than using ``assert`` so that
        # they are not silently skipped when Python runs with -O.
        if allele_pattern.match(allele.name) is None:
            raise AssertionError(
                "Allele name does not match expected pattern: {}".format(allele.name))
        if allele.gene not in valid_genes:
            raise AssertionError(
                "MHC gene {} not from classic MHC for organism {}".format(allele.gene, organism))
        if not isinstance(allele.protein, str):
            raise AssertionError(
                "The field protein in MHC allele model has the value {} and wrong type but must be a character "
                "instead of {}".format(allele.protein, type(allele.protein)))

        if organism == ORGANISM_HOMO_SAPIENS:
            if not isinstance(allele.group, str):
                raise AssertionError(
                    "The field group in MHC allele model has the value {} and wrong type but must be a character "
                    "instead of {}".format(allele.group, type(allele.group)))
        elif organism == ORGANISM_MUS_MUSCULUS:
            if not (allele.group is None or allele.group == ""):
                raise AssertionError("Provided group for H2 allele")
        else:
            raise NeofoxDataValidationException("Not supported organism {}".format(organism))
    except AssertionError as e:
        logger.error(allele.to_json(indent=3))
        raise NeofoxDataValidationException(e) from e
def build_mhc2_alleles(alleles: List[str], mhc_database: MhcDatabase) -> List[Mhc2]:
    """Build the MHC II models for a list of allele names.

    Empty strings in ``alleles`` are ignored. The remaining names are parsed
    with the parser obtained from ``mhc_database`` and each parsed allele is
    checked with ``ModelValidator.validate_mhc2_gene``. For every molecule in
    ``mhc_database.mhc2_molecules`` one ``Mhc2Gene`` is built per gene listed
    for it in ``GENES_BY_MOLECULE``, and the isoforms are obtained from
    ``MhcFactory._get_mhc2_isoforms``.

    Args:
        alleles: allele names as provided by the user.
        mhc_database: database providing the parser and the MHC II molecules.

    Returns:
        The ``Mhc2`` objects in which no gene has zygosity ``Zygosity.LOSS``.

    Raises:
        NeofoxDataValidationException: if any step raises ``AssertionError``.
    """
    mhc2_models = []
    try:
        parse = MhcParser.get_mhc_parser(mhc_database).parse_mhc_allele
        # NOTE: during the pandas parsing of empty columns empty lists become a list with one empty string
        parsed = [parse(raw_allele) for raw_allele in alleles if raw_allele != ""]
        for parsed_allele in parsed:
            ModelValidator.validate_mhc2_gene(parsed_allele)

        # do we need to validate genes anymore? add test creating MhcAllele with bad gene and see what happens
        for molecule_name in mhc_database.mhc2_molecules:
            molecule_genes = GENES_BY_MOLECULE.get(molecule_name)
            molecule_alleles = [
                candidate
                for candidate in parsed
                if candidate.gene in [g.name for g in molecule_genes]
            ]

            genes = []
            for gene in molecule_genes:
                matching = [
                    candidate for candidate in molecule_alleles if candidate.gene == gene.name
                ]
                zygosity = MhcFactory._get_zygosity_from_alleles(matching)
                if zygosity == Zygosity.HOMOZYGOUS:
                    # we don't want repeated instances of the same allele
                    matching = [matching[0]]
                genes.append(Mhc2Gene(name=gene, zygosity=zygosity, alleles=matching))

            isoforms = MhcFactory._get_mhc2_isoforms(molecule_name, genes)
            mhc2_models.append(Mhc2(name=molecule_name, genes=genes, isoforms=isoforms))
    except AssertionError as e:
        raise NeofoxDataValidationException(e)

    return [
        mhc2
        for mhc2 in mhc2_models
        if all(gene.zygosity != Zygosity.LOSS for gene in mhc2.genes)
    ]
def validate_neoantigen(neoantigen: Neoantigen):
    """Validate a neoantigen model.

    The checks run in this order:

    1. ``ModelValidator.validate`` is called on the neoantigen (format
       consistency).
    2. ``neoantigen.patient_identifier`` must be neither ``None`` nor empty.
    3. ``ModelValidator._validate_mutation`` is called on the mutation.
    4. ``ModelValidator._validate_expression_values`` is called on the
       neoantigen.

    Args:
        neoantigen: the neoantigen to validate.

    Raises:
        NeofoxDataValidationException: if a check fails. When the failure is
            signalled through an ``AssertionError`` the neoantigen is logged
            as JSON at error level before the exception is raised.
    """
    # checks format consistency first
    ModelValidator.validate(neoantigen)

    try:
        # Raised explicitly rather than with ``assert`` so that the check is
        # not silently skipped when Python runs with -O.
        if neoantigen.patient_identifier is None or len(neoantigen.patient_identifier) == 0:
            raise AssertionError(
                "A patient identifier is missing. Please provide patientIdentifier in the input file")

        # checks mutation
        ModelValidator._validate_mutation(neoantigen.mutation)

        # check the expression values
        ModelValidator._validate_expression_values(neoantigen)
    except AssertionError as e:
        # the helpers called above may also report failures as AssertionError
        logger.error(neoantigen.to_json(indent=3))
        raise NeofoxDataValidationException(e) from e
def validate(model: betterproto.Message):
    """Check that ``model`` can be serialized by calling ``model.__bytes__()``.

    Args:
        model: the message to validate.

    Raises:
        NeofoxDataValidationException: wrapping any ``Exception`` raised
            while serializing the model.
    """
    # TODO: make this method capture appropriately validation issues when dealing with int and float
    try:
        model.__bytes__()
    except Exception as serialization_error:
        validation_error = NeofoxDataValidationException(serialization_error)
        raise validation_error