def read_hla_file(path, permissive_parsing=True):
    """
    Read in HLA alleles and normalize them, returning a list of HLA
    allele names.

    Entries may be separated by newlines and/or commas. When
    `permissive_parsing` is True, surrounding whitespace is stripped and
    any trailing columns (e.g. scores) after the allele name are dropped.
    """
    assert path.endswith(".hla"), \
        "Expected HLA file %s to end with suffix .hla" % path
    logging.info("Reading HLA file %s", path)
    with open(path, 'r') as f:
        contents = f.read()
    alleles = []
    for line in contents.split("\n"):
        for raw_allele in line.split(","):
            if permissive_parsing:
                # drop surrounding whitespace, then keep only the first
                # token so extra columns with scores are ignored
                raw_allele = raw_allele.strip()
                for separator in (" ", "\t", "'"):
                    raw_allele = raw_allele.split(separator)[0]
            if raw_allele:
                alleles.append(normalize_hla_allele_name(raw_allele))
    return alleles
def read_hla_file(path, permissive_parsing=True):
    """
    Read an .hla file and return the list of normalized HLA allele names
    it contains (entries separated by newlines and/or commas).
    """
    assert path.endswith(".hla"), \
        "Expected HLA file %s to end with suffix .hla" % path
    logging.info("Reading HLA file %s", path)
    with open(path, 'r') as f:
        contents = f.read()
    alleles = []
    for line in contents.split("\n"):
        for entry in line.split(","):
            if permissive_parsing:
                # strip surrounding whitespace, then cut off anything after
                # the first space/tab/quote so that extra columns with
                # scores are ignored
                entry = entry.strip()
                entry = entry.split(" ")[0]
                entry = entry.split("\t")[0]
                entry = entry.split("'")[0]
            if len(entry) > 0:
                alleles.append(normalize_hla_allele_name(entry))
    return alleles
def __init__( self, hla_alleles, netmhc_command = "netMHCcons"): self.netmhc_command = netmhc_command try: subprocess.check_output([self.netmhc_command], stderr=subprocess.STDOUT) except: assert False, "Failed to run %s" % self.netmhc_command # normalize alleles and keep only unique names normalized_alleles = { normalize_hla_allele_name(allele.strip().upper()).replace("*", "") for allele in hla_alleles } self.alleles = [] # try running "netMHCcons -a" with each allele name # and check if it gives you back a "wrong format" error for allele in normalized_alleles: try: subprocess.check_output( [self.netmhc_command, '-a', allele], stderr=subprocess.STDOUT) except subprocess.CalledProcessError, e: if "allele" in e.output and "wrong format" in e.output: logging.warning( "Allele %s not recognized by NetMHCcons", allele) continue except:
def __init__(self, hla_alleles, netmhc_command="netMHCpan"):
    """
    Wrapper around the netMHCpan commandline tool.

    Parameters
    ----------
    hla_alleles : iterable of str
        HLA allele names; each is normalized, uppercased, deduplicated,
        and (when the tool's allele list is available) filtered down to
        alleles the local netMHCpan installation recognizes.
    netmhc_command : str
        Name of (or path to) the netMHCpan executable.
    """
    self.netmhc_command = netmhc_command
    # sanity check: make sure the predictor binary can be executed at all
    # NOTE(review): bare except + assert preserved from original so callers
    # still see an AssertionError on failure
    try:
        run_command([self.netmhc_command])
    except:
        assert False, "Failed to run %s" % self.netmhc_command
    # ask the tool which alleles it supports; on failure, fall back to
    # accepting every allele (valid_alleles = None)
    try:
        valid_alleles_str = check_output([self.netmhc_command, "-listMHC"])
        # BUG FIX: was "self.self.netmhc_command", which would raise
        # AttributeError while formatting this assertion message
        assert len(valid_alleles_str) > 0, \
            "%s returned empty allele list" % self.netmhc_command
        valid_alleles = set()
        for line in valid_alleles_str.split("\n"):
            # '#'-prefixed lines are comments in the -listMHC output
            if not line.startswith("#"):
                valid_alleles.add(line)
    except:
        logging.warning("Failed to run %s -listMHC", self.netmhc_command)
        valid_alleles = None
    self.alleles = []
    for allele in hla_alleles:
        allele = normalize_hla_allele_name(allele.strip().upper())
        # for some reason netMHCpan drops the "*" in names,
        # e.g. "HLA-A*03:01" becomes "HLA-A03:01"
        if valid_alleles and allele.replace("*", "") not in valid_alleles:
            print("Skipping %s (not available in NetMHCpan)" % allele)
        else:
            self.alleles.append(allele)
    # don't run the MHC predictor twice for homozygous alleles,
    # only run it for unique alleles
    self.alleles = set(self.alleles)
def run_pipeline(patient_id, score_epitopes):
    """Run the pipeline for this patient, and save the output to the DB as a Run.

    Parameters
    ----------
    patient_id : identifier used to look up the patient's HLA types and VCF
    score_epitopes : callable taking (transcripts_df, alleles) and returning
        a dataframe of scored epitopes

    Side effects: adds a new Run row to the DB session.
    NOTE(review): no db.session.commit() here — presumably the caller
    commits the session; confirm.
    """
    # fetch this patient's typed HLA alleles from the DB
    hla_types = HLAType.query.with_entities(
        HLAType.allele, HLAType.mhc_class).filter_by(
            patient_id=patient_id).all()
    # fixed vaccine peptide length used for both min and max below
    peptide_length = 31
    alleles = [normalize_hla_allele_name(
        allele) for allele, mhc_class in hla_types]
    vcf_df = get_vcf_df(patient_id)
    # expand variants into candidate peptides of exactly peptide_length
    transcripts_df, vcf_df, variant_report = expand_transcripts(
        vcf_df,
        patient_id,
        min_peptide_length = peptide_length,
        max_peptide_length = peptide_length)
    # MHC binding scores, then filter/annotate by predicted immunogenicity
    scored_epitopes = score_epitopes(transcripts_df, alleles)
    imm = ImmunogenicityPredictor(alleles=alleles)
    scored_epitopes = imm.predict(scored_epitopes)
    # TODO(tavi) Make this expansion more robust. It breaks the IEDB predictor,
    # for example.
    # attach the originating variant coordinates to each scored epitope
    short_transcripts_df = transcripts_df[
        ['chr', 'pos', 'ref', 'alt', 'TranscriptId']]
    scored_epitopes = merge(scored_epitopes, short_transcripts_df,
        on='TranscriptId', how='left')
    peptides = group_epitopes_dataframe(
        scored_epitopes, use_transcript_name = True)
    # persist the serialized peptide groups as this patient's Run
    run = Run(patient_id=patient_id, output=dumps(peptides))
    db.session.add(run)
def __init__(self, hla_alleles, netmhc_command="netMHCpan"):
    """
    Wrapper around the netMHCpan commandline tool.

    Parameters
    ----------
    hla_alleles : iterable of str
        HLA allele names; each is normalized, uppercased, deduplicated,
        and (when the tool's allele list is available) filtered down to
        alleles the local netMHCpan installation recognizes.
    netmhc_command : str
        Name of (or path to) the netMHCpan executable.
    """
    self.netmhc_command = netmhc_command
    # sanity check: make sure the predictor binary can be executed at all
    # NOTE(review): bare except + assert preserved from original so callers
    # still see an AssertionError on failure
    try:
        run_command([self.netmhc_command])
    except:
        assert False, "Failed to run %s" % self.netmhc_command
    # ask the tool which alleles it supports; on failure, fall back to
    # accepting every allele (valid_alleles = None)
    try:
        valid_alleles_str = check_output([self.netmhc_command, "-listMHC"])
        # BUG FIX: was "self.self.netmhc_command", which would raise
        # AttributeError while formatting this assertion message
        assert len(valid_alleles_str) > 0, \
            "%s returned empty allele list" % self.netmhc_command
        valid_alleles = set()
        for line in valid_alleles_str.split("\n"):
            # '#'-prefixed lines are comments in the -listMHC output
            if not line.startswith("#"):
                valid_alleles.add(line)
    except:
        logging.warning("Failed to run %s -listMHC", self.netmhc_command)
        valid_alleles = None
    self.alleles = []
    for allele in hla_alleles:
        allele = normalize_hla_allele_name(allele.strip().upper())
        # for some reason netMHCpan drops the "*" in names,
        # e.g. "HLA-A*03:01" becomes "HLA-A03:01"
        if valid_alleles and allele.replace("*", "") not in valid_alleles:
            print("Skipping %s (not available in NetMHCpan)" % allele)
        else:
            self.alleles.append(allele)
    # don't run the MHC predictor twice for homozygous alleles,
    # only run it for unique alleles
    self.alleles = set(self.alleles)
def create_binding_result_row(
        mutation_entry,
        allele,
        pos,
        epitope,
        log_ic50,
        ic50,
        rank,
        mutation_window_size=None):
    """
    Build one result-row dict for an (epitope, allele) binding prediction,
    combining the originating mutation's fields with this epitope's scores.

    Returns the dict, or None if the IC50 or percentile rank is invalid
    even after attempting to recover IC50 from its log value.

    Parameters
    ----------
    mutation_entry : record carrying the source variant fields (chr, pos,
        ref, alt, SourceSequence, MutationStart/End, gene/transcript info)
    allele : HLA allele name (normalized before storing)
    pos : epitope start offset within the (possibly clipped) sequence
    epitope : peptide string
    log_ic50, ic50 : predicted binding affinity (log-scale and nM)
    rank : percentile rank of the prediction (expected 0-100)
    mutation_window_size : if set, the sequence was clipped around the
        mutation, so epitope positions are shifted back to the original
        coordinates
    """
    # if we have a bad IC50 score we might still get a salvageable
    # log of the score. Strangely, this is necessary sometimes!
    if invalid_binding_score(ic50):
        ic50 = 50000 ** (-log_ic50 + 1)
    # if IC50 is still NaN or otherwise invalid, abort
    # (logging.warning: .warn is a deprecated alias)
    if invalid_binding_score(ic50):
        logging.warning(
            "Invalid IC50 value %0.4f for %s w/ allele %s",
            ic50, epitope, allele)
        return None
    if invalid_binding_score(rank) or rank > 100:
        logging.warning(
            "Invalid percentile rank %s for %s w/ allele %s",
            rank, epitope, allele)
        return None
    if mutation_window_size:
        # if we clipped parts of the amino acid sequence which don't
        # overlap mutations then we have to offset epitope positions by
        # however much was removed from the beginning of the sequence
        original_start = max(
            0, mutation_entry.MutationStart - mutation_window_size)
        pos += original_start
    # keep track of original genetic variant that gave rise to this epitope
    new_row = {
        # fields shared by all epitopes from this sequence
        'chr': mutation_entry.chr,
        'pos': mutation_entry.pos,
        'ref': mutation_entry.ref,
        'alt': mutation_entry.alt,
        'SourceSequence': mutation_entry.SourceSequence,
        'MutationStart': mutation_entry.MutationStart,
        'MutationEnd': mutation_entry.MutationEnd,
        'GeneInfo': mutation_entry.GeneInfo,
        'Gene': mutation_entry.Gene,
        'GeneMutationInfo': mutation_entry.GeneMutationInfo,
        'PeptideMutationInfo': mutation_entry.PeptideMutationInfo,
        'TranscriptId': mutation_entry.TranscriptId,
        # fields specific to this epitope
        'Allele': normalize_hla_allele_name(allele),
        'EpitopeStart': pos,
        'EpitopeEnd': pos + len(epitope),
        'Epitope': epitope,
        IC50_FIELD_NAME: ic50,
        PERCENTILE_RANK_FIELD_NAME: rank,
    }
    return new_row
def create_hla_types(file, patient_id):
    """
    Save an uploaded HLA file to the upload folder and build an HLAType
    record (with normalized allele name and MHC class) for each allele
    it contains. Returns the list of HLAType objects.
    """
    filename = secure_filename(file.filename)
    filepath = join(app.config['UPLOAD_FOLDER'], filename)
    file.save(filepath)
    hla_types = []
    for allele in read_hla_file(filepath):
        normalized = normalize_hla_allele_name(allele)
        hla_types.append(
            HLAType(
                patient_id=patient_id,
                allele=normalized,
                mhc_class=mhc_class_from_normalized_allele_name(normalized)))
    return hla_types
def create_binding_result_row(
        mutation_entry,
        allele,
        pos,
        epitope,
        log_ic50,
        ic50,
        rank,
        mutation_window_size=None):
    """
    Build one result-row dict for an (epitope, allele) binding prediction,
    combining the originating mutation's fields with this epitope's scores.

    Returns the dict, or None if the IC50 or percentile rank is invalid
    even after attempting to recover IC50 from its log value.

    Parameters
    ----------
    mutation_entry : record carrying the source variant fields (chr, pos,
        ref, alt, SourceSequence, MutationStart/End, gene/transcript info)
    allele : HLA allele name (normalized before storing)
    pos : epitope start offset within the (possibly clipped) sequence
    epitope : peptide string
    log_ic50, ic50 : predicted binding affinity (log-scale and nM)
    rank : percentile rank of the prediction (expected 0-100)
    mutation_window_size : if set, the sequence was clipped around the
        mutation, so epitope positions are shifted back to the original
        coordinates
    """
    # if we have a bad IC50 score we might still get a salvageable
    # log of the score. Strangely, this is necessary sometimes!
    if invalid_binding_score(ic50):
        ic50 = 50000 ** (-log_ic50 + 1)
    # if IC50 is still NaN or otherwise invalid, abort
    # (logging.warning: .warn is a deprecated alias)
    if invalid_binding_score(ic50):
        logging.warning(
            "Invalid IC50 value %0.4f for %s w/ allele %s",
            ic50, epitope, allele)
        return None
    if invalid_binding_score(rank) or rank > 100:
        logging.warning(
            "Invalid percentile rank %s for %s w/ allele %s",
            rank, epitope, allele)
        return None
    if mutation_window_size:
        # if we clipped parts of the amino acid sequence which don't
        # overlap mutations then we have to offset epitope positions by
        # however much was removed from the beginning of the sequence
        original_start = max(
            0, mutation_entry.MutationStart - mutation_window_size)
        pos += original_start
    # keep track of original genetic variant that gave rise to this epitope
    new_row = {
        # fields shared by all epitopes from this sequence
        'chr': mutation_entry.chr,
        'pos': mutation_entry.pos,
        'ref': mutation_entry.ref,
        'alt': mutation_entry.alt,
        'SourceSequence': mutation_entry.SourceSequence,
        'MutationStart': mutation_entry.MutationStart,
        'MutationEnd': mutation_entry.MutationEnd,
        'GeneInfo': mutation_entry.GeneInfo,
        'Gene': mutation_entry.Gene,
        'GeneMutationInfo': mutation_entry.GeneMutationInfo,
        'PeptideMutationInfo': mutation_entry.PeptideMutationInfo,
        'TranscriptId': mutation_entry.TranscriptId,
        # fields specific to this epitope
        'Allele': normalize_hla_allele_name(allele),
        'EpitopeStart': pos,
        'EpitopeEnd': pos + len(epitope),
        'Epitope': epitope,
        IC50_FIELD_NAME: ic50,
        PERCENTILE_RANK_FIELD_NAME: rank,
    }
    return new_row
predictor = ConsensusBindingPredictor(alleles) return predictor.predict(mutated_regions) else: predictor = PanBindingPredictor(alleles) return predictor.predict(mutated_regions) if __name__ == '__main__': args = parser.parse_args() init_logging(args.quiet) peptide_length = int(args.vaccine_peptide_length) # get rid of gene descriptions if they're in the dataframe if args.hla_file: alleles = [normalize_hla_allele_name(l) for l in open(args.hla_file)] elif args.hla: alleles = [normalize_hla_allele_name(l) for l in args.hla.split(",")] else: alleles = [normalize_hla_allele_name(DEFAULT_ALLELE)] # stack up the dataframes and later concatenate in case we # want both commandline strings (for weird mutations like translocations) # and files mutated_region_dfs = [] if args.string: df = load_comma_string(args.string) mutated_region_dfs.append(df) # loop over all the input files and