def run_pipeline(patient_id, score_epitopes, peptide_length=31):
    """Run the epitope pipeline for one patient and save the output to the
    DB as a Run.

    Parameters
    ----------
    patient_id : identifier used to look up HLA types and the patient VCF.
    score_epitopes : callable(transcripts_df, alleles) -> DataFrame
        Injected scoring function, so callers choose the binding predictor.
    peptide_length : int, optional
        Exact peptide length considered; both the minimum and maximum
        lengths passed to ``expand_transcripts`` are pinned to this value
        (default 31, the historical hard-coded length).
    """
    hla_types = HLAType.query.with_entities(
        HLAType.allele, HLAType.mhc_class).filter_by(
            patient_id=patient_id).all()
    alleles = [normalize_hla_allele_name(allele)
               for allele, mhc_class in hla_types]
    vcf_df = get_vcf_df(patient_id)
    transcripts_df, vcf_df, variant_report = expand_transcripts(
        vcf_df,
        patient_id,
        min_peptide_length=peptide_length,
        max_peptide_length=peptide_length)
    scored_epitopes = score_epitopes(transcripts_df, alleles)
    imm = ImmunogenicityPredictor(alleles=alleles)
    scored_epitopes = imm.predict(scored_epitopes)
    # TODO(tavi) Make this expansion more robust. It breaks the IEDB
    # predictor, for example.
    short_transcripts_df = transcripts_df[
        ['chr', 'pos', 'ref', 'alt', 'TranscriptId']]
    scored_epitopes = merge(
        scored_epitopes, short_transcripts_df,
        on='TranscriptId', how='left')
    peptides = group_epitopes_dataframe(
        scored_epitopes, use_transcript_name=True)
    run = Run(patient_id=patient_id, output=dumps(peptides))
    # NOTE(review): only adds to the session; presumably the caller commits.
    db.session.add(run)
def generate_mutation_counts(
        mutation_files,
        hla_types,
        genes_expressed,
        max_peptide_length=31,
        skip_identifiers=None,
        output_file=None):
    """
    Count mutation and epitope statistics for each patient.

    Returns an OrderedDict that maps each patient ID to a tuple with six
    fields:
    - number of coding (gene, mutation) pairs
    - total number of mutated epitopes across all transcripts
    - number of mutations with at least one MHC-binding mutated epitope
    - number of mutated epitopes which are predicted to bind an MHC allele
    - number of mutations with at least one immunogenic mutated epitope
    - number of mutated epitopes which are predicted to be immunogenic
      (MHC binder + non-self)

    Parameters
    ----------
    mutation_files : mapping of patient ID -> VCF DataFrame
    hla_types : mapping of patient ID -> list of HLA allele names
    genes_expressed : unused; kept for interface compatibility
    max_peptide_length : int
    skip_identifiers : collection of patient IDs to skip (default: none)
    output_file : optional open file; one CSV row per patient is written
        and flushed immediately so partial results survive a crash

    Also reads the module-level ``args`` namespace for predictor options
    (``quiet``, ``netmhc_cons``, ``debug_scored_epitopes_csv``,
    ``binding_threshold``).
    """
    if skip_identifiers is None:
        # Immutable default avoids the shared mutable-default pitfall.
        skip_identifiers = ()
    mutation_counts = OrderedDict()
    n = len(mutation_files)
    for i, (patient_id, vcf_df) in enumerate(mutation_files.items()):
        if patient_id in skip_identifiers:
            logging.info("Skipping patient ID %s", patient_id)
            continue
        hla_allele_names = hla_types[patient_id]
        logging.info(
            "Processing %s (#%d/%d) with HLA alleles %s",
            patient_id, i + 1, n, hla_allele_names)
        if not args.quiet:
            print(vcf_df)
        try:
            transcripts_df, raw_genomic_mutation_df, variant_report = (
                expand_transcripts(
                    vcf_df,
                    patient_id,
                    max_peptide_length=max_peptide_length))
        except KeyboardInterrupt:
            raise
        except Exception:
            # Narrowed from a bare `except:`; still log and re-raise.
            logging.warning("Failed to apply mutations for %s", patient_id)
            raise
        # Print each genetic mutation applied to each possible transcript
        # and either why it failed or what protein mutation resulted.
        if not args.quiet:
            print_mutation_report(
                patient_id,
                variant_report,
                raw_genomic_mutation_df,
                transcripts_df)
        logging.info(
            "Calling MHC binding predictor for %s (#%d/%d)",
            patient_id, i + 1, n)

        def make_mhc_predictor():
            # Predictor choice comes from the command-line flags.
            if args.netmhc_cons:
                return ConsensusBindingPredictor(hla_allele_names)
            else:
                return PanBindingPredictor(hla_allele_names)

        # If we want to read scored_epitopes from a CSV file, do that;
        # otherwise run the predictor (and populate the cache file).
        if args.debug_scored_epitopes_csv:
            csv_file = args.debug_scored_epitopes_csv
            if isfile(csv_file):
                scored_epitopes = pd.read_csv(csv_file)
            else:
                mhc = make_mhc_predictor()
                scored_epitopes = mhc.predict(
                    transcripts_df, mutation_window_size=9)
                scored_epitopes.to_csv(csv_file)
        else:
            mhc = make_mhc_predictor()
            scored_epitopes = mhc.predict(
                transcripts_df, mutation_window_size=9)
        if not args.quiet:
            print(scored_epitopes)
        imm = ImmunogenicityPredictor(
            alleles=hla_allele_names,
            binding_threshold=args.binding_threshold)
        scored_epitopes = imm.predict(scored_epitopes)
        # Round-trip through CSV; presumably this normalizes dtypes the
        # same way the debug-CSV path does -- TODO(review): confirm.
        scored_epitopes.to_csv("scored_epitopes.csv")
        scored_epitopes = pd.read_csv("scored_epitopes.csv")
        grouped = scored_epitopes.groupby(["Gene", "GeneMutationInfo"])
        n_coding_mutations = len(grouped)
        n_epitopes = 0
        n_ligand_mutations = 0
        n_ligands = 0
        n_immunogenic_mutations = 0
        n_immunogenic_epitopes = 0
        for (gene, mut), group in grouped:
            # Keep only epitopes overlapping the mutated interval.
            start_mask = group.EpitopeStart < group.MutationEnd
            stop_mask = group.EpitopeEnd > group.MutationStart
            mutated_subset = group[start_mask & stop_mask]
            # We might have duplicate epitopes from multiple transcripts,
            # so group by epitope sequence to count each one once.
            n_curr_epitopes = len(mutated_subset.groupby(['Epitope']))
            n_epitopes += n_curr_epitopes
            below_threshold_mask = \
                mutated_subset.MHC_IC50 <= args.binding_threshold
            ligands = mutated_subset[below_threshold_mask]
            n_curr_ligands = len(ligands.groupby(['Epitope']))
            n_ligands += n_curr_ligands
            n_ligand_mutations += n_curr_ligands > 0
            # Thymically deleted (self) epitopes are not immunogenic.
            thymic_deletion_mask = \
                np.array(ligands.ThymicDeletion).astype(bool)
            immunogenic_epitopes = ligands[~thymic_deletion_mask]
            curr_immunogenic_epitopes = immunogenic_epitopes.groupby(
                ['Epitope']).first()
            n_immunogenic_epitopes += len(curr_immunogenic_epitopes)
            n_immunogenic_mutations += len(curr_immunogenic_epitopes) > 0
            logging.info(
                "%s %s: epitopes %d, ligands %d, imm %d",
                gene,
                mut,
                n_curr_epitopes,
                n_curr_ligands,
                len(curr_immunogenic_epitopes))
        result_tuple = (
            n_coding_mutations,
            n_epitopes,
            n_ligand_mutations,
            n_ligands,
            n_immunogenic_mutations,
            n_immunogenic_epitopes,
        )
        if output_file:
            # Stream one row per patient and flush immediately so partial
            # results survive a crash mid-run.
            data_string = ",".join(str(d) for d in result_tuple)
            output_file.write("%s,%s\n" % (patient_id, data_string))
            output_file.flush()
        mutation_counts[patient_id] = result_tuple
    return mutation_counts
def generate_mutation_counts(
        mutation_files,
        hla_types,
        genes_expressed,
        max_peptide_length=31,
        skip_identifiers=None,
        output_file=None):
    """
    Count mutation and epitope statistics for each patient.

    NOTE(review): this redefines ``generate_mutation_counts``; an
    essentially identical definition appears earlier in this file and is
    shadowed by this one -- consider deleting one of the two copies.

    Returns an OrderedDict that maps each patient ID to a tuple with six
    fields:
    - number of coding (gene, mutation) pairs
    - total number of mutated epitopes across all transcripts
    - number of mutations with at least one MHC-binding mutated epitope
    - number of mutated epitopes which are predicted to bind an MHC allele
    - number of mutations with at least one immunogenic mutated epitope
    - number of mutated epitopes which are predicted to be immunogenic
      (MHC binder + non-self)

    Parameters
    ----------
    mutation_files : mapping of patient ID -> VCF DataFrame
    hla_types : mapping of patient ID -> list of HLA allele names
    genes_expressed : unused; kept for interface compatibility
    max_peptide_length : int
    skip_identifiers : collection of patient IDs to skip (default: none)
    output_file : optional open file; one CSV row per patient is written
        and flushed immediately so partial results survive a crash

    Also reads the module-level ``args`` namespace for predictor options
    (``quiet``, ``netmhc_cons``, ``debug_scored_epitopes_csv``,
    ``binding_threshold``).
    """
    if skip_identifiers is None:
        # Immutable default avoids the shared mutable-default pitfall.
        skip_identifiers = ()
    mutation_counts = OrderedDict()
    n = len(mutation_files)
    for i, (patient_id, vcf_df) in enumerate(mutation_files.items()):
        if patient_id in skip_identifiers:
            logging.info("Skipping patient ID %s", patient_id)
            continue
        hla_allele_names = hla_types[patient_id]
        logging.info(
            "Processing %s (#%d/%d) with HLA alleles %s",
            patient_id, i + 1, n, hla_allele_names)
        if not args.quiet:
            print(vcf_df)
        try:
            transcripts_df, raw_genomic_mutation_df, variant_report = (
                expand_transcripts(
                    vcf_df,
                    patient_id,
                    max_peptide_length=max_peptide_length))
        except KeyboardInterrupt:
            raise
        except Exception:
            # Narrowed from a bare `except:`; still log and re-raise.
            logging.warning("Failed to apply mutations for %s", patient_id)
            raise
        # Print each genetic mutation applied to each possible transcript
        # and either why it failed or what protein mutation resulted.
        if not args.quiet:
            print_mutation_report(
                patient_id,
                variant_report,
                raw_genomic_mutation_df,
                transcripts_df)
        logging.info(
            "Calling MHC binding predictor for %s (#%d/%d)",
            patient_id, i + 1, n)

        def make_mhc_predictor():
            # Predictor choice comes from the command-line flags.
            if args.netmhc_cons:
                return ConsensusBindingPredictor(hla_allele_names)
            else:
                return PanBindingPredictor(hla_allele_names)

        # If we want to read scored_epitopes from a CSV file, do that;
        # otherwise run the predictor (and populate the cache file).
        if args.debug_scored_epitopes_csv:
            csv_file = args.debug_scored_epitopes_csv
            if isfile(csv_file):
                scored_epitopes = pd.read_csv(csv_file)
            else:
                mhc = make_mhc_predictor()
                scored_epitopes = mhc.predict(
                    transcripts_df, mutation_window_size=9)
                scored_epitopes.to_csv(csv_file)
        else:
            mhc = make_mhc_predictor()
            scored_epitopes = mhc.predict(
                transcripts_df, mutation_window_size=9)
        if not args.quiet:
            print(scored_epitopes)
        imm = ImmunogenicityPredictor(
            alleles=hla_allele_names,
            binding_threshold=args.binding_threshold)
        scored_epitopes = imm.predict(scored_epitopes)
        # Round-trip through CSV; presumably this normalizes dtypes the
        # same way the debug-CSV path does -- TODO(review): confirm.
        scored_epitopes.to_csv("scored_epitopes.csv")
        scored_epitopes = pd.read_csv("scored_epitopes.csv")
        grouped = scored_epitopes.groupby(["Gene", "GeneMutationInfo"])
        n_coding_mutations = len(grouped)
        n_epitopes = 0
        n_ligand_mutations = 0
        n_ligands = 0
        n_immunogenic_mutations = 0
        n_immunogenic_epitopes = 0
        for (gene, mut), group in grouped:
            # Keep only epitopes overlapping the mutated interval.
            start_mask = group.EpitopeStart < group.MutationEnd
            stop_mask = group.EpitopeEnd > group.MutationStart
            mutated_subset = group[start_mask & stop_mask]
            # We might have duplicate epitopes from multiple transcripts,
            # so group by epitope sequence to count each one once.
            n_curr_epitopes = len(mutated_subset.groupby(['Epitope']))
            n_epitopes += n_curr_epitopes
            below_threshold_mask = \
                mutated_subset.MHC_IC50 <= args.binding_threshold
            ligands = mutated_subset[below_threshold_mask]
            n_curr_ligands = len(ligands.groupby(['Epitope']))
            n_ligands += n_curr_ligands
            n_ligand_mutations += n_curr_ligands > 0
            # Thymically deleted (self) epitopes are not immunogenic.
            thymic_deletion_mask = \
                np.array(ligands.ThymicDeletion).astype(bool)
            immunogenic_epitopes = ligands[~thymic_deletion_mask]
            curr_immunogenic_epitopes = immunogenic_epitopes.groupby(
                ['Epitope']).first()
            n_immunogenic_epitopes += len(curr_immunogenic_epitopes)
            n_immunogenic_mutations += len(curr_immunogenic_epitopes) > 0
            logging.info(
                "%s %s: epitopes %d, ligands %d, imm %d",
                gene,
                mut,
                n_curr_epitopes,
                n_curr_ligands,
                len(curr_immunogenic_epitopes))
        result_tuple = (
            n_coding_mutations,
            n_epitopes,
            n_ligand_mutations,
            n_ligands,
            n_immunogenic_mutations,
            n_immunogenic_epitopes,
        )
        if output_file:
            # Stream one row per patient and flush immediately so partial
            # results survive a crash mid-run.
            data_string = ",".join(str(d) for d in result_tuple)
            output_file.write("%s,%s\n" % (patient_id, data_string))
            output_file.flush()
        mutation_counts[patient_id] = result_tuple
    return mutation_counts