def test_mutant_amino_acids_in_mm10_chr9_82927102_refGT_altTG_pT441H():
    """
    Run VaxrankCoreLogic on the Phip VCF and hand each variant, together with
    the mutant protein fragment of its first vaccine peptide, to
    check_mutant_amino_acids.
    """
    # The Isovar version of this test is awkward: its VCF lists only the G>T
    # variant and leaves out the nucleotide change T>G that follows it.
    # The VCF used by vaxrank was edited to contain one GT>TG variant so that
    # phasing of variants does not need to be considered here.
    parsed_args = make_variant_sequences_arg_parser().parse_args([
        "--vcf", data_path("b16.f10/b16.f10.Phip.vcf"),
        "--bam", data_path("b16.f10/b16.combined.sorted.bam"),
    ])
    allele_reads = allele_reads_generator_from_args(parsed_args)
    variant_collection = variant_collection_from_args(parsed_args)
    vaxrank = VaxrankCoreLogic(
        reads_generator=allele_reads,
        mhc_predictor=random_binding_predictor,
        variants=variant_collection,
        vaccine_peptide_length=15,
        padding_around_mutation=5,
        min_alt_rna_reads=1,
        min_variant_sequence_coverage=1,
        variant_sequence_assembly=True,
        max_vaccine_peptides_per_variant=1)
    for variant, peptides in vaxrank.ranked_vaccine_peptides():
        # only the first vaccine peptide of each variant is inspected
        top_peptide = peptides[0]
        check_mutant_amino_acids(
            variant, top_peptide.mutant_protein_fragment)
def test_keep_top_k_epitopes():
    """
    Run VaxrankCoreLogic with num_mutant_epitopes_to_keep=3 and, for the first
    vaccine peptide of every variant, check that exactly that many mutant
    epitope predictions are present and that their summed logistic scores
    match the peptide's stored mutant_epitope_score.
    """
    top_k = 3
    parsed_args = make_variant_sequences_arg_parser().parse_args([
        "--vcf", data_path("b16.f10/b16.f10.Phip.vcf"),
        "--bam", data_path("b16.f10/b16.combined.sorted.bam"),
    ])
    allele_reads = allele_reads_generator_from_args(parsed_args)
    variant_collection = variant_collection_from_args(parsed_args)
    vaxrank = VaxrankCoreLogic(
        reads_generator=allele_reads,
        mhc_predictor=random_binding_predictor,
        variants=variant_collection,
        vaccine_peptide_length=15,
        padding_around_mutation=5,
        min_alt_rna_reads=1,
        min_variant_sequence_coverage=1,
        variant_sequence_assembly=True,
        max_vaccine_peptides_per_variant=1,
        num_mutant_epitopes_to_keep=top_k)

    for _variant, peptides in vaxrank.ranked_vaccine_peptides():
        top_peptide = peptides[0]
        eq_(top_k, len(top_peptide.mutant_epitope_predictions))

        # Recompute the expected score by hand to confirm that the top-k
        # argument was propagated through ranked_vaccine_peptides().
        recomputed_score = sum(
            prediction.logistic_epitope_score()
            for prediction in top_peptide.mutant_epitope_predictions)
        assert_almost_equal(recomputed_score, top_peptide.mutant_epitope_score)
def test_mutant_amino_acids_in_mm10_chrX_8125624_refC_altA_pS460I():
    """
    Run VaxrankCoreLogic on the Wdr13 VCF, require exactly one vaccine peptide
    per variant, and hand that peptide's mutant protein fragment to
    check_mutant_amino_acids.
    """
    # The RNAseq data contains two co-occurring variants, but because they do
    # not fall in the same codon the Varcode annotation is treated as correct.
    # TODO: deal with phasing of variants explicitly so that both
    # variant positions are considered mutated
    parsed_args = make_variant_sequences_arg_parser().parse_args([
        "--vcf", data_path("b16.f10/b16.f10.Wdr13.vcf"),
        "--bam", data_path("b16.f10/b16.combined.sorted.bam"),
    ])
    allele_reads = allele_reads_generator_from_args(parsed_args)
    variant_collection = variant_collection_from_args(parsed_args)
    vaxrank = VaxrankCoreLogic(
        variants=variant_collection,
        reads_generator=allele_reads,
        mhc_predictor=random_binding_predictor,
        vaccine_peptide_length=15,
        padding_around_mutation=5,
        max_vaccine_peptides_per_variant=1,
        min_alt_rna_reads=1,
        min_variant_sequence_coverage=1,
        variant_sequence_assembly=True)

    for variant, peptides in vaxrank.ranked_vaccine_peptides():
        n_peptides = len(peptides)
        failure_message = (
            "Expected 1 vaccine peptide for variant '%s' but got %d" % (
                variant, n_peptides))
        eq_(1, n_peptides, failure_message)
        check_mutant_amino_acids(variant, peptides[0].mutant_protein_fragment)
def test_mutant_amino_acids_in_mm10_chr9_82927102_refGT_altTG_pT441H():
    """
    Build a VaxrankCoreLogic from the Phip VCF and the combined B16 BAM, then
    call check_mutant_amino_acids on every variant with the mutant protein
    fragment taken from that variant's first vaccine peptide.
    """
    # In Isovar this test is odd since the VCF names just the G>T variant
    # without the T>G nucleotide change right after it. The vaxrank VCF was
    # changed to hold a GT>TG variant, which sidesteps variant phasing.
    vcf_path = data_path("b16.f10/b16.f10.Phip.vcf")
    bam_path = data_path("b16.f10/b16.combined.sorted.bam")
    parser_for_test = make_variant_sequences_arg_parser()
    cli_args = parser_for_test.parse_args(
        ["--vcf", vcf_path, "--bam", bam_path])

    rna_reads = allele_reads_generator_from_args(cli_args)
    loaded_variants = variant_collection_from_args(cli_args)
    logic = VaxrankCoreLogic(
        reads_generator=rna_reads,
        mhc_predictor=random_binding_predictor,
        variants=loaded_variants,
        vaccine_peptide_length=15,
        padding_around_mutation=5,
        min_alt_rna_reads=1,
        min_variant_sequence_coverage=1,
        variant_sequence_assembly=True,
        max_vaccine_peptides_per_variant=1)

    ranking = logic.ranked_vaccine_peptides()
    for variant, peptide_list in ranking:
        fragment = peptide_list[0].mutant_protein_fragment
        check_mutant_amino_acids(variant, fragment)
def predict_epitopes_from_args(args):
    """
    Run a TopiaryPredictor configured from the commandline arguments and
    return the result of its predict_from_variants call.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed commandline arguments for Topiary
    """
    binding_model = mhc_binding_predictor_from_args(args)
    variant_collection = variant_collection_from_args(args)
    expression_by_gene = rna_gene_expression_dict_from_args(args)
    expression_by_transcript = rna_transcript_expression_dict_from_args(args)

    predictor_options = dict(
        mhc_model=binding_model,
        padding_around_mutation=args.padding_around_mutation,
        ic50_cutoff=args.ic50_cutoff,
        percentile_cutoff=args.percentile_cutoff,
        min_transcript_expression=args.rna_min_transcript_expression,
        min_gene_expression=args.rna_min_gene_expression,
        only_novel_epitopes=args.only_novel_epitopes,
        # the CLI flag is phrased negatively, the predictor option positively
        raise_on_error=not args.skip_variant_errors)

    return TopiaryPredictor(**predictor_options).predict_from_variants(
        variants=variant_collection,
        transcript_expression_dict=expression_by_transcript,
        gene_expression_dict=expression_by_gene)
def test_mutant_amino_acids_in_mm10_chrX_8125624_refC_altA_pS460I():
    """
    Build a VaxrankCoreLogic from the Wdr13 VCF and the combined B16 BAM,
    assert that each variant yields a single vaccine peptide, and call
    check_mutant_amino_acids with that peptide's mutant protein fragment.
    """
    # Two variants co-occur in the RNAseq data; since they are not in the
    # same codon, the Varcode annotation is considered to be correct.
    # TODO: deal with phasing of variants explicitly so that both
    # variant positions are considered mutated
    vcf_path = data_path("b16.f10/b16.f10.Wdr13.vcf")
    bam_path = data_path("b16.f10/b16.combined.sorted.bam")
    parser_for_test = make_variant_sequences_arg_parser()
    cli_args = parser_for_test.parse_args(
        ["--vcf", vcf_path, "--bam", bam_path])

    rna_reads = allele_reads_generator_from_args(cli_args)
    loaded_variants = variant_collection_from_args(cli_args)
    logic = VaxrankCoreLogic(
        variants=loaded_variants,
        reads_generator=rna_reads,
        mhc_predictor=random_binding_predictor,
        vaccine_peptide_length=15,
        padding_around_mutation=5,
        max_vaccine_peptides_per_variant=1,
        min_alt_rna_reads=1,
        min_variant_sequence_coverage=1,
        variant_sequence_assembly=True)

    ranking = logic.ranked_vaccine_peptides()
    for variant, peptide_list in ranking:
        peptide_count = len(peptide_list)
        eq_(
            1,
            peptide_count,
            "Expected 1 vaccine peptide for variant '%s' but got %d" %
            (variant, peptide_count))
        fragment = peptide_list[0].mutant_protein_fragment
        check_mutant_amino_acids(variant, fragment)
def test_keep_top_k_epitopes():
    """
    Check that passing num_mutant_epitopes_to_keep to VaxrankCoreLogic leaves
    that many mutant epitope predictions on the first vaccine peptide of each
    variant, and that the peptide's mutant_epitope_score equals the sum of
    those predictions' logistic epitope scores.
    """
    vcf_path = data_path("b16.f10/b16.f10.Phip.vcf")
    bam_path = data_path("b16.f10/b16.combined.sorted.bam")
    parser_for_test = make_variant_sequences_arg_parser()
    cli_args = parser_for_test.parse_args(
        ["--vcf", vcf_path, "--bam", bam_path])

    rna_reads = allele_reads_generator_from_args(cli_args)
    loaded_variants = variant_collection_from_args(cli_args)
    expected_epitope_count = 3
    logic = VaxrankCoreLogic(
        reads_generator=rna_reads,
        mhc_predictor=random_binding_predictor,
        variants=loaded_variants,
        vaccine_peptide_length=15,
        padding_around_mutation=5,
        min_alt_rna_reads=1,
        min_variant_sequence_coverage=1,
        variant_sequence_assembly=True,
        max_vaccine_peptides_per_variant=1,
        num_mutant_epitopes_to_keep=expected_epitope_count)

    ranking = logic.ranked_vaccine_peptides()
    for _variant, peptide_list in ranking:
        first_peptide = peptide_list[0]
        actual_epitope_count = len(first_peptide.mutant_epitope_predictions)
        eq_(expected_epitope_count, actual_epitope_count)

        # Sum the per-epitope scores independently; this only matches the
        # stored score if the top-k setting reached the ranking code.
        score_total = sum(
            epitope.logistic_epitope_score()
            for epitope in first_peptide.mutant_epitope_predictions)
        assert_almost_equal(score_total, first_peptide.mutant_epitope_score)
def variant_reads_generator_from_args(args):
    """
    Call reads_supporting_variants with the variants, alignment file and read
    filtering options taken from parsed commandline arguments.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed commandline arguments; the attributes use_duplicate_reads,
        drop_secondary_alignments and min_mapping_quality are read here.

    Returns whatever reads_supporting_variants returns.
    """
    variant_collection = variant_collection_from_args(args)
    alignment_source = samfile_from_args(args)
    read_filter_options = {
        "use_duplicate_reads": args.use_duplicate_reads,
        # the CLI flag drops secondary alignments, the callee opts in to them
        "use_secondary_alignments": not args.drop_secondary_alignments,
        "min_mapping_quality": args.min_mapping_quality,
    }
    return reads_supporting_variants(
        variants=variant_collection,
        samfile=alignment_source,
        **read_filter_options)
def read_evidence_generator_from_args(args):
    """
    Creates a generator of (Variant, ReadEvidence) pairs from parsed
    arguments.

    The variants, alignment file and read collector are each built from
    `args` and combined through the collector's read_evidence_generator
    method.
    """
    variant_collection = variant_collection_from_args(args)
    alignment_file = alignment_file_from_args(args)
    collector = read_collector_from_args(args)
    evidence_pairs = collector.read_evidence_generator(
        variants=variant_collection,
        alignment_file=alignment_file)
    return evidence_pairs
def run_isovar_from_parsed_args(args):
    """
    Extract parameters from parsed arguments and use them to run Isovar.

    Each input to run_isovar (variants, read collector, alignment file,
    protein sequence creator and filter thresholds) is built from `args` by
    its matching *_from_args helper; the return value of run_isovar is passed
    back unchanged.
    """
    variant_collection = variant_collection_from_args(args)
    collector = read_collector_from_args(args)
    alignments = alignment_file_from_args(args)
    sequence_creator = protein_sequence_creator_from_args(args)
    thresholds = filter_threshold_dict_from_args(args)

    isovar_inputs = dict(
        variants=variant_collection,
        alignment_file=alignments,
        read_collector=collector,
        protein_sequence_creator=sequence_creator,
        filter_thresholds=thresholds)
    return run_isovar(**isovar_inputs)
def main(args_list=None):
    """
    Print a plain-text summary of the variants selected by the commandline
    arguments, and optionally write the same text to a file.

    The summary lists the total, SNV, indel and coding non-synonymous variant
    counts, followed by the coding variants whose gene ID appears in each of
    the cancer driver, class I MHC and interferon response gene ID sets.

    Parameters
    ----------
    args_list : list of str, optional
        Commandline arguments to parse; sys.argv[1:] is used when None.
    """
    if args_list is None:
        args_list = sys.argv[1:]
    args = parser.parse_args(args_list)

    variants = variant_collection_from_args(args)
    coding_effects_per_variant = (
        variants.effects()
        .drop_silent_and_noncoding()
        .top_priority_effect_per_variant())

    count_row = "%30s: %5d\n"

    with StringIO() as report:

        def write_gene_section(heading, gene_ids):
            # One heading line, then one line per coding effect whose gene
            # belongs to the given set of gene IDs.
            report.write(heading)
            for variant, effect in coding_effects_per_variant.items():
                if effect.gene_id in gene_ids:
                    report.write("-- %s %s (%s)\n" % (
                        effect.gene_name,
                        effect.short_description,
                        variant.short_description))

        report.write(count_row % ("total variants", len(variants)))
        report.write(count_row % (
            "# SNVs", sum(v.is_snv for v in variants)))
        report.write(count_row % (
            "# indels", sum(v.is_indel for v in variants)))
        report.write(count_row % (
            "coding non-synonymous variants",
            len(coding_effects_per_variant)))
        report.write("===\n\n")

        write_gene_section(
            "\nCoding variants in known cancer genes:\n",
            cancer_driver_gene_id_set)
        write_gene_section(
            "\nCoding variants in MHC-I presentation genes:\n",
            class1_mhc_gene_id_set)
        write_gene_section(
            "\nCoding variants in interferon response genes:\n",
            interferon_response_gene_id_set)

        # the buffer contents must be read before the `with` closes it
        text = report.getvalue()

    print(text)
    if args.output_text:
        with open(args.output_text, "w") as f:
            f.write(text)
def run(args_list=None):
    """
    Generate protein sequences (optionally cut into peptides), add decoys,
    and write everything to a FASTA file.

    Parameters
    ----------
    args_list : list of str, optional
        Commandline arguments to parse; argv[1:] is used when None.

    Progress messages are printed along the way and the records are written
    to the path given by the parsed `output` argument.
    """
    if args_list is None:
        args_list = argv[1:]
    args = parser.parse_args(args_list)
    print("MS-MHC version %s" % __version__)

    # fall back to grch37 when no genome name was given
    reference_genome = genome_for_reference_name(args.genome or "grch37")
    print("Using reference genome %s" % reference_genome)

    has_variant_input = (
        args.vcf or args.maf or args.variant or args.json_variants)
    variants = variant_collection_from_args(args) if has_variant_input else []

    hits = generate_protein_sequences(
        genome=reference_genome,
        variants=variants,
        upstream_reading_frames=args.upstream_reading_frames,
        downstream_reading_frames=args.downstream_reading_frames,
        skip_exons=args.skip_exons,
        min_peptide_length=args.min_peptide_length,
        restrict_sources_to_gene_name=args.gene_name)

    if not args.extract_peptides:
        # group by amino acid sequence so that repeated protein sequences
        # end up under a single key
        sequence_dict = defaultdict(list)
        for protein in hits:
            sequence_dict[protein.amino_acids].append(protein)
    else:
        print("Extracting %dmer-%dmer peptides from generated sequences" %
              (args.min_peptide_length, args.max_peptide_length))
        sequence_dict = extract_peptides(
            hits,
            min_length=args.min_peptide_length,
            max_length=args.max_peptide_length)
    hits = collapse_peptide_sources(sequence_dict)

    n_decoys = len(hits) * args.num_decoys_per_hit
    decoys = generate_decoys(
        hits, n_decoys=n_decoys, random_seed=args.random_seed)
    combined_sequences = hits + decoys

    print("Writing %d FASTA records (%d hits, %d decoys)" %
          (len(combined_sequences), len(hits), len(decoys)))
    with open(args.output, "w") as fasta_file:
        for record in progressbar(combined_sequences):
            record.write_to_fasta_file(fasta_file)
    print("Done.")
def ranked_variant_list_with_metadata(args):
    """
    Computes all the data needed for report generation.

    When `args` has an `input_json_file` attribute, the data is loaded from
    that JSON file instead of being recomputed: the saved args are updated
    with the args of this run and the variant list is truncated to
    `args.max_mutations_in_report` entries.

    Otherwise the ranked vaccine peptides are computed from the variants, RNA
    reads and MHC predictor described by `args`, and the result is optionally
    written to `args.output_json_file`.

    Parameters
    ----------
    args : Namespace
      Parsed user args from this run

    Returns a dictionary containing 3 items:
    - ranked variant/vaccine peptide list ('variants')
    - a dictionary of command-line arguments used to generate it ('args')
    - patient info object ('patient_info')
    """
    if hasattr(args, 'input_json_file'):
        with open(args.input_json_file) as f:
            data = serializable.from_json(f.read())
        # the JSON data from the previous run will have the older args saved,
        # which may need to be overridden with args from this run (which will
        # all be output related)
        data['args'].update(vars(args))

        # Truncate the variant list based on max_mutations_in_report. A plain
        # slice leaves a short list untouched and, unlike a `len(...) > limit`
        # comparison, also accepts a limit of None (meaning "no limit"), which
        # matches how the freshly computed list is truncated below.
        data['variants'] = data['variants'][:args.max_mutations_in_report]
        return data

    # get various things from user args
    mhc_alleles = mhc_alleles_from_args(args)
    logger.info("MHC alleles: %s", mhc_alleles)
    variants = variant_collection_from_args(args)
    logger.info("Variants: %s", variants)

    # generator that for each variant gathers all RNA reads, both those
    # supporting the variant and reference alleles
    reads_generator = allele_reads_generator_from_args(args)
    mhc_predictor = mhc_binding_predictor_from_args(args)

    ranked_list, variants_count_dict = ranked_vaccine_peptides(
        reads_generator=reads_generator,
        mhc_predictor=mhc_predictor,
        vaccine_peptide_length=args.vaccine_peptide_length,
        padding_around_mutation=args.padding_around_mutation,
        max_vaccine_peptides_per_variant=args.max_vaccine_peptides_per_mutation,
        min_alt_rna_reads=args.min_alt_rna_reads,
        min_variant_sequence_coverage=args.min_variant_sequence_coverage,
        min_epitope_score=args.min_epitope_score,
        num_mutant_epitopes_to_keep=args.num_epitopes_per_peptide,
        variant_sequence_assembly=args.variant_sequence_assembly)

    ranked_list_for_report = ranked_list[:args.max_mutations_in_report]

    patient_info = PatientInfo(
        patient_id=args.output_patient_id,
        vcf_paths=variants.sources,
        bam_path=args.bam,
        mhc_alleles=mhc_alleles,
        num_somatic_variants=len(variants),
        num_coding_effect_variants=variants_count_dict[
            'num_coding_effect_variants'],
        num_variants_with_rna_support=variants_count_dict[
            'num_variants_with_rna_support'],
        num_variants_with_vaccine_peptides=variants_count_dict[
            'num_variants_with_vaccine_peptides'])

    # return variants, patient info, and command-line args
    data = {
        'variants': ranked_list_for_report,
        'patient_info': patient_info,
        'args': vars(args),
    }
    logger.info('About to save args: %s', data['args'])

    # save JSON data if necessary. as of time of writing, vaxrank takes ~25 min
    # to run, most of which is core logic. the formatting is super fast, and it
    # can be useful to save the data to be able to iterate just on the
    # formatting
    if args.output_json_file:
        with open(args.output_json_file, 'w') as f:
            f.write(serializable.to_json(data))
            logger.info('Wrote JSON report data to %s', args.output_json_file)

    return data
def ranked_vaccine_peptides_with_metadata_from_parsed_args(args):
    """
    Computes all the data needed for report generation.

    When `args` has an `input_json_file` attribute, the data is loaded from
    that JSON file instead of being recomputed: the saved args are updated
    with the args of this run and the variant list is truncated to
    `args.max_mutations_in_report` entries.

    Otherwise vaxrank is run from `args`, passing-variant properties are
    optionally written to `args.output_passing_variants_csv`, and the result
    is optionally written to `args.output_json_file`.

    Parameters
    ----------
    args : Namespace
      Parsed user args from this run

    Returns a dictionary containing 3 items:
    - ranked variant/vaccine peptide list ('variants')
    - a dictionary of command-line arguments used to generate it ('args')
    - patient info object ('patient_info')
    """
    if hasattr(args, 'input_json_file'):
        with open(args.input_json_file) as f:
            data = serializable.from_json(f.read())
        # the JSON data from the previous run will have the older args saved,
        # which may need to be overridden with args from this run (which will
        # all be output related)
        data['args'].update(vars(args))

        # Truncate the variant list based on max_mutations_in_report. A plain
        # slice leaves a short list untouched and, unlike a `len(...) > limit`
        # comparison, also accepts a limit of None (meaning "no limit"), which
        # matches how the freshly computed list is truncated below.
        data['variants'] = data['variants'][:args.max_mutations_in_report]
        return data

    # get various things from user args
    mhc_alleles = mhc_alleles_from_args(args)
    logger.info("MHC alleles: %s", mhc_alleles)

    variants = variant_collection_from_args(args)
    logger.info("Variants: %s", variants)

    vaxrank_results = run_vaxrank_from_parsed_args(args)

    # sanity check: the run must have seen every variant that was loaded here
    variants_count_dict = vaxrank_results.variant_counts()
    assert len(variants) == variants_count_dict['num_total_variants'], \
        "Len(variants) is %d but variants_count_dict came back with %d" % (
            len(variants), variants_count_dict['num_total_variants'])

    if args.output_passing_variants_csv:
        variant_metadata_dicts = vaxrank_results.variant_properties(
            gene_pathway_check=GenePathwayCheck())
        df = pd.DataFrame(variant_metadata_dicts)
        df.to_csv(args.output_passing_variants_csv, index=False)

    ranked_variants_with_vaccine_peptides = vaxrank_results.ranked_vaccine_peptides
    ranked_variants_with_vaccine_peptides_for_report = \
        ranked_variants_with_vaccine_peptides[:args.max_mutations_in_report]

    patient_info = PatientInfo(
        patient_id=args.output_patient_id,
        vcf_paths=variants.sources,
        bam_path=args.bam,
        mhc_alleles=mhc_alleles,
        num_somatic_variants=variants_count_dict['num_total_variants'],
        num_coding_effect_variants=variants_count_dict[
            'num_coding_effect_variants'],
        num_variants_with_rna_support=variants_count_dict[
            'num_variants_with_rna_support'],
        num_variants_with_vaccine_peptides=variants_count_dict[
            'num_variants_with_vaccine_peptides'])

    # return variants, patient info, and command-line args
    data = {
        # TODO:
        #  change this field to 'ranked_variants_with_vaccine_peptides'
        #  but figure out how to do it in a backwards compatible way
        'variants': ranked_variants_with_vaccine_peptides_for_report,
        'patient_info': patient_info,
        'args': vars(args),
    }
    logger.info('About to save args: %s', data['args'])

    # save JSON data if necessary. as of time of writing, vaxrank takes ~25 min
    # to run, most of which is core logic. the formatting is super fast, and it
    # can be useful to save the data to be able to iterate just on the
    # formatting
    if args.output_json_file:
        with open(args.output_json_file, 'w') as f:
            f.write(serializable.to_json(data))
            logger.info('Wrote JSON report data to %s', args.output_json_file)

    return data
def ranked_variant_list_with_metadata(args):
    """
    Computes all the data needed for report generation.

    If `args` carries an `input_json_file` attribute, previously saved report
    data is read from that file, its saved args are overridden by the args of
    this run, and its variant list is cut down to
    `args.max_mutations_in_report` entries.

    Otherwise the ranked vaccine peptides are computed from the variants, RNA
    reads and MHC predictor described by `args`; the resulting data is written
    to `args.output_json_file` when that argument is set.

    Parameters
    ----------
    args : Namespace
      Parsed user args from this run

    Returns a dictionary containing 3 items:
    - ranked variant/vaccine peptide list ('variants')
    - a dictionary of command-line arguments used to generate it ('args')
    - patient info object ('patient_info')
    """
    if hasattr(args, 'input_json_file'):
        with open(args.input_json_file) as f:
            data = serializable.from_json(f.read())
        # the JSON data from the previous run will have the older args saved,
        # which may need to be overridden with args from this run (which will
        # all be output related)
        data['args'].update(vars(args))

        # Apply max_mutations_in_report with an unconditional slice: it is a
        # no-op for lists that are already short enough, and it tolerates a
        # limit of None, whereas comparing len(...) against None would raise
        # TypeError. The non-cached path below truncates the same way.
        data['variants'] = data['variants'][:args.max_mutations_in_report]
        return data

    # get various things from user args
    mhc_alleles = mhc_alleles_from_args(args)
    logger.info("MHC alleles: %s", mhc_alleles)
    variants = variant_collection_from_args(args)
    logger.info("Variants: %s", variants)

    # generator that for each variant gathers all RNA reads, both those
    # supporting the variant and reference alleles
    reads_generator = allele_reads_generator_from_args(args)
    mhc_predictor = mhc_binding_predictor_from_args(args)

    ranked_list, variants_count_dict = ranked_vaccine_peptides(
        reads_generator=reads_generator,
        mhc_predictor=mhc_predictor,
        vaccine_peptide_length=args.vaccine_peptide_length,
        padding_around_mutation=args.padding_around_mutation,
        max_vaccine_peptides_per_variant=args.max_vaccine_peptides_per_mutation,
        min_alt_rna_reads=args.min_alt_rna_reads,
        min_variant_sequence_coverage=args.min_variant_sequence_coverage,
        min_epitope_score=args.min_epitope_score,
        num_mutant_epitopes_to_keep=args.num_epitopes_per_peptide,
        variant_sequence_assembly=args.variant_sequence_assembly)

    ranked_list_for_report = ranked_list[:args.max_mutations_in_report]

    patient_info = PatientInfo(
        patient_id=args.output_patient_id,
        vcf_paths=variants.sources,
        bam_path=args.bam,
        mhc_alleles=mhc_alleles,
        num_somatic_variants=len(variants),
        num_coding_effect_variants=variants_count_dict['num_coding_effect_variants'],
        num_variants_with_rna_support=variants_count_dict['num_variants_with_rna_support'],
        num_variants_with_vaccine_peptides=variants_count_dict['num_variants_with_vaccine_peptides']
    )

    # return variants, patient info, and command-line args
    data = {
        'variants': ranked_list_for_report,
        'patient_info': patient_info,
        'args': vars(args),
    }
    logger.info('About to save args: %s', data['args'])

    # save JSON data if necessary. as of time of writing, vaxrank takes ~25 min
    # to run, most of which is core logic. the formatting is super fast, and it
    # can be useful to save the data to be able to iterate just on the
    # formatting
    if args.output_json_file:
        with open(args.output_json_file, 'w') as f:
            f.write(serializable.to_json(data))
            logger.info('Wrote JSON report data to %s', args.output_json_file)

    return data