def test_mutant_amino_acids_in_mm10_chr9_82927102_refGT_altTG_pT441H():
    # The Isovar copy of this test is odd: its VCF lists only the G>T
    # variant and leaves out the T>G change on the following nucleotide.
    # To sidestep variant phasing, the VCF used in vaxrank was changed to
    # contain a single GT>TG variant.
    cli_args = make_variant_sequences_arg_parser().parse_args([
        "--vcf", data_path("b16.f10/b16.f10.Phip.vcf"),
        "--bam", data_path("b16.f10/b16.combined.sorted.bam"),
    ])
    # Only the ranked list is needed here; the second returned item is unused.
    ranked_list, _ = ranked_vaccine_peptides(
        reads_generator=allele_reads_generator_from_args(cli_args),
        mhc_predictor=RandomBindingPredictor(["H-2-Kb", "H-2-Db"]),
        vaccine_peptide_length=15,
        padding_around_mutation=5,
        min_alt_rna_reads=1,
        min_variant_sequence_coverage=1,
        variant_sequence_assembly=True,
        max_vaccine_peptides_per_variant=1)
    for variant, peptides_for_variant in ranked_list:
        # Check the protein fragment of the first vaccine peptide only.
        first_peptide = peptides_for_variant[0]
        check_mutant_amino_acids(
            variant,
            first_peptide.mutant_protein_fragment)
def test_keep_top_k_epitopes():
    # Requests a fixed number of mutant epitopes per vaccine peptide and
    # verifies both the count and the score derived from those epitopes.
    keep_k_epitopes = 3
    cli_args = make_variant_sequences_arg_parser().parse_args([
        "--vcf", data_path("b16.f10/b16.f10.Phip.vcf"),
        "--bam", data_path("b16.f10/b16.combined.sorted.bam"),
    ])
    ranked_list, _ = ranked_vaccine_peptides(
        reads_generator=allele_reads_generator_from_args(cli_args),
        mhc_predictor=RandomBindingPredictor(["H-2-Kb", "H-2-Db"]),
        vaccine_peptide_length=15,
        padding_around_mutation=5,
        min_alt_rna_reads=1,
        min_variant_sequence_coverage=1,
        variant_sequence_assembly=True,
        max_vaccine_peptides_per_variant=1,
        num_mutant_epitopes_to_keep=keep_k_epitopes)

    for _variant, peptides_for_variant in ranked_list:
        top_peptide = peptides_for_variant[0]
        eq_(keep_k_epitopes, len(top_peptide.mutant_epitope_predictions))
        # Recompute the expected score from the kept predictions to make sure
        # the top-k argument given to ranked_vaccine_peptides() propagated.
        expected_score = sum(
            prediction.logistic_epitope_score()
            for prediction in top_peptide.mutant_epitope_predictions)
        assert_almost_equal(expected_score, top_peptide.mutant_epitope_score)
def test_mutant_amino_acids_in_mm10_chrX_8125624_refC_altA_pS460I():
    # The RNAseq data holds two co-occurring variants, but they fall in
    # different codons, so the Varcode annotation is treated as correct.
    # TODO: deal with phasing of variants explicitly so that both
    # variant positions are considered mutated
    cli_args = make_variant_sequences_arg_parser().parse_args([
        "--vcf", data_path("b16.f10/b16.f10.Wdr13.vcf"),
        "--bam", data_path("b16.f10/b16.combined.sorted.bam"),
    ])
    ranked_list, _ = ranked_vaccine_peptides(
        reads_generator=allele_reads_generator_from_args(cli_args),
        mhc_predictor=RandomBindingPredictor(["H-2-Kb", "H-2-Db"]),
        vaccine_peptide_length=15,
        padding_around_mutation=5,
        max_vaccine_peptides_per_variant=1,
        min_alt_rna_reads=1,
        min_variant_sequence_coverage=1,
        variant_sequence_assembly=True)

    for variant, peptides_for_variant in ranked_list:
        # max_vaccine_peptides_per_variant=1 was requested above, so exactly
        # one peptide is expected for every variant.
        eq_(
            1,
            len(peptides_for_variant),
            "Expected 1 vaccine peptide for variant '%s' but got %d" % (
                variant, len(peptides_for_variant)))
        only_peptide = peptides_for_variant[0]
        check_mutant_amino_acids(
            variant,
            only_peptide.mutant_protein_fragment)
def test_mutant_amino_acids_in_mm10_chr9_82927102_refGT_altTG_pT441H():
    # Runs ranked_vaccine_peptides on the Phip VCF and the combined B16 BAM,
    # then hands each variant and the mutant protein fragment of its first
    # vaccine peptide to check_mutant_amino_acids.
    #
    # In the Isovar repository this test is weird because the VCF only
    # mentions the G>T variant but doesn't include the subsequent nucleotide
    # change T>G. To avoid having to think about phasing of variants I changed
    # the VCF in vaxrank to contain a GT>TG variant.
    arg_parser = make_variant_sequences_arg_parser()
    args = arg_parser.parse_args([
        "--vcf", data_path("b16.f10/b16.f10.Phip.vcf"),
        "--bam", data_path("b16.f10/b16.combined.sorted.bam"),
    ])
    reads_generator = allele_reads_generator_from_args(args)
    # ranked_vaccine_peptides is unpacked as a 2-tuple (ranked list, variant
    # counts) by its other callers in this code base. Only the ranked list is
    # needed here; iterating the raw return value would walk the tuple itself
    # rather than the (variant, vaccine_peptides) pairs.
    ranked_list, _ = ranked_vaccine_peptides(
        reads_generator=reads_generator,
        mhc_predictor=RandomBindingPredictor(["H-2-Kb", "H-2-Db"]),
        vaccine_peptide_length=15,
        padding_around_mutation=5,
        min_alt_rna_reads=1,
        min_variant_sequence_coverage=1,
        variant_sequence_assembly=True,
        max_vaccine_peptides_per_variant=1)

    for variant, vaccine_peptides in ranked_list:
        # At most one peptide per variant was requested, so check the first.
        vaccine_peptide = vaccine_peptides[0]
        mutant_protein_fragment = vaccine_peptide.mutant_protein_fragment
        check_mutant_amino_acids(variant, mutant_protein_fragment)
def test_mutant_amino_acids_in_mm10_chrX_8125624_refC_altA_pS460I():
    # Runs ranked_vaccine_peptides on the Wdr13 VCF and the combined B16 BAM,
    # asserts that each variant yields exactly one vaccine peptide, and hands
    # the variant plus that peptide's mutant protein fragment to
    # check_mutant_amino_acids.
    #
    # there are two co-occurring variants in the RNAseq data but since
    # they don't happen in the same codon then we're considering the Varcode
    # annotation to be correct
    # TODO: deal with phasing of variants explicitly so that both
    # variant positions are considered mutated
    arg_parser = make_variant_sequences_arg_parser()
    args = arg_parser.parse_args([
        "--vcf", data_path("b16.f10/b16.f10.Wdr13.vcf"),
        "--bam", data_path("b16.f10/b16.combined.sorted.bam"),
    ])
    reads_generator = allele_reads_generator_from_args(args)
    # ranked_vaccine_peptides is unpacked as a 2-tuple (ranked list, variant
    # counts) by its other callers in this code base. Only the ranked list is
    # needed here; iterating the raw return value would walk the tuple itself
    # rather than the (variant, vaccine_peptides) pairs.
    ranked_list, _ = ranked_vaccine_peptides(
        reads_generator=reads_generator,
        mhc_predictor=RandomBindingPredictor(["H-2-Kb", "H-2-Db"]),
        vaccine_peptide_length=15,
        padding_around_mutation=5,
        max_vaccine_peptides_per_variant=1,
        min_alt_rna_reads=1,
        min_variant_sequence_coverage=1,
        variant_sequence_assembly=True)

    for variant, vaccine_peptides in ranked_list:
        eq_(
            1, len(vaccine_peptides),
            "Expected 1 vaccine peptide for variant '%s' but got %d" % (
                variant, len(vaccine_peptides)))
        vaccine_peptide = vaccine_peptides[0]
        mutant_protein_fragment = vaccine_peptide.mutant_protein_fragment
        check_mutant_amino_acids(variant, mutant_protein_fragment)
def test_keep_top_k_epitopes():
    # Asks ranked_vaccine_peptides() to keep a fixed number of mutant epitopes
    # and checks that each top vaccine peptide reflects that setting.
    parsed_args = make_variant_sequences_arg_parser().parse_args([
        "--vcf", data_path("b16.f10/b16.f10.Phip.vcf"),
        "--bam", data_path("b16.f10/b16.combined.sorted.bam"),
    ])
    allele_reads = allele_reads_generator_from_args(parsed_args)
    keep_k_epitopes = 3
    ranking_output = ranked_vaccine_peptides(
        reads_generator=allele_reads,
        mhc_predictor=RandomBindingPredictor(["H-2-Kb", "H-2-Db"]),
        vaccine_peptide_length=15,
        padding_around_mutation=5,
        min_alt_rna_reads=1,
        min_variant_sequence_coverage=1,
        variant_sequence_assembly=True,
        max_vaccine_peptides_per_variant=1,
        num_mutant_epitopes_to_keep=keep_k_epitopes)
    # Two items come back; only the ranked list is inspected below.
    ranked_list, _ = ranking_output

    for _variant, candidates in ranked_list:
        best = candidates[0]
        eq_(keep_k_epitopes, len(best.mutant_epitope_predictions))
        # Re-derive the score from the kept predictions and compare it with
        # the stored one, to confirm the top-k setting propagated.
        recomputed_score = sum(
            epitope.logistic_epitope_score()
            for epitope in best.mutant_epitope_predictions)
        assert_almost_equal(recomputed_score, best.mutant_epitope_score)
Prints number of reads supporting ref, alt, and other alleles at variant loci. """ from __future__ import division, absolute_import import logging import logging.config import pkg_resources from isovar.cli.rna_reads import (make_rna_reads_arg_parser, allele_reads_generator_from_args) from isovar.allele_counts import allele_counts_dataframe logging.config.fileConfig( pkg_resources.resource_filename('isovar.cli', 'logging.conf')) logger = logging.getLogger(__name__) parser = make_rna_reads_arg_parser() parser.add_argument("--output", default="isovar-allele-counts-result.csv", help="Name of CSV file which contains read sequences") if __name__ == "__main__": args = parser.parse_args() logger.info(args) variants_and_allele_reads_generator = allele_reads_generator_from_args( args) allele_counts_df = allele_counts_dataframe( variants_and_allele_reads_generator) logger.info(allele_counts_df) allele_counts_df.to_csv(args.output)
def ranked_variant_list_with_metadata(args):
    """
    Computes all the data needed for report generation.

    If `args` carries a non-None `input_json_file`, the data saved by a
    previous run is loaded from that file, its saved args are overridden with
    the args of this run, and its variant list is truncated to
    `args.max_mutations_in_report`. Otherwise the ranked vaccine peptides are
    computed from the inputs named in `args`, and the result is written as
    JSON to `args.output_json_file` when that is set.

    Parameters
    ----------
    args : Namespace
        Parsed user args from this run

    Returns a dictionary containing 3 items:
    - 'variants': ranked variant/vaccine peptide list
    - 'args': a dictionary of command-line arguments used to generate it
    - 'patient_info': patient info object
    """
    # A missing attribute and an attribute left at None both mean "no cached
    # run"; only a real value is handed to open().
    input_json_file = getattr(args, 'input_json_file', None)
    if input_json_file is not None:
        with open(input_json_file) as f:
            data = serializable.from_json(f.read())
        # the JSON data from the previous run will have the older args saved,
        # which may need to be overridden with args from this run (which
        # should all be output related)
        data['args'].update(vars(args))
        # truncate the variant list based on max_mutations_in_report. Slicing
        # leaves a shorter list untouched and, unlike a length comparison,
        # also accepts None as "no limit", matching the fresh-run path below.
        data['variants'] = data['variants'][:args.max_mutations_in_report]
        return data

    # get various things from user args
    mhc_alleles = mhc_alleles_from_args(args)
    logger.info("MHC alleles: %s", mhc_alleles)
    variants = variant_collection_from_args(args)
    logger.info("Variants: %s", variants)

    # generator that for each variant gathers all RNA reads, both those
    # supporting the variant and reference alleles
    reads_generator = allele_reads_generator_from_args(args)
    mhc_predictor = mhc_binding_predictor_from_args(args)

    ranked_list, variants_count_dict = ranked_vaccine_peptides(
        reads_generator=reads_generator,
        mhc_predictor=mhc_predictor,
        vaccine_peptide_length=args.vaccine_peptide_length,
        padding_around_mutation=args.padding_around_mutation,
        max_vaccine_peptides_per_variant=args.max_vaccine_peptides_per_mutation,
        min_alt_rna_reads=args.min_alt_rna_reads,
        min_variant_sequence_coverage=args.min_variant_sequence_coverage,
        min_epitope_score=args.min_epitope_score,
        num_mutant_epitopes_to_keep=args.num_epitopes_per_peptide,
        variant_sequence_assembly=args.variant_sequence_assembly)

    ranked_list_for_report = ranked_list[:args.max_mutations_in_report]

    patient_info = PatientInfo(
        patient_id=args.output_patient_id,
        vcf_paths=variants.sources,
        bam_path=args.bam,
        mhc_alleles=mhc_alleles,
        num_somatic_variants=len(variants),
        num_coding_effect_variants=variants_count_dict[
            'num_coding_effect_variants'],
        num_variants_with_rna_support=variants_count_dict[
            'num_variants_with_rna_support'],
        num_variants_with_vaccine_peptides=variants_count_dict[
            'num_variants_with_vaccine_peptides'])

    # return variants, patient info, and command-line args
    data = {
        'variants': ranked_list_for_report,
        'patient_info': patient_info,
        'args': vars(args),
    }
    logger.info('About to save args: %s', data['args'])

    # save JSON data if necessary. as of time of writing, vaxrank takes ~25 min to run,
    # most of which is core logic. the formatting is super fast, and it can
    # be useful to save the data to be able to iterate just on the formatting
    if args.output_json_file:
        with open(args.output_json_file, 'w') as f:
            f.write(serializable.to_json(data))
            logger.info('Wrote JSON report data to %s', args.output_json_file)

    return data
def ranked_variant_list_with_metadata(args):
    """
    Computes all the data needed for report generation.

    Two modes are supported. When `args.input_json_file` exists and is not
    None, report data saved by an earlier run is read back from that file,
    updated with the args of this run and truncated to
    `args.max_mutations_in_report` variants. In every other case the ranked
    vaccine peptides are computed from scratch, and the resulting data is
    also written to `args.output_json_file` if that option is set.

    Parameters
    ----------
    args : Namespace
        Parsed user args from this run

    Returns a dictionary containing 3 items:
    - 'variants': ranked variant/vaccine peptide list
    - 'args': a dictionary of command-line arguments used to generate it
    - 'patient_info': patient info object
    """
    # Treat an attribute that is absent and one that is None the same way, so
    # that None is never passed to open().
    cached_json_path = getattr(args, 'input_json_file', None)
    if cached_json_path is not None:
        with open(cached_json_path) as f:
            data = serializable.from_json(f.read())
        # the JSON data from the previous run will have the older args saved,
        # which may need to be overridden with args from this run (which
        # should all be output related)
        data['args'].update(vars(args))
        # Apply the report limit with a plain slice: it is a no-op for lists
        # already within the limit and tolerates a limit of None, exactly
        # like the slice used for freshly computed results further down.
        data['variants'] = data['variants'][:args.max_mutations_in_report]
        return data

    # get various things from user args
    mhc_alleles = mhc_alleles_from_args(args)
    logger.info("MHC alleles: %s", mhc_alleles)
    variants = variant_collection_from_args(args)
    logger.info("Variants: %s", variants)

    # generator that for each variant gathers all RNA reads, both those
    # supporting the variant and reference alleles
    reads_generator = allele_reads_generator_from_args(args)
    mhc_predictor = mhc_binding_predictor_from_args(args)

    ranked_list, variants_count_dict = ranked_vaccine_peptides(
        reads_generator=reads_generator,
        mhc_predictor=mhc_predictor,
        vaccine_peptide_length=args.vaccine_peptide_length,
        padding_around_mutation=args.padding_around_mutation,
        max_vaccine_peptides_per_variant=args.max_vaccine_peptides_per_mutation,
        min_alt_rna_reads=args.min_alt_rna_reads,
        min_variant_sequence_coverage=args.min_variant_sequence_coverage,
        min_epitope_score=args.min_epitope_score,
        num_mutant_epitopes_to_keep=args.num_epitopes_per_peptide,
        variant_sequence_assembly=args.variant_sequence_assembly)

    ranked_list_for_report = ranked_list[:args.max_mutations_in_report]

    patient_info = PatientInfo(
        patient_id=args.output_patient_id,
        vcf_paths=variants.sources,
        bam_path=args.bam,
        mhc_alleles=mhc_alleles,
        num_somatic_variants=len(variants),
        num_coding_effect_variants=variants_count_dict['num_coding_effect_variants'],
        num_variants_with_rna_support=variants_count_dict['num_variants_with_rna_support'],
        num_variants_with_vaccine_peptides=variants_count_dict['num_variants_with_vaccine_peptides']
    )

    # return variants, patient info, and command-line args
    data = {
        'variants': ranked_list_for_report,
        'patient_info': patient_info,
        'args': vars(args),
    }
    logger.info('About to save args: %s', data['args'])

    # save JSON data if necessary. as of time of writing, vaxrank takes ~25 min to run,
    # most of which is core logic. the formatting is super fast, and it can
    # be useful to save the data to be able to iterate just on the formatting
    if args.output_json_file:
        with open(args.output_json_file, 'w') as f:
            f.write(serializable.to_json(data))
            logger.info('Wrote JSON report data to %s', args.output_json_file)

    return data
""" from __future__ import division, absolute_import import logging import logging.config import pkg_resources from isovar.cli.rna_reads import ( make_rna_reads_arg_parser, allele_reads_generator_from_args ) from isovar.allele_counts import allele_counts_dataframe logging.config.fileConfig(pkg_resources.resource_filename('isovar.cli', 'logging.conf')) logger = logging.getLogger(__name__) parser = make_rna_reads_arg_parser() parser.add_argument( "--output", default="isovar-allele-counts-result.csv", help="Name of CSV file which contains read sequences") if __name__ == "__main__": args = parser.parse_args() logger.info(args) variants_and_allele_reads_generator = allele_reads_generator_from_args(args) allele_counts_df = allele_counts_dataframe(variants_and_allele_reads_generator) logger.info(allele_counts_df) allele_counts_df.to_csv(args.output)