def variants_to_protein_sequences_dataframe(
        expressed_vcf="data/b16.f10/b16.expressed.vcf",
        not_expressed_vcf="data/b16.f10/b16.not-expressed.vcf",
        tumor_rna_bam="data/b16.f10/b16.combined.sorted.bam",
        min_mapping_quality=0,
        max_protein_sequences_per_variant=1,
        variant_sequence_assembly=False):
    """
    Load two VCFs plus a tumor RNA BAM and turn them into a DataFrame of
    variant protein sequences.

    The variants from both VCFs are merged (expressed first, then
    not-expressed) into one VariantCollection, reads overlapping those
    variants are collected from the BAM, and the protein sequences derived
    from the reads are converted into a DataFrame.

    Returns a tuple of (DataFrame, expressed variants, combined variants).
    """
    expressed_variants = load_vcf(expressed_vcf)
    not_expressed_variants = load_vcf(not_expressed_vcf)

    # merge both call sets, keeping the expressed variants in front
    merged = list(expressed_variants)
    merged.extend(not_expressed_variants)
    combined_variants = VariantCollection(merged)

    reads_per_variant = reads_overlapping_variants(
        variants=combined_variants,
        samfile=load_bam(tumor_rna_bam),
        min_mapping_quality=min_mapping_quality)

    sequences_per_variant = reads_generator_to_protein_sequences_generator(
        reads_per_variant,
        max_protein_sequences_per_variant=max_protein_sequences_per_variant,
        variant_sequence_assembly=variant_sequence_assembly)

    df = protein_sequences_generator_to_dataframe(sequences_per_variant)
    return df, expressed_variants, combined_variants
def isovar_protein_sequence_dict(self):
    """
    Return a dictionary mapping Variant objects to the single isovar protein
    sequence that will be used to try to construct VaccinePeptides.

    The dictionary is computed on the first call and cached in
    self._isovar_protein_sequence_dict; later calls return the cached
    object. Variants for which no protein sequences are available (None or
    an empty collection) are logged and left out of the result.

    The cache is only stored once the whole generator has been consumed, so
    an exception raised part-way through never leaves a partial result
    behind for later calls to pick up.
    """
    if self._isovar_protein_sequence_dict is not None:
        return self._isovar_protein_sequence_dict

    # total number of amino acids is the vaccine peptide length plus the
    # number of off-center windows around the mutation
    protein_fragment_sequence_length = (
        self.vaccine_peptide_length + 2 * self.padding_around_mutation)

    # These sequences are only the ones that overlap the variant and support
    # the mutation.
    # Right now, this generator yields:
    # - (variant, mutant protein sequences) if there's enough alt RNA support
    # - (variant, None) if the variant is silent or there are ref reads
    #   overlapping the variant locus but inadequate alt RNA support.
    # - does not return the variant if there's no RNA support for ref or alt
    #   - we may miss some coding variants this way unless we check for them
    #     explicitly
    #
    # Future intended behavior: returns all passing variants, with a protein
    # sequences generator that is non empty if there are enough alt RNA reads
    # supporting the variant
    protein_sequences_generator = reads_generator_to_protein_sequences_generator(
        self.reads_generator,
        transcript_id_whitelist=None,
        protein_sequence_length=protein_fragment_sequence_length,
        min_alt_rna_reads=self.min_alt_rna_reads,
        min_variant_sequence_coverage=self.min_variant_sequence_coverage,
        variant_sequence_assembly=self.variant_sequence_assembly,
        max_protein_sequences_per_variant=1)

    variant_to_sequence = {}
    for variant, isovar_protein_sequences in protein_sequences_generator:
        # The value may be None (see the note above) or a lazy iterable, so
        # normalize it to a list before checking its size or indexing it.
        if isovar_protein_sequences is None:
            sequences = []
        else:
            sequences = list(isovar_protein_sequences)

        if not sequences:
            # variant RNA support is below threshold
            logger.info("No protein sequences for %s", variant)
            continue

        # use the first protein sequence - why?
        variant_to_sequence[variant] = sequences[0]

    # publish the cache only after every variant was processed successfully
    self._isovar_protein_sequence_dict = variant_to_sequence
    return self._isovar_protein_sequence_dict
def test_variants_to_protein_sequences_dataframe_filtered_all_reads_by_mapping_quality():
    # Every read in the B16 BAM has MAPQ=255, so asking for a minimum
    # mapping quality of 256 should filter out all of them and leave
    # nothing to translate.
    unreachable_mapping_quality = 256

    reads_per_variant = reads_overlapping_variants(
        variants=load_vcf("data/b16.f10/b16.vcf"),
        samfile=load_bam("data/b16.f10/b16.combined.sorted.bam"),
        min_mapping_quality=unreachable_mapping_quality)

    df = protein_sequences_generator_to_dataframe(
        reads_generator_to_protein_sequences_generator(
            reads_per_variant,
            max_protein_sequences_per_variant=1))
    print(df)

    n_rows = len(df)
    eq_(n_rows, 0, "Expected 0 entries, got %d: %s" % (n_rows, df))
def isovar_protein_sequence_dict(self):
    """
    Return a dictionary mapping Variant objects to the single isovar protein
    sequence that will be used to try to construct VaccinePeptides.

    The dictionary is computed on the first call and cached in
    self._isovar_protein_sequence_dict; later calls return the cached
    object. Variants for which no protein sequences are available (None or
    an empty collection) are logged and left out of the result.

    The cache is only stored once the whole generator has been consumed, so
    an exception raised part-way through never leaves a partial result
    behind for later calls to pick up.
    """
    if self._isovar_protein_sequence_dict is not None:
        return self._isovar_protein_sequence_dict

    # total number of amino acids is the vaccine peptide length plus the
    # number of off-center windows around the mutation
    protein_fragment_sequence_length = (
        self.vaccine_peptide_length + 2 * self.padding_around_mutation)

    # These sequences are only the ones that overlap the variant and support
    # the mutation.
    # Right now, this generator yields:
    # - (variant, mutant protein sequences) if there's enough alt RNA support
    # - (variant, None) if the variant is silent or there are ref reads
    #   overlapping the variant locus but inadequate alt RNA support.
    # - does not return the variant if there's no RNA support for ref or alt
    #   - we may miss some coding variants this way unless we check for them
    #     explicitly
    #
    # Future intended behavior: returns all passing variants, with a protein
    # sequences generator that is non empty if there are enough alt RNA reads
    # supporting the variant
    protein_sequences_generator = reads_generator_to_protein_sequences_generator(
        self.reads_generator,
        transcript_id_whitelist=None,
        protein_sequence_length=protein_fragment_sequence_length,
        min_alt_rna_reads=self.min_alt_rna_reads,
        min_variant_sequence_coverage=self.min_variant_sequence_coverage,
        variant_sequence_assembly=self.variant_sequence_assembly,
        max_protein_sequences_per_variant=1)

    variant_to_sequence = {}
    for variant, isovar_protein_sequences in protein_sequences_generator:
        # The value may be None (see the note above) or a lazy iterable, so
        # normalize it to a list before checking its size or indexing it.
        if isovar_protein_sequences is None:
            sequences = []
        else:
            sequences = list(isovar_protein_sequences)

        if not sequences:
            # variant RNA support is below threshold
            logger.info("No protein sequences for %s", variant)
            continue

        # use the first protein sequence - why?
        variant_to_sequence[variant] = sequences[0]

    # publish the cache only after every variant was processed successfully
    self._isovar_protein_sequence_dict = variant_to_sequence
    return self._isovar_protein_sequence_dict
def generate_vaccine_peptides(
        reads_generator,
        mhc_predictor,
        vaccine_peptide_length,
        padding_around_mutation,
        max_vaccine_peptides_per_variant,
        min_alt_rna_reads,
        min_variant_sequence_coverage,
        variant_sequence_assembly,
        min_epitope_score=0):
    """
    Build vaccine peptides for every variant produced by the reads
    generator.

    Returns a dictionary mapping each variant to the value returned by
    vaccine_peptides_for_variant for that variant.
    """
    # the protein fragment has to cover the vaccine peptide itself plus the
    # off-center windows on both sides of the mutation
    fragment_length = vaccine_peptide_length + 2 * padding_around_mutation

    sequences_per_variant = reads_generator_to_protein_sequences_generator(
        reads_generator,
        transcript_id_whitelist=None,
        protein_sequence_length=fragment_length,
        min_alt_rna_reads=min_alt_rna_reads,
        min_variant_sequence_coverage=min_variant_sequence_coverage,
        variant_sequence_assembly=variant_sequence_assembly,
        max_protein_sequences_per_variant=1)

    return {
        variant: vaccine_peptides_for_variant(
            variant=variant,
            isovar_protein_sequences=isovar_protein_sequences,
            mhc_predictor=mhc_predictor,
            vaccine_peptide_length=vaccine_peptide_length,
            padding_around_mutation=padding_around_mutation,
            max_vaccine_peptides_per_variant=max_vaccine_peptides_per_variant,
            min_epitope_score=min_epitope_score)
        for variant, isovar_protein_sequences in sequences_per_variant
    }
def generate_vaccine_peptides(
        reads_generator,
        mhc_predictor,
        vaccine_peptide_length,
        padding_around_mutation,
        max_vaccine_peptides_per_variant,
        min_alt_rna_reads,
        min_variant_sequence_coverage,
        variant_sequence_assembly,
        num_mutant_epitopes_to_keep=10000,
        min_epitope_score=0):
    """
    Build vaccine peptides for every variant that has protein sequences.

    Returns a tuple of two values:
    - dictionary mapping each variant to the value returned by
      vaccine_peptides_for_variant for it
    - dictionary of variant counts (a defaultdict(int)) for report display,
      which is also written to the log before returning
    """
    # the protein fragment has to cover the vaccine peptide itself plus the
    # off-center windows on both sides of the mutation
    fragment_length = vaccine_peptide_length + 2 * padding_around_mutation

    sequences_per_variant = reads_generator_to_protein_sequences_generator(
        reads_generator,
        transcript_id_whitelist=None,
        protein_sequence_length=fragment_length,
        min_alt_rna_reads=min_alt_rna_reads,
        min_variant_sequence_coverage=min_variant_sequence_coverage,
        variant_sequence_assembly=variant_sequence_assembly,
        max_protein_sequences_per_variant=1)

    result_dict = {}
    counts_dict = defaultdict(int)
    for variant, isovar_protein_sequences in sequences_per_variant:
        coding_effects = variant.effects().drop_silent_and_noncoding()
        if len(coding_effects) > 0:
            counts_dict['num_coding_effect_variants'] += 1

        sequences = list(isovar_protein_sequences)
        if not sequences:
            # this means the variant RNA support is below threshold
            logger.info("No protein sequences for %s", variant)
            continue
        counts_dict['num_variants_with_rna_support'] += 1

        # use the first protein sequence - why?
        vaccine_peptides = vaccine_peptides_for_variant(
            variant=variant,
            isovar_protein_sequence=sequences[0],
            mhc_predictor=mhc_predictor,
            vaccine_peptide_length=vaccine_peptide_length,
            padding_around_mutation=padding_around_mutation,
            max_vaccine_peptides_per_variant=max_vaccine_peptides_per_variant,
            num_mutant_epitopes_to_keep=num_mutant_epitopes_to_keep,
            min_epitope_score=min_epitope_score)

        # do any of this variant's vaccine peptides contain mutant epitopes?
        if any(
                vaccine_peptide.contains_mutant_epitopes()
                for vaccine_peptide in vaccine_peptides):
            counts_dict['num_variants_with_vaccine_peptides'] += 1

        # the variant is recorded whether or not it has mutant epitopes
        result_dict[variant] = vaccine_peptides

    for count_name, count in counts_dict.items():
        logger.info('%s: %d', count_name, count)

    return result_dict, counts_dict