def variants_to_protein_sequences_dataframe(
        expressed_vcf="data/b16.f10/b16.expressed.vcf",
        not_expressed_vcf="data/b16.f10/b16.not-expressed.vcf",
        tumor_rna_bam="data/b16.f10/b16.combined.sorted.bam",
        min_mapping_quality=0,
        max_protein_sequences_per_variant=1,
        variant_sequence_assembly=False):
    """
    Load two VCFs plus a tumor RNA BAM and turn them into a DataFrame of
    protein sequences.

    The variants of both VCFs are merged into one VariantCollection, a
    ReadCollector (configured with `min_mapping_quality`) produces read
    evidence for the merged variants from the BAM, and a
    ProteinSequenceCreator converts that evidence into protein sequences.

    Returns a 3-tuple: the DataFrame, the variants loaded from
    `expressed_vcf`, and the merged variant collection.
    """
    expressed_variants = load_vcf(expressed_vcf)
    unexpressed_variants = load_vcf(not_expressed_vcf)

    # Expressed variants come first, followed by the not-expressed ones.
    merged = list(expressed_variants)
    merged.extend(unexpressed_variants)
    combined_variants = VariantCollection(merged)

    bam = load_bam(tumor_rna_bam)
    collector = ReadCollector(min_mapping_quality=min_mapping_quality)
    evidence = collector.read_evidence_generator(
        variants=combined_variants,
        alignment_file=bam)

    sequence_creator = ProteinSequenceCreator(
        max_protein_sequences_per_variant=max_protein_sequences_per_variant,
        variant_sequence_assembly=variant_sequence_assembly)
    protein_sequences = (
        sequence_creator.protein_sequences_from_read_evidence_generator(
            evidence))
    return (
        protein_sequences_generator_to_dataframe(protein_sequences),
        expressed_variants,
        combined_variants)
Exemplo n.º 2
0
def variants_to_protein_sequences_dataframe(
        expressed_vcf="data/b16.f10/b16.expressed.vcf",
        not_expressed_vcf="data/b16.f10/b16.not-expressed.vcf",
        tumor_rna_bam="data/b16.f10/b16.combined.sorted.bam",
        min_mapping_quality=0,
        max_protein_sequences_per_variant=1,
        variant_sequence_assembly=False):
    """
    Load two VCFs plus a tumor RNA BAM and turn them into a DataFrame of
    protein sequences.

    The variants of both VCFs are merged into one VariantCollection, the
    reads overlapping those variants are gathered from the BAM (subject to
    `min_mapping_quality`), and the resulting reads are converted into
    protein sequences.

    Returns a 3-tuple: the DataFrame, the variants loaded from
    `expressed_vcf`, and the merged variant collection.
    """
    expressed_variants = load_vcf(expressed_vcf)
    unexpressed_variants = load_vcf(not_expressed_vcf)

    # Expressed variants come first, followed by the not-expressed ones.
    merged = list(expressed_variants)
    merged.extend(unexpressed_variants)
    combined_variants = VariantCollection(merged)

    bam = load_bam(tumor_rna_bam)
    overlapping_reads = reads_overlapping_variants(
        variants=combined_variants,
        samfile=bam,
        min_mapping_quality=min_mapping_quality)

    protein_sequences = reads_generator_to_protein_sequences_generator(
        overlapping_reads,
        max_protein_sequences_per_variant=max_protein_sequences_per_variant,
        variant_sequence_assembly=variant_sequence_assembly)
    return (
        protein_sequences_generator_to_dataframe(protein_sequences),
        expressed_variants,
        combined_variants)
Exemplo n.º 3
0
def test_variants_to_protein_sequences_dataframe_protein_sequence_length():
    """
    For several requested protein sequence lengths, run the command-line
    argument path and check that the result has one row per expressed
    variant and that every amino acid sequence has the requested length.
    """
    expressed_variants = load_vcf("data/b16.f10/b16.expressed.vcf")
    parser = make_protein_sequences_arg_parser()
    parser.print_help()
    for desired_length in range(9, 20, 3):
        cli_args = [
            "--vcf", data_path("data/b16.f10/b16.vcf"),
            "--bam", data_path("data/b16.f10/b16.combined.sorted.bam"),
            "--max-protein-sequences-per-variant", "1",
            "--protein-sequence-length", str(desired_length),
        ]
        df = protein_sequences_dataframe_from_args(parser.parse_args(cli_args))

        # One result row is expected for each expressed variant.
        n_rows = len(df)
        n_expected = len(expressed_variants)
        failure_message = (
            "Expected %d entries for protein_sequence_length=%d, got %d results: %s" % (
                n_expected,
                desired_length,
                n_rows,
                df))
        eq_(n_rows, n_expected, failure_message)

        protein_sequences = df["amino_acids"]
        print(protein_sequences)
        protein_sequence_lengths = protein_sequences.str.len()
        has_desired_length = protein_sequence_lengths == desired_length
        assert has_desired_length.all(), (protein_sequence_lengths,)
Exemplo n.º 4
0
def test_variants_to_reference_contexts_dataframe():
    """
    Check that the reference contexts DataFrame built from the B16 VCF
    contains as many distinct (chr, pos, ref, alt) groups as there are
    variants.
    """
    variants = load_vcf("data/b16.f10/b16.vcf")
    assert len(variants) > 0
    df = variants_to_reference_contexts_dataframe(variants, context_size=10)
    print(df)
    variant_key_columns = ["chr", "pos", "ref", "alt"]
    n_variant_groups = len(df.groupby(variant_key_columns))
    # every B16 coding variant should contribute at least one
    # reference context, i.e. one group per variant
    eq_(n_variant_groups, len(variants))
Exemplo n.º 5
0
def test_variants_to_reference_contexts_dataframe():
    """
    Build reference contexts (context_size=10) for the B16 variants and
    verify that grouping the result by variant fields gives exactly one
    group per loaded variant.
    """
    b16_variants = load_vcf("data/b16.f10/b16.vcf")
    assert len(b16_variants) > 0
    contexts_df = variants_to_reference_contexts_dataframe(
        b16_variants,
        context_size=10)
    print(contexts_df)
    per_variant = contexts_df.groupby(["chr", "pos", "ref", "alt"])
    # each of the B16 coding variants must have at least one
    # reference context
    eq_(len(per_variant), len(b16_variants))
Exemplo n.º 6
0
def test_translate_variant_collection():
    """
    Check that translating the reads which support the B16 variants yields
    exactly four results.
    """
    # Number of translated variants this test asserts for the B16 data.
    expected_count = 4

    variants = load_vcf("data/b16.f10/b16.vcf")
    samfile = load_bam("data/b16.f10/b16.combined.sorted.bam")

    result = list(translate_variants(reads_supporting_variants(variants, samfile)))
    # The failure message reports the same expected count that is asserted,
    # so a failing run does not print a misleading number.
    eq_(
        len(result),
        expected_count,
        "Expected %d translated variants but got %d: %s" % (
            expected_count,
            len(result),
            result))
Exemplo n.º 7
0
def test_translate_variant_collection():
    """
    Check that translating the read evidence collected for the B16 variants
    yields exactly four translations.
    """
    # Number of translations this test asserts for the B16 data.
    expected_count = 4

    variants = load_vcf("data/b16.f10/b16.vcf")
    samfile = load_bam("data/b16.f10/b16.combined.sorted.bam")
    read_evidence_gen = ReadCollector().read_evidence_generator(
        variants,
        samfile)
    translation_gen = ProteinSequenceCreator().translate_variants(read_evidence_gen)
    translations = list(translation_gen)
    # The failure message reports the same expected count that is asserted,
    # so a failing run does not print a misleading number.
    eq_(
        len(translations),
        expected_count,
        "Expected %d translated variants but got %d: %s" % (
            expected_count,
            len(translations),
            translations))
def test_variants_to_protein_sequences_dataframe_filtered_all_reads_by_mapping_quality(
):
    """
    Check that a minimum mapping quality above every read's MAPQ leaves
    the protein sequences DataFrame empty.
    """
    # every read in the B16 BAM has MAPQ=255, so requiring a minimum
    # quality of 256 should drop all of them
    variants = load_vcf("data/b16.f10/b16.vcf")
    bam = load_bam("data/b16.f10/b16.combined.sorted.bam")
    evidence = ReadCollector(min_mapping_quality=256).read_evidence_generator(
        variants=variants,
        alignment_file=bam)

    sequence_creator = ProteinSequenceCreator(
        max_protein_sequences_per_variant=1)
    protein_sequences = (
        sequence_creator.protein_sequences_from_read_evidence_generator(
            evidence))
    df = protein_sequences_generator_to_dataframe(protein_sequences)
    print(df)
    n_rows = len(df)
    eq_(n_rows, 0, "Expected 0 entries, got %d: %s" % (n_rows, df))
Exemplo n.º 9
0
def test_variants_to_protein_sequences_dataframe_filtered_all_reads_by_mapping_quality():
    """
    Check that a minimum mapping quality above every read's MAPQ leaves
    the protein sequences DataFrame empty.
    """
    # every read in the B16 BAM has MAPQ=255, so requiring a minimum
    # quality of 256 should drop all of them
    variants = load_vcf("data/b16.f10/b16.vcf")
    bam = load_bam("data/b16.f10/b16.combined.sorted.bam")
    overlapping_reads = reads_overlapping_variants(
        variants=variants,
        samfile=bam,
        min_mapping_quality=256)
    protein_sequences = reads_generator_to_protein_sequences_generator(
        overlapping_reads,
        max_protein_sequences_per_variant=1)
    df = protein_sequences_generator_to_dataframe(protein_sequences)
    print(df)
    n_rows = len(df)
    eq_(n_rows, 0, "Expected 0 entries, got %d: %s" % (n_rows, df))
Exemplo n.º 10
0
def test_protein_sequence_creator_protein_length():
    """
    For several values of `protein_sequence_length`, check that every
    amino acid sequence in the resulting DataFrame has exactly that length.
    """
    variants = load_vcf("data/b16.f10/b16.vcf")
    bam = load_bam("data/b16.f10/b16.combined.sorted.bam")
    # A single collector is shared by all iterations; only the creator
    # depends on the requested length.
    collector = ReadCollector()

    for desired_length in (21, 15, 10):
        sequence_creator = ProteinSequenceCreator(
            max_protein_sequences_per_variant=1,
            protein_sequence_length=desired_length)
        evidence = collector.read_evidence_generator(
            variants=variants,
            alignment_file=bam)
        df = protein_sequences_generator_to_dataframe(
            sequence_creator.protein_sequences_from_read_evidence_generator(
                evidence))
        print(df)
        protein_sequences = df["amino_acids"]
        print(protein_sequences)
        protein_sequence_lengths = protein_sequences.str.len()
        has_desired_length = protein_sequence_lengths == desired_length
        assert has_desired_length.all(), (protein_sequence_lengths, )