Example No. 1
def ntvar(bam, reference, error_rate, output):
    rs = parse_references_from_fasta(reference)

    mapped_read_collection_arr = []
    for r in rs:
        # create MappedReadCollection object
        mapped_read_collection_arr.append(parse_mapped_reads_from_bam(r, bam))

    variants = NTVariantCollection.from_mapped_read_collections(
        error_rate, rs, *mapped_read_collection_arr)

    variants.filter('q30', 'QUAL<30', True)
    variants.filter('ac5', 'AC<5', True)
    variants.filter('dp100', 'DP<100', True)

    if output:
        output.write(variants.to_vcf_file())
        output.close()
    else:
        click.echo(variants.to_vcf_file())
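
A minimal usage sketch for the command above, assuming ntvar can be called directly as a plain Python function (in the source it is likely wired up as a click command) and using placeholder file paths; the 0.0038 error rate mirrors the value used in the test examples further down:

# Hypothetical paths; ntvar writes the VCF to the handle and closes it itself
# when an output handle is given.
out_handle = open("variants.vcf", "w")
ntvar("aligned_reads.bam", "reference.fasta", 0.0038, out_handle)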
Example No. 2
    def construct_pileup(bam, references):
        """
        Creates a Pileup.
        INPUT:
            [FILE LOCATION] [bam] - file name of the BAM file containing reads
                                    mapped against the reference

            [TUPLE] [references] - reference tuple

        RETURN:
            [Pileup] [pileup] - contains read counts for each base at every
                                position
        POST:
            [None]
        """

        new_pileup = []
        # Iterate over each reference in the reference object.
        for reference in references:
            mrc = parse_mapped_reads_from_bam(reference, bam)

            # append reads mapped against the current reference to the pileup
            # end
            new_pileup += mrc.pileup(indels=True)
        # end for

        return Pileup(new_pileup)
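
A usage sketch for the helper above, assuming construct_pileup is reachable as a plain callable (in the source it appears to be defined inside a class) and that parse_references_from_fasta from the other examples is importable; file names are placeholders:

references = parse_references_from_fasta("reference.fasta")
pileup = construct_pileup("aligned_reads.bam", references)  # Pileup with per-base read counts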
Example No. 3
def cli(ctx, bam, reference, bed4_file, output):
    """This script builds an amino acid census and returns its coverage.
    The BAM alignment file corresponds to a pileup of sequences aligned to
    the REFERENCE. A BAM index file (.bai) must also be present and, except
    for the extension, have the same name as the BAM file. The REFERENCE must
    be in FASTA format. The BED4_FILE must be a BED file with at least 4
    columns and specify the gene locations within the REFERENCE.

    The output is in CSV format."""

    rs = parse_references_from_fasta(reference)

    mapped_read_collection_arr = []
    for r in rs:
        # Create a MappedReadCollection object
        mapped_read_collection_arr.append(parse_mapped_reads_from_bam(r, bam))

    # Parse the genes from the gene file
    genes = parse_BED4_file(bed4_file, rs[0].name)

    # Determine which frames our genes are in
    frames = set()

    for gene in genes:
        frames.add(genes[gene]["frame"])

    # Create an AACensus object
    aa_census = AACensus(reference, mapped_read_collection_arr, genes, frames)

    if output:
        output.write(aa_census.coverage(frames))
        output.close()
    else:
        click.echo(aa_census.coverage(frames))
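
The BED4_FILE mentioned in the docstring needs at least four tab-separated columns (chromosome, start, end, gene name). A minimal sketch with a hypothetical gene record, reusing parse_BED4_file as it is called above:

# Hypothetical coordinates and gene name; a real file lists each gene's
# location on the reference named in column one.
with open("genes.bed", "w") as bed:
    bed.write("hxb2_pol\t0\t297\tPR\n")

genes = parse_BED4_file("genes.bed", "hxb2_pol")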
Example No. 4
def cli(ctx, bam, reference, percentage, id, output):
    rs = parse_references_from_fasta(reference)
    bam_header = pysam.Samfile(bam, "rb").header

    if id:
        fasta_id = id
    else:
        fasta_id = os.path.basename(bam).split('.')[0]

    for r in rs:
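        # Build the consensus for this reference and write it as a FASTA record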
        mrc = parse_mapped_reads_from_bam(r, bam)

        conseq = mrc.to_consensus(percentage)

        if 'RG' in bam_header:
            fasta_id = bam_header['RG']

        if output:
            output.write('>{0}_{1}_{2}\n{3}'.format(fasta_id, percentage,
                                                    r.name, conseq))
        else:
            click.echo('>{0}_{1}_{2}\n{3}'.format(fasta_id, percentage, r.name,
                                                  conseq))
    if output:
        output.close()
Example No. 5
def cli(ctx, bam, reference, genes_file, output):
    rs = parse_references_from_fasta(reference)

    mapped_read_collection_arr = []
    for r in rs:
        # Create a MappedReadCollection object
        mapped_read_collection_arr.append(parse_mapped_reads_from_bam(r, bam))

    # Parse the genes from the gene file
    genes = parse_genes_file(genes_file, rs[0].name)

    # Determine which frames our genes are in
    frames = set()

    for gene in genes:
        frames.add(genes[gene]["frame"])

    # Create an AACensus object
    aa_census = AACensus(reference, mapped_read_collection_arr, genes, frames)

    if output:
        output.write(aa_census.coverage(frames))
        output.close()
    else:
        click.echo(aa_census.coverage(frames))
Example No. 6
    def test_from_bam(self):
        reference = Reference(
            'hxb2_pol',
            'CCTCAGGTCACTCTTTGGCAACGACCCCTCGTCACAATAAAGATAGGGGGGCAACTAAAGGAAGCTCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGAAATGAGTTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAGTATGATCAGATACTCATAGAAATCTGTGGACATAAAGCTATAGGTACAGTATTAGTAGGACCTACACCTGTCAACATAATTGGAAGAAATCTGTTGACTCAGATTGGTTGCACTTTAAATTTTCCCATTAGCCCTATTGAGACTGTACCAGTAAAATTAAAGCCAGGAATGGATGGCCCAAAAGTTAAACAATGGCCATTGACAGAAGAAAAAATAAAAGCATTAGTAGAAATTTGTACAGAGATGGAAAAGGAAGGGAAAATTTCAAAAATTGGGCCTGAAAATCCATACAATACTCCAGTATTTGCCATAAAGAAAAAAGACAGTACTAAATGGAGAAAATTAGTAGATTTCAGAGAACTTAATAAGAGAACTCAAGACTTCTGGGAAGTTCAATTAGGAATACCACATCCCGCAGGGTTAAAAAAGAAAAAATCAGTAACAGTACTGGATGTGGGTGATGCATATTTTTCAGTTCCCTTAGATGAAGACTTCAGGAAGTATACTGCATTTACCATACCTAGTATAAACAATGAGACACCAGGGATTAGATATCAGTACAATGTGCTTCCACAGGGATGGAAAGGATCACCAGCAATATTCCAAAGTAGCATGACAAAAATCTTAGAGCCTTTTAGAAAACAAAATCCAGACATAGTTATCTATCAATACATGGATGATTTGTATGTAGGATCTGACTTAGAAATAGGGCAGCATAGAACAAAAATAGAGGAGCTGAGACAACATCTGTTGAGGTGGGGACTTACCACACCAGACAAAAAACATCAGAAAGAACCTCCATTCCTTTGGATGGGTTATGAACTCCATCCTGATAAATGGACAGTACAGCCTATAGTGCTGCCAGAAAAAGACAGCTGGACTGTCAATGACATACAGAAGTTAGTGGGGAAATTGAATTGGGCAAGTCAGATTTACCCAGGGATTAAAGTAAGGCAATTATGTAAACTCCTTAGAGGAACCAAAGCACTAACAGAAGTAATACCACTAACAGAAGAAGCAGAGCTAGAACTGGCAGAAAACAGAGAGATTCTAAAAGAACCAGTACATGGAGTGTATTATGACCCATCAAAAGACTTAATAGCAGAAATACAGAAGCAGGGGCAAGGCCAATGGACATATCAAATTTATCAAGAGCCATTTAAAAATCTGAAAACAGGAAAATATGCAAGAATGAGGGGTGCCCACACTAATGATGTAAAACAATTAACAGAGGCAGTGCAAAAAATAACCACAGAAAGCATAGTAATATGGGGAAAGACTCCTAAATTTAAACTGCCCATACAAAAGGAAACATGGGAAACATGGTGGACAGAGTATTGGCAAGCCACCTGGATTCCTGAGTGGGAGTTTGTTAATACCCCTCCCTTAGTGAAATTATGGTACCAGTTAGAGAAAGAACCCATAGTAGGAGCAGAAACCTTCTATGTAGATGGGGCAGCTAACAGGGAGACTAAATTAGGAAAAGCAGGATATGTTACTAATAGAGGAAGACAAAAAGTTGTCACCCTAACTGACACAACAAATCAGAAGACTGAGTTACAAGCAATTTATCTAGCTTTGCAGGATTCGGGATTAGAAGTAAACATAGTAACAGACTCACAATATGCATTAGGAATCATTCAAGCACAACCAGATCAAAGTGAATCAGAGTTAGTCAATCAAATAATAGAGCAGTTAATAAAAAAGGAAAAGGTCTATCTGGCATGGGTACCAGCACACAAAGGAATTGGAGGAAATGAACAAGTAGATAAATTAGTCAGTGCTGGAATCAGGAAAGTACTATTTTTAGATGGAATAGATAAGGCCCAAGATGAACATGAGAAATATCACAGTAATTGGAGAGCAATGGCTAGTGATTTTAACCTGCCACCTGTAGTAGCAAAAGAAATAGTAGCCAGCTGTGATAAATGTCAGCTAAAAGGAGAAGCCATGCATGGACAAGTAGACTGTAGTCCAGGAATATGGCAACTAGATTGTACACATTTAGAAGGAAAAGTTATCCTGGTAGCAGTTCATGTAGCCAGTGGATATATAGAAGCAGAAGTTATTCCAGCAGAAACAGGGCAGGAAACAGCATATTTTCTTTTAAAATTAGCAGGAAGATGGCCAGTAAAAACAATACATACTGACAATGGCAGCAATTTCACCGGTGCTACGGTTAGGGCCGCCTGTTGGTGGGCGGGAATCAAGCAGGAATTTGGAATTCCCTACAATCCCCAAAGTCAAGGAGTAGTAGAATCTATGAATAAAGAATTAAAGAAAATTATAGGACAGGTAAGAGATCAGGCTGAACATCTTAAGACAGCAGTACAAATGGCAGTATTCATCCACAATTTTAAAAGAAAAGGGGGGATTGGGGGGTACAGTGCAGGGGAAAGAATAGTAGACATAATAGCAACAGACATACAAACTAAAGAATTACAAAAACAAATTACAAAAATTCAAAATTTTCGGGTTTATTACAGGGACAGCAGAAATCCACTTTGGAAAGGACCAGCAAAGCTCCTCTGGAAAGGTGAAGGGGCAGTAGTAATACAAGATAATAGTGACATAAAAGTAGTGCCAAGAAGAAAAGCAAAGATCATTAGGGATTATGGAAAACAGATGGCAGGTGATGATTGTGTGGCAAGTAGACAGGATGAGGATTAG'
        )
        mrc = parse_mapped_reads_from_bam(reference, 'tests/data/test1.bam')

        assert len(mrc.mapped_reads) == 6308
Example No. 7
    def test_from_aacensus(self):
        bam = TEST_PATH + "/data/align.bam"
        BED4_file = TEST_PATH + "/data/hxb2_pol.bed"
        mapped_read_collection_arr = []
        error_rate = 0.0038

        # Create a MappedReadCollection object
        for r in self.references:
            mapped_read_collection_arr.append(
                parse_mapped_reads_from_bam(r, bam))

        variants = NTVariantCollection.from_mapped_read_collections(
            error_rate, self.references, *mapped_read_collection_arr)
        variants.filter('q30', 'QUAL<30', True)
        variants.filter('ac5', 'AC<5', True)
        variants.filter('dp100', 'DP<100', True)

        # Mask the unconfident differences
        for mrc in mapped_read_collection_arr:
            mrc.mask_unconfident_differences(variants)

        # Parse the genes from the gene file
        genes = parse_BED4_file(BED4_file, self.references[0].name)

        # Determine which frames our genes are in
        frames = set()

        for gene in genes:
            frames.add(genes[gene]['frame'])

        aa_census = AACensus(self.reference, mapped_read_collection_arr, genes,
                             frames)

        test_variants = CodonVariantCollection.from_aacensus(aa_census)
        ref_seq = self.references[0].seq

        for gene in test_variants.variants:
            assert gene in genes
            for pos in test_variants.variants[gene]:
                for frame in frames:
                    nt_pos = pos / 3 - frame
                    assert nt_pos >= genes[gene]['start'] or nt_pos <= genes[
                        gene]['end']
                for codon in test_variants.variants[gene][pos]:
                    ref_codon = ref_seq[(pos):(pos) + 3].lower()
                    assert codon != ref_codon
Example No. 8
def cli(ctx, bam, reference, variants, bed4_file, min_freq, mutation_db,
        reporting_threshold, output):
    rs = parse_references_from_fasta(reference)

    mapped_read_collection_arr = []
    for r in rs:
        # Create a MappedReadCollection object
        mapped_read_collection_arr.append(parse_mapped_reads_from_bam(r, bam))

    variants_obj = parse_nt_variants_from_vcf(variants, rs)

    # Mask the unconfident differences
    for mrc in mapped_read_collection_arr:
        mrc.mask_unconfident_differences(variants_obj)

    # Parse the genes from the gene file
    genes = parse_BED4_file(bed4_file, rs[0].name)

    # Determine which frames our genes are in
    frames = set()

    for gene in genes:
        frames.add(genes[gene]['frame'])

    # Create an AACensus object
    aa_census = AACensus(reference, mapped_read_collection_arr, genes, frames)

    # Create AAVar collection and print the aavf file
    aa_vars = AAVariantCollection.from_aacensus(aa_census)

    # Filter for mutant frequency
    aa_vars.filter('mf' + str(min_freq), 'freq<' + str(min_freq), True)

    # Build the mutation database
    mutation_db = MutationDB(mutation_db, genes)

    # Generate the mutation report
    if output:
        output.write(
            aa_vars.report_dr_mutations(mutation_db, reporting_threshold))
        output.close()
    else:
        click.echo(
            aa_vars.report_dr_mutations(mutation_db, reporting_threshold))
Example No. 9
def aavar(bam, reference, variants, genes_file, min_freq, mutation_db, output):
    rs = parse_references_from_fasta(reference)

    mapped_read_collection_arr = []
    for r in rs:
        # Create a MappedReadCollection object
        mapped_read_collection_arr.append(parse_mapped_reads_from_bam(r, bam))

    variants_obj = parse_nt_variants_from_vcf(variants, rs)

    # Mask the unconfident differences
    for mrc in mapped_read_collection_arr:
        mrc.mask_unconfident_differences(variants_obj)

    # Parse the genes from the gene file
    genes = parse_genes_file(genes_file, rs[0].name)

    # Determine which frames our genes are in
    frames = set()

    for gene in genes:
        frames.add(genes[gene]['frame'])

    # Create an AACensus object
    aa_census = AACensus(reference, mapped_read_collection_arr, genes, frames)

    # Create AAVar collection and print the hmcf file
    aa_vars = AAVariantCollection.from_aacensus(aa_census)

    # Filter for mutant frequency
    aa_vars.filter('mf0.01', 'freq<0.01', True)

    # Build the mutation database and update collection
    if mutation_db is not None:
        mutation_db = MutationDB(mutation_db, genes)
        aa_vars.apply_mutation_db(mutation_db)

    if output:
        output.write(aa_vars.to_hmcf_file(CONFIDENT))
    else:
        click.echo(aa_vars.to_hmcf_file(CONFIDENT))
Example No. 10
def codonvar(bam, reference, offset, bed4_file, variants, error_rate, output):
    rs = parse_references_from_fasta(reference)
    mapped_read_collection_arr = []

    # Create a MappedReadCollection object
    for r in rs:
        mapped_read_collection_arr.append(parse_mapped_reads_from_bam(r, bam))

    if variants:
        variants_obj = parse_nt_variants_from_vcf(variants, rs)
    else:
        variants = NTVariantCollection.from_mapped_read_collections(
            error_rate, rs, *mapped_read_collection_arr)
        variants.filter('q30', 'QUAL<30', True)
        variants.filter('ac5', 'AC<5', True)
        variants.filter('dp100', 'DP<100', True)
        variants_obj = variants

    # Mask the unconfident differences
    for mrc in mapped_read_collection_arr:
        mrc.mask_unconfident_differences(variants_obj)

    # Parse the genes from the gene file
    genes = parse_BED4_file(bed4_file, rs[0].name)

    # Determine which frames our genes are in
    frames = set()

    for gene in genes:
        frames.add(genes[gene]['frame'])

    aa_census = AACensus(reference, mapped_read_collection_arr, genes, frames)

    codon_variants = CodonVariantCollection.from_aacensus(aa_census)

    if output:
        output.write(codon_variants.to_csv_file(offset))
        output.close()
    else:
        click.echo(codon_variants.to_csv_file(offset))
Example No. 11
    def setup(self):
        reference = TEST_PATH + "/data/hxb2_pol.fas"
        bam = TEST_PATH + "/data/align.bam"
        genes_file = TEST_PATH + "/data/hxb2_pol.bed"
        mutation_db = TEST_PATH + "/data/mutation_db.tsv"
        min_freq = 0.01

        rs = parse_references_from_fasta(reference)

        mapped_read_collection_arr = []
        for r in rs:
            # Create a MappedReadCollection object
            mapped_read_collection_arr.append(
                parse_mapped_reads_from_bam(r, bam))

        variants_obj = parse_nt_variants_from_vcf(VARIANTS_FILE, rs)

        # Mask the unconfident differences
        for mrc in mapped_read_collection_arr:
            mrc.mask_unconfident_differences(variants_obj)

        # Parse the genes from the gene file
        genes = parse_genes_file(genes_file, rs[0].name)

        # Determine which frames our genes are in
        frames = set()

        for gene in genes:
            frames.add(genes[gene]['frame'])

        # Create an AACensus object
        aa_census = AACensus(reference, mapped_read_collection_arr, genes,
                             frames)

        # Find the AA mutations
        self.aa_collection = AAVariantCollection.from_aacensus(aa_census)

        # Build the mutation database
        self.mutation_db = MutationDB(mutation_db, genes)
Example No. 12
    def setup_class(self):
        reference = TEST_PATH + "/data/hxb2_pol.fas"
        bam = TEST_PATH + "/data/align.bam"
        BED4_file = TEST_PATH + "/data/hxb2_pol.bed"

        rs = parse_references_from_fasta(reference)

        mapped_read_collection_arr = []
        for r in rs:
            # create MappedReadCollection object
            mapped_read_collection_arr.append(
                parse_mapped_reads_from_bam(r, bam))

        genes = parse_BED4_file(BED4_file, rs[0].name)

        # Determine which frames our genes are in
        self.frames = set()

        for gene in genes:
            self.frames.add(genes[gene]["frame"])

        self.aa_census = AACensus(reference, mapped_read_collection_arr, genes,
                                  self.frames)
Example No. 13
    def setup(self):
        bam = TEST_PATH + "/data/align.bam"
        reference = TEST_PATH + "/data/hxb2_pol.fas"
        genes_file = TEST_PATH + "/data/hxb2_pol.bed"
        error_rate = 0.0038

        rs = parse_references_from_fasta(reference)
        mapped_read_collection_arr = []

        # Create a MappedReadCollection object
        for r in rs:
            mapped_read_collection_arr.append(
                parse_mapped_reads_from_bam(r, bam))

        variants = NTVariantCollection.from_mapped_read_collections(
            error_rate, rs, *mapped_read_collection_arr)
        variants.filter('q30', 'QUAL<30', True)
        variants.filter('ac5', 'AC<5', True)
        variants.filter('dp100', 'DP<100', True)

        # Mask the unconfident differences
        for mrc in mapped_read_collection_arr:
            mrc.mask_unconfident_differences(variants)

        # Parse the genes from the gene file
        genes = parse_genes_file(genes_file, rs[0].name)

        # Determine which frames our genes are in
        frames = set()

        for gene in genes:
            frames.add(genes[gene]['frame'])

        aa_census = AACensus(reference, mapped_read_collection_arr, genes,
                             frames)

        self.codon_variants = CodonVariantCollection.from_aacensus(aa_census)
Example No. 14
    def analyze_reads(self, fasta_id, variant_filters, reporting_threshold,
                      generate_consensus):

        # Map reads against reference using bowtietwo
        if not self.quiet:
            print("# Mapping reads...")

        try:
            bam = self.generate_bam(fasta_id)
        except Exception:
            raise

        if not self.quiet:
            print("# Loading read mappings...")

        # cmd_consensus
        if generate_consensus:
            cons_seq_file = open("%s/consensus.fasta" % self.output_dir, "w+")

        mapped_read_collection_arr = []
        for r in self.references:
            mrc = parse_mapped_reads_from_bam(r, bam)
            mapped_read_collection_arr.append(mrc)
            consensus_seq = mrc.to_consensus(self.consensus_pct)
            if generate_consensus and len(consensus_seq) > 0:
                cons_seq_file.write('>{0}_{1}_{2}\n{3}'.format(
                    fasta_id, reporting_threshold, r.name, consensus_seq))

        if generate_consensus:
            cons_seq_file.close()

        # cmd_callntvar
        if not self.quiet:
            print("# Identifying variants...")

        variants = NTVariantCollection.from_mapped_read_collections(
            variant_filters[ERROR_RATE], self.references,
            *mapped_read_collection_arr)

        variants.filter('q%s' % variant_filters[MIN_VARIANT_QUAL],
                        'QUAL<%s' % variant_filters[MIN_VARIANT_QUAL], True)
        variants.filter('ac%s' % variant_filters[MIN_AC],
                        'AC<%s' % variant_filters[MIN_AC], True)
        variants.filter('dp%s' % variant_filters[MIN_DP],
                        'DP<%s' % variant_filters[MIN_DP], True)

        vcf_file = open("%s/hydra.vcf" % self.output_dir, "w+")
        vcf_file.write(variants.to_vcf_file())
        vcf_file.close()

        # cmd_aa_census
        if not self.quiet:
            print("# Masking filtered variants...")

        for mrc in mapped_read_collection_arr:
            mrc.mask_unconfident_differences(variants)

        if not self.quiet:
            print("# Building amino acid census...")

        # Determine which frames our genes are in
        frames = set()

        for gene in self.genes:
            frames.add(self.genes[gene]['frame'])

        aa_census = AACensus(self.reference, mapped_read_collection_arr,
                             self.genes, frames)

        coverage_file = open("%s/coverage_file.csv" % self.output_dir, "w+")
        coverage_file.write(aa_census.coverage(frames))
        coverage_file.close()

        # cmd_aavariants
        if not self.quiet:
            print("# Finding amino acid mutations...")

        # Create AAVar collection and print the aavf file
        aa_vars = AAVariantCollection.from_aacensus(aa_census)

        # Filter for mutant frequency
        aa_vars.filter('mf%s' % variant_filters[MIN_FREQ],
                       'freq<%s' % variant_filters[MIN_FREQ], True)

        # Build the mutation database and update collection
        if self.mutation_db is not None:
            mutation_db = MutationDB(self.mutation_db, self.genes)
            aa_vars.apply_mutation_db(mutation_db)

        aavf_obj = aa_vars.to_aavf_obj("hydra",
                                       os.path.basename(self.reference),
                                       CONFIDENT)
        records = list(aavf_obj)

        mut_report = open("%s/mutation_report.aavf" % self.output_dir, "w+")

        writer = parser.Writer(mut_report, aavf_obj)

        for record in records:
            writer.write_record(record)

        mut_report.close()
        writer.close()

        # cmd_drmutations
        if not self.quiet:
            print("# Writing drug resistant mutation report...")

        dr_report = open("%s/dr_report.csv" % self.output_dir, "w+")
        dr_report.write(
            aa_vars.report_dr_mutations(mutation_db, reporting_threshold))
        dr_report.close()

        self.output_stats(mapped_read_collection_arr)
Example No. 15
    def analyze_reads(self, fasta_id, filters, reporting_threshold,
                      generate_consensus):
        # Map reads against reference using bowtietwo
        if not self.quiet:
            print("# Mapping reads...")

        bam = self.generate_bam(fasta_id)

        if not self.quiet:
            print("# Loading read mappings...")

        # cmd_consensus
        if generate_consensus:
            cons_seq_file = open("%s/consensus.fasta" % self.output_dir, "w+")

        mapped_read_collection_arr = []
        for r in self.references:
            mrc = parse_mapped_reads_from_bam(r, bam)
            mapped_read_collection_arr.append(mrc)
            if generate_consensus:
                cons_seq_file.write('>{0}_{1}_{2}\n{3}'.format(
                    fasta_id, reporting_threshold, r.name,
                    mrc.to_consensus(self.consensus_pct)))

        if generate_consensus:
            cons_seq_file.close()

        # cmd_callntvar
        if not self.quiet:
            print("# Identifying variants...")

        variants = NTVariantCollection.from_mapped_read_collections(
            filters["error_rate"], self.references,
            *mapped_read_collection_arr)

        variants.filter('q%s' % filters["min_qual"],
                        'QUAL<%s' % filters["min_qual"], True)
        variants.filter('ac%s' % filters["min_ac"],
                        'AC<%s' % filters["min_ac"], True)
        variants.filter('dp%s' % filters["min_dp"],
                        'DP<%s' % filters["min_dp"], True)

        vcf_file = open("%s/hydra.vcf" % self.output_dir, "w+")
        vcf_file.write(variants.to_vcf_file())
        vcf_file.close()

        # cmd_aa_census
        if not self.quiet:
            print("# Masking filtered variants...")

        for mrc in mapped_read_collection_arr:
            mrc.mask_unconfident_differences(variants)

        if not self.quiet:
            print("# Building amino acid census...")

        # Determine which frames our genes are in
        frames = set()

        for gene in self.genes:
            frames.add(self.genes[gene]['frame'])

        aa_census = AACensus(self.reference, mapped_read_collection_arr,
                             self.genes, frames)

        coverage_file = open("%s/coverage_file.csv" % self.output_dir, "w+")
        coverage_file.write(aa_census.coverage(frames))
        coverage_file.close()

        # cmd_aavariants
        if not self.quiet:
            print("# Finding amino acid mutations...")

        # Create AAVar collection and print the hmcf file
        aa_vars = AAVariantCollection.from_aacensus(aa_census)

        # Filter for mutant frequency
        aa_vars.filter('mf%s' % filters['min_freq'],
                       'freq<%s' % filters['min_freq'], True)

        # Build the mutation database and update collection
        if self.mutation_db is not None:
            mutation_db = MutationDB(self.mutation_db, self.genes)
            aa_vars.apply_mutation_db(mutation_db)

        mut_report = open("%s/mutation_report.hmcf" % self.output_dir, "w+")
        mut_report.write(aa_vars.to_hmcf_file(CONFIDENT))
        mut_report.close()

        # cmd_drmutations
        if not self.quiet:
            print("# Writing drug resistant mutation report...")

        dr_report = open("%s/dr_report.csv" % self.output_dir, "w+")
        dr_report.write(aa_vars.report_dr_mutations(mutation_db,
                                                    reporting_threshold))
        dr_report.close()

        self.output_stats(mapped_read_collection_arr)
Example No. 16
def aavar(bam, reference, bed4_file, variants, mutation_db, min_freq,
          error_rate, output):
    rs = parse_references_from_fasta(reference)

    mapped_read_collection_arr = []
    for r in rs:
        # Create a MappedReadCollection object
        mapped_read_collection_arr.append(parse_mapped_reads_from_bam(r, bam))

    if variants:
        variants_obj = parse_nt_variants_from_vcf(variants, rs)
    else:
        variants = NTVariantCollection.from_mapped_read_collections(
            error_rate, rs, *mapped_read_collection_arr)
        variants.filter('q30', 'QUAL<30', True)
        variants.filter('ac5', 'AC<5', True)
        variants.filter('dp100', 'DP<100', True)
        variants_obj = variants

    # Mask the unconfident differences
    for mrc in mapped_read_collection_arr:
        mrc.mask_unconfident_differences(variants_obj)

    # Parse the genes from the gene file
    genes = parse_BED4_file(bed4_file, rs[0].name)

    # Determine which frames our genes are in
    frames = set()

    for gene in genes:
        frames.add(genes[gene]['frame'])

    # Create an AACensus object
    aa_census = AACensus(reference, mapped_read_collection_arr, genes, frames)

    # Create AAVar collection and print the aavf file
    aa_vars = AAVariantCollection.from_aacensus(aa_census)

    # Filter for mutant frequency
    aa_vars.filter('mf0.01', 'freq<0.01', True)

    # Build the mutation database and update collection
    if mutation_db is not None:
        mutation_db = MutationDB(mutation_db, genes)
        aa_vars.apply_mutation_db(mutation_db)

    aavf_obj = aa_vars.to_aavf_obj("aavar", os.path.basename(reference),
                                   CONFIDENT)
    records = list(aavf_obj)

    if output:
        writer = parser.Writer(output, aavf_obj)
    else:
        writer = parser.Writer(sys.stdout, aavf_obj)

    for record in records:
        writer.write_record(record)

    if output:
        output.close()

    writer.close()
Example No. 17
    def test_valid_vcf_file(self):
        """Tests to ensure that valid vcf files are parsed properly."""

        reference = TEST_PATH + \
            "/data/hxb2_pol.fas"
        bam = TEST_PATH + "/data/align.bam"

        rs = parse_references_from_fasta(reference)

        mapped_read_collection_arr = []
        for r in rs:
            # Create a MappedReadCollection object
            mapped_read_collection_arr.append(
                parse_mapped_reads_from_bam(r, bam))

        variants_obj = NTVariantCollection(rs)

        for i in range(0, 20):
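            # Build a synthetic a->t variant at this position; the collection
            # is written to a VCF below and then re-parsed for comparison.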
            variant = NTVariant(chrom="hxb2_pol",
                                pos=i,
                                id=".",
                                ref='a',
                                alt='t',
                                qual="50",
                                filter="PASS",
                                info={
                                    "DP": "300",
                                    "AC": "1",
                                    "AF": "0.0025"
                                })

            variants_obj.variants["hxb2_pol"][i]['t'] = variant

        # Create a valid VCF file
        valid_vcf_file = TEST_PATH + "/data/valid_vcf_file.vcf"

        with open(valid_vcf_file, "w+") as f:
            f.write(
                "##fileformat=VCFv4.2\n"
                "##fileDate=20171005\n"
                "##source=quasitools\n"
                "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">\n"
                "##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Allele Count\">\n"
                "##INFO=<ID=AF,Number=A,Type=Float,Description=\"Allele Frequency\">\n"
                "##FILTER=<ID=q30,Description=\"Quality below 30\">\n"
                "##FILTER=<ID=dp100,Description=\"Read depth below 100\">\n"
                "##FILTER=<ID=ac5,Description=\"Allele count below 5\">\n"
                "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO")

            for rid in variants_obj.variants:
                for pos in variants_obj.variants[rid]:
                    for alt in variants_obj.variants[rid][pos]:
                        variant = variants_obj.variants[rid][pos][alt]
                        f.write("\n%s\t%i\t%s\t%s\t%s\t%s\t%s" %
                                (variant.chrom, int(
                                    variant.pos), variant.id, variant.ref,
                                 variant.alt, variant.qual, variant.filter))
                        f.write(
                            "\tDP=%i;AC=%i;AF=%0.4f" %
                            (int(variant.info["DP"]), int(variant.info["AC"]),
                             float(variant.info["AF"])))

        parsed_nt_var = parse_nt_variants_from_vcf(valid_vcf_file, rs)

        # Check equality of parsed NTVariantCollection vs. the valid NTVariantCollection
        for rid in parsed_nt_var.variants:
            for pos in parsed_nt_var.variants[rid]:
                for alt in parsed_nt_var.variants[rid][pos]:
                    parsed_variant = parsed_nt_var.variants[rid][pos][alt]
                    variant = variants_obj.variants[rid][pos][alt]

                    assert parsed_variant.chrom == variant.chrom
                    assert parsed_variant.pos == variant.pos
                    assert parsed_variant.id == variant.id
                    assert parsed_variant.ref == variant.ref
                    assert parsed_variant.alt == variant.alt
                    assert parsed_variant.qual == variant.qual
                    assert parsed_variant.filter == variant.filter
                    assert parsed_variant.info["DP"] == variant.info["DP"]
                    assert parsed_variant.info["AC"] == variant.info["AC"]
                    assert parsed_variant.info["AF"] == variant.info["AF"]

        os.remove(valid_vcf_file)