Пример #1
0
    def test_write_and_format_decimals(self):
        """Check ALT_FREQ output when a number of decimals is requested.

        For every precision from 3 to 5 the sample records are written with
        ``decimals`` set to that precision, read back from the written file,
        and compared against the sample file with ALT_FREQ rounded to the
        same precision (e.g. 0.123 for a precision of three).
        """
        written_path = TEST_PATH + '/sampleoutput4.aavf'

        for precision in range(3, 6):
            source = parser.Reader(SAMPLE_FILE).read_records()
            handle = fhandle('sampleoutput4.aavf', "w+")
            writer = parser.Writer(handle, source)

            for entry in list(source):
                writer.write_record(entry, decimals=precision)

            # Close the handle before the written file is read back
            handle.close()
            written = parser.Reader(written_path).read_records()
            expected = parser.Reader(SAMPLE_FILE).read_records()
            writer.close()

            for actual, reference in zip(written, expected):
                assert actual.INFO == reference.INFO
                assert actual.ALT_FREQ == round(reference.ALT_FREQ,
                                                precision), \
                    "%s and %s should be the same up to the %dth decimal place" % (
                        actual.ALT_FREQ, reference.ALT_FREQ, precision)
Пример #2
0
    def test_write_to_read_from_stream(self):
        """Round-trip the sample file through an in-memory stream.

        The sample records are written to a StringIO object, parsed back
        from it, and the metadata, header definitions and number of records
        of the parsed result are checked.
        """
        source = parser.Reader(SAMPLE_FILE).read_records()
        stream = StringIO()
        writer = parser.Writer(stream, source)

        for entry in source:
            writer.write_record(entry)

        # Rewind so the reader starts at the beginning of what was written
        stream.seek(0)
        aavf = parser.Reader(stream).read_records()
        stream.close()
        parsed_records = list(aavf)

        assert isinstance(aavf, AAVF)

        # (metadata key, expected value, failure message template)
        metadata_checks = (
            ("fileformat", "AAVFv1.0",
             "fileformat should be AAVFv1.0, metadata is %s"),
            ("fileDate", "20180501",
             "filedate should be 20180501, metadata is %s"),
            ("source", "myProgramV1.0",
             "source should be myProgramV1.0, metadata is %s"),
            ("reference", ["hxb2.fas"],
             "reference list should be [hxb2.fas], metadata is %s"),
        )
        for key, expected, template in metadata_checks:
            assert aavf.metadata.get(key) == expected, \
                   template % aavf.metadata

        assert aavf.infos
        assert aavf.filters

        # The sample file holds seven data lines, each parsed into a Record
        assert len(parsed_records) == 7
        for entry in parsed_records:
            assert isinstance(entry, Record)
Пример #3
0
    def test_write(self):
        """Write the sample records to a file and compare INFO on re-read."""
        template = parser.Reader(SAMPLE_FILE).read_records()
        handle = fhandle('sampleoutput2.aavf', "w+")
        writer = parser.Writer(handle, template)

        for entry in list(template):
            writer.write_record(entry)
        writer.flush()
        writer.close()

        written_path = TEST_PATH + "/sampleoutput2.aavf"

        # Fresh record streams for the sample file and the written copy
        expected = parser.Reader(SAMPLE_FILE).read_records()
        actual = parser.Reader(written_path).read_records()

        # Walk both files in lockstep; INFO must match record by record
        for left, right in zip(expected, actual):
            assert left.INFO == right.INFO, \
                   "left.INFO is %s and right.INFO is %s" \
                   % (left.INFO, right.INFO)
Пример #4
0
    def test_aa_variants_nodb(self):
        """Check the written AAVF against the expected file, no mutation db.

        Counterpart of the mutation-db test: the expected lines get their
        last two ';'-separated tokens replaced by "CAT=." and "SRVL=."
        before they are compared with the output written here.
        """
        aavf_path = TEST_PATH + "/data/output/temp.aavf"
        aavf_out = open(aavf_path, "w")

        # Load the expected output and drop empty lines
        with open(VALID_AA_VARIANTS_AAVF, "r") as expected_file:
            expected_text = expected_file.read()

        expected_lines = sorted(
            line for line in expected_text.split("\n") if line)

        # Swap category and surveillance for "."s, i.e. what the output
        # looks like without a db. Header lines may be touched as well,
        # which is fine because only non-"#" lines are compared below.
        for index, line in enumerate(expected_lines):
            fields = line.split(";")
            if len(fields) <= 2:
                continue
            expected_lines[index] = line.replace(
                fields[-2], "CAT=.", 1).replace(fields[-1], "SRVL=.", 1)

        # Apply the filter to the collection
        self.aa_collection.filter('af0.01', 'freq<0.01', True)

        aavf_obj = self.aa_collection.to_aavf_obj(
            "test", os.path.basename(self.reference), CONFIDENT)
        records = list(aavf_obj)

        writer = parser.Writer(aavf_out, aavf_obj)
        for entry in records:
            writer.write_record(entry)

        aavf_out.close()

        with open(aavf_path, "r") as written_file:
            written_text = written_file.read()

        # Same normalisation as for the expected lines: sorted, no empties
        written_lines = sorted(
            line for line in written_text.split("\n") if line)

        assert len(expected_lines) == len(written_lines)

        # Every token of a written data line must occur in the expected
        # line at the same sorted position
        for expected_line, written_line in zip(expected_lines,
                                               written_lines):
            if expected_line.startswith("#"):
                continue

            expected_tokens = re.split("[,=;\t]", expected_line.rstrip())
            written_tokens = re.split("[,=;\t]", written_line)

            for token in written_tokens:
                assert token in expected_tokens
Пример #5
0
    def test_aa_variants(self):
        """Check the written AAVF against the expected file, with mutation db.

        The collection is filtered, annotated with the mutation db, written
        to a temporary AAVF file, and the tokens of every written data line
        are looked up in the expected line at the same sorted position.
        """
        aavf_path = TEST_PATH + "/data/output/temp.aavf"
        aavf_out = open(aavf_path, "w")

        # Load the expected output and drop empty lines
        with open(VALID_AA_VARIANTS_AAVF, "r") as expected_file:
            expected_text = expected_file.read()

        # Sorted so both sides can be compared position by position
        expected_lines = sorted(
            line for line in expected_text.split("\n") if line)

        # Apply the filter to the collection
        self.aa_collection.filter('af0.01', 'freq<0.01', True)

        # Annotate the collection using the mutation db
        self.aa_collection.apply_mutation_db(self.mutation_db)

        aavf_obj = self.aa_collection.to_aavf_obj(
            "test", os.path.basename(self.reference), CONFIDENT)
        records = list(aavf_obj)

        writer = parser.Writer(aavf_out, aavf_obj)
        for entry in records:
            writer.write_record(entry)

        aavf_out.close()

        with open(aavf_path, "r") as written_file:
            written_text = written_file.read()

        # Same normalisation as for the expected lines: sorted, no empties
        written_lines = sorted(
            line for line in written_text.split("\n") if line)

        assert len(expected_lines) == len(written_lines)

        # Every token of a written data line must occur in the expected line
        for expected_line, written_line in zip(expected_lines,
                                               written_lines):
            if expected_line.startswith("#"):
                continue

            expected_tokens = re.split("[,=;\t]", expected_line.rstrip())
            written_tokens = re.split("[,=;\t]", written_line)

            for token in written_tokens:
                assert token in expected_tokens

        writer.close()
Пример #6
0
    def test_write_to_file(self):
        """Test whether writes to file work as expected.

        The sample records are written to a file, the file is read back, and
        both the number of records and each record's INFO are compared with
        the sample file.
        """
        reader = parser.Reader(SAMPLE_FILE)
        aavf_obj = reader.read_records()
        out = fhandle('sampleoutput3.aavf', "w+")
        writer = parser.Writer(out, aavf_obj)

        try:
            for record in list(aavf_obj):
                writer.write_record(record)
        finally:
            # Always release the handle, even if writing a record fails
            out.close()

        # Materialise both record streams exactly once. If the objects
        # returned by read_records() are single-pass iterators, counting
        # them first and zipping them afterwards would compare nothing.
        written = list(
            parser.Reader(TEST_PATH + '/sampleoutput3.aavf').read_records())
        expected = list(parser.Reader(SAMPLE_FILE).read_records())

        # all data lines should be read from the sample file
        assert len(written) == len(expected)

        for left, right in zip(written, expected):
            assert left.INFO == right.INFO
Пример #7
0
    def test_writer(self):
        """
        INFO fields of every written record must follow the order in which
        they are defined in the header; fields without a definition come
        last, in alphabetical order.
        """
        source = parser.Reader(SAMPLE_FILE).read_records()
        stream = StringIO()
        writer = parser.Writer(stream, source)

        for entry in source:
            writer.write_record(entry)

        stream.seek(0)
        written = stream.getvalue()
        stream.close()

        # IDs of the ##INFO header lines, in the order they were written
        declared_ids = []
        for line in written.split('\n'):
            if line.startswith('##INFO='):
                after_id = line.split('ID=')[1]
                declared_ids.append(after_id.split(',')[0])

            is_data_line = bool(line) and not line.startswith('#')
            if is_data_line:
                # INFO is the eighth tab-separated column of a data line
                info_column = line.split('\t')[7]
                keys = [item.split('=')[0]
                        for item in info_column.split(';')]
                self._assert_order(declared_ids, keys)
Пример #8
0
def annotate_aavf(in_file, hivdb_file, out_file):
    """Annotate an AAVF input file with drug resistance changes.

    Args:
        in_file: AAVF input handed to ``parser.Reader``.
        hivdb_file: path of the file opened and passed to
            ``XmlAsiTransformer.transform``.
        out_file: path of the AAVF file to write.

    Only records for which ``evaluate_resistance`` reports an entry are
    written; each of them gets ``CAT`` and ``DRUG`` INFO values.
    """
    # The database handle stays open for the whole annotation run and is
    # closed on exit, including when an error occurs.
    with open(hivdb_file, "r") as hivdb_handle:
        res_genes = XmlAsiTransformer(True).transform(hivdb_handle)

        reader = parser.Reader(in_file)
        aavf_obj = reader.read_records()
        aavf_obj.infos["CAT"] = model.Info(
            "CAT", ".", "String", "Drug resistance category", None, None)
        # NOTE(review): "reistances" looks like a typo for "resistances";
        # left unchanged because it is a runtime string.
        aavf_obj.infos["DRUG"] = model.Info(
            "DRUG", ".", "String", "Drug reistances", None, None)

        with open(out_file, "w") as out_handle:
            writer = parser.Writer(out_handle, aavf_obj)
            for rec in aavf_obj:
                if rec.POS in rec.ALT:
                    continue

                rmuts = evaluate_resistance([rec], res_genes)
                # Lookup key: (gene, "<REF><POS><first ALT>")
                k = (rec.GENE, "%s%s%s" % (rec.REF, rec.POS, rec.ALT[0]))
                hits = rmuts.get(k)
                if not hits:
                    continue

                cats = []
                drugs = []
                for cat, cat_drugs in hits.items():
                    cats.append(cat)
                    drugs.extend(cat_drugs)
                rec.INFO["CAT"] = cats
                rec.INFO["DRUG"] = drugs
                writer.write_record(rec)
Пример #9
0
def aavar(bam, reference, bed4_file, variants, mutation_db, min_freq,
          error_rate, output):
    """Call amino acid variants from mapped reads and write them as AAVF.

    Args:
        bam: passed to ``parse_mapped_reads_from_bam`` for every reference.
        reference: FASTA reference; parsed for the reference sequences and
            passed to ``AACensus``. Its basename goes into the AAVF object.
        bed4_file: gene definitions, parsed with ``parse_BED4_file``.
        variants: if truthy, nucleotide variants are parsed from it as VCF;
            otherwise they are called from the mapped reads and filtered
            with the fixed q30 / ac5 / dp100 filters.
        mutation_db: if not None, used to build a ``MutationDB`` that is
            applied to the amino acid variants.
        min_freq: not used by this function.
        error_rate: passed to the nucleotide variant caller.
        output: writable stream for the AAVF records; ``sys.stdout`` is
            used when it is falsy.
    """
    rs = parse_references_from_fasta(reference)

    # One MappedReadCollection per reference
    mapped_read_collection_arr = [
        parse_mapped_reads_from_bam(r, bam) for r in rs]

    if variants:
        variants_obj = parse_nt_variants_from_vcf(variants, rs)
    else:
        variants_obj = NTVariantCollection.from_mapped_read_collections(
            error_rate, rs, *mapped_read_collection_arr)
        variants_obj.filter('q30', 'QUAL<30', True)
        variants_obj.filter('ac5', 'AC<5', True)
        variants_obj.filter('dp100', 'DP<100', True)

    # Mask the unconfident differences
    for mrc in mapped_read_collection_arr:
        mrc.mask_unconfident_differences(variants_obj)

    # Parse the genes from the gene file
    genes = parse_BED4_file(bed4_file, rs[0].name)

    # Determine which frames our genes are in
    frames = {genes[gene]['frame'] for gene in genes}

    # Create an AACensus object
    aa_census = AACensus(reference, mapped_read_collection_arr, genes, frames)

    # Create AAVar collection and print the aavf file
    aa_vars = AAVariantCollection.from_aacensus(aa_census)

    # Filter for mutant frequency
    # NOTE(review): the threshold is hard-coded and min_freq is ignored;
    # confirm whether min_freq was meant to be used here.
    aa_vars.filter('mf0.01', 'freq<0.01', True)

    # Build the mutation database and update collection
    if mutation_db is not None:
        mutation_db = MutationDB(mutation_db, genes)
        aa_vars.apply_mutation_db(mutation_db)

    aavf_obj = aa_vars.to_aavf_obj("aavar", os.path.basename(reference),
                                   CONFIDENT)
    records = list(aavf_obj)

    writer = parser.Writer(output if output else sys.stdout, aavf_obj)

    for record in records:
        writer.write_record(record)

    if output:
        # close must be *called*; a bare ``output.close`` does nothing
        output.close()

    writer.close()
Пример #10
0
    def analyze_reads(self, fasta_id, variant_filters, reporting_threshold,
                      generate_consensus):
        """Run the read analysis for one sample and write its reports.

        Steps: map the reads (``generate_bam``), optionally write consensus
        sequences, call and filter nucleotide variants, build the amino
        acid census, call amino acid variants and write the reports.

        Files written to ``self.output_dir``:

        * ``consensus.fasta`` (only if ``generate_consensus`` is truthy)
        * ``hydra.vcf``
        * ``coverage_file.csv``
        * ``mutation_report.aavf``
        * ``dr_report.csv`` (only if ``self.mutation_db`` is not None)

        Args:
            fasta_id: passed to ``generate_bam``; also used in the
                consensus sequence headers.
            variant_filters: mapping read with the keys ERROR_RATE,
                MIN_VARIANT_QUAL, MIN_AC, MIN_DP and MIN_FREQ.
            reporting_threshold: passed to ``report_dr_mutations``; also
                used in the consensus sequence headers.
            generate_consensus: whether to write ``consensus.fasta``.
        """
        # Map reads against reference using bowtietwo
        if not self.quiet:
            print("# Mapping reads...")

        # Errors from the mapping step propagate to the caller unchanged
        bam = self.generate_bam(fasta_id)

        if not self.quiet:
            print("# Loading read mappings...")

        # cmd_consensus
        cons_seq_file = None
        if generate_consensus:
            cons_seq_file = open("%s/consensus.fasta" % self.output_dir, "w+")

        mapped_read_collection_arr = []
        try:
            for r in self.references:
                mrc = parse_mapped_reads_from_bam(r, bam)
                mapped_read_collection_arr.append(mrc)
                consensus_seq = mrc.to_consensus(self.consensus_pct)
                if cons_seq_file is not None and len(consensus_seq) > 0:
                    cons_seq_file.write('>{0}_{1}_{2}\n{3}'.format(
                        fasta_id, reporting_threshold, r.name, consensus_seq))
        finally:
            # Release the consensus file even if parsing a mapping fails
            if cons_seq_file is not None:
                cons_seq_file.close()

        # cmd_callntvar
        if not self.quiet:
            print("# Identifying variants...")

        variants = NTVariantCollection.from_mapped_read_collections(
            variant_filters[ERROR_RATE], self.references,
            *mapped_read_collection_arr)

        variants.filter('q%s' % variant_filters[MIN_VARIANT_QUAL],
                        'QUAL<%s' % variant_filters[MIN_VARIANT_QUAL], True)
        variants.filter('ac%s' % variant_filters[MIN_AC],
                        'AC<%s' % variant_filters[MIN_AC], True)
        variants.filter('dp%s' % variant_filters[MIN_DP],
                        'DP<%s' % variant_filters[MIN_DP], True)

        with open("%s/hydra.vcf" % self.output_dir, "w+") as vcf_file:
            vcf_file.write(variants.to_vcf_file())

        # cmd_aa_census
        if not self.quiet:
            print("# Masking filtered variants...")

        for mrc in mapped_read_collection_arr:
            mrc.mask_unconfident_differences(variants)

        if not self.quiet:
            print("# Building amino acid census...")

        # Determine which frames our genes are in
        frames = set()

        for gene in self.genes:
            frames.add(self.genes[gene]['frame'])

        aa_census = AACensus(self.reference, mapped_read_collection_arr,
                             self.genes, frames)

        coverage_path = "%s/coverage_file.csv" % self.output_dir
        with open(coverage_path, "w+") as coverage_file:
            coverage_file.write(aa_census.coverage(frames))

        # cmd_aavariants
        if not self.quiet:
            print("# Finding amino acid mutations...")

        # Create AAVar collection and print the aavf file
        aa_vars = AAVariantCollection.from_aacensus(aa_census)

        # Filter for mutant frequency
        aa_vars.filter('mf%s' % variant_filters[MIN_FREQ],
                       'freq<%s' % variant_filters[MIN_FREQ], True)

        # Build the mutation database and update collection
        mutation_db = None
        if self.mutation_db is not None:
            mutation_db = MutationDB(self.mutation_db, self.genes)
            aa_vars.apply_mutation_db(mutation_db)

        aavf_obj = aa_vars.to_aavf_obj("hydra",
                                       os.path.basename(self.reference),
                                       CONFIDENT)
        records = list(aavf_obj)

        report_path = "%s/mutation_report.aavf" % self.output_dir
        with open(report_path, "w+") as mut_report:
            writer = parser.Writer(mut_report, aavf_obj)

            for record in records:
                writer.write_record(record)

        writer.close()

        # cmd_drmutations
        # Without a mutation database there is nothing to report; skipping
        # avoids referencing a database object that was never created.
        if mutation_db is not None:
            if not self.quiet:
                print("# Writing drug resistant mutation report...")

            with open("%s/dr_report.csv" % self.output_dir, "w+") as dr_report:
                dr_report.write(
                    aa_vars.report_dr_mutations(mutation_db,
                                                reporting_threshold))

        self.output_stats(mapped_read_collection_arr)