Пример #1
0
def _meets_threshold(value, minimum):
    """
    Return True when a parsed filter value is the literal 'PASS' or is at
    least `minimum`.
    """
    # 'PASS' is tested first so the ordering comparison is short-circuited
    # for that marker value.
    return value == 'PASS' or value >= minimum


def _record_sample_info(genome, sample_info, min_coverage, min_proportion, position, contig):
    """
    Store one sample's call and filter results for a single position on the
    given genome object.

    `sample_info` is the mapping returned by VCFRecord.get_sample_info(); the
    keys read here are 'call', 'was_called', 'coverage', 'proportion' and
    'is_a_snp'.
    """
    # FIXME indels
    if sample_info['call'] is not None:
        genome.set_call(sample_info['call'], position, 'X', contig)
    if sample_info['was_called']:
        genome.set_was_called('Y', position, contig)
    if sample_info['coverage'] is not None:
        coverage_flag = 'Y' if _meets_threshold(sample_info['coverage'], min_coverage) else 'N'
        genome.set_coverage_pass(coverage_flag, position, contig)
    if sample_info['proportion'] is not None:
        proportion_flag = 'Y' if _meets_threshold(sample_info['proportion'], min_proportion) else 'N'
        genome.set_proportion_pass(proportion_flag, position, contig)
    elif not sample_info['is_a_snp']:
        # Some big SNP callers, like GATK, do not provide proportion information when
        # the position is called reference.  We cannot filter these positions.
        genome.set_proportion_pass('-', position, contig)


def read_vcf_file(reference, min_coverage, min_proportion, input_file):
    """
    Submit VCF to be read in to VCF parser, populate genome data and filter
    data from the parsed VCF data, return a list of the read-in genomes.

    reference      -- reference genome object; get_contig_length() and
                      get_call() are used to validate each record.
    min_coverage   -- threshold a sample's coverage value must reach (or be
                      'PASS') for the position to pass the coverage filter.
    min_proportion -- threshold a sample's proportion value must reach (or be
                      'PASS') for the position to pass the proportion filter.
    input_file     -- input descriptor handed to get_file_path() and
                      set_genome_metadata().

    Returns a list holding one VCFGenome per sample name in the VCF.

    Raises ReferenceCallMismatch when the reference base at a position is not
    'N' and differs from the reference call recorded in the VCF.
    """
    # Imported at call time, as in the original implementation.
    from nasp.nasp_objects import VCFGenome, Genome, ReferenceCallMismatch, VCFRecord

    genomes = {}
    file_path = get_file_path(input_file)
    # The handle itself is never read: VCFRecord is given the path. The open()
    # is kept so an unreadable file still fails here exactly as before.
    with open(file_path, 'r'):
        vcf_record = VCFRecord(file_path)
        vcf_samples = vcf_record.get_samples()
        for vcf_sample in vcf_samples:
            genomes[vcf_sample] = VCFGenome()
            set_genome_metadata(genomes[vcf_sample], input_file)
            genomes[vcf_sample].set_nickname(vcf_sample)
        while vcf_record.fetch_next_record():
            current_contig = vcf_record.get_contig()
            current_pos = vcf_record.get_position()
            # Skip if position isn't in reference; maybe user truncated reference to exclude an uninteresting region.
            if current_pos > reference.get_contig_length(current_contig):
                continue
            reference_call = reference.get_call(current_pos, None, current_contig)
            simplified_refcall = Genome.simple_call(reference_call)
            vcf_refcall = Genome.simple_call(vcf_record.get_reference_call()[0])
            if simplified_refcall != 'N' and simplified_refcall != vcf_refcall:
                # Reference call from reference fasta differs from reference call in VCF file at the same position.
                raise ReferenceCallMismatch(reference_call, vcf_record.get_reference_call(), file_path,
                                            current_contig, current_pos)
            for vcf_sample in vcf_samples:
                _record_sample_info(genomes[vcf_sample], vcf_record.get_sample_info(vcf_sample),
                                    min_coverage, min_proportion, current_pos, current_contig)
    # Return a real list, as documented, rather than a dict view.
    return list(genomes.values())
Пример #2
0
class GenomeTestCase(unittest.TestCase):
    # Unit tests for Genome: FASTA line import, reverse complement and
    # simple_call normalisation.

    def setUp(self):
        # Each test gets its own freshly constructed Genome.
        self.genome = Genome()

    @unittest.skip("Covered by GenomeStatus.set_value()")
    def test_set_call(self):
        pass

    @unittest.skip("Covered by GenomeStatus.get_value()")
    def test_get_call(self):
        pass

    # FIXME: this should either raise or the prefix should become optional.
    # When the identifier lacks the prefix, the contig will be appended to
    # the previous contig.
    def test__import_fasta_line_missing_prefix(self):
        contig_name = "SEQUENCE"
        self.genome._import_fasta_line(">" + contig_name, "prefix")
        self.assertListEqual([contig_name], self.genome.get_contigs())

    # FIXME: assertRaises a specific Exception
    def test__import_fasta_line_missing_identifier(self):
        # A sequence line is imported before any identifier line.
        with self.assertRaises(Exception):
            self.genome._import_fasta_line("ABCDGHMNRSTUVWXY")

    # FIXME: assertRaises a specific Exception
    def test__import_fasta_line_missing_contig(self):
        with self.assertRaises(Exception):
            self.genome._import_fasta_line("SEQUENCE1")
            self.genome._import_fasta_line("SEQUENCE2")

    def test__import_fasta_line_identifier_contains_spaces(self):
        # Only "Name" is expected as the contig: the prefix and the text
        # after the space are not part of it.
        self.genome._import_fasta_line(">prefixName Description", "prefix")
        contigs = self.genome.get_contigs()
        self.assertListEqual(["Name"], contigs)

    @unittest.skip("Covered by _import_fasta_file tests")
    def test_import_fasta_file(self):
        pass

    def test_reverse_complement(self):
        forward = "ABCDGHMNRSTUVWXYabcdghmnrstuvwxy"
        complemented = "rxwbaasynkdchgvtRXWBAASYNKDCHGVT"
        # Complementing back is not expected to restore `forward` exactly:
        # the expected round-trip string has K where M was and T where U was.
        round_trip = "ABCDGHKNRSTTVWXYabcdghknrsttvwxy"
        self.assertEqual(complemented, self.genome.reverse_complement(forward))
        self.assertEqual(round_trip, self.genome.reverse_complement(complemented))

    def test_simple_call(self):
        simple_call = self.genome.simple_call
        # Lower-case unambiguous bases come back upper-cased.
        for base in ("A", "C", "G", "T"):
            self.assertEqual(base, simple_call(base.lower()))
        # Only the first character of a longer string is considered.
        self.assertEqual("A", simple_call("agctn"))
        # Uracil is reported as thymine.
        self.assertEqual("T", simple_call("u"))
        # X is kept only when explicitly allowed, otherwise it becomes N.
        self.assertEqual("N", simple_call("X", allow_x=False))
        self.assertEqual("X", simple_call("X", allow_x=True))
        # A deletion marker is kept only when deletions are allowed.
        self.assertEqual("N", simple_call(".", allow_del=False))
        self.assertEqual(".", simple_call(".", allow_del=True))
        # Degenerate codes collapse to N.
        self.assertEqual("N", simple_call("d"))

    def test_simple_call_with_empty(self):
        # An empty string is expected to be treated like a deletion.
        simple_call = self.genome.simple_call
        self.assertEqual("N", simple_call("", allow_del=False))
        self.assertEqual(".", simple_call("", allow_del=True))

    def test_simple_call_with_none(self):
        # None is expected to be treated like a deletion.
        simple_call = self.genome.simple_call
        self.assertEqual("N", simple_call(None, allow_del=False))
        self.assertEqual(".", simple_call(None, allow_del=True))
Пример #3
0
def read_vcf_file(reference, min_coverage, min_proportion, input_file):
    """
    Submit VCF to be read in to VCF parser, populate genome data and filter
    data from the parsed VCF data, return a list of the read-in genomes.

    reference      -- reference genome object; its get_contig_length() and
                      get_call() methods are used to check every record.
    min_coverage   -- a sample's coverage passes when it is 'PASS' or is at
                      least this value.
    min_proportion -- a sample's proportion passes when it is 'PASS' or is at
                      least this value.
    input_file     -- input descriptor forwarded to get_file_path() and
                      set_genome_metadata().

    Returns a list with one VCFGenome for each sample name found in the VCF.

    Raises ReferenceCallMismatch if, at some position, the reference base is
    not 'N' and disagrees with the reference call stored in the VCF.
    """
    # Imported at call time, as in the original implementation.
    from nasp.nasp_objects import VCFGenome, Genome, ReferenceCallMismatch, VCFRecord

    file_path = get_file_path(input_file)
    genomes = {}
    # VCFRecord receives the path, so the handle opened here is not read; the
    # open() stays so that an unreadable file keeps failing at this point.
    with open(file_path, 'r'):
        vcf_record = VCFRecord(file_path)
        vcf_samples = vcf_record.get_samples()
        for vcf_sample in vcf_samples:
            genomes[vcf_sample] = VCFGenome()
            set_genome_metadata(genomes[vcf_sample], input_file)
            genomes[vcf_sample].set_nickname(vcf_sample)

        while vcf_record.fetch_next_record():
            current_contig = vcf_record.get_contig()
            current_pos = vcf_record.get_position()
            # Skip if position isn't in reference; maybe user truncated reference to exclude an uninteresting region.
            if current_pos > reference.get_contig_length(current_contig):
                continue

            reference_call = reference.get_call(current_pos, None, current_contig)
            simplified_refcall = Genome.simple_call(reference_call)
            if simplified_refcall != 'N' and simplified_refcall != Genome.simple_call(
                    vcf_record.get_reference_call()[0]):
                # Reference call from reference fasta differs from reference call in VCF file at the same position.
                raise ReferenceCallMismatch(
                    reference_call, vcf_record.get_reference_call(),
                    file_path, current_contig, current_pos)

            for vcf_sample in vcf_samples:
                genome = genomes[vcf_sample]
                sample_info = vcf_record.get_sample_info(vcf_sample)
                call = sample_info['call']
                coverage = sample_info['coverage']
                proportion = sample_info['proportion']

                # FIXME indels
                if call is not None:
                    genome.set_call(call, current_pos, 'X', current_contig)
                if sample_info['was_called']:
                    genome.set_was_called('Y', current_pos, current_contig)
                if coverage is not None:
                    # 'PASS' is checked first so it is never compared numerically.
                    coverage_ok = coverage == 'PASS' or coverage >= min_coverage
                    genome.set_coverage_pass(
                        'Y' if coverage_ok else 'N', current_pos, current_contig)
                if proportion is not None:
                    proportion_ok = proportion == 'PASS' or proportion >= min_proportion
                    genome.set_proportion_pass(
                        'Y' if proportion_ok else 'N', current_pos, current_contig)
                elif not sample_info['is_a_snp']:
                    # Some big SNP callers, like GATK, do not provide proportion information when
                    # the position is called reference.  We cannot filter these positions.
                    genome.set_proportion_pass('-', current_pos, current_contig)

    # Hand back an actual list, as the docstring states, not a dict view.
    return list(genomes.values())
Пример #4
0
class GenomeTestCase(unittest.TestCase):
    # Exercises Genome's FASTA line import, reverse_complement and
    # simple_call behaviour.

    def setUp(self):
        # A new Genome is built before every test method.
        self.genome = Genome()

    @unittest.skip("Covered by GenomeStatus.set_value()")
    def test_set_call(self):
        pass

    @unittest.skip("Covered by GenomeStatus.get_value()")
    def test_get_call(self):
        pass

    # FIXME: either an exception should be raised or the prefix should be
    # optional. With the prefix missing from the identifier, the contig will
    # be appended to the previous contig.
    def test__import_fasta_line_missing_prefix(self):
        name = 'SEQUENCE'
        header_line = '>' + name
        self.genome._import_fasta_line(header_line, 'prefix')
        self.assertListEqual([name], self.genome.get_contigs())

    # FIXME: assertRaises a specific Exception
    def test__import_fasta_line_missing_identifier(self):
        # Sequence data is supplied with no identifier line before it.
        bases = 'ABCDGHMNRSTUVWXY'
        with self.assertRaises(Exception):
            self.genome._import_fasta_line(bases)

    # FIXME: assertRaises a specific Exception
    def test__import_fasta_line_missing_contig(self):
        with self.assertRaises(Exception):
            self.genome._import_fasta_line('SEQUENCE1')
            self.genome._import_fasta_line('SEQUENCE2')

    def test__import_fasta_line_identifier_contains_spaces(self):
        # The expected contig is just 'Name', without the prefix or the
        # description that follows the space.
        header_line = '>prefixName Description'
        self.genome._import_fasta_line(header_line, 'prefix')
        self.assertListEqual(['Name'], self.genome.get_contigs())

    @unittest.skip("Covered by _import_fasta_file tests")
    def test_import_fasta_file(self):
        pass

    def test_reverse_complement(self):
        original = 'ABCDGHMNRSTUVWXYabcdghmnrstuvwxy'
        flipped = 'rxwbaasynkdchgvtRXWBAASYNKDCHGVT'
        # The expected result of flipping back differs from `original`:
        # K appears in place of M and T in place of U.
        flipped_back = 'ABCDGHKNRSTTVWXYabcdghknrsttvwxy'
        self.assertEqual(flipped, self.genome.reverse_complement(original))
        self.assertEqual(flipped_back, self.genome.reverse_complement(flipped))

    def test_simple_call(self):
        # The four unambiguous bases are returned upper-cased.
        for base in 'ACGT':
            self.assertEqual(base, self.genome.simple_call(base.lower()))

        # (expected result, raw call, keyword options), checked in order.
        cases = (
            # It should check the base at position one
            ('A', 'agctn', {}),
            # It should replace uracil with thymine
            ('T', 'u', {}),
            # It should replace X with N if not allowed
            ('N', 'X', {'allow_x': False}),
            ('X', 'X', {'allow_x': True}),
            # It should replace . with N if deletions are not allowed
            ('N', '.', {'allow_del': False}),
            ('.', '.', {'allow_del': True}),
            # It should replace degeneracies with N
            ('N', 'd', {}),
        )
        for wanted, raw_call, options in cases:
            self.assertEqual(wanted, self.genome.simple_call(raw_call, **options))

    def test_simple_call_with_empty(self):
        # The empty string is expected to behave like a deletion.
        for wanted, deletions_allowed in (('N', False), ('.', True)):
            self.assertEqual(wanted, self.genome.simple_call('', allow_del=deletions_allowed))

    def test_simple_call_with_none(self):
        # None is expected to behave like a deletion.
        for wanted, deletions_allowed in (('N', False), ('.', True)):
            self.assertEqual(wanted, self.genome.simple_call(None, allow_del=deletions_allowed))