示例#1
0
def read_vcf_file(reference, min_coverage, min_proportion, input_file):
    """
    Read a VCF file and build one filtered VCFGenome per sample.

    Every sample reported by the VCF record parser gets its own VCFGenome,
    tagged with the metadata of input_file and nicknamed after the sample.
    Each record whose position lies within the reference contig is then
    copied into those genomes as a call plus was-called, coverage-pass and
    proportion-pass flags.

    reference      -- genome queried with get_contig_length() and get_call()
    min_coverage   -- smallest coverage value that still passes the filter
    min_proportion -- smallest proportion value that still passes the filter
    input_file     -- descriptor handed to get_file_path() and
                      set_genome_metadata()

    Returns genomes.values(), i.e. the VCFGenome objects, one per sample.

    Raises ReferenceCallMismatch when the simplified reference call is not
    'N' and differs from the simplified first element of the VCF record's
    reference call.
    """
    genomes = {}
    file_path = get_file_path(input_file)
    # The handle itself is never read; VCFRecord is given the path instead.
    with open(file_path, 'r'):
        from nasp.nasp_objects import VCFGenome, Genome, ReferenceCallMismatch, VCFRecord

        vcf_record = VCFRecord(file_path)
        vcf_samples = vcf_record.get_samples()
        for sample_name in vcf_samples:
            new_genome = VCFGenome()
            genomes[sample_name] = new_genome
            set_genome_metadata(new_genome, input_file)
            new_genome.set_nickname(sample_name)

        while vcf_record.fetch_next_record():
            current_contig = vcf_record.get_contig()
            current_pos = vcf_record.get_position()
            # Skip if position isn't in reference; maybe user truncated reference to exclude an uninteresting region.
            within_reference = current_pos <= reference.get_contig_length(current_contig)
            if not within_reference:
                continue

            reference_call = reference.get_call(current_pos, None, current_contig)
            simplified_refcall = Genome.simple_call(reference_call)
            if simplified_refcall != 'N':
                vcf_refcall = Genome.simple_call(vcf_record.get_reference_call()[0])
                if simplified_refcall != vcf_refcall:
                    # Reference call from reference fasta differs from reference call in VCF file at the same position.
                    raise ReferenceCallMismatch(reference_call, vcf_record.get_reference_call(), file_path,
                                                current_contig, current_pos)

            for sample_name in vcf_samples:
                info = vcf_record.get_sample_info(sample_name)
                sample_genome = genomes[sample_name]
                # FIXME indels
                if info['call'] is not None:
                    sample_genome.set_call(info['call'], current_pos, 'X', current_contig)
                if info['was_called']:
                    sample_genome.set_was_called('Y', current_pos, current_contig)

                coverage = info['coverage']
                if coverage is not None:
                    coverage_ok = coverage == 'PASS' or coverage >= min_coverage
                    sample_genome.set_coverage_pass('Y' if coverage_ok else 'N', current_pos, current_contig)

                proportion = info['proportion']
                if proportion is not None:
                    proportion_ok = proportion == 'PASS' or proportion >= min_proportion
                    sample_genome.set_proportion_pass('Y' if proportion_ok else 'N', current_pos, current_contig)
                elif not info['is_a_snp']:
                    # Some big SNP callers, like GATK, do not provide proportion information when
                    # the position is called reference.  We cannot filter these positions.
                    sample_genome.set_proportion_pass('-', current_pos, current_contig)
    # Debugging aid kept from the original:
    # from sys import stdout
    # for genome in genomes:
    #     genomes[genome]._genome._send_to_fasta_handle( stdout )
    return genomes.values()
示例#2
0
def _update_genome_from_delta_data(franken_genome, external_genome,
                                   parser_state, distance_covered,
                                   is_external_insert):
    """
    Copy one run of calls from external_genome into franken_genome and
    advance the positions tracked in parser_state.

    franken_genome     -- genome being written, via set_call()
    external_genome    -- genome being read, via get_call()
    parser_state       -- dict; reads 'final_pos' and 'external_is_reversed',
                          reads and updates 'reference_pos' and 'external_pos'
    distance_covered   -- number of positions to copy; -1 is a sentinel that
                          is replaced by the count of positions from
                          'reference_pos' through 'final_pos' inclusive and
                          forces is_external_insert to True
    is_external_insert -- when falsy, a '.' call is written at the new
                          reference position and that position is consumed;
                          when truthy, the external position moves one extra
                          step instead

    Returns the same parser_state dict, mutated in place.
    """
    from nasp.nasp_objects import Genome

    if distance_covered == -1:
        remaining = parser_state['final_pos'] - parser_state['reference_pos']
        distance_covered = remaining + 1
        is_external_insert = True

    if distance_covered > 0:
        if parser_state['external_is_reversed']:
            # Reversed: the run ends at external_pos and is reverse-complemented.
            segment_end = parser_state['external_pos']
            segment_start = segment_end - distance_covered + 1
            calls = external_genome.get_call(segment_start, segment_end)
            matching_segment = Genome.reverse_complement(''.join(calls))
        else:
            # Forward: the run starts at external_pos.
            segment_start = parser_state['external_pos']
            segment_end = segment_start + distance_covered - 1
            calls = external_genome.get_call(segment_start, segment_end)
            matching_segment = ''.join(calls)
        # NOTE(review): meaning of the 'X' argument is defined by set_call(),
        # which is not visible here.
        franken_genome.set_call(matching_segment,
                                parser_state['reference_pos'], 'X')

    parser_state['reference_pos'] += distance_covered
    if parser_state['external_is_reversed']:
        external_shift = -distance_covered
    else:
        external_shift = distance_covered
    parser_state['external_pos'] += external_shift

    if not is_external_insert:
        franken_genome.set_call('.', parser_state['reference_pos'], '!')
        parser_state['reference_pos'] += 1
    elif parser_state['external_is_reversed']:
        parser_state['external_pos'] -= 1
    else:
        parser_state['external_pos'] += 1
    return parser_state
示例#3
0
    def test_parse_delta_file(self):
        """
        parse_delta_file() should rebuild the reference FASTA.

        An external genome is loaded from testdata.REFERENCE_FASTA, the
        delta file testdata.REFERENCE_DELTA is parsed into an empty genome,
        and the FASTA written from that genome must match REFERENCE_FASTA
        line for line.
        """
        from tests import testdata

        from tempfile import NamedTemporaryFile
        from nasp.nasp_objects import Genome
        rebuilt_genome = Genome()
        source_genome = Genome()
        source_genome.import_fasta_file(testdata.REFERENCE_FASTA)
        convert_external_genome.parse_delta_file(testdata.REFERENCE_DELTA,
                                                 rebuilt_genome,
                                                 source_genome)
        # NOTE(review): the temporary file is reopened by name while still
        # open; the tempfile docs say this is platform dependent.
        with NamedTemporaryFile() as scratch:
            rebuilt_genome.write_to_fasta_file(scratch.name)

            with open(testdata.REFERENCE_FASTA) as expected_handle:
                with open(scratch.name) as actual_handle:
                    expected_lines = expected_handle.readlines()
                    actual_lines = actual_handle.readlines()
            self.assertEqual(expected_lines, actual_lines)
    def test_parse_delta_file(self):
        """
        Round-trip check for convert_external_genome.parse_delta_file().

        The genome filled from testdata.REFERENCE_DELTA is written out as
        FASTA and compared, line by line, with testdata.REFERENCE_FASTA,
        the same file the external genome was imported from.
        """
        from tests import testdata

        from tempfile import NamedTemporaryFile
        from nasp.nasp_objects import Genome
        franken = Genome()
        external = Genome()
        external.import_fasta_file(testdata.REFERENCE_FASTA)
        convert_external_genome.parse_delta_file(testdata.REFERENCE_DELTA, franken, external)
        with NamedTemporaryFile() as tmpfile:
            output_path = tmpfile.name
            franken.write_to_fasta_file(output_path)

            # Read both files fully, then compare the line lists.
            with open(testdata.REFERENCE_FASTA) as reference_fasta:
                wanted = reference_fasta.readlines()
            with open(output_path) as written_fasta:
                produced = written_fasta.readlines()
            self.assertEqual(wanted, produced)
示例#5
0
def _update_genome_from_delta_data(franken_genome, external_genome, parser_state, distance_covered, is_external_insert):
    """
    Transfer a stretch of calls from external_genome to franken_genome and
    move the cursors stored in parser_state.

    A distance_covered of -1 is replaced by the number of positions left up
    to and including parser_state['final_pos'], and the step is then treated
    as an external insert.  A positive distance copies that many calls from
    the external genome (reverse-complemented when
    parser_state['external_is_reversed'] is truthy) to the current reference
    position.  Afterwards 'reference_pos' advances by the distance and
    'external_pos' moves by the same amount in the strand direction.

    When is_external_insert is truthy the external cursor takes one more
    step; otherwise a '.' call is written at the reference cursor, which
    then advances by one.

    Returns parser_state (the dict passed in, updated in place).
    """
    from nasp.nasp_objects import Genome

    if distance_covered == -1:
        distance_covered = parser_state['final_pos'] - parser_state['reference_pos'] + 1
        is_external_insert = True

    # +1 walks the external genome forwards, -1 walks it backwards.
    strand_step = -1 if parser_state['external_is_reversed'] else 1

    if distance_covered > 0:
        cursor = parser_state['external_pos']
        if strand_step < 0:
            fetched = external_genome.get_call(cursor - distance_covered + 1, cursor)
            matching_segment = Genome.reverse_complement(''.join(fetched))
        else:
            fetched = external_genome.get_call(cursor, cursor + distance_covered - 1)
            matching_segment = ''.join(fetched)
        franken_genome.set_call(matching_segment, parser_state['reference_pos'], 'X')

    parser_state['reference_pos'] = parser_state['reference_pos'] + distance_covered
    parser_state['external_pos'] = parser_state['external_pos'] + strand_step * distance_covered

    if is_external_insert:
        parser_state['external_pos'] += strand_step
        return parser_state

    # NOTE(review): meaning of the '!' argument is defined by set_call(),
    # which is not visible here.
    franken_genome.set_call('.', parser_state['reference_pos'], '!')
    parser_state['reference_pos'] += 1
    return parser_state
示例#6
0
 def setUp(self):
     """Bind a newly constructed Genome to ``self.genome``."""
     # NOTE(review): the enclosing class is not visible in this snippet;
     # presumably this is a unittest.TestCase fixture hook - confirm.
     self.genome = Genome()
示例#7
0
class GenomeTestCase(unittest.TestCase):
    """Tests for Genome: FASTA line import, reverse_complement and simple_call."""

    def setUp(self):
        # Each test works on its own, newly constructed Genome.
        self.genome = Genome()

    @unittest.skip("Covered by GenomeStatus.set_value()")
    def test_set_call(self):
        pass

    @unittest.skip("Covered by GenomeStatus.get_value()")
    def test_get_call(self):
        pass

    # FIXME: it should throw an exception or the prefix should be optional
    # If the prefix is missing, the contig will be appended to the previous contig
    def test__import_fasta_line_missing_prefix(self):
        """An identifier lacking the prefix is still registered under its full name."""
        contig_name = 'SEQUENCE'
        self.genome._import_fasta_line('>' + contig_name, 'prefix')
        self.assertListEqual([contig_name], self.genome.get_contigs())

    # FIXME: assertRaises a specific Exception
    def test__import_fasta_line_missing_identifier(self):
        """A sequence line passed without a prefix argument must raise."""
        with self.assertRaises(Exception):
            self.genome._import_fasta_line('ABCDGHMNRSTUVWXY')

    # FIXME: assertRaises a specific Exception
    def test__import_fasta_line_missing_contig(self):
        """Sequence lines with no preceding identifier line must raise."""
        with self.assertRaises(Exception):
            for sequence_line in ('SEQUENCE1', 'SEQUENCE2'):
                self.genome._import_fasta_line(sequence_line)

    def test__import_fasta_line_identifier_contains_spaces(self):
        """Only the text between the prefix and the first space names the contig."""
        self.genome._import_fasta_line('>prefixName Description', 'prefix')
        self.assertListEqual(['Name'], self.genome.get_contigs())

    @unittest.skip("Covered by _import_fasta_file tests")
    def test_import_fasta_file(self):
        pass

    def test_reverse_complement(self):
        """
        Reverse-complement the full code alphabet, then complement it back.

        Per the expected strings, the round trip is not exact: M comes back
        as K and U comes back as T.
        """
        original = 'ABCDGHMNRSTUVWXYabcdghmnrstuvwxy'
        complemented = 'rxwbaasynkdchgvtRXWBAASYNKDCHGVT'
        round_tripped = 'ABCDGHKNRSTTVWXYabcdghknrsttvwxy'
        self.assertEqual(complemented, self.genome.reverse_complement(original))
        self.assertEqual(round_tripped, self.genome.reverse_complement(complemented))

    def test_simple_call(self):
        """Exercise simple_call() over plain bases and its substitution rules."""
        simple_call = self.genome.simple_call
        for base in 'ACGT':
            self.assertEqual(base, simple_call(base.lower()))

        scenarios = (
            # It should check the base at position one
            ('A', 'agctn', {}),
            # It should replace uracil with thymine
            ('T', 'u', {}),
            # It should replace X with N if not allowed
            ('N', 'X', {'allow_x': False}),
            ('X', 'X', {'allow_x': True}),
            # It should replace . with N if deletions are not allowed
            ('N', '.', {'allow_del': False}),
            ('.', '.', {'allow_del': True}),
            # It should replace degeneracies with N
            ('N', 'd', {}),
        )
        for wanted, raw_call, options in scenarios:
            self.assertEqual(wanted, simple_call(raw_call, **options))

    def test_simple_call_with_empty(self):
        """An empty call yields 'N', or '.' when deletions are allowed."""
        for wanted, allow_del in (('N', False), ('.', True)):
            self.assertEqual(wanted, self.genome.simple_call('', allow_del=allow_del))

    def test_simple_call_with_none(self):
        """A None call yields 'N', or '.' when deletions are allowed."""
        for wanted, allow_del in (('N', False), ('.', True)):
            self.assertEqual(wanted, self.genome.simple_call(None, allow_del=allow_del))
示例#8
0
def read_vcf_file(reference, min_coverage, min_proportion, input_file):
    """
    Load the samples of one VCF file as VCFGenome objects.

    A VCFGenome is created for every sample name the VCF parser reports.
    For each VCF record located inside the reference contig, the sample's
    call is stored together with 'Y'/'N' flags telling whether the position
    was called and whether coverage and proportion reach min_coverage and
    min_proportion (a literal 'PASS' value always passes).  Records beyond
    the end of the reference contig are ignored.

    Returns genomes.values(): one VCFGenome per sample.

    Raises ReferenceCallMismatch if the reference genome and the VCF record
    disagree on the reference call at a position, unless the reference call
    simplifies to 'N'.
    """

    def _threshold_flag(observed, minimum):
        # 'Y' when the value is the literal 'PASS' or reaches the minimum.
        if observed == 'PASS' or observed >= minimum:
            return 'Y'
        return 'N'

    genomes = {}
    file_path = get_file_path(input_file)
    with open(file_path, 'r'):
        from nasp.nasp_objects import VCFGenome, Genome, ReferenceCallMismatch, VCFRecord

        vcf_record = VCFRecord(file_path)
        vcf_samples = vcf_record.get_samples()
        for name in vcf_samples:
            genomes[name] = VCFGenome()
            set_genome_metadata(genomes[name], input_file)
            genomes[name].set_nickname(name)

        while vcf_record.fetch_next_record():
            contig = vcf_record.get_contig()
            position = vcf_record.get_position()
            # Skip if position isn't in reference; maybe user truncated reference to exclude an uninteresting region.
            if position <= reference.get_contig_length(contig):
                reference_call = reference.get_call(position, None, contig)
                simplified_refcall = Genome.simple_call(reference_call)
                if simplified_refcall != 'N' and simplified_refcall != Genome.simple_call(
                        vcf_record.get_reference_call()[0]):
                    # Reference call from reference fasta differs from reference call in VCF file at the same position.
                    raise ReferenceCallMismatch(
                        reference_call, vcf_record.get_reference_call(),
                        file_path, contig, position)

                for name in vcf_samples:
                    sample_info = vcf_record.get_sample_info(name)
                    target = genomes[name]
                    # FIXME indels
                    if sample_info['call'] is not None:
                        target.set_call(sample_info['call'], position, 'X', contig)
                    if sample_info['was_called']:
                        target.set_was_called('Y', position, contig)
                    if sample_info['coverage'] is not None:
                        target.set_coverage_pass(
                            _threshold_flag(sample_info['coverage'], min_coverage),
                            position, contig)
                    if sample_info['proportion'] is not None:
                        target.set_proportion_pass(
                            _threshold_flag(sample_info['proportion'], min_proportion),
                            position, contig)
                    elif not sample_info['is_a_snp']:
                        # Some big SNP callers, like GATK, do not provide proportion information when
                        # the position is called reference.  We cannot filter these positions.
                        target.set_proportion_pass('-', position, contig)
    # Debugging aid kept from the original:
    # from sys import stdout
    # for genome in genomes:
    #     genomes[genome]._genome._send_to_fasta_handle( stdout )
    return genomes.values()
示例#9
0
def format_fasta(inputfasta, outputfasta):
    """
    Re-write a FASTA file through a Genome object.

    inputfasta is imported into a new Genome, which is then written back
    out to outputfasta.  Nothing is returned.
    """
    from nasp.nasp_objects import Genome

    genome = Genome()
    genome.import_fasta_file(inputfasta)
    genome.write_to_fasta_file(outputfasta)
示例#10
0
 def setUp(self):
     """Store a newly constructed Genome on the instance as ``self.genome``."""
     # NOTE(review): the enclosing class is not visible in this snippet;
     # presumably this is a unittest.TestCase fixture hook - confirm.
     self.genome = Genome()
示例#11
0
class GenomeTestCase(unittest.TestCase):
    """Behavioural tests for the Genome helper methods."""

    def setUp(self):
        # A new Genome is built before every test method.
        self.genome = Genome()

    @unittest.skip("Covered by GenomeStatus.set_value()")
    def test_set_call(self):
        pass

    @unittest.skip("Covered by GenomeStatus.get_value()")
    def test_get_call(self):
        pass

    # FIXME: it should throw an exception or the prefix should be optional
    # If the prefix is missing, the contig will be appended to the previous contig
    def test__import_fasta_line_missing_prefix(self):
        """The identifier does not start with the prefix; its name is kept whole."""
        name = "SEQUENCE"
        header_line = ">" + name
        import_line = self.genome._import_fasta_line
        import_line(header_line, "prefix")
        contigs = self.genome.get_contigs()
        self.assertListEqual([name], contigs)

    # FIXME: assertRaises a specific Exception
    def test__import_fasta_line_missing_identifier(self):
        """Calling _import_fasta_line() with only a sequence must raise."""
        bases = "ABCDGHMNRSTUVWXY"
        with self.assertRaises(Exception):
            self.genome._import_fasta_line(bases)

    # FIXME: assertRaises a specific Exception
    def test__import_fasta_line_missing_contig(self):
        """Importing sequence lines before any identifier line must raise."""
        import_line = self.genome._import_fasta_line
        with self.assertRaises(Exception):
            import_line("SEQUENCE1")
            import_line("SEQUENCE2")

    def test__import_fasta_line_identifier_contains_spaces(self):
        """The contig name stops at the first space of the identifier."""
        header_line = ">prefixName Description"
        self.genome._import_fasta_line(header_line, "prefix")
        contigs = self.genome.get_contigs()
        self.assertListEqual(["Name"], contigs)

    @unittest.skip("Covered by _import_fasta_file tests")
    def test_import_fasta_file(self):
        pass

    def test_reverse_complement(self):
        """
        Check reverse_complement() in both directions.

        The second expectation differs from the starting string: per the
        literals below, M is returned as K and U as T after the round trip.
        """
        revcomp = self.genome.reverse_complement
        forward = "ABCDGHMNRSTUVWXYabcdghmnrstuvwxy"
        backward = "rxwbaasynkdchgvtRXWBAASYNKDCHGVT"
        forward_again = "ABCDGHKNRSTTVWXYabcdghknrsttvwxy"
        self.assertEqual(backward, revcomp(forward))
        self.assertEqual(forward_again, revcomp(backward))

    def test_simple_call(self):
        """simple_call() should upper-case plain bases and normalise the rest."""
        call = self.genome.simple_call
        for lower, upper in zip("acgt", "ACGT"):
            self.assertEqual(upper, call(lower))
        # It should check the base at position one
        self.assertEqual("A", call("agctn"))
        # It should replace uracil with thymine
        self.assertEqual("T", call("u"))
        # It should replace X with N if not allowed
        for allowed, wanted in ((False, "N"), (True, "X")):
            self.assertEqual(wanted, call("X", allow_x=allowed))
        # It should replace . with N if deletions are not allowed
        for allowed, wanted in ((False, "N"), (True, ".")):
            self.assertEqual(wanted, call(".", allow_del=allowed))
        # It should replace degeneracies with N
        self.assertEqual("N", call("d"))

    def test_simple_call_with_empty(self):
        """An empty string gives 'N' without allow_del and '.' with it."""
        call = self.genome.simple_call
        self.assertEqual("N", call("", allow_del=False))
        self.assertEqual(".", call("", allow_del=True))

    def test_simple_call_with_none(self):
        """None gives 'N' without allow_del and '.' with it."""
        call = self.genome.simple_call
        self.assertEqual("N", call(None, allow_del=False))
        self.assertEqual(".", call(None, allow_del=True))
示例#12
0
def format_fasta(inputfasta, outputfasta):
    """
    Import inputfasta into a Genome and write that Genome to outputfasta.

    The function returns None; its only effect is the file written by
    Genome.write_to_fasta_file().
    """
    from nasp.nasp_objects import Genome

    loaded = Genome()
    loaded.import_fasta_file(inputfasta)
    loaded.write_to_fasta_file(outputfasta)