def read_vcf_file(reference, min_coverage, min_proportion, input_file):
    """
    Parse a VCF file into one VCFGenome per sample and return them as a list.

    Every record whose position lies within the reference contig is checked
    against the reference call; the sample calls and their filter results are
    then stored on each sample's genome.

    Args:
        reference: Reference genome object; must provide get_contig_length()
            and get_call().
        min_coverage: Threshold a non-'PASS' coverage value must meet (>=)
            for the position to pass the coverage filter.
        min_proportion: Threshold a non-'PASS' proportion value must meet
            (>=) for the position to pass the proportion filter.
        input_file: Input description handed to get_file_path() and
            set_genome_metadata().

    Returns:
        list: The VCFGenome objects, one per sample reported by the VCF.

    Raises:
        ReferenceCallMismatch: If the simplified reference call is not 'N'
            and differs from the simplified first element of the VCF
            record's reference call at the same position.
    """
    genomes = {}
    file_path = get_file_path(input_file)
    # NOTE(review): the handle is never read in this function; VCFRecord is
    # given the path instead. The open() is kept so that an unreadable path
    # still raises here exactly as before.
    with open(file_path, 'r') as vcf_filehandle:
        from nasp.nasp_objects import VCFGenome, Genome, ReferenceCallMismatch, VCFRecord

        vcf_record = VCFRecord(file_path)
        vcf_samples = vcf_record.get_samples()
        for vcf_sample in vcf_samples:
            genomes[vcf_sample] = VCFGenome()
            set_genome_metadata(genomes[vcf_sample], input_file)
            genomes[vcf_sample].set_nickname(vcf_sample)

        while vcf_record.fetch_next_record():
            current_contig = vcf_record.get_contig()
            current_pos = vcf_record.get_position()
            # Skip if position isn't in reference; maybe user truncated
            # reference to exclude an uninteresting region.
            if current_pos <= reference.get_contig_length(current_contig):
                reference_call = reference.get_call(current_pos, None, current_contig)
                simplified_refcall = Genome.simple_call(reference_call)
                if (simplified_refcall != 'N') and (
                        simplified_refcall != Genome.simple_call(vcf_record.get_reference_call()[0])):
                    # Reference call from reference fasta differs from
                    # reference call in VCF file at the same position.
                    raise ReferenceCallMismatch(reference_call, vcf_record.get_reference_call(), file_path,
                                                current_contig, current_pos)
                for vcf_sample in vcf_samples:
                    genome = genomes[vcf_sample]
                    sample_info = vcf_record.get_sample_info(vcf_sample)
                    # FIXME indels
                    if sample_info['call'] is not None:
                        genome.set_call(sample_info['call'], current_pos, 'X', current_contig)
                    if sample_info['was_called']:
                        genome.set_was_called('Y', current_pos, current_contig)
                    if sample_info['coverage'] is not None:
                        if sample_info['coverage'] == 'PASS' or sample_info['coverage'] >= min_coverage:
                            genome.set_coverage_pass('Y', current_pos, current_contig)
                        else:
                            genome.set_coverage_pass('N', current_pos, current_contig)
                    if sample_info['proportion'] is not None:
                        if sample_info['proportion'] == 'PASS' or sample_info['proportion'] >= min_proportion:
                            genome.set_proportion_pass('Y', current_pos, current_contig)
                        else:
                            genome.set_proportion_pass('N', current_pos, current_contig)
                    elif not sample_info['is_a_snp']:
                        # Some big SNP callers, like GATK, do not provide
                        # proportion information when the position is called
                        # reference. We cannot filter these positions.
                        genome.set_proportion_pass('-', current_pos, current_contig)
    # Materialise the dict view so callers get the list the docstring
    # promises (indexable, and independent of the internal dict).
    return list(genomes.values())
def _update_genome_from_delta_data(franken_genome, external_genome, parser_state, distance_covered, is_external_insert):
    """
    Apply one step of delta data to franken_genome and advance parser_state.

    A distance_covered of -1 is replaced by the distance from the current
    reference position to parser_state['final_pos'] (inclusive) and forces
    is_external_insert to True.

    When the distance is positive, that many calls are read from
    external_genome starting at (or, for a reversed alignment, ending at)
    parser_state['external_pos'], reverse-complemented if the alignment is
    reversed, and written to franken_genome at parser_state['reference_pos'].
    Both positions are then moved past the copied segment.

    Finally, an external insert moves the external position one further step
    in the alignment direction; otherwise a '.' is written at the reference
    position and the reference position moves one step forward.

    Returns:
        The same parser_state mapping, updated in place.
    """
    from nasp.nasp_objects import Genome

    if distance_covered == -1:
        remaining = parser_state['final_pos'] - parser_state['reference_pos'] + 1
        distance_covered = remaining
        is_external_insert = True

    if distance_covered > 0:
        if parser_state['external_is_reversed']:
            # Reversed alignment: the segment ends at the external position.
            segment_end = parser_state['external_pos']
            segment_start = segment_end - distance_covered + 1
            raw_calls = external_genome.get_call(segment_start, segment_end)
            matching_segment = Genome.reverse_complement(''.join(raw_calls))
        else:
            # Forward alignment: the segment starts at the external position.
            segment_start = parser_state['external_pos']
            segment_end = segment_start + distance_covered - 1
            raw_calls = external_genome.get_call(segment_start, segment_end)
            matching_segment = ''.join(raw_calls)

        franken_genome.set_call(matching_segment, parser_state['reference_pos'], 'X')
        parser_state['reference_pos'] = parser_state['reference_pos'] + distance_covered
        if parser_state['external_is_reversed']:
            external_shift = -distance_covered
        else:
            external_shift = distance_covered
        parser_state['external_pos'] = parser_state['external_pos'] + external_shift

    if not is_external_insert:
        franken_genome.set_call('.', parser_state['reference_pos'], '!')
        parser_state['reference_pos'] += 1
    else:
        parser_state['external_pos'] += -1 if parser_state['external_is_reversed'] else 1

    return parser_state
def test_parse_delta_file(self):
    """
    Parsing REFERENCE_DELTA with REFERENCE_FASTA as the external genome
    should yield a genome whose FASTA output equals REFERENCE_FASTA line
    for line.
    """
    from tests import testdata
    from tempfile import NamedTemporaryFile
    from nasp.nasp_objects import Genome

    rebuilt = Genome()
    source = Genome()
    source.import_fasta_file(testdata.REFERENCE_FASTA)

    convert_external_genome.parse_delta_file(testdata.REFERENCE_DELTA, rebuilt, source)

    # The comparison must happen while the temporary file still exists.
    with NamedTemporaryFile() as scratch:
        rebuilt.write_to_fasta_file(scratch.name)
        with open(testdata.REFERENCE_FASTA) as expected_handle, open(scratch.name) as actual_handle:
            expected_lines = expected_handle.readlines()
            actual_lines = actual_handle.readlines()
            self.assertEqual(expected_lines, actual_lines)
def test_parse_delta_file(self):
    """
    Parsing REFERENCE_DELTA with REFERENCE_FASTA as the external genome
    should yield a genome whose FASTA output equals REFERENCE_FASTA line
    for line.
    """
    from tests import testdata
    from tempfile import NamedTemporaryFile
    from nasp.nasp_objects import Genome

    rebuilt = Genome()
    source = Genome()
    source.import_fasta_file(testdata.REFERENCE_FASTA)

    convert_external_genome.parse_delta_file(testdata.REFERENCE_DELTA, rebuilt, source)

    # The comparison must happen while the temporary file still exists.
    with NamedTemporaryFile() as scratch:
        rebuilt.write_to_fasta_file(scratch.name)
        with open(testdata.REFERENCE_FASTA) as expected_handle, open(scratch.name) as actual_handle:
            expected_lines = expected_handle.readlines()
            actual_lines = actual_handle.readlines()
            self.assertEqual(expected_lines, actual_lines)
def _update_genome_from_delta_data(franken_genome, external_genome, parser_state, distance_covered, is_external_insert):
    """
    Apply one step of delta data to franken_genome and advance parser_state.

    A distance_covered of -1 is replaced by the distance from the current
    reference position to parser_state['final_pos'] (inclusive) and forces
    is_external_insert to True.

    When the distance is positive, that many calls are read from
    external_genome starting at (or, for a reversed alignment, ending at)
    parser_state['external_pos'], reverse-complemented if the alignment is
    reversed, and written to franken_genome at parser_state['reference_pos'].
    Both positions are then moved past the copied segment.

    Finally, an external insert moves the external position one further step
    in the alignment direction; otherwise a '.' is written at the reference
    position and the reference position moves one step forward.

    Returns:
        The same parser_state mapping, updated in place.
    """
    from nasp.nasp_objects import Genome

    if distance_covered == -1:
        remaining = parser_state['final_pos'] - parser_state['reference_pos'] + 1
        distance_covered = remaining
        is_external_insert = True

    if distance_covered > 0:
        if parser_state['external_is_reversed']:
            # Reversed alignment: the segment ends at the external position.
            segment_end = parser_state['external_pos']
            segment_start = segment_end - distance_covered + 1
            raw_calls = external_genome.get_call(segment_start, segment_end)
            matching_segment = Genome.reverse_complement(''.join(raw_calls))
        else:
            # Forward alignment: the segment starts at the external position.
            segment_start = parser_state['external_pos']
            segment_end = segment_start + distance_covered - 1
            raw_calls = external_genome.get_call(segment_start, segment_end)
            matching_segment = ''.join(raw_calls)

        franken_genome.set_call(matching_segment, parser_state['reference_pos'], 'X')
        parser_state['reference_pos'] = parser_state['reference_pos'] + distance_covered
        if parser_state['external_is_reversed']:
            external_shift = -distance_covered
        else:
            external_shift = distance_covered
        parser_state['external_pos'] = parser_state['external_pos'] + external_shift

    if not is_external_insert:
        franken_genome.set_call('.', parser_state['reference_pos'], '!')
        parser_state['reference_pos'] += 1
    else:
        parser_state['external_pos'] += -1 if parser_state['external_is_reversed'] else 1

    return parser_state
def setUp(self):
    """Create a new Genome on self.genome before each test runs."""
    fresh_genome = Genome()
    self.genome = fresh_genome
class GenomeTestCase(unittest.TestCase):
    """Unit tests for Genome: FASTA line import, reverse complement and call simplification."""

    def setUp(self):
        """Create a new Genome on self.genome before each test runs."""
        self.genome = Genome()

    @unittest.skip("Covered by GenomeStatus.set_value()")
    def test_set_call(self):
        pass

    @unittest.skip("Covered by GenomeStatus.get_value()")
    def test_get_call(self):
        pass

    # FIXME: it should throw an exception or the prefix should be optional
    # If the prefix is missing, the contig will be appended to the previous contig
    def test__import_fasta_line_missing_prefix(self):
        """An identifier line lacking the given prefix still registers its name as a contig."""
        contig_name = "SEQUENCE"
        header_line = ">" + contig_name
        self.genome._import_fasta_line(header_line, "prefix")
        self.assertListEqual([contig_name], self.genome.get_contigs())

    # FIXME: assertRaises a specific Exception
    def test__import_fasta_line_missing_identifier(self):
        """Importing a sequence line with no preceding identifier raises."""
        with self.assertRaises(Exception):
            self.genome._import_fasta_line("ABCDGHMNRSTUVWXY")

    # FIXME: assertRaises a specific Exception
    def test__import_fasta_line_missing_contig(self):
        """Importing sequence lines before any contig exists raises."""
        with self.assertRaises(Exception):
            self.genome._import_fasta_line("SEQUENCE1")
            self.genome._import_fasta_line("SEQUENCE2")

    def test__import_fasta_line_identifier_contains_spaces(self):
        """Only the text between the prefix and the first space becomes the contig name."""
        header_line = ">prefixName Description"
        self.genome._import_fasta_line(header_line, "prefix")
        self.assertListEqual(["Name"], self.genome.get_contigs())

    @unittest.skip("Covered by _import_fasta_file tests")
    def test_import_fasta_file(self):
        pass

    def test_reverse_complement(self):
        """Check reverse_complement in both directions over upper- and lower-case codes."""
        forward = "ABCDGHMNRSTUVWXYabcdghmnrstuvwxy"
        complemented = "rxwbaasynkdchgvtRXWBAASYNKDCHGVT"
        # Complementing back does not restore every original code.
        round_trip = "ABCDGHKNRSTTVWXYabcdghknrsttvwxy"
        self.assertEqual(complemented, self.genome.reverse_complement(forward))
        self.assertEqual(round_trip, self.genome.reverse_complement(complemented))

    def test_simple_call(self):
        """Check simple_call for plain bases, uracil, X, deletions and degeneracies."""
        simplify = self.genome.simple_call
        for base in ("A", "C", "G", "T"):
            self.assertEqual(base, simplify(base.lower()))
        # It should check the base at position one
        self.assertEqual("A", simplify("agctn"))
        # It should replace uracil with thymine
        self.assertEqual("T", simplify("u"))
        # It should replace X with N if not allowed
        self.assertEqual("N", simplify("X", allow_x=False))
        self.assertEqual("X", simplify("X", allow_x=True))
        # It should replace . with N if deletions are not allowed
        self.assertEqual("N", simplify(".", allow_del=False))
        self.assertEqual(".", simplify(".", allow_del=True))
        # It should replace degeneracies with N
        self.assertEqual("N", simplify("d"))

    def test_simple_call_with_empty(self):
        """An empty call simplifies to 'N', or to '.' when deletions are allowed."""
        simplify = self.genome.simple_call
        self.assertEqual("N", simplify("", allow_del=False))
        self.assertEqual(".", simplify("", allow_del=True))

    def test_simple_call_with_none(self):
        """A None call simplifies to 'N', or to '.' when deletions are allowed."""
        simplify = self.genome.simple_call
        self.assertEqual("N", simplify(None, allow_del=False))
        self.assertEqual(".", simplify(None, allow_del=True))
def read_vcf_file(reference, min_coverage, min_proportion, input_file):
    """
    Parse a VCF file into one VCFGenome per sample and return them as a list.

    Every record whose position lies within the reference contig is checked
    against the reference call; the sample calls and their filter results are
    then stored on each sample's genome.

    Args:
        reference: Reference genome object; must provide get_contig_length()
            and get_call().
        min_coverage: Threshold a non-'PASS' coverage value must meet (>=)
            for the position to pass the coverage filter.
        min_proportion: Threshold a non-'PASS' proportion value must meet
            (>=) for the position to pass the proportion filter.
        input_file: Input description handed to get_file_path() and
            set_genome_metadata().

    Returns:
        list: The VCFGenome objects, one per sample reported by the VCF.

    Raises:
        ReferenceCallMismatch: If the simplified reference call is not 'N'
            and differs from the simplified first element of the VCF
            record's reference call at the same position.
    """
    genomes = {}
    file_path = get_file_path(input_file)
    # NOTE(review): the handle is never read in this function; VCFRecord is
    # given the path instead. The open() is kept so that an unreadable path
    # still raises here exactly as before.
    with open(file_path, 'r') as vcf_filehandle:
        from nasp.nasp_objects import VCFGenome, Genome, ReferenceCallMismatch, VCFRecord

        vcf_record = VCFRecord(file_path)
        vcf_samples = vcf_record.get_samples()
        for vcf_sample in vcf_samples:
            genomes[vcf_sample] = VCFGenome()
            set_genome_metadata(genomes[vcf_sample], input_file)
            genomes[vcf_sample].set_nickname(vcf_sample)

        while vcf_record.fetch_next_record():
            current_contig = vcf_record.get_contig()
            current_pos = vcf_record.get_position()
            # Skip if position isn't in reference; maybe user truncated
            # reference to exclude an uninteresting region.
            if current_pos <= reference.get_contig_length(current_contig):
                reference_call = reference.get_call(current_pos, None, current_contig)
                simplified_refcall = Genome.simple_call(reference_call)
                if (simplified_refcall != 'N') and (
                        simplified_refcall != Genome.simple_call(vcf_record.get_reference_call()[0])):
                    # Reference call from reference fasta differs from
                    # reference call in VCF file at the same position.
                    raise ReferenceCallMismatch(reference_call, vcf_record.get_reference_call(), file_path,
                                                current_contig, current_pos)
                for vcf_sample in vcf_samples:
                    genome = genomes[vcf_sample]
                    sample_info = vcf_record.get_sample_info(vcf_sample)
                    # FIXME indels
                    if sample_info['call'] is not None:
                        genome.set_call(sample_info['call'], current_pos, 'X', current_contig)
                    if sample_info['was_called']:
                        genome.set_was_called('Y', current_pos, current_contig)
                    if sample_info['coverage'] is not None:
                        if sample_info['coverage'] == 'PASS' or sample_info['coverage'] >= min_coverage:
                            genome.set_coverage_pass('Y', current_pos, current_contig)
                        else:
                            genome.set_coverage_pass('N', current_pos, current_contig)
                    if sample_info['proportion'] is not None:
                        if sample_info['proportion'] == 'PASS' or sample_info['proportion'] >= min_proportion:
                            genome.set_proportion_pass('Y', current_pos, current_contig)
                        else:
                            genome.set_proportion_pass('N', current_pos, current_contig)
                    elif not sample_info['is_a_snp']:
                        # Some big SNP callers, like GATK, do not provide
                        # proportion information when the position is called
                        # reference. We cannot filter these positions.
                        genome.set_proportion_pass('-', current_pos, current_contig)
    # Materialise the dict view so callers get the list the docstring
    # promises (indexable, and independent of the internal dict).
    return list(genomes.values())
def format_fasta(inputfasta, outputfasta):
    """
    Rewrite a FASTA file by loading it into a Genome and writing it back out.

    Args:
        inputfasta: FASTA file handed to Genome.import_fasta_file().
        outputfasta: Destination handed to Genome.write_to_fasta_file().
    """
    from nasp.nasp_objects import Genome

    genome = Genome()
    genome.import_fasta_file(inputfasta)
    genome.write_to_fasta_file(outputfasta)
class GenomeTestCase(unittest.TestCase):
    """Unit tests for Genome: FASTA line import, reverse complement and call simplification."""

    def setUp(self):
        """Create a new Genome on self.genome before each test runs."""
        self.genome = Genome()

    @unittest.skip("Covered by GenomeStatus.set_value()")
    def test_set_call(self):
        pass

    @unittest.skip("Covered by GenomeStatus.get_value()")
    def test_get_call(self):
        pass

    # FIXME: it should throw an exception or the prefix should be optional
    # If the prefix is missing, the contig will be appended to the previous contig
    def test__import_fasta_line_missing_prefix(self):
        """An identifier line lacking the given prefix still registers its name as a contig."""
        contig_name = "SEQUENCE"
        header_line = ">" + contig_name
        self.genome._import_fasta_line(header_line, "prefix")
        self.assertListEqual([contig_name], self.genome.get_contigs())

    # FIXME: assertRaises a specific Exception
    def test__import_fasta_line_missing_identifier(self):
        """Importing a sequence line with no preceding identifier raises."""
        with self.assertRaises(Exception):
            self.genome._import_fasta_line("ABCDGHMNRSTUVWXY")

    # FIXME: assertRaises a specific Exception
    def test__import_fasta_line_missing_contig(self):
        """Importing sequence lines before any contig exists raises."""
        with self.assertRaises(Exception):
            self.genome._import_fasta_line("SEQUENCE1")
            self.genome._import_fasta_line("SEQUENCE2")

    def test__import_fasta_line_identifier_contains_spaces(self):
        """Only the text between the prefix and the first space becomes the contig name."""
        header_line = ">prefixName Description"
        self.genome._import_fasta_line(header_line, "prefix")
        self.assertListEqual(["Name"], self.genome.get_contigs())

    @unittest.skip("Covered by _import_fasta_file tests")
    def test_import_fasta_file(self):
        pass

    def test_reverse_complement(self):
        """Check reverse_complement in both directions over upper- and lower-case codes."""
        forward = "ABCDGHMNRSTUVWXYabcdghmnrstuvwxy"
        complemented = "rxwbaasynkdchgvtRXWBAASYNKDCHGVT"
        # Complementing back does not restore every original code.
        round_trip = "ABCDGHKNRSTTVWXYabcdghknrsttvwxy"
        self.assertEqual(complemented, self.genome.reverse_complement(forward))
        self.assertEqual(round_trip, self.genome.reverse_complement(complemented))

    def test_simple_call(self):
        """Check simple_call for plain bases, uracil, X, deletions and degeneracies."""
        simplify = self.genome.simple_call
        for base in ("A", "C", "G", "T"):
            self.assertEqual(base, simplify(base.lower()))
        # It should check the base at position one
        self.assertEqual("A", simplify("agctn"))
        # It should replace uracil with thymine
        self.assertEqual("T", simplify("u"))
        # It should replace X with N if not allowed
        self.assertEqual("N", simplify("X", allow_x=False))
        self.assertEqual("X", simplify("X", allow_x=True))
        # It should replace . with N if deletions are not allowed
        self.assertEqual("N", simplify(".", allow_del=False))
        self.assertEqual(".", simplify(".", allow_del=True))
        # It should replace degeneracies with N
        self.assertEqual("N", simplify("d"))

    def test_simple_call_with_empty(self):
        """An empty call simplifies to 'N', or to '.' when deletions are allowed."""
        simplify = self.genome.simple_call
        self.assertEqual("N", simplify("", allow_del=False))
        self.assertEqual(".", simplify("", allow_del=True))

    def test_simple_call_with_none(self):
        """A None call simplifies to 'N', or to '.' when deletions are allowed."""
        simplify = self.genome.simple_call
        self.assertEqual("N", simplify(None, allow_del=False))
        self.assertEqual(".", simplify(None, allow_del=True))