def map_references(contig_ref_name: str,
                   coordinates_name: str,
                   projects: ProjectConfig) -> typing.Mapping[int, int]:
    """ Map positions in a contig reference onto a coordinate reference.

    Both named sequences are fetched from projects and aligned with
    align_nucs(); the aligned pair is then walked one column at a time.

    :param contig_ref_name: name of the reference whose positions become the
        keys of the result
    :param coordinates_name: name of the reference whose positions become the
        values of the result
    :param projects: source of both reference sequences
    :return: {contig_pos: coordinates_pos}, where each position is the count
        of non-gap characters seen so far in that aligned sequence. Contig
        positions therefore start at 1; a coordinates position of 0 means the
        coordinate reference had not started yet at that column.
    """
    contig_seq = projects.getReference(contig_ref_name)
    coordinates_seq = projects.getReference(coordinates_name)
    aligned_coordinates, aligned_contig, _ = align_nucs(coordinates_seq,
                                                        contig_seq)

    position_map = {}
    coordinates_index = 0
    contig_index = 0
    for coordinates_nuc, contig_nuc in zip(aligned_coordinates,
                                           aligned_contig):
        if coordinates_nuc != '-':
            coordinates_index += 1
        if contig_nuc == '-':
            # Gap in the contig reference: there is no contig position to
            # record for this column.
            continue
        contig_index += 1
        position_map[contig_index] = coordinates_index
    return position_map
def find_coord_pos(projects: ProjectConfig,
                   coord_name: str,
                   start_pos: typing.Optional[int] = None,
                   end_pos: typing.Optional[int] = None
                   ) -> typing.Tuple[str, int, int]:
    """ Find a nucleotide reference and positions for a coordinate region.

    If the coordinate region is already flagged "is_nucleotide" in the project
    configuration, its own name and the requested positions are returned
    unchanged. Otherwise, every seed reference that any project pairs with the
    coordinate region is translated with translate() at offsets 0, 1, and 2,
    each translation is aligned to the coordinate reference with
    align_it_aa(), and the highest scoring alignment is used to convert the
    requested positions. Ties keep the first candidate in sorted name order,
    then the lowest offset. Only scores above zero are considered.

    :param projects: project configuration holding the references
    :param coord_name: name of the coordinate region to look up
    :param start_pos: start position within the coordinate reference, defaults
        to 1
    :param end_pos: end position within the coordinate reference, defaults to
        one more than the length of the coordinate reference
    :return: (reference name, start, end). For a nucleotide region, that is
        (coord_name, start_pos, end_pos). For an amino acid region, start is
        ref_pos * 3 - nuc_offset - 3 and end is ref_pos * 3 - nuc_offset,
        where ref_pos counts the non-gap reference amino acids up to the
        column holding the requested coordinate position.
    :raises ValueError: if no seed reference aligns to the coordinate region
        with a positive score, or if start_pos or end_pos is never reached
        while walking the alignment
    """
    coord_seq = projects.getReference(coord_name)
    if start_pos is None:
        start_pos = 1
    if end_pos is None:
        # NOTE(review): this default is one past the last position, so in the
        # amino acid branch below it presumably can never be matched; callers
        # appear to need an explicit end_pos there - confirm.
        end_pos = len(coord_seq) + 1
    if projects.config['regions'][coord_name]['is_nucleotide']:
        # Already have a nucleotide sequence, nothing to do.
        return coord_name, start_pos, end_pos

    # Alignment settings, passed straight through to align_it_aa().
    gap_open = 40
    gap_extend = 10
    use_terminal_gap_penalty = 1

    # Every seed reference that some project pairs with this coordinate region.
    ref_names = {seed_name
                 for project in projects.config['projects'].values()
                 for region in project['regions']
                 if coord_name == region['coordinate_region']
                 for seed_name in region['seed_region_names']}

    highest_score = 0
    best_match = None
    # Sorted, so that ties are broken the same way on every run.
    for ref_name in sorted(ref_names):
        ref_nuc_seq = projects.getReference(ref_name)
        for nuc_offset in range(3):
            ref_amino_seq = translate(ref_nuc_seq, nuc_offset)
            aligned_coord, aligned_ref, score = align_it_aa(
                coord_seq,
                ref_amino_seq,
                gap_open,
                gap_extend,
                use_terminal_gap_penalty)
            if score > highest_score:
                highest_score = score
                best_match = (ref_name, nuc_offset, aligned_coord, aligned_ref)
    if best_match is None:
        # Without this check, the unpacking below fails with an unhelpful
        # "cannot unpack NoneType" error.
        raise ValueError(
            'No seed reference aligned to coordinate region {!r}.'.format(
                coord_name))

    ref_name, nuc_offset, aligned_coord, aligned_ref = best_match
    coord_pos = ref_pos = 0
    ref_start = ref_end = None
    for coord_amino, ref_amino in zip(aligned_coord, aligned_ref):
        if ref_amino != '-':
            ref_pos += 1
        if coord_amino != '-':
            coord_pos += 1
            # Only record on the column that holds the coordinate amino acid,
            # so later gap columns can't overwrite the result.
            if start_pos == coord_pos:
                ref_start = ref_pos * 3 - nuc_offset - 3
            if coord_pos == end_pos:
                ref_end = ref_pos * 3 - nuc_offset

    # Explicit checks instead of assert statements, so they still run when
    # Python is started with -O.
    if ref_start is None:
        raise ValueError(
            'Start position {} not found in coordinate region {!r}.'.format(
                start_pos,
                coord_name))
    if ref_end is None:
        raise ValueError(
            'End position {} not found in coordinate region {!r}.'.format(
                end_pos,
                coord_name))
    return ref_name, ref_start, ref_end
class ProjectConfigurationTest(unittest.TestCase):
    """ Exercise ProjectConfig loading, seed FASTA export, and lookups. """

    def setUp(self):
        # Every test gets an empty configuration and a fresh copy of the
        # default JSON: one project, R1, that pairs coordinate region R1 with
        # the seed region R1-seed.
        self.config = ProjectConfig()
        self.defaultJsonIO = StringIO("""\
{
  "projects": {
    "R1": {
      "max_variants": 5,
      "regions": [
        {
          "coordinate_region": "R1",
          "seed_region_names": ["R1-seed"],
          "id": 10042
        }
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTGAAA",
        "GGG"
      ],
      "seed_group": "R1-seeds"
    },
    "R1": {
      "is_nucleotide": false,
      "reference": [
        "RWN",
        "NWR"
      ],
      "seed_group": null
    }
  }
}
""")

    def _seed_fasta_text(self, json_source, **write_options):
        # Load json_source into the configuration, write the seed FASTA with
        # any extra options, and return the text that was written.
        self.config.load(json_source)
        seed_fasta = StringIO()
        self.config.writeSeedFasta(seed_fasta, **write_options)
        return seed_fasta.getvalue()

    def testConvert(self):
        # Reference lines get joined into a single FASTA sequence line.
        fasta_text = self._seed_fasta_text(self.defaultJsonIO)

        self.assertMultiLineEqual("""\
>R1-seed
ACTGAAAGGG
""", fasta_text)

    def testSharedRegions(self):
        # A seed used by two projects is only written once.
        shared_json = StringIO("""\
{
  "projects": {
    "R1": {
      "regions": [
        {
          "coordinate_region": null,
          "seed_region_names": ["R1-seed"]
        }
      ]
    },
    "R1 and R2": {
      "regions": [
        {
          "coordinate_region": null,
          "seed_region_names": ["R1-seed"]
        },
        {
          "coordinate_region": null,
          "seed_region_names": ["R2-seed"]
        }
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTGAAA",
        "GGG"
      ]
    },
    "R2-seed": {
      "is_nucleotide": true,
      "reference": [
        "TTT"
      ]
    }
  }
}
""")

        fasta_text = self._seed_fasta_text(shared_json)

        self.assertMultiLineEqual("""\
>R1-seed
ACTGAAAGGG
>R2-seed
TTT
""", fasta_text)

    def testUnusedRegion(self):
        # R2-seed is defined, but no project refers to it.
        unused_json = StringIO("""\
{
  "projects": {
    "R1": {
      "regions": [
        {
          "coordinate_region": null,
          "seed_region_names": ["R1-seed"]
        }
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTGAAA",
        "GGG"
      ]
    },
    "R2-seed": {
      "is_nucleotide": true,
      "reference": [
        "TTT"
      ]
    }
  }
}
""")

        fasta_text = self._seed_fasta_text(unused_json)

        self.assertMultiLineEqual("""\
>R1-seed
ACTGAAAGGG
""", fasta_text)

    def testExcludeSeeds(self):
        # Only the seed that was not excluded gets written.
        three_seeds_json = StringIO("""\
{
  "projects": {
    "R1": {
      "regions": [
        {
          "coordinate_region": null,
          "seed_region_names": ["R1-seed"]
        }
      ]
    },
    "R2": {
      "regions": [
        {
          "coordinate_region": null,
          "seed_region_names": ["R2-seed"]
        }
      ]
    },
    "R3": {
      "regions": [
        {
          "coordinate_region": null,
          "seed_region_names": ["R3-seed"]
        }
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTGAAA",
        "GGG"
      ]
    },
    "R2-seed": {
      "is_nucleotide": true,
      "reference": [
        "TTT"
      ]
    },
    "R3-seed": {
      "is_nucleotide": true,
      "reference": [
        "TAG"
      ]
    }
  }
}
""")

        fasta_text = self._seed_fasta_text(
            three_seeds_json,
            excluded_seeds=['R1-seed', 'R3-seed'])

        self.assertMultiLineEqual("""\
>R2-seed
TTT
""", fasta_text)

    def testExcludeUnknownSeed(self):
        # Excluding a seed name that isn't configured changes nothing.
        fasta_text = self._seed_fasta_text(self.defaultJsonIO,
                                           excluded_seeds=['R99-seed'])

        self.assertMultiLineEqual("""\
>R1-seed
ACTGAAAGGG
""", fasta_text)

    def testDuplicateReference(self):
        # Two seeds with identical sequences are reported as an error.
        duplicate_json = StringIO("""\
{
  "projects": {
    "R1": {
      "regions": [
        {
          "coordinate_region": null,
          "seed_region_names": ["R1a-seed", "R1b-seed"]
        }
      ]
    }
  },
  "regions": {
    "R1a-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTAAAGGG"
      ]
    },
    "R1b-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTAAAGGG"
      ]
    }
  }
}
""")
        self.config.load(duplicate_json)
        seed_fasta = StringIO()

        with self.assertRaisesRegex(
                RuntimeError,
                "Duplicate references: R1a-seed and R1b-seed."):
            self.config.writeSeedFasta(seed_fasta)

    def testGetReference(self):
        self.config.load(self.defaultJsonIO)

        reference = self.config.getReference('R1-seed')

        self.assertSequenceEqual('ACTGAAAGGG', reference)

    def testGetCoordinateReferences(self):
        self.config.load(self.defaultJsonIO)

        references = self.config.getCoordinateReferences('R1-seed')

        self.assertDictEqual({'R1': 'RWNNWR'}, references)

    def testGetAllReferences(self):
        self.config.load(self.defaultJsonIO)

        all_references = self.config.getAllReferences()

        self.assertEqual({'R1-seed': 'ACTGAAAGGG', 'R1': 'RWNNWR'},
                         all_references)

    def testUnknownReference(self):
        self.config.load(self.defaultJsonIO)

        with self.assertRaises(KeyError):
            self.config.getReference('R-unknown')

    def testMaxVariants(self):
        self.config.load(self.defaultJsonIO)

        max_variants = self.config.getMaxVariants('R1')

        self.assertEqual(5, max_variants)

    def testMaxVariantsUnusedRegion(self):
        # R2 is defined, but no project uses it as a coordinate region.
        unused_region_json = StringIO("""\
{
  "projects": {
    "R1": {
      "max_variants": 2,
      "regions": [
        {
          "coordinate_region": "R1",
          "seed_region_names": ["R1-seed"]
        }
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTGAAA",
        "GGG"
      ]
    },
    "R1": {
      "is_nucleotide": false,
      "reference": [
        "NSFW"
      ]
    },
    "R2": {
      "is_nucleotide": false,
      "reference": [
        "RSW"
      ]
    }
  }
}
""")
        self.config.load(unused_region_json)

        max_variants = self.config.getMaxVariants('R2')

        self.assertEqual(0, max_variants)

    def testMaxVariantsTwoProjects(self):
        """ If two projects specify a maximum for the same coordinate region, use
        the bigger of the two.
        """
        two_projects_json = StringIO("""\
{
  "projects": {
    "R1": {
      "max_variants": 9,
      "regions": [
        {
          "coordinate_region": "R1",
          "seed_region_names": ["R1-seed"]
        }
      ]
    },
    "R1-and-R2": {
      "max_variants": 2,
      "regions": [
        {
          "coordinate_region": "R1",
          "seed_region_names": ["R1-seed"]
        },
        {
          "coordinate_region": "R2",
          "seed_region_names": ["R1-seed"]
        }
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTGAAA",
        "GGG"
      ]
    },
    "R1": {
      "is_nucleotide": false,
      "reference": [
        "NSFW"
      ]
    },
    "R2": {
      "is_nucleotide": false,
      "reference": [
        "RSW"
      ]
    }
  }
}
""")
        self.config.load(two_projects_json)

        max_variants = self.config.getMaxVariants('R1')

        self.assertEqual(9, max_variants)

    def testReload(self):
        # Loading a second configuration replaces the first one completely.
        first_json = StringIO("""\
{
  "projects": {
    "R1": {
      "regions": [
        {
          "coordinate_region": null,
          "seed_region_names": ["R1-seed"]
        }
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTGAAA",
        "GGG"
      ]
    }
  }
}
""")
        second_json = StringIO("""\
{
  "projects": {
    "R2": {
      "regions": [
        {
          "coordinate_region": null,
          "seed_region_names": ["R2-seed"]
        }
      ]
    }
  },
  "regions": {
    "R2-seed": {
      "is_nucleotide": true,
      "reference": [
        "GACCTA"
      ]
    }
  }
}
""")

        for json_source in (first_json, second_json):
            self.config.load(json_source)

        with self.assertRaises(KeyError):
            self.config.getReference("R1-seed")
        self.assertSequenceEqual("GACCTA",
                                 self.config.getReference("R2-seed"))

    def testProjectSeeds(self):
        self.config.load(self.defaultJsonIO)

        project_seeds = self.config.getProjectSeeds('R1')

        self.assertSetEqual({'R1-seed'}, project_seeds)

    def testSeedGroup(self):
        self.config.load(self.defaultJsonIO)

        seed_group = self.config.getSeedGroup('R1-seed')

        self.assertEqual("R1-seeds", seed_group)
class ProjectConfigurationTest(unittest.TestCase):
    """ Tests for loading a ProjectConfig and querying it.

    Streams are built with StringIO(...) directly, and the regular expression
    check uses assertRaisesRegex, because the Python 2 spellings
    (StringIO.StringIO and assertRaisesRegexp) are not available in current
    Python 3 releases.

    NOTE(review): an earlier class in this file has the same name. If both
    really live in one module, this definition replaces the earlier one, and
    the earlier tests never run - consider merging or renaming.
    """

    def setUp(self):
        # Default configuration: one project, R1, that pairs coordinate region
        # R1 with the seed region R1-seed.
        self.defaultJsonIO = StringIO("""\
{
  "projects": {
    "R1": {
      "max_variants": 5,
      "regions": [
        {
          "coordinate_region": "R1",
          "seed_region_names": ["R1-seed"],
          "id": 10042
        }
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTGAAA",
        "GGG"
      ],
      "seed_group": "R1-seeds"
    },
    "R1": {
      "is_nucleotide": false,
      "reference": [
        "RWN",
        "NWR"
      ],
      "seed_group": null
    }
  }
}
""")
        self.config = ProjectConfig()

    def testConvert(self):
        # Reference lines get joined into a single FASTA sequence line.
        expected_fasta = """\
>R1-seed
ACTGAAAGGG
"""
        fasta = StringIO()

        self.config.load(self.defaultJsonIO)
        self.config.writeSeedFasta(fasta)

        self.assertMultiLineEqual(expected_fasta, fasta.getvalue())

    def testSharedRegions(self):
        # A seed used by two projects is only written once.
        jsonIO = StringIO("""\
{
  "projects": {
    "R1": {
      "regions": [
        {
          "coordinate_region": null,
          "seed_region_names": ["R1-seed"]
        }
      ]
    },
    "R1 and R2": {
      "regions": [
        {
          "coordinate_region": null,
          "seed_region_names": ["R1-seed"]
        },
        {
          "coordinate_region": null,
          "seed_region_names": ["R2-seed"]
        }
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTGAAA",
        "GGG"
      ]
    },
    "R2-seed": {
      "is_nucleotide": true,
      "reference": [
        "TTT"
      ]
    }
  }
}
""")
        expected_fasta = """\
>R1-seed
ACTGAAAGGG
>R2-seed
TTT
"""
        fasta = StringIO()

        self.config.load(jsonIO)
        self.config.writeSeedFasta(fasta)

        self.assertMultiLineEqual(expected_fasta, fasta.getvalue())

    def testUnusedRegion(self):
        # R2-seed is defined, but no project refers to it.
        jsonIO = StringIO("""\
{
  "projects": {
    "R1": {
      "regions": [
        {
          "coordinate_region": null,
          "seed_region_names": ["R1-seed"]
        }
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTGAAA",
        "GGG"
      ]
    },
    "R2-seed": {
      "is_nucleotide": true,
      "reference": [
        "TTT"
      ]
    }
  }
}
""")
        expected_fasta = """\
>R1-seed
ACTGAAAGGG
"""
        fasta = StringIO()

        self.config.load(jsonIO)
        self.config.writeSeedFasta(fasta)

        self.assertMultiLineEqual(expected_fasta, fasta.getvalue())

    def testDuplicateReference(self):
        # Two seeds with identical sequences are reported as an error.
        jsonIO = StringIO("""\
{
  "projects": {
    "R1": {
      "regions": [
        {
          "coordinate_region": null,
          "seed_region_names": ["R1a-seed", "R1b-seed"]
        }
      ]
    }
  },
  "regions": {
    "R1a-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTAAAGGG"
      ]
    },
    "R1b-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTAAAGGG"
      ]
    }
  }
}
""")
        fasta = StringIO()
        self.config.load(jsonIO)

        self.assertRaisesRegex(RuntimeError,
                               "Duplicate references: R1a-seed and R1b-seed.",
                               self.config.writeSeedFasta,
                               fasta)

    def testGetReference(self):
        self.config.load(self.defaultJsonIO)
        seed_name = 'R1-seed'
        expected_ref = 'ACTGAAAGGG'

        seed_ref = self.config.getReference(seed_name)

        self.assertSequenceEqual(expected_ref, seed_ref)

    def testGetCoordinateReferences(self):
        self.config.load(self.defaultJsonIO)
        seed_name = 'R1-seed'
        expected_refs = {'R1': 'RWNNWR'}

        coordinate_refs = self.config.getCoordinateReferences(seed_name)

        self.assertDictEqual(expected_refs, coordinate_refs)

    def testUnknownReference(self):
        self.config.load(self.defaultJsonIO)
        seed_name = 'R-unknown'

        self.assertRaises(KeyError, self.config.getReference, seed_name)

    def testMaxVariants(self):
        self.config.load(self.defaultJsonIO)
        coordinate_region_name = 'R1'

        self.assertEqual(5, self.config.getMaxVariants(coordinate_region_name))

    def testMaxVariantsUnusedRegion(self):
        # R2 is defined, but no project uses it as a coordinate region.
        jsonIO = StringIO("""\
{
  "projects": {
    "R1": {
      "max_variants": 2,
      "regions": [
        {
          "coordinate_region": "R1",
          "seed_region_names": ["R1-seed"]
        }
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTGAAA",
        "GGG"
      ]
    },
    "R1": {
      "is_nucleotide": false,
      "reference": [
        "NSFW"
      ]
    },
    "R2": {
      "is_nucleotide": false,
      "reference": [
        "RSW"
      ]
    }
  }
}
""")
        self.config.load(jsonIO)
        coordinate_region_name = 'R2'

        self.assertEqual(0, self.config.getMaxVariants(coordinate_region_name))

    def testMaxVariantsTwoProjects(self):
        """ If two projects specify a maximum for the same coordinate region, use
        the bigger of the two.
        """
        jsonIO = StringIO("""\
{
  "projects": {
    "R1": {
      "max_variants": 9,
      "regions": [
        {
          "coordinate_region": "R1",
          "seed_region_names": ["R1-seed"]
        }
      ]
    },
    "R1-and-R2": {
      "max_variants": 2,
      "regions": [
        {
          "coordinate_region": "R1",
          "seed_region_names": ["R1-seed"]
        },
        {
          "coordinate_region": "R2",
          "seed_region_names": ["R1-seed"]
        }
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTGAAA",
        "GGG"
      ]
    },
    "R1": {
      "is_nucleotide": false,
      "reference": [
        "NSFW"
      ]
    },
    "R2": {
      "is_nucleotide": false,
      "reference": [
        "RSW"
      ]
    }
  }
}
""")
        self.config.load(jsonIO)
        coordinate_region_name = 'R1'

        self.assertEqual(9, self.config.getMaxVariants(coordinate_region_name))

    def testReload(self):
        # Loading a second configuration replaces the first one completely.
        jsonIO1 = StringIO("""\
{
  "projects": {
    "R1": {
      "regions": [
        {
          "coordinate_region": null,
          "seed_region_names": ["R1-seed"]
        }
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTGAAA",
        "GGG"
      ]
    }
  }
}
""")
        jsonIO2 = StringIO("""\
{
  "projects": {
    "R2": {
      "regions": [
        {
          "coordinate_region": null,
          "seed_region_names": ["R2-seed"]
        }
      ]
    }
  },
  "regions": {
    "R2-seed": {
      "is_nucleotide": true,
      "reference": [
        "GACCTA"
      ]
    }
  }
}
""")

        self.config.load(jsonIO1)
        self.config.load(jsonIO2)

        self.assertRaises(KeyError, self.config.getReference, "R1-seed")
        self.assertSequenceEqual("GACCTA", self.config.getReference("R2-seed"))

    def testProjectSeeds(self):
        expected_seeds = {'R1-seed'}

        self.config.load(self.defaultJsonIO)
        seeds = self.config.getProjectSeeds('R1')

        self.assertSetEqual(expected_seeds, seeds)

    def testSeedGroup(self):
        expected_group = "R1-seeds"

        self.config.load(self.defaultJsonIO)
        group = self.config.getSeedGroup('R1-seed')

        self.assertEqual(expected_group, group)

    def testProjectRegions(self):
        # Only the project entries that pair seed R1-seed with coordinate
        # region R1 are expected, each labelled with its project name.
        jsonIO = StringIO("""\
{
  "projects": {
    "R1": {
      "max_variants": 0,
      "regions": [
        {
          "coordinate_region": "R1",
          "coordinate_region_length": 3,
          "key_positions": [],
          "min_coverage1": 10,
          "min_coverage2": 50,
          "min_coverage3": 100,
          "seed_region_names": [
            "R1-seed"
          ]
        }
      ]
    },
    "R1 and R2": {
      "max_variants": 0,
      "regions": [
        {
          "coordinate_region": "R1",
          "coordinate_region_length": 3,
          "key_positions": [1, 3],
          "min_coverage1": 10,
          "min_coverage2": 50,
          "min_coverage3": 100,
          "seed_region_names": [
            "R1-seed"
          ]
        },
        {
          "coordinate_region": "R2",
          "coordinate_region_length": 1,
          "key_positions": [],
          "min_coverage1": 10,
          "min_coverage2": 50,
          "min_coverage3": 100,
          "seed_region_names": [
            "R2-seed"
          ]
        }
      ]
    }
  }
}
""")
        expected_project_regions = [{"project_name": "R1",
                                     "coordinate_region_length": 3,
                                     "key_positions": [],
                                     "min_coverage1": 10,
                                     "min_coverage2": 50,
                                     "min_coverage3": 100},
                                    {"project_name": "R1 and R2",
                                     "coordinate_region_length": 3,
                                     "key_positions": [1, 3],
                                     "min_coverage1": 10,
                                     "min_coverage2": 50,
                                     "min_coverage3": 100}]

        self.config.load(jsonIO)
        project_regions = list(self.config.getProjectRegions('R1-seed', 'R1'))

        self.assertEqual(expected_project_regions, project_regions)