예제 #1
0
def map_references(contig_ref_name: str, coordinates_name: str,
                   projects: ProjectConfig) -> typing.Mapping[int, int]:
    ref_seq = projects.getReference(contig_ref_name)
    coordinates_seq = projects.getReference(coordinates_name)
    aligned_coordinates, aligned_ref, _ = align_nucs(coordinates_seq, ref_seq)
    mapped_positions = {}
    coordinate_pos = ref_pos = 0
    for coordinate_nuc, ref_nuc in zip(aligned_coordinates, aligned_ref):
        if coordinate_nuc != '-':
            coordinate_pos += 1
        if ref_nuc != '-':
            ref_pos += 1
            mapped_positions[ref_pos] = coordinate_pos
    return mapped_positions
예제 #2
0
def find_coord_pos(projects: ProjectConfig,
                   coord_name: str,
                   start_pos: int = None,
                   end_pos: int = None):
    coord_seq = projects.getReference(coord_name)
    if start_pos is None:
        start_pos = 1
    if end_pos is None:
        end_pos = len(coord_seq) + 1
    if projects.config['regions'][coord_name]['is_nucleotide']:
        # Already have a nucleotide sequence, nothing to do.
        return coord_name, start_pos, end_pos
    gap_open = 40
    gap_extend = 10
    use_terminal_gap_penalty = 1
    highest_score = 0
    best_match = None
    ref_names = set()
    for project in projects.config['projects'].values():
        for region in project['regions']:
            if coord_name == region['coordinate_region']:
                ref_names.update(region['seed_region_names'])

    for ref_name in sorted(ref_names):
        ref_nuc_seq = projects.getReference(ref_name)
        for nuc_offset in range(3):
            ref_amino_seq = translate(ref_nuc_seq, nuc_offset)
            aligned_coord, aligned_ref, score = align_it_aa(
                coord_seq, ref_amino_seq, gap_open, gap_extend,
                use_terminal_gap_penalty)
            if score > highest_score:
                highest_score = score
                best_match = (ref_name, nuc_offset, aligned_coord, aligned_ref)
    ref_name, nuc_offset, aligned_coord, aligned_ref = best_match
    coord_pos = ref_pos = 0
    ref_start = ref_end = None
    for coord_amino, ref_amino in zip(aligned_coord, aligned_ref):
        if ref_amino != '-':
            ref_pos += 1
        if coord_amino != '-':
            coord_pos += 1
            if start_pos == coord_pos:
                ref_start = ref_pos * 3 - nuc_offset - 3
            if coord_pos == end_pos:
                ref_end = ref_pos * 3 - nuc_offset
    assert ref_start is not None
    assert ref_end is not None
    return ref_name, ref_start, ref_end
예제 #3
0
class ProjectConfigurationTest(unittest.TestCase):
    def setUp(self):
        self.defaultJsonIO = StringIO("""\
{
  "projects": {
    "R1": {
      "max_variants": 5,
      "regions": [
        {
          "coordinate_region": "R1",
          "seed_region_names": ["R1-seed"],
          "id": 10042
        }
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTGAAA",
        "GGG"
      ],
      "seed_group": "R1-seeds"
    },
    "R1": {
      "is_nucleotide": false,
      "reference": [
        "RWN",
        "NWR"
      ],
      "seed_group": null
    }
  }
}
""")
        self.config = ProjectConfig()

    def testConvert(self):
        expected_fasta = """\
>R1-seed
ACTGAAAGGG
"""
        fasta = StringIO()

        self.config.load(self.defaultJsonIO)
        self.config.writeSeedFasta(fasta)

        self.assertMultiLineEqual(expected_fasta, fasta.getvalue())

    def testSharedRegions(self):
        jsonIO = StringIO("""\
{
  "projects": {
    "R1": {
      "regions": [
        {
          "coordinate_region": null,
          "seed_region_names": ["R1-seed"]
        }
      ]
    },
    "R1 and R2": {
      "regions": [
        {
          "coordinate_region": null,
          "seed_region_names": ["R1-seed"]
        },
        {
          "coordinate_region": null,
          "seed_region_names": ["R2-seed"]
        }
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTGAAA",
        "GGG"
      ]
    },
    "R2-seed": {
      "is_nucleotide": true,
      "reference": [
        "TTT"
      ]
    }
  }
}
""")
        expected_fasta = """\
>R1-seed
ACTGAAAGGG
>R2-seed
TTT
"""
        fasta = StringIO()

        self.config.load(jsonIO)
        self.config.writeSeedFasta(fasta)

        self.assertMultiLineEqual(expected_fasta, fasta.getvalue())

    def testUnusedRegion(self):
        jsonIO = StringIO("""\
{
  "projects": {
    "R1": {
      "regions": [
        {
          "coordinate_region": null,
          "seed_region_names": ["R1-seed"]
        }
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTGAAA",
        "GGG"
      ]
    },
    "R2-seed": {
      "is_nucleotide": true,
      "reference": [
        "TTT"
      ]
    }
  }
}
""")
        expected_fasta = """\
>R1-seed
ACTGAAAGGG
"""
        fasta = StringIO()

        self.config.load(jsonIO)
        self.config.writeSeedFasta(fasta)

        self.assertMultiLineEqual(expected_fasta, fasta.getvalue())

    def testExcludeSeeds(self):
        jsonIO = StringIO("""\
{
  "projects": {
    "R1": {
      "regions": [
        {
          "coordinate_region": null,
          "seed_region_names": ["R1-seed"]
        }
      ]
    },
    "R2": {
      "regions": [
        {
          "coordinate_region": null,
          "seed_region_names": ["R2-seed"]
        }
      ]
    },
    "R3": {
      "regions": [
        {
          "coordinate_region": null,
          "seed_region_names": ["R3-seed"]
        }
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTGAAA",
        "GGG"
      ]
    },
    "R2-seed": {
      "is_nucleotide": true,
      "reference": [
        "TTT"
      ]
    },
    "R3-seed": {
      "is_nucleotide": true,
      "reference": [
        "TAG"
      ]
    }
  }
}
""")
        expected_fasta = """\
>R2-seed
TTT
"""
        fasta = StringIO()

        self.config.load(jsonIO)
        self.config.writeSeedFasta(fasta,
                                   excluded_seeds=['R1-seed', 'R3-seed'])

        self.assertMultiLineEqual(expected_fasta, fasta.getvalue())

    def testExcludeUnknownSeed(self):
        expected_fasta = """\
>R1-seed
ACTGAAAGGG
"""
        fasta = StringIO()

        self.config.load(self.defaultJsonIO)
        self.config.writeSeedFasta(fasta, excluded_seeds=['R99-seed'])

        self.assertMultiLineEqual(expected_fasta, fasta.getvalue())

    def testDuplicateReference(self):
        jsonIO = StringIO("""\
{
  "projects": {
    "R1": {
      "regions": [
        {
          "coordinate_region": null,
          "seed_region_names": ["R1a-seed", "R1b-seed"]
        }
      ]
    }
  },
  "regions": {
    "R1a-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTAAAGGG"
      ]
    },
    "R1b-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTAAAGGG"
      ]
    }
  }
}
""")
        fasta = StringIO()
        self.config.load(jsonIO)

        self.assertRaisesRegex(RuntimeError,
                               "Duplicate references: R1a-seed and R1b-seed.",
                               self.config.writeSeedFasta, fasta)

    def testGetReference(self):
        self.config.load(self.defaultJsonIO)
        seed_name = 'R1-seed'
        expected_ref = 'ACTGAAAGGG'

        seed_ref = self.config.getReference(seed_name)

        self.assertSequenceEqual(expected_ref, seed_ref)

    def testGetCoordinateReferences(self):
        self.config.load(self.defaultJsonIO)
        seed_name = 'R1-seed'
        expected_refs = {'R1': 'RWNNWR'}

        coordinate_refs = self.config.getCoordinateReferences(seed_name)

        self.assertDictEqual(expected_refs, coordinate_refs)

    def testGetAllReferences(self):
        expected_references = {'R1-seed': 'ACTGAAAGGG', 'R1': 'RWNNWR'}

        self.config.load(self.defaultJsonIO)
        references = self.config.getAllReferences()

        self.assertEqual(expected_references, references)

    def testUnknownReference(self):
        self.config.load(self.defaultJsonIO)
        seed_name = 'R-unknown'

        self.assertRaises(KeyError, self.config.getReference, seed_name)

    def testMaxVariants(self):
        self.config.load(self.defaultJsonIO)
        coordinate_region_name = 'R1'

        self.assertEqual(5, self.config.getMaxVariants(coordinate_region_name))

    def testMaxVariantsUnusedRegion(self):
        jsonIO = StringIO("""\
{
  "projects": {
    "R1": {
      "max_variants": 2,
      "regions": [
        {
          "coordinate_region": "R1",
          "seed_region_names": ["R1-seed"]
        }
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTGAAA",
        "GGG"
      ]
    },
    "R1": {
      "is_nucleotide": false,
      "reference": [
        "NSFW"
      ]
    },
    "R2": {
      "is_nucleotide": false,
      "reference": [
        "RSW"
      ]
    }
  }
}
""")
        self.config.load(jsonIO)
        coordinate_region_name = 'R2'

        self.assertEqual(0, self.config.getMaxVariants(coordinate_region_name))

    def testMaxVariantsTwoProjects(self):
        """ If two projects specify a maximum for the same coordinate region,
        use the bigger of the two.
        """
        jsonIO = StringIO("""\
{
  "projects": {
    "R1": {
      "max_variants": 9,
      "regions": [
        {
          "coordinate_region": "R1",
          "seed_region_names": ["R1-seed"]
        }
      ]
    },
    "R1-and-R2": {
      "max_variants": 2,
      "regions": [
        {
          "coordinate_region": "R1",
          "seed_region_names": ["R1-seed"]
        },
        {
          "coordinate_region": "R2",
          "seed_region_names": ["R1-seed"]
        }
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTGAAA",
        "GGG"
      ]
    },
    "R1": {
      "is_nucleotide": false,
      "reference": [
        "NSFW"
      ]
    },
    "R2": {
      "is_nucleotide": false,
      "reference": [
        "RSW"
      ]
    }
  }
}
""")
        self.config.load(jsonIO)
        coordinate_region_name = 'R1'

        self.assertEqual(9, self.config.getMaxVariants(coordinate_region_name))

    def testReload(self):
        jsonIO1 = StringIO("""\
{
  "projects": {
    "R1": {
      "regions": [
        {
          "coordinate_region": null,
          "seed_region_names": ["R1-seed"]
        }
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTGAAA",
        "GGG"
      ]
    }
  }
}
""")
        jsonIO2 = StringIO("""\
{
  "projects": {
    "R2": {
      "regions": [
        {
          "coordinate_region": null,
          "seed_region_names": ["R2-seed"]
        }
      ]
    }
  },
  "regions": {
    "R2-seed": {
      "is_nucleotide": true,
      "reference": [
        "GACCTA"
      ]
    }
  }
}
""")

        self.config.load(jsonIO1)
        self.config.load(jsonIO2)

        self.assertRaises(KeyError, self.config.getReference, "R1-seed")
        self.assertSequenceEqual("GACCTA", self.config.getReference("R2-seed"))

    def testProjectSeeds(self):
        expected_seeds = set(['R1-seed'])

        self.config.load(self.defaultJsonIO)
        seeds = self.config.getProjectSeeds('R1')

        self.assertSetEqual(expected_seeds, seeds)

    def testSeedGroup(self):
        expected_group = "R1-seeds"

        self.config.load(self.defaultJsonIO)
        group = self.config.getSeedGroup('R1-seed')

        self.assertEqual(expected_group, group)
예제 #4
0
class ProjectConfigurationTest(unittest.TestCase):
    def setUp(self):
        self.defaultJsonIO = StringIO.StringIO("""\
{
  "projects": {
    "R1": {
      "max_variants": 5,
      "regions": [
        {
          "coordinate_region": "R1",
          "seed_region_names": ["R1-seed"],
          "id": 10042
        }
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTGAAA",
        "GGG"
      ],
      "seed_group": "R1-seeds"
    },
    "R1": {
      "is_nucleotide": false,
      "reference": [
        "RWN",
        "NWR"
      ],
      "seed_group": null
    }
  }
}
""")
        self.config = ProjectConfig()

    def testConvert(self):
        expected_fasta = """\
>R1-seed
ACTGAAAGGG
"""
        fasta = StringIO.StringIO()

        self.config.load(self.defaultJsonIO)
        self.config.writeSeedFasta(fasta)

        self.assertMultiLineEqual(expected_fasta, fasta.getvalue())

    def testSharedRegions(self):
        jsonIO = StringIO.StringIO("""\
{
  "projects": {
    "R1": {
      "regions": [
        {
          "coordinate_region": null,
          "seed_region_names": ["R1-seed"]
        }
      ]
    },
    "R1 and R2": {
      "regions": [
        {
          "coordinate_region": null,
          "seed_region_names": ["R1-seed"]
        },
        {
          "coordinate_region": null,
          "seed_region_names": ["R2-seed"]
        }
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTGAAA",
        "GGG"
      ]
    },
    "R2-seed": {
      "is_nucleotide": true,
      "reference": [
        "TTT"
      ]
    }
  }
}
""")
        expected_fasta = """\
>R1-seed
ACTGAAAGGG
>R2-seed
TTT
"""
        fasta = StringIO.StringIO()

        self.config.load(jsonIO)
        self.config.writeSeedFasta(fasta)

        self.assertMultiLineEqual(expected_fasta, fasta.getvalue())

    def testUnusedRegion(self):
        jsonIO = StringIO.StringIO("""\
{
  "projects": {
    "R1": {
      "regions": [
        {
          "coordinate_region": null,
          "seed_region_names": ["R1-seed"]
        }
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTGAAA",
        "GGG"
      ]
    },
    "R2-seed": {
      "is_nucleotide": true,
      "reference": [
        "TTT"
      ]
    }
  }
}
""")
        expected_fasta = """\
>R1-seed
ACTGAAAGGG
"""
        fasta = StringIO.StringIO()

        self.config.load(jsonIO)
        self.config.writeSeedFasta(fasta)

        self.assertMultiLineEqual(expected_fasta, fasta.getvalue())

    def testDuplicateReference(self):
        jsonIO = StringIO.StringIO("""\
{
  "projects": {
    "R1": {
      "regions": [
        {
          "coordinate_region": null,
          "seed_region_names": ["R1a-seed", "R1b-seed"]
        }
      ]
    }
  },
  "regions": {
    "R1a-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTAAAGGG"
      ]
    },
    "R1b-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTAAAGGG"
      ]
    }
  }
}
""")
        fasta = StringIO.StringIO()
        self.config.load(jsonIO)

        self.assertRaisesRegexp(RuntimeError,
                                "Duplicate references: R1a-seed and R1b-seed.",
                                self.config.writeSeedFasta,
                                fasta)

    def testGetReference(self):
        self.config.load(self.defaultJsonIO)
        seed_name = 'R1-seed'
        expected_ref = 'ACTGAAAGGG'

        seed_ref = self.config.getReference(seed_name)

        self.assertSequenceEqual(expected_ref, seed_ref)

    def testGetCoordinateReferences(self):
        self.config.load(self.defaultJsonIO)
        seed_name = 'R1-seed'
        expected_refs = {'R1': 'RWNNWR'}

        coordinate_refs = self.config.getCoordinateReferences(seed_name)

        self.assertDictEqual(expected_refs, coordinate_refs)

    def testUnknownReference(self):
        self.config.load(self.defaultJsonIO)
        seed_name = 'R-unknown'

        self.assertRaises(KeyError, self.config.getReference, seed_name)

    def testMaxVariants(self):
        self.config.load(self.defaultJsonIO)
        coordinate_region_name = 'R1'

        self.assertEqual(5, self.config.getMaxVariants(coordinate_region_name))

    def testMaxVariantsUnusedRegion(self):
        jsonIO = StringIO.StringIO("""\
{
  "projects": {
    "R1": {
      "max_variants": 2,
      "regions": [
        {
          "coordinate_region": "R1",
          "seed_region_names": ["R1-seed"]
        }
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTGAAA",
        "GGG"
      ]
    },
    "R1": {
      "is_nucleotide": false,
      "reference": [
        "NSFW"
      ]
    },
    "R2": {
      "is_nucleotide": false,
      "reference": [
        "RSW"
      ]
    }
  }
}
""")
        self.config.load(jsonIO)
        coordinate_region_name = 'R2'

        self.assertEqual(0, self.config.getMaxVariants(coordinate_region_name))

    def testMaxVariantsTwoProjects(self):
        """ If two projects specify a maximum for the same coordinate region,
        use the bigger of the two.
        """
        jsonIO = StringIO.StringIO("""\
{
  "projects": {
    "R1": {
      "max_variants": 9,
      "regions": [
        {
          "coordinate_region": "R1",
          "seed_region_names": ["R1-seed"]
        }
      ]
    },
    "R1-and-R2": {
      "max_variants": 2,
      "regions": [
        {
          "coordinate_region": "R1",
          "seed_region_names": ["R1-seed"]
        },
        {
          "coordinate_region": "R2",
          "seed_region_names": ["R1-seed"]
        }
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTGAAA",
        "GGG"
      ]
    },
    "R1": {
      "is_nucleotide": false,
      "reference": [
        "NSFW"
      ]
    },
    "R2": {
      "is_nucleotide": false,
      "reference": [
        "RSW"
      ]
    }
  }
}
""")
        self.config.load(jsonIO)
        coordinate_region_name = 'R1'

        self.assertEqual(9, self.config.getMaxVariants(coordinate_region_name))

    def testReload(self):
        jsonIO1 = StringIO.StringIO("""\
{
  "projects": {
    "R1": {
      "regions": [
        {
          "coordinate_region": null,
          "seed_region_names": ["R1-seed"]
        }
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTGAAA",
        "GGG"
      ]
    }
  }
}
""")
        jsonIO2 = StringIO.StringIO("""\
{
  "projects": {
    "R2": {
      "regions": [
        {
          "coordinate_region": null,
          "seed_region_names": ["R2-seed"]
        }
      ]
    }
  },
  "regions": {
    "R2-seed": {
      "is_nucleotide": true,
      "reference": [
        "GACCTA"
      ]
    }
  }
}
""")

        self.config.load(jsonIO1)
        self.config.load(jsonIO2)

        self.assertRaises(KeyError, self.config.getReference, "R1-seed")
        self.assertSequenceEqual("GACCTA", self.config.getReference("R2-seed"))

    def testProjectSeeds(self):
        expected_seeds = set(['R1-seed'])

        self.config.load(self.defaultJsonIO)
        seeds = self.config.getProjectSeeds('R1')

        self.assertSetEqual(expected_seeds, seeds)

    def testSeedGroup(self):
        expected_group = "R1-seeds"

        self.config.load(self.defaultJsonIO)
        group = self.config.getSeedGroup('R1-seed')

        self.assertEqual(expected_group, group)

    def testProjectRegions(self):
        jsonIO = StringIO.StringIO("""\
{
  "projects": {
    "R1": {
      "max_variants": 0,
      "regions": [
        {
          "coordinate_region": "R1",
          "coordinate_region_length": 3,
          "key_positions": [],
          "min_coverage1": 10,
          "min_coverage2": 50,
          "min_coverage3": 100,
          "seed_region_names": [
            "R1-seed"
          ]
        }
      ]
    },
    "R1 and R2": {
      "max_variants": 0,
      "regions": [
        {
          "coordinate_region": "R1",
          "coordinate_region_length": 3,
          "key_positions": [1, 3],
          "min_coverage1": 10,
          "min_coverage2": 50,
          "min_coverage3": 100,
          "seed_region_names": [
            "R1-seed"
          ]
        },
        {
          "coordinate_region": "R2",
          "coordinate_region_length": 1,
          "key_positions": [],
          "min_coverage1": 10,
          "min_coverage2": 50,
          "min_coverage3": 100,
          "seed_region_names": [
            "R2-seed"
          ]
        }
      ]
    }
  }
}
""")
        expected_project_regions = [{"project_name": "R1",
                                     "coordinate_region_length": 3,
                                     "key_positions": [],
                                     "min_coverage1": 10,
                                     "min_coverage2": 50,
                                     "min_coverage3": 100},
                                    {"project_name": "R1 and R2",
                                     "coordinate_region_length": 3,
                                     "key_positions": [1, 3],
                                     "min_coverage1": 10,
                                     "min_coverage2": 50,
                                     "min_coverage3": 100}]

        self.config.load(jsonIO)
        project_regions = list(self.config.getProjectRegions('R1-seed', 'R1'))

        self.assertEqual(expected_project_regions, project_regions)