def test_left_right_from():
    """The range with the lower bounds is left of the other one, which in turn is right of it."""
    sequence = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
    lower = SequenceRange(1, 11, sequence)
    upper = SequenceRange(9, 20, sequence)

    ok_(lower.is_left_from(upper))
    ok_(upper.is_right_from(lower))
def _parse_interpro_ranges(self, xml_str):
    """
    Extracts the domain ranges from an interproscan 5 xml document.

    Matches without a signature entry are skipped. A location is only
    turned into a range if it is longer than 20 positions or if
    '_short_domain_allowed' accepts the entry.

    :param xml_str: the xml document, as a string
    :return: a list of SequenceRange objects on the protein's sequence,
             each with the entry's accession code stored as 'ac'
    """
    ns_map = {
        'p': 'http://www.ebi.ac.uk/interpro/resources/schemas/interproscan5'
    }

    root = ET.fromstring(xml_str)
    sequence_elem = root.find('./p:protein/p:sequence', namespaces=ns_map)
    sequence = sequence_elem.text

    matches_elem = root.find('./p:protein/p:matches', namespaces=ns_map)
    if matches_elem is None:
        # find() returns None for a missing element, which cannot be
        # iterated. No matches simply means: no ranges.
        return []

    ranges = []
    for match_elem in matches_elem:
        entry_elem = match_elem.find('.//p:signature/p:entry',
                                     namespaces=ns_map)
        if entry_elem is None:
            continue

        ac = entry_elem.get('ac')
        allow_short_domain = self._short_domain_allowed(entry_elem)

        locations_elem = match_elem.find('.//p:locations', namespaces=ns_map)
        if locations_elem is None:
            continue  # a match without locations yields no ranges

        for location_elem in locations_elem:
            # NOTE(review): both positions are shifted down by one. If
            # SequenceRange treats 'end' as exclusive, this drops the last
            # residue of the domain - TODO confirm.
            start = int(location_elem.get('start')) - 1
            end = int(location_elem.get('end')) - 1

            length = end - start
            if length > 20 or allow_short_domain:
                range_ = SequenceRange(start, end, sequence)
                range_.ac = ac
                ranges.append(range_)

    return ranges
def test_includes_residue():
    """Residues 6, 10 and 15 must all be reported as included in the range 5 - 15."""
    range_ = SequenceRange(
        5, 15, "AAAAAAAVAVAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA")

    for residue_number in (6, 10, 15):
        ok_(range_.includes_residue(residue_number))
def test_intersection():
    """The intersection of 1 - 15 and 10 - 20 must run from 10 to 15."""
    sequence = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
    left = SequenceRange(1, 15, sequence)
    right = SequenceRange(10, 20, sequence)

    shared = left.get_intersection(right)

    eq_(shared.start, 10)
    eq_(shared.end, 15)
def test_merge():
    """Merging 0 - 11 with 5 - 22 must give a range from 0 to 22."""
    sequence = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
    left = SequenceRange(0, 11, sequence)
    right = SequenceRange(5, 22, sequence)

    merged = left.merge_with(right)

    eq_(merged.start, 0)
    eq_(merged.end, 22)
def test_merge_similar_ranges():
    """The four input ranges must be merged down to two."""
    sequence = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
    bounds = ((1, 22), (19, 30), (0, 23), (1, 23))
    input_ranges = [SequenceRange(start, end, sequence)
                    for start, end in bounds]

    merged_ranges = domain_aligner._merge_similar_ranges(input_ranges)

    eq_(len(merged_ranges), 2)
def test_alignment_ok_for_range():
    """
    A template that aligns with most of the target is accepted for the range
    1 - 22, a template with only four aligned residues is rejected.
    """
    target = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"

    # Accepted: the template is aligned everywhere but at the termini.
    range_ = SequenceRange(1, 22, target)
    good_alignment = TargetTemplateAlignment(
        target,
        "-----AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA--")
    ok_(domain_aligner._alignment_ok_for_range(range_, good_alignment))

    # Rejected: the template is almost entirely gaps.
    range_ = SequenceRange(1, 22, target)
    poor_alignment = TargetTemplateAlignment(
        target,
        "-------GGGG------------------------------------")
    ok_(not domain_aligner._alignment_ok_for_range(range_, poor_alignment))
def test_no_alignment_flip():
    """
    Every chain alignment that the modeler makes must, with the gaps taken
    out of its target side, still be a part of the target sequence.
    """
    seq = (
        "MGKLVALVLLGVGLSLVGEMFLAFRERVNASREVEPVEPENCHLIEELESGSEDIDILPSGLAFISSGLKYP"
        "GMPNFAPDEPGKIFLMDLNEQNPRAQALEISGGFDKELFNPHGISIFIDKDNTVYLYVVNHPHMKSTVEIFK"
        "FEEQQRSLVYLKTIKHELLKSVNDIVVLGPEQFYATRDHYFTNSLLSFFEMILDLRWTYVLFYSPREVKVVA"
        "KGFCSANGITVSADQKYVYVADVAAKNIHIMEKHDNWDLTQLKVIQLGTLVDNLTVDPATGDILAGCHPNPM"
        "KLLNYNPEDPPGSEVLRIQNVLSEKPRVSTVYANNGSVLQGTSVASVYHGKILIGTVFHKTLYCEL"
    )
    species_id = 'human'

    range_ = SequenceRange(183, 265, seq)
    template_id = TemplateID('4zrn', 'A')
    target_side = ("YFTNSLLSFFEMILDLRWT---YVLFYSPRE-----VKVVA---KGFCSANGITVSAD-Q--K-"
                   "YVYVADVAAKNIHIMEKHDNWDLTQLKVIQLGT")
    template_side = ("YSTEMYLEFFAREYGLKYTVLRYANVYGPRQDPYGEAGVVAIFTERMLRGEEVHIFGDGEYVRD"
                     "YVYVDDVVRANLLAMEKGDN------EVFNIGT")
    alignment = DomainAlignment(target_side, template_side,
                                range_, template_id)

    context = modeler._prepare_context(alignment.template_id.pdbid)
    context.set_main_target(seq, species_id, alignment.template_id.chain_id)

    chain_alignments = modeler._make_alignments(seq, species_id, alignment,
                                                context, None)

    for chain_id in chain_alignments:
        chain_alignment = chain_alignments[chain_id]
        _log.debug("got alignment {}: {}".format(chain_id, chain_alignment))

        ungapped_target = chain_alignment.target_alignment.replace('-', '')
        ok_(ungapped_target in seq)
def test_subtraction():
    """Subtracting 5 in place from the range 5 - 15 must shift it to 0 - 10."""
    shifted = SequenceRange(
        5, 15, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA")

    shifted -= 5

    eq_(shifted.start, 0)
    eq_(shifted.end, 10)
def test_filter_forbidden_ranges():
    """
    With the second of four ranges marked as a forbidden interpro domain,
    only two ranges must be left after filtering.
    """
    sequence = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
    bounds = ((1, 15), (0, 16), (17, 40), (20, 33))
    input_ranges = [SequenceRange(start, end, sequence)
                    for start, end in bounds]

    from hommod import default_settings as settings
    input_ranges[1].ac = settings.FORBIDDEN_INTERPRO_DOMAINS[0]

    remaining = domain_aligner._filter_forbidden_ranges(input_ranges)

    eq_(len(remaining), 2)
def test_generate_error_archive():
    """
    A model run in which yasara leaves an 'errorexit.txt' behind must
    result in an error archive at the path that model_storage reports.
    """
    sequence = "EDFPRFPHRGLLLDTSRHYLPLSSILDTLDVMAYNKLNVFHWH"
    alignment = DomainAlignment(sequence, sequence,
                                SequenceRange(0, len(sequence), sequence),
                                TemplateID('2GK1', 'I'))

    class _FakeYasara:
        """Stand-in for yasara, only the modeling experiment does something."""

        def CD(self, work_dir):
            # Remember the directory, the error file gets written there.
            self.work_dir = work_dir

        def Processors(self, n):
            pass

        def ExperimentHomologyModeling(self, *args, **kwargs):
            # Builds no model, only writes the error file.
            error_path = os.path.join(self.work_dir, 'errorexit.txt')
            with open(error_path, 'w') as f:
                f.write('10$ reward for reporting')

        def Experiment(self, s):
            pass

        def Wait(self, s):
            pass

        def SaveSce(self, filename):
            pass

    # The class name must match the name that is instantiated below.
    class _FakeContext:
        """Stand-in for the modeling context, with a single chain 'I'."""

        def __init__(self):
            self.target_species_id = 'HUMAN'
            self.main_target_chain_id = 'I'
            self.template_pdbid = '2GK1'
            self.yasara = _FakeYasara()
            self.template_obj = 1

        def get_main_target_sequence(self):
            return sequence

        def get_chain_ids(self):
            return ['I']

        def get_sequence(self, chain_id):
            return sequence

    context = _FakeContext()

    try:
        modeler._model_run(alignment, {'I': alignment}, context)
    except ModelRunError:
        # Tolerated on purpose: only the archive is checked below.
        pass

    tar_path = model_storage.get_error_tar_path(
        context.get_main_target_sequence(),
        context.target_species_id,
        alignment,
        TemplateID(context.template_pdbid, context.main_target_chain_id))

    ok_(os.path.isfile(tar_path))
def test_remove_enclosing():
    """Of three ranges with alignments, two must be left after removal with the range 2 - 19."""
    target = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
    template = "VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV"

    # Every range gets an alignment object of its own.
    alignments_by_range = {
        SequenceRange(start, end, target):
            TargetTemplateAlignment(target, template)
        for start, end in ((1, 22), (3, 18), (3, 20))
    }

    probe = SequenceRange(2, 19, target)
    remaining = domain_aligner._remove_enclosing(probe, alignments_by_range)

    eq_(len(remaining), 2)
def test_find_shared_hits_ranges():
    """Three ranges that hit the same template must end up in one group of three."""
    target = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
    template = "VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV"

    # Every range gets an alignment object of its own.
    alignments_by_range = {
        SequenceRange(start, end, target):
            TargetTemplateAlignment(target, template)
        for start, end in ((1, 22), (3, 18), (3, 19))
    }

    # Give all the alignments the same template:
    shared_template_id = TemplateID('1xxx', 'A')
    for alignment in alignments_by_range.values():
        alignment.template_id = shared_template_id

    shared = domain_aligner._find_shared_hits_ranges(alignments_by_range)

    eq_(len(shared), 1)
    first_group = list(shared.values())[0]
    eq_(len(first_group), 3)
def test_find_template(mock_get_domain_ranges):
    """
    With the whole sequence as the only domain range and residue 190
    required, at least one alignment must have over 300 aligned residues.
    """
    seq = (
        "MGKLVALVLLGVGLSLVGEMFLAFRERVNASREVEPVEPENCHLIEELESGSEDIDILPSGLAFISSGLKYP"
        "GMPNFAPDEPGKIFLMDLNEQNPRAQALEISGGFDKELFNPHGISIFIDKDNTVYLYVVNHPHMKSTVEIFK"
        "FEEQQRSLVYLKTIKHELLKSVNDIVVLGPEQFYATRDHYFTNSLLSFFEMILDLRWTYVLFYSPREVKVVA"
        "KGFCSANGITVSADQKYVYVADVAAKNIHIMEKHDNWDLTQLKVIQLGTLVDNLTVDPATGDILAGCHPNPM"
        "KLLNYNPEDPPGSEVLRIQNVLSEKPRVSTVYANNGSVLQGTSVASVYHGKILIGTVFHKTLYCEL"
    )

    whole_sequence = SequenceRange(0, len(seq), seq)
    mock_get_domain_ranges.return_value = [whole_sequence]

    alignments = domain_aligner.get_domain_alignments(seq, require_resnum=190)

    aligned_counts = [ali.count_aligned_residues() for ali in alignments]
    ok_(any(count > 300 for count in aligned_counts))
def test_no_alignment_flip(mock_get_domain_ranges):
    """
    Every domain alignment must, with the gaps taken out of its target side,
    still be a part of the target sequence.
    """
    seq = (
        "MGKLVALVLLGVGLSLVGEMFLAFRERVNASREVEPVEPENCHLIEELESGSEDIDILPSGLAFISSGLKYP"
        "GMPNFAPDEPGKIFLMDLNEQNPRAQALEISGGFDKELFNPHGISIFIDKDNTVYLYVVNHPHMKSTVEIFK"
        "FEEQQRSLVYLKTIKHELLKSVNDIVVLGPEQFYATRDHYFTNSLLSFFEMILDLRWTYVLFYSPREVKVVA"
        "KGFCSANGITVSADQKYVYVADVAAKNIHIMEKHDNWDLTQLKVIQLGTLVDNLTVDPATGDILAGCHPNPM"
        "KLLNYNPEDPPGSEVLRIQNVLSEKPRVSTVYANNGSVLQGTSVASVYHGKILIGTVFHKTLYCEL"
    )

    whole_sequence = SequenceRange(0, len(seq), seq)
    mock_get_domain_ranges.return_value = [whole_sequence]

    for alignment in domain_aligner.get_domain_alignments(seq):
        _log.debug("got alignment {}".format(alignment))

        ungapped_target = alignment.target_alignment.replace('-', '')
        ok_(ungapped_target in seq)
def get_relative_span(self):
    """
    Determines which part of the template sequence lies between the first
    and the last amino acid of the aligned target.

    :return: a SequenceRange on the template sequence, the bounds are
             counted in template residues, gaps ('-') are not counted
    """
    target = self.target_alignment

    # Move forward to the first amino acid in the target:
    first = 0
    while not is_amino_acid_char(target[first]):
        first += 1
    template_head = self.template_alignment[:first]
    start = len(template_head) - template_head.count('-')

    # Move backward to just past the last amino acid in the target:
    past_last = len(target)
    while not is_amino_acid_char(target[past_last - 1]):
        past_last -= 1
    template_part = self.template_alignment[:past_last]
    end = len(template_part) - template_part.count('-')

    return SequenceRange(start, end, self.get_template_sequence())
def test_get_template_sequence_in_target_range():
    """
    For the target range 48 - 96, the piece of the template alignment
    (gaps included) opposite to that part of the target must be returned.
    """
    target_alignment = (
        "PHTSHSWLCDGRLLCLHDPSNKNNWKIFRECWKQGQPVLVSGVHKKLK"
        "SELWKPEAFSQEFGDQDVDLVNCRNCAIISDVKVRDFWDGFEIICKRL"
        "RSEDGQPMVLKLKDWPPGEDFRDMMPTRFEDLMENLPLPEYTKRDGRL"
        "NLASRLPSYFVRPDLGPKMYNAYGLITAEDRRVGTTNLHLDVSDAVNV"
        "MVYVGIPIGEG-AHDEEVLKTIDEGDADEVTKQRIHDGKEKPGALWHI"
        "YAAKDAEKIRELLRKVGEEQGQENPPDHDPIHDQSWYLDQTLRKRLYE"
        "EYGVQGWAIVQFLGDAVFIPAGAPHQVHNLYSCIKVAEDFVSPEHVKH"
        "CFRLTQEF"
    )
    template_alignment = (
        "-MIPHSWICEKHILWLKDYKNSSNWKLFKECWKQGQPAVVSGVHKKMN"
        "ISLWKAESISLDFGDHQADLLNCKD-SIISNANVKEFWDGFEEVSKR-"
        "-----ETVVLKLKDWPSGEDFKTMMPARYEDLLKSLPLPEYCNPEGKF"
        "NLASHLPGFFVR---GPRLCSAYGVVAAKDHDIGTTNLHIEVSDVVNI"
        "LVYVGIAKGNGILSKAGILKKFEEEDLDDILRKRLKDSSEIPGALWHI"
        "YAGKDVDKIREFLQKISKEQG------HDPIRDQSWYVNKKLRQRLLE"
        "EYGVRTCTLIQFLGDAIVLPAGALHQVQNFHSCIQVTEDFVSPEHLVE"
        "SFHLTQEL"
    )
    alignment = TargetTemplateAlignment(target_alignment, template_alignment)

    # The range is taken on the target sequence without its gaps:
    target_sequence = alignment.target_alignment.replace('-', '')
    range_ = SequenceRange(48, 96, target_sequence)

    template_seq = domain_aligner._get_template_sequence_in_target_range(
        alignment, range_)

    eq_(template_seq, "ISLWKAESISLDFGDHQADLLNCKD-SIISNANVKEFWDGFEEVSKR-")
def test_length():
    """The range 5 - 15 must have length 10."""
    range_ = SequenceRange(
        5, 15, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA")

    eq_(range_.get_length(), 10)
def test_sub_sequence():
    """The range 5 - 15 must yield the ten residues 'AAVAVAAAAA' of its sequence."""
    range_ = SequenceRange(
        5, 15, "AAAAAAAVAVAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA")

    eq_(range_.get_sub_sequence(), "AAVAVAAAAA")
def test_percentage_overlap():
    """The ranges 0 - 10 and 5 - 15 must overlap for 50 percent."""
    sequence = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
    first = SequenceRange(0, 10, sequence)
    second = SequenceRange(5, 15, sequence)

    eq_(first.get_percentage_overlap(second), 50.0)
def test_overlaps_with():
    """The ranges 0 - 10 and 5 - 15 must be reported as overlapping."""
    sequence = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
    first = SequenceRange(0, 10, sequence)
    second = SequenceRange(5, 15, sequence)

    ok_(first.overlaps_with(second))
def get_domain_alignments(self, target_sequence, require_resnum=None, template_id=None):
    """
    Finds the best template alignments for the domains of the target.

    The ranges that interpro reports for the target (minus the forbidden
    ones) and the whole sequence are sampled for template hits, the
    largest ranges first. After each round, '_clean_search_space'
    determines the ranges for the next round, until none are left.

    :param target_sequence: the sequence to find domain alignments for
    :param require_resnum: optional residue number, ranges and hits that
                           do not cover it are skipped
    :param template_id: optional template id, passed on unchanged to
                        '_get_hits' for every range
    :return: a list of DomainAlignment objects, one per best range
    :raises InitError: if 'min_percentage_coverage' has not been set
    """
    if self.min_percentage_coverage is None:
        raise InitError("min percentage coverage is not set")

    interpro_ranges = interpro.get_domain_ranges(target_sequence)
    _log.debug("{} ranges from interpro".format(len(interpro_ranges)))

    # A real list is needed: it gets measured and appended to below.
    sample_ranges = list(self._filter_forbidden_ranges(interpro_ranges))

    if require_resnum is not None:
        # Not filter(): on python 3 that returns an iterator, which has
        # no length and cannot be appended to.
        sample_ranges = [r for r in sample_ranges
                         if r.includes_residue(require_resnum)]
        _log.debug("{} ranges have residue {}".format(
            len(sample_ranges), require_resnum))

    # Add the whole sequence as a range too:
    sample_ranges.append(
        SequenceRange(0, len(target_sequence), target_sequence))

    ok_ranges_alignments = {}
    best_ranges_alignments = {}
    checked_ranges = []

    while len(sample_ranges) > 0:
        merged_sample_ranges = self._merge_similar_ranges(sample_ranges)
        _log.debug("sampling {} ranges".format(len(merged_sample_ranges)))

        # Check the largest ranges first. If that yields, then the smaller ones don't matter.
        for range_ in sorted(merged_sample_ranges,
                             key=lambda r: r.get_length(), reverse=True):
            if range_ in checked_ranges:
                continue  # already passed this one
            checked_ranges.append(range_)

            if any(r.encloses(range_) for r in best_ranges_alignments):
                continue  # we already have a larger enclosing range

            # These can differ per range:
            best_hit = None
            last_resort_hit = None

            hit_candidates = self._get_hits(range_, template_id)
            _log.debug('trying range: {} against {} hits'.format(
                range_, len(hit_candidates)))

            for hit_candidate in hit_candidates:
                hit_range = hit_candidate.get_query_range()

                if require_resnum is not None:
                    if not hit_candidate.is_query_residue_covered(
                            require_resnum):
                        _log.debug(
                            "hit with {} on {} does not cover residue {}".
                            format(hit_candidate.get_hit_accession_code(),
                                   hit_range, require_resnum))
                        continue

                if self._alignment_ok_for_range(range_, hit_candidate):
                    _log.debug("hit with {} {} is ok".format(
                        hit_candidate.get_hit_accession_code(), hit_range))

                    # This range made an OK alignment, so at least store it for later usage.
                    # The hit's template gets a name of its own, so that
                    # the 'template_id' argument stays intact for the
                    # '_get_hits' calls on the ranges that follow.
                    hit_template_id = TemplateID(
                        hit_candidate.get_hit_accession_code(),
                        hit_candidate.get_hit_chain_id())
                    ok_ranges_alignments[hit_range] = DomainAlignment(
                        hit_candidate.query_alignment,
                        hit_candidate.subject_alignment,
                        hit_range, hit_template_id)

                    if hit_candidate.get_percentage_coverage(
                            ) > self.min_percentage_coverage:
                        _log.debug(
                            "coverage is high enough for {} {}".format(
                                hit_candidate.get_hit_accession_code(),
                                hit_range))

                        if best_hit is None or self._is_better_than(
                                hit_candidate, best_hit):
                            _log.debug("{} is better than {}".format(
                                hit_candidate, best_hit))
                            best_hit = hit_candidate
                    else:
                        # OK alignment, but too little coverage: only to
                        # be used if nothing better comes up.
                        last_resort_hit = hit_candidate

            if best_hit is None:
                best_hit = last_resort_hit

            if best_hit is not None:
                # Remove any smaller ranges that this one encloses:
                best_ranges_alignments = self._remove_enclosing(
                    range_, best_ranges_alignments)

                best_template_id = TemplateID(
                    best_hit.get_hit_accession_code(),
                    best_hit.get_hit_chain_id())
                hit_range = best_hit.get_query_range()

                _log.debug(
                    "passing best hit with template {} with range {}".
                    format(best_template_id, hit_range))

                best_ranges_alignments[hit_range] = DomainAlignment(
                    best_hit.query_alignment,
                    best_hit.subject_alignment,
                    hit_range, best_template_id)
            else:
                _log.debug("no hit for range {}".format(range_))

        # After iterating the sample ranges, prepare for the next round:
        sample_ranges = self._clean_search_space(checked_ranges,
                                                 sample_ranges,
                                                 ok_ranges_alignments)

    # A list rather than a dictionary view, so that the result stays
    # valid and can be measured and indexed by the caller.
    return list(best_ranges_alignments.values())
def test_not_overlaps_with():
    """The ranges 0 - 10 and 10 - 20 must not be reported as overlapping."""
    sequence = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
    first = SequenceRange(0, 10, sequence)
    second = SequenceRange(10, 20, sequence)

    ok_(not first.overlaps_with(second))
def test_eq():
    """Two ranges with the same bounds and the same sequence must compare equal."""
    sequence = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
    first = SequenceRange(0, 20, sequence)
    second = SequenceRange(0, 20, sequence)

    ok_(first == second)
def test_encloses():
    """The range 0 - 20 must be reported as enclosing the range 5 - 15."""
    sequence = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
    outer = SequenceRange(0, 20, sequence)
    inner = SequenceRange(5, 15, sequence)

    ok_(outer.encloses(inner))
def test_no_intersection():
    """Requests the intersection of the ranges 1 - 15 and 15 - 30."""
    sequence = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
    left = SequenceRange(1, 15, sequence)
    right = SequenceRange(15, 30, sequence)

    # NOTE(review): nothing is asserted on the result, presumably the
    # expected outcome is checked by a decorator on this test - TODO confirm.
    left.get_intersection(right)
def get_query_range(self):
    """
    Builds the range that this hit has on the full query sequence.

    :return: a SequenceRange from 'query_start' minus one to 'query_end'
    """
    # NOTE(review): presumably this converts 1-based, inclusive positions
    # to a 0-based range with an exclusive end - TODO confirm.
    range_start = self.query_start - 1
    range_end = self.query_end

    return SequenceRange(range_start, range_end, self.full_query_sequence)
def test_distance_from():
    """The distance between the ranges 1 - 9 and 11 - 20 must be 2."""
    sequence = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
    left = SequenceRange(1, 9, sequence)
    right = SequenceRange(11, 20, sequence)

    eq_(left.get_distance_from(right), 2)