def test_rmsd_pheromone_binding_protein_paper(self):
    """RMSD between apo 2fjy and holo 1dqe (chain A) should reproduce the paper's ~7 Å."""
    apo_structure = self.load_test_structure('2fjy')
    holo_structure = self.load_test_structure('1dqe')
    apo_chain = apo_structure[0]['A']
    holo_chain = holo_structure[0]['A']

    # Each chain carries leading/trailing residues absent from the other,
    # so trim both polypeptides to the shared sequence span.
    apo_polypeptide = chain_to_polypeptide(apo_chain)[:-5]
    holo_polypeptide = chain_to_polypeptide(holo_chain)[6:]

    # Analyze only chain A, as in the paper.
    apo_residues = ChainResidues(list(apo_polypeptide), apo_structure.id, apo_chain.id)
    holo_residues = ChainResidues(list(holo_polypeptide), holo_structure.id, holo_chain.id)

    # Analyzer pipeline: C-alpha coords -> centroid -> centered coords -> optimal rotation -> RMSD.
    ca_coords = GetCAlphaCoords()
    centroid = GetCentroid((ca_coords,))
    centered_ca_coords = GetCenteredCAlphaCoords((ca_coords, centroid))
    rmsd_analyzer = GetRMSD((centered_ca_coords, GetRotationMatrix((centered_ca_coords,))))

    rmsd = rmsd_analyzer(apo_residues, holo_residues)
    # todo currently 6.24 (vs 7.0 in the paper)
    self.assertAlmostEqual(7, rmsd, delta=1)
def test_rmsd_rotated_and_translated(self):
    """RMSD must stay zero after a rigid-body motion: superposition undoes rotation + translation."""
    reference = self.get_test_structure()
    moved = self.get_test_structure()

    # Rigidly move the whole second structure: rotate pi/4 about a fixed axis, then translate.
    AXIS_DIRECTION = np.array([11, 2, 0])
    AXIS_DIRECTION = AXIS_DIRECTION / np.linalg.norm(AXIS_DIRECTION)  # the code below expects a unit vector
    ANGLE = np.pi / 4
    TRANSLATION = np.array([1, 200, 7])

    moved_atoms = list(moved.get_atoms())
    original_coords = np.array([atom.get_coord() for atom in moved_atoms])
    rigid_rotation = Rotation.from_rotvec(ANGLE * AXIS_DIRECTION)
    for atom, coord in zip(moved_atoms, rigid_rotation.apply(original_coords) + TRANSLATION):
        atom.set_coord(coord)

    # RMSD of chain A against its rigidly moved copy should still be 0.
    chain_a = ChainResidues.from_bio_chain(reference[0]['A'])
    chain_a_moved = ChainResidues.from_bio_chain(moved[0]['A'])

    ca_coords = GetCAlphaCoords()
    centroid = GetCentroid((ca_coords,))
    centered_ca_coords = GetCenteredCAlphaCoords((ca_coords, centroid))
    rmsd_analyzer = GetRMSD((centered_ca_coords, GetRotationMatrix((centered_ca_coords,))))

    self.assertAlmostEqual(0, rmsd_analyzer(chain_a, chain_a_moved), places=4)
def test_rmsd_guanylate_kinase_paper(self):
    """RMSD between apo 1ex6 and holo 1ex7 guanylate kinase should reproduce the paper's 4.4 Å.

    Fix: the original test called ``logging.root.setLevel(logging.INFO)``, mutating the
    process-wide root logger and leaking that configuration into every test that runs
    afterwards. Logging verbosity belongs in the test-runner configuration, not here.
    """
    apo = self.load_test_structure('1ex6')
    holo = self.load_test_structure('1ex7')
    apo_chain = apo[0]['B']  # note that different chain (as by dyndom), why?
    holo_chain = holo[0]['A']

    self.assertTrue(sequences_same(apo_chain, holo_chain))

    # analyze just A chain (like in the paper)
    apo_residues = ChainResidues.from_bio_chain(apo_chain)
    holo_residues = ChainResidues.from_bio_chain(holo_chain)

    # Analyzer pipeline: C-alpha coords -> centroid -> centered coords -> optimal rotation -> RMSD.
    get_c_alpha_coords = GetCAlphaCoords()
    get_centroid = GetCentroid((get_c_alpha_coords,))
    get_centered_c_alpha_coords = GetCenteredCAlphaCoords((get_c_alpha_coords, get_centroid))
    get_rmsd = GetRMSD((get_centered_c_alpha_coords, GetRotationMatrix((get_centered_c_alpha_coords,))))

    rmsd = get_rmsd(apo_residues, holo_residues)
    self.assertAlmostEqual(4.4, rmsd, delta=0.1)  # 4.37 (vs 4.4 Å in the paper)
def test_interdomain_surface(self):
    """The buried interface area between chains A and B of the test structure is positive."""
    structure = self.get_test_structure()
    chain_a = ChainResidues(list(structure[0]['A']), structure.id, 'A')
    chain_b = ChainResidues(list(structure[0]['B']), structure.id, 'B')

    compute_buried_area = GetInterfaceBuriedArea((GetSASAForStructure(),))
    self.assertGreater(compute_buried_area(chain_a, chain_b), 1)
def test_get_hinge_angle(self):
    """GetHingeAngle must recover the parameters of a known screw motion applied to one domain."""
    structure1 = self.get_test_structure()
    structure2 = self.get_test_structure()
    structure2.id = f'{structure1.id}_with_rotated_chain'

    s1d1 = ChainResidues.from_bio_chain(structure1[0]['A'])
    s1d2 = ChainResidues.from_bio_chain(structure1[0]['B'])
    s2d1 = ChainResidues.from_bio_chain(structure2[0]['A'])
    s2d2 = ChainResidues.from_bio_chain(structure2[0]['B'])

    # Rotate the second structure's B domain about a defined screw axis and translate
    # along it; GetHingeAngle should then report exactly these parameters.
    AXIS_DIRECTION = np.array([1, 2, 0])
    AXIS_DIRECTION = AXIS_DIRECTION / np.linalg.norm(AXIS_DIRECTION)  # the code below expects a unit vector
    AXIS_LOCATION = np.array([52.71183395385742, 44.92530822753906, -11.425999641418457])  # a random pivot point (the axis goes through it)
    ANGLE = np.pi / 4
    TRANSLATION_IN_AXIS = 3

    # Apply the screw motion with scipy: rotate about the pivot, then shift along the axis.
    domain_atoms = [atom for residue in s2d2 for atom in residue]
    domain_coords = np.array([atom.coord for atom in domain_atoms])
    screw_rotation = Rotation.from_rotvec(ANGLE * AXIS_DIRECTION)
    rotated_coords = screw_rotation.apply(domain_coords - AXIS_LOCATION) + AXIS_LOCATION
    for atom, coord in zip(domain_atoms, rotated_coords + AXIS_DIRECTION * TRANSLATION_IN_AXIS):
        atom.set_coord(coord)

    # Recover the screw parameters with the analyzer pipeline.
    ca_coords = GetCAlphaCoords()
    centroid = GetCentroid((ca_coords,))
    centered_ca_coords = GetCenteredCAlphaCoords((ca_coords, centroid))
    hinge_analyzer = GetHingeAngle((ca_coords, centroid, GetRotationMatrix((centered_ca_coords,))))

    screw_motion = hinge_analyzer(s1d1, s1d2, s2d1, s2d2)
    self.assertAlmostEqual(ANGLE, screw_motion.angle, places=3)
    self.assertAlmostEqual(TRANSLATION_IN_AXIS, screw_motion.translation_in_axis, places=3)
def test_get_sasa_for_structure(self):
    """The solvent-accessible surface area of chain B must be positive."""
    structure = self.get_test_structure()
    chain_b_residues = ChainResidues(list(structure[0]['B']), structure.id, 'B')

    compute_sasa = GetSASAForStructure()
    self.assertGreater(compute_sasa(chain_b_residues), 1)
def test_rmsd_translated(self):
    """RMSD must be zero against a translated copy: superposition removes pure translation."""
    structure = self.get_test_structure()
    shifted_chain = structure[0]['A'].copy()
    # move the copy by 1 angstrom along x
    for atom in shifted_chain.get_atoms():
        atom.coord += (1, 0, 0)

    original = ChainResidues([residue for residue in structure[0]['A'] if is_aa(residue)],
                             structure.id, 'A')
    shifted = ChainResidues([residue for residue in shifted_chain if is_aa(residue)],
                            f'moved_{structure.id}', 'A')

    ca_coords = GetCAlphaCoords()
    centroid = GetCentroid((ca_coords,))
    centered_ca_coords = GetCenteredCAlphaCoords((ca_coords, centroid))
    rmsd_analyzer = GetRMSD((centered_ca_coords, GetRotationMatrix((centered_ca_coords,))))

    self.assertAlmostEqual(0, rmsd_analyzer(original, shifted), places=5)
def compare_chains(chain1: Chain, chain2: Chain,
                   c1_residue_mapping: BiopythonToMmcifResidueIds.Mapping,
                   c2_residue_mapping: BiopythonToMmcifResidueIds.Mapping,
                   c1_seq: Dict[int, str], c2_seq: Dict[int, str],  # in 3-letter codes
                   comparators__residues_param: List[Analyzer],
                   comparators__residue_ids_param: List[Analyzer],
                   comparators__domains__residues_param: List[Analyzer],
                   comparators__domains__residue_ids_param: List[Analyzer],
                   comparators__2domains__residues_param: List[Analyzer],
                   serializer_or_analysis_handler: AnalysisHandler,
                   domains_info: list,
                   ) -> None:
    """Run chain-, domain- and two-domain-level comparisons between two corresponding chains.

    E.g. one ligand-free (apo) and another ligand-bound (holo).

    Fix: an unconditional debug ``return`` sat right after the label_seq_id lists were
    computed, turning everything below it (all the actual analyses) into dead code.
    It has been removed; the newer variant of this function never had it.

    :param chain1: a Bio.PDB Chain, obtained as a part of a BioPython Structure object as usual
    :param chain2: a corresponding chain (same sequence), typically from a different PDB structure
    :param c1_residue_mapping: BioPython-residue-to-mmCIF label_seq_id mapping for chain1
    :param c2_residue_mapping: dtto for chain2
    :param c1_seq: label_seq_id -> 3-letter residue code for chain1
    :param c2_seq: dtto for chain2
    :param serializer_or_analysis_handler: receives every analysis result via ``handle``
    :param domains_info: output list; records about encountered domains are appended to it
    """
    s1_pdb_code = chain1.get_parent().get_parent().id
    s2_pdb_code = chain2.get_parent().get_parent().id
    logging.info(f'running analyses for ({s1_pdb_code}, {s2_pdb_code}) pair...')

    # todo: is the order in the atom_site loop guaranteed? If not, sort c1_seq/c2_seq by label_seq_id.
    # todo: assert entity_poly_seq has no gaps (label_seq_id always +1); the spec says "sequential",
    #       which presumably means exactly this.

    # crop polypeptides to their longest common substring
    c1_common_seq, c2_common_seq = get_longest_common_polypeptide(c1_seq, c2_seq)
    c1_label_seq_ids = list(c1_common_seq.keys())
    c2_label_seq_ids = list(c2_common_seq.keys())
    # (fix) a stray debug `return` used to be here, dead-coding the rest of the function
    label_seq_id_offset = c2_label_seq_ids[0] - c1_label_seq_ids[0]

    # Up to this point we have residue ids of the protein sequence in the experiment. That includes
    # unobserved residues; exclude those from the analysis as their positions weren't determined.
    c1_residues, c1_label_seq_ids, c2_residues, c2_label_seq_ids = get_observed_residues(
        chain1,
        c1_label_seq_ids,
        c1_residue_mapping,
        chain2,
        c2_label_seq_ids,
        c2_residue_mapping,
    )
    c1_residues = ChainResidues(c1_residues, s1_pdb_code, chain1.id)
    c2_residues = ChainResidues(c2_residues, s2_pdb_code, chain2.id)

    # todo a bit clumsy
    c1_residue_ids = ChainResidueData[ResidueId](
        [ResidueId(label_seq_id, chain1.id) for label_seq_id in c1_label_seq_ids],
        s1_pdb_code, chain1.id)
    c2_residue_ids = ChainResidueData[ResidueId](
        [ResidueId(label_seq_id, chain2.id) for label_seq_id in c2_label_seq_ids],
        s2_pdb_code, chain2.id)

    # chain-level analyses; the handler serializes the result together with full identifiers
    # of the compared objects
    for a in comparators__residues_param:
        serializer_or_analysis_handler.handle('chain2chain', a, a(c1_residues, c2_residues),
                                              c1_residues, c2_residues)
    for c in comparators__residue_ids_param:
        serializer_or_analysis_handler.handle('chain2chain', c, c(c1_residue_ids, c2_residue_ids),
                                              c1_residue_ids, c2_residue_ids)

    # domain-level analyses
    # get domains (sets of auth_seq_id), sort them by domain id and hope they correspond to each
    # other (alternatively, corresponding domains could be matched by the largest overlap)
    try:
        c1_domains = sorted(filter(lambda d: d.chain_id == chain1.id, get_domains(s1_pdb_code)),
                            key=lambda d: d.domain_id)
        c2_domains = sorted(filter(lambda d: d.chain_id == chain2.id, get_domains(s2_pdb_code)),
                            key=lambda d: d.domain_id)

        for pdb_code, domains in ((s1_pdb_code, c1_domains), (s2_pdb_code, c2_domains)):
            for d in domains:
                domains_info.append({
                    'type': 'full_domain',
                    'full_id': (pdb_code, d.chain_id, d.domain_id),
                    'pdb_code': pdb_code,
                    'chain_id': d.chain_id,
                    'domain_id': d.domain_id,
                    'spans': d.get_spans(),
                })
    except APIException as e:
        if e.__cause__ and '404' in str(e.__cause__):
            logging.warning(f'{s1_pdb_code} {s2_pdb_code} no domains found, skip the domain-level analysis')
            return  # no domains found, skip the domain-level analysis
        raise

    # note: len(c1_domains) == len(c2_domains) does not always hold, as expected

    # Map chain1's domains onto chain2 and crop both to the residues observed in both chains.
    c1_domains__residues = []
    c2_domains__residues = []
    for c1_d in c1_domains:
        # Remap the chain1 domain to chain2 (future: use longest common substrings, non-trivial
        # since domains can consist of multiple segments; the offset need not be uniform).
        c1_domain_mapped_to_c2 = DomainResidueMapping.from_domain_on_another_chain(
            c1_d, chain2.id, label_seq_id_offset)

        # todo why chain.get_parent? Probably so the chain needn't be specified explicitly.
        c1_d_residues = DomainResidues.from_domain(c1_d, chain1.get_parent(), c1_residue_mapping,
                                                   lambda id: id not in c1_label_seq_ids)
        c2_d_residues = DomainResidues.from_domain(c1_domain_mapped_to_c2, chain2.get_parent(),
                                                   c2_residue_mapping,
                                                   lambda id: id not in c2_label_seq_ids)
        if not c1_d_residues or not c2_d_residues:
            # the domain is not within the processed LCS of both chains (empty intersection with chain residues)
            logging.warning(f'domain {c1_d.domain_id} is not within the processed LCS of both chains (empty '
                            f'intersection with '
                            f'chain residues)')
            continue

        c1_domains__residues.append(DomainResidues(c1_d_residues.data, c1_d_residues.structure_id,
                                                   c1_d_residues.chain_id, c1_d_residues.domain_id))
        c2_domains__residues.append(DomainResidues(c2_d_residues.data, c2_d_residues.structure_id,
                                                   c2_d_residues.chain_id, c2_d_residues.domain_id))

    for residue_mapping, domains in ((c1_residue_mapping, c1_domains__residues),
                                     (c2_residue_mapping, c2_domains__residues)):
        for d in domains:
            domains_info.append({
                'type': 'analyzed_domain',
                'full_id': d.get_full_id(),
                'pdb_code': d.structure_id,
                'chain_id': d.chain_id,
                'domain_id': d.domain_id,
                'spans': d.get_spans(residue_mapping),
                'spans_auth_seq_id': d.get_spans(residue_mapping, auth_seq_id=True),
            })

    # Interdomain surface within each chain. NOTE(review): this is a single-structure quantity
    # computed here per pair; consider caching/moving it to one-structure analyses.
    for chain_domains in (c1_domains__residues, c2_domains__residues):
        for d1, d2 in itertools.combinations(chain_domains, 2):
            serializer_or_analysis_handler.handle('2DA', get_interdomain_surface,
                                                  get_interdomain_surface(d1, d2), d1, d2)

    for d_chain1, d_chain2 in zip(c1_domains__residues, c2_domains__residues):
        for a in comparators__domains__residues_param:
            serializer_or_analysis_handler.handle('domain2domain', a, a(d_chain1, d_chain2),
                                                  d_chain1, d_chain2)

    # todo resolve the odd ids
    for d_chain1, d_chain2 in zip(c1_domains__residues, c2_domains__residues):
        # Convert DomainResidues to DomainResidueData[ResidueId] through the residue mapping.
        d_chain1 = DomainResidueData[ResidueId](
            [ResidueId.from_bio_residue(r, c1_residue_mapping) for r in d_chain1],
            d_chain1.structure_id, d_chain1.chain_id, d_chain1.domain_id)
        d_chain2 = DomainResidueData[ResidueId](
            [ResidueId.from_bio_residue(r, c2_residue_mapping) for r in d_chain2],
            d_chain2.structure_id, d_chain2.chain_id, d_chain2.domain_id)

        for a in comparators__domains__residue_ids_param:
            serializer_or_analysis_handler.handle('domain2domain', a, a(d_chain1, d_chain2),
                                                  d_chain1, d_chain2)

    # two-domain arrangements to two-domain arrangements
    for (d1_chain1, d1_chain2), (d2_chain1, d2_chain2) in itertools.combinations(
            zip(c1_domains__residues, c2_domains__residues), 2):
        # (the paper considered a pair only if both apo and holo interdomain ifaces were >= 200 A^2)
        for a in comparators__2domains__residues_param:
            serializer_or_analysis_handler.handle('chain2DA2chain2DA', a,
                                                  a(d1_chain1, d2_chain1, d1_chain2, d2_chain2),
                                                  d1_chain1, d2_chain1, d1_chain2, d2_chain2)

        d1d2_chain1 = d1_chain1 + d2_chain1
        d1d2_chain2 = d1_chain2 + d2_chain2
        serializer_or_analysis_handler.handle('chain2DA2chain2DA', get_rmsd,
                                              get_rmsd(d1d2_chain1, d1d2_chain2),
                                              d1d2_chain1, d1d2_chain2)  # todo hardcoded analysis
def compare_chains(
    chain1: Chain,
    chain2: Chain,
    c1_residue_mapping: BiopythonToMmcifResidueIds.Mapping,
    c2_residue_mapping: BiopythonToMmcifResidueIds.Mapping,
    c1_seq: Dict[int, str],
    c2_seq: Dict[int, str],  # in 3-letter codes
    lcs_result: LCSResult,
    # comparators__residues_param: List[Analyzer],  # todo this doesn't exist in python 3.8,
    # # but these should rather be duck-typing type annotations anyway, because analyzers may be
    # # wrapped in caches/(de)serializers; and it would be composition rather than inheritance
    # # (e.g. no analyzer subclasses just to add cache functionality)
    # comparators__residue_ids_param: List[Analyzer],
    # comparators__domains__residues_param: List[Analyzer],
    # comparators__domains__residue_ids_param: List[Analyzer],
    # comparators__2DA__residues_param: List[Analyzer],
    # get_domains,
    # get_rmsd,
    # get_interdomain_surface,
    # serializer_or_analysis_handler: AnalysisHandler,
    # domains_info: list,
    # one_struct_analyses_done_set: dict,
) -> None:
    """Runs comparisons between two chains. E.g. one ligand-free (apo) and another ligand-bound (holo).

    Analyzers, the result handler, domain provider and bookkeeping state are taken from the
    thread/process-local `plocal` object rather than from parameters (see the commented-out
    parameter list above).

    :param chain1: A Bio.PDB Chain, obtained as a part of BioPython Structure object as usual
    :param chain2: A corresponding chain (same sequence), typically from a different PDB structure. See chain1.
    :param c1_residue_mapping: BioPython-residue-to-mmCIF label_seq_id mapping for chain1
    :param c2_seq: label_seq_id -> 3-letter code; c1_seq dtto for chain1
    :param lcs_result: longest-common-substring of the two sequences (start indices i1, i2 and length)
    """
    s1_pdb_code = chain1.get_parent().get_parent().id
    s2_pdb_code = chain2.get_parent().get_parent().id

    logger.info(f'running analyses for ({s1_pdb_code}, {s2_pdb_code}) pair...')

    # c1_seq, c2_seq todo: is the order in the atom_site loop guaranteed? If not, I should sort
    # the dict by label_seq_id.
    # also todo: is label_seq_id sequential, i.e. always +1? I already rely on that contiguity
    # assumption...
    # - todo assert in code that entity_poly_seq has no gaps (always +1); they say it's
    #   sequential, which I think means exactly this. Could have been done in filter_structures,
    #   just to know for sure, if that were extensible already.

    # crop polypeptides to longest common substring
    # todo - take this from the input (we have i1 and i2; getting the offset and the real label
    # seq ids is enough)
    i1, i2, length = lcs_result.i1, lcs_result.i2, lcs_result.length
    c1_common_seq = dict(itertools.islice(c1_seq.items(), i1, i1 + length))
    c2_common_seq = dict(itertools.islice(c2_seq.items(), i2, i2 + length))
    c1_label_seq_ids = list(c1_common_seq.keys())
    c2_label_seq_ids = list(c2_common_seq.keys())
    label_seq_id_offset = c2_label_seq_ids[0] - c1_label_seq_ids[0]

    # up to this point, we have residue ids of the protein sequence in the experiment. This also includes unobserved
    # residues, but those we will exclude from our analysis as their positions weren't determined
    c1_residues, c1_label_seq_ids, c2_residues, c2_label_seq_ids = get_observed_residues(
        chain1,
        c1_label_seq_ids,
        c1_residue_mapping,
        chain2,
        c2_label_seq_ids,
        c2_residue_mapping,
    )

    c1_residues = ChainResidues(c1_residues, s1_pdb_code, chain1.id)
    c2_residues = ChainResidues(c2_residues, s2_pdb_code, chain2.id)

    # todo a bit clumsy
    c1_residue_ids = ChainResidueData[ResidueId]([
        ResidueId(label_seq_id, chain1.id) for label_seq_id in c1_label_seq_ids
    ], s1_pdb_code, chain1.id)
    c2_residue_ids = ChainResidueData[ResidueId]([
        ResidueId(label_seq_id, chain2.id) for label_seq_id in c2_label_seq_ids
    ], s2_pdb_code, chain2.id)

    # [done] replaced polypeptides with the apo sequence here
    # [done] replaced author_seq_id in the analyzers (APIs)
    # todo match domains using this as well - SequenceMatcher could be used again
    # - but spans make it more complex -> for now, reindex apo or holo into the other one...

    def run_analysis(level_tag, analysis, *args):
        # Run a single analysis and hand the result to the (thread-local) handler; a failing
        # analysis is logged and skipped so the remaining ones still run.
        try:
            plocal.serializer_or_analysis_handler.handle(
                level_tag, analysis, analysis(*args), *args)
        except AnalysisException:
            logger.exception(
                'Caught exception while computing analysis, all others will be run normally'
            )

    # chain-level analyses
    for a in plocal.comparators__residues_param:
        # this fn (run_analyses_for_isoform_group) does not know anything about serialization?
        # But it will know how nested it is (domain->structure) and can pass full identifiers of structures/domains
        run_analysis(
            'chain2chain', a, c1_residues, c2_residues
        )  # in future maybe pass apo and holo. Will serialize itself. And output the object in rdf for example?
        # because what I would like is to output the analysis with objects identifiers, and then output the objects, what they contain (e.g. domain size?)

    for a in plocal.comparators__residue_ids_param:
        run_analysis('chain2chain', a, c1_residue_ids, c2_residue_ids)

    # domain-level analyses

    # get domains (set of auth_seq_id), sort them by domain id and hope they will correspond to each other
    # or could map corresponding domains by choosing the ones that have the most overlap?
    try:
        c1_domains = sorted(filter(lambda d: d.chain_id == chain1.id,
                                   plocal.get_domains(s1_pdb_code)),
                            key=lambda d: d.domain_id)
        c2_domains = sorted(filter(lambda d: d.chain_id == chain2.id,
                                   plocal.get_domains(s2_pdb_code)),
                            key=lambda d: d.domain_id)

        # # todo this could go somewhere else entirely, e.g. into one-struct remote analyses --
        # but will the whole json even fit into RAM? I have to evaluate the analyses in the
        # jupyter notebook somehow; how would I load absolutely everything into memory??
        # Probably won't... So either drop it from jupyter, or make an interactive job and run
        # jupyter on the cluster with e.g. 48 GB RAM and connect to it. Can a server even be
        # run there?
        for pdb_code, domains in ((s1_pdb_code, c1_domains), (s2_pdb_code, c2_domains)):
            key = (plocal.get_domains.get_name(), pdb_code)
            if key not in plocal.one_struct_analyses_done_set:
                # todo: I forgot to actually set this key, so the dedup set is effectively
                # unused. There are dupes, but it basically doesn't matter.
                for d in domains:
                    plocal.domains_info.append({
                        'type': 'full_domain',
                        'full_id': (pdb_code, d.chain_id, d.domain_id),
                        'pdb_code': pdb_code,
                        'chain_id': d.chain_id,
                        'domain_id': d.domain_id,
                        'spans': d.get_spans(),
                    })
    except APIException as e:
        if e.__cause__ and '404' in str(e.__cause__):
            logger.warning(
                f'{s1_pdb_code} {s2_pdb_code} no domains found, skip the domain-level analysis'
            )
            return  # no domains found, skip the domain-level analysis
        raise
    except MissingDataException:
        logger.warning(
            f'{s1_pdb_code} {s2_pdb_code} no domains found, skip the domain-level analysis'
        )
        return  # no domains found, skip the domain-level analysis

    # assert len(c1_domains) == len(c2_domains)  # not always true, as expected, but now OK

    # SequenceMatcher on domain residues
    c1_domains__residues = []
    c2_domains__residues = []

    # todo keeping this for now, but a de-facto domain LCS already exists
    for c1_d in c1_domains:  # or c2_domains:
        # first remap first domain to second (or in future use longest common substrings, but not trivial since domains can be composed of multiple segments)
        # the offset need not be the same everywhere
        c1_domain_mapped_to_c2 = DomainResidueMapping.from_domain_on_another_chain(
            c1_d, chain2.id, label_seq_id_offset)

        # todo why chain.get_parent?? Probably so the chain needn't be specified (but currently
        # I work only with chains..)
        c1_d_residues = DomainResidues.from_domain(
            c1_d, chain1.get_parent(), c1_residue_mapping,
            lambda id: id not in c1_label_seq_ids)
        c2_d_residues = DomainResidues.from_domain(
            c1_domain_mapped_to_c2, chain2.get_parent(), c2_residue_mapping,
            lambda id: id not in c2_label_seq_ids)

        if not c1_d_residues or not c2_d_residues:
            # the domain is not within the processed LCS of both chains (empty intersection with chain residues)
            logger.warning(
                f'domain {c1_d.domain_id} is not within the processed LCS of both chains (empty '
                f'intersection with '
                f'chain residues)')
            continue

        c1_domains__residues.append(
            DomainResidues(c1_d_residues.data, c1_d_residues.structure_id,
                           c1_d_residues.chain_id, c1_d_residues.domain_id))
        c2_domains__residues.append(
            DomainResidues(c2_d_residues.data, c2_d_residues.structure_id,
                           c2_d_residues.chain_id, c2_d_residues.domain_id))

    for residue_mapping, domains in ((c1_residue_mapping, c1_domains__residues),
                                     (c2_residue_mapping, c2_domains__residues)):
        for d in domains:
            # of course it's wasteful to store this every time the domain is paired with
            # something, possibly multiple times...
            # EDIT: not quite -- it is cropped to the observed residues of both chains of the
            # pair (and their LCS), so it does depend on the pair; I would have to adjust the
            # id and include the pair somehow...
            # dupes across jobs can be deleted via ['pair_id', 'full_id']
            plocal.domains_info.append({
                'type': 'analyzed_domain',
                'pair_id': (c1_residues.get_full_id(), c2_residues.get_full_id()
                            ),  # domain cropping depends on the paired chains
                'full_id': d.get_full_id(),
                'pdb_code': d.structure_id,
                'chain_id': d.chain_id,
                'domain_id': d.domain_id,
                'spans': d.get_spans(residue_mapping),
                'spans_auth_seq_id': d.get_spans(residue_mapping, auth_seq_id=True),
            })

    # todo: this (interdomain surface) also isn't in the arguments but hardcoded.. that's my fix...
    # todo: this is in fact not a pairwise comparison but a 'single-structure' one
    # - same as get_domains, get_ss (not compare_ss), actually the sequence too, etc.
    # - so caching the surface area here makes no sense; it would just be loaded from
    #   precomputed data as usual
    # - or: only these structure-level things make sense to "cache", i.e. not compute here for
    #   every pair but load from a file/unpickle -- probably not unpickle, rather have
    #   serialize/deserialize (I want it as readable output anyway),
    # - and then possibly deserialize everything into memory for speed...
    # well, looking forward to the json/pandas-merge hell again.. merges everywhere, dupe
    # columns everywhere / making an index (which you then sometimes need back in columns...)
    # TODO either put it into 1struct (but I don't want to load mmCIFs there -- quite slow I'd
    # say (how much was it, 17 percent of time? certainly a lot... not sure I'd load 40K
    # structures that fast.. probably not); the 50/75 percentile takes 0.5 s, so easily 40K
    # seconds = 10 hours... long, even in many threads.. so I'll do it once per job)
    # this still has the domain args, doesn't matter
    # actually no.. I said each domain may be defined differently depending on the other member
    # of the pair.. so todo, add pair id again? - but then I'll have to adjust something...
    for chain_domains in (c1_domains__residues, c2_domains__residues):
        for d1, d2 in itertools.combinations(chain_domains, 2):
            # key = (plocal.get_interdomain_surface.get_name(),) + (d1.get_full_id(), d2.get_full_id())

            pair_id = (c1_residues.get_full_id(),
                       c2_residues.get_full_id())  # hack: I want pair_id in there as well
            plocal.serializer_or_analysis_handler.handle(
                '2DA', plocal.get_interdomain_surface,
                plocal.get_interdomain_surface(d1, d2), d1, d2, pair_id)
            # if key not in plocal.one_struct_analyses_done_set:
            #     plocal.one_struct_analyses_done_set[key] = 1

    for d_chain1, d_chain2 in zip(c1_domains__residues, c2_domains__residues):
        for a in plocal.comparators__domains__residues_param:
            run_analysis('domain2domain', a, d_chain1, d_chain2)

    # todo resolve the odd ids
    for d_chain1, d_chain2 in zip(c1_domains__residues, c2_domains__residues):
        # Convert DomainResidues to DomainResidueData[ResidueId]
        # probably via the mapping again... it would be nicer if it were implicitly attached to
        # the biopython residue (otherwise it's hardly possible, unless there's a CustomResidue
        # with bio-residue and label_seq_id fields, which is rather ugly, or is it? It wouldn't
        # work with a chain, but I don't use that anywhere anyway...)
        d_chain1 = DomainResidueData[ResidueId]([
            ResidueId.from_bio_residue(r, c1_residue_mapping) for r in d_chain1
        ], d_chain1.structure_id, d_chain1.chain_id, d_chain1.domain_id)
        d_chain2 = DomainResidueData[ResidueId]([
            ResidueId.from_bio_residue(r, c2_residue_mapping) for r in d_chain2
        ], d_chain2.structure_id, d_chain2.chain_id, d_chain2.domain_id)
        for a in plocal.comparators__domains__residue_ids_param:
            run_analysis('domain2domain', a, d_chain1, d_chain2)

    # two-domain arrangements to two-domain arrangements
    for (d1_chain1, d1_chain2), (d2_chain1, d2_chain2) in itertools.combinations(
            zip(c1_domains__residues, c2_domains__residues), 2):
        # (in paper considered if of both apo and holo interdomain iface >= 200 A^2
        # if get_interdomain_surface(d1_chain1, d2_chain1) < 200 or get_interdomain_surface(d1_chain2, d2_chain2) < 200:
        #     continue

        for a in plocal.comparators__2DA__residues_param:
            run_analysis('chain2DA2chain2DA', a, d1_chain1, d2_chain1, d1_chain2,
                         d2_chain2)

        d1d2_chain1 = d1_chain1 + d2_chain1
        d1d2_chain2 = d1_chain2 + d2_chain2
        run_analysis('chain2DA2chain2DA', plocal.get_rmsd, d1d2_chain1,
                     d1d2_chain2)  # todo hardcoded analysis