def free_cys_tyr(pdb_utils):
    """Record the free cysteine / tyrosine residues of every pending structure.

    Structures returned by the ``residue_sets__name__ne="free_tyr"`` query that
    have neither a "free_cys" nor a "free_tyr" residue set are parsed from
    their PDB file.  A CYS (atom SG) or TYR (atom OH) residue is considered
    free when no atom of any *other* residue in the first model lies within a
    distance of 3 (PDB coordinate units) of that atom.  The resulting
    "free_cys" / "free_tyr" residue sets (entries are ``<chain>_<resnum>``) are
    appended to the document, which is saved only when at least one is found.

    :param pdb_utils: object providing ``pdb_path(name)`` and
        ``update_pdb(name)``; the latter is called when the file is missing.
    """
    parser = PDBParser(PERMISSIVE=1, QUIET=1)
    _log.debug("procesing free cys/tyr")
    # Atom inspected for each residue type of interest.
    codes = {"CYS": "SG", "TYR": "OH"}

    total = ExperimentalStructure.objects(residue_sets__name__ne="free_tyr").count()
    pending = ExperimentalStructure.objects(
        residue_sets__name__ne="free_tyr").no_cache().timeout(False)
    for strdoc in tqdm(pending, total=total):
        if strdoc.residue_set("free_cys") or strdoc.residue_set("free_tyr"):
            continue

        pdb_file = pdb_utils.pdb_path(strdoc.name)
        if not os.path.exists(pdb_file):
            # Try to fetch the file once; skip the structure if still missing.
            pdb_utils.update_pdb(strdoc.name)
            pdb_file = pdb_utils.pdb_path(strdoc.name)
            if not os.path.exists(pdb_file):
                continue

        try:
            bp_pdb = list(parser.get_structure(strdoc.name, pdb_file))[0]
        except (PDBConstructionException, TypeError, IndexError):
            # IndexError: the parsed structure has no models at all; treat it
            # like any other unparsable file instead of aborting the whole run.
            continue

        # Built once per structure (it used to be rebuilt for every residue).
        all_atoms = set(bp_pdb.get_atoms())
        free = {"CYS": [], "TYR": []}
        for residue in bp_pdb.get_residues():
            atom_name = codes.get(residue.resname)
            if atom_name is None or atom_name not in residue:
                continue
            reactive_atom = residue[atom_name]
            neighbor_atoms = all_atoms - set(residue)
            if not any((reactive_atom - atom) <= 3 for atom in neighbor_atoms):
                free[residue.resname].append(
                    residue.parent.id + "_" + str(residue.id[1]))

        if free["CYS"]:
            strdoc.residue_sets.append(
                ResidueSet(name="free_cys", residues=free["CYS"]))
        if free["TYR"]:
            strdoc.residue_sets.append(
                ResidueSet(name="free_tyr", residues=free["TYR"]))
        if free["CYS"] or free["TYR"]:
            strdoc.save()
def important_pfam(seqs_from_pdb_hmm):
    """Store Pfam-derived residue sets parsed from an hmmer3-text result file.

    Each query id is expected to look like ``<pdb>_<chain>_<start>_<end>``.
    Structures that already carry an "important_pfam" residue set are skipped,
    as are pdb codes with no stored document.  For every hit with at least one
    HSP a residue set named after the hit receives one ``<chain>_<number>``
    entry per column of the HSP's second aligned sequence, numbered from
    ``start``; columns whose character equals its upper-case form are also
    added to the shared "important_pfam" set.

    :param seqs_from_pdb_hmm: path or handle accepted by ``bpsio.parse``.
    """
    for hmm_query in tqdm(bpsio.parse(seqs_from_pdb_hmm, 'hmmer3-text')):
        try:
            pdb_code, chain_id, seg_start, _seg_end = hmm_query.id.split("_")

            already_done = ExperimentalStructure.objects(
                name=pdb_code, residue_sets__name="important_pfam").count()
            if already_done:
                continue

            strdoc = ExperimentalStructure.objects(name=pdb_code).get()
            if strdoc.residue_set("important_pfam"):
                continue

            important_rs = ResidueSet(name="important_pfam")
            domain_rs = None
            for hit in hmm_query:
                if len(hit):
                    domain_rs = ResidueSet(name=hit.id)
                    aligned_text = str(hit[0].aln[1].seq)
                    for offset, symbol in enumerate(aligned_text):
                        residue = chain_id + "_" + str(offset + int(seg_start))
                        if symbol == symbol.upper():
                            important_rs.residues.append(residue)
                        domain_rs.residues.append(residue)
                if domain_rs:
                    strdoc.residue_sets.append(domain_rs)

            strdoc.residue_sets.append(important_rs)
            strdoc.save()
        except DoesNotExist:
            # No stored structure for this pdb code: nothing to annotate.
            pass
def update_quaternary(pdbUtils):
    """Fill ``quaternary`` on every structure that lacks it, from REMARK 350.

    The PDB file of each structure is scanned for ``REMARK 350`` lines, which
    are summarised by ``_quaternary_summary`` and stored on the document.

    Example - author and computed assembly predictions agree::

        REMARK 350 BIOMOLECULE: 1
        REMARK 350 AUTHOR DETERMINED BIOLOGICAL UNIT: DODECAMERIC
        REMARK 350 SOFTWARE DETERMINED QUATERNARY STRUCTURE: DODECAMERIC

    Example - several biomolecules, author and software differ::

        REMARK 350 BIOMOLECULE: 1
        REMARK 350 AUTHOR DETERMINED BIOLOGICAL UNIT: HEXAMERIC
        REMARK 350 APPLY THE FOLLOWING TO CHAINS: A, B, C, D, E, F
        REMARK 350 BIOMOLECULE: 2
        REMARK 350 AUTHOR DETERMINED BIOLOGICAL UNIT: HEXAMERIC
        REMARK 350 APPLY THE FOLLOWING TO CHAINS: G, H, I, J, K, L
        REMARK 350 BIOMOLECULE: 3
        REMARK 350 SOFTWARE DETERMINED QUATERNARY STRUCTURE: DODECAMERIC
        REMARK 350 SOFTWARE USED: PISA

    :param pdbUtils: object providing ``pdb_path(name)``.

    Files that are missing or whose records cannot be split on ":" are logged
    at debug level and skipped.
    """
    total = ExperimentalStructure.objects().count()
    for strdoc in tqdm(ExperimentalStructure.objects().no_cache(), total=total):
        if strdoc.quaternary:
            continue
        try:
            with open(pdbUtils.pdb_path(strdoc.name)) as h:
                data = [l for l in h if l.startswith("REMARK 350")]
            strdoc.quaternary = _quaternary_summary(data)
            strdoc.save()
        except IndexError:
            _log.debug("no se puede parsear %s" % strdoc.name)
        except FileNotFoundError:
            _log.debug(f"{strdoc.name} could not be found")


def _first_remark_value(section, marker):
    """Return the text after ':' of the first line containing *marker*, or ''."""
    for line in section:
        if marker in line:
            return line.split(":")[1].strip()
    return ""


def _quaternary_summary(remark_lines):
    """Summarise REMARK 350 lines as a quaternary-structure description.

    Every "BIOMOLECULE:" line opens a section that runs up to the next such
    line (or the end of the list).  For each section the author-determined
    unit is preferred and the software-determined one is the fallback.

    :param remark_lines: list of "REMARK 350" lines from one PDB file.
    :return: ``""`` when there is no biomolecule, the bare label when there is
        exactly one, otherwise one ``- Biomolecule <n>: <label>`` line per
        biomolecule joined with newlines.
    :raises IndexError: if a matching record has no ':' separator.
    """
    starts = [i for i, line in enumerate(remark_lines) if "BIOMOLECULE:" in line]
    entries = []
    # Consecutive pairs (start, next start); pairing even/odd indices instead
    # would silently drop every second biomolecule.
    for begin, end in zip(starts, starts[1:] + [None]):
        section = remark_lines[begin:end]
        number = remark_lines[begin].split(":")[1].strip()
        author = _first_remark_value(section, "AUTHOR DETERMINED BIOLOGICAL UNIT")
        program = _first_remark_value(section, " SOFTWARE DETERMINED QUATERNARY STRUCTURE")
        entries.append((number, author or program))

    if len(entries) == 1:
        return entries[0][1]
    return "\n".join(
        "- Biomolecule " + number + ": " + label for number, label in entries)
def update_binding_residues(distances_tbl):
    """Add ``<type>_binding`` residue sets from a residue/compound distance table.

    The tab-separated table (no header) is read with the columns listed below.
    Each compound residue name is translated through ``compound_type``; rows
    with ``distance <= 3`` are grouped per pdb and per compound type, and the
    distinct ``<chain>_<prot_res>`` values become a residue set of type
    "binding" unless the structure already has one with that name.

    :param distances_tbl: path or buffer accepted by ``pandas.read_table``.

    A pdb code without a stored structure is reported with a warning.
    """
    df_binding_dist = pd.read_table(distances_tbl, sep="\t", names=[
        "pdb", "chain", "hmm_name", "prot_res", "resname", "res_atom_id",
        "comp_res_id", "comp_resname", "comp_atom_id", "distance"
    ])
    # A list, not map(): under Python 3 map() is a lazy iterator and does not
    # yield a per-row column when assigned to the frame.
    df_binding_dist["comptype"] = [
        compound_type[resname] for resname in df_binding_dist.comp_resname]

    groups = df_binding_dist.groupby("pdb")
    _log.debug("procesing binding")
    for pdb, df_binding_dist_pdb in tqdm(groups, total=len(groups)):
        try:
            for r_comp_type in ['LIPID', 'METAL', 'NUCLEOTIDE', 'SUGAR', "DRUG", "COFACTOR"]:
                # NOTE(review): "COFACTOR" is folded into "DRUG" before
                # filtering, so this pass repeats the "DRUG" one and rows typed
                # "COFACTOR" are never selected - confirm the intent.
                comp_type = "DRUG" if r_comp_type == "COFACTOR" else r_comp_type
                near = df_binding_dist_pdb[
                    (df_binding_dist_pdb.distance <= 3)
                    & (df_binding_dist_pdb.comptype == comp_type)]
                if not len(near):
                    continue

                strdoc = ExperimentalStructure.objects(name=pdb).no_cache().get()
                rs_name = comp_type.lower() + "_binding"
                if strdoc.has_residue_set(rs_name):
                    continue

                residue_list = list({
                    chain + "_" + str(prot_res)
                    for chain, prot_res in zip(near.chain, near.prot_res)})
                strdoc.residue_sets.append(
                    ResidueSet(name=rs_name, residues=residue_list, type="binding"))
                strdoc.save()
        except DoesNotExist:
            _log.warning("%s does not exists" % pdb)
def update_clusters():
    """Attach the "PDB_Segments_95" CD-HIT clusters to their structures.

    Every cluster read from the fixed FASTA path becomes a ``Cluster`` whose
    parts describe its member segments (sequence ids look like
    ``<pdb>_<chain>_<start>_<end>``).  For each distinct pdb in the cluster,
    the structure's existing clusters of that type are dropped and, when
    ``cluster(cluster_name)`` finds nothing, the new cluster is appended and
    the document saved.  Unknown pdb codes are printed and skipped.
    """
    cluster_type = "PDB_Segments_95"
    clustered = CDHit().clustered_seq_iterator(
        "/data/databases/pdb/processed/seqs_from_pdb95.fasta")
    for cluster_name, members in clustered:
        _log.debug(cluster_name)
        pdb_codes = []
        cluster = Cluster(name=cluster_name, type="PDB_Segments_95")
        for seq_id, seq_start, seq_end, clust_start, clust_end in members:
            pdb_code, chain, start, end = seq_id.split("_")
            pdb_codes.append(pdb_code)
            part = BioProperties(pdb=pdb_code, chain=chain, start=start, end=end,
                                 seq_start=seq_start, seq_end=seq_end,
                                 clust_start=clust_start, clust_end=clust_end)
            cluster.parts.append(part)

        for pdb_code in set(pdb_codes):
            try:
                structure_doc = ExperimentalStructure.objects(name=pdb_code).get()
                kept = []
                for existing in structure_doc.clusters:
                    if existing.type != cluster_type:
                        kept.append(existing)
                structure_doc.clusters = kept
                if not structure_doc.cluster(cluster_name):
                    structure_doc.clusters.append(cluster)
                    structure_doc.save()
            except DoesNotExist as ex:
                print(str(ex))
def procesar_pdb(pdb, pdbUtils, error_log="/tmp/pdb_load_errors.txt"):
    """Parse one PDB file and store it as an ``ExperimentalStructure``.

    The first model of the parsed structure is walked chain by chain: each
    chain becomes a ``Chain`` document, residues typed 'RESIDUE' by
    ``get_compound_type`` go to the chain, every other non-'SOLVENT' molecule
    goes to the structure's ligands.  Pockets are completed on a best-effort
    basis, then the remaining attributes are completed and the document saved.

    :param pdb: pdb code, used as the document name.
    :param pdbUtils: object providing ``pdb_path(pdb)``.
    :param error_log: file that problems are appended to, one line each.

    Nothing is raised for ordinary failures: a missing file, a structure
    without models, a pocket failure or any other ``Exception`` is appended to
    ``error_log``.
    """

    def report(message):
        # Append a single line to the error log.
        with open(error_log, "a") as handle:
            handle.write(message)

    pdb_file = pdbUtils.pdb_path(pdb)
    if not os.path.exists(pdb_file):
        report(pdb + "|NOT FOUND: " + pdb_file + " \n")
        # Stop here: parsing a missing file would only log a second error.
        return

    try:
        structure = parser.get_structure(pdb, pdb_file)
        models = list(structure)
        if not models:
            report("Has no models: " + pdb_file + " \n")
            return

        strdoc = ExperimentalStructure(name=pdb, seq_collection_name="pdb")
        model = structure[0]
        for chain in model:
            chaindoc = Chain(name=chain.id,
                             segments=[[residue.id[1] for residue in chain]])
            strdoc.chains.append(chaindoc)
            for residue in chain:
                # Classified once per residue instead of three times.
                kind = get_compound_type(residue)
                molecule = Molecule(resid=residue.id[1], chain=chain.id,
                                    compound=residue.get_resname(),
                                    compound_type=kind)
                if kind == 'RESIDUE':
                    chaindoc.residues.append(molecule)
                elif kind != 'SOLVENT':
                    strdoc.ligands.append(molecule)

        try:
            complete_pockets(pdb, strdoc, structure, pdbUtils)
        except Exception as ex:
            # Pockets stay best-effort, but the failure is no longer silent.
            report(pdb + "|POCKETS: " + str(ex) + "\n")

        complete_pdb_attrs(pdb, strdoc, pdbUtils)
        strdoc.save()
    except Exception as ex:
        report(pdb + "|" + str(ex) + "\n")
def cluster_aligned_annotation(self, model, template_aln, segment, cluster_aligned_segment, cluster_res_start):
    """Project csa / binding residues of a cluster-mate structure onto *model*.

    Nothing happens when ``segment`` (formatted as
    ``<pdb>_<chain>_<start>_<end>``) has the same name as the template hit.
    Otherwise the structure named ``segment.pdb`` is loaded, a residue range
    ``[eq_start_residue, eq_end_residue]`` is derived, and the "csa" and
    ``<type>_binding`` residue sets falling in that range and in the aligned
    positions are mapped through ``template_aln.map_pos_hit_query`` and
    appended to ``model.residue_sets`` with a ``*_projected`` type.

    :param model: object whose ``residue_sets`` list is extended in place.
    :param template_aln: alignment between the query and the template hit.
    :param segment: cluster member; ``pdb``, ``chain``, ``start``, ``end``,
        ``clust_start`` and ``clust_end`` are read.
    :param cluster_aligned_segment: residue set whose first / last
        ``residue_numbers()`` are compared with the segment's cluster bounds.
    :param cluster_res_start: offset subtracted when mapping the template end
        residue (presumably the template's start within the cluster - TODO
        confirm with the caller).
    """
    segment.clust_start  # bare attribute access; the value is not used
    segment_start = int(segment.start)
    segment_end = int(segment.end)
    segment_str = "_".join(
        [segment.pdb, segment.chain, str(segment_start).split(".")[0], str(segment.end).split(".")[0]])
    # Only segments other than the template itself are projected.
    if segment_str != template_aln.aln_hit.name:
        template_eq = ExperimentalStructure.objects(name=segment.pdb).get()
        # prot          IVAGRVSQKMAPVLRQIYDQMAEPKWVLAMGVCAS
        # template      **IVAGAAS--MAPVLQQIYDQM--PKWVLAMGVC--**
        # template_eq   -----AS--MAPVVQQILDQ---------------
        # template_eq2  **IVAAAS--MAPVVQQILDQ---------------
        # template_eq3  **IVAAAS--MAPVVQQILDQQM--PKWVLAMGVC--**
        # Shifts a template residue number by the cluster and segment offsets.
        template_res_to_cluster_res_map = lambda template_res: (
            template_res - cluster_res_start) - segment.clust_start + segment_start
        # Start of the range: the segment start, moved forward by the hit start
        # when the aligned part begins before the segment's cluster start.
        alignment_start_after_cluster_start = cluster_aligned_segment.residue_numbers()[0] >= segment.clust_start
        eq_start_residue = segment_start if alignment_start_after_cluster_start else segment_start + template_aln.aln_hit.start
        # End of the range: the segment end, or the mapped template end when
        # the aligned part runs past the segment's cluster end.
        alignment_end_before_cluster_end = cluster_aligned_segment.residue_numbers()[-1] <= segment.clust_end
        eq_end_residue = segment_end if alignment_end_before_cluster_end else template_res_to_cluster_res_map(
            template_aln.hit_res_end)
        # One "<chain>_<n>" entry per non-gap alignment column, numbered from
        # eq_start_residue.
        eq_aligned_segment = ResidueSet(name="aln", residues=[segment.chain + "_" + str(i + eq_start_residue) for i in range(len(template_aln.aln_query.seq)) if not template_aln.is_gap(i)])

        def map_eq_resid_to_query_resid(eq_res_id, template_aln, segment, segment_start):
            # Segment residue -> offset in segment -> cluster position ->
            # query position, as given by the alignment.
            pos_in_eq = eq_res_id - segment_start
            pos_in_template = pos_in_eq + segment.clust_start
            pos_in_query = template_aln.map_pos_hit_query(pos_in_template)  # - template_aln.aln_hit.start
            return pos_in_query

        # NOTE(review): assumes residue_set("csa") always returns an object
        # with in_range(); verify what it returns when the set is absent.
        eq_aligned_csas = (template_eq.residue_set("csa").in_range(eq_start_residue, eq_end_residue) & eq_aligned_segment).residue_numbers()
        if eq_aligned_csas:
            aligned_csas_projected = [map_eq_resid_to_query_resid(eq_res_id, template_aln, segment, segment_start) for eq_res_id in eq_aligned_csas]
            # Projected residues carry an empty chain: "_<resid>".
            residue_set_csa = ResidueSet(name="csa_" + segment.pdb, type="catalitic_projected", residues=["_" + str(resid) for resid in aligned_csas_projected])
            model.residue_sets.append(residue_set_csa)
        # Same projection for each compound type's binding residues.
        for comp_type in main_compound_types:
            binding_name = comp_type.lower() + "_binding"
            eq_aligned_binding = (template_eq.residue_set(binding_name).in_range(eq_start_residue, eq_end_residue) & eq_aligned_segment).residue_numbers()
            if eq_aligned_binding:
                # Falsy residue numbers (e.g. 0) are dropped by the filter.
                aligned_binding_projected = [
                    map_eq_resid_to_query_resid(eq_res_id, template_aln, segment, segment_start) for eq_res_id in
                    eq_aligned_binding if eq_res_id]
                residue_set_binding = ResidueSet(name=binding_name + "_" + segment.pdb, type=binding_name + "_projected", residues=["_" + str(resid) for resid in aligned_binding_projected])
                model.residue_sets.append(residue_set_binding)
def aligned_annoations(self, model):
    """Rebuild the projected csa / binding residue sets of *model*.

    Residue sets whose name starts with "csa" or ``<type>_binding`` are
    removed first.  Then, for every template alignment whose hit start and end
    are not both -1, the template structure's "csa" and ``<type>_binding``
    residues that fall inside the aligned range and on aligned positions are
    translated to query residue ids (``" _<q_resid>"``) and appended to
    ``model.residue_sets`` with a ``*_projected`` type.  Templates without a
    stored structure are printed and skipped.

    :param model: object with ``residue_sets`` and ``templates``; modified in
        place.
    """
    projected_prefixes = tuple(
        ["csa"] + [ctype.lower() + "_binding" for ctype in main_compound_types])
    model.residue_sets = [
        rs for rs in model.residue_sets if not rs.name.startswith(projected_prefixes)]

    for template_aln in model.templates:
        pdb, chain = template_aln.aln_hit.name.split("_")
        hit_start = template_aln.aln_hit.start
        hit_end = template_aln.aln_hit.end
        try:
            template = ExperimentalStructure.objects(name=pdb).get()
        except Structure.DoesNotExist as ex:
            print([ex, pdb])
            continue

        # A hit whose start and end are both -1 carries no alignment.
        is_aligned = (int(hit_start) != -1) | (int(hit_end) != -1)
        if not is_aligned:
            continue

        first_resid = template_aln[0].h_resid
        last_resid = template_aln[-1].h_resid
        aligned_ids = [
            chain + "_" + str(template_aln[pos].h_resid)
            for pos in range(len(template_aln.aln_query.seq))
            if not template_aln.is_gap(pos)]
        aligned_segment = ResidueSet(name="aln", residues=aligned_ids)

        aligned_csas = template.residue_set("csa").in_range(first_resid, last_resid) & aligned_segment
        if len(aligned_csas):
            projected_csas = []
            for residue in aligned_csas.residues:
                h_resid = int(residue.split("_")[1])
                q_resid = template_aln.aln_pos_from_h_resid(h_resid).q_resid
                projected_csas.append(" _" + str(q_resid))
            model.residue_sets.append(
                ResidueSet(name="csa_" + pdb, residues=projected_csas,
                           type="catalitic_projected"))

        for comp_type in main_compound_types:
            binding_name = comp_type.lower() + "_binding"
            aligned_binding = template.residue_set(binding_name).in_range(first_resid, last_resid) & aligned_segment
            projected_binding = [
                " _" + str(template_aln.aln_pos_from_h_resid(pdb_resid).q_resid)
                for pdb_resid in aligned_binding.residue_numbers()]
            if projected_binding:
                model.residue_sets.append(
                    ResidueSet(name=binding_name + "_" + pdb,
                               residues=projected_binding,
                               type=binding_name + "_projected"))
def update_csa(csa_txt):
    """Add the "csa" residue set to every structure listed in a CSA table.

    The CSV must provide the columns "PDB ID", "CHAIN ID" and
    "RESIDUE NUMBER".  For each distinct pdb id with a stored structure that
    has no "csa" residue set yet, one ``<chain>_<residue number>`` entry per
    table row is collected into a residue set of type "catalitic", which is
    appended to the document and saved.

    :param csa_txt: path or buffer accepted by ``pandas.read_csv``.

    Pdb ids without a stored structure are reported with a warning.
    """
    df_csa = pd.read_csv(csa_txt)
    for pdb in tqdm(set(df_csa["PDB ID"])):
        try:
            strdoc = ExperimentalStructure.objects(name=pdb).no_cache().get()
        except ExperimentalStructure.DoesNotExist:
            # Lazy formatting also copes with ids that are not strings.
            _log.warning("%s csa pdb does not exists...", pdb)
            continue

        if strdoc.has_residue_set("csa"):
            continue
        pdb_csa = df_csa[df_csa["PDB ID"] == strdoc.name]
        if not len(pdb_csa):
            continue

        csas = [str(chain) + "_" + str(resnum)
                for chain, resnum in zip(pdb_csa["CHAIN ID"], pdb_csa["RESIDUE NUMBER"])]
        strdoc.residue_sets.append(
            ResidueSet(name="csa", type="catalitic", residues=csas))
        # Persist the new set; without the save the append is lost.
        strdoc.save()
"pockets.0": { "$exists": 0 } }, {"name": 1}) } procesados = { x["name"]: 1 for x in db.structures.find({"seq_collection_name": "pdb"}, {"name": 1}) } pdbs = list(pdbUtils) for (pdb, pdb_file) in tqdm(pdbs): if pdb in procesados: if pdb in procesados_sin_pocket: q = ExperimentalStructure.objects(seq_collection_name="pdb", name=pdb).no_cache() if q: strdoc = q.get() try: structure = parser.get_structure( pdb, pdbUtils.pdb_path(pdb)) try: complete_pockets(pdb, strdoc, structure, pdbUtils) strdoc.save() except: pass complete_pdb_attrs(pdb, strdoc, pdbUtils) strdoc.save() except Exception as ex: