def __rebuildCache(self, **kwargs):
    """Load the SIFTS summary mapping from cache, or regenerate and cache it.

    Keyword Args:
        srcDirPath (str): directory containing the SIFTS source files
        cacheKwargs (dict): cache serialization options (default {"fmt": "pickle"})
        useCache (bool): read an existing cache file when available (default True)
        entrySaveLimit (int): optionally truncate the cached mapping to this many entries
        abbreviated (str): summary mapping granularity selector (default "TEST")

    Returns:
        dict: SIFTS summary mapping (empty on failure)
    """
    marshal = MarshalUtil()
    sourcePath = kwargs.get("srcDirPath", None)
    serialKwargs = kwargs.get("cacheKwargs", {"fmt": "pickle"})
    useCache = kwargs.get("useCache", True)
    saveLimit = kwargs.get("entrySaveLimit", None)
    abbreviated = str(kwargs.get("abbreviated", "TEST")).upper()
    cacheDir = self.__cacheDirPath
    # cache file name carries the major python version and serialization extension
    fileExt = "pic" if serialKwargs["fmt"] == "pickle" else "json"
    cacheFile = os.path.join(cacheDir, "sifts-summary-py%s.%s" % (str(sys.version_info[0]), fileExt))
    summaryD = {}
    try:
        if useCache and os.access(cacheFile, os.R_OK):
            summaryD = marshal.doImport(cacheFile, **serialKwargs)
        elif not sourcePath:
            logger.error("Missing SIFTS source path details")
        else:
            summaryD = self.__getSummaryMapping(sourcePath, abbreviated=abbreviated)
            if saveLimit:
                summaryD = {ky: summaryD[ky] for ky in list(summaryD.keys())[:saveLimit]}
            marshal.mkdir(cacheDir)
            ok = marshal.doExport(cacheFile, summaryD, **serialKwargs)
            logger.debug("Saving SIFTS summary serialized data file %s (%d) status %r", cacheFile, len(summaryD), ok)
    except Exception as e:
        logger.exception("Failing with %s", str(e))
    return summaryD
def jsonSchemaCompare(self, databaseName, collectionName, encodingType, level, extraOpts=None):
    """Compare a freshly computed JSON schema definition with the cached source version.

    Args:
        databaseName (str): schema name
        collectionName (str): collection name
        encodingType (str): schema data type conventions (JSON|BSON)
        level (str): metadata level (min|full)
        extraOpts (str): extra schema construction options

    Returns:
        (str): path to the difference file or None
    """
    marshal = MarshalUtil(workPath=self.__workPath)
    diffDirPath = os.path.join(self.__cachePath, "schema_diff")
    marshal.mkdir(diffDirPath)
    schemaLocator = self.__getJsonSchemaLocator(databaseName, collectionName, encodingType, level)
    schemaFileName = self.__fileU.getFileName(schemaLocator)
    cachedSchemaPath = os.path.join(self.__jsonSchemaCachePath, schemaFileName)
    # Build the current schema (without persisting it) and recover its version
    computedSchema = self.makeSchema(databaseName, collectionName, encodingType=encodingType, level=level, saveSchema=False, extraOpts=extraOpts)
    versionNew = self.__getSchemaVersion(computedSchema)
    # Load the cached schema for comparison
    cachedSchema = marshal.doImport(cachedSchemaPath, fmt="json")
    versionOld = self.__getSchemaVersion(cachedSchema)
    if not versionOld:
        logger.error("no version for %s - %s %s", schemaLocator, databaseName, collectionName)
    numDiff, difD = self.schemaCompare(cachedSchema, computedSchema)
    diffPath = None
    if numDiff:
        logger.debug("diff for %s %s %s %s = \n%s", databaseName, collectionName, encodingType, level, pprint.pformat(difD, indent=3, width=100))
        baseName, _ = os.path.splitext(schemaFileName)
        diffPath = os.path.join(diffDirPath, baseName + "-" + versionOld + "-" + versionNew + "-diff.json")
        marshal.doExport(diffPath, difD, fmt="json", indent=3)
    return diffPath
def testRoundTripOps(self):
    """Round-trip generated related molecular forms through mol2 and sdf files."""
    try:
        ioU = OeIoUtils()
        marshal = MarshalUtil()
        marshal.mkdir(self.__molfileDirPath)
        ccDefD = self.__getChemCompDefs()
        molFactory = OeMoleculeFactory()
        for ccId, ccObj in list(ccDefD.items())[:10]:
            assignedId = molFactory.setChemCompDef(ccObj)
            self.assertEqual(assignedId, ccId)
            relatedIdxD = molFactory.buildRelated(limitPerceptions=False)
            logger.info("%s generated %d molecular forms", ccId, len(relatedIdxD))
            for sId, idxD in relatedIdxD.items():
                logger.info("sId %r smiles %r", sId, idxD["smiles"])
                # mol2 round trip: write from SMILES, read back, rewrite first molecule
                mol2Path = os.path.join(self.__molfileDirPath, sId + ".mol2")
                oeMol = ioU.descriptorToMol(idxD["smiles"], "oe-iso-smiles", limitPerceptions=False, messageTag=None)
                ioU.write(mol2Path, oeMol, constantMol=True, addSdTags=True)
                readMolL = ioU.fileToMols(mol2Path)
                nextMol2Path = os.path.join(self.__molfileDirPath, sId + "-next.mol2")
                ioU.write(nextMol2Path, readMolL[0], constantMol=True, addSdTags=True)
                # sdf round trip: same sequence using the molfile format
                sdfPath = os.path.join(self.__molfileDirPath, sId + ".mol")
                oeMol = ioU.descriptorToMol(idxD["smiles"], "oe-iso-smiles", limitPerceptions=False, messageTag=None)
                ioU.write(sdfPath, oeMol, constantMol=True, addSdTags=True)
                readMolL = ioU.fileToMols(sdfPath)
                nextSdfPath = os.path.join(self.__molfileDirPath, sId + "-next.sdf")
                ioU.write(nextSdfPath, readMolL[0], constantMol=True, addSdTags=True)
    except Exception as e:
        logger.exception("Failing with %s", str(e))
        self.fail()
def __rebuildCache(self, urlTargetIsoLtwa, dirPath, useCache):
    """Rebuild the cache of ISO abbreviation term data

    Args:
        urlTargetIsoLtwa (str): URL for ISO4 LTWA title word abbreviations
        dirPath (str): cache path
        useCache (bool): flag to use cached files

    Returns:
        tuple: (dict) title word abbreviations
               (dict) language conflict dictionary
               (list) multi-word abbreviation targets

    Notes:
        ISO source file (tab delimited UTF-16LE) is maintained at the ISSN site -
        https://www.issn.org/wp-content/uploads/2013/09/LTWA_20160915.txt
    """
    aD = {}
    mU = MarshalUtil(workPath=dirPath)
    fmt = "json"
    ext = "pic" if fmt == "pickle" else "json"
    isoLtwaNamePath = os.path.join(dirPath, "iso-ltwa.%s" % ext)
    logger.debug("Using cache data path %s", dirPath)
    mU.mkdir(dirPath)
    if not useCache:
        # purge any stale cache artifacts before refetching
        for fp in [isoLtwaNamePath]:
            try:
                os.remove(fp)
            except Exception:
                pass
    #
    if useCache and mU.exists(isoLtwaNamePath):
        aD = mU.doImport(isoLtwaNamePath, fmt=fmt)
        # guard against a malformed cache file lacking the "abbrev" section
        logger.debug("Abbreviation name length %d", len(aD.get("abbrev", {})))
    else:
        # Cache rebuild: fetch the source file and regenerate the term data.
        # Fix: this branch previously ran only when useCache was False, so a
        # cache miss with useCache=True silently returned empty results.
        fU = FileUtil()
        logger.info("Fetch data from source %s in %s", urlTargetIsoLtwa, dirPath)
        fp = os.path.join(dirPath, fU.getFileName(urlTargetIsoLtwa))
        ok = fU.get(urlTargetIsoLtwa, fp)
        aD = self.__getLtwaTerms(dirPath, fp)
        ok = mU.doExport(isoLtwaNamePath, aD, fmt=fmt)
        logger.debug("abbrevD keys %r", list(aD.keys()))
        logger.debug("Caching %d ISO LTWA in %s status %r", len(aD.get("abbrev", {})), isoLtwaNamePath, ok)
    #
    abbrevD = aD.get("abbrev", {})
    conflictD = aD.get("conflicts", {})
    multiWordTermL = aD.get("multi_word_abbrev", [])
    #
    return abbrevD, conflictD, multiWordTermL
def schemaDefCompare(self, databaseName, dataTyping="ANY"):
    """Compare a freshly computed schema definition with the cached source version.

    Args:
        databaseName (str): schema definition name for comparison
        dataTyping (str, optional): data type conventions for the schema comparison. Defaults to "ANY".

    Returns:
        (str): file path for schema difference or None
    """
    marshal = MarshalUtil(workPath=self.__workPath)
    diffDirPath = os.path.join(self.__cachePath, "schema_diff")
    marshal.mkdir(diffDirPath)
    schemaLocator = self.__getSchemaDefLocator(databaseName, dataTyping=dataTyping)
    schemaFileName = self.__fileU.getFileName(schemaLocator)
    # Build the current schema definition and recover both versions
    computedSchema = self.makeSchemaDef(databaseName, dataTyping=dataTyping)
    versionNew = computedSchema["DATABASE_VERSION"]
    cachedPath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaLocator))
    cachedSchema = marshal.doImport(cachedPath, fmt="json")
    versionOld = cachedSchema["DATABASE_VERSION"]
    #
    numDiff, difD = self.schemaCompare(cachedSchema, computedSchema)
    diffPath = None
    if numDiff:
        baseName, _ = os.path.splitext(schemaFileName)
        diffPath = os.path.join(diffDirPath, baseName + "-" + versionOld + "-" + versionNew + "-diff.json")
        marshal.doExport(diffPath, difD, fmt="json", indent=3)
    #
    return diffPath
def __reload(self, urlTarget, dirPath, useCache=True):
    """Reload the local cache of validation mapping resources.

    Args:
        urlTarget (list, str): URL for schema mapping file
        dirPath (str): path to the directory containing cache files
        useCache (bool, optional): flag to use cached files. Defaults to True.

    Returns:
        (dict): validation mapping data (empty on failure)
    """
    mappingD = {}
    marshal = MarshalUtil()
    fileU = FileUtil()
    mappingFileName = fileU.getFileName(urlTarget)
    mappingFilePath = os.path.join(dirPath, mappingFileName)
    marshal.mkdir(dirPath)
    #
    logger.debug("Loading validation mapping data in %s (useCache %r)", mappingFileName, useCache)
    if useCache and fileU.exists(mappingFilePath):
        mappingD = marshal.doImport(mappingFilePath, fmt="json")
    else:
        logger.info("Fetching url %s to resource file %s", urlTarget, mappingFilePath)
        # fetch into a uniquely named hidden temporary file, then install atomically
        tmpPath = os.path.join(dirPath, "._" + uuid.uuid4().hex)
        fetched = fileU.get(urlTarget, tmpPath)
        if fetched:
            mappingD = marshal.doImport(tmpPath, fmt="json")
            os.replace(tmpPath, mappingFilePath)
    return mappingD
class RemovedHoldingsProvider(object):
    """Provide an inventory of removed repository content."""

    def __init__(self, **kwargs):
        """
        Keyword Args:
            holdingsDirPath (str): cache directory path (default ".")
            useCache (bool): use a previously cached inventory file (default True)
            baseUrl (str): base URL for fallback holdings data
            removedTargetUrl (str): primary URL for the removed holdings inventory
            removedFallbackTargetUrl (str): fallback URL used when the primary fetch fails
        """
        self.__dirPath = kwargs.get("holdingsDirPath", ".")
        useCache = kwargs.get("useCache", True)
        baseUrl = kwargs.get("baseUrl", "https://raw.githubusercontent.com/rcsb/py-rcsb_exdb_assets/development/fall_back/holdings/")
        urlTarget = kwargs.get("removedTargetUrl", os.path.join(baseUrl, "removed_holdings.json.gz"))
        # Fix: the fallback URL previously read the same "removedTargetUrl" keyword as the
        # primary target, so a caller-supplied primary URL silently became its own fallback.
        urlFallbackTarget = kwargs.get("removedFallbackTargetUrl", os.path.join(baseUrl, "removed_holdings.json.gz"))
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__invD = self.__reload(urlTarget, urlFallbackTarget, self.__dirPath, useCache=useCache)

    def testCache(self, minCount=1000):
        """Return True when the loaded inventory exceeds the minimum expected size."""
        logger.info("Inventory length cD (%d)", len(self.__invD))
        if len(self.__invD) > minCount:
            return True
        return False

    def getStatusCode(self, entryId):
        """Return the status code for the removed entry (None on lookup failure)."""
        try:
            return self.__invD[entryId.upper()]["status_code"]
        except Exception as e:
            logger.debug("Failing for %r with %s", entryId, str(e))
        return None

    def getRemovedInfo(self, entryId):
        """Return the dictionary describing the details for this removed entry."""
        try:
            return self.__invD[entryId.upper()]
        except Exception as e:
            logger.debug("Failing for %r with %s", entryId, str(e))
        return {}

    def getContentTypes(self, entryId):
        """Return the sorted removed content types for the input entry identifier."""
        try:
            return sorted(self.__invD[entryId.upper()]["content_type"].keys())
        except Exception as e:
            logger.debug("Failing for %r with %s", entryId, str(e))
        return []

    def getContentTypePathList(self, entryId, contentType):
        """Return the repository path list for a removed content type of the input entry."""
        try:
            # inventory values may be a single path or a list of paths
            pathValue = self.__invD[entryId.upper()]["content_type"][contentType]
            return pathValue if isinstance(pathValue, list) else [pathValue]
        except Exception as e:
            logger.debug("Failing for %r %r with %s", entryId, contentType, str(e))
        return []

    def getInventory(self):
        """Return the removed inventory dictionary."""
        try:
            return self.__invD
        except Exception as e:
            logger.debug("Failing with %s", str(e))
        return {}

    def __reload(self, urlTarget, urlFallbackTarget, dirPath, useCache=True):
        """Load the inventory from cache, or fetch from the primary/fallback URLs.

        Returns:
            (dict): removed holdings inventory (empty on failure)
        """
        invD = {}
        fU = FileUtil()
        fn = fU.getFileName(urlTarget)
        fp = os.path.join(dirPath, fn)
        self.__mU.mkdir(dirPath)
        #
        if useCache and self.__mU.exists(fp):
            invD = self.__mU.doImport(fp, fmt="json")
            logger.debug("Reading cached inventory (%d)", len(invD))
        else:
            logger.info("Fetch inventory from %s", urlTarget)
            ok = fU.get(urlTarget, fp)
            if not ok:
                # primary fetch failed - try the fallback location
                ok = fU.get(urlFallbackTarget, fp)
            if ok:
                invD = self.__mU.doImport(fp, fmt="json")
        #
        return invD
def search(self, queryTargetId, queryTargetPath, resultPath, normalizeFlag=True, maxHits=50, searchType="similarity", suppressMetals=False):
    """Search the CCDC database for similar or substructure matches for the input query molecule.

    Args:
        queryTargetId (str): query identifier
        queryTargetPath (str): path to the query molfile (mol, sdf, mol2)
        resultPath (str): output path to match results
        normalizeFlag (bool, optional): do standard perceptions on matching molecules. Defaults to True.
        maxHits (int, optional): maximum number of matches to return. Defaults to 50.
        searchType (str, optional): search mode (substructure, similarity). Defaults to "similarity".
        suppressMetals (bool, optional): filter structures containing metals. Defaults to False.

    Returns:
        (int): number of matches
    """
    mU = MarshalUtil()
    logger.info("Start search for target %s path %s result path %s", queryTargetId, queryTargetPath, resultPath)
    #
    summaryList = []
    #
    # A chemical component CIF for the query may sit alongside the query molfile.
    targetDirPath = os.path.dirname(queryTargetPath)
    cifTargetPath = os.path.join(targetDirPath, queryTargetId + ".cif")
    #
    targetStructures = EntryReader(queryTargetPath)
    dirPath = os.path.join(resultPath, queryTargetId)
    numHits = 0
    for ii, e in enumerate(targetStructures, 1):
        # NOTE(review): numHits is reset for every entry read from the query file, so the
        # returned count (and the index written below) reflects only the last entry --
        # confirm that single-entry query files are assumed here.
        numHits = 0
        startTime = time.time()
        targetMol = e.molecule
        if normalizeFlag:
            # standard bond-type and aromaticity perceptions on the query molecule
            targetMol.assign_bond_types(which="unknown")
            targetMol.standardise_aromatic_bonds()
            targetMol.standardise_delocalised_bonds()
        #
        logger.info("(%d) begin %s search - query id %s", ii, searchType, queryTargetId)
        if searchType == "similarity":
            hits = self.__similaritySearch(targetMol, suppressMetals=suppressMetals)
        elif searchType == "substructure":
            hits = self.__moleculeSubstructureSearch(targetMol, suppressMetals=suppressMetals)
        else:
            # unrecognized search modes produce no hits
            hits = []
        logger.info("(%d) completed search query id %s in %.3f seconds", ii, queryTargetId, time.time() - startTime)
        if hits:
            numHits += len(hits)
            logger.info("(%d) search for %s matched %d: %r", ii, queryTargetId, numHits, [targetHit.identifier for targetHit in hits])
            #
            for targetHit in hits[:maxHits]:
                # Build an index record describing this match.
                hI = CcdcMatchIndexInst()
                hI.setCsdVersion(csd_version())
                hI.setCsdDirectory(csd_directory())
                hI.setTargetId(queryTargetId)
                hI.setTargetPath(queryTargetPath)
                if mU.exists(cifTargetPath):
                    hI.setTargetCcPath(cifTargetPath)
                hI.setIdentifier(targetHit.identifier)
                hI.setMatchType(searchType)
                try:
                    # Entry-level metadata may be unavailable for some hits; failures
                    # here are logged and the record is kept with partial details.
                    hI.setRFactor(targetHit.entry.r_factor)
                    hI.setChemicalName(targetHit.entry.chemical_name)
                    hI.setTemperature(targetHit.entry.temperature)
                    hI.setRadiationSource(targetHit.entry.radiation_source)
                    hI.setHasDisorder("N")
                    cit = targetHit.entry.publication
                    if cit.doi is not None:
                        hI.setCitationDOI(cit.doi)
                    if searchType == "similarity":
                        hI.setSimilarityScore(targetHit.similarity)
                    elif searchType == "substructure":
                        hI.setMatchedAtomLength(len(targetHit.match_atoms()))
                except Exception as e:
                    logger.exception("Failing with %s", str(e))
                #
                mU.mkdir(dirPath)
                mol2L = []
                if searchType == "substructure":
                    # Write each matched component as paired mol2 and sdf files.
                    for jj, mc in enumerate(targetHit.match_components(), 1):
                        fp = os.path.join(dirPath, queryTargetId + "_" + targetHit.identifier + "_%03d" % jj + ".mol2")
                        mol2L.append(fp)
                        with MoleculeWriter(fp) as ofh:
                            ofh.write(mc)
                        # Replace the title line (second line of the mol2 file)
                        with open(fp) as fin:
                            lines = fin.readlines()
                        lines[1] = lines[1].replace("00", targetHit.identifier)
                        #
                        with open(fp, "w") as fout:
                            fout.write("".join(lines))
                        #
                        fp = os.path.join(dirPath, queryTargetId + "_" + targetHit.identifier + "_%03d" % jj + ".sdf")
                        with MoleculeWriter(fp) as ofh:
                            ofh.write(mc)
                        # Replace the title line (first line of the sdf file)
                        with open(fp) as fin:
                            lines = fin.readlines()
                        lines[0] = lines[0].replace("00", targetHit.identifier)
                        #
                        with open(fp, "w") as fout:
                            fout.write("".join(lines))
                    #
                    # Check for multiple generated result files -
                    #
                    for jj, fp in enumerate(mol2L, 1):
                        logger.debug("(%d) adding component fp %s", jj, fp)
                        hI.setMatchNumber(jj)
                        hI.setMol2Path(fp)
                        # companion sdf path shares the basename with the mol2 file
                        tt = fp[:-4] + "sdf"
                        hI.setMolPath(tt)
                        summaryList.append(copy.deepcopy(hI.get()))
                    #
                else:
                    hI.setMatchNumber(1)
                    summaryList.append(copy.deepcopy(hI.get()))
        else:
            logger.info("(%d) search for %s returns no matches", ii, targetMol.identifier)
            hits = None
    #
    # Persist the accumulated match index when any hits were found.
    if numHits > 0:
        mU.mkdir(dirPath)
        fp = os.path.join(dirPath, queryTargetId + "-index.json")
        cmI = CcdcMatchIndex(indexFilePath=fp, verbose=self.__verbose)
        cmI.load(summaryList)
        cmI.writeIndex()
    return numHits
class EntityPolymerExtractor(object):
    """Utilities to extract polymer related data from entry and entity collections."""

    def __init__(self, cfgOb, **kwargs):
        """
        Args:
            cfgOb (object): configuration object providing database connection details
            **kwargs: cache rebuild options (useCache, exdbDirPath, cacheKwargs, ...)
        """
        self.__cfgOb = cfgOb
        self.__resourceName = "MONGO_DB"
        self.__mU = MarshalUtil()
        self.__entryD, self.__authAsymIdIndex = self.__rebuildCache(**kwargs)

    def __rebuildCache(self, **kwargs):
        """Recover the extracted entry/entity data from cache or rebuild from the database.

        Returns:
            tuple: ((dict) entry data, (dict) "entryId_authAsymId" -> entityId index)
        """
        useCache = kwargs.get("useCache", True)
        dirPath = kwargs.get("exdbDirPath", ".")
        cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "pickle"})
        #
        ext = "pic" if cacheKwargs["fmt"] == "pickle" else "json"
        fn = "entity-polymer-extracted-data-cache" + "." + ext
        cacheFilePath = os.path.join(dirPath, fn)
        #
        cD = {"entryD": {}, "authIdxD": {}}
        try:
            self.__mU.mkdir(dirPath)
            if not useCache:
                # remove any stale cache file before rebuilding
                for fp in [cacheFilePath]:
                    try:
                        os.remove(fp)
                    except Exception:
                        pass
            if useCache and cacheFilePath and os.access(cacheFilePath, os.R_OK):
                cD = self.__mU.doImport(cacheFilePath, **cacheKwargs)
            else:
                entryD = self.__selectEntries(**kwargs)
                entryD = self.__selectPolymerEntities(entryD, **kwargs)
                authIdxD = self.__buildIndices(entryD)
                cD["entryD"] = entryD
                cD["authIdxD"] = authIdxD
                if cacheFilePath:
                    ok = self.__mU.doExport(cacheFilePath, cD, **cacheKwargs)
                    logger.info("Saved entity-polymer extracted results (%d) status %r in %s", len(entryD), ok, cacheFilePath)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return cD["entryD"], cD["authIdxD"]

    def __buildIndices(self, entryD):
        """Build an index mapping "<entryId>_<authAsymId>" -> entityId."""
        indD = {}
        for entryId, eD in entryD.items():
            entityD = eD["selected_polymer_entities"] if "selected_polymer_entities" in eD else {}
            for entityId, pD in entityD.items():
                for authAsymId in pD["auth_asym_ids"]:
                    # string keys (not tuples) for json serialization
                    indD[entryId + "_" + authAsymId] = entityId
        return indD

    def getEntryCount(self):
        """Return the number of extracted entries."""
        return len(self.__entryD)

    def getRefSeqAccessions(self, dbName):
        """Return the unique reference sequence accessions assigned to dbName."""
        acL = []
        try:
            for _, eD in self.__entryD.items():
                entityD = eD["selected_polymer_entities"] if "selected_polymer_entities" in eD else {}
                for _, pD in entityD.items():
                    for dD in pD["struct_ref"]:
                        if "pdbx_db_accession" in dD and dD["db_name"] == dbName:
                            acL.append(dD["pdbx_db_accession"])
            return list(set(acL))
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return acL

    def countRefSeqAccessions(self, dbName):
        """Return a histogram {refCount: numEntities} of per-entity dbName reference counts."""
        cD = {}
        try:
            for _, eD in self.__entryD.items():
                entityD = eD["selected_polymer_entities"] if "selected_polymer_entities" in eD else {}
                for _, pD in entityD.items():
                    iCount = 0
                    for dD in pD["struct_ref"]:
                        if "pdbx_db_accession" in dD and dD["db_name"] == dbName:
                            iCount += 1
                    cD[iCount] = cD[iCount] + 1 if iCount in cD else 1
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return cD

    def countRefSeqAccessionDbType(self):
        """Return a histogram {dbName: referenceCount} over all struct_ref records."""
        cD = {}
        try:
            for _, eD in self.__entryD.items():
                entityD = eD["selected_polymer_entities"] if "selected_polymer_entities" in eD else {}
                for _, pD in entityD.items():
                    for dD in pD["struct_ref"]:
                        if "pdbx_db_accession" in dD and "db_name" in dD:
                            cD[dD["db_name"]] = cD[dD["db_name"]] + 1 if dD["db_name"] in cD else 1
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return cD

    def countRefSeqAccessionAny(self):
        """Return a histogram {refCount: numEntities} of struct_ref record counts per entity."""
        cD = {}
        try:
            for _, eD in self.__entryD.items():
                entityD = eD["selected_polymer_entities"] if "selected_polymer_entities" in eD else {}
                for _, pD in entityD.items():
                    iCount = len(pD["struct_ref"])
                    cD[iCount] = cD[iCount] + 1 if iCount in cD else 1
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return cD

    def getUniqueTaxons(self):
        """Return a histogram {ncbiTaxId: occurrenceCount} over all source organisms."""
        tD = {}
        try:
            for _, eD in self.__entryD.items():
                entityD = eD["selected_polymer_entities"] if "selected_polymer_entities" in eD else {}
                for _, pD in entityD.items():
                    if "rcsb_entity_source_organism" in pD:
                        for dd in pD["rcsb_entity_source_organism"]:
                            if "ncbi_taxonomy_id" in dd:
                                tD[dd["ncbi_taxonomy_id"]] = tD[dd["ncbi_taxonomy_id"]] + 1 if dd["ncbi_taxonomy_id"] in tD else 1
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        logger.info("Taxon coverage %d", len(tD))
        return tD

    def getOrigTaxons(self):
        """Return {entryId: [(entityId, taxId), ...]} for originally assigned taxonomies."""
        tD = {}
        try:
            for entryId, eD in self.__entryD.items():
                entityD = eD["selected_polymer_entities"] if "selected_polymer_entities" in eD else {}
                for entityId, pD in entityD.items():
                    if "original_taxonomy_ids" in pD:
                        for tV in pD["original_taxonomy_ids"]:
                            tD.setdefault(entryId, []).append((entityId, tV))
                if entryId not in tD:
                    logger.debug("No taxonomy for %s", entryId)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        logger.info("Taxon coverage %d", len(tD))
        return tD

    def countRefSeqAccessionByTaxon(self, dbNameList=None):
        """Return {ncbiTaxId: [accession, ...]} for references drawn from dbNameList."""
        # Fix: guard the None default - membership tests against None previously raised
        # TypeError (swallowed by the broad except) on every call without a dbNameList.
        dbNameList = dbNameList if dbNameList else []
        tD = {}
        iCount = 0
        try:
            for _, eD in self.__entryD.items():
                entityD = eD["selected_polymer_entities"] if "selected_polymer_entities" in eD else {}
                for _, pD in entityD.items():
                    if "rcsb_entity_source_organism" in pD:
                        for dd in pD["rcsb_entity_source_organism"]:
                            if "ncbi_taxonomy_id" in dd:
                                tId = dd["ncbi_taxonomy_id"]
                                for dD in pD["struct_ref"]:
                                    if "pdbx_db_accession" in dD and "db_name" in dD:
                                        if dD["db_name"] in dbNameList:
                                            tD.setdefault(tId, []).append(dD["pdbx_db_accession"])
                                            iCount += 1
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        logger.info("Total observed accessions %d", iCount)
        return tD

    def checkRefSeqAlignRange(self, dbName):
        """Sanity check dbName reference alignment ranges against reference sequence lengths.

        Returns:
            (bool): True for success (diagnostics are logged) or False on failure
        """
        ok = True
        try:
            eCount = 0
            aCount = 0
            tCount = 0
            for entryId, eD in self.__entryD.items():
                entityD = eD["selected_polymer_entities"] if "selected_polymer_entities" in eD else {}
                for entityId, pD in entityD.items():
                    for dD in pD["struct_ref"]:
                        if "db_name" in dD and dD["db_name"] == dbName:
                            if "pdbx_db_accession" in dD and "alignD" in dD and "pdbx_seq_one_letter_code" in dD and "pdbx_align_begin" in dD:
                                seqLen = len(dD["pdbx_seq_one_letter_code"])
                                dbBegin = 100000000
                                dbEnd = -1
                                refSeqDbBegin = dD["pdbx_align_begin"]
                                for authAsymId, alDL in dD["alignD"].items():
                                    tCount += 1
                                    difL = []
                                    for alD in alDL:
                                        tBeg = alD["db_align_beg"]
                                        tEnd = alD["db_align_end"]
                                        tDif = tEnd - tBeg + 1
                                        difL.append(tDif)
                                        dbBegin = min(tBeg, dbBegin)
                                        dbEnd = max(tEnd, dbEnd)
                                    # range is calculate on off -
                                    # if seqLen < dbEnd - dbBegin + 1:
                                    if seqLen < dbEnd - dbBegin and not refSeqDbBegin == dbBegin:
                                        fDif = sum(difL)
                                        logger.debug(
                                            "Bad alignment for %r %r %r %r (%d) seqLen %r (%d) dbBegin %r dbEnd %r difL %r tDif %r",
                                            entryId,
                                            entityId,
                                            authAsymId,
                                            alD["pdbx_strand_id"],
                                            len(alDL),
                                            seqLen,
                                            dbEnd - dbBegin + 1,
                                            dbBegin,
                                            dbEnd,
                                            difL,
                                            fDif,
                                        )
                                        aCount += 1
                            else:
                                eCount += 1
            logger.info("Incomplete %s struct_ref record count %d", dbName, eCount)
            logger.info("Inconsistent %s db reference alignments %d/%d", dbName, aCount, tCount)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            ok = False
        return ok

    def getEntityRefSeqAccessions(self, dbName, entryId, entityId):
        """Return the unique dbName accessions assigned to the input entry/entity."""
        acL = []
        try:
            dL = self.__entryD[entryId]["selected_polymer_entities"][entityId]["struct_ref"]
            acL = list(set([d["pdbx_db_accession"] for d in dL if d["db_name"] == dbName]))
        except Exception as e:
            logger.exception("Failing with %s %r %r %s", dbName, entryId, entityId, str(e))
        return acL

    def __selectEntries(self, **kwargs):
        """Return a dictionary of PDB entries satisfying the input conditions (e.g. method, resolution limit)"""
        dbName = kwargs.get("dbName", "pdbx_core")
        collectionName = kwargs.get("collectionName", "pdbx_core_entry")
        selectionQueryD = kwargs.get("entrySelectionQuery", {})
        #
        entryD = {}
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(dbName, collectionName):
                    logger.info("%s %s document count is %d", dbName, collectionName, mg.count(dbName, collectionName))
                    qD = {}
                    if selectionQueryD:
                        # Fix: this was previously qD.update(qD) (a no-op), so the
                        # caller's entry selection query was silently ignored.
                        qD.update(selectionQueryD)
                    selectL = ["rcsb_entry_container_identifiers"]
                    dL = mg.fetch(dbName, collectionName, selectL, queryD=qD)
                    logger.info("Selection %r fetch result count %d", selectL, len(dL))
                    #
                    for dD in dL:
                        # keep only entries that declare polymer entities
                        if (
                            ("rcsb_entry_container_identifiers" in dD)
                            and ("entry_id" in dD["rcsb_entry_container_identifiers"])
                            and ("polymer_entity_ids" in dD["rcsb_entry_container_identifiers"])
                            and dD["rcsb_entry_container_identifiers"]["polymer_entity_ids"]
                        ):
                            entryD[dD["rcsb_entry_container_identifiers"]["entry_id"]] = {"polymer_entity_ids": dD["rcsb_entry_container_identifiers"]["polymer_entity_ids"]}
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return entryD

    def __selectPolymerEntities(self, entryD, **kwargs):
        """Entity selector recovering essential biological sequence mapping features for
        macromolecules (default type = protein).

        For each entry in entryD a dictionary of selected entities is stored under
        resultKey. Each entity record carries (when available): source flags, asym/auth
        chain ids, polymer type, source organism assignments, normalized struct_ref
        reference records with per-chain alignments attached under "alignD", and the
        originally assigned taxonomy ids, e.g.:

            "1CP9": {"polymer_entity_ids": ["1", "2"],
                     "selected_polymer_entities": {
                        "1": {"rcsb_multiple_source_flag": "N",
                              "asym_ids": ["A"], "auth_asym_ids": ["A"],
                              "entity_id": "1", "type": "polypeptide(L)",
                              "rcsb_entity_polymer_type": "Protein",
                              "rcsb_entity_source_organism": [{"ncbi_taxonomy_id": 587, ...}],
                              "struct_ref": [{"db_name": "UNP",
                                              "pdbx_db_accession": "Q7WZI9",
                                              "alignD": {"A": [{"db_align_beg": 24,
                                                                "db_align_end": 228, ...}]}}]},
                        ...}}
        """
        dbName = kwargs.get("dbName", "pdbx_core")
        collectionName = kwargs.get("collectionName", "pdbx_core_polymer_entity")
        resultKey = kwargs.get("resultKey", "selected_polymer_entities")
        entryLimit = kwargs.get("entryLimit", None)
        selectionQueryD = kwargs.get("entitySelectionQuery", {"entity_poly.rcsb_entity_polymer_type": "Protein"})
        #
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(dbName, collectionName):
                    logger.info("%s %s document count is %d", dbName, collectionName, mg.count(dbName, collectionName))
                    selectL = [
                        "rcsb_polymer_entity_container_identifiers",
                        "entity.rcsb_multiple_source_flag",
                        "entity_poly.type",
                        "entity_poly.rcsb_entity_polymer_type",
                        "entity_poly.pdbx_seq_one_letter_code_can",
                        "rcsb_entity_source_organism.ncbi_taxonomy_id",
                        "rcsb_entity_source_organism.ncbi_scientific_name",
                        "rcsb_entity_source_organism.beg_seq_num",
                        "rcsb_entity_source_organism.end_seq_num",
                        "struct_ref.id",
                        "struct_ref.pdbx_db_accession",
                        "struct_ref.db_name",
                        "struct_ref.entity_id",
                        "struct_ref.pdbx_seq_one_letter_code",
                        "struct_ref.pdbx_align_begin",
                        "struct_ref_seq",
                        #
                        "entity_src_nat.pdbx_ncbi_taxonomy_id",
                        "entity_src_gen.pdbx_gene_src_ncbi_taxonomy_id",
                        "entity_src_gen.pdbx_host_org_ncbi_taxonomy_id",
                        "pdbx_entity_src_syn.ncbi_taxonomy_id",
                    ]
                    iCount = 0
                    for entryId in entryD:
                        # skip entries already populated (e.g. from a partial cache)
                        if resultKey in entryD[entryId]:
                            continue
                        #
                        qD = {"rcsb_polymer_entity_container_identifiers.entry_id": entryId}
                        qD.update(selectionQueryD)
                        #
                        dL = mg.fetch(dbName, collectionName, selectL, queryD=qD)
                        logger.debug("%s query %r fetch result count %d", entryId, qD, len(dL))
                        eD = {}
                        for ii, dD in enumerate(dL, 1):
                            rD = {}
                            logger.debug("%s (%4d) d is %r", entryId, ii, dD)
                            if "entity" in dD:
                                rD["rcsb_multiple_source_flag"] = dD["entity"]["rcsb_multiple_source_flag"] if "rcsb_multiple_source_flag" in dD["entity"] else "N"
                            #
                            if "rcsb_polymer_entity_container_identifiers" in dD:
                                # Fix: these values were previously read from the
                                # "rcsb_entity_container_identifiers" key, which is neither
                                # the key tested here nor projected in selectL, raising
                                # KeyError whenever this branch was entered.
                                ciD = dD["rcsb_polymer_entity_container_identifiers"]
                                rD["asym_ids"] = ciD["asym_ids"] if "asym_ids" in ciD else []
                                rD["auth_asym_ids"] = ciD["auth_asym_ids"] if "auth_asym_ids" in ciD else []
                                rD["entity_id"] = ciD["entity_id"]
                            #
                            if "entity_poly" in dD:
                                rD["type"] = dD["entity_poly"]["type"] if "type" in dD["entity_poly"] else None
                                rD["rcsb_entity_polymer_type"] = dD["entity_poly"]["rcsb_entity_polymer_type"] if "rcsb_entity_polymer_type" in dD["entity_poly"] else None
                                rD["entity_polymer_length"] = len(dD["entity_poly"]["pdbx_seq_one_letter_code_can"]) if "pdbx_seq_one_letter_code_can" in dD["entity_poly"] else 0
                            #
                            tL = []
                            if "rcsb_entity_source_organism" in dD:
                                for tD in dD["rcsb_entity_source_organism"]:
                                    tL.append(tD)
                                rD["rcsb_entity_source_organism"] = copy.copy(tL)
                            #
                            qDL = []
                            if "struct_ref" in dD:
                                for tD in dD["struct_ref"]:
                                    if "db_name" in tD:
                                        # normalize reference database names (TREMBL -> UNP)
                                        tD["db_name"] = str(tD["db_name"]).upper().strip()
                                        tD["db_name"] = "UNP" if tD["db_name"] in ["TREMBL"] else tD["db_name"]
                                    qDL.append(tD)
                                if "struct_ref_seq" in dD:
                                    # attach per-chain alignment records to each reference
                                    # (loop variable renamed from qD to avoid shadowing the query dict)
                                    for srD in qDL:
                                        refId = srD["id"]
                                        alignL = []
                                        for tD in dD["struct_ref_seq"]:
                                            if refId == tD["ref_id"]:
                                                alignL.append(tD)
                                        for align in alignL:
                                            authAsymId = align["pdbx_strand_id"]
                                            srD.setdefault("alignD", {}).setdefault(authAsymId, []).append(align)
                                rD["struct_ref"] = qDL
                            #
                            taxIdL = []
                            if "entity_src_nat" in dD:
                                for tD in dD["entity_src_nat"]:
                                    if "pdbx_ncbi_taxonomy_id" in tD:
                                        taxIdL.append(tD["pdbx_ncbi_taxonomy_id"])
                            if "entity_src_gen" in dD:
                                for tD in dD["entity_src_gen"]:
                                    if "pdbx_gene_src_ncbi_taxonomy_id" in tD:
                                        taxIdL.append(tD["pdbx_gene_src_ncbi_taxonomy_id"])
                                    if "pdbx_host_org_ncbi_taxonomy_id" in tD:
                                        taxIdL.append(tD["pdbx_host_org_ncbi_taxonomy_id"])
                            if "pdbx_entity_src_syn" in dD:
                                for tD in dD["pdbx_entity_src_syn"]:
                                    if "ncbi_taxonomy_id" in tD:
                                        taxIdL.append(tD["ncbi_taxonomy_id"])
                            qL = []
                            for taxId in taxIdL:
                                # taxonomy ids may be comma separated strings
                                ttL = [int(t.strip()) for t in taxId.split(",") if t.strip().isdigit()]
                                qL.extend(ttL)
                            logger.debug("TaxId list %r", qL)
                            rD["original_taxonomy_ids"] = copy.copy(list(set(qL)))
                            #
                            if "entity_id" in rD:
                                eD[rD["entity_id"]] = copy.copy(rD)
                        entryD[entryId][resultKey] = copy.copy(eD)
                        iCount += 1
                        if iCount % 1000 == 0:
                            logger.info("Completed fetch %d/%d entries", iCount, len(entryD))
                        if entryLimit and iCount >= entryLimit:
                            logger.info("Quitting after %d", iCount)
                            break
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return entryD
class ReferenceSequenceAssignmentUpdater(object):
    """Selected utilities to update reference sequence assignments information in the core_entity collection.

    The updater loads current polymer-entity reference sequence assignments from the
    document store, re-maps obsolete/secondary accessions using reference database
    match information, and falls back to SIFTS summary assignments when the PDB
    assignments cannot be mapped.
    """

    def __init__(self, cfgOb, databaseName="pdbx_core", collectionName="pdbx_core_polymer_entity", polymerType="Protein",
                 referenceDatabaseName="UniProt", provSource="PDB", **kwargs):
        self.__cfgOb = cfgOb
        self.__polymerType = polymerType
        self.__mU = MarshalUtil()
        #
        self.__databaseName = databaseName
        self.__collectionName = collectionName
        self.__statusList = []
        #
        # SIFTS summary data supplies the alternative (fallback) accession/alignment assignments.
        self.__ssP = self.__fetchSiftsSummaryProvider(self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
        self.__assignRefD, self.__refD, self.__matchD = self.__reload(databaseName, collectionName, polymerType, referenceDatabaseName, provSource, **kwargs)

    def __reload(self, databaseName, collectionName, polymerType, referenceDatabaseName, provSource, **kwargs):
        """Load current assignments and the reference-sequence cache; return (assignments, reference data, match info)."""
        assignRefD = self.__getPolymerReferenceSequenceAssignments(databaseName, collectionName, polymerType, **kwargs)
        # get refIdD = {refId: [entity_id, ....], }
        refIdD, _ = self.__getUniqueAssignments(assignRefD, referenceDatabaseName=referenceDatabaseName, provSource=provSource)
        #
        refD, matchD = self.__rebuildReferenceCache(referenceDatabaseName, list(refIdD.keys()), **kwargs)
        return assignRefD, refD, matchD

    def doUpdate(self, updateId, updateLimit=None):
        """Apply the re-mapped assignment updates to the document store.

        Args:
            updateId (str): identifier recorded with the exchange status
            updateLimit (int, optional): maximum number of update operations to apply

        Returns:
            (int, int): number of candidate updates, number of updates applied
        """
        desp = DataExchangeStatus()
        statusStartTimestamp = desp.setStartTime()
        #
        numUpd = 0
        updateDL = self.__buildUpdate(self.__assignRefD)
        if updateDL:
            if updateLimit:
                numUpd = self.__doUpdate(self.__cfgOb, updateDL[:updateLimit], self.__databaseName, self.__collectionName)
            else:
                numUpd = self.__doUpdate(self.__cfgOb, updateDL, self.__databaseName, self.__collectionName)
        self.__updateStatus(updateId, self.__databaseName, self.__collectionName, True, statusStartTimestamp)
        return len(updateDL), numUpd

    def __doUpdate(self, cfgOb, updateDL, databaseName, collectionName):
        """Push the update operation list to the object store and return the applied count."""
        obUpd = ObjectUpdater(cfgOb)
        numUpd = obUpd.update(databaseName, collectionName, updateDL)
        logger.info("Update count is %d", numUpd)
        return numUpd

    def __getPolymerReferenceSequenceAssignments(self, databaseName, collectionName, polymerType, **kwargs):
        """Get all accessions assigned to input reference sequence database for the input polymerType.

        Returns:
            (dict): {"1abc_1": {"rcsb_entity_container_identifiers": {"reference_sequence_identifiers": []},
                                "rcsb_polymer_entity_align": [],
                                "rcsb_entity_source_organism": {"ncbi_taxonomy_id": []}}, ...}
        """
        cachePath = kwargs.get("cachePath", ".")
        exDbDir = "exdb"
        cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "json", "indent": 3})
        useCache = kwargs.get("useCache", True)
        fetchLimit = kwargs.get("fetchLimit", None)
        cacheFilePath = os.path.join(cachePath, exDbDir, "entity-poly-ref-seq-assign-cache.json")
        #
        # Initialize here so the return is well-defined if extraction fails below.
        objD = {}
        try:
            obEx = ObjectExtractor(
                self.__cfgOb,
                databaseName=databaseName,
                collectionName=collectionName,
                cacheFilePath=cacheFilePath,
                useCache=useCache,
                keyAttribute="entity",
                uniqueAttributes=["rcsb_id"],
                cacheKwargs=cacheKwargs,
                objectLimit=fetchLimit,
                selectionQuery={"entity_poly.rcsb_entity_polymer_type": polymerType},
                selectionList=[
                    "rcsb_id",
                    "rcsb_entity_container_identifiers.reference_sequence_identifiers",
                    "rcsb_entity_container_identifiers.auth_asym_ids",
                    "rcsb_polymer_entity_align",
                    "rcsb_entity_source_organism.ncbi_taxonomy_id",
                ],
            )
            eCount = obEx.getCount()
            logger.info("Entity count is %d", eCount)
            objD = obEx.getObjects()
            logger.info("Reading polymer entity entity count %d ref accession length %d ", eCount, len(objD))
            #
        except Exception as e:
            logger.exception("Failing for %s (%s) with %s", databaseName, collectionName, str(e))
        return objD

    def __getUniqueAssignments(self, objD, referenceDatabaseName="UniProt", provSource="PDB"):
        """Collect the unique accession assignments (and associated taxonomies) for the input provenance source.

        Returns:
            (dict, dict): {accession: [entityKey, ...]}, {accession: [taxId, ...]}
        """
        refIdD = defaultdict(list)
        taxIdD = defaultdict(list)
        numMissing = 0
        for entityKey, eD in objD.items():
            try:
                accS = set()
                for ii, tD in enumerate(eD["rcsb_entity_container_identifiers"]["reference_sequence_identifiers"]):
                    if tD["database_name"] == referenceDatabaseName and tD["provenance_source"] == provSource:
                        accS.add(tD["database_accession"])
                        refIdD[tD["database_accession"]].append(entityKey)
                        #
                        # pick up the corresponding taxonomy -
                        # NOTE(review): assumes source organism list is parallel (index ii) to the
                        # reference sequence identifier list - confirm against the core schema.
                        try:
                            taxIdD[tD["database_accession"]].append(eD["rcsb_entity_source_organism"][ii]["ncbi_taxonomy_id"])
                        except Exception:
                            logger.warning("Failing taxonomy lookup for %s %r", entityKey, tD["database_accession"])
                logger.debug("PDB assigned sequences length %d", len(accS))
            except Exception as e:
                numMissing += 1
                logger.debug("No sequence assignments for %s with %s", entityKey, str(e))
        #
        for refId, taxIdL in taxIdD.items():
            taxIdL = list(set(taxIdL))
            if len(taxIdL) > 1:
                logger.info("Multiple taxIds assigned to reference sequence id %s: %r", refId, taxIdL)
        logger.info("Unique %s accession assignments by %s %d (missing %d) ", referenceDatabaseName, provSource, len(refIdD), numMissing)
        return refIdD, taxIdD

    def __reMapAccessions(self, rsiDL, referenceDatabaseName="UniProt", provSourceL=None, excludeReferenceDatabases=None):
        """Internal method to re-map accessions for the input database and assignment source.

        Args:
            rsiDL (list): list of reference sequence identifier dictionaries
            referenceDatabaseName (str, optional): resource database name. Defaults to 'UniProt'.
            provSourceL (list, optional): assignment provenance codes to re-map. Defaults to [].
            excludeReferenceDatabases (list, optional): databases dropped from the output. Defaults to ["PDB"].

        Returns:
            (bool, bool, list): (no identifiers were unmapped, all retained identifiers were re-mapped,
                                 remapped (and unmapped) accessions in the input object list)
        """
        isMatched = False
        unMapped = 0
        matched = 0
        excludeReferenceDatabases = excludeReferenceDatabases if excludeReferenceDatabases else ["PDB"]
        provSourceL = provSourceL if provSourceL else []
        retDL = []
        for rsiD in rsiDL:
            if rsiD["database_name"] in excludeReferenceDatabases:
                unMapped += 1
                continue
            if rsiD["database_name"] == referenceDatabaseName and rsiD["provenance_source"] in provSourceL:
                try:
                    # Only re-map when the accession has exactly one superseding accession.
                    if len(self.__matchD[rsiD["database_accession"]]["matchedIds"]) == 1:
                        rsiD["database_accession"] = self.__matchD[rsiD["database_accession"]]["matchedIds"][0]
                        matched += 1
                    else:
                        logger.info("Skipping mapping to multiple superseding accessions %s", rsiD["database_accession"])
                    #
                except Exception:
                    unMapped += 1
            retDL.append(rsiD)
        if matched == len(retDL):
            isMatched = True
        return not unMapped, isMatched, retDL

    def __reMapAlignments(self, alignDL, referenceDatabaseName="UniProt", provSourceL=None, excludeReferenceDatabases=None):
        """Internal method to re-map alignments for the input database and assignment source.

        Args:
            alignDL (list): list of aligned regions
            referenceDatabaseName (str, optional): resource database name. Defaults to 'UniProt'.
            provSourceL (list, optional): assignment provenance codes to re-map. Defaults to [].
            excludeReferenceDatabases (list, optional): databases dropped from the output. Defaults to ["PDB"].

        Returns:
            (bool, bool, list): (no alignments were unmapped, all retained alignments were re-mapped,
                                 remapped (and unmapped) accessions in the input align list)
        """
        isMatched = False
        unMapped = 0
        matched = 0
        excludeReferenceDatabases = excludeReferenceDatabases if excludeReferenceDatabases else ["PDB"]
        retDL = []
        provSourceL = provSourceL if provSourceL else []
        for alignD in alignDL:
            if alignD["reference_database_name"] in excludeReferenceDatabases:
                unMapped += 1
                continue
            if alignD["reference_database_name"] == referenceDatabaseName and alignD["provenance_code"] in provSourceL:
                try:
                    if len(self.__matchD[alignD["reference_database_accession"]]["matchedIds"]) == 1:
                        alignD["reference_database_accession"] = self.__matchD[alignD["reference_database_accession"]]["matchedIds"][0]
                        matched += 1
                    else:
                        logger.info("Skipping alignment mapping to multiple superseding accessions %s", alignD["reference_database_accession"])
                except Exception:
                    unMapped += 1
            retDL.append(alignD)
        if matched == len(retDL):
            isMatched = True
        #
        return not unMapped, isMatched, retDL

    def __getSiftsAccessions(self, entityKey, authAsymIdL):
        """Return SIFTS-provenance UniProt accession identifiers for the longest alignments of the input entity."""
        retL = []
        saoLD = self.__ssP.getLongestAlignments(entityKey[:4], authAsymIdL)
        for (_, dbAccession), _ in saoLD.items():
            retL.append({"database_name": "UniProt", "database_accession": dbAccession, "provenance_source": "SIFTS"})
        return retL

    def __getSiftsAlignments(self, entityKey, authAsymIdL):
        """Return SIFTS-provenance alignment objects (with aligned regions) for the input entity."""
        retL = []
        saoLD = self.__ssP.getLongestAlignments(entityKey[:4], authAsymIdL)
        for (_, dbAccession), saoL in saoLD.items():
            dD = {"reference_database_name": "UniProt", "reference_database_accession": dbAccession, "provenance_code": "SIFTS", "aligned_regions": []}
            for sao in saoL:
                dD["aligned_regions"].append({"ref_beg_seq_id": sao.getDbSeqIdBeg(), "entity_beg_seq_id": sao.getEntitySeqIdBeg(), "length": sao.getEntityAlignLength()})
            retL.append(dD)
        return retL

    def __buildUpdate(self, assignRefD):
        """Build the list of update operations ({selectD, updateD}) re-mapping accessions and alignments,
        falling back to SIFTS assignments when the PDB assignments cannot be fully re-mapped."""
        #
        updateDL = []
        for entityKey, eD in assignRefD.items():
            selectD = {"rcsb_id": entityKey}
            try:
                updateD = {}
                authAsymIdL = []
                ersDL = (
                    eD["rcsb_entity_container_identifiers"]["reference_sequence_identifiers"]
                    if "reference_sequence_identifiers" in eD["rcsb_entity_container_identifiers"]
                    else None
                )
                #
                #
                if ersDL:
                    authAsymIdL = eD["rcsb_entity_container_identifiers"]["auth_asym_ids"]
                    isMapped, isMatched, updErsDL = self.__reMapAccessions(ersDL, referenceDatabaseName="UniProt", provSourceL=["PDB"])
                    #
                    if not isMapped or not isMatched:
                        tL = self.__getSiftsAccessions(entityKey, authAsymIdL)
                        if tL:
                            logger.debug("Using SIFTS accession mapping for %s", entityKey)
                        else:
                            logger.info("No alternative SIFTS accession mapping for %s", entityKey)
                        updErsDL = tL if tL else []
                    #
                    if len(updErsDL) < len(ersDL):
                        logger.info("Incomplete reference sequence mapping update for %s", entityKey)
                    updateD["rcsb_entity_container_identifiers.reference_sequence_identifiers"] = updErsDL
                #
                alignDL = eD["rcsb_polymer_entity_align"] if "rcsb_polymer_entity_align" in eD else None
                if alignDL and authAsymIdL:
                    isMapped, isMatched, updAlignDL = self.__reMapAlignments(alignDL, referenceDatabaseName="UniProt", provSourceL=["PDB"])
                    #
                    if not isMapped or not isMatched:
                        tL = self.__getSiftsAlignments(entityKey, authAsymIdL)
                        if tL:
                            logger.debug("Using SIFTS alignment mapping for %s", entityKey)
                        else:
                            logger.info("No alternative SIFTS alignment mapping for %s", entityKey)
                        updAlignDL = tL if tL else updAlignDL
                    #
                    if len(updAlignDL) < len(alignDL):
                        logger.info("Incomplete alignment mapping update for %s", entityKey)
                    updateD["rcsb_polymer_entity_align"] = updAlignDL
                #
                if updateD:
                    updateDL.append({"selectD": selectD, "updateD": updateD})
            except Exception as e:
                logger.exception("Mapping error for %s with %s", entityKey, str(e))
        #
        return updateDL

    def __rebuildReferenceCache(self, refDbName, idList, **kwargs):
        """Load the reference sequence cache, fetching and appending any requested accessions not yet cached.

        Returns:
            (dict, dict): reference entry data keyed by accession, accession match information
        """
        dD = {}
        cachePath = kwargs.get("cachePath", ".")
        dirPath = os.path.join(cachePath, "exdb")
        cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "json", "indent": 3})
        useCache = kwargs.get("useCache", True)
        fetchLimit = kwargs.get("fetchLimit", None)
        saveText = kwargs.get("saveText", False)
        #
        ext = "pic" if cacheKwargs["fmt"] == "pickle" else "json"
        fn = "ref-sequence-data-cache" + "." + ext
        cacheFilePath = os.path.join(dirPath, fn)
        #
        self.__mU.mkdir(dirPath)
        if not useCache:
            for fp in [cacheFilePath]:
                try:
                    os.remove(fp)
                except Exception:
                    pass
        #
        if useCache and cacheFilePath and self.__mU.exists(cacheFilePath):
            dD = self.__mU.doImport(cacheFilePath, **cacheKwargs)
            # Check for completeness - fetch any requested accessions missing from the cache.
            # (fix: the set difference was previously reversed, selecting cached-but-unrequested
            #  accessions instead of the requested-but-uncached ones)
            missingS = set(idList) - set(dD["refDbCache"].keys())
            if missingS:
                logger.info("Reference sequence cache missing %d accessions", len(missingS))
                extraD = self.__fetchReferenceEntries(refDbName, list(missingS), saveText=saveText, fetchLimit=fetchLimit)
                dD["refDbCache"].update(extraD["refDbCache"])
                dD["matchInfo"].update(extraD["matchInfo"])
                if cacheFilePath and cacheKwargs:
                    self.__mU.mkdir(dirPath)
                    ok = self.__mU.doExport(cacheFilePath, dD, **cacheKwargs)
                    logger.info("Cache updated with status %r", ok)
            #
        else:
            dD = self.__fetchReferenceEntries(refDbName, idList, saveText=saveText, fetchLimit=fetchLimit)
            if cacheFilePath and cacheKwargs:
                self.__mU.mkdir(dirPath)
                ok = self.__mU.doExport(cacheFilePath, dD, **cacheKwargs)
                logger.info("Cache save status %r", ok)
        return dD["refDbCache"], dD["matchInfo"]

    def __fetchReferenceEntries(self, refDbName, idList, saveText=False, fetchLimit=None):
        """Fetch database entries from the input reference sequence database name."""
        dD = {"refDbName": refDbName, "refDbCache": {}, "matchInfo": {}}
        try:
            idList = idList[:fetchLimit] if fetchLimit else idList
            logger.info("Starting fetch for %d %s entries", len(idList), refDbName)
            if refDbName == "UniProt":
                fobj = UniProtUtils(saveText=saveText)
                refD, matchD = fobj.fetchList(idList)
                dD = {"refDbName": refDbName, "refDbCache": refD, "matchInfo": matchD}
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return dD

    def __fetchSiftsSummaryProvider(self, cfgOb, configName, **kwargs):
        """Construct the SIFTS summary provider from configured source/cache paths."""
        abbreviated = kwargs.get("siftsAbbreviated", "PROD")
        cachePath = kwargs.get("cachePath", ".")
        cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "pickle"})
        useCache = kwargs.get("useCache", True)
        #
        siftsSummaryDataPath = cfgOb.getPath("SIFTS_SUMMARY_DATA_PATH", sectionName=configName)
        # logger.info("Using SIFTS_SUMMARY_DATA_PATH, %r", siftsSummaryDataPath)
        if siftsSummaryDataPath.lower().startswith("http"):
            srcDirPath = siftsSummaryDataPath
        else:
            srcDirPath = os.path.join(cachePath, siftsSummaryDataPath)
        cacheDirPath = os.path.join(cachePath, cfgOb.get("SIFTS_SUMMARY_CACHE_DIR", sectionName=configName))
        logger.debug("ssP %r %r", srcDirPath, cacheDirPath)
        ssP = SiftsSummaryProvider(srcDirPath=srcDirPath, cacheDirPath=cacheDirPath, useCache=useCache, abbreviated=abbreviated, cacheKwargs=cacheKwargs)
        logger.info("ssP entry count %d", ssP.getEntryCount())
        return ssP

    def __dumpEntries(self, refD):
        """Log the content of the input reference entry dictionary (diagnostic)."""
        for (eId, eDict) in refD.items():
            logger.info("------ Reference id %s", eId)
            for k, v in eDict.items():
                logger.info("%-15s = %r", k, v)

    def __getUpdateAssignmentCandidates(self, objD):
        """Identify entities whose PDB and SIFTS UniProt assignments overlap but differ.

        Returns:
            (dict, dict, dict): differing PDB accessions, PDB accession index, SIFTS accession index
        """
        totCount = 0
        difCount = 0
        pdbUnpIdD = defaultdict(list)
        siftsUnpIdD = defaultdict(list)
        assignIdDifD = defaultdict(list)
        #
        for entityKey, eD in objD.items():
            try:
                siftsS = set()
                pdbS = set()
                for tD in eD["rcsb_entity_container_identifiers"]["reference_sequence_identifiers"]:
                    if tD["database_name"] == "UniProt":
                        if tD["provenance_source"] == "SIFTS":
                            siftsS.add(tD["database_accession"])
                            siftsUnpIdD[tD["database_accession"]].append(entityKey)
                        elif tD["provenance_source"] == "PDB":
                            pdbS.add(tD["database_accession"])
                            pdbUnpIdD[tD["database_accession"]].append(entityKey)
                    else:
                        logger.debug("No UniProt for %r", eD["rcsb_entity_container_identifiers"])
                logger.debug("PDB assigned sequence length %d", len(pdbS))
                logger.debug("SIFTS assigned sequence length %d", len(siftsS))
                if pdbS and siftsS:
                    totCount += 1
                    if pdbS != siftsS:
                        difCount += 1
                        for idV in pdbS:
                            assignIdDifD[idV].append(entityKey)
            except Exception as e:
                logger.warning("No identifiers for %s with %s", entityKey, str(e))
        #
        logger.info("Total %d differences %d", totCount, difCount)
        logger.info("Unique UniProt accession assignments PDB %d SIFTS %d", len(pdbUnpIdD), len(siftsUnpIdD))
        logger.info("Current unique overlapping assignment differences %d ", len(assignIdDifD))
        logger.info("Current unique overlapping assignment differences %r ", assignIdDifD)
        return assignIdDifD, pdbUnpIdD, siftsUnpIdD

    def getReferenceAccessionAlignSummary(self):
        """Summarize the alignment of PDB accession assignments with the current reference sequence database."""
        numPrimary = 0
        numSecondary = 0
        numNone = 0
        for _, mD in self.__matchD.items():
            if mD["matched"] == "primary":
                numPrimary += 1
            elif mD["matched"] == "secondary":
                numSecondary += 1
            else:
                numNone += 1
        logger.debug("Matched primary:  %d secondary: %d none %d", numPrimary, numSecondary, numNone)
        return numPrimary, numSecondary, numNone

    def getLoadStatus(self):
        """Return the accumulated data exchange status records."""
        return self.__statusList

    def __updateStatus(self, updateId, databaseName, collectionName, status, startTimestamp):
        """Append a data exchange status record for this update; return True on success."""
        try:
            sFlag = "Y" if status else "N"
            desp = DataExchangeStatus()
            desp.setStartTime(tS=startTimestamp)
            desp.setObject(databaseName, collectionName)
            desp.setStatus(updateId=updateId, successFlag=sFlag)
            desp.setEndTime()
            self.__statusList.append(desp.getStatus())
            return True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False
def searchSmarts(self, queryTargetId, smarts, resultPath, maxHits=50, suppressMetals=False):
    """Search the CCDC database for substructure matches for the input SMARTS pattern.

    Args:
        queryTargetId (str): query identifier
        smarts (str): smarts search pattern (NON STEREO)
        resultPath (str): output path to match results
        maxHits (int, optional): maximum number of matches to return. Defaults to 50.
        suppressMetals (bool, optional): filter structures containing metals. Defaults to False.

    Returns:
        (int): number of matches
    """
    mU = MarshalUtil()
    logger.info("Start smarts search for target %s result path %s", queryTargetId, resultPath)
    #
    ii = 1
    searchType = "substructure"
    summaryList = []
    dirPath = os.path.join(resultPath, queryTargetId)
    numHits = 0
    startTime = time.time()
    logger.info("(%d) begin %s search - query id %s", ii, searchType, queryTargetId)
    if searchType == "substructure":
        hits = self.__smartsSubstructureSearch(smarts, suppressMetals=suppressMetals)
    else:
        hits = []
    logger.info("(%d) completed search query id %s in %.3f seconds", ii, queryTargetId, time.time() - startTime)
    if hits:
        numHits += len(hits)
        logger.info("(%d) search for %s matched %d: %r", ii, queryTargetId, numHits, [targetHit.identifier for targetHit in hits])
        #
        for targetHit in hits[:maxHits]:
            # Build an index record for each matched CCDC entry.
            hI = CcdcMatchIndexInst()
            hI.setTargetId(queryTargetId)
            hI.setIdentifier(targetHit.identifier)
            hI.setMatchType(searchType)
            try:
                hI.setRFactor(targetHit.entry.r_factor)
                hI.setChemicalName(targetHit.entry.chemical_name)
                hI.setTemperature(targetHit.entry.temperature)
                hI.setRadiationSource(targetHit.entry.radiation_source)
                hI.setHasDisorder("N")
                cit = targetHit.entry.publication
                if cit.doi is not None:
                    hI.setCitationDOI(cit.doi)
                if searchType == "similarity":
                    hI.setSimilarityScore(targetHit.similarity)
                elif searchType == "substructure":
                    hI.setMatchedAtomLength(len(targetHit.match_atoms()))
            except Exception as e:
                logger.exception("Failing with %s", str(e))
            #
            # Write each molecular component of the hit as MOL2 and SDF files.
            mU.mkdir(dirPath)
            mol2L = []
            for jj, mc in enumerate(targetHit.molecule.components, 1):
                fp = os.path.join(dirPath, queryTargetId + "_" + targetHit.identifier + "_%03d" % jj + ".mol2")
                mol2L.append(fp)
                with MoleculeWriter(fp) as ofh:
                    ofh.write(mc)
                # Replace the title line
                with open(fp) as fin:
                    lines = fin.readlines()
                lines[1] = lines[1].replace("00", targetHit.identifier)
                #
                with open(fp, "w") as fout:
                    fout.write("".join(lines))
                #
                fp = os.path.join(dirPath, queryTargetId + "_" + targetHit.identifier + "_%03d" % jj + ".sdf")
                with MoleculeWriter(fp) as ofh:
                    ofh.write(mc)
                # Replace the title line
                with open(fp) as fin:
                    lines = fin.readlines()
                lines[0] = lines[0].replace("00", targetHit.identifier)
                #
                with open(fp, "w") as fout:
                    fout.write("".join(lines))
            #
            # Check for multiple generated result files -
            #
            for jj, fp in enumerate(mol2L, 1):
                logger.debug("(%d) adding component fp %s", jj, fp)
                hI.setMatchNumber(jj)
                hI.setMol2Path(fp)
                # companion SDF path written above ("….mol2" -> "….sdf")
                tt = fp[:-4] + "sdf"
                hI.setMolPath(tt)
                summaryList.append(copy.deepcopy(hI.get()))
        #
    else:
        # fix: log message previously read "se sarch"
        logger.info("(%d) search for %s returns no matches", ii, queryTargetId)
        hits = None
    #
    if numHits > 0:
        mU.mkdir(dirPath)
        fp = os.path.join(dirPath, queryTargetId + "-index.json")
        cmI = CcdcMatchIndex(indexFilePath=fp, verbose=self.__verbose)
        cmI.load(summaryList)
        cmI.writeIndex()
    return numHits
class ObjectExtractor(object):
    """Utilities to extract document features from the document object server."""

    def __init__(self, cfgOb, **kwargs):
        # cfgOb: configuration provider used to open MONGO_DB connections.
        self.__cfgOb = cfgOb
        self.__resourceName = "MONGO_DB"
        self.__mU = MarshalUtil()
        #
        # Extract (or import from cache) the keyed object dictionary at construction time.
        self.__objectD = self.__rebuildCache(**kwargs)
        # Path/value accounting populated by genPathList()/genValueList()/setPathList().
        self.__objPathD = {}
        self.__stringPathList = []
        self.__objValD = {}
        #

    def getObjects(self):
        """Return the extracted object dictionary keyed by the joined unique attribute values."""
        return self.__objectD

    def getPathList(self, filterList=True):
        """Return the sorted accumulated JSON path list.

        When filterList is True keep only dotted or "_"-prefixed paths, excluding "_id"
        and "[]"-terminated paths, and drop any path that is contained in a longer one.
        """
        kL = []
        if filterList:
            tL = []
            for ky in self.__objPathD:
                if ky and (ky.find(".") != -1 or ky.startswith("_")) and ky not in ["_id"] and not ky.endswith("[]"):
                    tL.append(ky)
            for ky in tL:
                for tky in tL:
                    ok = True
                    if ky in tky and ky != tky:
                        # ky is a substring of a different retained path - drop it
                        ok = False
                        break
                if ok:
                    kL.append(ky)
        else:
            kL = list(self.__objPathD.keys())
        #
        return sorted(kL)

    def getValues(self):
        """Return the value dictionary accumulated by genValueList()."""
        return self.__objValD

    def setPathList(self, stringPathList):
        """Install the path selection consulted by the value-accumulation callback."""
        self.__objPathD = {k: True for k in stringPathList}
        return True

    def getCount(self):
        """Return the number of extracted objects."""
        return len(self.__objectD)

    def __rebuildCache(self, **kwargs):
        """Import the object cache when present and permitted; otherwise select objects
        from the server and export the cache.

        Returns:
            (dict): objects stored under the keyAttribute key of the cache container
        """
        cacheFilePath = kwargs.get("cacheFilePath", None)
        cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "pickle"})
        useCache = kwargs.get("useCache", True)
        keyAttribute = kwargs.get("keyAttribute", "entry")
        selectL = kwargs.get("selectionList", [])
        #
        cD = {keyAttribute: {}}
        try:
            if useCache and cacheFilePath and os.access(cacheFilePath, os.R_OK):
                cD = self.__mU.doImport(cacheFilePath, **cacheKwargs)
            else:
                # A selection list triggers the projected fetch; otherwise fetch full documents.
                if selectL:
                    objectD = self.__select(**kwargs)
                else:
                    objectD = self.__selectObjects(**kwargs)
                cD[keyAttribute] = objectD
                if cacheFilePath:
                    pth, _ = os.path.split(cacheFilePath)
                    ok = self.__mU.mkdir(pth)
                    ok = self.__mU.doExport(cacheFilePath, cD, **cacheKwargs)
                    logger.info("Saved object results (%d) status %r in %s", len(objectD), ok, cacheFilePath)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return cD[keyAttribute]

    def __selectObjects(self, **kwargs):
        """Return a dictionary of objects satisfying the input conditions (e.g. method, resolution limit)"""
        databaseName = kwargs.get("databaseName", "pdbx_core")
        collectionName = kwargs.get("collectionName", "pdbx_core_entry")
        selectionQueryD = kwargs.get("selectionQuery", {})
        #
        uniqueAttributes = kwargs.get("uniqueAttributes", ["rcsb_id"])
        #
        tV = kwargs.get("objectLimit", None)
        objLimit = int(tV) if tV is not None else None
        stripObjectId = kwargs.get("stripObjectId", False)
        logIncrement = kwargs.get("logIncrement", 10000)
        #
        objectD = {}
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(databaseName, collectionName):
                    logger.info("%s %s document count is %d", databaseName, collectionName, mg.count(databaseName, collectionName))
                    qD = {}
                    if selectionQueryD:
                        qD.update(selectionQueryD)
                    # First fetch only document ids, then fetch each full document individually.
                    selectL = ["_id"]
                    dL = mg.fetch(databaseName, collectionName, selectL, queryD=qD)
                    numDoc = len(dL) if dL else 0
                    logger.info("Selection %r fetch result count %d", selectL, numDoc)
                    #
                    for ii, dD in enumerate(dL, 1):
                        if "_id" not in dD:
                            continue
                        rObj = mg.fetchOne(databaseName, collectionName, "_id", dD["_id"])
                        if stripObjectId and rObj and "_id" in rObj:
                            rObj.pop("_id")
                        else:
                            # keep the object id as a plain string
                            rObj["_id"] = str(rObj["_id"])
                        #
                        stKey = ".".join([rObj[ky] for ky in uniqueAttributes])
                        objectD[stKey] = copy.copy(rObj)
                        if objLimit and ii >= objLimit:
                            break
                        logger.debug("Saving %d %s", ii, stKey)
                        if ii % logIncrement == 0 or ii == numDoc:
                            logger.info("Extracting object (%d of %d)", ii, numDoc)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return objectD

    #
    def __select(self, **kwargs):
        """Return a dictionary of object content satisfying the input conditions
        (e.g. method, resolution limit) and selection options.
        """
        databaseName = kwargs.get("databaseName", "pdbx_core")
        collectionName = kwargs.get("collectionName", "pdbx_core_entry")
        selectionQueryD = kwargs.get("selectionQuery", {})
        uniqueAttributes = kwargs.get("uniqueAttributes", ["rcsb_id"])
        selectL = kwargs.get("selectionList", [])
        stripObjectId = kwargs.get("stripObjectId", False)
        #
        tV = kwargs.get("objectLimit", None)
        objLimit = int(tV) if tV is not None else None
        #
        objectD = {}
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(databaseName, collectionName):
                    logger.info("%s %s document count is %d", databaseName, collectionName, mg.count(databaseName, collectionName))
                    qD = {}
                    if selectionQueryD:
                        qD.update(selectionQueryD)
                    # Projected fetch of only the selected attribute paths.
                    dL = mg.fetch(databaseName, collectionName, selectL, queryD=qD, suppressId=True)
                    logger.info("Selection %r fetch result count %d", selectL, len(dL))
                    #
                    for ii, rObj in enumerate(dL, 1):
                        stKey = ".".join([rObj[ky] for ky in uniqueAttributes])
                        if stripObjectId and rObj and "_id" in rObj:
                            rObj.pop("_id")
                        objectD[stKey] = copy.copy(rObj)
                        if objLimit and ii >= objLimit:
                            break
                        #
                        logger.debug("Saving %d %s", ii, stKey)
                    #
                    logger.debug("Current objectD keys %r", list(objectD.keys()))
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return objectD

    #
    def __getKeyValues(self, dct, keyNames):
        """Return the tuple of values of corresponding to the input dictionary key names
        expressed in dot notation.

        Args:
            dct (dict): source dictionary object (nested)
            keyNames (list): list of dictionary keys in dot notation

        Returns:
            tuple: tuple of values corresponding to the input key names
        """
        rL = []
        try:
            for keyName in keyNames:
                rL.append(self.__getKeyValue(dct, keyName))
        except Exception as e:
            logger.exception("Failing for key names %r with %s", keyNames, str(e))
        return tuple(rL)

    def __getKeyValue(self, dct, keyName):
        """Return the value of the corresponding key expressed in dot notation in the
        input dictionary object (nested); None when any path segment is absent."""
        try:
            kys = keyName.split(".")
            for key in kys:
                try:
                    dct = dct[key]
                except KeyError:
                    return None
            return dct
        except Exception as e:
            logger.exception("Failing for key %r with %s", keyName, str(e))
        return None

    def __toJsonPathString(self, path):
        # Render a path list as a dotted string with "[]" marking list levels.
        pL = [ky if ky else "[]" for ky in path]
        sp = ".".join(pL)
        sp = sp.replace(".[", "[")
        return sp

    def __pathCallBack(self, path, value):
        """Walk callback: count occurrences of each JSON path; value passes through unchanged."""
        sp = self.__toJsonPathString(path)
        self.__objPathD[sp] = self.__objPathD[sp] + 1 if sp in self.__objPathD else 1
        return value

    def __saveCallBack(self, path, value):
        """Walk callback: accumulate values for paths selected in self.__objPathD."""
        sP = self.__toJsonPathString(path)
        if sP in self.__objPathD:
            ky = sP.replace("[]", "")
            if sP.find("[") != -1:
                # multivalued
                if isinstance(value, list):
                    self.__objValD.setdefault(ky, []).extend(value)
                else:
                    self.__objValD.setdefault(ky, []).append(value)
            else:
                self.__objValD[ky] = value
        return value

    def genPathList(self, dObj, path=None):
        """Accumulate the JSON paths present in the input object."""
        return self.__walk(dObj, jsonPath=path, funct=self.__pathCallBack)

    def genValueList(self, dObj, path=None, clear=True):
        """Accumulate values for the currently selected paths in the input object."""
        self.__objValD = {} if clear else self.__objValD
        return self.__walk(dObj, jsonPath=path, funct=self.__saveCallBack)

    def __walk(self, jsonObj, jsonPath=None, funct=None):
        """Walk JSON data types.

        An optional funct() is called to mutate the value of each element. The jsonPath
        is updated at each element.
        """
        if jsonPath is None:
            jsonPath = []
        if isinstance(jsonObj, dict):
            value = {k: self.__walk(v, jsonPath + [k], funct) for k, v in jsonObj.items()}
        elif isinstance(jsonObj, list):
            value = [self.__walk(elem, jsonPath + [[]], funct) for elem in jsonObj]
        else:
            value = jsonObj
        if funct is None:
            return value
        else:
            return funct(jsonPath, value)

    def __toPath(self, path):
        """Convert path strings into path lists."""
        if isinstance(path, list):
            return path  # already in list format

        def _iterPath(path):
            for parts in path.split("[]"):
                for part in parts.strip(".").split("."):
                    yield part
                yield []

        return list(_iterPath(path))[:-1]
class InterProProvider(object):
    """Manage mappings of InterPro identifiers to description and parent/child relationships."""

    def __init__(self, **kwargs):
        urlTargetInterPro = kwargs.get("urlTargetInterPro", "ftp://ftp.ebi.ac.uk/pub/databases/interpro/current/entry.list")
        urlTargetInterProFB = "https://github.com/rcsb/py-rcsb_exdb_assets/raw/master/fall_back/InterPro/entry.list"
        # fix: this kwarg previously reused the "urlTargetInterPro" key, so the parent/child
        # tree URL could never be overridden independently of the entry list URL.
        urlTargetInterProParent = kwargs.get("urlTargetInterProParent", "ftp://ftp.ebi.ac.uk/pub/databases/interpro/current/ParentChildTreeFile.txt")
        urlTargetInterProParentFB = "https://github.com/rcsb/py-rcsb_exdb_assets/raw/master/fall_back/InterPro/ParentChildTreeFile.txt"
        cachePath = kwargs.get("cachePath", ".")
        dirPath = os.path.join(cachePath, "interPro")
        useCache = kwargs.get("useCache", True)
        #
        self.__mU = MarshalUtil(workPath=dirPath)
        self.__interProD, self.__interProParentD = self.__rebuildCache(urlTargetInterPro, urlTargetInterProFB, urlTargetInterProParent, urlTargetInterProParentFB, dirPath, useCache)

    def getDescription(self, interProId):
        """Return the description for the input InterPro identifier (None when unknown)."""
        ret = None
        try:
            ret = self.__interProD[interProId]["description"]
        except Exception:
            pass
        return ret

    def getType(self, interProId):
        """Return the entry type for the input InterPro identifier (None when unknown)."""
        ret = None
        try:
            ret = self.__interProD[interProId]["type"]
        except Exception:
            pass
        return ret

    def testCache(self):
        # Check length ...
        logger.info("Length InterPro %d", len(self.__interProD))
        return len(self.__interProD) > 1000

    #
    def __rebuildCache(self, urlTargetInterPro, urlTargetInterProFB, urlTargetInterProParent, urlTargetInterProParentFB, dirPath, useCache):
        """Import cached InterPro index/parent data or fetch from the source (with fallback) and cache it.

        Returns:
            (dict, dict): {idCode: {"description":, "type":}}, {idCode: parentIdCode or None}
        """
        fmt = "json"
        ext = fmt if fmt == "json" else "pic"
        interProDataPath = os.path.join(dirPath, "interPro-data.%s" % ext)
        #
        logger.debug("Using cache data path %s", dirPath)
        self.__mU.mkdir(dirPath)
        #
        if useCache and self.__mU.exists(interProDataPath):
            rD = self.__mU.doImport(interProDataPath, fmt=fmt)
            interProD = rD["index"]
            interProParentD = rD["parents"]
            logger.debug("InterPro index length %d parent length %d", len(interProD), len(interProParentD))
        else:
            # ------ entry list: primary source with GitHub fallback
            fU = FileUtil()
            logger.info("Fetch data from source %s in %s", urlTargetInterPro, dirPath)
            fp = os.path.join(dirPath, fU.getFileName(urlTargetInterPro))
            ok = fU.get(urlTargetInterPro, fp)
            if not ok:
                fp = os.path.join(dirPath, fU.getFileName(urlTargetInterProFB))
                ok = fU.get(urlTargetInterProFB, fp)
                logger.info("Fetch data fallback fetch status is %r", ok)
            interProD = self.__getInterProIndex(fp)
            logger.info("Caching %d in %s status %r", len(interProD), interProDataPath, ok)
            # ------ parent/child tree: primary source with GitHub fallback
            logger.info("Fetch data from source %s in %s", urlTargetInterProParent, dirPath)
            fp = os.path.join(dirPath, fU.getFileName(urlTargetInterProParent))
            ok = fU.get(urlTargetInterProParent, fp)
            if not ok:
                fp = os.path.join(dirPath, fU.getFileName(urlTargetInterProParentFB))
                ok = fU.get(urlTargetInterProParentFB, fp)
                logger.info("Fetch data fallback fetch status is %r", ok)
            interProParentD = self.__getInterProParents(fp)
            #
            ok = self.__mU.doExport(interProDataPath, {"index": interProD, "parents": interProParentD}, fmt=fmt)
        #
        return interProD, interProParentD

    def getLineage(self, idCode):
        """Return the root-first lineage (list of identifiers) for the input InterPro identifier."""
        pList = []
        try:
            pList.append(idCode)
            pt = self.getParentId(idCode)
            while (pt is not None) and (pt != 1):
                pList.append(pt)
                pt = self.getParentId(pt)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        pList.reverse()
        return pList

    def getLineageWithNames(self, idCode):
        """Return the lineage as (identifier, description, 1-based depth) tuples."""
        linL = []
        try:
            idCodeL = self.getLineage(idCode)
            for ii, idCode in enumerate(idCodeL, 1):
                linL.append((idCode, self.getDescription(idCode), ii))
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return linL

    def getParentId(self, idCode):
        """Return the immediate parent identifier (None for roots or unknown identifiers)."""
        try:
            return self.__interProParentD[idCode]
        except Exception:
            pass
        return None

    def getTreeNodeList(self, filterD=None):
        """Return the tree node list ({id, name, parents?, depth}) optionally restricted to filterD members."""
        dL = []
        try:
            for idCode, _ in self.__interProD.items():
                if filterD and idCode not in filterD:
                    continue
                displayName = self.getDescription(idCode)
                pId = self.getParentId(idCode)
                linL = self.getLineage(idCode)
                #
                if pId is None:
                    dD = {"id": idCode, "name": displayName, "depth": 0}
                else:
                    dD = {"id": idCode, "name": displayName, "parents": [pId], "depth": len(linL) - 1}
                dL.append(dD)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return dL

    def __getInterProParents(self, filePath):
        """Read the InterPro parent hierarchy and return a dictionary parent ids.

        Args:
            filePath (str): path to InterPro parent/child hierachy

        Returns:
            dict: {idCode: parentIdCode or None}
        """
        interProParentD = {}
        lineL = self.__mU.doImport(filePath, fmt="list")
        stack = []
        for line in lineL:
            content = line.rstrip()  # drop \n
            # depth in the tree is encoded by the number of leading "--" groups
            row = content.split("--")
            ff = row[-1].split("::")
            tS = ff[0].strip()
            # stack[:] = stack[: len(row) - 1] + [row[-1]]
            stack[:] = stack[: len(row) - 1] + [tS]
            for ii, idCode in enumerate(stack):
                if idCode not in interProParentD:
                    # prevents overwriting the parent of idCode, in case idCode has already been iterated over in ParentChildTreeFile.txt
                    interProParentD[idCode] = None if ii == 0 else stack[ii - 1]
                else:
                    # This will correct the parent of idCode from being None if it's later identified as having a parent at another point in ParentChildTreeFile.txt
                    if interProParentD[idCode] is None and ii != 0:
                        interProParentD[idCode] = stack[ii - 1]
            logger.debug("Lineage %r", "\t".join(stack))
        #
        return interProParentD

    def __getInterProIndex(self, filePath):
        """Read CSV file of InterPro accessions and descriptions

        Args:
            filePath (str): path to InterPro accession/description csv file

        Returns:
            dict: {idCode: {"description":, "type":}}
        """
        interProD = {}
        encodingD = {"encoding": "ascii"} if sys.version_info[0] < 3 else {}
        rowL = self.__mU.doImport(filePath, fmt="tdd", rowFormat="list", **encodingD)
        for row in rowL:
            try:
                interProId = row[0].strip().upper()
                interProType = row[1].strip()
                descr = row[2].strip()
                interProD[interProId] = {"description": descr, "type": interProType}
            except Exception:
                pass
        #
        return interProD
class ChemCompDepictWrapper(SingletonClass):
    """Wrapper for chemical component depiction operations (2D images and molfile export)."""

    def __init__(self):
        self.__startTime = time.time()
        # ---
        self.__workPath = "."
        self.__mU = MarshalUtil(workPath=self.__workPath)
        self.__configD = None
        self.__cachePath = None
        # --- status codes reported by search/depict operations
        self.__statusDescriptorError = -100
        self.__searchError = -200
        self.__searchSuccess = 0
        self.__imageCount = 0

    def readConfig(self, resetImagePath=True):
        """Read the depiction configuration file from the environment-configured cache path.

        Args:
            resetImagePath (bool, optional): recompute imageDirPath relative to the current
                cache path (makes the configuration relocatable). Defaults to True.

        Returns:
            bool: True for success or False otherwise
        """
        ok = False
        try:
            self.__cachePath = os.environ.get("CHEM_DEPICT_CACHE_PATH", ".")
            configFileName = os.environ.get("CHEM_DEPICT_CONFIG_FILE_NAME", "depict-config.json")
            #
            configFilePath = os.path.join(self.__cachePath, "config", configFileName)
            configD = {}
            if self.__mU.exists(configFilePath):
                configD = self.__mU.doImport(configFilePath, fmt="json")
            logger.debug("configD: %r", configD)
            if configD and (len(configD) >= 2) and float(configD["versionNumber"]) > 0.1:
                logger.info("Read version %r sections %r from %s", configD["versionNumber"], list(configD.keys()), configFilePath)
                ok = True
                #
                if resetImagePath:
                    # Allow the configuration to be relocatable.
                    tS = configD["imageDir"] if "imageDir" in configD else "images"
                    configD["imageDirPath"] = os.path.join(self.__cachePath, tS)
                    configD["versionNumber"] = "0.2"
            else:
                # Handle missing config for now - synthesize a minimal default configuration
                configD["imageDir"] = "images"
                configD["imageDirPath"] = os.path.join(self.__cachePath, configD["imageDir"])
                logger.warning("Reading config file fails from path %r", configFilePath)
                logger.warning("Using config %r", configD)
                ok = True
            #
            self.__configD = configD
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            ok = False
        return ok

    def setConfig(self, cachePath, **kwargs):
        """Provide dependencies for rebuilding depict file dependencies.

        Args:
            cachePath (str): path to cache data files.

        Other options are propagated to configurations of the wrapped classes in __makeBootstrapDepictConfig()

        Returns:
            bool: True when a usable configuration was created
        """
        self.__configD = self.__makeBootstrapDepictConfig(cachePath, **kwargs)
        return len(self.__configD) >= 2

    def __makeBootstrapDepictConfig(self, cachePath, **kwargs):
        """Create depict configuration bootstrap file and the image directory.

        Returns:
            dict: configuration dictionary (empty on failure)
        """
        configD = {}
        try:
            storeConfig = kwargs.get("storeConfig", True)
            os.environ["CHEM_DEPICT_CACHE_PATH"] = os.path.join(cachePath)
            configDirPath = os.path.join(cachePath, "config")
            configFilePath = os.path.join(configDirPath, "depict-config.json")
            #
            logger.info("Updating depict configuration using %s", configFilePath)
            #
            imageDirPath = os.path.join(cachePath, "images")
            self.__mU.mkdir(imageDirPath)
            configD = {"versionNumber": 0.20, "imageDir": "images"}
            if storeConfig:
                self.__mU.mkdir(configDirPath)
                self.__mU.doExport(configFilePath, configD, fmt="json", indent=3)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return configD

    #
    def setImageCount(self, imageCount):
        # imageCount drives rotation of generated image file names
        self.__imageCount = imageCount

    def getImageCount(self):
        return self.__imageCount

    def __makeImagePath(self):
        """Return the next rotating image file path under the configured image directory."""
        imageDirPath = self.__configD["imageDirPath"] if self.__configD and "imageDirPath" in self.__configD else "."
        fileRotateIncrement = self.__configD["fileRotateIncrement"] if self.__configD and "fileRotateIncrement" in self.__configD else 50
        ic = self.__imageCount % fileRotateIncrement
        imagePath = os.path.join(imageDirPath, "image-%s.svg" % ic)
        return imagePath

    def depictMolecule(self, identifier, identifierType, imagePath=None, **kwargs):
        """Create depiction from InChI, SMILES descriptors or PDB identifier.

        Args:
            identifier (str): descriptor string or chemical component identifier
            identifierType (str): one of "smiles", "inchi", "identifierPDB"
            imagePath (str, optional): output image path (auto-generated when None)

        Returns:
            str: image path on success or None otherwise
        """
        try:
            imagePath = imagePath if imagePath else self.__makeImagePath()
            oeio = OeIoUtils()
            oeMol = None
            if identifierType.lower() in ["smiles"]:
                oeMol = oeio.smilesToMol(identifier)
            elif identifierType.lower() in ["inchi"]:
                oeMol = oeio.inchiToMol(identifier)
            elif identifierType.lower() in ["identifierpdb"]:
                ccsw = ChemCompSearchWrapper()
                oesmP = ccsw.getSearchMoleculeProvider()
                oeMol = oesmP.getMol(identifier)
            #
            ok = self.__depictOne(oeMol, imagePath, **kwargs)
            return imagePath if ok else None
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return None

    def __depictOne(self, oeMol, imagePath, **kwargs):
        """Depict a single molecule.

        Args:
            oeMol (object): instance of an OE graph molecule
            imagePath (string): file path for image

        Returns:
            bool: True for success or False otherwise
        """
        try:
            title = kwargs.get("title", None)
            oed = OeDepict()
            oed.setMolTitleList([("Target", oeMol, title)])
            # --- thinner bonds for larger molecules to keep depictions legible
            bondDisplayWidth = 10.0
            numAtoms = oeMol.NumAtoms()
            if numAtoms > 100 and numAtoms <= 200:
                bondDisplayWidth = 6.0
            elif numAtoms > 200:
                bondDisplayWidth = 4.0
            # ---
            oed.setDisplayOptions(
                imageSizeX=kwargs.get("imageSizeX", 2500),
                # fix: read the "imageSizeY" option (previously misread "imageSizeX")
                imageSizeY=kwargs.get("imageSizeY", 2500),
                labelAtomName=kwargs.get("labelAtomName", False),
                labelAtomCIPStereo=kwargs.get("labelAtomCIPStereo", True),
                labelAtomIndex=kwargs.get("labelAtomIndex", False),
                labelBondIndex=kwargs.get("labelBondIndex", False),
                labelBondCIPStereo=kwargs.get("labelBondCIPStereo", True),
                cellBorders=kwargs.get("cellBorders", True),
                bondDisplayWidth=bondDisplayWidth,
            )
            oed.setGridOptions(rows=1, cols=1, cellBorders=False)
            oed.prepare()
            oed.write(imagePath)
            self.__imageCount += 1
            return True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False

    def status(self):
        """Log process resource usage and uptime."""
        # NOTE(review): ru_maxrss is reported in bytes on macOS and KB on Linux;
        # the unit label assumes that platform difference — confirm the divisor.
        unitS = "MB" if platform.system() == "Darwin" else "GB"
        rusageMax = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        logger.info("Maximum resident memory size %.4f %s", rusageMax / 10**6, unitS)
        endTime = time.time()
        logger.info("Status at %s (up %.4f seconds)", time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def alignMoleculePair(self, refIdentifier, refIdentifierType, fitIdentifier, fitIdentifierType, imagePath=None, **kwargs):
        """Create aligned depiction for a target molecule InChI, SMILES descriptors or PDB identifier.

        Returns:
            str: image path on success or None otherwise
        """
        try:
            imagePath = imagePath if imagePath else self.__makeImagePath()
            oeio = OeIoUtils()
            ccsw = ChemCompSearchWrapper()
            oesmP = ccsw.getSearchMoleculeProvider()
            # --- build the reference molecule
            oeMolRef = None
            if refIdentifierType.lower() in ["smiles"]:
                oeMolRef = oeio.smilesToMol(refIdentifier)
            elif refIdentifierType.lower() in ["inchi"]:
                oeMolRef = oeio.inchiToMol(refIdentifier)
            elif refIdentifierType.lower() in ["identifierpdb"]:
                oeMolRef = oesmP.getMol(refIdentifier)
            # --- build the fit molecule
            oeMolFit = None
            if fitIdentifierType.lower() in ["smiles"]:
                oeMolFit = oeio.smilesToMol(fitIdentifier)
            elif fitIdentifierType.lower() in ["inchi"]:
                oeMolFit = oeio.inchiToMol(fitIdentifier)
            elif fitIdentifierType.lower() in ["identifierpdb"]:
                oeMolFit = oesmP.getMol(fitIdentifier)
            # ---
            logger.info("oeMolRef atoms %r", oeMolRef.NumAtoms())
            logger.info("oeMolFit atoms %r", oeMolFit.NumAtoms())
            displayIdRef = "Ref"
            displayIdFit = "Fit"
            ok = self.__depictAlignedPair(oeMolRef, displayIdRef, oeMolFit, displayIdFit, imagePath, **kwargs)
            return imagePath if ok else None
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return None

    def __depictAlignedPair(self, oeMolRef, displayIdRef, oeMolFit, displayIdFit, imagePath, **kwargs):
        """Depict pairwise MCSS alignment.

        Returns:
            bool: True for success or False otherwise
        """
        try:
            #
            oed = OeDepictMCSAlignPage()
            oed.setSearchType(sType="relaxed")
            #
            oed.setRefMol(oeMolRef, displayIdRef)
            oed.setFitMol(oeMolFit, displayIdFit)
            # --- thinner bonds for larger reference molecules
            bondDisplayWidth = 10.0
            numAtomsRef = oeMolRef.NumAtoms()
            if numAtomsRef > 100 and numAtomsRef <= 200:
                bondDisplayWidth = 6.0
            elif numAtomsRef > 200:
                bondDisplayWidth = 4.0
            # ---
            oed.setDisplayOptions(
                imageSizeX=kwargs.get("imageSizeX", 2500),
                # fix: read the "imageSizeY" option (previously misread "imageSizeX")
                imageSizeY=kwargs.get("imageSizeY", 2500),
                labelAtomName=kwargs.get("labelAtomName", False),
                labelAtomCIPStereo=kwargs.get("labelAtomCIPStereo", True),
                labelAtomIndex=kwargs.get("labelAtomIndex", False),
                labelBondIndex=kwargs.get("labelBondIndex", False),
                labelBondCIPStereo=kwargs.get("labelBondCIPStereo", True),
                cellBorders=kwargs.get("cellBorders", True),
                bondDisplayWidth=bondDisplayWidth,
                highlightStyleFit=kwargs.get("highlightStyleFit", "ballAndStickInverse"),
            )
            #
            aML = oed.alignPair(imagePath=imagePath)
            logger.info("Aligned atom count %d", len(aML))
            return True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False

    def toMolFile(self, identifier, identifierType, molfilePath=None, fmt="mol", **kwargs):
        """Create molfile (fmt) from InChI, SMILES descriptors or PDB identifier.

        Returns:
            str: molfile path on success or None otherwise
        """
        try:
            molfilePath = molfilePath if molfilePath else self.__makeMolfilePath(fmt=fmt)
            oeio = OeIoUtils()
            oeMol = None
            if identifierType.lower() in ["smiles"]:
                oeMol = oeio.smilesToMol(identifier)
                oeMol.SetTitle("From SMILES")
            elif identifierType.lower() in ["inchi"]:
                oeMol = oeio.inchiToMol(identifier)
                oeMol.SetTitle("From InChI")
            elif identifierType.lower() in ["identifierpdb"]:
                ccsw = ChemCompSearchWrapper()
                oesmP = ccsw.getSearchMoleculeProvider()
                oeMol = oesmP.getMol(identifier)
            #
            ok = self.__toMolFile(oeMol, molfilePath, **kwargs)
            return molfilePath if ok else None
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return None

    def __toMolFile(self, oeMol, molfilePath, **kwargs):
        """Write the input molecule to a molfile.

        Args:
            oeMol (object): instance of an OE graph molecule
            molfilePath (string): file path for molfile (type determined by extension)

        Returns:
            bool: True for success or False otherwise
        """
        try:
            _ = kwargs
            oeio = OeIoUtils()
            oeio.write(molfilePath, oeMol, constantMol=True)
            return True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False

    def __makeMolfilePath(self, fmt="mol"):
        """Return the next rotating molfile path under the configured image directory."""
        imageDirPath = self.__configD["imageDirPath"] if self.__configD and "imageDirPath" in self.__configD else "."
        fileRotateIncrement = self.__configD["fileRotateIncrement"] if self.__configD and "fileRotateIncrement" in self.__configD else 50
        ic = self.__imageCount % fileRotateIncrement
        molPath = os.path.join(imageDirPath, "molfile-%s.%s" % (ic, fmt))
        return molPath
class OeIoUtils(object):
    """Utility methods to manage OE specific IO and format conversion operations."""

    def __init__(self, **kwargs):
        self.__dirPath = kwargs.get("dirPath", ".")
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__oeErrorLevel = oechem.OEErrorLevel_Info
        if kwargs.get("quietFlag", False):
            self.setQuiet()

    #
    def setQuiet(self):
        """Suppress OE warnings and processing errors"""
        oechem.OEThrow.SetLevel(oechem.OEErrorLevel_Quiet)
        self.__oeErrorLevel = oechem.OEErrorLevel_Quiet

    def getComponentDefinitions(self, ccdFilePath):
        """Read chemical component definitions from an mmCIF file.

        Returns:
            list: data container objects (empty on failure)
        """
        rdCcObjL = []
        try:
            rdCcObjL = self.__mU.doImport(ccdFilePath, fmt="mmcif")
            logger.info("Read %s with %d definitions", ccdFilePath, len(rdCcObjL))
        except Exception as e:
            logger.exception("Loading %s failing with %s", ccdFilePath, str(e))
        return rdCcObjL

    def suppressHydrogens(self, oeMol):
        """Return a copy of the input molecule with explicit hydrogens suppressed (None passthrough)."""
        tMol = oechem.OEMol(oeMol) if oeMol else None
        if tMol:
            oechem.OESuppressHydrogens(tMol)
        return tMol

    def chemCompToMol(self, ccdFilePath, molBuildType="model-xyz", quietFlag=False):
        """Build OE molecules from the chemical component definitions in the input mmCIF file.

        Returns:
            list: OE molecule objects (skips definitions that fail to build)
        """
        retMolL = []
        try:
            rdCcObjL = self.__mU.doImport(ccdFilePath, fmt="mmcif")
            logger.info("Read %s with %d definitions", ccdFilePath, len(rdCcObjL))
            oemf = OeMoleculeFactory()
            if quietFlag:
                oemf.setQuiet()
            for ccObj in rdCcObjL:
                ccId = oemf.setChemCompDef(ccObj)
                if ccId:
                    ok = oemf.build(molBuildType=molBuildType)
                    if ok:
                        oeMol = oemf.getMol()
                        retMolL.append(oeMol)
        except Exception as e:
            logger.exception("Loading %s failing with %s", ccdFilePath, str(e))
        return retMolL

    def descriptorToSmiles(self, descr, descrType, limitPerceptions=False, messageTag=None):
        """Parse the input descriptor string and return an OE smiles.

        Args:
            descr (str): descriptor
            descrType (str): descriptor type
            limitPerceptions (bool): flag to limit the perceptions/transformations of input descriptor
            messageTag (str, optional): prefix string for error messages. Defaults to None.

        Returns:
            str: SMILES string or None for failure
        """
        try:
            if "SMILES" in descrType.upper() and "ISO" in descrType.upper():
                oeMol = self.smilesToMol(descr, limitPerceptions=limitPerceptions, messageTag=messageTag)
                if oeMol:
                    return oechem.OECreateIsoSmiString(oeMol)
                else:
                    return None
            if "SMILES" in descrType.upper():
                oeMol = self.smilesToMol(descr, limitPerceptions=limitPerceptions, messageTag=messageTag)
                if oeMol:
                    return oechem.OECreateCanSmiString(oeMol)
                else:
                    return None
            elif "INCHI" in descrType.upper():
                oeMol = self.inchiToMol(descr, limitPerceptions=limitPerceptions, messageTag=messageTag)
                if oeMol:
                    return oechem.OECreateIsoSmiString(oeMol)
                else:
                    return None
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return None

    def descriptorToMol(self, descr, descrType, limitPerceptions=False, messageTag=None):
        """Parse the input descriptor string and return a molecule object (OeGraphMol/OeQMol).

        Args:
            descr (str): descriptor
            descrType (str): descriptor type (SMILES/ISO-SMILES/InChI/SMARTS)
            limitPerceptions (bool): flag to limit the perceptions/transformations of input descriptor
            messageTag (str, optional): prefix string for error messages. Defaults to None.

        Returns:
            object: OeGraphMol()/OeQMol() object or None for failure

        Note: SMILES/InChI inputs are round-tripped through a canonical SMILES
        re-parse to normalize perceptions.
        """
        try:
            if "SMILES" in descrType.upper() and "ISO" in descrType.upper():
                oeMol = self.smilesToMol(descr, limitPerceptions=limitPerceptions, messageTag=messageTag)
                if oeMol:
                    isoSmiles = oechem.OECreateIsoSmiString(oeMol)
                    return self.smilesToMol(isoSmiles, limitPerceptions=limitPerceptions, messageTag=messageTag)
                else:
                    return None
            if "SMILES" in descrType.upper():
                oeMol = self.smilesToMol(descr, limitPerceptions=limitPerceptions, messageTag=messageTag)
                if oeMol:
                    smiles = oechem.OECreateCanSmiString(oeMol)
                    return self.smilesToMol(smiles, limitPerceptions=limitPerceptions, messageTag=messageTag)
                else:
                    return None
            elif "INCHI" in descrType.upper():
                oeMol = self.inchiToMol(descr, limitPerceptions=limitPerceptions, messageTag=messageTag)
                if oeMol:
                    isoSmiles = oechem.OECreateIsoSmiString(oeMol)
                    return self.smilesToMol(isoSmiles, limitPerceptions=limitPerceptions, messageTag=messageTag)
            elif "SMARTS" in descrType.upper():
                return self.smartsToQmol(descr, messageTag=messageTag)
            else:
                return None
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return None

    def smilesToMol(self, smiles, limitPerceptions=False, messageTag=None):
        """Parse the input SMILES string and return a molecule object (OeGraphMol).

        Args:
            smiles (str): SMILES string
            limitPerceptions (bool): flag to limit the perceptions/transformations of input SMILES

        Returns:
            object: OeGraphMol() object or None for failure
        """
        try:
            label = messageTag if messageTag else ""
            mol = oechem.OEGraphMol()
            # fix: the stripped value was previously discarded (str.strip() is not in-place)
            smiles = smiles.strip()
            if limitPerceptions:
                # convert the SMILES string into a molecule without kekulization/aromaticity perception
                if oechem.OEParseSmiles(mol, smiles, False, False):
                    return mol
                else:
                    logger.debug("%s parsing failed for input SMILES string %s", label, smiles)
                    logger.error("%s parsing failed for input SMILES string", label)
            else:
                if oechem.OESmilesToMol(mol, smiles):
                    return mol
                else:
                    logger.debug("%s converting failed for input SMILES string %s", label, smiles)
                    logger.error("%s converting failed for input SMILES string", label)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return None

    def inchiToMol(self, inchi, limitPerceptions=False, messageTag=None):
        """Parse the input InChI string and return a molecule object (OeGraphMol).

        Args:
            inchi (str): InChI string
            limitPerceptions (bool): flag to limit the perceptions/transformations of input InChI

        Returns:
            object: OeGraphMol() object or None for failure
        """
        try:
            label = messageTag if messageTag else ""
            mol = oechem.OEGraphMol()
            inchi = inchi.strip()
            if limitPerceptions:
                if oechem.OEParseInChI(mol, inchi):
                    return mol
                else:
                    logger.debug("%s parsing failed for InChI string %r", label, inchi)
                    logger.error("%s parsing failed for InChI string", label)
            else:
                if oechem.OEInChIToMol(mol, inchi):
                    return mol
                else:
                    logger.debug("%s converting failed for InChI string %r", label, inchi)
                    logger.error("%s converting failed for InChI string", label)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return None

    def descriptorToQMol(self, descr, descrType, limitPerceptions=False, messageTag=None):
        """Parse the input descriptor string and return a query molecule object (OeQMol).

        Args:
            descr (str): descriptor
            descrType (str): descriptor type
            limitPerceptions (bool): flag to limit the perceptions/transformations of input descriptor
            messageTag (str, optional): prefix string for error messages. Defaults to None.

        Returns:
            object: OeQMol() object or None for failure
        """
        oeQMol = label = None
        try:
            label = messageTag if messageTag else ""
            tMol = self.descriptorToMol(descr, descrType, limitPerceptions=limitPerceptions, messageTag=messageTag)
            if tMol:
                oeQMol = oechem.OEQMol(tMol)
        except Exception as e:
            logger.error("%s Failing for with %s", label, str(e))
        return oeQMol if oeQMol else None

    def smartsToQmol(self, smarts, messageTag=None):
        """Parse the input SMARTS query string and return a query molecule object (OeQMol).

        Args:
            smarts (str): SMARTS query string

        Returns:
            object : OeQMol() object or None for failure
        """
        try:
            label = messageTag if messageTag else ""
            qmol = oechem.OEQMol()
            if oechem.OEParseSmarts(qmol, smarts):
                return qmol
            else:
                logger.debug("%s parsing failed for SMARTS string %s", label, smarts)
                logger.error("%s parsing failed for SMARTS string", label)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return None

    def fileToMols(self, filePath, use3D=False, largestPart=False):
        """Parse the input path returning a list of molecule objects (OeGraphMol).

        Args:
            filePath (str): file path must have standard recognized extension ('mol', 'sdf', 'smi', 'oeb').
            use3D (bool, optional): apply 3D perceptions. Defaults to False (2D).
            largestPart (bool, optional): keep only the largest bonded part of each molecule.

        Returns:
            list : list of OeGraphMol() objects
        """
        mL = []
        oemf = OeMoleculeFactory()
        try:
            ifs = oechem.oemolistream()
            if ifs.open(filePath):
                for tMol in ifs.GetOEGraphMols():
                    oeMol = oechem.OEGraphMol(tMol)
                    if largestPart:
                        # getParts() is assumed to return parts largest-first - TODO confirm
                        molL = oemf.getParts(oeMol)
                        if len(molL) > 0:
                            oeMol = molL[0]
                            logger.info("Using largest bonded molecule part (%d/%d)", len(molL), oeMol.NumAtoms())
                    if use3D:
                        mL.append(oemf.updateOePerceptions3D(oeMol, aromaticModel=oechem.OEAroModelOpenEye))
                    else:
                        mL.append(oemf.updateOePerceptions2D(oeMol, aromaticModel=oechem.OEAroModelOpenEye))
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return mL

    def stringToMols(self, txt, sType="mol2", use3D=False):
        """Parse the input string as input format type (sType) returning a list of molecule objects (OeGraphMol)

        Args:
            txt (str): string text of molecule data
            sType (str, optional): string data format (mol2, sdf, smiles) . Defaults to "mol2".
            use3D (bool, optional): apply 3D perceptions. Defaults to False (2D).

        Returns:
            list: list of OeGraphMol() objects or None for unsupported input
        """
        #
        mL = []
        oemf = OeMoleculeFactory()
        try:
            if sType not in ["mol2", "sdf", "smiles"]:
                logger.error("Unsupported string data format")
                return None
            fD = {"mol2": oechem.OEFormat_MOL2, "sdf": oechem.OEFormat_SDF, "smiles": oechem.OEFormat_SMI}
            ifs = oechem.oemolistream()
            # fix: look up the format by the sType variable (previously the literal key "sType" -> KeyError)
            ifs.SetFormat(fD[sType])
            if not ifs.openstring(txt):
                logger.error("Unable open string data for molecule reader")
                return None
            for tMol in ifs.GetOEGraphMols():
                oeMol = oechem.OEGraphMol(tMol)
                if use3D:
                    mL.append(oemf.updateOePerceptions3D(oeMol, aromaticModel=oechem.OEAroModelOpenEye))
                else:
                    mL.append(oemf.updateOePerceptions2D(oeMol, aromaticModel=oechem.OEAroModelOpenEye))
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return mL

    def readOeBinaryMolCache(self, filePath):
        """Return a dictionary of OeGraphMol() objects read from the cached binary file.

        Args:
            filePath (str): file path for the binary OeMol cache

        Returns:
            dict: dictionary of OeGraphMol()'s {<ccId>: OeGraphMol(), ... }
        """
        retD = {}
        startTime = time.time()
        try:
            ifs = oechem.oemolistream()
            if ifs.open(filePath):
                for oeMol in ifs.GetOEGraphMols():
                    tMol = oechem.OEGraphMol(oeMol)
                    # molecules are keyed by their title (the chemical component id)
                    retD[tMol.GetTitle()] = tMol
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        endTime = time.time()
        logger.info("Completed operation at %s (%.4f seconds)", time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - startTime)
        return retD

    def createOeFingerPrintDatabase(self, oeMolDbFilePath, oeFpDbFilePath, fpType="TREE", dbType="FAST"):
        """Create a fingerprint database of the requested type (only "FAST" requires a build step)."""
        if dbType == "FAST":
            return self.__createOeFastFingerPrintDatabase(oeMolDbFilePath, oeFpDbFilePath, fpType=fpType)
        else:
            # conventional databases are built at load time - nothing to do here
            return True

    def __createOeFastFingerPrintDatabase(self, oeMolDbFilePath, oeFpDbFilePath, fpType="TREE"):
        """Create fast search fingerprint database from the input molecular database.

        Args:
            oeMolDbFilePath (str): path to the input molecular database
            oeFpDbFilePath (str): path to the output fingerprint database
            fpType (str): finger print type (TREE|CIRCULAR|PATH)

        Returns:
            bool: True for success or False otherwise

        Supports:  OEFPType_Circular, OEFPType_Path, OEFPType_Tree
        Not currently supported by OE fp search: OEFPType_MACCS166, OEFPType_Lingo
        """
        startTime = time.time()
        ok = False
        try:
            fpD = {"TREE": oegraphsim.OEFPType_Tree, "CIRCULAR": oegraphsim.OEFPType_Circular, "PATH": oegraphsim.OEFPType_Path}
            myFpType = fpD[fpType] if fpType in fpD else oegraphsim.OEFPType_Tree
            opts = oegraphsim.OECreateFastFPDatabaseOptions(oegraphsim.OEGetFPType(myFpType))
            ok = oegraphsim.OECreateFastFPDatabaseFile(oeFpDbFilePath, oeMolDbFilePath, opts)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        endTime = time.time()
        logger.info("Completed operation at %s (%.4f seconds)", time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - startTime)
        return ok

    def loadOeFingerPrintDatabase(self, oeMolDbFilePath, oeFpDbFilePath, inMemory=False, fpType="TREE", fpDbType="FAST"):
        """Load a fingerprint database of the requested flavor (FAST file-backed or conventional)."""
        if fpDbType == "FAST":
            return self.__loadOeFastFingerPrintDatabase(oeFpDbFilePath, inMemory=inMemory, fpType=fpType)
        else:
            return self.__loadOeFingerPrintDatabase(oeMolDbFilePath, fpType=fpType)

    def __loadOeFingerPrintDatabase(self, oeMolDbFilePath, fpType="TREE"):
        """Build a conventional search fingerprint database from the input molecular database.

        Args:
            oeMolDbFilePath (str): path to the input molecular database
            fpType (str): finger print type (TREE|CIRCULAR|PATH|MACCS|LINGO)

        Returns:
            object: OEFPDatabase object or None for failure
        """
        fpDb = None
        ok = False
        startTime = time.time()
        try:
            fpD = {
                "TREE": oegraphsim.OEFPType_Tree,
                "CIRCULAR": oegraphsim.OEFPType_Circular,
                "PATH": oegraphsim.OEFPType_Path,
                "MACCS": oegraphsim.OEFPType_MACCS166,
                "LINGO": oegraphsim.OEFPType_Lingo,
            }
            fpType = fpType if fpType and fpType in fpD else "TREE"
            # precomputed fingerprints are stored as molecule data under this tag
            tag = "FP_" + fpType
            oeFpType = fpD[fpType] if fpType in fpD else oegraphsim.OEFPType_Tree
            oeMolDb = self.loadOeBinaryDatabaseAndIndex(oeMolDbFilePath)
            #
            fpDb = oegraphsim.OEFPDatabase(oeFpType)
            numMols = oeMolDb.GetMaxMolIdx()
            logger.debug("fpType %r tag %r oeFpType %r", fpType, tag, oeFpType)
            oeMol = oechem.OEGraphMol()
            for idx in range(0, numMols):
                if oeMolDb.GetMolecule(oeMol, idx):
                    if oeMol.HasData(tag):
                        # reuse the precomputed fingerprint when present
                        tfp = oeMol.GetData(tag)
                        fpDb.AddFP(tfp)
                    else:
                        fpDb.AddFP(oeMol)
                else:
                    logger.info("Missing molecule at index %r", idx)
            numFp = fpDb.NumFingerPrints()
            ok = numMols == numFp
            logger.info("Loaded molecules %d %s fingerprints %d (%.4f seconds)", numMols, fpType, numFp, time.time() - startTime)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            fpDb = None
        endTime = time.time()
        logger.debug("Completed with status %r operation at %s (%.4f seconds)", ok, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - startTime)
        return fpDb

    def __loadOeFastFingerPrintDatabase(self, oeFpDbFilePath, inMemory=False, fpType="TREE"):
        """Open a fast fingerprint database file (memory-mapped by default).

        Returns:
            object: OEFastFPDatabase object (may be invalid when the file is missing/corrupt)
        """
        # fpType is fixed at database creation time and is not consulted here
        _ = fpType
        startTime = time.time()
        if inMemory:
            memType = oegraphsim.OEFastFPDatabaseMemoryType_InMemory
        else:
            memType = oegraphsim.OEFastFPDatabaseMemoryType_MemoryMapped
        if not self.__mU.exists(oeFpDbFilePath):
            logger.error("Missing fingerprint database file %r", oeFpDbFilePath)
        fpDb = oegraphsim.OEFastFPDatabase(oeFpDbFilePath, memType)
        if not fpDb.IsValid():
            logger.error("Cannot open fingerprint database %r", oeFpDbFilePath)
        #
        lenFp = fpDb.NumFingerPrints()
        memTypeStr = fpDb.GetMemoryTypeString()
        endTime = time.time()
        logger.info("Read fingerprint database length %d loaded %s (%.4f seconds)", lenFp, memTypeStr, endTime - startTime)
        return fpDb

    def loadOeBinaryDatabaseAndIndex(self, oeMolDbFilePath):
        """Open an OEMolDatabase from the input file path.

        Returns:
            object: OEMolDatabase object or None for failure
        """
        molDb = None
        try:
            molDb = oechem.OEMolDatabase()
            if not molDb.Open(oeMolDbFilePath):
                logger.error("Unable to open %r", oeMolDbFilePath)
            molCount = molDb.NumMols()
            logger.info("Loaded OE database file containing %d molecules", molCount)
        except Exception as e:
            logger.exception("Loading %r failing with %s", oeMolDbFilePath, str(e))
        return molDb

    def createOeBinaryDatabaseAndIndex(self, oebMolFilePath, oeMolDbFilePath):
        """Create OE binary database file and associated index from the input serial binary data file.

        Args:
            oebMolFilePath (str): input OeMol stream binary file path
            oeMolDbFilePath (str): output OeMolDatabase file path

        Returns:
            int: number of molecules processed in the database.
        """
        molCount = 0
        try:
            startTime = time.time()
            moldb = oechem.OEMolDatabase()
            if not moldb.Open(oebMolFilePath):
                logger.error("Read fails for %r", oebMolFilePath)
                return molCount
            #
            logger.info("Opened database in format %r num mols %d max index %d", moldb.GetFormat(), moldb.NumMols(), moldb.GetMaxMolIdx())
            moldb.Save(oeMolDbFilePath)
            tL = list(moldb.GetTitles())
            logger.info("First and last titles: %r %r", tL[0], tL[-1])
            molCount = moldb.NumMols()
            endTime = time.time()
            logger.info("Completed operation at %s (%.4f seconds)", time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - startTime)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return molCount

    def buildOeBinaryMolCache(self, filePath, ccObjD, molBuildType="model-xyz", quietFlag=False, fpTypeList=None, limitPerceptions=False, suppressHydrogens=False):
        """Build cache of OEMol() objects from the input chemical component definition list.

        Args:
            filePath (str): output cache file path
            ccObjD (dict): chemical component object dictionary
            molBuildType (str, optional): model coordinate source. Defaults to "model-xyz".
            quietFlag (bool, optional): suppress OE output. Defaults to False.
            fpTypeList (list, optional): fingerprint type list. Defaults to None.
            limitPerceptions (bool, optional): suppress automatic chemical perceptions. Defaults to False.
            suppressHydrogens (bool, optional): suppress explicit hydrogen count. Defaults to False.

        Returns:
            (int, int, list): chem comp success count, error count, chem comp identifier failure list
        """
        ok = False
        startTime = time.time()
        failIdList = []
        ccCount = 0
        errCount = 0
        try:
            ofs = oechem.oemolostream()
            ofs.SetFormat(oechem.OEFormat_OEB)
            if ofs.open(filePath):
                oemf = OeMoleculeFactory()
                if quietFlag:
                    oemf.setQuiet()
                for ccId, ccObj in ccObjD.items():
                    tId = oemf.setChemCompDef(ccObj)
                    if tId and tId == ccId:
                        ok = oemf.build(molBuildType=molBuildType, limitPerceptions=limitPerceptions)
                        if ok and fpTypeList:
                            fpOk = oemf.addFingerPrints(fpTypeList)
                            if not fpOk:
                                logger.info("Fingerprint generation fails for %r", ccId)
                        if ok:
                            oeMol = oemf.getMol(suppressHydrogens=suppressHydrogens)
                            oechem.OEWriteMolecule(ofs, oeMol)
                            ccCount += 1
                    if not ok or not tId:
                        # build failed incomplete component (e.g. missing atoms or bonds)
                        errCount += 1
                        failIdList.append(ccId)
            else:
                logger.error("Unable to open cache database %s", filePath)
                errCount += 1
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        endTime = time.time()
        logger.info("Completed operation at %s (%.4f seconds)", time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - startTime)
        return ccCount, errCount, failIdList

    #
    def buildOeBinaryMolCacheFromIndex(self, filePath, ccIdxD, quietFlag=False, fpTypeList=None, limitPerceptions=False, suppressHydrogens=False):
        """Build cache of OEGraphMol() objects from the input chemical component search index.

        Args:
            filePath (str): output cache file path
            ccIdxD (dict): search index dictionary
            quietFlag (bool, optional): suppress OE output. Defaults to False.
            fpTypeList (list, optional): list of fingerprint types. Defaults to None.
            limitPerceptions (bool, optional): suppress automatic chemical perceptions. Defaults to False.
            suppressHydrogens (bool, optional): suppress explicit hydrogen count. Defaults to False.

        Returns:
            (int, int, list): chem comp success count, error count, chem comp identifier failure list
        """
        failIdList = []
        ccCount = 0
        errCount = 0
        startTime = time.time()
        try:
            ofs = oechem.oemolostream()
            ofs.SetFormat(oechem.OEFormat_OEB)
            if ofs.open(filePath):
                oemf = OeMoleculeFactory()
                if quietFlag:
                    oemf.setQuiet()
                for searchCcId, ccIdx in ccIdxD.items():
                    # molecules are rebuilt from the indexed isomeric SMILES
                    oemf.setDescriptor(ccIdx["smiles"], "oe-iso-smiles", searchCcId)
                    ok = oemf.build(molBuildType="oe-iso-smiles", limitPerceptions=limitPerceptions)
                    if ok and fpTypeList:
                        fpOk = oemf.addFingerPrints(fpTypeList)
                        if not fpOk:
                            logger.info("Fingerprint generation fails for %r", searchCcId)
                    if ok:
                        if not suppressHydrogens:
                            oemf.addExplicitHydrogens()
                            oemf.setSimpleAtomNames()
                        oeMol = oemf.getMol(suppressHydrogens=suppressHydrogens)
                        oechem.OEWriteMolecule(ofs, oeMol)
                        ccCount += 1
                    if not ok:
                        # build failed incomplete component (e.g. missing atoms or bonds)
                        errCount += 1
                        failIdList.append(searchCcId)
            else:
                logger.error("Unable to open cache database %s", filePath)
                errCount += 1
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        endTime = time.time()
        logger.info("Completed operation at %s (%.4f seconds)", time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - startTime)
        return ccCount, errCount, failIdList

    def createOeSubSearchDatabase(self, oebMolFilePath, oeSubSearchFilePath, screenType="SMARTS", numProc=2):
        """Create a substructure search screen database from the input OE binary molecule file.

        Args:
            screenType (str): MOLECULE|MDL|SMARTS screen flavor
            numProc (int): processor count for the build

        Returns:
            bool: True for success or False otherwise
        """
        sort = True
        keepTitle = True
        # NOTE(review): an unrecognized screenType leaves myScreenType None and the
        # options constructor will fail - confirm callers only pass the three values below.
        myScreenType = None
        if screenType == "MOLECULE":
            myScreenType = oechem.OEGetSubSearchScreenType(oechem.OESubSearchScreenType_Molecule)
        elif screenType == "MDL":
            myScreenType = oechem.OEGetSubSearchScreenType(oechem.OESubSearchScreenType_MDL)
        elif screenType == "SMARTS":
            myScreenType = oechem.OEGetSubSearchScreenType(oechem.OESubSearchScreenType_SMARTS)
        opts = oechem.OECreateSubSearchDatabaseOptions(myScreenType)
        opts.SetSortByBitCounts(sort)
        opts.SetKeepTitle(keepTitle)
        opts.SetNumProcessors(numProc)
        screenStr = myScreenType.GetName()
        logger.info("Using %d processor(s) to generate database with %s", numProc, screenStr)
        tracer = oechem.OEConsoleProgressTracer()
        ok = oechem.OECreateSubSearchDatabaseFile(oeSubSearchFilePath, oebMolFilePath, opts, tracer)
        return ok

    def loadOeSubSearchDatabase(self, oeSubSearchFilePath, screenType=None, numProc=1):
        """Open a substructure search database file.

        Returns:
            object: OESubSearchDatabase object or None for failure
        """
        ssDb = None
        try:
            _ = screenType
            ssDb = oechem.OESubSearchDatabase(oechem.OESubSearchDatabaseType_Default, numProc)
            tracer = oechem.OEConsoleProgressTracer()
            if not ssDb.Open(oeSubSearchFilePath, tracer):
                logger.error("Unable to open %r", oeSubSearchFilePath)
            logger.info("Opened %r with %r molecules", oeSubSearchFilePath, ssDb.NumMolecules())
        except Exception as e:
            logger.exception("Loading %r failing with %s", oeSubSearchFilePath, str(e))
        return ssDb

    def write(self, filePath, oeMol, constantMol=False, addSdTags=True):
        """Write an oeMol with format type inferred from the filePath extension (e.g. .mol)

        Args:
            filePath (str): filepath with a chemical type extension
            constantMol (bool, optional): copies molecule before performing format specific perceptions
            addSdTags (bool, optional): add SD data tags before writing. Defaults to True.

        Returns:
            bool: True for success or False otherwise
        """
        try:
            molId = os.path.splitext(os.path.basename(filePath))[0]
            fmt = os.path.splitext(os.path.basename(filePath))[1][1:].lower()
            #
            if addSdTags:
                oemf = OeMoleculeFactory()
                oemf.setOeMol(oeMol, molId)
                oemf.addSdTags()
                oeMol = oemf.getMol()
            #
            self.__mU.mkdir(os.path.dirname(filePath))
            ofs = oechem.oemolostream()
            ofs.open(filePath)
            logger.debug("Writing (fmt=%s) molId %s path %s title %s", fmt, molId, filePath, oeMol.GetTitle())
            #
            if constantMol:
                oechem.OEWriteConstMolecule(ofs, oeMol)
            else:
                oechem.OEWriteMolecule(ofs, oeMol)
            #
            # If this is a mol2/mol2h file, substitute the default substructure id with the molecule id
            if fmt.startswith("mol2"):
                with open(filePath, "r", encoding="utf-8") as ifh:
                    lines = ifh.readlines()
                lines = [line.replace("<0>", molId) for line in lines]
                with open(filePath, "w", encoding="utf-8") as ofh:
                    ofh.writelines(lines)
            return True
        except Exception as e:
            logger.exception("Failing for %s with %s", filePath, str(e))
        return False

    def serializeOe(self, oeMol):
        """Create a string representing the content of the current OE molecule.

        This serialization uses the OE internal binary format.

        Returns:
            str/bytes: serialized molecule or None for failure
        """
        try:
            oms = oechem.oemolostream()
            oms.SetFormat(oechem.OEFormat_OEB)
            oms.openstring()
            oechem.OEWriteMolecule(oms, oeMol)
            logger.debug("SMILES %s", oechem.OECreateCanSmiString(oeMol))
            logger.debug("Atoms = %d", oeMol.NumAtoms())
            return oms.GetString()
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return None

    def deserializeOe(self, oeS):
        """Reconstruct an OE molecule from the input string serialization (OE binary).

        The deserialized molecule is used to initialize the internal OE molecule
        within this object.

        Returns:
            list: OE GraphMol list
        """
        molList = []
        try:
            ims = oechem.oemolistream()
            ims.SetFormat(oechem.OEFormat_OEB)
            ims.openstring(oeS)
            for mol in ims.GetOEGraphMols():
                logger.debug("SMILES %s", oechem.OECreateCanSmiString(mol))
                logger.debug("title %s", mol.GetTitle())
                logger.debug("atoms %d", mol.NumAtoms())
                molList.append(oechem.OEGraphMol(mol))
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return molList
class EcodClassificationProvider(StashableBase):
    """Extract ECOD domain assignments, term descriptions and ECOD classification hierarchy
    from ECOD flat files.

    http://prodata.swmed.edu/ecod/

    See: H. Cheng, R. D. Schaeffer, Y. Liao, L. N. Kinch, J. Pei, S. Shi, B. H. Kim,
    N. V. Grishin. (2014) ECOD: An evolutionary classification of protein domains.
    PLoS Comput Biol 10(12): e1003926.

    Linking details:
        http://prodata.swmed.edu/ecod/complete/domain/<domainId>
        http://prodata.swmed.edu/ecod/complete/domain/e6sl5G1
    """

    # --
    def __init__(self, cachePath, useCache, **kwargs):
        """
        Args:
            cachePath (str): top-level cache directory path
            useCache (bool): True to load from the local cache, False to rebuild from source
            ecodTargetUrl (str, optional): primary URL for the ECOD domain flat file
            ecodUrlBackupPath (str, optional): fallback URL for the ECOD domain flat file
        """
        self.__cachePath = cachePath
        self.__useCache = useCache
        dirName = "ecod"
        super(EcodClassificationProvider, self).__init__(self.__cachePath, [dirName])
        self.__dirPath = os.path.join(cachePath, "ecod")
        self.__version = None
        #
        urlTarget = kwargs.get("ecodTargetUrl", "http://prodata.swmed.edu/ecod/distributions/ecod.latest.domains.txt")
        urlBackup = kwargs.get("ecodUrlBackupPath", "https://raw.githubusercontent.com/rcsb/py-rcsb_exdb_assets/master/fall_back/ECOD/ecod.latest.domains.txt.gz")
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__pD, self.__nD, self.__ntD, self.__pdbD = self.__reload(urlTarget, urlBackup, self.__dirPath, useCache=useCache)

    def testCache(self):
        """Return True when the loaded name and assignment mappings appear complete."""
        logger.info("ECOD Lengths nD %d pdbD %d", len(self.__nD), len(self.__pdbD))
        if (len(self.__nD) > 100) and (len(self.__pdbD) > 5000):
            return True
        return False

    def getVersion(self):
        """Return the ECOD data version string (set during load/rebuild)."""
        return self.__version

    # --
    def getFamilyIds(self, pdbId, authAsymId):
        """Return the unique ECOD family ids assigned to the input entry/chain."""
        try:
            return list(set([tup[1] for tup in self.__pdbD[(pdbId.lower(), authAsymId)]]))
        except Exception as e:
            logger.exception("Failing for %r %r with %s", pdbId, authAsymId, str(e))
        return []

    def getDomainIds(self, pdbId, authAsymId):
        """Return the unique ECOD domain ids assigned to the input entry/chain."""
        try:
            return list(set([tup[0] for tup in self.__pdbD[(pdbId.lower(), authAsymId)]]))
        except Exception as e:
            logger.exception("Failing for %r %r with %s", pdbId, authAsymId, str(e))
        return []

    def getFamilyNames(self, pdbId, authAsymId):
        """Return the unique ECOD family names assigned to the input entry/chain."""
        try:
            return list(set([self.getName(tup[1]) for tup in self.__pdbD[(pdbId.lower(), authAsymId)]]))
        except Exception as e:
            logger.exception("Failing for %r %r with %s", pdbId, authAsymId, str(e))
        return []

    def getFamilyResidueRanges(self, pdbId, authAsymId):
        """Return (domainId, familyId, authAsymId, authSeqBeg, authSeqEnd) tuples for the entry/chain."""
        try:
            # pdbD.setdefault((pdbId, authAsymId), []).append((domId, fId, authAsymId, authSeqBeg, authSeqEnd))
            return [(tup[0], tup[1], tup[2], tup[3], tup[4]) for tup in self.__pdbD[(pdbId.lower(), authAsymId)]]
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId, str(e))
        return []

    def getName(self, domId):
        """Return the display name for the input internal node id (first '|' delimited field)."""
        try:
            return self.__nD[domId].split("|")[0]
        except Exception:
            logger.debug("Undefined ECOD id %r", domId)
        return None

    def getNameType(self, domId):
        """Return the hierarchy level name (Architecture/.../Family) for the input node id."""
        qD = {"A": "Architecture", "X": "Possible Homology", "H": "Homology", "T": "Topology", "F": "Family"}
        try:
            return qD[self.__ntD[domId]]
        except Exception:
            logger.debug("Undefined ECOD id %r", domId)
        return None

    def getIdLineage(self, domId):
        """Return the node id lineage from root to the input node id (inclusive)."""
        pList = []
        try:
            pList.append(domId)
            if domId == 0:
                # root node - no parents to walk
                return pList
            pt = self.__pD[domId]
            while (pt is not None) and (pt != 0):
                pList.append(pt)
                pt = self.__pD[pt]
        except Exception as e:
            logger.exception("Failing for %r with %s", domId, str(e))
        #
        pList.reverse()
        return pList

    def getNameLineage(self, domId):
        """Return the display-name lineage from root to the input node id (inclusive)."""
        try:
            nL = []
            for dId in self.getIdLineage(domId):
                tN = self.getName(dId)
                tN = tN if tN else "Unnamed"
                nL.append(tN)
            return nL
        except Exception as e:
            logger.exception("Failing for %r with %s", domId, str(e))
        return None

    def getTreeNodeList(self):
        """Return the exchange-style tree node list for the full ECOD hierarchy."""
        return self.__exportTreeNodeList(self.__pD)

    def __getDomainFileName(self):
        # Cache file name is keyed by the major Python version (pickle compatibility)
        pyVersion = sys.version_info[0]
        fn = "ecod_domains-py%s.pic" % str(pyVersion)
        return fn

    def __reload(self, urlTarget, urlBackup, ecodDirPath, useCache=True):
        """Load the ECOD mappings from the local cache, or rebuild them from source files.

        Returns:
            (dict, dict, dict, dict): parent map, name map, name-type map, assignment map
        """
        # Distinct empty mappings (the original aliased one shared dict across all four)
        pD = {}
        nD = {}
        ntD = {}
        pdbD = {}
        fn = self.__getDomainFileName()
        ecodDomainPath = os.path.join(ecodDirPath, fn)
        self.__mU.mkdir(ecodDirPath)
        #
        if useCache and self.__mU.exists(ecodDomainPath):
            sD = self.__mU.doImport(ecodDomainPath, fmt="pickle")
            logger.debug("ECOD domain length %d", len(sD))
            nD = sD["names"]
            ntD = sD["nametypes"]
            pD = sD["parents"]
            pdbD = sD["assignments"]
            self.__version = sD["version"]
        elif not useCache:
            minLen = 1000
            logger.info("Fetch ECOD name and domain assignment data from primary data source %s", urlTarget)
            nmL = self.__fetchFromSource(urlTarget)
            if not nmL:
                nmL = self.__fetchFromSource(urlBackup)
            #
            logger.info("ECOD raw file length (%d)", len(nmL))
            ok = False
            pD, nD, ntD, pdbD = self.__extractDomainHierarchy(nmL)
            #
            tS = datetime.datetime.now().isoformat()
            vS = self.__version
            sD = {"version": vS, "created": tS, "names": nD, "nametypes": ntD, "parents": pD, "assignments": pdbD}
            # Only persist a cache that looks complete
            if (len(nD) > minLen) and (len(pD) > minLen):
                ok = self.__mU.doExport(ecodDomainPath, sD, fmt="pickle")
            logger.debug("Cache save status %r", ok)
            #
        # NOTE(review): useCache=True with a missing cache file falls through and returns
        # empty mappings (no rebuild is attempted on that path) - confirm this is intended.
        return pD, nD, ntD, pdbD

    def __fetchFromSource(self, urlTarget):
        """Fetch the classification names and domain assignments from the ECOD repo."""
        fU = FileUtil()
        fn = fU.getFileName(urlTarget)
        fp = os.path.join(self.__dirPath, fn)
        if not fU.exists(fp):
            fU.get(urlTarget, fp)
        #
        # The data version appears on the third comment line of the file
        with open(fp, "r", encoding="utf-8") as ifh:
            line = ifh.readline()
            line = ifh.readline()
            line = ifh.readline()
            ff = line[:-1].split()
            self.__version = ff[-1]
        #
        nmL = self.__mU.doImport(fp, fmt="list", uncomment=True)
        fU.remove(fp)
        #
        return nmL

    def __extractDomainHierarchy(self, nmL):
        """Extract the ECOD hierarchy and per-chain domain assignments from raw file rows.

        Example tab-delimited source rows::

            #uid ecod_domain_id manual_rep f_id pdb chain pdb_range seqid_range unp_acc arch_name x_name h_name t_name f_name asm_status ligand
            002728551 e7d2xA1 AUTO_NONREP 1.1.1 7d2x A A:-3-183 A:20-206 NO_UNP beta barrels "cradle loop barrel" "RIFT-related" "acid protease" F_UNCLASSIFIED
            002728572 e7d5aA2 AUTO_NONREP 1.1.1 7d5a A A:-3-183 A:20-206 NO_UNP beta barrels "cradle loop barrel" "RIFT-related" "acid protease" F_UNCLASSIFIED

        Returns:
            (dict, dict, dict, dict): parent id map, node name map, node type map, and
            (entryId, authAsymId) -> [(ecodId, fId, authAsymId, seqBeg, seqEnd), ...]
        """
        assignD = {}
        pD = {}
        ntD = {}
        hD = {}
        pIdD = {}
        nmD = {}
        #
        logger.info("Length of input ECOD name list %d", len(nmL))
        for nm in nmL:
            ff = nm.split("\t")
            # uId = ff[0]
            # ecodId is the linkable identifier -
            ecodId = ff[1]
            entryId = ff[4].lower()
            authAsymId = ff[5]
            resRange = ff[6]
            #
            # There are no unique identifiers published for the internal elements of the hierarchy
            # so these are assigned here similar to scop - There are also many unnamed nodes
            # that are conventionally filled in from the leaf levels of the tree...
            # {"A": "Architecture", "X": "Possible Homology", "H": "Homology", "T": "Topology", "F": "Family"}
            aGroupOrg = "A: " + ff[9].replace('"', "")
            xGroupOrg = "X: " + ff[10].replace('"', "")
            hGroupOrg = "H: " + ff[11].replace('"', "")
            tGroupOrg = "T: " + ff[12].replace('"', "")
            fGroupOrg = "F: " + ff[13].replace('"', "")
            if hGroupOrg == "H: NO_H_NAME":
                # hGroupOrg = tGroupOrg + "|(NO_H)"
                hGroupOrg = "H: " + ff[12].replace('"', "") + " (From Topology)" + "|(NO_H)"
            if xGroupOrg == "X: NO_X_NAME":
                if ff[11].replace('"', "") == "NO_H_NAME":
                    # xGroupOrg = hGroupOrg + "|(NO_X)"
                    xGroupOrg = "X: " + ff[12].replace('"', "") + " (From Topology)" + "|(NO_X)"
                else:
                    xGroupOrg = "X: " + ff[11].replace('"', "") + " (From Homology)" + "|(NO_X)"
            #
            # NOTE(review): fGroupOrg always carries the "F: " prefix, so this comparison
            # can never match - the intended test was probably against ff[13]. Confirm.
            fGroupOrg = fGroupOrg if fGroupOrg != "F_UNCLASSIFIED" else "Unmapped domain of " + tGroupOrg
            #
            # Remove redundancy in names and assign unique ids
            aGroup = aGroupOrg
            xGroup = xGroupOrg + "|" + aGroupOrg
            hGroup = hGroupOrg + "|" + xGroupOrg + "|" + aGroupOrg
            tGroup = tGroupOrg + "|" + hGroupOrg + "|" + xGroupOrg
            fGroup = fGroupOrg + "|" + tGroupOrg
            #
            hD.setdefault("A", set()).add(aGroup)
            hD.setdefault("X", set()).add(xGroup)
            hD.setdefault("H", set()).add(hGroup)
            hD.setdefault("T", set()).add(tGroup)
            hD.setdefault("F", set()).add(fGroup)
            # NOTE(review): ids derive from the current set sizes, so a group seen again
            # after other groups were added is assigned a different id - confirm intended.
            aId = 100000 + len(hD["A"])
            xId = 200000 + len(hD["X"])
            hId = 300000 + len(hD["H"])
            tId = 400000 + len(hD["T"])
            fId = 500000 + len(hD["F"])
            #
            # Name-level parent consistency checks - conflicting rows are skipped
            if xGroup in pD and pD[xGroup] != aGroup:
                logger.error("skipping %r multiple parents for xGroup %r %r and %r ", ecodId, xGroup, pD[xGroup], aGroup)
                continue
            #
            if hGroup in pD and pD[hGroup] != xGroup:
                logger.error("skipping %r multiple parents for hGroup %r %r and %r ", ecodId, hGroup, pD[hGroup], xGroup)
                continue
            #
            if tGroup in pD and pD[tGroup] != hGroup:
                logger.error("skipping %r multiple parents for tGroup %r %r and %r ", ecodId, tGroup, pD[tGroup], hGroup)
                continue
            #
            if fGroup in pD and pD[fGroup] != tGroup:
                logger.error("skipping %r multiple parents for fGroup %r %r and %r ", ecodId, fGroup, pD[fGroup], tGroup)
                continue
            # Id-level checks are reported but not skipped
            if xId in pIdD and pIdD[xId] != aId:
                logger.error("skipped %r multiple parents for xId %r %r and %r ", ecodId, xId, pIdD[xId], aId)
            #
            if hId in pIdD and pIdD[hId] != xId:
                logger.error("skipped %r multiple parents for hId %r %r and %r ", ecodId, hId, pIdD[hId], xId)
            #
            if tId in pIdD and pIdD[tId] != hId:
                logger.error("skipped %r multiple parents for tId %r %r and %r ", ecodId, tId, pIdD[tId], hId)
            #
            if fId in pIdD and pIdD[fId] != tId:
                logger.error("skipped %r multiple parents for fId %r %r and %r ", ecodId, fId, pIdD[fId], tId)
            #
            pIdD[aId] = 0
            pIdD[xId] = aId
            pIdD[hId] = xId
            pIdD[tId] = hId
            pIdD[fId] = tId
            #
            nmD[aId] = aGroupOrg
            nmD[xId] = xGroupOrg
            nmD[hId] = hGroupOrg
            nmD[tId] = tGroupOrg
            nmD[fId] = fGroupOrg
            #
            ntD[aId] = "A"
            ntD[xId] = "X"
            ntD[hId] = "H"
            ntD[tId] = "T"
            ntD[fId] = "F"
            rL = self.__parseRanges(resRange)
            if (entryId, authAsymId) not in assignD:
                assignD[(entryId, authAsymId)] = [(ecodId, fId, t[0], t[1], t[2]) for t in rL]
            else:
                for t in rL:
                    assignD[(entryId, authAsymId)].append((ecodId, fId, t[0], t[1], t[2]))
        #
        return pIdD, nmD, ntD, assignD

    def __parseRanges(self, rS):
        """Parse an ECOD residue range string (e.g. "A:-3-183,B:10-50") into
        (authAsymId, authSeqBeg, authSeqEnd) tuples."""
        rL = []
        authAsymId = authSeqBeg = authSeqEnd = None
        try:
            tSL = rS.split(",")
            for tS in tSL:
                fL = tS.split(":")
                authAsymId = fL[0]
                rS = fL[1]
                if rS[0] == "-":
                    # leading minus marks a negative starting residue number
                    authSeqBeg = -int(rS[1:].split("-")[0])
                    authSeqEnd = int(rS[1:].split("-")[1])
                else:
                    authSeqBeg = int(rS.split("-")[0])
                    authSeqEnd = int(rS.split("-")[1])
                rL.append((authAsymId, authSeqBeg, authSeqEnd))
        except Exception:
            # best-effort parse - malformed range tokens are skipped deliberately
            pass
        return rL

    def __exportTreeNodeList(self, pD):
        """Create node list from name dictionary and lineage dictionaries."""
        #
        rootId = 0
        pL = [rootId]
        #
        logger.info("pD %d pL %r", len(pD), pL)
        # --
        # create child dictionary
        cD = {}
        for ctId, ptId in pD.items():
            cD.setdefault(ptId, []).append(ctId)
        #
        logger.info("cD %d", len(cD))
        #
        # breadth-first traversal from each root to order the nodes
        idL = []
        for rootId in sorted(pL):
            visited = set([rootId])
            queue = collections.deque(visited)
            while queue:
                tId = queue.popleft()
                idL.append(tId)
                if tId not in cD:
                    # logger.debug("No children for Ecod tId %s", tId)
                    continue
                for childId in cD[tId]:
                    if childId not in visited:
                        queue.append(childId)
                        visited.add(childId)
        #
        dL = []
        for tId in idL:
            displayName = self.getName(tId)
            ptId = pD[tId] if tId in pD else None
            lL = self.getIdLineage(tId)[1:]
            #
            if tId == rootId:
                continue
            elif ptId == rootId:
                dD = {"id": str(tId), "name": displayName, "depth": 0}
            else:
                dD = {"id": str(tId), "name": displayName, "parents": [str(ptId)], "depth": len(lL)}
            dL.append(dD)
        return dL
class ReferenceSequenceAssignmentProvider(object):
    """Utilities to cache content required to update reference sequence assignments."""

    def __init__(
        self,
        cfgOb,
        databaseName="pdbx_core",
        collectionName="pdbx_core_polymer_entity",
        polymerType="Protein",
        referenceDatabaseName="UniProt",
        provSource="PDB",
        maxChunkSize=100,
        fetchLimit=None,
        **kwargs
    ):
        """
        Args:
            cfgOb (object): configuration object
            databaseName (str, optional): source database name
            collectionName (str, optional): source collection name
            polymerType (str, optional): polymer type selection
            referenceDatabaseName (str, optional): reference sequence resource name
            provSource (str, optional): assignment provenance source
            maxChunkSize (int, optional): maximum fetch chunk size for reference entries
            fetchLimit (int, optional): limit on the number of entities fetched (testing)
        """
        self.__cfgOb = cfgOb
        self.__polymerType = polymerType
        self.__mU = MarshalUtil()
        #
        self.__maxChunkSize = maxChunkSize
        self.__statusList = []
        #
        self.__pfP = self.__fetchPfamProvider(self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
        self.__ipP = self.__fetchInterProProvider(self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
        self.__ssP = self.__fetchSiftsSummaryProvider(self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
        self.__goP = self.__fetchGoProvider(self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
        self.__ecP = self.__fetchEcProvider(self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
        self.__refIdMapD, self.__matchD, self.__refD = self.__reload(databaseName, collectionName, polymerType, referenceDatabaseName, provSource, fetchLimit, **kwargs)

    def goIdExists(self, goId):
        """Return True when the input GO id exists in the GO provider."""
        try:
            return self.__goP.exists(goId)
        except Exception as e:
            logger.exception("Failing for %r with %s", goId, str(e))
        return False

    def getGeneOntologyLineage(self, goIdL):
        """Return unique descendant {"id": ..., "name": ...} records for the input GO id list."""
        gL = []
        try:
            gTupL = self.__goP.getUniqueDescendants(goIdL)
            for gTup in gTupL:
                gL.append({"id": gTup[0], "name": gTup[1]})
        except Exception as e:
            logger.exception("Failing for %r with %s", goIdL, str(e))
        return gL

    def getPfamProvider(self):
        return self.__pfP

    def getInterProProvider(self):
        return self.__ipP

    def getEcProvider(self):
        return self.__ecP

    def getSiftsSummaryProvider(self):
        return self.__ssP

    def getMatchInfo(self):
        return self.__matchD

    def getRefData(self):
        return self.__refD

    def getDocuments(self, formatType="exchange"):
        """Return reference documents reformatted to the input format type."""
        fobj = UniProtUtils(saveText=False)
        exObjD = fobj.reformat(self.__refD, formatType=formatType)
        return list(exObjD.values())

    def getRefIdMap(self):
        return self.__refIdMapD

    def getRefDataCount(self):
        return len(self.__refD)

    def testCache(self, minMatchPrimaryPercent=None, logSizes=False):
        """Return True when the reference caches are populated (and optionally meet
        the input primary match percentage threshold)."""
        okC = True
        logger.info("Reference cache lengths: refIdMap %d matchD %d refD %d", len(self.__refIdMapD), len(self.__matchD), len(self.__refD))
        ok = bool(self.__refIdMapD and self.__matchD and self.__refD)
        #
        numRef = len(self.__refIdMapD)
        countD = defaultdict(int)
        logger.info("Match dictionary length %d", len(self.__matchD))
        for _, mD in self.__matchD.items():
            if "matched" in mD:
                countD[mD["matched"]] += 1
        logger.info("Reference length %d match length %d coverage %r", len(self.__refD), len(self.__matchD), countD.items())
        if minMatchPrimaryPercent:
            try:
                # a zero numRef raises here and fails the check below
                okC = 100.0 * float(countD["primary"]) / float(numRef) > minMatchPrimaryPercent
            except Exception:
                okC = False
            logger.info("Primary reference match percent test status %r", okC)
        #
        if logSizes:
            logger.info(
                "Pfam %.2f InterPro %.2f SIFTS %.2f GO %.2f EC %.2f RefIdMap %.2f RefMatchD %.2f RefD %.2f",
                getObjSize(self.__pfP) / 1000000.0,
                getObjSize(self.__ipP) / 1000000.0,
                getObjSize(self.__ssP) / 1000000.0,
                getObjSize(self.__goP) / 1000000.0,
                getObjSize(self.__ecP) / 1000000.0,
                getObjSize(self.__refIdMapD) / 1000000.0,
                getObjSize(self.__matchD) / 1000000.0,
                getObjSize(self.__refD) / 1000000.0,
            )
        return ok and okC

    def __reload(self, databaseName, collectionName, polymerType, referenceDatabaseName, provSource, fetchLimit, **kwargs):
        """Assemble the assignment map and rebuild/load the reference sequence cache."""
        assignRefD = self.__getPolymerReferenceSequenceAssignments(databaseName, collectionName, polymerType, fetchLimit)
        refIdMapD, _ = self.__getAssignmentMap(assignRefD, referenceDatabaseName=referenceDatabaseName, provSource=provSource)
        #
        # entry ids are the first four characters of the rcsb entity ids
        entryIdL = [rcsbId[:4] for rcsbId in assignRefD]
        siftsUniProtL = self.__ssP.getEntryUniqueIdentifiers(entryIdL, idType="UNPID")
        logger.info("Incorporating %d SIFTS accessions for %d entries", len(siftsUniProtL), len(entryIdL))
        unpIdList = sorted(set(list(refIdMapD.keys()) + siftsUniProtL))
        #
        logger.info("Rebuild cache for %d UniProt accessions (consolidated)", len(unpIdList))
        #
        matchD, refD = self.__rebuildReferenceCache(unpIdList, referenceDatabaseName, **kwargs)
        return refIdMapD, matchD, refD

    def __getPolymerReferenceSequenceAssignments(self, databaseName, collectionName, polymerType, fetchLimit):
        """Get all accessions assigned to input reference sequence database for the input polymerType.

        Returns:
         (dict): {"1abc_1": "rcsb_polymer_entity_container_identifiers": {"reference_sequence_identifiers": []},
                            "rcsb_polymer_entity_align": [],
                            "rcsb_entity_source_organism"" {"ncbi_taxonomy_id": []}
        """
        # Initialize so the failure path returns an empty dict rather than raising NameError
        objD = {}
        try:
            obEx = ObjectExtractor(
                self.__cfgOb,
                databaseName=databaseName,
                collectionName=collectionName,
                cacheFilePath=None,
                useCache=False,
                keyAttribute="entity",
                uniqueAttributes=["rcsb_id"],
                cacheKwargs=None,
                objectLimit=fetchLimit,
                selectionQuery={"entity_poly.rcsb_entity_polymer_type": polymerType},
                selectionList=[
                    "rcsb_id",
                    "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers",
                    "rcsb_polymer_entity_container_identifiers.auth_asym_ids",
                    # "rcsb_polymer_entity_align",
                    # "rcsb_entity_source_organism.ncbi_taxonomy_id",
                    # "rcsb_polymer_entity_container_identifiers.related_annotation_identifiers",
                    # "rcsb_polymer_entity_annotation",
                    "rcsb_entity_source_organism.ncbi_taxonomy_id",
                ],
            )
            eCount = obEx.getCount()
            logger.info("Polymer entity count type %s is %d", polymerType, eCount)
            objD = obEx.getObjects()
            logger.info("Reading polymer entity count %d ref accession length %d ", eCount, len(objD))
            #
        except Exception as e:
            logger.exception("Failing for %s (%s) with %s", databaseName, collectionName, str(e))
        return objD

    def __getAssignmentMap(self, objD, referenceDatabaseName="UniProt", provSource="PDB"):
        """Build accession -> entity-key and accession -> taxonomy maps from the entity objects."""
        refIdD = defaultdict(list)
        taxIdD = defaultdict(list)
        numMissing = 0
        numMissingTaxons = 0
        for entityKey, eD in objD.items():
            try:
                accS = set()
                for ii, tD in enumerate(eD["rcsb_polymer_entity_container_identifiers"]["reference_sequence_identifiers"]):
                    if tD["database_name"] == referenceDatabaseName and tD["provenance_source"] == provSource:
                        accS.add(tD["database_accession"])
                        refIdD[tD["database_accession"]].append(entityKey)
                        #
                        # pick up the corresponding taxonomy -
                        try:
                            taxIdD[tD["database_accession"]].append(eD["rcsb_entity_source_organism"][ii]["ncbi_taxonomy_id"])
                        except Exception:
                            logger.debug("Failing taxonomy lookup for %s %r", entityKey, tD["database_accession"])
                            numMissingTaxons += 1
                logger.debug("PDB assigned sequences length %d", len(accS))
            except Exception as e:
                numMissing += 1
                logger.debug("No sequence assignments for %s with %s", entityKey, str(e))
        #
        numMultipleTaxons = 0
        for refId, taxIdL in taxIdD.items():
            taxIdL = list(set(taxIdL))
            if len(taxIdL) > 1:
                logger.debug("Multitple taxIds assigned to reference sequence id %s: %r", refId, taxIdL)
                numMultipleTaxons += 1

        logger.info("Entities with missing taxonomy %d", numMissingTaxons)
        logger.info("Reference sequences with multiple taxonomies %d", numMultipleTaxons)
        logger.info("Unique %s accession assignments by %s %d (entities missing archive accession assignments %d) ", referenceDatabaseName, provSource, len(refIdD), numMissing)
        return refIdD, taxIdD

    #
    def __rebuildReferenceCache(self, idList, refDbName, **kwargs):
        """Load the reference sequence data/id caches, fetching any missing accessions,
        or rebuild the caches in full when no usable cache exists."""
        fetchLimit = None
        doMissing = True
        dD = {}
        cachePath = kwargs.get("cachePath", ".")
        dirPath = os.path.join(cachePath, "exdb")
        # cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "json", "indent": 3})
        cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "pickle"})
        useCache = kwargs.get("useCache", True)
        saveText = kwargs.get("saveText", False)
        #
        ext = "pic" if cacheKwargs["fmt"] == "pickle" else "json"
        fn = refDbName + "-ref-sequence-data-cache" + "." + ext
        dataCacheFilePath = os.path.join(dirPath, fn)
        #
        fn = refDbName + "-ref-sequence-id-cache" + ".json"
        accCacheFilePath = os.path.join(dirPath, fn)
        #
        self.__mU.mkdir(dirPath)
        if not useCache:
            # discard any stale cache artifacts before the full rebuild
            for fp in [dataCacheFilePath, accCacheFilePath]:
                try:
                    os.remove(fp)
                except Exception:
                    pass
        #
        if useCache and accCacheFilePath and self.__mU.exists(accCacheFilePath) and dataCacheFilePath and self.__mU.exists(dataCacheFilePath):
            dD = self.__mU.doImport(dataCacheFilePath, **cacheKwargs)
            idD = self.__mU.doImport(accCacheFilePath, fmt="json")
            logger.info("Reading cached reference sequence ID and data cache files - cached match reference length %d", len(idD["matchInfo"]))
            idD["matchInfo"] = self.__rebuildReferenceMatchIndex(idList, dD["refDbCache"])
            # Check for completeness -
            if doMissing:
                missingS = set(idList) - set(idD["matchInfo"].keys())
                if missingS:
                    logger.info("Reference sequence cache missing %d accessions", len(missingS))
                    extraD, extraIdD = self.__fetchReferenceEntries(refDbName, list(missingS), saveText=saveText, fetchLimit=fetchLimit)
                    dD["refDbCache"].update(extraD["refDbCache"])
                    idD["matchInfo"].update(extraIdD["matchInfo"])
                    #
                    idD["matchInfo"] = self.__rebuildReferenceMatchIndex(idList, dD["refDbCache"])
                    #
                    if accCacheFilePath and dataCacheFilePath and cacheKwargs:
                        self.__mU.mkdir(dirPath)
                        ok1 = self.__mU.doExport(dataCacheFilePath, dD, **cacheKwargs)
                        ok2 = self.__mU.doExport(accCacheFilePath, idD, fmt="json", indent=3)
                        logger.info("Cache updated with missing references with status %r", ok1 and ok2)
            #
        else:
            logger.info("Rebuilding reference cache for %s for %d accessions with limit %r", refDbName, len(idList), fetchLimit)
            dD, idD = self.__fetchReferenceEntries(refDbName, idList, saveText=saveText, fetchLimit=fetchLimit)
            if accCacheFilePath and dataCacheFilePath and cacheKwargs:
                self.__mU.mkdir(dirPath)
                ok1 = self.__mU.doExport(dataCacheFilePath, dD, **cacheKwargs)
                ok2 = self.__mU.doExport(accCacheFilePath, idD, fmt="json", indent=3)
                logger.info("Cache save status %r", ok1 and ok2)

        return idD["matchInfo"], dD["refDbCache"]

    def __rebuildReferenceMatchIndex(self, idList, referenceD):
        """Rebuild the accession match index over the cached reference data."""
        fobj = UniProtUtils()
        logger.info("Rebuilding match index on idList (%d) using reference data (%d) %r", len(idList), len(referenceD), type(referenceD))
        matchD = fobj.rebuildMatchResultIndex(idList, referenceD)
        return matchD

    def __fetchReferenceEntries(self, refDbName, idList, saveText=False, fetchLimit=None):
        """Fetch database entries from the input reference sequence database name."""
        dD = {"refDbName": refDbName, "refDbCache": {}}
        idD = {"matchInfo": {}, "refIdMap": {}}

        try:
            idList = idList[:fetchLimit] if fetchLimit else idList
            logger.info("Starting fetch for %d %s entries", len(idList), refDbName)
            if refDbName == "UniProt":
                fobj = UniProtUtils(saveText=saveText)
                logger.info("Maximum reference chunk size %d", self.__maxChunkSize)
                refD, matchD = fobj.fetchList(idList, maxChunkSize=self.__maxChunkSize)
                dD = {"refDbName": refDbName, "refDbCache": refD}
                idD = {"matchInfo": matchD}
            #
            # Check the coverage -
            #
            countD = defaultdict(int)
            logger.info("Match dictionary length %d", len(matchD))
            for _, mD in matchD.items():
                if "matched" in mD:
                    countD[mD["matched"]] += 1
            logger.info("Reference length %d match length %d coverage %r", len(refD), len(matchD), countD.items())
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return dD, idD

    def __fetchSiftsSummaryProvider(self, cfgOb, configName, **kwargs):
        """Construct the SIFTS summary provider from configuration details."""
        abbreviated = kwargs.get("siftsAbbreviated", "TEST")
        cachePath = kwargs.get("cachePath", ".")
        cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "pickle"})
        useCache = kwargs.get("useCache", True)
        #
        siftsSummaryDataPath = cfgOb.getPath("SIFTS_SUMMARY_DATA_PATH", sectionName=configName)
        #
        logger.info("Using SIFTS_SUMMARY_DATA_PATH, %r", siftsSummaryDataPath)
        if siftsSummaryDataPath.lower().startswith("http"):
            srcDirPath = siftsSummaryDataPath
        else:
            srcDirPath = os.path.join(cachePath, siftsSummaryDataPath)
        cacheDirPath = os.path.join(cachePath, cfgOb.get("SIFTS_SUMMARY_CACHE_DIR", sectionName=configName))
        logger.debug("ssP %r %r", srcDirPath, cacheDirPath)
        ssP = SiftsSummaryProvider(srcDirPath=srcDirPath, cacheDirPath=cacheDirPath, useCache=useCache, abbreviated=abbreviated, cacheKwargs=cacheKwargs)
        ok = ssP.testCache()
        logger.debug("SIFTS cache status %r", ok)
        logger.debug("ssP entry count %d", ssP.getEntryCount())
        return ssP

    def __fetchGoProvider(self, cfgOb, configName, **kwargs):
        """Construct the Gene Ontology provider from configuration details."""
        cachePath = kwargs.get("cachePath", ".")
        useCache = kwargs.get("useCache", True)
        #
        cacheDirPath = os.path.join(cachePath, cfgOb.get("EXDB_CACHE_DIR", sectionName=configName))
        logger.debug("goP %r %r", cacheDirPath, useCache)
        goP = GeneOntologyProvider(goDirPath=cacheDirPath, useCache=useCache)
        ok = goP.testCache()
        logger.debug("Gene Ontology (%r) root node count %r", ok, goP.getRootNodes())
        return goP

    def __fetchEcProvider(self, cfgOb, configName, **kwargs):
        """Construct the Enzyme Classification provider from configuration details."""
        cachePath = kwargs.get("cachePath", ".")
        useCache = kwargs.get("useCache", True)
        #
        cacheDirPath = os.path.join(cachePath, cfgOb.get("ENZYME_CLASSIFICATION_CACHE_DIR", sectionName=configName))
        logger.debug("ecP %r %r", cacheDirPath, useCache)
        ecP = EnzymeDatabaseProvider(enzymeDirPath=cacheDirPath, useCache=useCache)
        ok = ecP.testCache()
        logger.debug("Enzyme cache status %r", ok)
        return ecP

    def __fetchPfamProvider(self, cfgOb, configName, **kwargs):
        """Construct the Pfam provider (None when its cache test fails)."""
        _ = cfgOb
        _ = configName
        cachePath = kwargs.get("cachePath", ".")
        useCache = kwargs.get("useCache", True)
        pfP = PfamProvider(cachePath=cachePath, useCache=useCache)
        ok = pfP.testCache()
        return pfP if ok else None

    def __fetchInterProProvider(self, cfgOb, configName, **kwargs):
        """Construct the InterPro provider (None when its cache test fails)."""
        _ = cfgOb
        _ = configName
        cachePath = kwargs.get("cachePath", ".")
        useCache = kwargs.get("useCache", True)
        ipP = InterProProvider(cachePath=cachePath, useCache=useCache)
        ok = ipP.testCache()
        return ipP if ok else None
class ChemCompSearchWrapper(SingletonClass): """Wrapper for chemical component search operations.""" def __init__(self, **kwargs): """Wrapper class for chemical search/depiction operations. Path and prefix data for wrapper class may be set as keyword arguments as environmental variables. Args: cachePath (str): path to top-level cache directory used to store search index file dependencies (default environment variable CHEM_SEARCH_CACHE_PATH or ".") ccFileNamePrefix (str): prefix code used to distinguish different subsets of chemical definitions (default environment variable CHEM_SEARCH_CC_PREFIX or "cc-full") """ self.__startTime = time.time() # self.__cachePath = kwargs.get( "cachePath", os.environ.get("CHEM_SEARCH_CACHE_PATH", ".")) self.__ccFileNamePrefix = kwargs.get( "ccFileNamePrefix", os.environ.get("CHEM_SEARCH_CC_PREFIX", "cc-full")) # self.__dependFileName = "ChemCompSearchWrapperData.tar.gz" self.__dependTarFilePath = os.path.join(self.__cachePath, self.__dependFileName) # --- self.__mU = MarshalUtil(workPath=self.__cachePath) # --- self.__configD = {} self.__ccIdxP = None self.__siIdxP = None self.__siIdx = {} self.__oesmP = None self.__oesU = None self.__oesubsU = None # --- self.__statusDescriptorError = -100 self.__searchError = -200 self.__searchSuccess = 0 def setConfig(self, ccUrlTarget, birdUrlTarget, **kwargs): """Provide the chemical definition source path details for rebuilding search index file dependencies. 
Args: ccUrlTarget (str): path to concatenated chemical component definition file birdUrlTarget (str): path to the concatenated BIRD definition file Other options are propagated to configurations of the wrapped classes in __bootstrapConfig() """ kwargs["ccUrlTarget"] = ccUrlTarget kwargs["birdUrlTarget"] = birdUrlTarget kwargs["cachePath"] = self.__cachePath kwargs["ccFileNamePrefix"] = self.__ccFileNamePrefix self.__configD = self.__bootstrapConfig(**kwargs) return len(self.__configD) >= 3 def __bootstrapConfig(self, **kwargs): """Build on-the-fly default configuration for this wrapper class.""" # The following few options have no defaults -- and should be specified. ccUrlTarget = kwargs.get("ccUrlTarget", None) birdUrlTarget = kwargs.get("birdUrlTarget", None) cachePath = kwargs.get("cachePath", None) ccFileNamePrefix = kwargs.get("ccFileNamePrefix", None) logger.info("Bootstrap configuration for prefix %r cc %r bird %r", ccFileNamePrefix, ccUrlTarget, birdUrlTarget) # --- # Reasonable values are selected for the remaining options... 
oeFileNamePrefix = "oe-" + ccFileNamePrefix try: storeConfig = kwargs.get("storeConfig", True) molLimit = kwargs.get("molLimit", None) useCache = kwargs.get("useCache", False) logSizes = kwargs.get("logSizes", False) # numProc = kwargs.get("numProc", 12) maxProc = os.cpu_count() numProc = min(numProc, maxProc) maxChunkSize = kwargs.get("maxChunkSize", 50) # logger.debug("+++ >>> Assigning numProc as %d", numProc) # limitPerceptions = kwargs.get("limitPerceptions", False) quietFlag = kwargs.get("quietFlag", True) # # fpTypeCuttoffD = {"TREE": 0.6, "MACCS": 0.9, "PATH": 0.6, "CIRCULAR": 0.6, "LINGO": 0.9} fpTypeCuttoffD = kwargs.get("fpTypeCuttoffD", { "TREE": 0.6, "MACCS": 0.9 }) buildTypeList = kwargs.get("buildTypeList", [ "oe-iso-smiles", "oe-smiles", "cactvs-iso-smiles", "cactvs-smiles", "inchi" ]) # oesmpKwargs = { "ccUrlTarget": ccUrlTarget, "birdUrlTarget": birdUrlTarget, "cachePath": cachePath, "useCache": useCache, "ccFileNamePrefix": ccFileNamePrefix, "oeFileNamePrefix": oeFileNamePrefix, "limitPerceptions": limitPerceptions, "minCount": None, "maxFpResults": 50, "fpTypeCuttoffD": fpTypeCuttoffD, "buildTypeList": buildTypeList, "screenTypeList": None, "quietFlag": quietFlag, "numProc": numProc, "maxChunkSize": maxChunkSize, "molLimit": molLimit, "logSizes": logSizes, "suppressHydrogens": True, } ccsiKwargs = { "ccUrlTarget": ccUrlTarget, "birdUrlTarget": birdUrlTarget, "cachePath": cachePath, "useCache": useCache, "ccFileNamePrefix": ccFileNamePrefix, "oeFileNamePrefix": oeFileNamePrefix, "limitPerceptions": limitPerceptions, "minCount": None, "numProc": numProc, "quietFlag": quietFlag, "maxChunkSize": maxChunkSize, "molLimit": None, "logSizes": False, } configD = { "versionNumber": 0.30, "ccsiKwargs": ccsiKwargs, "oesmpKwargs": oesmpKwargs } # if storeConfig: configDirPath = os.path.join(cachePath, "config") configFilePath = os.path.join( configDirPath, ccFileNamePrefix + "-config.json") logger.info("Saving configuration bootstrap in %r", configFilePath) 
            # Persist the assembled configuration so later runs can reload it via readConfig().
            self.__mU.mkdir(configDirPath)
            self.__mU.doExport(configFilePath, configD, fmt="json", indent=3)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return configD

    def readConfig(self, resetCachePath=True):
        """Read a prepared configuration file for the search wrapper class. This will override
        any default configuration settings.

        Args:
            resetCachePath (bool): update cachePath configuration option with the current cachePath setting.

        Returns:
            bool : True for success or False otherwise
        """
        #
        ok = False
        try:
            #
            configFilePath = os.path.join(self.__cachePath, "config", self.__ccFileNamePrefix + "-config.json")
            configD = self.__mU.doImport(configFilePath, fmt="json")
            logger.debug("ConfigD: %r", configD)
            # Sanity check: require more than two sections and a version newer than 0.2 before accepting.
            if configD and (len(configD) > 2) and float(configD["versionNumber"]) > 0.2:
                logger.info("Read version %r sections %r from %s", configD["versionNumber"], list(configD.keys()), configFilePath)
                ok = True
                self.__configD = configD
                if resetCachePath:
                    # Allow the configuration to be relocatable.
                    configD["ccsiKwargs"]["cachePath"] = self.__cachePath
                    configD["oesmpKwargs"]["cachePath"] = self.__cachePath
            else:
                logger.error("Reading config file fails from %r", configFilePath)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            ok = False
        return ok

    def buildDependenices(self, ccUrlTarget, birdUrlTarget, **kwargs):
        """Convenience method to build configuration and static dependencies for the chemical search services.

        NOTE(review): method name carries a typo ("Dependenices"); it is part of the public
        interface so it is left unchanged here.

        Args:
            ccUrlTarget (str): path to source concatenated chemical component definition file
            birdUrlTarget (str): path to the source concatenated BIRD definition file

            Other options are propagated to configurations of the wrapped classes in __bootstrapConfig()

        Returns:
            bool: True for success or False otherwise
        """
        try:
            okT = False
            ok1 = self.setConfig(ccUrlTarget=ccUrlTarget, birdUrlTarget=birdUrlTarget, **kwargs)
            useCache = kwargs.get("useCache", False)
            ok2 = self.updateChemCompIndex(useCache=useCache)
            ok3 = self.updateSearchIndex(useCache=useCache)
            ok4 = self.updateSearchMoleculeProvider(useCache=useCache)
            okBuild = ok1 and ok2 and ok3 and ok4
            if okBuild:
                # Bundle the three cache subdirectories into a single tar file for stashing.
                fileU = FileUtil()
                dirPathList = [os.path.join(self.__cachePath, subDir) for subDir in ["chem_comp", "oe_mol", "config"]]
                okT = fileU.bundleTarfile(self.__dependTarFilePath, dirPathList, mode="w:gz", recursive=True)
            #
            return okT and okBuild
        except Exception as e:
            logger.exception("Failing build with %r and %r with %s", ccUrlTarget, birdUrlTarget, str(e))
        return False

    def stashDependencies(self, url, dirPath, bundleLabel="A", userName=None, pw=None):
        """Store a copy of the bundled search dependencies remotely -

        Args:
            url (str): URL string for the destination host (e.g. sftp://myserver.net or None for a local file)
            dirPath (str): directory path on the remote resource
            bundleLabel (str, optional): optional label prepended to the stashed dependency bundle artifact (default='A')
            userName (str, optional): optional access information. Defaults to None.
            pw (str, optional): optional access information. Defaults to None.

        Returns:
            bool: True for success or False otherwise
        """
        try:
            ok = False
            fn = self.__makeBundleFileName(self.__dependFileName, bundleLabel=bundleLabel)
            if url and url.startswith("sftp://"):
                # Remote stash over SFTP on the standard port.
                sftpU = SftpUtil()
                hostName = url[7:]
                ok = sftpU.connect(hostName, userName, pw=pw, port=22)
                if ok:
                    remotePath = os.path.join("/", dirPath, fn)
                    ok = sftpU.put(self.__dependTarFilePath, remotePath)
            elif not url:
                # Local filesystem copy when no URL is provided.
                fileU = FileUtil()
                remotePath = os.path.join(dirPath, fn)
                ok = fileU.put(self.__dependTarFilePath, remotePath)
            else:
                logger.error("Unsupported stash protocol %r", url)
            return ok
        except Exception as e:
            logger.exception("For %r %r failing with %s", url, dirPath, str(e))
        return False

    def __makeBundleFileName(self, rootName, bundleLabel="A"):
        # Prefix the bundle artifact name with an upper-cased label (e.g. "A-<rootName>").
        fn = rootName
        try:
            fn = rootName
            fn = "%s-%s" % (bundleLabel.upper(), rootName) if bundleLabel else rootName
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return fn

    def restoreDependencies(self, url, dirPath, bundleLabel="A", userName=None, pw=None):
        """Restore bundled dependencies from remote storage and unbundle these in the
        current local cache directory.

        Args:
            url (str): remote URL
            dirPath (str): remote directory path on the
            bundleLabel (str, optional): optional label prepended to the stashed dependency bundle artifact (default='A')
            userName (str, optional): optional access information. Defaults to None.
            pw (str, optional): optional access information. Defaults to None.

        Returns:
            bool: True for success or False otherwise
        """
        try:
            ok = False
            fileU = FileUtil()
            fn = self.__makeBundleFileName(self.__dependFileName, bundleLabel=bundleLabel)
            if not url:
                # Local filesystem restore.
                remotePath = os.path.join(dirPath, fn)
                ok = fileU.get(remotePath, self.__dependTarFilePath)
            elif url and url.startswith("http://"):
                remotePath = url + os.path.join("/", dirPath, fn)
                ok = fileU.get(remotePath, self.__dependTarFilePath)
            elif url and url.startswith("sftp://"):
                sftpU = SftpUtil()
                ok = sftpU.connect(url[7:], userName, pw=pw, port=22)
                if ok:
                    remotePath = os.path.join(dirPath, fn)
                    ok = sftpU.get(remotePath, self.__dependTarFilePath)
            else:
                logger.error("Unsupported protocol %r", url)
            if ok:
                # Unpack the fetched tar bundle into the local cache directory.
                ok = fileU.unbundleTarfile(self.__dependTarFilePath, dirPath=self.__cachePath)
            return ok
        except Exception as e:
            logger.exception("For %r %r Failing with %s", url, dirPath, str(e))
            ok = False
        return ok

    def updateChemCompIndex(self, useCache=False):
        """Rebuild the basic index of source chemical component and BIRD definitions.
        Update the internal state of this index in the current object instance.

        Resource requirements: 94 sec 1 proc 7GB memory macbook pro

        Args:
            useCache (bool): False to rebuild search index and True to reload

        Returns:
            bool: True for success or false otherwise
        """
        ok = False
        try:
            # Deep copy so the useCache override does not mutate the stored configuration.
            kwargs = copy.deepcopy(self.__configD["ccsiKwargs"]) if "ccsiKwargs" in self.__configD else None
            if kwargs:
                kwargs["useCache"] = useCache
                ccIdxP = ChemCompIndexProvider(**kwargs)
                ok = ccIdxP.testCache()
                self.__ccIdxP = ccIdxP if ok else None
                logger.info("Chemical component index status %r", ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok

    def getChemCompIndex(self):
        # Return the index dictionary, or an empty dict when the provider is not loaded.
        return self.__ccIdxP.getIndex() if self.__ccIdxP else {}

    def getSearchMoleculeProvider(self):
        # Return the molecule provider instance or None when not loaded.
        return self.__oesmP if self.__oesmP else None

    def updateSearchIndex(self, useCache=False):
        """Rebuild the search index from source chemical component and BIRD definitions.
        Update the internal state of this index in the current object instance.

        Resource requirements 771 secs 6 proc macbook pro 7GB memory.

        Args:
            useCache (bool): False to rebuild search index and True to reload

        Returns:
            bool: True for success or false otherwise
        """
        ok = False
        try:
            kwargs = copy.deepcopy(self.__configD["ccsiKwargs"]) if "ccsiKwargs" in self.__configD else None
            if kwargs:
                kwargs["useCache"] = useCache
                siIdxP = ChemCompSearchIndexProvider(**kwargs)
                ok = siIdxP.testCache()
                # NOTE(review): this keeps the provider even when testCache() fails
                # (guards on siIdxP, not ok) -- unlike updateChemCompIndex(); confirm intent.
                self.__siIdxP = siIdxP if siIdxP else None
                self.__siIdx = siIdxP.getIndex() if siIdxP and ok else {}
                logger.info("Search index status %r index len %d", ok, len(self.__siIdx) if self.__siIdx else 0)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok

    def updateSearchMoleculeProvider(self, useCache=False):
        """Rebuild the search molecule provider.
        Update the internal state of this object reference in the current object instance.

        Resource requirements: 151 seconds 1 proc 0.5GB memory macbook pro

        Args:
            useCache (bool): False to rebuild molecule store and True to reload

        Returns:
            bool: True for success or false otherwise
        """
        ok = False
        try:
            kwargs = copy.deepcopy(self.__configD["oesmpKwargs"]) if "oesmpKwargs" in self.__configD else None
            if kwargs:
                kwargs["useCache"] = useCache
                oesmP = OeSearchMoleculeProvider(**kwargs)
                ok = oesmP.testCache()
                self.__oesmP = oesmP if oesmP and ok else None
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok

    def reloadSearchDatabase(self):
        """Reload the in-memory search databases from the OE molecule provider.

        Resource requirements: ~90sec load time 0.35 GB memory

        Returns:
            bool: True for success or False otherwise
        """
        ok = False
        try:
            okmp = self.updateSearchMoleculeProvider(useCache=True)
            if not okmp:
                return ok
            fpTypeCuttoffD = self.__configD["oesmpKwargs"]["fpTypeCuttoffD"] if "fpTypeCuttoffD" in self.__configD["oesmpKwargs"] else {}
            fpTypeList = [k for k, v in fpTypeCuttoffD.items()]
            oesU = OeSearchUtils(self.__oesmP, fpTypeList=fpTypeList)
            ok1 = oesU.testCache()
            self.__oesU = oesU if ok1 else None
            #
            oesubsU = OeSubStructSearchUtils(self.__oesmP)
            ok2 = oesubsU.testCache()
            self.__oesubsU = oesubsU if ok2 else None
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        # NOTE(review): if an exception fires before ok1/ok2 are assigned, this return
        # raises NameError rather than returning False -- candidate for a fix.
        return ok1 and ok2

    def searchByDescriptor(self, descriptor, descriptorType, matchOpts="graph-relaxed", searchId=None):
        """Wrapper method for descriptor match and descriptor substructure search methods.

        Args:
            descriptor (str): molecular descriptor (SMILES, InChI)
            descriptorType (str): descriptor type (SMILES, InChI
            matchOpts (str, optional): graph match criteria (graph-relaxed, graph-relaxed-stereo, graph-strict,
                                       fingerprint-similarity, sub-struct-graph-relaxed, sub-struct-graph-relaxed-stereo,
                                       sub-struct-graph-strict Defaults to "graph-relaxed")
            searchId (str, optional): search identifier for logging. Defaults to None.

        Returns:
            (statusCode, list, list): status, graph match and finger match lists of type (MatchResults)
                                      -100 descriptor processing error
                                      -200 search execution error
                                        0  search execution success
        """
        # Dispatch on the matchOpts prefix: "sub-struct-*" goes to the substructure path.
        if matchOpts.startswith("sub-struct-"):
            return self.subStructSearchByDescriptor(descriptor, descriptorType, matchOpts=matchOpts, searchId=searchId)
        else:
            return self.matchByDescriptor(descriptor, descriptorType, matchOpts=matchOpts, searchId=searchId)

    def matchByDescriptor(self, descriptor, descriptorType, matchOpts="graph-relaxed", searchId=None):
        """Return graph match (w/ finger print pre-filtering) and finger print search results
        for the input descriptor.

        Args:
            descriptor (str): molecular descriptor (SMILES, InChI)
            descriptorType (str): descriptor type (SMILES, InChI
            matchOpts (str, optional): graph match criteria (graph-relaxed, graph-relaxed-stereo, graph-strict,
                                       fingerprint-similarity, Defaults to "graph-relaxed")
            searchId (str, optional): search identifier for logging. Defaults to None.

        Returns:
            (statusCode, list, list): status, graph match and finger match lists of type (MatchResults)
                                      -100 descriptor processing error
                                      -200 search execution error
                                        0  search execution success
        """
        # NOTE(review): ssL and fpL alias the same list object until reassigned below.
        ssL = fpL = []
        retStatus = False
        statusCode = -200
        try:
            fpTypeCuttoffD = self.__configD["oesmpKwargs"]["fpTypeCuttoffD"] if "fpTypeCuttoffD" in self.__configD["oesmpKwargs"] else {}
            maxFpResults = self.__configD["oesmpKwargs"]["maxFpResults"] if "maxFpResults" in self.__configD["oesmpKwargs"] else 50
            limitPerceptions = self.__configD["oesmpKwargs"]["limitPerceptions"] if "limitPerceptions" in self.__configD["oesmpKwargs"] else False
            #
            searchId = searchId if searchId else "query"
            messageTag = searchId + ":" + descriptorType
            oeioU = OeIoUtils()
            oeMol = oeioU.descriptorToMol(descriptor, descriptorType, limitPerceptions=limitPerceptions, messageTag=messageTag)
            oeMol = oeioU.suppressHydrogens(oeMol)
            if not oeMol:
                logger.warning("descriptor type %r molecule build fails: %r", descriptorType, descriptor)
                return self.__statusDescriptorError, ssL, fpL
            #
            # Only the first two fingerprint types are used for pre-filtering.
            retStatus, ssL, fpL = self.__oesU.searchSubStructureAndFingerPrint(oeMol, list(fpTypeCuttoffD.items())[:2], maxFpResults, matchOpts=matchOpts)
            statusCode = 0 if retStatus else self.__searchError
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        return statusCode, ssL, fpL

    def subStructSearchByDescriptor(self, descriptor, descriptorType, matchOpts="sub-struct-graph-relaxed", searchId=None):
        """Return substructure search results for the input descriptor.

        Args:
            descriptor (str): molecular descriptor (SMILES, InChI)
            descriptorType (str): descriptor type (SMILES, InChI)
            matchOpts (str, optional): graph match criteria (sub-struct-graph-relaxed, sub-struct-graph-relaxed-stereo,
                                       sub-struct-graph-strict). Defaults to "sub-struct-graph-relaxed".
            searchId (str, optional): search identifier for logging. Defaults to None.

        Returns:
            (statusCode, list, list): status, substructure search results of type (MatchResults), empty list placeholder
                                      -100 descriptor processing error
                                      -200 search execution error
                                        0  search execution success
        """
        ssL = []
        retStatus = False
        statusCode = -200
        try:
            limitPerceptions = self.__configD["oesmpKwargs"]["limitPerceptions"] if "limitPerceptions" in self.__configD["oesmpKwargs"] else False
            numProc = self.__configD["oesmpKwargs"]["numProc"] if "numProc" in self.__configD["oesmpKwargs"] else 4
            #
            searchId = searchId if searchId else "query"
            messageTag = searchId + ":" + descriptorType
            oeioU = OeIoUtils()
            oeMol = oeioU.descriptorToMol(descriptor, descriptorType, limitPerceptions=limitPerceptions, messageTag=messageTag)
            oeMol = oeioU.suppressHydrogens(oeMol)
            if not oeMol:
                logger.warning("descriptor type %r molecule build fails: %r", descriptorType, descriptor)
                return self.__statusDescriptorError, ssL, []
            #
            # Pre-filter candidate component ids before the (more expensive) substructure search.
            ccIdL = self.__oesubsU.prefilterIndex(oeMol, self.__siIdxP, matchOpts=matchOpts)
            retStatus, ssL = self.__oesubsU.searchSubStructure(oeMol, ccIdList=ccIdL, matchOpts=matchOpts, numProc=numProc)
            statusCode = 0 if retStatus else self.__searchError
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        return statusCode, ssL, []

    def matchByFormulaRange(self, elementRangeD, matchSubset=False, searchId=None):
        """Return formula match results for input element range dictionary.

        Args:
            elementRangeD (dict): {'<element_name>: {'min': <int>, 'max': <int>}, ... }
            matchSubset (bool, optional): query for formula subset (default: False)
            searchId (str, optional): search identifier for logging. Defaults to None.

        Returns:
            (statusCode, list): status, list of chemical component identifiers
        """
        ok = False
        rL = []
        try:
            startTime = time.time()
            searchId = searchId if searchId else "query"
            rL = self.__ccIdxP.matchMolecularFormulaRange(elementRangeD, matchSubset=matchSubset)
            ok = True
            logger.info("%s formula %r matched %d (%.4f seconds)", searchId, elementRangeD, len(rL), time.time() - startTime)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok, rL

    def matchByFormula(self, formula, matchSubset=False, searchId=None):
        """Return formula match results for input molecular formula.

        Args:
            formula (str): molecular formula  (ex. 'C6H6')
            matchSubset (bool, optional): query for formula subset (default: False)
            searchId (str, optional): search identifier for logging. Defaults to None.

        Returns:
            (statusCode, list): status, list of chemical component identifiers
        """
        ok = False
        rL = []
        try:
            startTime = time.time()
            searchId = searchId if searchId else "query"
            # Parse the formula then convert each exact element count into a degenerate range.
            mf = MolecularFormula()
            eD = mf.parseFormula(formula)
            elementRangeD = {k.upper(): {"min": v, "max": v} for k, v in eD.items()}
            rL = self.__ccIdxP.matchMolecularFormulaRange(elementRangeD, matchSubset=matchSubset)
            ok = True
            logger.info("%s formula %r matched %d (%.4f seconds)", searchId, elementRangeD, len(rL), time.time() - startTime)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok, rL

    def status(self):
        # Log peak resident memory and wall-clock uptime since construction.
        # ru_maxrss units differ by platform (bytes on macOS, kilobytes on Linux),
        # hence the Darwin-conditional unit label.
        unitS = "MB" if platform.system() == "Darwin" else "GB"
        rusageMax = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        logger.info("Maximum resident memory size %.4f %s", rusageMax / 10**6, unitS)
        endTime = time.time()
        logger.info("Status at %s (up %.4f seconds)", time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)
class CurrentHoldingsProvider(object):
    """Provide inventory of current repository content.

    Loads (or fetches and caches) two JSON resources: a per-entry content-type
    inventory and a list of entry identifiers with update time stamps.
    """

    def __init__(self, **kwargs):
        """
        Keyword Args:
            holdingsDirPath (str): local cache directory. Defaults to ".".
            useCache (bool): reuse previously fetched files when present. Defaults to True.
            baseUrl (str): base URL for the holdings resources.
            currentTargetUrl (str): override for the content inventory URL.
            currentFallbackTargetUrl (str): override for the content inventory fallback URL.
            currentIdsTargetUrl (str): override for the entry id list URL.
            currentIdsFallbackTargetUrl (str): override for the entry id list fallback URL.
        """
        self.__dirPath = kwargs.get("holdingsDirPath", ".")
        useCache = kwargs.get("useCache", True)
        baseUrl = kwargs.get("baseUrl", "https://raw.githubusercontent.com/rcsb/py-rcsb_exdb_assets/development/fall_back/holdings/")
        # Fix: each endpoint now has its own keyword. Previously all four reads used
        # the single key "currentTargetUrl", so overriding the content URL silently
        # redirected the fallback and the id-list fetches as well.
        urlTargetContent = kwargs.get("currentTargetUrl", os.path.join(baseUrl, "current_holdings.json.gz"))
        urlFallbackTargetContent = kwargs.get("currentFallbackTargetUrl", os.path.join(baseUrl, "current_holdings.json.gz"))
        #
        urlTargetIds = kwargs.get("currentIdsTargetUrl", os.path.join(baseUrl, "current_pdb_ids.json.gz"))
        urlFallbackTargetIds = kwargs.get("currentIdsFallbackTargetUrl", os.path.join(baseUrl, "current_pdb_ids.json.gz"))
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        # __invD: entryId -> {contentType: [path, ...]}; __idD: entryId -> datetime
        self.__invD = self.__reloadEntryContent(urlTargetContent, urlFallbackTargetContent, self.__dirPath, useCache=useCache)
        self.__idD = self.__reloadEntryIds(urlTargetIds, urlFallbackTargetIds, self.__dirPath, useCache=useCache)

    def testCache(self, minCount=170000):
        """Return True when both loaded inventories exceed minCount entries."""
        logger.info("Inventory length cD (%d) id list (%d)", len(self.__invD), len(self.__idD))
        # JDW - restore consistency checks
        # if len(self.__invD) > minCount and len(self.__idD) > minCount and len(self.__invD) == len(self.__idD):
        if len(self.__invD) > minCount and len(self.__idD) > minCount:
            return True
        return False

    def getEntryContentTypes(self, entryId):
        """Return the sorted current content types for the input entry identifier."""
        try:
            return sorted(self.__invD[entryId.upper()].keys())
        except Exception as e:
            logger.exception("Failing for %r with %s", entryId, str(e))
        return []

    def getEntryContentTypePathList(self, entryId, contentType):
        """Return the path list for the input entry identifier and content type."""
        try:
            return self.__invD[entryId.upper()][contentType]
        except Exception as e:
            logger.debug("Failing for %r %r with %s", entryId, contentType, str(e))
        return []

    def getEntryInventory(self):
        """Return the current inventory dictionary."""
        try:
            return self.__invD
        except Exception as e:
            logger.debug("Failing with %s", str(e))
        return {}

    def getEntryIdList(self, afterDateTimeStamp=None):
        """Return the ID code list or optionally IDs changed after the input time stamp.

        Args:
            afterDateTimeStamp (str, optional): ISO format date time stamp. Defaults to None.
        """
        try:
            if afterDateTimeStamp:
                # NOTE(review): stored values come from fromisoformat() and may be naive;
                # comparing with this aware UTC datetime would raise and return [] -- confirm
                # that source stamps carry timezone offsets.
                dt = datetime.datetime.fromisoformat(afterDateTimeStamp).replace(tzinfo=pytz.utc)
                return [k for k, v in self.__idD.items() if v > dt]
            else:
                return list(self.__idD.keys())
        except Exception as e:
            logger.error("Failing with %s", str(e))
        return []

    def __reloadEntryContent(self, urlTarget, urlFallbackTarget, dirPath, useCache=True):
        """Load the content inventory from cache or fetch it (with fallback)."""
        invD = {}
        fU = FileUtil()
        fn = fU.getFileName(urlTarget)
        fp = os.path.join(dirPath, fn)
        self.__mU.mkdir(dirPath)
        #
        if useCache and self.__mU.exists(fp):
            invD = self.__mU.doImport(fp, fmt="json")
            logger.debug("Reading cached inventory (%d)", len(invD))
        else:
            logger.info("Fetch inventory from %s", urlTarget)
            ok = fU.get(urlTarget, fp)
            if not ok:
                ok = fU.get(urlFallbackTarget, fp)
            #
            if ok:
                invD = self.__mU.doImport(fp, fmt="json")
        return invD

    def __reloadEntryIds(self, urlTarget, urlFallbackTarget, dirPath, useCache=True):
        """Load the entry id list from cache or fetch it, returning ids keyed to
        update datetimes sorted by ascending time stamp."""
        idD = {}
        # Fix: initialize tdL so a failed fetch yields an empty result instead of
        # an unbound-name error in the loop below.
        tdL = []
        fU = FileUtil()
        fn = fU.getFileName(urlTarget)
        fp = os.path.join(dirPath, fn)
        self.__mU.mkdir(dirPath)
        #
        if useCache and self.__mU.exists(fp):
            tdL = self.__mU.doImport(fp, fmt="json")
            logger.debug("Reading cached IDs list (%d)", len(tdL))
        else:
            logger.info("Fetch ID list from %s", urlTarget)
            ok = fU.get(urlTarget, fp)
            if not ok:
                ok = fU.get(urlFallbackTarget, fp)
            #
            if ok:
                tdL = self.__mU.doImport(fp, fmt="json")
        #
        for td in tdL:
            for k, v in td.items():
                try:
                    idD[k] = datetime.datetime.fromisoformat(v)
                except Exception as e:
                    logger.error("Date processing failing for %r %r with %s", k, v, str(e))
        #
        sTupL = sorted(idD.items(), key=lambda item: item[1])
        return dict(sTupL)
class ScopClassificationProvider(StashableBase):
    """Extract SCOPe assignments, term descriptions and SCOP classifications
    from SCOP flat files.
    """

    def __init__(self, **kwargs):
        """
        Keyword Args:
            cachePath (str): top-level cache directory (scop data goes in <cachePath>/scop)
            scopDirPath (str): alternative explicit scop cache directory (used when cachePath absent)
            useCache (bool): reload from the local cache when True. Defaults to True.
            scopTargetUrl (str): SCOPe download URL. Defaults to the Berkeley parseable-files URL.
            scopVersion (str): SCOPe release tag. Defaults to "2.08-stable".
            scopUrlBackupPath (str): fallback URL for a pre-built cache file.
        """
        #
        self.__dirName = "scop"
        if "cachePath" in kwargs:
            self.__cachePath = os.path.abspath(kwargs.get("cachePath", None))
            self.__scopDirPath = os.path.join(self.__cachePath, self.__dirName)
        else:
            # Derive the cachePath/dirName pair from an explicit scop directory path.
            self.__scopDirPath = kwargs.get("scopDirPath", ".")
            self.__cachePath, self.__dirName = os.path.split(os.path.abspath(self.__scopDirPath))
        super(ScopClassificationProvider, self).__init__(self.__cachePath, [self.__dirName])
        #
        useCache = kwargs.get("useCache", True)
        urlTarget = kwargs.get("scopTargetUrl", "http://scop.berkeley.edu/downloads/parse")
        self.__version = kwargs.get("scopVersion", "2.08-stable")
        #
        urlBackupPath = kwargs.get("scopUrlBackupPath", "https://raw.githubusercontent.com/rcsb/py-rcsb_exdb_assets/master/fall_back/SCOP")
        #
        self.__mU = MarshalUtil(workPath=self.__scopDirPath)
        # __nD: sunId -> name; __pD: sunId -> parent sunId; __pdbD: (pdbId, authAsymId) -> assignments
        self.__nD, self.__pD, self.__pdbD = self.__reload(urlTarget, self.__scopDirPath, useCache=useCache, version=self.__version)
        #
        # When a fresh build failed, fall back to a pre-built cache artifact and reload it.
        if not useCache and not self.testCache():
            ok = self.__fetchFromBackup(urlBackupPath, self.__scopDirPath)
            if ok:
                self.__nD, self.__pD, self.__pdbD = self.__reload(urlTarget, self.__scopDirPath, useCache=True, version=self.__version)

    def testCache(self):
        """Return True when all three internal dictionaries look populated (>100 entries each)."""
        logger.info("SCOP lengths nD %d pD %d pdbD %d", len(self.__nD), len(self.__pD), len(self.__pdbD))
        if (len(self.__nD) > 100) and (len(self.__pD) > 100) and (len(self.__pdbD) > 100):
            return True
        return False

    def __fetchFromBackup(self, urlBackupPath, scopDirPath):
        """Fetch the pre-built pickled cache file from the backup URL."""
        pyVersion = sys.version_info[0]
        fn = "scop_domains-py%s.pic" % str(pyVersion)
        scopDomainPath = os.path.join(scopDirPath, fn)
        self.__mU.mkdir(scopDirPath)
        #
        backupUrl = urlBackupPath + "/" + fn
        logger.info("Using backup URL %r", backupUrl)
        fU = FileUtil()
        ok = fU.get(backupUrl, scopDomainPath)
        return ok

    def getScopVersion(self):
        """Return the configured SCOPe release tag."""
        return self.__version

    def getScopSunIds(self, pdbId, authAsymId):
        """Get the sunIds of the domain assignments for the input (pdbId, authAsymId) -

        aD[(pdbId, authAsymId)] = [(domSunId, domainId, sccs, (authAsymId, resBeg, resEnd))]
        """
        try:
            return list(set([tup[0] for tup in self.__pdbD[(pdbId, authAsymId)]]))
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId, str(e))
        return []

    def getScopDomainNames(self, pdbId, authAsymId):
        """Return the unique domain identifiers assigned to (pdbId, authAsymId)."""
        try:
            return list(set([tup[1] for tup in self.__pdbD[(pdbId, authAsymId)]]))
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId, str(e))
        return []

    def getScopSccsNames(self, pdbId, authAsymId):
        """Return the unique sccs codes assigned to (pdbId, authAsymId)."""
        try:
            return list(set([tup[2] for tup in self.__pdbD[(pdbId, authAsymId)]]))
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId, str(e))
        return []

    def getScopResidueRanges(self, pdbId, authAsymId):
        """Return flattened (sunId, domainId, sccs, authAsymId, resBeg, resEnd) tuples."""
        try:
            return [(tup[0], tup[1], tup[2], tup[3][0], tup[3][1], tup[3][2]) for tup in self.__pdbD[(pdbId, authAsymId)]]
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId, str(e))
        return []

    def getScopName(self, sunId):
        """Return the name/description for the input sunId, or None if undefined."""
        try:
            return self.__nD[sunId]
        except Exception:
            logger.debug("Undefined SCOP sunId %r", sunId)
        return None

    def getIdLineage(self, sunId):
        """Return the sunId lineage from root to the input sunId (root id 0 excluded)."""
        pList = []
        try:
            pList.append(sunId)
            pt = self.__pD[sunId]
            while (pt is not None) and (pt != 0):
                pList.append(pt)
                pt = self.__pD[pt]
        except Exception as e:
            logger.exception("Failing for %r with %s", sunId, str(e))
        #
        pList.reverse()
        return pList

    def getNameLineage(self, sunId):
        """Return the name lineage (root first) for the input sunId."""
        try:
            return [self.getScopName(cId) for cId in self.getIdLineage(sunId)]
        except Exception as e:
            logger.exception("Failing for %r with %s", sunId, str(e))
        return None

    def getTreeNodeList(self):
        """Return the exported tree node list for the loaded classification."""
        return self.__exportTreeNodeList(self.__nD, self.__pD)

    def __reload(self, urlTarget, scopDirPath, useCache=True, version=None):
        """Load the cached SCOPe dictionaries, or rebuild them from source files when
        useCache is False. Returns (nD, pD, pdbD)."""
        nD = pD = pdbD = {}
        pyVersion = sys.version_info[0]
        scopDomainPath = os.path.join(scopDirPath, "scop_domains-py%s.pic" % str(pyVersion))
        self.__mU.mkdir(scopDirPath)
        #
        if useCache and self.__mU.exists(scopDomainPath):
            sD = self.__mU.doImport(scopDomainPath, fmt="pickle")
            logger.debug("SCOPe name length %d parent length %d assignments %d", len(sD["names"]), len(sD["parents"]), len(sD["assignments"]))
            nD = sD["names"]
            pD = sD["parents"]
            pdbD = sD["assignments"]
        elif not useCache:
            ok = False
            minLen = 1000
            logger.info("Fetch SCOPe name and domain assignment data using target URL %s", urlTarget)
            desL, claL, hieL = self.__fetchFromSource(urlTarget, version=version)
            #
            nD = self.__extractDescription(desL)
            dmD = self.__extractAssignments(claL)
            pD = self.__extractHierarchy(hieL, nD)
            pdbD = self.__buildAssignments(dmD)
            logger.info("nD %d dmD %d pD %d", len(nD), len(dmD), len(pD))
            scopD = {"names": nD, "parents": pD, "assignments": pdbD}
            # Fix: the third condition previously re-tested len(pD) and never
            # validated pdbD before caching the build.
            if (len(nD) > minLen) and (len(pD) > minLen) and (len(pdbD) > minLen):
                ok = self.__mU.doExport(scopDomainPath, scopD, fmt="pickle")
                logger.debug("Cache save status %r", ok)
        #
        return nD, pD, pdbD

    def __fetchFromSource(self, urlTarget, version="2.07-2019-07-23"):
        """Fetch the classification names and domain assignments from SCOPe repo.

        Returns the raw (description, classification, hierarchy) row lists from
        the dir.des / dir.cla / dir.hie parseable files.
        """
        encoding = "utf-8-sig" if sys.version_info[0] > 2 else "ascii"
        fn = "dir.des.scope.%s.txt" % version
        url = os.path.join(urlTarget, fn)
        desL = self.__mU.doImport(url, fmt="tdd", rowFormat="list", uncomment=True, encoding=encoding)
        logger.info("Fetched URL is %s len %d", url, len(desL))
        #
        fn = "dir.cla.scope.%s.txt" % version
        url = os.path.join(urlTarget, fn)
        claL = self.__mU.doImport(url, fmt="tdd", rowFormat="list", uncomment=True, encoding=encoding)
        logger.info("Fetched URL is %s len %d", url, len(claL))
        #
        fn = "dir.hie.scope.%s.txt" % version
        url = os.path.join(urlTarget, fn)
        hieL = self.__mU.doImport(url, fmt="tdd", rowFormat="list", uncomment=True, encoding=encoding)
        logger.info("Fetched URL is %s len %d", url, len(hieL))
        #
        return desL, claL, hieL

    def __extractDescription(self, desL):
        """Build sunId -> description for node types cl/cf/sf/fa/dm.

        Row layout (dir.des.scope.*.txt): sunId, nodeType, sccs, sid, description, e.g.

            46456 cl a - All alpha proteins
            46460 dm a.1.1.1 - Protozoan/bacterial hemoglobin
        """
        nD = {}
        for fields in desL:
            # Only classification levels are kept; species (sp) and domain instance (px)
            # rows are excluded from the name dictionary.
            if fields[1] in ["cl", "cf", "sf", "fa", "dm"]:
                nD[int(fields[0])] = str(fields[4]).strip()
        logger.debug("Length of name dictionary %d", len(nD))
        nD[0] = "root" if 0 not in nD else nD[0]
        return nD

    def __extractAssignments(self, claL):
        """Build domain assignments from the classification file.

        Row layout (dir.cla.scope.*.txt): sid, pdbId, range(s), sccs, sunId, lineage, e.g.

            d1ux8a_ 1ux8 A: a.1.1.1 113449 cl=46456,cf=46457,...,dm=46460,sp=116748,px=113449

        Returns:
            dict: dmD[sunId] = (pdbId, [(authAsymId, resBeg, resEnd), ...], sid, sccs, dmSunId)
        """
        dmD = {}
        logger.info("Length of class list %d", len(claL))
        rng = rngL = tL = None
        for fields in claL:
            try:
                rngL = str(fields[2]).strip().split(",")
                dmTupL = []
                for rng in rngL:
                    # A range is "chain:beg-end" or just "chain:" when the whole chain is assigned.
                    # NOTE(review): split("-") assumes non-negative residue numbers -- confirm
                    # against ranges containing negative author residue numbers.
                    tL = [t for t in str(rng).strip().split(":") if len(t)]
                    if len(tL) > 1:
                        rL = tL[1].split("-")
                        tt = (tL[0], rL[0], rL[1])
                    else:
                        tt = (tL[0], None, None)
                    dmTupL.append(tt)
                #
                # Extract the dm-level sunId from the lineage string (5th comma element, "dm=<id>").
                sfL = str(fields[5]).strip().split(",")
                dmfL = sfL[4].split("=")
                dmf = int(dmfL[1])
                # old domid sccs sunid for domain assignment
                dmD[int(fields[4])] = (fields[1], dmTupL, fields[0], fields[3], dmf)
            except Exception as e:
                logger.exception("Failing fields %r rngL %r rng %r tL %r with %s", fields, rngL, rng, tL, str(e))
        #
        logger.info("Length of domain assignments %d", len(dmD))
        return dmD

    def __buildAssignments(self, dmD):
        """Invert the domain dictionary to key assignments by (pdbId, authAsymId).

        Input:
            dmD[sunId] = (pdbId, [(authAsymId, begRes, endRes), ...], domain_name, sccs, sid_domain_assigned)

        Returns:
            dict: aD[(pdbId, authAsymId)] = [(domSunId, domainId, sccs, (authAsymId, resBeg, resEnd))]
        """
        pdbD = {}
        for _, dTup in dmD.items():
            for rTup in dTup[1]:
                pdbD.setdefault((dTup[0], rTup[0]), []).append((dTup[4], dTup[2], dTup[3], rTup))
        return pdbD

    def __extractHierarchy(self, hieL, nD):
        """Build the child -> parent sunId dictionary from the hierarchy file.

        Row layout (dir.hie.scope.*.txt): sunId, parentSunId ("-" for root), childIds, e.g.

            0 - 46456,48724,...
            46457 46456 46458,46548
        """
        pD = {}
        logger.debug("Length of input hierarchy list %d", len(hieL))
        for fields in hieL:
            chId = int(fields[0])
            # The root row carries "-" as its parent -> None.
            pId = int(fields[1]) if fields[1].isdigit() else None
            pD[chId] = pId
        #
        logger.info("Length of domain parent dictionary %d", len(pD))
        return pD

    def __exportTreeNodeList(self, nD, pD):
        """Create node list from the SCOPe (sunid) parent and name/description dictionaries.

        Exclude the root node from the tree; nodes directly under the root get depth 0.
        """
        #
        rootId = 0
        pL = [rootId]
        logger.info("nD %d pD %d", len(nD), len(pD))
        # Build the parent -> children adjacency.
        cD = {}
        for ctId, ptId in pD.items():
            cD.setdefault(ptId, []).append(ctId)
        #
        logger.debug("cD %d", len(cD))
        #
        # Breadth-first traversal from the root to get a stable node ordering.
        idL = []
        for rootId in sorted(pL):
            visited = set([rootId])
            queue = collections.deque(visited)
            while queue:
                tId = queue.popleft()
                idL.append(tId)
                if tId not in cD:
                    continue
                for childId in cD[tId]:
                    if childId not in visited:
                        queue.append(childId)
                        visited.add(childId)
        #
        dL = []
        for tId in idL:
            displayName = nD[tId] if tId in nD else None
            ptId = pD[tId] if tId in pD else None
            lL = self.getIdLineage(tId)[1:]
            #
            if tId == rootId:
                continue
            elif ptId == rootId:
                dD = {"id": str(tId), "name": displayName, "depth": 0}
            else:
                dD = {"id": str(tId), "name": displayName, "parents": [str(ptId)], "depth": len(lL)}
            dL.append(dD)
        return dL
class ReferenceSequenceUtils(object):
    """Selected utilities to integrate reference sequence information with PDB polymer entity data."""

    def __init__(self, cfgOb, refDbName, **kwargs):
        """
        Args:
            cfgOb: configuration object passed through to the entity polymer extractor
            refDbName (str): reference sequence database name (e.g. "UNP")

        Keyword Args (propagated to the cache/extractor helpers):
            exdbDirPath (str): cache directory path
            cacheKwargs (dict): serialization options, e.g. {"fmt": "pickle"}
            useCache (bool): reuse cached data when True. Defaults to True.
            entryLimit (int): limit on extracted entries
            fetchLimit (int): limit on reference entries fetched
            saveText (bool): retain fetched text records
        """
        self.__cfgOb = cfgOb
        self.__refDbName = refDbName
        self.__mU = MarshalUtil()
        #
        self.__refIdList = self.__getReferenceAssignments(refDbName, **kwargs)
        # __refD: accession -> reference record; __matchD: accession -> match details
        self.__refD, self.__matchD = self.__rebuildCache(refDbName, self.__refIdList, **kwargs)

    def __getReferenceAssignments(self, refDbName, **kwargs):
        """Get all accessions assigned to input reference sequence database"""
        rL = []
        exdbDirPath = kwargs.get("exdbDirPath", None)
        cacheKwargs = kwargs.get("cacheKwargs", None)
        useCache = kwargs.get("useCache", True)
        entryLimit = kwargs.get("entryLimit", None)
        try:
            epe = EntityPolymerExtractor(self.__cfgOb, exdbDirPath=exdbDirPath, useCache=useCache, cacheKwargs=cacheKwargs, entryLimit=entryLimit)
            eCount = epe.getEntryCount()
            rL = epe.getRefSeqAccessions(refDbName)
            logger.info("Reading polymer entity cache with repository entry count %d ref accession length %d ", eCount, len(rL))
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return rL

    def __rebuildCache(self, refDbName, idList, **kwargs):
        """Load the reference sequence cache, or fetch and cache the reference entries.

        Returns:
            (dict, dict): reference record dictionary and match-info dictionary
        """
        dD = {}
        dirPath = kwargs.get("exdbDirPath", None)
        # Fix: default to pickle serialization (consistent with the other cache
        # builders in this module). The previous None default was subscripted
        # unconditionally below and raised TypeError when cacheKwargs was omitted.
        cacheKwargs = kwargs.get("cacheKwargs", None) or {"fmt": "pickle"}
        useCache = kwargs.get("useCache", True)
        fetchLimit = kwargs.get("fetchLimit", None)
        saveText = kwargs.get("saveText", False)
        ext = "pic" if cacheKwargs["fmt"] == "pickle" else "json"
        fn = "ref-sequence-data-cache" + "." + ext
        cacheFilePath = os.path.join(dirPath, fn)
        self.__mU.mkdir(dirPath)
        if not useCache:
            # Remove any stale cache artifact before a rebuild; missing files are ignored.
            for fp in [cacheFilePath]:
                try:
                    os.remove(fp)
                except Exception:
                    pass
        #
        if useCache and cacheFilePath and self.__mU.exists(cacheFilePath):
            dD = self.__mU.doImport(cacheFilePath, **cacheKwargs)
        else:
            dD = self.__fetchReferenceEntries(refDbName, idList, saveText=saveText, fetchLimit=fetchLimit)
            if cacheFilePath and cacheKwargs:
                self.__mU.mkdir(dirPath)
                ok = self.__mU.doExport(cacheFilePath, dD, **cacheKwargs)
                logger.info("Cache save status %r", ok)
        return dD["refDbCache"], dD["matchInfo"]

    def __fetchReferenceEntries(self, refDbName, idList, saveText=False, fetchLimit=None):
        """Fetch database entries from the input reference sequence database name."""
        dD = {"refDbName": refDbName, "refDbCache": {}, "matchInfo": {}}
        try:
            idList = idList[:fetchLimit] if fetchLimit else idList
            logger.info("Starting fetch for %d %s entries", len(idList), refDbName)
            # Only UniProt ("UNP") is currently supported; other names return the empty skeleton.
            if refDbName == "UNP":
                fobj = UniProtUtils(saveText=saveText)
                refD, matchD = fobj.fetchList(idList)
                dD = {"refDbName": refDbName, "refDbCache": refD, "matchInfo": matchD}
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return dD

    def __dumpEntries(self, refD):
        """Log the contents of the reference record dictionary (debugging aid)."""
        for (eId, eDict) in refD.items():
            logger.info("------ Entry id %s", eId)
            for k, v in eDict.items():
                logger.info("%-15s = %r", k, v)

    def getReferenceAccessionAlignSummary(self):
        """Summarize the alignment of PDB accession assignments with the current reference
        sequence database.

        Returns:
            (int, int, int): counts of primary, secondary, and unmatched accessions
        """
        numPrimary = 0
        numSecondary = 0
        numNone = 0
        for _, mD in self.__matchD.items():
            if mD["matched"] == "primary":
                numPrimary += 1
            elif mD["matched"] == "secondary":
                numSecondary += 1
            else:
                numNone += 1
        logger.debug("Matched primary:  %d secondary: %d none %d", numPrimary, numSecondary, numNone)
        return numPrimary, numSecondary, numNone
class PfamProvider(StashableBase):
    """Manage an index of Pfam identifier to description mappings."""

    def __init__(self, **kwargs):
        # Primary and fallback locations for the Pfam clan/description file.
        urlTargetPfam = kwargs.get("urlTargetPfam", "ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.clans.tsv.gz")
        urlTargetPfamFB = "https://github.com/rcsb/py-rcsb_exdb_assets/raw/master/fall_back/Pfam/Pfam-A.clans.tsv.gz"
        # NOTE(review): version is hard-coded rather than derived from the fetched data.
        self.__version = "34.0"
        dirName = "pfam"
        cachePath = kwargs.get("cachePath", ".")
        dirPath = os.path.join(cachePath, dirName)
        super(PfamProvider, self).__init__(cachePath, [dirName])
        useCache = kwargs.get("useCache", True)
        #
        self.__mU = MarshalUtil(workPath=dirPath)
        # pfamId -> description index
        self.__pfamD = self.__rebuildCache(urlTargetPfam, urlTargetPfamFB, dirPath, useCache)
        # Primary and fallback locations for the PDB -> Pfam residue-level mapping file.
        urlTargetMapPfam = kwargs.get("urlTargetMapPfam", "ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/database_files/pdb_pfamA_reg.txt.gz")
        urlTargetMapPfamFB = "https://github.com/rcsb/py-rcsb_exdb_assets/raw/master/fall_back/Pfam/pdb_pfamA_reg.txt.gz"
        # pdbId -> list of Pfam domain assignment dictionaries
        self.__pfamMapD = self.__rebuildMappingCache(urlTargetMapPfam, urlTargetMapPfamFB, dirPath, useCache)

    def getVersion(self):
        """Return the (hard-coded) Pfam data version string."""
        return self.__version

    def getDescription(self, pfamId):
        """Return the description for the input Pfam identifier

        Args:
            pfamId (str): Pfam identifier

        Returns:
            str: text description of the Pfam domain (None if unknown)
        """
        descr = None
        try:
            descr = self.__pfamD[pfamId]
        except Exception:
            pass
        return descr

    def getMapping(self, pdbId):
        """Return the list of Pfam domain assignments for the input PDB identifer
        along with residue level mapping information

        Args:
            pdbId (str): PDB identifier

        Returns:
            list: [{'pfamId': , 'authAsymId": , 'authSeqBeg': , 'authSeqEnd': 'insertBeg': , 'insertEnd': }, {}, ]
        """
        mapL = []
        try:
            mapL = self.__pfamMapD[pdbId.upper()]
        except Exception:
            pass
        return mapL

    def testCache(self):
        # Check length ... thresholds reflect expected minimum sizes of the two indices.
        logger.info("Length PfamD %d", len(self.__pfamD))
        return (len(self.__pfamD) > 19000) and (len(self.__pfamMapD) > 150000)

    #
    def __rebuildCache(self, urlTargetPfam, urlTargetPfamFB, dirPath, useCache):
        """Load the pfamId->description index from cache, or fetch and parse the source file.

        Returns:
            dict: pfamId -> description string
        """
        pfamD = {}
        fmt = "json"
        ext = fmt if fmt == "json" else "pic"
        pfamDataPath = os.path.join(dirPath, "pfam-data.%s" % ext)
        #
        logger.debug("Using cache data path %s", dirPath)
        self.__mU.mkdir(dirPath)
        #
        if useCache and self.__mU.exists(pfamDataPath):
            pfamD = self.__mU.doImport(pfamDataPath, fmt=fmt)
            logger.debug("Pfam data length %d", len(pfamD))
        # NOTE(review): this branch is `elif not useCache` while __rebuildMappingCache
        # uses a plain `else` -- with useCache=True and no cache file this method
        # returns an empty index without fetching. Confirm whether that asymmetry
        # is intentional (e.g. a restore-from-stash workflow).
        elif not useCache:
            # ------
            fU = FileUtil()
            logger.info("Fetch data from source %s in %s", urlTargetPfam, dirPath)
            fp = os.path.join(dirPath, fU.getFileName(urlTargetPfam))
            ok = fU.get(urlTargetPfam, fp)
            if not ok:
                # Primary fetch failed -- retry against the fallback mirror.
                fp = os.path.join(dirPath, fU.getFileName(urlTargetPfamFB))
                ok = fU.get(urlTargetPfamFB, fp)
                logger.info("Fetch data fallback fetch status is %r", ok)
            pfamD = self.__getPfamIndex(fp)
            ok = self.__mU.doExport(pfamDataPath, pfamD, fmt=fmt)
            logger.info("Caching %d in %s status %r", len(pfamD), pfamDataPath, ok)
            # ------
        #
        return pfamD

    def __getPfamIndex(self, filePath):
        """Parse the tab-delimited annotation classification file.

        Builds pfamD[row[0] (upper-cased)] = row[4] description + " (" + row[3] id code + ")".
        Malformed rows are skipped silently.
        """
        pfamD = {}
        # Python 2 requires an explicit ascii encoding hint for this reader.
        encodingD = {"encoding": "ascii"} if sys.version_info[0] < 3 else {}
        rowL = self.__mU.doImport(filePath, fmt="tdd", rowFormat="list", **encodingD)
        for row in rowL:
            try:
                pfamId = row[0].strip().upper()
                idCode = row[3].strip()
                descr = row[4].strip()
                pfamD[pfamId] = descr + " (" + idCode + ")"
            except Exception:
                pass
        #
        return pfamD

    def __rebuildMappingCache(self, urlTargetPfam, urlTargetPfamFB, dirPath, useCache):
        """Load the pdbId->Pfam-assignment mapping from cache, or fetch and parse the source file.

        Returns:
            dict: pdbId -> list of assignment dictionaries (see getMapping)
        """
        fmt = "json"
        ext = fmt if fmt == "json" else "pic"
        pfamDataPath = os.path.join(dirPath, "pfam-mapping-data.%s" % ext)
        #
        logger.debug("Using cache data path %s", dirPath)
        self.__mU.mkdir(dirPath)
        #
        if useCache and self.__mU.exists(pfamDataPath):
            pfamD = self.__mU.doImport(pfamDataPath, fmt=fmt)
            logger.debug("Pfam mapping data length %d", len(pfamD))
        else:
            # ------
            fU = FileUtil()
            logger.info("Fetch data from source %s in %s", urlTargetPfam, dirPath)
            fp = os.path.join(dirPath, fU.getFileName(urlTargetPfam))
            ok = fU.get(urlTargetPfam, fp)
            if not ok:
                # Primary fetch failed -- retry against the fallback mirror.
                fp = os.path.join(dirPath, fU.getFileName(urlTargetPfamFB))
                ok = fU.get(urlTargetPfamFB, fp)
                logger.info("Fetch data fallback fetch status is %r", ok)
            pfamD = self.__getPfamMapping(fp)
            ok = self.__mU.doExport(pfamDataPath, pfamD, fmt=fmt)
            logger.info("Caching %d in %s status %r", len(pfamD), pfamDataPath, ok)
            # ------
        #
        return pfamD

    def __getPfamMapping(self, filePath):
        """Parse the tab-delimited pdb_pfamA_reg mapping file.

        Column usage (by index): 2=pdbId, 3=pfamId, 5=authAsymId, 6=authSeqBeg,
        7=insertBeg ("NULL" -> None), 8=authSeqEnd, 9=insertEnd ("NULL" -> None).
        """
        pFamMapD = {}
        # Python 2 requires an explicit ascii encoding hint for this reader.
        encodingD = {"encoding": "ascii"} if sys.version_info[0] < 3 else {}
        rowL = self.__mU.doImport(filePath, fmt="tdd", rowFormat="list", **encodingD)
        for row in rowL:
            try:
                pdbId = row[2].strip().upper()
                pfamId = row[3].strip().upper()
                authAsymId = row[5].strip()
                authSeqBeg = int(row[6].strip())
                insertBeg = row[7].strip() if row[7].strip() != "NULL" else None
                authSeqEnd = int(row[8].strip())
                insertEnd = row[9].strip() if row[9].strip() != "NULL" else None
                pFamMapD.setdefault(pdbId, []).append({
                    "pfamId": pfamId,
                    "authAsymId": authAsymId,
                    "authSeqBeg": authSeqBeg,
                    "authSeqEnd": authSeqEnd,
                    "insertBeg": insertBeg,
                    "insertEnd": insertEnd,
                })
            except Exception as e:
                logger.exception("Failing with %r %s", row, str(e))
        #
        logger.info("Pfam mapping data for (%d) entries", len(pFamMapD))
        return pFamMapD
class Scop2ClassificationProvider(StashableBase):
    """Extract SCOP2 domain assignments, term descriptions and SCOP classification hierarchy
    from SCOP and SCOP2B flat files.
    """

    def __init__(self, cachePath, useCache, **kwargs):
        """
        Args:
            cachePath (str): top-level cache directory path
            useCache (bool): use any existing serialized assignment cache
        """
        #
        _ = kwargs
        self.__cachePath = cachePath
        dirName = "scop2"
        self.__dirPath = os.path.join(self.__cachePath, dirName)
        self.__useCache = useCache
        super(Scop2ClassificationProvider, self).__init__(self.__cachePath, [dirName])
        #
        self.__version = "latest"
        self.__fmt = "pickle"
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__nD, self.__ntD, self.__pAD, self.__pBD, self.__pBRootD, self.__fD, self.__sfD, self.__sf2bD = self.__reload(useCache=self.__useCache, fmt=self.__fmt)
        #
        if not useCache and not self.testCache():
            # Fix: fetch the backup in the same serialization format that
            # __reload() will look for (previously the default json backup was
            # fetched while __reload searched for the pickle file name).
            ok = self.__fetchFromBackup(self.__fmt)
            if ok:
                self.__nD, self.__ntD, self.__pAD, self.__pBD, self.__pBRootD, self.__fD, self.__sfD, self.__sf2bD = self.__reload(useCache=True, fmt=self.__fmt)
        #

    def testCache(self):
        """Return True when the loaded dictionaries meet minimum expected sizes."""
        logger.info(
            "SCOP2 lengths nD %d pAD %d pBD %d pBRootD %d fD %d sfD %d sf2bD %d",
            len(self.__nD), len(self.__pAD), len(self.__pBD), len(self.__pBRootD), len(self.__fD), len(self.__sfD), len(self.__sf2bD))
        if (len(self.__nD) > 9000) and (len(self.__pAD) > 70000):
            return True
        return False

    def getVersion(self):
        """Returns the SCOP2 version"""
        return self.__version

    def getFamilyIds(self, pdbId, authAsymId):
        """Return the unique SCOP2 family identifiers assigned to the input chain."""
        try:
            return list(set([tup[1] for tup in self.__fD[(pdbId.upper(), authAsymId)]]))
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId, str(e))
        return []

    def getSuperFamilyIds(self, pdbId, authAsymId):
        """Return the unique SCOP2 superfamily identifiers assigned to the input chain."""
        try:
            return list(set([tup[1] for tup in self.__sfD[(pdbId.upper(), authAsymId)]]))
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId, str(e))
        return []

    def getFamilyNames(self, pdbId, authAsymId):
        """Return the unique SCOP2 family names assigned to the input chain."""
        try:
            return list(set([self.__nD[tup[1]] for tup in self.__fD[(pdbId.upper(), authAsymId)]]))
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId, str(e))
        return []

    def getSuperFamilyNames(self, pdbId, authAsymId):
        """Return the unique SCOP2 superfamily names assigned to the input chain."""
        try:
            return list(set([self.__nD[tup[1]] for tup in self.__sfD[(pdbId.upper(), authAsymId)]]))
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId, str(e))
        return []

    def getFamilyResidueRanges(self, pdbId, authAsymId):
        """Return family assignment tuples (domId, familyId, authAsymId, authSeqBeg, authSeqEnd)."""
        try:
            # fD.setdefault((pdbId, authAsymId), []).append((domFamilyId, familyId, authAsymId, authSeqBeg, authSeqEnd))
            return [(tup[0], tup[1], tup[2], tup[3], tup[4]) for tup in self.__fD[(pdbId.upper(), authAsymId)]]
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId, str(e))
        return []

    def getSuperFamilyResidueRanges(self, pdbId, authAsymId):
        """Return superfamily assignment tuples (domId, superFamilyId, authAsymId, authSeqBeg, authSeqEnd)."""
        try:
            return [(tup[0], tup[1], tup[2], tup[3], tup[4]) for tup in self.__sfD[(pdbId.upper(), authAsymId)]]
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId, str(e))
        return []

    def getSuperFamilyNames2B(self, pdbId, authAsymId):
        """Return the unique SCOP2B superfamily names assigned to the input chain."""
        try:
            return list(set([self.__nD[tup[1]] for tup in self.__sf2bD[(pdbId.upper(), authAsymId)]]))
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId, str(e))
        return []

    def getSuperFamilyIds2B(self, pdbId, authAsymId):
        """Return the unique SCOP2B superfamily identifiers assigned to the input chain."""
        try:
            return list(set([tup[1] for tup in self.__sf2bD[(pdbId.upper(), authAsymId)]]))
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId, str(e))
        return []

    def getSuperFamilyResidueRanges2B(self, pdbId, authAsymId):
        """Return SCOP2B superfamily assignment tuples (domId, superFamilyId, authAsymId, beg, end)."""
        try:
            return [(tup[0], tup[1], tup[2], tup[3], tup[4]) for tup in self.__sf2bD[(pdbId.upper(), authAsymId)]]
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId, str(e))
        return []

    def getName(self, domId):
        """Return the name/description for the input SCOP2 node identifier (None if unknown)."""
        try:
            return self.__nD[domId]
        except Exception:
            logger.debug("Undefined SCOP2 id %r", domId)
        return None

    def getNameType(self, domId):
        """Return the human-readable node type for the input SCOP2 node identifier."""
        qD = {"TP": "Protein Type", "CL": "Protein Class", "CF": "Fold", "SF": "Superfamily", "FA": "Family"}
        try:
            return qD[self.__ntD[domId]]
        except Exception:
            # Fix: message referred to "ECOD" (copy-paste from a sibling provider).
            logger.debug("Undefined SCOP2 id %r", domId)
        return None

    def getIdLineage(self, domId):
        """Return the sorted set of ancestor identifiers (both trees) for the input node."""
        pS = set()
        try:
            pS.add(domId)
            # Walk the protein-type rooted tree ...
            pt = self.__pAD[domId]
            while (pt is not None) and (pt != 0):
                pS.add(pt)
                pt = self.__pAD[pt]
            # ... and the structural-class rooted tree.
            pt = self.__pBD[domId]
            while (pt is not None) and (pt != 0):
                pS.add(pt)
                pt = self.__pBD[pt]
        except Exception as e:
            logger.debug("Failing for %r with %s", domId, str(e))
        #
        return sorted(pS)

    def getNameLineage(self, domId):
        """Return the lineage of node names corresponding to getIdLineage()."""
        try:
            nL = []
            for dId in self.getIdLineage(domId):
                tN = self.getName(dId)
                tN = tN if tN else "Unnamed"
                nL.append(tN)
            return nL
        except Exception as e:
            logger.debug("Failing for %r with %s", domId, str(e))
        return None

    def getTreeNodeList(self):
        """Return the exported tree node list for the loaded classification."""
        tnL = self.__exportTreeNodeList(self.__nD, self.__pAD, self.__pBRootD)
        return tnL

    def __getAssignmentFileName(self, fmt="json"):
        """Return the serialized assignment cache file name for the input format."""
        ext = "json" if fmt == "json" else "pic"
        fn = "scop2_domain_assignments.%s" % ext
        return fn

    def __reload(self, useCache=True, fmt="json"):
        """Load the serialized assignment cache or rebuild it from the primary sources.

        Returns:
            tuple of 8 dicts: names, name types, type-tree parents, class-tree parents,
            class-tree root parents, family, superfamily and SCOP2B assignments
        """
        # Fix: use independent empty dicts -- the previous chained assignment
        # bound all eight names to one shared dictionary object.
        nD, ntD, pAD, pBD, pBRootD, fD, sfD, sf2bD = {}, {}, {}, {}, {}, {}, {}, {}
        fn = self.__getAssignmentFileName(fmt=fmt)
        assignmentPath = os.path.join(self.__dirPath, fn)
        self.__mU.mkdir(self.__dirPath)
        #
        if useCache and self.__mU.exists(assignmentPath):
            sD = self.__mU.doImport(assignmentPath, fmt=fmt)
            logger.debug("Domain name count %d", len(sD["names"]))
            self.__version = sD["version"]
            nD = sD["names"]
            ntD = sD["nametypes"]
            pAD = sD["parentsType"]
            pBD = sD["parentsClass"]
            pBRootD = sD["parentsClassRoot"]
            fD = sD["families"]
            sfD = sD["superfamilies"]
            sf2bD = sD["superfamilies2b"]
        elif not useCache:
            nmL, dmL, scop2bL, _ = self.__fetchFromSource()
            #
            ok = False
            nD = self.__extractNames(nmL)
            logger.info("Domain name dictionary (%d)", len(nD))
            pAD, pBD, pBRootD, ntD, fD, sfD, domToSfD = self.__extractDomainHierarchy(dmL)
            #
            logger.info("Domain node parent hierarchy (protein type) (%d)", len(pAD))
            logger.info("Domain node parent hierarchy (structural class) (%d)", len(pBD))
            logger.info("Domain node parent hierarchy (structural class root) (%d)", len(pBRootD))
            logger.info("SCOP2 core domain assignments (family %d) (sf %d)", len(fD), len(sfD))
            #
            sf2bD = self.__extractScop2bSuperFamilyAssignments(scop2bL, domToSfD)
            logger.info("SCOP2B SF domain assignments (%d)", len(sf2bD))
            #
            tS = datetime.datetime.now().isoformat()
            # vS = datetime.datetime.now().strftime("%Y-%m-%d")
            vS = self.__version
            sD = {
                "version": vS,
                "created": tS,
                "names": nD,
                "nametypes": ntD,
                "parentsType": pAD,
                "parentsClass": pBD,
                "parentsClassRoot": pBRootD,
                "families": fD,
                "superfamilies": sfD,
                "superfamilies2b": sf2bD,
            }
            ok = self.__mU.doExport(assignmentPath, sD, fmt=fmt, indent=3)
            logger.info("Cache save status %r", ok)
        #
        return nD, ntD, pAD, pBD, pBRootD, fD, sfD, sf2bD

    def __fetchFromBackup(self, fmt="json"):
        """Fetch the serialized assignment cache from the fallback GitHub location.

        Returns:
            bool: fetch status
        """
        urlTarget = "https://raw.githubusercontent.com/rcsb/py-rcsb_exdb_assets/master/fall_back/SCOP2"
        #
        fn = self.__getAssignmentFileName(fmt=fmt)
        assignmentPath = os.path.join(self.__dirPath, fn)
        urlPath = os.path.join(urlTarget, fn)
        # Fix: previously created a directory at the target *file* path
        # (mkdir(assignmentPath)), which would block the subsequent fetch.
        self.__mU.mkdir(self.__dirPath)
        #
        logger.info("Using backup URL %r", urlPath)
        fU = FileUtil()
        ok = fU.get(urlPath, assignmentPath)
        return ok

    def __fetchFromSource(self):
        """Fetch the classification names and domain assignments from SCOP2 and SCOP2B resources.

        SCOP2 domain names:
            https://scop.mrc-lmb.cam.ac.uk/files/scop-des-latest.txt
        SCOP2 domain hierarchy:
            https://scop.mrc-lmb.cam.ac.uk/files/scop-cla-latest.txt
        SIFTS extrapolated SCOP2 and SCOP2B assignments:
            https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_scop2b_sf_uniprot.tsv.gz
            https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_scop2_uniprot.tsv.gz
        """
        urlTargetScop2 = "https://scop.mrc-lmb.cam.ac.uk/files"
        encoding = "utf-8-sig" if sys.version_info[0] > 2 else "ascii"
        fn = "scop-des-latest.txt"
        url = os.path.join(urlTargetScop2, fn)
        desL = self.__mU.doImport(url, fmt="list", uncomment=True, encoding=encoding)
        logger.info("Fetched URL is %s len %d", url, len(desL))
        #
        fn = "scop-cla-latest.txt"
        url = os.path.join(urlTargetScop2, fn)
        claL = self.__mU.doImport(url, fmt="list", uncomment=True, encoding=encoding)
        logger.info("Fetched URL is %s len %d", url, len(claL))
        # Re-read with comments retained to recover the release date from the header.
        headerLines = self.__mU.doImport(url, fmt="list", uncomment=False, encoding=encoding)
        self.__version = headerLines[0].split(" ")[3] if headerLines else "2021-05-27"
        # JDW note cert issues with this site
        urlTargetSifts = "http://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv"
        fn = "pdb_chain_scop2b_sf_uniprot.tsv.gz"
        url = os.path.join(urlTargetSifts, fn)
        scop2bL = self.__mU.doImport(url, fmt="tdd", rowFormat="dict", uncomment=True, encoding=encoding)
        logger.info("Fetched URL is %s len %d", url, len(scop2bL))
        #
        fn = "pdb_chain_scop2_uniprot.tsv.gz"
        url = os.path.join(urlTargetSifts, fn)
        scop2L = self.__mU.doImport(url, fmt="tdd", rowFormat="dict", uncomment=True, encoding=encoding)
        # Fix: previously logged len(scop2bL) for this file.
        logger.info("Fetched URL is %s len %d", url, len(scop2L))
        #
        return desL, claL, scop2bL, scop2L

    def __extractNames(self, nmL):
        """Build the node identifier -> name dictionary from the description file lines."""
        rD = {}
        logger.info("Length of input name list %d", len(nmL))
        for nm in nmL:
            ff = nm.split(" ")
            rD[ff[0]] = " ".join(ff[1:])
        # self.__mU.doExport(os.path.join(self.__dirPath, "scop2-names.json"), rD, fmt="json", indent=3)
        return rD

    def __extractDomainHierarchy(self, dmL):
        """Extract the domain node identifier hierarchy from the SCOP2 representative assignment file ...

        Returns:
            dict, dict, dict, dict, dict: parent and name type dictionaries,
            family and superfamily assignments, and domain to superfamily mapping

            ntD[domainId] = name type TP=protein type, CL=protein class, CF=fold, SF=superfamily, FA=family
            pD[child domain identifier] = parent domain identifier
            fD[(pdbId, authAsymId)] = [(faDomId, faId, authAsymId, resBeg, resEnd),]
            sfD[(pdbId, authAsymId)] = [(sfDomId, sfId, authAsymId, resBeg, resEnd),]
            domToSfD[domSfid] = sfId

        Example assignment file:

        # SCOP release 2021-05-27
        # http://scop.mrc-lmb.cam.ac.uk
        # based on PDB release 2021-05-14
        # based on UniProt realese 2021-04-08
        # based on SIFTS release 2021-05-19
        # FA-DOMID FA-PDBID FA-PDBREG FA-UNIID FA-UNIREG SF-DOMID SF-PDBID SF-PDBREG SF-UNIID SF-UNIREG SCOPCLA
        8045703 3H8D C:1143-1264 Q64331 1143-1264 8091604 3H8D C:1143-1264 Q64331 1143-1264 TP=1,CL=1000003,CF=2001470,SF=3002524,FA=4004627
        8094330 6J56 A:1158-1282 Q9UM54 1167-1291 8094331 6J56 A:1158-1282 Q9UM54 1167-1291 TP=1,CL=1000003,CF=2001470,SF=3002524,FA=4004627
        """
        # Build the parent dictionary and name node type
        ntD = {}
        pAD = {}
        pBD = {}
        pBRootD = {}
        fD = {}
        sfD = {}
        domToSfD = {}
        #
        logger.info("Length of input domain assignment list %d", len(dmL))
        for dm in dmL:
            try:
                ff = dm.split(" ")
                domFamilyId = ff[0]
                domSuperFamilyId = ff[5]
                rngL = ff[10].split(",")
                tD = {}
                for rng in rngL:
                    tL = rng.split("=")
                    tD[tL[0]] = tL[1]
                #
                # Represent as two trees separately rooted in protein type and structural class
                pAD[tD["TP"]] = 0
                pAD[tD["CF"]] = tD["TP"]
                pAD[tD["SF"]] = tD["CF"]
                pAD[tD["FA"]] = tD["SF"]
                pAD[domFamilyId] = tD["FA"]
                pAD[domSuperFamilyId] = tD["SF"]
                #
                # Use this complete pBD here only for generating ID lineages, but NOT for merging with pAD
                pBD[tD["CL"]] = 0
                pBD[tD["CF"]] = tD["CL"]
                pBD[tD["SF"]] = tD["CF"]
                pBD[tD["FA"]] = tD["SF"]
                pBD[domFamilyId] = tD["FA"]
                pBD[domSuperFamilyId] = tD["SF"]
                #
                # Use pBRootD for creating tree node lists; Don't capture any lower branches to avoid re-creating redundant key:values already in pAD
                pBRootD[tD["CL"]] = 0
                pBRootD[tD["CF"]] = tD["CL"]
                #
                ntD[tD["FA"]] = "FA"
                ntD[tD["SF"]] = "SF"
                ntD[tD["CF"]] = "CF"
                ntD[tD["CL"]] = "CL"
                ntD[tD["TP"]] = "TP"
                #
                pdbId = ff[1]
                authAsymId, authSeqBeg, authSeqEnd = self.__parseAssignment(ff[2])
                if authAsymId is not None:
                    fD.setdefault((pdbId, authAsymId), []).append((domFamilyId, tD["FA"], authAsymId, authSeqBeg, authSeqEnd))
                pdbId = ff[6]
                authAsymId, authSeqBeg, authSeqEnd = self.__parseAssignment(ff[7])
                if authAsymId is not None:
                    sfD.setdefault((pdbId, authAsymId), []).append((domSuperFamilyId, tD["SF"], authAsymId, authSeqBeg, authSeqEnd))
                #
                domToSfD[domSuperFamilyId] = tD["SF"]
            except Exception as e:
                logger.exception("Failing for case %r: %s", dm, str(e))
        #
        logger.info("pAD (%d) pBD (%d) pBRootD (%d) ntD (%d)", len(pAD), len(pBD), len(pBRootD), len(ntD))
        logger.info("fD (%d) sfD (%d)", len(fD), len(sfD))
        return pAD, pBD, pBRootD, ntD, fD, sfD, domToSfD

    def __parseAssignment(self, tS):
        """Parse an "asymId:beg-end" residue range token; handles a negative begin value.

        Returns:
            (str, int, int): authAsymId, authSeqBeg, authSeqEnd (None values on parse failure)
        """
        authAsymId = authSeqBeg = authSeqEnd = None
        try:
            fL = tS.split(":")
            authAsymId = fL[0]
            rS = fL[1]
            if rS[0] == "-":
                # Leading "-" marks a negative begin residue (e.g. "-5-20").
                authSeqBeg = -int(rS[1:].split("-")[0])
                authSeqEnd = int(rS[1:].split("-")[1])
            else:
                authSeqBeg = int(rS.split("-")[0])
                authSeqEnd = int(rS.split("-")[1])
        except Exception:
            pass
        return authAsymId, authSeqBeg, authSeqEnd

    def __extractScop2bSuperFamilyAssignments(self, scop2bL, domToSfD):
        """Extract the SCOP2B SIFTS superfamily domain assignments for PDB structure entries.

        Returns:
            aD[(pdbId, authAsymId)] = [(sfDomId, sfId, authAsymId, resBeg, resEnd),]

        Example:
            # 2021/06/12 - 05:52 | PDB: 23.21 | UniProt: 2021.03
            PDB CHAIN SF_DOMID SP_PRIMARY RES_BEG RES_END PDB_BEG PDB_END SP_BEG SP_END
            5id7 B 8033045 P02768 197 388 197 388 221 412
            1o9x A 8033045 P02768 197 388 197 388 221 412
        """
        sfD = {}
        try:
            for rowD in scop2bL:
                if rowD["SF_DOMID"] in domToSfD:
                    sfD.setdefault((rowD["PDB"].upper(), rowD["CHAIN"]), []).append(
                        (rowD["SF_DOMID"], domToSfD[rowD["SF_DOMID"]], rowD["CHAIN"], rowD["PDB_BEG"], rowD["PDB_END"]))
                else:
                    logger.warning("Missing SCOP2B SF ID mapping for %r", rowD["SF_DOMID"])
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return sfD

    def __exportTreeNodeList(self, nD, pAD, pBRootD):
        """Create node list from the SCOP2 parent and name/description dictionaries.

        Exclude the root node from the tree.
        """
        #
        rootId = 0
        pL = [rootId]
        #
        logger.info("nD %d pAD %d pBRootD %d pL %r", len(nD), len(pAD), len(pBRootD), pL)
        # create child dictionary
        cD = {}
        for ctId, ptId in pAD.items():
            cD.setdefault(ptId, []).append(ctId)
        for ctId, ptId in pBRootD.items():
            cD.setdefault(ptId, []).append(ctId)
        #
        logger.debug("cD %d", len(cD))
        # Breadth-first traversal from the root collecting reachable node ids.
        idL = []
        for rootId in sorted(pL):
            visited = set([rootId])
            queue = collections.deque(visited)
            while queue:
                tId = queue.popleft()
                idL.append(tId)
                if tId not in cD:
                    # logger.warning("No children for scop tId %r", tId)
                    continue
                for childId in cD[tId]:
                    if childId not in visited:
                        queue.append(childId)
                        visited.add(childId)
        #
        dL = []
        for tId in idL:
            displayName = nD[tId] if tId in nD else None
            ptIdL = []
            if tId in pAD:
                ptIdL.append(pAD[tId])
            if tId in pBRootD:
                ptIdL.append(pBRootD[tId])
            lL = self.getIdLineage(tId)[1:]
            #
            # d = {'id': str(tId), 'name': displayName, 'lineage': [str(t) for t in lL], 'parents': [str(ptId)], 'depth': len(lL)}
            if tId == rootId:
                continue
            elif any([ptId == rootId for ptId in ptIdL]):
                # Direct children of the root carry no parent list.
                dD = {"id": str(tId), "name": displayName, "depth": 0}
            else:
                displayName = displayName if displayName else "Domain %s" % str(tId)
                dD = {"id": str(tId), "name": displayName, "parents": ptIdL, "depth": len(lL)}
            dL.append(dD)
        return dL
class CitationExtractor(object):
    """Utilities to extract citation related data from the core_entry collection."""

    def __init__(self, cfgOb, **kwargs):
        """
        Args:
            cfgOb: configuration object providing database access details
            **kwargs: cache options (useCache, exdbDirPath, cacheKwargs)
        """
        self.__cfgOb = cfgOb
        self.__resourceName = "MONGO_DB"
        self.__databaseName = "pdbx_core"
        self.__collectionName = "pdbx_core_entry"
        #
        self.__mU = MarshalUtil()
        #
        self.__entryD = self.__rebuildCache(**kwargs)
        self.__idxD = self.__buildIndices(self.__entryD)
        #

    def __rebuildCache(self, **kwargs):
        """Load cached extracted citation data or re-extract it from the object store.

        Returns:
            dict: entryId -> entry citation data
        """
        useCache = kwargs.get("useCache", True)
        dirPath = kwargs.get("exdbDirPath", ".")
        cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "pickle"})
        #
        ext = "pic" if cacheKwargs["fmt"] == "pickle" else "json"
        fn = "entry-citation-extracted-data-cache" + "." + ext
        cacheFilePath = os.path.join(dirPath, fn)
        cD = {"entryD": {}}
        try:
            if useCache and cacheFilePath and os.access(cacheFilePath, os.R_OK):
                logger.info("Using cached entry citation file %s", cacheFilePath)
                cD = self.__mU.doImport(cacheFilePath, **cacheKwargs)
            else:
                entryD = self.__extractCitations()
                cD["entryD"] = entryD
                if cacheFilePath:
                    ok = self.__mU.mkdir(dirPath)
                    ok = self.__mU.doExport(cacheFilePath, cD, **cacheKwargs)
                    logger.info("Saved entry citation results (%d) status %r in %s", len(entryD), ok, cacheFilePath)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return cD["entryD"]

    def __buildIndices(self, entryD):
        """Build a journal-name frequency index over the primary citation of each entry
        and log summary statistics (PubMed/DOI/ISSN coverage).

        Example entry data:

            "entryD": {
                "5KAL": {
                    "citation": [{"journal_abbrev": "Nucleic Acids Res.", ...,
                                  "pdbx_database_id_DOI": ..., "pdbx_database_id_PubMed": ...}],
                    "_entry_id": "5KAL"
                },
            }

        Returns:
            dict: journal abbreviation -> count over primary citations
        """
        indD = {}
        missingCitationCount = 0
        missingJournalName = 0
        numPubMed = 0
        numDOI = 0
        numCitations = 0
        mD = {}
        issnD = {}
        missingISSNCount = 0
        missingPubMedCount = 0
        for entryId, eD in entryD.items():
            # Fix: handle each entry separately so one malformed record no longer
            # aborts the indexing of all remaining entries.
            try:
                cDL = eD["citation"] if "citation" in eD else None
                if cDL:
                    # Only the first (primary) citation is indexed.
                    for cD in cDL[:1]:
                        if cD and "journal_abbrev" in cD:
                            indD[cD["journal_abbrev"]] = indD[cD["journal_abbrev"]] + 1 if cD["journal_abbrev"] in indD else 1
                        else:
                            logger.info("Missing journal name in entryId %s %r ", entryId, cD)
                            missingJournalName += 1
                        if cD and "pdbx_database_id_DOI" in cD:
                            numDOI += 1
                        if cD and "pdbx_database_id_PubMed" in cD:
                            numPubMed += 1
                        else:
                            # Fix: previously raised KeyError/TypeError when the
                            # journal abbreviation was also missing here.
                            jAbbrev = cD["journal_abbrev"] if cD and "journal_abbrev" in cD else "unknown"
                            mD[jAbbrev] = mD[jAbbrev] + 1 if jAbbrev in mD else 1
                            missingPubMedCount += 1
                        # Fix: guard the membership test against a falsy citation record.
                        if cD and "journal_id_ISSN" in cD and len(cD["journal_id_ISSN"]) > 7:
                            issnD[cD["journal_id_ISSN"]] = issnD[cD["journal_id_ISSN"]] + 1 if cD["journal_id_ISSN"] in issnD else 1
                        else:
                            missingISSNCount += 1
                        if cD:
                            numCitations += 1
                else:
                    missingCitationCount += 1
            except Exception as e:
                logger.exception("Failing for entry %s with %s", entryId, str(e))
        #
        logger.info("Number of citatons %d", numCitations)
        logger.info("Number of PubMed ids %d", numPubMed)
        logger.info("Number of DOIs %d", numDOI)
        logger.info("No citation category count %d missing journal name %d", missingCitationCount, missingJournalName)
        #
        logger.info("Journal index name length %d", len(indD))
        # logger.info("Journal name length %r",indD.items())
        #
        logger.info("Missing pubmed index length %d", len(mD))
        logger.info("Missing pubmed length %d", missingPubMedCount)
        logger.info("Missing PubMed %r", mD.items())
        #
        logger.info("ISSN dictionary length %d", len(issnD))
        logger.info("ISSN missing length %d", missingISSNCount)
        #
        return indD

    def getEntryCount(self):
        """Return the number of extracted entries."""
        return len(self.__entryD)

    def __extractCitations(self):
        """Extract citation data (rcsb_id + citation category) for all core entries.

        Returns:
            dict: rcsb_id -> selected entry content ({} on failure)
        """
        try:
            obEx = ObjectExtractor(
                self.__cfgOb,
                databaseName=self.__databaseName,
                collectionName=self.__collectionName,
                cacheFilePath=None,
                useCache=False,
                keyAttribute="entry",
                uniqueAttributes=["rcsb_id"],
                cacheKwargs=None,
                objectLimit=None,
                selectionQuery={},
                selectionList=["rcsb_id", "citation"],
            )
            eCount = obEx.getCount()
            logger.info("Entry count is %d", eCount)
            objD = obEx.getObjects()
            # for ky, eD in objD.items():
            #     logger.info("%s: %r", ky, eD)
            return objD
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return {}
class GlyGenProvider(StashableBase):
    """Fetch glycans and glycoproteins available in the GlyGen.org resource.

    GlyGen glycan link template - https://glygen.org/glycan/G28882EF

    Glycoprotein link template - https://www.glygen.org/protein/Q658T7
    """

    def __init__(self, **kwargs):
        #
        dirName = "glygen"
        cachePath = kwargs.get("cachePath", ".")
        self.__dirPath = os.path.join(cachePath, dirName)
        super(GlyGenProvider, self).__init__(cachePath, [dirName])
        useCache = kwargs.get("useCache", True)
        #
        # Primary GlyGen release endpoint and GitHub fallback mirror.
        baseUrl = kwargs.get("glygenBasetUrl", "https://data.glygen.org/ln2data/releases/data/v-1.12.3/reviewed/")
        fallbackUrl = kwargs.get("glygenFallbackUrl", "https://raw.githubusercontent.com/rcsb/py-rcsb_exdb_assets/master/fall_back/glygen/")
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        # glyTouCanId -> glycan name, and uniProtId -> glycoprotein accession suffix
        self.__glycanD = self.__reloadGlycans(baseUrl, fallbackUrl, self.__dirPath, useCache=useCache)
        self.__glycoproteinD = self.__reloadGlycoproteins(baseUrl, fallbackUrl, self.__dirPath, useCache=useCache)

    def testCache(self, minGlycanCount=20000, minGlycoproteinCount=64000):
        # Both indices must exceed their expected minimum sizes.
        logger.info("GlyGen glycan list (%d) glycoprotein list (%d)", len(self.__glycanD), len(self.__glycoproteinD))
        if self.__glycanD and len(self.__glycanD) > minGlycanCount and self.__glycoproteinD and len(self.__glycoproteinD) > minGlycoproteinCount:
            return True
        return False

    def hasGlycan(self, glyTouCanId):
        """Return True when the input GlyTouCan identifier is present in the GlyGen glycan list."""
        try:
            return glyTouCanId in self.__glycanD
        except Exception:
            return False

    def hasGlycoprotein(self, uniProtId):
        """Return True when the input UniProt identifier is present in the GlyGen glycoprotein list."""
        try:
            return uniProtId in self.__glycoproteinD
        except Exception:
            return False

    def getGlycans(self):
        """Return the glycan dictionary (glyTouCanId -> name)."""
        return self.__glycanD

    def getGlycoproteins(self):
        """Return the glycoprotein dictionary (uniProtId -> accession suffix)."""
        return self.__glycoproteinD

    def __reloadGlycans(self, baseUrl, fallbackUrl, dirPath, useCache=True):
        """Load the glycan index from cache, or fetch and parse the GlyGen master list.

        Returns:
            dict: glyTouCanId -> glycan name
        """
        gD = {}
        logger.debug("Using dirPath %r", dirPath)
        self.__mU.mkdir(dirPath)
        #
        myDataPath = os.path.join(dirPath, "glygen-glycan-list.json")
        if useCache and self.__mU.exists(myDataPath):
            gD = self.__mU.doImport(myDataPath, fmt="json")
            logger.debug("GlyGen glycan data length %d", len(gD))
        # NOTE(review): this branch is `elif not useCache` while __reloadGlycoproteins
        # uses a plain `else` -- with useCache=True and no cache file this method
        # returns an empty index without fetching. Confirm whether the asymmetry
        # is intentional.
        elif not useCache:
            logger.debug("Fetch GlyGen glycan data from primary data source %s", baseUrl)
            endPoint = os.path.join(baseUrl, "glycan_masterlist.csv")
            #
            logger.info("Fetch GlyGen glycan data from primary data source %s", endPoint)
            rawPath = os.path.join(dirPath, "glycan_masterlist.csv")
            fU = FileUtil()
            ok = fU.get(endPoint, rawPath)
            logger.debug("Fetch GlyGen glycan data status %r", ok)
            if not ok:
                # Primary fetch failed -- retry against the fallback mirror.
                endPoint = os.path.join(fallbackUrl, "glycan_masterlist.csv")
                ok = fU.get(endPoint, rawPath)
                logger.info("Fetch fallback GlyGen glycan data status %r", ok)
            #
            if ok:
                gD = self.__parseGlycanList(rawPath)
                ok = self.__mU.doExport(myDataPath, gD, fmt="json")
                logger.info("Exported GlyGen glycan list (%d) (%r) %s", len(gD), ok, myDataPath)
        #
        return gD

    def __parseGlycanList(self, filePath):
        """Parse the glycan master list CSV: column 0 -> key, column 1 -> value (header skipped)."""
        gD = {}
        row = None
        try:
            rowL = self.__mU.doImport(filePath, fmt="csv", rowFormat="list")
            logger.debug("Glycan list length (%d)", len(rowL))
            logger.debug("Row 0 %r", rowL[0])
            for row in rowL[1:]:
                gD[row[0]] = row[1]
        except Exception as e:
            logger.exception("Failing for %r (%r) with %s", filePath, row, str(e))
        return gD

    def __reloadGlycoproteins(self, baseUrl, fallbackUrl, dirPath, useCache=True):
        """Load the glycoprotein index from cache, or fetch and merge the per-organism master lists.

        Returns:
            dict: uniProtId -> accession suffix
        """
        gD = {}
        logger.debug("Using dirPath %r", dirPath)
        self.__mU.mkdir(dirPath)
        #
        myDataPath = os.path.join(dirPath, "glygen-glycoprotein-list.json")
        if useCache and self.__mU.exists(myDataPath):
            gD = self.__mU.doImport(myDataPath, fmt="json")
            logger.debug("GlyGen glycoprotein data length %d", len(gD))
        else:
            # Merge all supported organism master lists into a single index.
            for fn in [
                    "sarscov1_protein_masterlist.csv",
                    "sarscov2_protein_masterlist.csv",
                    "hcv1b_protein_masterlist.csv",
                    "hcv1a_protein_masterlist.csv",
                    "human_protein_masterlist.csv",
                    "mouse_protein_masterlist.csv",
                    "rat_protein_masterlist.csv",
            ]:
                logger.debug("Fetch GlyGen glycoprotein data from primary data source %s", baseUrl)
                endPoint = os.path.join(baseUrl, fn)
                #
                logger.debug("Fetch GlyGen glycoprotein data from primary data source %s", endPoint)
                rawPath = os.path.join(dirPath, fn)
                # NOTE(review): FileUtil is re-instantiated on every loop iteration;
                # it could be hoisted above the loop.
                fU = FileUtil()
                ok = fU.get(endPoint, rawPath)
                logger.debug("Fetch GlyGen glycoprotein data status %r", ok)
                if not ok:
                    # Primary fetch failed -- retry against the fallback mirror.
                    endPoint = os.path.join(fallbackUrl, fn)
                    ok = fU.get(endPoint, rawPath)
                    logger.info("Fetch fallback GlyGen data status %r", ok)
                #
                if ok:
                    tD = self.__parseGlycoproteinList(rawPath)
                    gD.update(tD)
            #
            ok = self.__mU.doExport(myDataPath, gD, fmt="json")
            logger.info("Exported GlyGen glycoprotein list (%d) (%r) %s", len(gD), ok, myDataPath)
        #
        return gD

    def __parseGlycoproteinList(self, filePath):
        """Parse a glycoprotein master list CSV: column 0 is "uniProtId-suffix" (header skipped)."""
        gD = {}
        try:
            rowL = self.__mU.doImport(filePath, fmt="csv", rowFormat="list")
            for row in rowL[1:]:
                ff = row[0].split("-")
                gD[ff[0]] = ff[1]
        except Exception as e:
            logger.exception("Failing for %r with %s", filePath, str(e))
        return gD
class CathClassificationProvider(StashableBase):
    """Extract CATH domain assignments, term descriptions and CATH classification
    hierarchy from CATH flat files.
    """

    def __init__(self, **kwargs):
        """
        Keyword Args:
            cachePath (str): top-level cache directory (CATH files live in <cachePath>/cath)
            cathDirPath (str): explicit CATH directory (used only when cachePath is absent)
            useCache (bool): load previously cached data when available (default: True)
            cathTargetUrl (str): primary CATH daily-release URL
            cathFallbackTargetUrl (str): archive URL used when the newest release is unavailable
            cathUrlBackupPath (str): backup location of a pre-built cache file
        """
        #
        self.__dirName = "cath"
        if "cachePath" in kwargs:
            self.__cachePath = os.path.abspath(kwargs.get("cachePath", None))
            self.__cathDirPath = os.path.join(self.__cachePath, self.__dirName)
        else:
            self.__cathDirPath = kwargs.get("cathDirPath", ".")
            self.__cachePath, self.__dirName = os.path.split(os.path.abspath(self.__cathDirPath))
        super(CathClassificationProvider, self).__init__(self.__cachePath, [self.__dirName])
        #
        useCache = kwargs.get("useCache", True)
        urlTarget = kwargs.get("cathTargetUrl", "http://download.cathdb.info/cath/releases/daily-release/newest")
        # Fix: this previously read the "cathTargetUrl" key again (copy/paste error), so a
        # caller overriding the primary URL silently clobbered the fallback URL as well.
        urlFallbackTarget = kwargs.get("cathFallbackTargetUrl", "http://download.cathdb.info/cath/releases/daily-release/archive")  # no trailing /
        urlBackupPath = kwargs.get("cathUrlBackupPath", "https://raw.githubusercontent.com/rcsb/py-rcsb_exdb_assets/master/fall_back/CATH")
        #
        self.__mU = MarshalUtil(workPath=self.__cathDirPath)
        self.__nD, self.__pdbD = self.__reload(urlTarget, urlFallbackTarget, self.__cathDirPath, useCache=useCache)
        # If a fresh build did not produce usable data, recover from the static backup copy.
        if not self.testCache() and not useCache:
            ok = self.__fetchFromBackup(urlBackupPath, self.__cathDirPath)
            if ok:
                self.__nD, self.__pdbD = self.__reload(urlTarget, urlFallbackTarget, self.__cathDirPath, useCache=True)
        #

    def testCache(self):
        """Return True when the loaded name and assignment dictionaries appear complete
        (plausibility thresholds on dictionary sizes)."""
        logger.info("CATH lengths nD %d pdbD %d", len(self.__nD), len(self.__pdbD))
        if (len(self.__nD) > 100) and (len(self.__pdbD) > 5000):
            return True
        return False

    def getCathVersions(self, pdbId, authAsymId):
        """Return the unique CATH versions assigned to this chain.

        aD[(pdbId, authAsymId)] = [(cathId, domainId, (authAsymId, resBeg, resEnd), version)]
        """
        try:
            return list(set([tup[3] for tup in self.__pdbD[(pdbId, authAsymId)]]))
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId, str(e))
        return []

    def getCathIds(self, pdbId, authAsymId):
        """Return the unique CATH classification identifiers assigned to this chain."""
        try:
            return list(set([tup[0] for tup in self.__pdbD[(pdbId, authAsymId)]]))
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId, str(e))
        return []

    def getCathDomainNames(self, pdbId, authAsymId):
        """Return the unique CATH domain identifiers assigned to this chain."""
        try:
            return list(set([tup[1] for tup in self.__pdbD[(pdbId, authAsymId)]]))
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId, str(e))
        return []

    def getCathResidueRanges(self, pdbId, authAsymId):
        """Return (cathId, domainId, authAsymId, resBeg, resEnd) tuples for this chain."""
        try:
            return [(tup[0], tup[1], tup[2][0], tup[2][1], tup[2][2]) for tup in self.__pdbD[(pdbId, authAsymId)]]
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId, str(e))
        return []

    def getCathName(self, cathId):
        """Return the text name for a CATH identifier, or None when undefined."""
        try:
            return self.__nD[cathId]
        except Exception:
            logger.debug("Undefined CATH id %r", cathId)
        return None

    def getIdLineage(self, cathId):
        """Return the lineage of a dotted CATH id, e.g. "1.10.8" -> ["1", "1.10", "1.10.8"]."""
        try:
            ff = cathId.split(".")
            return [".".join(ff[0:jj]) for jj in range(1, len(ff) + 1)]
        except Exception:
            logger.debug("No lineage for bad CATH id %r", cathId)
        return None

    def getNameLineage(self, cathId):
        """Return the list of names along the lineage of the input CATH id."""
        try:
            return [self.getCathName(cId) for cId in self.getIdLineage(cathId)]
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return None

    def getTreeNodeList(self):
        """Return the CATH hierarchy as a flat node list (see __exportTreeNodeList())."""
        return self.__exportTreeNodeList(self.__nD)

    def __getCathDomainFileName(self):
        """Return the Python-major-version-qualified cache file name."""
        pyVersion = sys.version_info[0]
        fn = "cath_domains-py%s.pic" % str(pyVersion)
        return fn

    def __reload(self, urlTarget, urlFallbackTarget, cathDirPath, useCache=True):
        """Load the CATH name and assignment dictionaries, from the pickled local cache
        when available, otherwise rebuilt from the CATH repository.

        Args:
            urlTarget (str): newest daily-release URL
            urlFallbackTarget (str): archive daily-release URL
            cathDirPath (str): local cache directory
            useCache (bool, optional): read the cache file when present. Defaults to True.

        Returns:
            (dict, dict): names dictionary, assignments dictionary. Both are empty when
                          useCache is True but no cache file exists -- the constructor
                          then recovers via __fetchFromBackup().
        """
        nD = {}
        pdbD = {}
        fn = self.__getCathDomainFileName()
        cathDomainPath = os.path.join(cathDirPath, fn)
        self.__mU.mkdir(cathDirPath)
        #
        if useCache and self.__mU.exists(cathDomainPath):
            sD = self.__mU.doImport(cathDomainPath, fmt="pickle")
            logger.debug("Cath domain length %d", len(sD))
            nD = sD["names"]
            pdbD = sD["assignments"]
        elif not useCache:
            minLen = 1000
            logger.info("Fetch CATH name and domain assignment data from primary data source %s", urlTarget)
            nmL, dmL = self.__fetchFromSource(urlTarget, urlFallbackTarget, minLen)
            #
            nD = self.__extractNames(nmL)
            dD = self.__extractDomainAssignments(dmL)
            pdbD = self.__buildAssignments(dD)
            sD = {"names": nD, "assignments": pdbD}
            # Only persist results that look complete (plausibility threshold).
            if (len(nD) > minLen) and (len(dD) > minLen):
                ok = self.__mU.doExport(cathDomainPath, sD, fmt="pickle")
                logger.debug("Cache save status %r", ok)
        #
        return nD, pdbD

    def __fetchFromBackup(self, urlBackupPath, cathDirPath):
        """Fetch a pre-built cache file from the backup location.

        Returns:
            bool: fetch status
        """
        fn = self.__getCathDomainFileName()
        cathDomainPath = os.path.join(cathDirPath, fn)
        self.__mU.mkdir(cathDirPath)
        #
        backupUrl = urlBackupPath + "/" + fn
        logger.info("Using backup URL %r", backupUrl)
        fU = FileUtil()
        ok = fU.get(backupUrl, cathDomainPath)
        return ok

    def __fetchFromSource(self, urlTarget, urlFallbackTarget, minLen):
        """Fetch the classification names and domain assignments from CATH repo.

        http://download.cathdb.info/cath/releases/daily-release/newest/cath-b-newest-all.gz
        http://download.cathdb.info/cath/releases/daily-release/newest/cath-b-newest-names.gz
        #
        http://download.cathdb.info/cath/releases/daily-release/archive/cath-b-yyyymmdd-all.gz
        http://download.cathdb.info/cath/releases/daily-release/archive/cath-b-yyyymmdd-names-all.gz
        """
        fn = "cath-b-newest-names.gz"
        url = os.path.join(urlTarget, fn)
        nmL = self.__mU.doImport(url, fmt="list", uncomment=True)
        #
        if not nmL or len(nmL) < minLen:
            # Fall back to yesterday's date-stamped archive file.
            # (A dead duplicate assignment using datetime.today() was removed here.)
            dS = datetime.strftime(datetime.now() - timedelta(1), "%Y%m%d")
            fn = "cath-b-%s-names-all.gz" % dS
            url = os.path.join(urlFallbackTarget, fn)
            logger.info("Using fallback resource for %s", fn)
            nmL = self.__mU.doImport(url, fmt="list", uncomment=True)
        #
        fn = "cath-b-newest-all.gz"
        url = os.path.join(urlTarget, fn)
        dmL = self.__mU.doImport(url, fmt="list", uncomment=True)
        #
        if not dmL or len(dmL) < minLen:
            dS = datetime.strftime(datetime.now() - timedelta(1), "%Y%m%d")
            fn = "cath-b-%s-all.gz" % dS
            url = os.path.join(urlFallbackTarget, fn)
            logger.info("Using fallback resource for %s", fn)
            dmL = self.__mU.doImport(url, fmt="list", uncomment=True)
        #
        return nmL, dmL

    def __extractNames(self, nmL):
        """Build {cathId: name} from the names file lines.

        From cath-b-newest-names:

        1 Mainly Alpha
        2 Mainly Beta
        3 Alpha Beta
        4 Few Secondary Structures
        1.10 Orthogonal Bundle
        1.20 Up-down Bundle
        1.25 Alpha Horseshoe
        1.40 Alpha solenoid
        1.50 Alpha/alpha barrel
        2.10 Ribbon
        2.20 Single Sheet
        2.30 Roll
        2.40 Beta Barrel
        2.50 Clam
        2.60 Sandwich
        2.70 Distorted Sandwich
        2.80 Trefoil
        2.90 Orthogonal Prism
        2.100 Aligned Prism
        2.102 3-layer Sandwich
        """
        rD = {}
        logger.info("length of input name list %d", len(nmL))
        for nm in nmL:
            # First token is the dotted CATH id; the remainder is the name text.
            ff = nm.split(" ")
            rD[ff[0]] = " ".join(ff[1:])
        return rD

    def __extractDomainAssignments(self, dmL):
        """Parse the domain assignment file lines.

        From cath-b-newest-all:

        101mA00 v4_2_0 1.10.490.10 0-153:A
        102lA00 v4_2_0 1.10.530.40 1-162:A
        102mA00 v4_2_0 1.10.490.10 0-153:A
        103lA00 v4_2_0 1.10.530.40 1-162:A
        103mA00 v4_2_0 1.10.490.10 0-153:A
        104lA00 v4_2_0 1.10.530.40 1-162:A
        104lB00 v4_2_0 1.10.530.40 1-162:B
        104mA00 v4_2_0 1.10.490.10 1-153:A
        105mA00 v4_2_0 1.10.490.10 1-153:A
        106mA00 v4_2_0 1.10.490.10 0-153:A
        107lA00 v4_2_0 1.10.530.40 1-162:A
        107mA00 v4_2_0 1.10.490.10 0-153:A
        108lA00 v4_2_0 1.10.530.40 1-162:A
        108mA00 v4_2_0 1.10.490.10 0-153:A
        109lA00 v4_2_0 1.10.530.40 1-162:A
        109mA00 v4_2_0 1.10.490.10 0-153:A
        10gsA01 v4_2_0 3.40.30.10 2-78:A,187-208:A
        #
        Returns:

         dD[domainId] = (cathId, [(authAsymId, resBeg, resEnd), ...], version)
        """
        dD = {}
        logger.info("length of input domain assignment list %d", len(dmL))
        for dm in dmL:
            #
            try:
                ff = dm.split(" ")
                # Ranges are comma-separated "resBeg-resEnd:authAsymId" segments.
                rngL = ff[3].split(",")
                dmTupL = []
                for rng in rngL:
                    tL = rng.split(":")
                    rL = tL[0].split("-")
                    dmTupL.append((tL[1], rL[0], rL[1]))
                #
                dD[ff[0]] = (ff[2], dmTupL, ff[1])
            except Exception:
                # Skip malformed records but report them (e.g. negative residue
                # numbers break the naive "-" split -- TODO confirm).
                logger.info("Failing for case %r: %r", ff, dm)
        return dD

    def __buildAssignments(self, dD):
        """Re-key the domain assignments by (pdbId, authAsymId).

        Input internal data structure with domain assignments -

          dD[domainId] = (cathId, rangelist, version)

        Returns:

          aD[(pdbId, authAsymId)] = [(cathId, domainId, (authAsymId, resBeg, resEnd), version)]
        """
        pdbD = {}
        for domId, dTup in dD.items():
            # The first four characters of the domain id encode the PDB entry id.
            pdbId = domId[:4]
            for rTup in dTup[1]:
                pdbD.setdefault((pdbId, rTup[0]), []).append((dTup[0], domId, rTup, dTup[2]))
        return pdbD

    def __exportTreeNodeList(self, nD):
        """Create node list from name dictionary and lineage dictionaries."""
        # create parent dictionary and collect root ids (single-token CATH ids)
        #
        pL = []
        pD = {}
        for tId in nD:
            ff = tId.split(".")
            if len(ff) == 1:
                ptId = None
                pL.append(tId)
            else:
                ptId = ".".join(ff[:-1])
            logger.debug("tId %s parent %s", tId, ptId)
            pD[tId] = ptId
        #
        logger.info("nD %d pD %d", len(nD), len(pD))
        # create child dictionary
        cD = {}
        for ctId, ptId in pD.items():
            cD.setdefault(ptId, []).append(ctId)
        #
        logger.info("cD %d", len(cD))
        #
        # Breadth-first traversal from each root to produce a stable node ordering.
        idL = []
        for rootId in sorted(pL):
            visited = set([rootId])
            queue = collections.deque(visited)
            while queue:
                tId = queue.popleft()
                idL.append(tId)
                if tId not in cD:
                    # logger.debug("No children for CATH tId %s", tId)
                    continue
                for childId in cD[tId]:
                    if childId not in visited:
                        queue.append(childId)
                        visited.add(childId)
        #
        # Emit one node dictionary per id; roots have depth 0 and no parents key.
        dL = []
        for tId in idL:
            displayName = nD[tId]
            ptId = pD[tId]
            ff = tId.split(".")
            lL = [".".join(ff[0:jj]) for jj in range(1, len(ff) + 1)]
            #
            if len(lL) == 1:
                dD = {"id": tId, "name": displayName, "depth": 0}
            else:
                dD = {"id": tId, "name": displayName, "parents": [ptId], "depth": len(lL) - 1}
            dL.append(dD)
        return dL