def getJsonSchema(self, databaseName, collectionName, encodingType="BSON", level="full", extraOpts=None):
    """Return the JSON schema (w/ BSON types) object for the input collection and level.

    Args:
        databaseName (str): database name
        collectionName (str): collection name in document store
        encodingType (str, optional): data type convention (BSON|JSON)
        level (str, optional): completeness of the schema (e.g. min or full)

    Returns:
        dict: schema object
    """
    sObj = None
    schemaLocator = self.__getJsonSchemaLocator(databaseName, collectionName, encodingType=encodingType, level=level)
    #
    if self.__rebuildFlag:
        filePath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaLocator))
        self.makeSchema(databaseName, collectionName, encodingType=encodingType, level=level, extraOpts=extraOpts)
    else:
        filePath = self.__reload(schemaLocator, self.__jsonSchemaCachePath, useCache=self.__useCache)
    mU = MarshalUtil(workPath=self.__workPath)
    if filePath and mU.exists(filePath):
        sObj = mU.doImport(filePath, fmt="json")
    else:
        logger.debug("Failed to read schema for %s %r", collectionName, level)
    return sObj
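# Usage sketch (hedged): assumes this method belongs to a SchemaProvider-style class that is
# constructed elsewhere with a configuration object and cache path; names below are illustrative.
#
#   sp = SchemaProvider(cfgOb, cachePath="./CACHE", useCache=True)
#   sD = sp.getJsonSchema("pdbx_core", "pdbx_core_entry", encodingType="BSON", level="full")
#   if sD:
#       print(sorted(sD.get("properties", {})))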
def __rebuildCache(self, urlTargetIsoLtwa, dirPath, useCache):
    """Rebuild the cache of ISO abbreviation term data.

    Args:
        urlTargetIsoLtwa (str): URL for ISO4 LTWA title word abbreviations
        dirPath (str): cache path
        useCache (bool): flag to use cached files

    Returns:
        tuple: (dict) title word abbreviations
               (dict) language conflict dictionary
               (list) multi-word abbreviation targets

    Notes:
        ISO source file (tab delimited UTF-16LE) is maintained at the ISSN site -
        https://www.issn.org/wp-content/uploads/2013/09/LTWA_20160915.txt
    """
    aD = {}
    mU = MarshalUtil(workPath=dirPath)
    fmt = "json"
    ext = fmt if fmt == "json" else "pic"
    isoLtwaNamePath = os.path.join(dirPath, "iso-ltwa.%s" % ext)
    logger.debug("Using cache data path %s", dirPath)
    mU.mkdir(dirPath)
    if not useCache:
        for fp in [isoLtwaNamePath]:
            try:
                os.remove(fp)
            except Exception:
                pass
    #
    if useCache and mU.exists(isoLtwaNamePath):
        aD = mU.doImport(isoLtwaNamePath, fmt=fmt)
        logger.debug("Abbreviation name length %d", len(aD["abbrev"]))
    else:
        # ------
        fU = FileUtil()
        logger.info("Fetch data from source %s in %s", urlTargetIsoLtwa, dirPath)
        fp = os.path.join(dirPath, fU.getFileName(urlTargetIsoLtwa))
        ok = fU.get(urlTargetIsoLtwa, fp)
        aD = self.__getLtwaTerms(dirPath, fp)
        ok = mU.doExport(isoLtwaNamePath, aD, fmt=fmt)
        logger.debug("abbrevD keys %r", list(aD.keys()))
        logger.debug("Caching %d ISO LTWA in %s status %r", len(aD["abbrev"]), isoLtwaNamePath, ok)
    #
    abbrevD = aD["abbrev"] if "abbrev" in aD else {}
    conflictD = aD["conflicts"] if "conflicts" in aD else {}
    multiWordTermL = aD["multi_word_abbrev"] if "multi_word_abbrev" in aD else []
    #
    return abbrevD, conflictD, multiWordTermL
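# Invocation sketch (hedged; within the owning provider class, with an illustrative cache path):
#
#   abbrevD, conflictD, multiWordTermL = self.__rebuildCache(
#       "https://www.issn.org/wp-content/uploads/2013/09/LTWA_20160915.txt", "./CACHE/ltwa", useCache=True
#   )
#   # abbrevD maps title words to ISO4 abbreviations; conflictD carries language conflicts.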
def readIndex(self):
    try:
        mU = MarshalUtil()
        if not mU.exists(self._indexFilePath):
            return False
        indexObj = mU.doImport(self._indexFilePath, fmt=self.__fmt)
        if indexObj is not None and len(indexObj) > 0:
            self._rL.extend(indexObj)
        return True
    except Exception as e:
        logger.error("Failing with %s", str(e))
    return False
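# Invocation sketch (assumes the owning class initializes self._indexFilePath, self.__fmt, and
# the self._rL record list in its constructor, as the method body implies):
#
#   if self.readIndex():
#       logger.info("Recovered %d index records", len(self._rL))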
class ChemCompMoleculeProvider(object):
    """Utilities to read and serialize the dictionary of PDBx/mmCIF chemical component definitions."""

    def __init__(self, **kwargs):
        # Default source target locators
        self.__ccUrlTarget = kwargs.get("ccUrlTarget", None)
        self.__ccUrlTarget = self.__ccUrlTarget if self.__ccUrlTarget else "http://ftp.wwpdb.org/pub/pdb/data/monomers/components.cif.gz"
        self.__birdUrlTarget = kwargs.get("birdUrlTarget", None)
        self.__birdUrlTarget = self.__birdUrlTarget if self.__birdUrlTarget else "http://ftp.wwpdb.org/pub/pdb/data/bird/prd/prdcc-all.cif.gz"
        #
        ccFileNamePrefix = kwargs.get("ccFileNamePrefix", "cc")
        cachePath = kwargs.get("cachePath", ".")
        dirPath = os.path.join(cachePath, "chem_comp")
        useCache = kwargs.get("useCache", True)
        molLimit = kwargs.get("molLimit", 0)
        skipObsolete = kwargs.get("skipObsolete", True)
        # Optional id dictionary filter
        filterIdD = kwargs.get("filterIdD", None)
        #
        self.__mU = MarshalUtil(workPath=dirPath)
        self.__ccMolD = self.__reload(
            self.__ccUrlTarget, self.__birdUrlTarget, ccFileNamePrefix, dirPath, useCache=useCache, molLimit=molLimit, filterIdD=filterIdD, skipObsolete=skipObsolete
        )

    def testCache(self, minCount=None, logSizes=False):
        if logSizes and self.__ccMolD:
            logger.info("ccMolD object size %.2f MB", getObjSize(self.__ccMolD) / 1000000.0)
        ok = self.__ccMolD and len(self.__ccMolD) >= minCount if minCount else self.__ccMolD is not None
        return ok

    def getMolD(self):
        return self.__ccMolD

    def getMol(self, ccId):
        try:
            return self.__ccMolD[ccId]
        except Exception as e:
            logger.debug("Get molecule %r failing with %s", ccId, str(e))
        return None

    def getReleaseStatus(self, ccId):
        try:
            ccIt = iter(PdbxChemCompIt(self.__ccMolD[ccId]))
            ccIt = next(ccIt, None)
            return ccIt.getReleaseStatus() if ccIt else None
        except Exception as e:
            logger.exception("Failing for ccId %r with %s", ccId, str(e))
        return None

    def __reload(self, ccUrlTarget, birdUrlTarget, ccFileNamePrefix, dirPath, useCache=False, molLimit=None, filterIdD=None, skipObsolete=True):
        """Reload or create the serialized data dictionary of chemical components.

        Args:
            ccUrlTarget (str): target url for chemical component dictionary resource file
            birdUrlTarget (str): target url for bird dictionary resource file (cc format)
            ccFileNamePrefix (str): prefix for the serialized component data file
            dirPath (str): path to the directory containing cache files
            useCache (bool): use the serialized cache file when available
            molLimit (int): maximum number of definitions to process
            filterIdD (dict): dictionary of selected chemical component identifier codes
            skipObsolete (bool): skip obsolete definitions

        Returns:
            (dict): chemical component data containers
        """
        #
        startTime = time.time()
        # This is the naming standard for serialized PDBx/mmCIF component data
        ccDataFilePath = os.path.join(dirPath, "%s-chemical-component-data.pic" % ccFileNamePrefix)
        _, fExt = os.path.splitext(ccDataFilePath)
        ccDataFormat = "json" if fExt == ".json" else "pickle"
        #
        if useCache and self.__mU.exists(ccDataFilePath):
            rdCcObjD = self.__mU.doImport(ccDataFilePath, fmt=ccDataFormat)
            ccObjD = {k: rdCcObjD[k] for k in sorted(rdCcObjD.keys())[:molLimit]} if molLimit else rdCcObjD
            if skipObsolete:
                tD = {}
                for ccId in ccObjD:
                    ccIt = iter(PdbxChemCompIt(ccObjD[ccId]))
                    ccIt = next(ccIt, None)
                    if ccIt.getReleaseStatus() not in ["REL", "REF_ONLY"]:
                        continue
                    tD[ccId] = ccObjD[ccId]
                ccObjD = tD
        else:
            # Source component data files ...
            ccdFilePath = self.__fetchUrl(ccUrlTarget, dirPath, useCache=useCache)
            birdFilePath = self.__fetchUrl(birdUrlTarget, dirPath, useCache=useCache)
            rdCcObjD = self.__readComponentDefinitions(ccdFilePath, birdFilePath, molLimit=molLimit, skipObsolete=skipObsolete)
            ccObjD = {ccId: ccObj for ccId, ccObj in rdCcObjD.items() if ccId in filterIdD} if filterIdD else rdCcObjD
            ok = self.__mU.doExport(ccDataFilePath, ccObjD, fmt=ccDataFormat)
            logger.info("Storing %d definitions (status=%r) path: %s ", len(ccObjD), ok, ccDataFilePath)
        #
        endTime = time.time()
        logger.info("Loaded/reloaded %d definitions (%.4f seconds)", len(ccObjD), endTime - startTime)
        return ccObjD

    def __fetchUrl(self, urlTarget, dirPath, useCache=False):
        fU = FileUtil()
        fn = fU.getFileName(urlTarget)
        filePath = os.path.join(dirPath, fn)
        if not (useCache and fU.exists(filePath)):
            startTime = time.time()
            ok2 = fU.get(urlTarget, filePath)
            endTime = time.time()
            if ok2:
                logger.info("Fetched %s for resource file %s (status = %r) (%.4f seconds)", urlTarget, filePath, ok2, endTime - startTime)
            else:
                logger.error("Failing fetch for %s for resource file %s (status = %r) (%.4f seconds)", urlTarget, filePath, ok2, endTime - startTime)
        #
        return filePath

    def __readComponentDefinitions(self, ccdFilePath, birdFilePath=None, molLimit=None, skipObsolete=True):
        ccObjD = {}
        try:
            startTime = time.time()
            logger.info("Reading definitions from %s", ccdFilePath)
            rdCcObjL = self.__mU.doImport(ccdFilePath, fmt="mmcif")
            endTime = time.time()
            logger.info("Read %s with %d CCD definitions (%.4f seconds)", ccdFilePath, len(rdCcObjL), endTime - startTime)
            # -------
            if birdFilePath:
                startTime = time.time()
                logger.info("Reading definitions from %s", birdFilePath)
                birdCcObjL = self.__mU.doImport(birdFilePath, fmt="mmcif")
                endTime = time.time()
                logger.info("Read %s with %d BIRD definitions (%.4f seconds)", birdFilePath, len(birdCcObjL), endTime - startTime)
                rdCcObjL.extend(birdCcObjL)
            #
            startTime = time.time()
            ccObjL = rdCcObjL[:molLimit] if molLimit else rdCcObjL
            for ccObj in ccObjL:
                ccIt = iter(PdbxChemCompIt(ccObj))
                ccIt = next(ccIt, None)
                ccId = ccIt.getId() if ccIt else ccObj.getName()
                if skipObsolete and ccIt.getReleaseStatus() not in ["REL", "REF_ONLY"]:
                    continue
                ccObjD[ccId] = ccObj
            endTime = time.time()
            logger.info("Processed %d definitions (%.4f seconds)", len(ccObjD), endTime - startTime)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        return ccObjD
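# Usage sketch for ChemCompMoleculeProvider (cache path and component id are illustrative):
#
#   ccmP = ChemCompMoleculeProvider(cachePath="./CACHE", useCache=True, molLimit=50)
#   if ccmP.testCache(minCount=50):
#       dataContainer = ccmP.getMol("ATP")
#       status = ccmP.getReleaseStatus("ATP")   # e.g. "REL"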
class ReferenceSequenceAssignmentUpdater(object):
    """Selected utilities to update reference sequence assignment information in the core_entity collection."""

    def __init__(self, cfgOb, databaseName="pdbx_core", collectionName="pdbx_core_polymer_entity", polymerType="Protein", referenceDatabaseName="UniProt", provSource="PDB", **kwargs):
        self.__cfgOb = cfgOb
        self.__polymerType = polymerType
        self.__mU = MarshalUtil()
        #
        self.__databaseName = databaseName
        self.__collectionName = collectionName
        self.__statusList = []
        #
        self.__ssP = self.__fetchSiftsSummaryProvider(self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
        self.__assignRefD, self.__refD, self.__matchD = self.__reload(databaseName, collectionName, polymerType, referenceDatabaseName, provSource, **kwargs)

    def __reload(self, databaseName, collectionName, polymerType, referenceDatabaseName, provSource, **kwargs):
        assignRefD = self.__getPolymerReferenceSequenceAssignments(databaseName, collectionName, polymerType, **kwargs)
        # get refIdD = {refId: [entity_id, ....], }
        refIdD, _ = self.__getUniqueAssignments(assignRefD, referenceDatabaseName=referenceDatabaseName, provSource=provSource)
        #
        refD, matchD = self.__rebuildReferenceCache(referenceDatabaseName, list(refIdD.keys()), **kwargs)
        return assignRefD, refD, matchD

    def doUpdate(self, updateId, updateLimit=None):
        desp = DataExchangeStatus()
        statusStartTimestamp = desp.setStartTime()
        #
        numUpd = 0
        updateDL = self.__buildUpdate(self.__assignRefD)
        if updateDL:
            if updateLimit:
                numUpd = self.__doUpdate(self.__cfgOb, updateDL[:updateLimit], self.__databaseName, self.__collectionName)
            else:
                numUpd = self.__doUpdate(self.__cfgOb, updateDL, self.__databaseName, self.__collectionName)
        self.__updateStatus(updateId, self.__databaseName, self.__collectionName, True, statusStartTimestamp)
        return len(updateDL), numUpd

    def __doUpdate(self, cfgOb, updateDL, databaseName, collectionName):
        obUpd = ObjectUpdater(cfgOb)
        numUpd = obUpd.update(databaseName, collectionName, updateDL)
        logger.info("Update count is %d", numUpd)
        return numUpd

    def __getPolymerReferenceSequenceAssignments(self, databaseName, collectionName, polymerType, **kwargs):
        """Get all accessions assigned to the input reference sequence database for the input polymerType.

        Returns:
            (dict): {"1abc_1": {"rcsb_entity_container_identifiers": {"reference_sequence_identifiers": []},
                                "rcsb_polymer_entity_align": [],
                                "rcsb_entity_source_organism": {"ncbi_taxonomy_id": []}}}
        """
        cachePath = kwargs.get("cachePath", ".")
        exDbDir = "exdb"
        cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "json", "indent": 3})
        useCache = kwargs.get("useCache", True)
        fetchLimit = kwargs.get("fetchLimit", None)
        cacheFilePath = os.path.join(cachePath, exDbDir, "entity-poly-ref-seq-assign-cache.json")
        #
        objD = {}
        try:
            obEx = ObjectExtractor(
                self.__cfgOb,
                databaseName=databaseName,
                collectionName=collectionName,
                cacheFilePath=cacheFilePath,
                useCache=useCache,
                keyAttribute="entity",
                uniqueAttributes=["rcsb_id"],
                cacheKwargs=cacheKwargs,
                objectLimit=fetchLimit,
                selectionQuery={"entity_poly.rcsb_entity_polymer_type": polymerType},
                selectionList=[
                    "rcsb_id",
                    "rcsb_entity_container_identifiers.reference_sequence_identifiers",
                    "rcsb_entity_container_identifiers.auth_asym_ids",
                    "rcsb_polymer_entity_align",
                    "rcsb_entity_source_organism.ncbi_taxonomy_id",
                ],
            )
            eCount = obEx.getCount()
            logger.info("Entity count is %d", eCount)
            objD = obEx.getObjects()
            logger.info("Reading polymer entity entity count %d ref accession length %d ", eCount, len(objD))
            #
        except Exception as e:
            logger.exception("Failing for %s (%s) with %s", databaseName, collectionName, str(e))
        return objD

    def __getUniqueAssignments(self, objD, referenceDatabaseName="UniProt", provSource="PDB"):
        refIdD = defaultdict(list)
        taxIdD = defaultdict(list)
        numMissing = 0
        for entityKey, eD in objD.items():
            try:
                accS = set()
                for ii, tD in enumerate(eD["rcsb_entity_container_identifiers"]["reference_sequence_identifiers"]):
                    if tD["database_name"] == referenceDatabaseName and tD["provenance_source"] == provSource:
                        accS.add(tD["database_accession"])
                        refIdD[tD["database_accession"]].append(entityKey)
                        #
                        # pick up the corresponding taxonomy -
                        try:
                            taxIdD[tD["database_accession"]].append(eD["rcsb_entity_source_organism"][ii]["ncbi_taxonomy_id"])
                        except Exception:
                            logger.warning("Failing taxonomy lookup for %s %r", entityKey, tD["database_accession"])
                logger.debug("PDB assigned sequences length %d", len(accS))
            except Exception as e:
                numMissing += 1
                logger.debug("No sequence assignments for %s with %s", entityKey, str(e))
        #
        for refId, taxIdL in taxIdD.items():
            taxIdL = list(set(taxIdL))
            if len(taxIdL) > 1:
                logger.info("Multiple taxIds assigned to reference sequence id %s: %r", refId, taxIdL)
        logger.info("Unique %s accession assignments by %s %d (missing %d) ", referenceDatabaseName, provSource, len(refIdD), numMissing)
        return refIdD, taxIdD

    def __reMapAccessions(self, rsiDL, referenceDatabaseName="UniProt", provSourceL=None, excludeReferenceDatabases=None):
        """Internal method to re-map accessions for the input database and assignment source.

        Args:
            rsiDL (list): list of reference sequence identifier dictionaries
            referenceDatabaseName (str, optional): resource database name. Defaults to 'UniProt'.
            provSourceL (list, optional): assignment provenance. Defaults to [].
            excludeReferenceDatabases (list, optional): reference databases to exclude. Defaults to ['PDB'].

        Returns:
            bool, bool, list: flag for complete mapping, flag for match success, and remapped (and unmapped)
                              accessions in the input object list
        """
        isMatched = False
        unMapped = 0
        matched = 0
        excludeReferenceDatabases = excludeReferenceDatabases if excludeReferenceDatabases else ["PDB"]
        provSourceL = provSourceL if provSourceL else []
        retDL = []
        for rsiD in rsiDL:
            if rsiD["database_name"] in excludeReferenceDatabases:
                unMapped += 1
                continue
            if rsiD["database_name"] == referenceDatabaseName and rsiD["provenance_source"] in provSourceL:
                try:
                    if len(self.__matchD[rsiD["database_accession"]]["matchedIds"]) == 1:
                        rsiD["database_accession"] = self.__matchD[rsiD["database_accession"]]["matchedIds"][0]
                        matched += 1
                    else:
                        logger.info("Skipping mapping to multiple superseding accessions %s", rsiD["database_accession"])
                    #
                except Exception:
                    unMapped += 1
            retDL.append(rsiD)
        if matched == len(retDL):
            isMatched = True
        return not unMapped, isMatched, retDL

    def __reMapAlignments(self, alignDL, referenceDatabaseName="UniProt", provSourceL=None, excludeReferenceDatabases=None):
        """Internal method to re-map alignments for the input database and assignment source.

        Args:
            alignDL (list): list of aligned regions
            referenceDatabaseName (str, optional): resource database name. Defaults to 'UniProt'.
            provSourceL (list, optional): assignment provenance. Defaults to [].
            excludeReferenceDatabases (list, optional): reference databases to exclude. Defaults to ['PDB'].

        Returns:
            bool, bool, list: flag for complete mapping, flag for match success, and remapped (and unmapped)
                              accessions in the input align list
        """
        isMatched = False
        unMapped = 0
        matched = 0
        excludeReferenceDatabases = excludeReferenceDatabases if excludeReferenceDatabases else ["PDB"]
        retDL = []
        provSourceL = provSourceL if provSourceL else []
        for alignD in alignDL:
            if alignD["reference_database_name"] in excludeReferenceDatabases:
                unMapped += 1
                continue
            if alignD["reference_database_name"] == referenceDatabaseName and alignD["provenance_code"] in provSourceL:
                try:
                    if len(self.__matchD[alignD["reference_database_accession"]]["matchedIds"]) == 1:
                        alignD["reference_database_accession"] = self.__matchD[alignD["reference_database_accession"]]["matchedIds"][0]
                        matched += 1
                    else:
                        logger.info("Skipping alignment mapping to multiple superseding accessions %s", alignD["reference_database_accession"])
                except Exception:
                    unMapped += 1
            retDL.append(alignD)
        if matched == len(retDL):
            isMatched = True
        #
        return not unMapped, isMatched, retDL

    def __getSiftsAccessions(self, entityKey, authAsymIdL):
        retL = []
        saoLD = self.__ssP.getLongestAlignments(entityKey[:4], authAsymIdL)
        for (_, dbAccession), _ in saoLD.items():
            retL.append({"database_name": "UniProt", "database_accession": dbAccession, "provenance_source": "SIFTS"})
        return retL

    def __getSiftsAlignments(self, entityKey, authAsymIdL):
        retL = []
        saoLD = self.__ssP.getLongestAlignments(entityKey[:4], authAsymIdL)
        for (_, dbAccession), saoL in saoLD.items():
            dD = {"reference_database_name": "UniProt", "reference_database_accession": dbAccession, "provenance_code": "SIFTS", "aligned_regions": []}
            for sao in saoL:
                dD["aligned_regions"].append({"ref_beg_seq_id": sao.getDbSeqIdBeg(), "entity_beg_seq_id": sao.getEntitySeqIdBeg(), "length": sao.getEntityAlignLength()})
            retL.append(dD)
        return retL

    def __buildUpdate(self, assignRefD):
        #
        updateDL = []
        for entityKey, eD in assignRefD.items():
            selectD = {"rcsb_id": entityKey}
            try:
                updateD = {}
                authAsymIdL = []
                ersDL = (
                    eD["rcsb_entity_container_identifiers"]["reference_sequence_identifiers"]
                    if "reference_sequence_identifiers" in eD["rcsb_entity_container_identifiers"]
                    else None
                )
                #
                if ersDL:
                    authAsymIdL = eD["rcsb_entity_container_identifiers"]["auth_asym_ids"]
                    isMapped, isMatched, updErsDL = self.__reMapAccessions(ersDL, referenceDatabaseName="UniProt", provSourceL=["PDB"])
                    #
                    if not isMapped or not isMatched:
                        tL = self.__getSiftsAccessions(entityKey, authAsymIdL)
                        if tL:
                            logger.debug("Using SIFTS accession mapping for %s", entityKey)
                        else:
                            logger.info("No alternative SIFTS accession mapping for %s", entityKey)
                        updErsDL = tL if tL else []
                    #
                    if len(updErsDL) < len(ersDL):
                        logger.info("Incomplete reference sequence mapping update for %s", entityKey)
                    updateD["rcsb_entity_container_identifiers.reference_sequence_identifiers"] = updErsDL
                #
                alignDL = eD["rcsb_polymer_entity_align"] if "rcsb_polymer_entity_align" in eD else None
                if alignDL and authAsymIdL:
                    isMapped, isMatched, updAlignDL = self.__reMapAlignments(alignDL, referenceDatabaseName="UniProt", provSourceL=["PDB"])
                    #
                    if not isMapped or not isMatched:
                        tL = self.__getSiftsAlignments(entityKey, authAsymIdL)
                        if tL:
                            logger.debug("Using SIFTS alignment mapping for %s", entityKey)
                        else:
                            logger.info("No alternative SIFTS alignment mapping for %s", entityKey)
                        updAlignDL = tL if tL else updAlignDL
                    #
                    if len(updAlignDL) < len(alignDL):
                        logger.info("Incomplete alignment mapping update for %s", entityKey)
                    updateD["rcsb_polymer_entity_align"] = updAlignDL
                #
                if updateD:
                    updateDL.append({"selectD": selectD, "updateD": updateD})
            except Exception as e:
                logger.exception("Mapping error for %s with %s", entityKey, str(e))
        #
        return updateDL

    def __rebuildReferenceCache(self, refDbName, idList, **kwargs):
        """Rebuild the cached reference sequence data for the input reference database and accession list."""
        dD = {}
        cachePath = kwargs.get("cachePath", ".")
        dirPath = os.path.join(cachePath, "exdb")
        cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "json", "indent": 3})
        useCache = kwargs.get("useCache", True)
        fetchLimit = kwargs.get("fetchLimit", None)
        saveText = kwargs.get("saveText", False)
        #
        ext = "pic" if cacheKwargs["fmt"] == "pickle" else "json"
        fn = "ref-sequence-data-cache" + "." + ext
        cacheFilePath = os.path.join(dirPath, fn)
        #
        self.__mU.mkdir(dirPath)
        if not useCache:
            for fp in [cacheFilePath]:
                try:
                    os.remove(fp)
                except Exception:
                    pass
        #
        if useCache and cacheFilePath and self.__mU.exists(cacheFilePath):
            dD = self.__mU.doImport(cacheFilePath, **cacheKwargs)
            # Check for completeness -
            missingS = set(idList) - set(dD["refDbCache"].keys())
            if missingS:
                logger.info("Reference sequence cache missing %d accessions", len(missingS))
                extraD = self.__fetchReferenceEntries(refDbName, list(missingS), saveText=saveText, fetchLimit=fetchLimit)
                dD["refDbCache"].update(extraD["refDbCache"])
                dD["matchInfo"].update(extraD["matchInfo"])
                if cacheFilePath and cacheKwargs:
                    self.__mU.mkdir(dirPath)
                    ok = self.__mU.doExport(cacheFilePath, dD, **cacheKwargs)
                    logger.info("Cache updated with status %r", ok)
            #
        else:
            dD = self.__fetchReferenceEntries(refDbName, idList, saveText=saveText, fetchLimit=fetchLimit)
            if cacheFilePath and cacheKwargs:
                self.__mU.mkdir(dirPath)
                ok = self.__mU.doExport(cacheFilePath, dD, **cacheKwargs)
                logger.info("Cache save status %r", ok)
        return dD["refDbCache"], dD["matchInfo"]

    def __fetchReferenceEntries(self, refDbName, idList, saveText=False, fetchLimit=None):
        """Fetch database entries from the input reference sequence database name."""
        dD = {"refDbName": refDbName, "refDbCache": {}, "matchInfo": {}}
        try:
            idList = idList[:fetchLimit] if fetchLimit else idList
            logger.info("Starting fetch for %d %s entries", len(idList), refDbName)
            if refDbName == "UniProt":
                fobj = UniProtUtils(saveText=saveText)
                refD, matchD = fobj.fetchList(idList)
                dD = {"refDbName": refDbName, "refDbCache": refD, "matchInfo": matchD}
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return dD

    def __fetchSiftsSummaryProvider(self, cfgOb, configName, **kwargs):
        abbreviated = kwargs.get("siftsAbbreviated", "PROD")
        cachePath = kwargs.get("cachePath", ".")
        cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "pickle"})
        useCache = kwargs.get("useCache", True)
        #
        siftsSummaryDataPath = cfgOb.getPath("SIFTS_SUMMARY_DATA_PATH", sectionName=configName)
        #
        logger.info("Using SIFTS_SUMMARY_DATA_PATH, %r", siftsSummaryDataPath)
        if siftsSummaryDataPath.lower().startswith("http"):
            srcDirPath = siftsSummaryDataPath
        else:
            srcDirPath = os.path.join(cachePath, siftsSummaryDataPath)
        cacheDirPath = os.path.join(cachePath, cfgOb.get("SIFTS_SUMMARY_CACHE_DIR", sectionName=configName))
        logger.debug("ssP %r %r", srcDirPath, cacheDirPath)
        ssP = SiftsSummaryProvider(srcDirPath=srcDirPath, cacheDirPath=cacheDirPath, useCache=useCache, abbreviated=abbreviated, cacheKwargs=cacheKwargs)
        logger.info("ssP entry count %d", ssP.getEntryCount())
        return ssP

    def __dumpEntries(self, refD):
        for (eId, eDict) in refD.items():
            logger.info("------ Reference id %s", eId)
            for k, v in eDict.items():
                logger.info("%-15s = %r", k, v)

    def __getUpdateAssignmentCandidates(self, objD):
        totCount = 0
        difCount = 0
        pdbUnpIdD = defaultdict(list)
        siftsUnpIdD = defaultdict(list)
        assignIdDifD = defaultdict(list)
        #
        for entityKey, eD in objD.items():
            try:
                siftsS = set()
                pdbS = set()
                for tD in eD["rcsb_entity_container_identifiers"]["reference_sequence_identifiers"]:
                    if tD["database_name"] == "UniProt":
                        if tD["provenance_source"] == "SIFTS":
                            siftsS.add(tD["database_accession"])
                            siftsUnpIdD[tD["database_accession"]].append(entityKey)
                        elif tD["provenance_source"] == "PDB":
                            pdbS.add(tD["database_accession"])
                            pdbUnpIdD[tD["database_accession"]].append(entityKey)
                    else:
                        logger.debug("No UniProt for %r", eD["rcsb_entity_container_identifiers"])
                logger.debug("PDB assigned sequence length %d", len(pdbS))
                logger.debug("SIFTS assigned sequence length %d", len(siftsS))
                if pdbS and siftsS:
                    totCount += 1
                    if pdbS != siftsS:
                        difCount += 1
                        for idV in pdbS:
                            assignIdDifD[idV].append(entityKey)
            except Exception as e:
                logger.warning("No identifiers for %s with %s", entityKey, str(e))
        #
        logger.info("Total %d differences %d", totCount, difCount)
        logger.info("Unique UniProt accession assignments PDB %d SIFTS %d", len(pdbUnpIdD), len(siftsUnpIdD))
        logger.info("Current unique overlapping assignment differences %d ", len(assignIdDifD))
        logger.info("Current unique overlapping assignment differences %r ", assignIdDifD)
        return assignIdDifD, pdbUnpIdD, siftsUnpIdD

    def getReferenceAccessionAlignSummary(self):
        """Summarize the alignment of PDB accession assignments with the current reference sequence database."""
        numPrimary = 0
        numSecondary = 0
        numNone = 0
        for _, mD in self.__matchD.items():
            if mD["matched"] == "primary":
                numPrimary += 1
            elif mD["matched"] == "secondary":
                numSecondary += 1
            else:
                numNone += 1
        logger.debug("Matched primary: %d secondary: %d none %d", numPrimary, numSecondary, numNone)
        return numPrimary, numSecondary, numNone

    def getLoadStatus(self):
        return self.__statusList

    def __updateStatus(self, updateId, databaseName, collectionName, status, startTimestamp):
        try:
            sFlag = "Y" if status else "N"
            desp = DataExchangeStatus()
            desp.setStartTime(tS=startTimestamp)
            desp.setObject(databaseName, collectionName)
            desp.setStatus(updateId=updateId, successFlag=sFlag)
            desp.setEndTime()
            self.__statusList.append(desp.getStatus())
            return True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False
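# Usage sketch (assumes a ConfigUtil-style configuration object cfgOb built elsewhere; the
# update identifier and limit are illustrative):
#
#   rsu = ReferenceSequenceAssignmentUpdater(cfgOb, databaseName="pdbx_core", collectionName="pdbx_core_polymer_entity")
#   numCandidates, numUpdated = rsu.doUpdate("2021_01", updateLimit=500)
#   numPrimary, numSecondary, numNone = rsu.getReferenceAccessionAlignSummary()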
def __rebuildCache(self, targetUrl, mapNameL, outDirPath, rawDirPath, fmt="pickle", useCache=True):
    """Fetch the UniProt selected id mapping resource file and extract the UniProt Acc to
    'mapIndex' mapping. Serialize the mapping as required.

    Args:
        targetUrl (str): source URL of the remote index file
        mapNameL (list): list of key mapping names to extract from the index
        outDirPath (str): directory path for processed mapping files
        rawDirPath (str): directory path for the raw downloaded index file
        fmt (str, optional): output format (pickle|json|tdd). Defaults to "pickle".
        useCache (bool, optional): use cached files. Defaults to True.

    Returns:
        (list, dict): mapping key names, od[uniprotId] = mapped value(s)

        idmapping_selected.tab record layout:

         1. UniProtKB-AC
         2. UniProtKB-ID
         3. GeneID (EntrezGene)
         4. RefSeq
         5. GI
         6. PDB
         7. GO
         8. UniRef100
         9. UniRef90
        10. UniRef50
        11. UniParc
        12. PIR
        13. NCBI-taxon
        14. MIM
        15. UniGene
        16. PubMed
        17. EMBL
        18. EMBL-CDS
        19. Ensembl
        20. Ensembl_TRS
        21. Ensembl_PRO
        22. Additional PubMed
    """
    startTime = time.time()
    nL = mapNameL
    oD = {}
    try:
        fileU = FileUtil()
        fExt = "pic" if fmt == "pickle" else "json"
        fExt = "tdd" if fmt == "tdd" else fExt
        fN, _ = os.path.splitext(fileU.getFileName(targetUrl))
        mapFileName = fN + "-map." + fExt
        idMapPath = os.path.join(outDirPath, mapFileName)
        mU = MarshalUtil()
        if useCache and mU.exists(idMapPath):
            logger.info("Reading cached serialized file %r", idMapPath)
            if fmt in ["pickle", "json"]:
                tD = mU.doImport(idMapPath, fmt=fmt)
                nL = list(set(tD["idNameList"]))
                oD = tD["uniprotMapD"]
                logger.info("keys %r", list(oD.keys())[:10])
                logger.info("nL %r", nL)
                ok = True
            elif fmt == "tdd":
                ioU = IoUtil()
                it = ioU.deserializeCsvIter(idMapPath, delimiter="\t", rowFormat="list", encodingErrors="ignore")
                tL = next(it, [])
                nL = tL[1:]
                if len(nL) == 1:
                    for row in it:
                        oD[row[0]] = row[1]
                else:
                    for row in it:
                        oD[row[0]] = row[1:]
                ok = True
        else:
            idPath = os.path.join(rawDirPath, fileU.getFileName(targetUrl))
            if not fileU.exists(idPath):
                logger.info("Fetching selected UniProt idmapping data from %r in %r", targetUrl, outDirPath)
                ok = fileU.get(targetUrl, idPath)
                if not ok:
                    logger.error("Failed to download %r", targetUrl)
                    return oD
            else:
                logger.info("Using cached mapping file %r", idPath)
            # ---
            ioU = IoUtil()
            if fmt in ["pickle", "json"]:
                if len(mapNameL) == 1:
                    for row in ioU.deserializeCsvIter(idPath, delimiter="\t", rowFormat="list", encodingErrors="ignore"):
                        oD[row[0]] = str(row[self.__mapRecordD[mapNameL[0]] - 1])
                else:
                    for row in ioU.deserializeCsvIter(idPath, delimiter="\t", rowFormat="list", encodingErrors="ignore"):
                        for mapName in mapNameL:
                            oD.setdefault(row[0], []).append(str(row[self.__mapRecordD[mapName] - 1]))
                logger.info("Writing serialized mapping file %r", idMapPath)
                ok = mU.doExport(idMapPath, {"idNameList": mapNameL, "uniprotMapD": oD}, fmt=fmt)
            elif fmt == "tdd":
                #
                logger.info("Writing serialized mapping file %r", idMapPath)
                fU = FileUtil()
                fU.mkdirForFile(idMapPath)
                colNameL = []
                colNameL.append("UniProtId")
                colNameL.extend(mapNameL)
                with open(idMapPath, "w", encoding="utf-8") as ofh:
                    ofh.write("%s\n" % "\t".join(colNameL))
                    if len(mapNameL) == 1:
                        idx = self.__mapRecordD[mapNameL[0]] - 1
                        for row in ioU.deserializeCsvIter(idPath, delimiter="\t", rowFormat="list", encodingErrors="ignore"):
                            ofh.write("%s\t%s\n" % (row[0], row[idx]))
                    else:
                        idxL = [0]
                        idxL.extend([self.__mapRecordD[mapName] - 1 for mapName in mapNameL])
                        for row in ioU.deserializeCsvIter(idPath, delimiter="\t", rowFormat="list", encodingErrors="ignore"):
                            ofh.write("%s\n" % "\t".join([str(row[idx]) for idx in idxL]))
                #
                nL, oD = self.__rebuildCache(targetUrl, mapNameL, outDirPath, rawDirPath, fmt=fmt, useCache=True)
        ok = True if nL and oD else False
        logger.info("Completed reload (%r) at %s (%.4f seconds)", ok, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime)
    except Exception as e:
        logger.exception("Failing with %s", str(e))
    #
    return nL, oD
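# Invocation sketch (within the owning class; assumes self.__mapRecordD maps the column names in
# the docstring above to their 1-based record positions, e.g. {"NCBI-taxon": 13}; paths illustrative):
#
#   nL, oD = self.__rebuildCache(
#       "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping_selected.tab.gz",
#       ["NCBI-taxon"], "./CACHE/uniprot-id-mapping", "./CACHE/uniprot-id-mapping-raw", fmt="tdd", useCache=True,
#   )
#   # oD["P12345"] would then hold the mapped value (here, the NCBI taxonomy id) for that accession.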
class InterProProvider(object):
    """Manage mappings of InterPro identifiers to descriptions and parent/child relationships."""

    def __init__(self, **kwargs):
        urlTargetInterPro = kwargs.get("urlTargetInterPro", "ftp://ftp.ebi.ac.uk/pub/databases/interpro/current/entry.list")
        urlTargetInterProFB = "https://github.com/rcsb/py-rcsb_exdb_assets/raw/master/fall_back/InterPro/entry.list"
        urlTargetInterProParent = kwargs.get("urlTargetInterProParent", "ftp://ftp.ebi.ac.uk/pub/databases/interpro/current/ParentChildTreeFile.txt")
        urlTargetInterProParentFB = "https://github.com/rcsb/py-rcsb_exdb_assets/raw/master/fall_back/InterPro/ParentChildTreeFile.txt"
        cachePath = kwargs.get("cachePath", ".")
        dirPath = os.path.join(cachePath, "interPro")
        useCache = kwargs.get("useCache", True)
        #
        self.__mU = MarshalUtil(workPath=dirPath)
        self.__interProD, self.__interProParentD = self.__rebuildCache(urlTargetInterPro, urlTargetInterProFB, urlTargetInterProParent, urlTargetInterProParentFB, dirPath, useCache)

    def getDescription(self, interProId):
        ret = None
        try:
            ret = self.__interProD[interProId]["description"]
        except Exception:
            pass
        return ret

    def getType(self, interProId):
        ret = None
        try:
            ret = self.__interProD[interProId]["type"]
        except Exception:
            pass
        return ret

    def testCache(self):
        # Check length ...
        logger.info("Length InterPro %d", len(self.__interProD))
        return len(self.__interProD) > 1000

    #
    def __rebuildCache(self, urlTargetInterPro, urlTargetInterProFB, urlTargetInterProParent, urlTargetInterProParentFB, dirPath, useCache):
        fmt = "json"
        ext = fmt if fmt == "json" else "pic"
        interProDataPath = os.path.join(dirPath, "interPro-data.%s" % ext)
        #
        logger.debug("Using cache data path %s", dirPath)
        self.__mU.mkdir(dirPath)
        #
        if useCache and self.__mU.exists(interProDataPath):
            rD = self.__mU.doImport(interProDataPath, fmt=fmt)
            interProD = rD["index"]
            interProParentD = rD["parents"]
            logger.debug("InterPro index length %d parent length %d", len(interProD), len(interProParentD))
        else:
            # ------
            fU = FileUtil()
            logger.info("Fetch data from source %s in %s", urlTargetInterPro, dirPath)
            fp = os.path.join(dirPath, fU.getFileName(urlTargetInterPro))
            ok = fU.get(urlTargetInterPro, fp)
            if not ok:
                fp = os.path.join(dirPath, fU.getFileName(urlTargetInterProFB))
                ok = fU.get(urlTargetInterProFB, fp)
                logger.info("Fetch data fallback fetch status is %r", ok)
            interProD = self.__getInterProIndex(fp)
            logger.info("Caching %d in %s status %r", len(interProD), interProDataPath, ok)
            # ------
            logger.info("Fetch data from source %s in %s", urlTargetInterProParent, dirPath)
            fp = os.path.join(dirPath, fU.getFileName(urlTargetInterProParent))
            ok = fU.get(urlTargetInterProParent, fp)
            if not ok:
                fp = os.path.join(dirPath, fU.getFileName(urlTargetInterProParentFB))
                ok = fU.get(urlTargetInterProParentFB, fp)
                logger.info("Fetch data fallback fetch status is %r", ok)
            interProParentD = self.__getInterProParents(fp)
            #
            ok = self.__mU.doExport(interProDataPath, {"index": interProD, "parents": interProParentD}, fmt=fmt)
        #
        return interProD, interProParentD

    def getLineage(self, idCode):
        pList = []
        try:
            pList.append(idCode)
            pt = self.getParentId(idCode)
            while (pt is not None) and (pt != 1):
                pList.append(pt)
                pt = self.getParentId(pt)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        pList.reverse()
        return pList

    def getLineageWithNames(self, idCode):
        linL = []
        try:
            idCodeL = self.getLineage(idCode)
            for ii, idCode in enumerate(idCodeL, 1):
                linL.append((idCode, self.getDescription(idCode), ii))
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return linL

    def getParentId(self, idCode):
        try:
            return self.__interProParentD[idCode]
        except Exception:
            pass
        return None

    def getTreeNodeList(self, filterD=None):
        dL = []
        try:
            for idCode, _ in self.__interProD.items():
                if filterD and idCode not in filterD:
                    continue
                displayName = self.getDescription(idCode)
                pId = self.getParentId(idCode)
                linL = self.getLineage(idCode)
                #
                if pId is None:
                    dD = {"id": idCode, "name": displayName, "depth": 0}
                else:
                    dD = {"id": idCode, "name": displayName, "parents": [pId], "depth": len(linL) - 1}
                dL.append(dD)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return dL

    def __getInterProParents(self, filePath):
        """Read the InterPro parent hierarchy and return a dictionary of parent ids.

        Args:
            filePath (str): path to the InterPro parent/child hierarchy file

        Returns:
            dict: {idCode: parentIdCode or None}
        """
        interProParentD = {}
        lineL = self.__mU.doImport(filePath, fmt="list")
        stack = []
        for line in lineL:
            content = line.rstrip()  # drop \n
            row = content.split("--")
            ff = row[-1].split("::")
            tS = ff[0].strip()
            # stack[:] = stack[: len(row) - 1] + [row[-1]]
            stack[:] = stack[: len(row) - 1] + [tS]
            for ii, idCode in enumerate(stack):
                if idCode not in interProParentD:
                    # Prevents overwriting the parent of idCode, in case idCode has already been iterated over in ParentChildTreeFile.txt
                    interProParentD[idCode] = None if ii == 0 else stack[ii - 1]
                else:
                    # Corrects the parent of idCode from being None if it is later identified as having a parent at another point in ParentChildTreeFile.txt
                    if interProParentD[idCode] is None and ii != 0:
                        interProParentD[idCode] = stack[ii - 1]
            logger.debug("Lineage %r", "\t".join(stack))
        #
        return interProParentD

    def __getInterProIndex(self, filePath):
        """Read the tab-delimited file of InterPro accessions and descriptions.

        Args:
            filePath (str): path to the InterPro accession/description file

        Returns:
            dict: {idCode: {"description": ..., "type": ...}}
        """
        interProD = {}
        encodingD = {"encoding": "ascii"} if sys.version_info[0] < 3 else {}
        rowL = self.__mU.doImport(filePath, fmt="tdd", rowFormat="list", **encodingD)
        for row in rowL:
            try:
                interProId = row[0].strip().upper()
                interProType = row[1].strip()
                descr = row[2].strip()
                interProD[interProId] = {"description": descr, "type": interProType}
            except Exception:
                pass
        #
        return interProD
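# Usage sketch for InterProProvider (IPR000001 is a real InterPro accession, used illustratively):
#
#   ipP = InterProProvider(cachePath="./CACHE", useCache=True)
#   if ipP.testCache():
#       print(ipP.getDescription("IPR000001"))        # e.g. "Kringle"
#       print(ipP.getLineageWithNames("IPR000001"))   # [(idCode, description, depth), ...]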
class RemovedHoldingsProvider(object):
    """Provide an inventory of removed repository content."""

    def __init__(self, **kwargs):
        self.__dirPath = kwargs.get("holdingsDirPath", ".")
        useCache = kwargs.get("useCache", True)
        baseUrl = kwargs.get("baseUrl", "https://raw.githubusercontent.com/rcsb/py-rcsb_exdb_assets/development/fall_back/holdings/")
        urlTarget = kwargs.get("removedTargetUrl", os.path.join(baseUrl, "removed_holdings.json.gz"))
        urlFallbackTarget = kwargs.get("removedFallbackTargetUrl", os.path.join(baseUrl, "removed_holdings.json.gz"))
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__invD = self.__reload(urlTarget, urlFallbackTarget, self.__dirPath, useCache=useCache)

    def testCache(self, minCount=1000):
        logger.info("Inventory length cD (%d)", len(self.__invD))
        if len(self.__invD) > minCount:
            return True
        return False

    def getStatusCode(self, entryId):
        """Return the status code for the removed entry"""
        try:
            return self.__invD[entryId.upper()]["status_code"]
        except Exception as e:
            logger.debug("Failing for %r with %s", entryId, str(e))
        return None

    def getRemovedInfo(self, entryId):
        """Return the dictionary describing the details for this removed entry"""
        try:
            return self.__invD[entryId.upper()]
        except Exception as e:
            logger.debug("Failing for %r with %s", entryId, str(e))
        return {}

    def getContentTypes(self, entryId):
        """Return the removed content types for the input entry identifier"""
        try:
            return sorted(self.__invD[entryId.upper()]["content_type"].keys())
        except Exception as e:
            logger.debug("Failing for %r with %s", entryId, str(e))
        return []

    def getContentTypePathList(self, entryId, contentType):
        """Return the path list for the removed content type of the input entry identifier"""
        try:
            return (
                self.__invD[entryId.upper()]["content_type"][contentType]
                if isinstance(self.__invD[entryId.upper()]["content_type"][contentType], list)
                else [self.__invD[entryId.upper()]["content_type"][contentType]]
            )
        except Exception as e:
            logger.debug("Failing for %r %r with %s", entryId, contentType, str(e))
        return []

    def getInventory(self):
        """Return the removed inventory dictionary"""
        try:
            return self.__invD
        except Exception as e:
            logger.debug("Failing with %s", str(e))
        return {}

    def __reload(self, urlTarget, urlFallbackTarget, dirPath, useCache=True):
        invD = {}
        fU = FileUtil()
        fn = fU.getFileName(urlTarget)
        fp = os.path.join(dirPath, fn)
        self.__mU.mkdir(dirPath)
        #
        if useCache and self.__mU.exists(fp):
            invD = self.__mU.doImport(fp, fmt="json")
            logger.debug("Reading cached inventory (%d)", len(invD))
        else:
            logger.info("Fetch inventory from %s", urlTarget)
            ok = fU.get(urlTarget, fp)
            if not ok:
                ok = fU.get(urlFallbackTarget, fp)
            #
            if ok:
                invD = self.__mU.doImport(fp, fmt="json")
        #
        return invD
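# Usage sketch for RemovedHoldingsProvider (the entry id "1ABC" is illustrative only):
#
#   rhP = RemovedHoldingsProvider(holdingsDirPath="./CACHE/holdings", useCache=True)
#   if rhP.testCache():
#       sc = rhP.getStatusCode("1ABC")     # status code if the entry was removed, else None
#       ctL = rhP.getContentTypes("1ABC")  # removed content types for the entry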
class ReferenceSequenceUtils(object):
    """Selected utilities to integrate reference sequence information with PDB polymer entity data."""

    def __init__(self, cfgOb, refDbName, **kwargs):
        self.__cfgOb = cfgOb
        self.__refDbName = refDbName
        self.__mU = MarshalUtil()
        #
        self.__refIdList = self.__getReferenceAssignments(refDbName, **kwargs)
        self.__refD, self.__matchD = self.__rebuildCache(refDbName, self.__refIdList, **kwargs)

    def __getReferenceAssignments(self, refDbName, **kwargs):
        """Get all accessions assigned to the input reference sequence database."""
        rL = []
        exdbDirPath = kwargs.get("exdbDirPath", None)
        cacheKwargs = kwargs.get("cacheKwargs", None)
        useCache = kwargs.get("useCache", True)
        entryLimit = kwargs.get("entryLimit", None)
        try:
            epe = EntityPolymerExtractor(self.__cfgOb, exdbDirPath=exdbDirPath, useCache=useCache, cacheKwargs=cacheKwargs, entryLimit=entryLimit)
            eCount = epe.getEntryCount()
            rL = epe.getRefSeqAccessions(refDbName)
            logger.info("Reading polymer entity cache with repository entry count %d ref accession length %d ", eCount, len(rL))
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return rL

    def __rebuildCache(self, refDbName, idList, **kwargs):
        """Rebuild the cached reference sequence data for the input reference database and accession list."""
        dD = {}
        dirPath = kwargs.get("exdbDirPath", None)
        cacheKwargs = kwargs.get("cacheKwargs", None)
        useCache = kwargs.get("useCache", True)
        fetchLimit = kwargs.get("fetchLimit", None)
        saveText = kwargs.get("saveText", False)
        ext = "pic" if cacheKwargs["fmt"] == "pickle" else "json"
        fn = "ref-sequence-data-cache" + "." + ext
        cacheFilePath = os.path.join(dirPath, fn)
        self.__mU.mkdir(dirPath)
        if not useCache:
            for fp in [cacheFilePath]:
                try:
                    os.remove(fp)
                except Exception:
                    pass
        #
        if useCache and cacheFilePath and self.__mU.exists(cacheFilePath):
            dD = self.__mU.doImport(cacheFilePath, **cacheKwargs)
        else:
            dD = self.__fetchReferenceEntries(refDbName, idList, saveText=saveText, fetchLimit=fetchLimit)
            if cacheFilePath and cacheKwargs:
                self.__mU.mkdir(dirPath)
                ok = self.__mU.doExport(cacheFilePath, dD, **cacheKwargs)
                logger.info("Cache save status %r", ok)
        return dD["refDbCache"], dD["matchInfo"]

    def __fetchReferenceEntries(self, refDbName, idList, saveText=False, fetchLimit=None):
        """Fetch database entries from the input reference sequence database name."""
        dD = {"refDbName": refDbName, "refDbCache": {}, "matchInfo": {}}
        try:
            idList = idList[:fetchLimit] if fetchLimit else idList
            logger.info("Starting fetch for %d %s entries", len(idList), refDbName)
            if refDbName == "UNP":
                fobj = UniProtUtils(saveText=saveText)
                refD, matchD = fobj.fetchList(idList)
                dD = {"refDbName": refDbName, "refDbCache": refD, "matchInfo": matchD}
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return dD

    def __dumpEntries(self, refD):
        for (eId, eDict) in refD.items():
            logger.info("------ Entry id %s", eId)
            for k, v in eDict.items():
                logger.info("%-15s = %r", k, v)

    def getReferenceAccessionAlignSummary(self):
        """Summarize the alignment of PDB accession assignments with the current reference sequence database."""
        numPrimary = 0
        numSecondary = 0
        numNone = 0
        for _, mD in self.__matchD.items():
            if mD["matched"] == "primary":
                numPrimary += 1
            elif mD["matched"] == "secondary":
                numSecondary += 1
            else:
                numNone += 1
        logger.debug("Matched primary: %d secondary: %d none %d", numPrimary, numSecondary, numNone)
        return numPrimary, numSecondary, numNone
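# Usage sketch (assumes a configuration object cfgOb and a previously built entity-polymer cache;
# note that cacheKwargs must be supplied because __rebuildCache dereferences cacheKwargs["fmt"]):
#
#   rsu = ReferenceSequenceUtils(cfgOb, "UNP", exdbDirPath="./CACHE/exdb", cacheKwargs={"fmt": "json", "indent": 3})
#   numPrimary, numSecondary, numNone = rsu.getReferenceAccessionAlignSummary()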
class ReferenceSequenceAssignmentProvider(object):
    """Utilities to cache content required to update reference sequence assignments."""

    def __init__(
        self,
        cfgOb,
        databaseName="pdbx_core",
        collectionName="pdbx_core_polymer_entity",
        polymerType="Protein",
        referenceDatabaseName="UniProt",
        provSource="PDB",
        maxChunkSize=100,
        fetchLimit=None,
        **kwargs
    ):
        self.__cfgOb = cfgOb
        self.__polymerType = polymerType
        self.__mU = MarshalUtil()
        #
        self.__maxChunkSize = maxChunkSize
        self.__statusList = []
        #
        self.__pfP = self.__fetchPfamProvider(self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
        self.__ipP = self.__fetchInterProProvider(self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
        self.__ssP = self.__fetchSiftsSummaryProvider(self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
        self.__goP = self.__fetchGoProvider(self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
        self.__ecP = self.__fetchEcProvider(self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
        self.__refIdMapD, self.__matchD, self.__refD = self.__reload(databaseName, collectionName, polymerType, referenceDatabaseName, provSource, fetchLimit, **kwargs)

    def goIdExists(self, goId):
        try:
            return self.__goP.exists(goId)
        except Exception as e:
            logger.exception("Failing for %r with %s", goId, str(e))
        return False

    def getGeneOntologyLineage(self, goIdL):
        # "id" "name"
        gL = []
        try:
            gTupL = self.__goP.getUniqueDescendants(goIdL)
            for gTup in gTupL:
                gL.append({"id": gTup[0], "name": gTup[1]})
        except Exception as e:
            logger.exception("Failing for %r with %s", goIdL, str(e))
        return gL

    def getPfamProvider(self):
        return self.__pfP

    def getInterProProvider(self):
        return self.__ipP

    def getEcProvider(self):
        return self.__ecP

    def getSiftsSummaryProvider(self):
        return self.__ssP

    def getMatchInfo(self):
        return self.__matchD

    def getRefData(self):
        return self.__refD

    def getDocuments(self, formatType="exchange"):
        fobj = UniProtUtils(saveText=False)
        exObjD = fobj.reformat(self.__refD, formatType=formatType)
        return list(exObjD.values())

    def getRefIdMap(self):
        return self.__refIdMapD

    def getRefDataCount(self):
        return len(self.__refD)

    def testCache(self, minMatchPrimaryPercent=None, logSizes=False):
        okC = True
        logger.info("Reference cache lengths: refIdMap %d matchD %d refD %d", len(self.__refIdMapD), len(self.__matchD), len(self.__refD))
        ok = bool(self.__refIdMapD and self.__matchD and self.__refD)
        #
        numRef = len(self.__refIdMapD)
        countD = defaultdict(int)
        logger.info("Match dictionary length %d", len(self.__matchD))
        for _, mD in self.__matchD.items():
            if "matched" in mD:
                countD[mD["matched"]] += 1
        logger.info("Reference length %d match length %d coverage %r", len(self.__refD), len(self.__matchD), countD.items())
        if minMatchPrimaryPercent:
            try:
                okC = 100.0 * float(countD["primary"]) / float(numRef) > minMatchPrimaryPercent
            except Exception:
                okC = False
            logger.info("Primary reference match percent test status %r", okC)
        #
        if logSizes:
            logger.info(
                "Pfam %.2f InterPro %.2f SIFTS %.2f GO %.2f EC %.2f RefIdMap %.2f RefMatchD %.2f RefD %.2f",
                getObjSize(self.__pfP) / 1000000.0,
                getObjSize(self.__ipP) / 1000000.0,
                getObjSize(self.__ssP) / 1000000.0,
                getObjSize(self.__goP) / 1000000.0,
                getObjSize(self.__ecP) / 1000000.0,
                getObjSize(self.__refIdMapD) / 1000000.0,
                getObjSize(self.__matchD) / 1000000.0,
                getObjSize(self.__refD) / 1000000.0,
            )
        return ok and okC

    def __reload(self, databaseName, collectionName, polymerType, referenceDatabaseName, provSource, fetchLimit, **kwargs):
        assignRefD = self.__getPolymerReferenceSequenceAssignments(databaseName, collectionName, polymerType, fetchLimit)
        refIdMapD, _ = self.__getAssignmentMap(assignRefD, referenceDatabaseName=referenceDatabaseName, provSource=provSource)
        #
        entryIdL = [rcsbId[:4] for rcsbId in assignRefD]
        siftsUniProtL = self.__ssP.getEntryUniqueIdentifiers(entryIdL, idType="UNPID")
        logger.info("Incorporating %d SIFTS accessions for %d entries", len(siftsUniProtL), len(entryIdL))
        unpIdList = sorted(set(list(refIdMapD.keys()) + siftsUniProtL))
        #
        logger.info("Rebuild cache for %d UniProt accessions (consolidated)", len(unpIdList))
        #
        matchD, refD = self.__rebuildReferenceCache(unpIdList, referenceDatabaseName, **kwargs)
        return refIdMapD, matchD, refD

    def __getPolymerReferenceSequenceAssignments(self, databaseName, collectionName, polymerType, fetchLimit):
        """Get all accessions assigned to the input reference sequence database for the input polymerType.

        Returns:
            (dict): {"1abc_1": {"rcsb_polymer_entity_container_identifiers": {"reference_sequence_identifiers": []},
                                "rcsb_polymer_entity_align": [],
                                "rcsb_entity_source_organism": {"ncbi_taxonomy_id": []}}}
        """
        objD = {}
        try:
            obEx = ObjectExtractor(
                self.__cfgOb,
                databaseName=databaseName,
                collectionName=collectionName,
                cacheFilePath=None,
                useCache=False,
                keyAttribute="entity",
                uniqueAttributes=["rcsb_id"],
                cacheKwargs=None,
                objectLimit=fetchLimit,
                selectionQuery={"entity_poly.rcsb_entity_polymer_type": polymerType},
                selectionList=[
                    "rcsb_id",
                    "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers",
                    "rcsb_polymer_entity_container_identifiers.auth_asym_ids",
                    # "rcsb_polymer_entity_align",
                    # "rcsb_entity_source_organism.ncbi_taxonomy_id",
                    # "rcsb_polymer_entity_container_identifiers.related_annotation_identifiers",
                    # "rcsb_polymer_entity_annotation",
                    "rcsb_entity_source_organism.ncbi_taxonomy_id",
                ],
            )
            eCount = obEx.getCount()
            logger.info("Polymer entity count type %s is %d", polymerType, eCount)
            objD = obEx.getObjects()
            logger.info("Reading polymer entity count %d ref accession length %d ", eCount, len(objD))
            #
        except Exception as e:
            logger.exception("Failing for %s (%s) with %s", databaseName, collectionName, str(e))
        return objD

    def __getAssignmentMap(self, objD, referenceDatabaseName="UniProt", provSource="PDB"):
        refIdD = defaultdict(list)
        taxIdD = defaultdict(list)
        numMissing = 0
        numMissingTaxons = 0
        for entityKey, eD in objD.items():
            try:
                accS = set()
                for ii, tD in enumerate(eD["rcsb_polymer_entity_container_identifiers"]["reference_sequence_identifiers"]):
                    if tD["database_name"] == referenceDatabaseName and tD["provenance_source"] == provSource:
                        accS.add(tD["database_accession"])
                        refIdD[tD["database_accession"]].append(entityKey)
                        #
                        # pick up the corresponding taxonomy -
                        try:
                            taxIdD[tD["database_accession"]].append(eD["rcsb_entity_source_organism"][ii]["ncbi_taxonomy_id"])
                        except Exception:
                            logger.debug("Failing taxonomy lookup for %s %r", entityKey, tD["database_accession"])
                            numMissingTaxons += 1
                logger.debug("PDB assigned sequences length %d", len(accS))
            except Exception as e:
                numMissing += 1
                logger.debug("No sequence assignments for %s with %s", entityKey, str(e))
        #
        numMultipleTaxons = 0
        for refId, taxIdL in taxIdD.items():
            taxIdL = list(set(taxIdL))
            if len(taxIdL) > 1:
                logger.debug("Multiple taxIds assigned to reference sequence id %s: %r", refId, taxIdL)
                numMultipleTaxons += 1
        logger.info("Entities with missing taxonomy %d", numMissingTaxons)
        logger.info("Reference sequences with multiple taxonomies %d", numMultipleTaxons)
        logger.info(
            "Unique %s accession assignments by %s %d (entities missing archive accession assignments %d) ", referenceDatabaseName, provSource, len(refIdD), numMissing
        )
        return refIdD, taxIdD

    #
    def __rebuildReferenceCache(self, idList, refDbName, **kwargs):
        """Rebuild the reference sequence data and match index caches for the input accession list."""
        fetchLimit = None
        doMissing = True
        dD = {}
        cachePath = kwargs.get("cachePath", ".")
        dirPath = os.path.join(cachePath, "exdb")
        # cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "json", "indent": 3})
        cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "pickle"})
        useCache = kwargs.get("useCache", True)
        saveText = kwargs.get("saveText", False)
        #
        ext = "pic" if cacheKwargs["fmt"] == "pickle" else "json"
        fn = refDbName + "-ref-sequence-data-cache" + "." + ext
        dataCacheFilePath = os.path.join(dirPath, fn)
        #
        fn = refDbName + "-ref-sequence-id-cache" + ".json"
        accCacheFilePath = os.path.join(dirPath, fn)
        #
        self.__mU.mkdir(dirPath)
        if not useCache:
            for fp in [dataCacheFilePath, accCacheFilePath]:
                try:
                    os.remove(fp)
                except Exception:
                    pass
        #
        if useCache and accCacheFilePath and self.__mU.exists(accCacheFilePath) and dataCacheFilePath and self.__mU.exists(dataCacheFilePath):
            dD = self.__mU.doImport(dataCacheFilePath, **cacheKwargs)
            idD = self.__mU.doImport(accCacheFilePath, fmt="json")
            logger.info("Reading cached reference sequence ID and data cache files - cached match reference length %d", len(idD["matchInfo"]))
            idD["matchInfo"] = self.__rebuildReferenceMatchIndex(idList, dD["refDbCache"])
            # Check for completeness -
            if doMissing:
                missingS = set(idList) - set(idD["matchInfo"].keys())
                if missingS:
                    logger.info("Reference sequence cache missing %d accessions", len(missingS))
                    extraD, extraIdD = self.__fetchReferenceEntries(refDbName, list(missingS), saveText=saveText, fetchLimit=fetchLimit)
                    dD["refDbCache"].update(extraD["refDbCache"])
                    idD["matchInfo"].update(extraIdD["matchInfo"])
                    #
                    idD["matchInfo"] = self.__rebuildReferenceMatchIndex(idList, dD["refDbCache"])
                    #
                    if accCacheFilePath and dataCacheFilePath and cacheKwargs:
                        self.__mU.mkdir(dirPath)
                        ok1 = self.__mU.doExport(dataCacheFilePath, dD, **cacheKwargs)
                        ok2 = self.__mU.doExport(accCacheFilePath, idD, fmt="json", indent=3)
                        logger.info("Cache updated with missing references with status %r", ok1 and ok2)
            #
        else:
            logger.info("Rebuilding reference cache for %s for %d accessions with limit %r", refDbName, len(idList), fetchLimit)
            dD, idD = self.__fetchReferenceEntries(refDbName, idList, saveText=saveText, fetchLimit=fetchLimit)
            if accCacheFilePath and dataCacheFilePath and cacheKwargs:
                self.__mU.mkdir(dirPath)
                ok1 = self.__mU.doExport(dataCacheFilePath, dD, **cacheKwargs)
                ok2 = self.__mU.doExport(accCacheFilePath, idD, fmt="json", indent=3)
                logger.info("Cache save status %r", ok1 and ok2)
        return idD["matchInfo"], dD["refDbCache"]

    def __rebuildReferenceMatchIndex(self, idList, referenceD):
        fobj = UniProtUtils()
        logger.info("Rebuilding match index on idList (%d) using reference data (%d) %r", len(idList), len(referenceD), type(referenceD))
        matchD = fobj.rebuildMatchResultIndex(idList, referenceD)
        return matchD

    def __fetchReferenceEntries(self, refDbName, idList, saveText=False, fetchLimit=None):
        """Fetch database entries from the input reference sequence database name."""
        dD = {"refDbName": refDbName, "refDbCache": {}}
        idD = {"matchInfo": {}, "refIdMap": {}}
        try:
            idList = idList[:fetchLimit] if fetchLimit else idList
            logger.info("Starting fetch for %d %s entries", len(idList), refDbName)
            if refDbName == "UniProt":
                fobj = UniProtUtils(saveText=saveText)
                logger.info("Maximum reference chunk size %d", self.__maxChunkSize)
                refD, matchD = fobj.fetchList(idList, maxChunkSize=self.__maxChunkSize)
                dD = {"refDbName": refDbName, "refDbCache": refD}
                idD = {"matchInfo": matchD}
                #
                # Check the coverage -
                #
                countD = defaultdict(int)
                logger.info("Match dictionary length %d", len(matchD))
                for _, mD in matchD.items():
                    if "matched" in mD:
                        countD[mD["matched"]] += 1
                logger.info("Reference length %d match length %d coverage %r", len(refD), len(matchD), countD.items())
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return dD, idD

    def __fetchSiftsSummaryProvider(self, cfgOb, configName, **kwargs):
        abbreviated = kwargs.get("siftsAbbreviated", "TEST")
        cachePath = kwargs.get("cachePath", ".")
        cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "pickle"})
        useCache = kwargs.get("useCache", True)
        #
        siftsSummaryDataPath = cfgOb.getPath("SIFTS_SUMMARY_DATA_PATH", sectionName=configName)
        #
        logger.info("Using SIFTS_SUMMARY_DATA_PATH, %r", siftsSummaryDataPath)
        if siftsSummaryDataPath.lower().startswith("http"):
            srcDirPath = siftsSummaryDataPath
        else:
            srcDirPath = os.path.join(cachePath, siftsSummaryDataPath)
        cacheDirPath = os.path.join(cachePath, cfgOb.get("SIFTS_SUMMARY_CACHE_DIR", sectionName=configName))
        logger.debug("ssP %r %r", srcDirPath, cacheDirPath)
        ssP = SiftsSummaryProvider(srcDirPath=srcDirPath, cacheDirPath=cacheDirPath, useCache=useCache, abbreviated=abbreviated, cacheKwargs=cacheKwargs)
        ok = ssP.testCache()
        logger.debug("SIFTS cache status %r", ok)
        logger.debug("ssP entry count %d", ssP.getEntryCount())
        return ssP

    def __fetchGoProvider(self, cfgOb, configName, **kwargs):
        cachePath = kwargs.get("cachePath", ".")
        useCache = kwargs.get("useCache", True)
        #
        cacheDirPath = os.path.join(cachePath, cfgOb.get("EXDB_CACHE_DIR", sectionName=configName))
        logger.debug("goP %r %r", cacheDirPath, useCache)
        goP = GeneOntologyProvider(goDirPath=cacheDirPath, useCache=useCache)
        ok = goP.testCache()
        logger.debug("Gene Ontology (%r) root node count %r", ok, goP.getRootNodes())
        return goP

    def __fetchEcProvider(self, cfgOb, configName, **kwargs):
        cachePath = kwargs.get("cachePath", ".")
        useCache = kwargs.get("useCache", True)
        #
        cacheDirPath = os.path.join(cachePath, cfgOb.get("ENZYME_CLASSIFICATION_CACHE_DIR", sectionName=configName))
        logger.debug("ecP %r %r", cacheDirPath, useCache)
        ecP = EnzymeDatabaseProvider(enzymeDirPath=cacheDirPath, useCache=useCache)
        ok = ecP.testCache()
        logger.debug("Enzyme cache status %r", ok)
        return ecP

    def __fetchPfamProvider(self, cfgOb, configName, **kwargs):
        _ = cfgOb
        _ = configName
        cachePath = kwargs.get("cachePath", ".")
        useCache = kwargs.get("useCache", True)
        pfP = PfamProvider(cachePath=cachePath, useCache=useCache)
        ok = pfP.testCache()
        return pfP if ok else None

    def __fetchInterProProvider(self, cfgOb, configName, **kwargs):
        _ = cfgOb
        _ = configName
        cachePath = kwargs.get("cachePath", ".")
        useCache = kwargs.get("useCache", True)
        ipP = InterProProvider(cachePath=cachePath, useCache=useCache)
        ok = ipP.testCache()
        return ipP if ok else None
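# Usage sketch (assumes a configuration object cfgOb with the SIFTS/GO/Enzyme resource sections configured):
#
#   rsaP = ReferenceSequenceAssignmentProvider(cfgOb, polymerType="Protein", cachePath="./CACHE")
#   if rsaP.testCache(minMatchPrimaryPercent=80.0):
#       refIdMapD = rsaP.getRefIdMap()             # {uniprotAcc: [entityId, ...]}
#       docL = rsaP.getDocuments(formatType="exchange")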
class GlycanProvider(StashableBase):
    """Accessors and generators for entity glycan mapped identifiers.

    dirPath -> CACHE/glycan/
                            branched_entity_glycan_identifier_map.json
                            accession-wurcs-mapping.json
               stash/entity_glycan_mapped_identifiers.tar.gz
    """

    def __init__(self, **kwargs):
        #
        self.__version = "0.50"
        cachePath = kwargs.get("cachePath", ".")
        useCache = kwargs.get("useCache", True)
        self.__dirName = "glycan"
        self.__dirPath = os.path.join(cachePath, self.__dirName)
        super(GlycanProvider, self).__init__(cachePath, [self.__dirName])
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__glyD = self.__reload(fmt="json", useCache=useCache)
        #

    def testCache(self, minCount=1):
        if minCount == 0:
            return True
        if self.__glyD and minCount and ("identifiers" in self.__glyD) and len(self.__glyD["identifiers"]) >= minCount:
            logger.info("Glycan identifiers (%d)", len(self.__glyD["identifiers"]))
            return True
        return False

    def getIdentifiers(self):
        """Return a dictionary of related identifiers organized by branched entity id.

        Returns:
            (dict): {entityId: {'idType1': ids, 'idType2': ids}, ... }
        """
        try:
            return self.__glyD["identifiers"] if self.__glyD["identifiers"] else {}
        except Exception as e:
            logger.error("Failing with %r", str(e))
        return {}

    def __getMappingFilePath(self, fmt="json"):
        baseFileName = "branched_entity_glycan_identifier_map"
        fExt = ".json" if fmt == "json" else ".pic"
        fp = os.path.join(self.__dirPath, baseFileName + fExt)
        return fp

    def update(self, cfgOb, fmt="json", indent=3):
        """Update the branched entity glycan accession mapping cache.

        Args:
            cfgOb (object): ConfigInfo() object instance

        Returns:
            (bool): True for success or False otherwise
        """
        ok = False
        try:
            gU = GlycanUtils(cfgOb, self.__dirPath)
            eaD = gU.updateEntityAccessionMap()
            logger.info("Got branched entity glycan accession map (%d)", len(eaD))
            #
            tS = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
            self.__glyD = {"version": self.__version, "created": tS, "identifiers": eaD}
            #
            mappingFilePath = self.__getMappingFilePath(fmt=fmt)
            kwargs = {"indent": indent} if fmt == "json" else {}
            ok = self.__mU.doExport(mappingFilePath, self.__glyD, fmt=fmt, **kwargs)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok

    def reload(self):
        """Reload from the current cache file."""
        ok = False
        try:
            self.__glyD = self.__reload(fmt="json", useCache=True)
            ok = self.__glyD is not None
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok

    def __reload(self, fmt="json", useCache=True):
        mappingFilePath = self.__getMappingFilePath(fmt=fmt)
        tS = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        pcD = {"version": self.__version, "created": tS, "identifiers": {}}
        if useCache and self.__mU.exists(mappingFilePath):
            logger.info("Reading cached path %r", mappingFilePath)
            pcD = self.__mU.doImport(mappingFilePath, fmt=fmt)
        return pcD
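# Usage sketch for GlycanProvider (assumes a configuration object cfgOb granting access to the
# glycan mapping resources):
#
#   gP = GlycanProvider(cachePath="./CACHE", useCache=True)
#   ok = gP.update(cfgOb)                 # rebuild the branched entity -> glycan accession map
#   if ok and gP.testCache(minCount=1):
#       idD = gP.getIdentifiers()         # {entityId: {idType: ids, ...}, ...}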
class ChemCompIndexProvider(object): """Utilities to read and process an index of PDB chemical component definitions.""" def __init__(self, **kwargs): # self.__cachePath = kwargs.get("cachePath", ".") self.__dirPath = os.path.join(self.__cachePath, "chem_comp") self.__mU = MarshalUtil(workPath=self.__dirPath) self.__ccFileNamePrefix = kwargs.get("ccFileNamePrefix", "cc") self.__ccIdxD = self.__reload(**kwargs) def getIndexFilePath(self): return os.path.join( self.__dirPath, "%s-idx-chemical-components.json" % self.__ccFileNamePrefix) def testCache(self, minCount=None, logSizes=False): if logSizes and self.__ccIdxD: logger.info("ccIdxD (%.2f MB)", getObjSize(self.__ccIdxD) / 1000000.0) ok = self.__ccIdxD and len( self.__ccIdxD ) >= minCount if minCount else self.__ccIdxD is not None return ok def matchMolecularFormulaRange(self, typeRangeD, matchSubset=False): """Find matching formula for the input atom type range query (evaluates min <= ff <= max). Args: typeRangeD (dict): dictionary of element ranges {'<element_name>: {'min': <int>, 'max': <int>}} matchSubset (bool, optional): test for formula subset (default: False) Returns: (list): chemical component identifiers with matching formula (MatchResults) """ rL = [] try: if not typeRangeD: return rL myTypeRangeD = {k.upper(): v for k, v in typeRangeD.items()} queryTypeS = set(myTypeRangeD.keys()) for ccId, idxD in self.__ccIdxD.items(): tD = idxD["type-counts"] targetTypeS = set(tD.keys()) if not matchSubset and targetTypeS != queryTypeS: continue # if not queryTypeS.issubset(targetTypeS): continue # match = True for atomType, rangeD in myTypeRangeD.items(): if atomType in tD: # min <= ff <= max if ("min" in rangeD and rangeD["min"] > tD[atomType] ) or ("max" in rangeD and rangeD["max"] < tD[atomType]): match = False break else: match = False break if match: # logger.info("%s formula %r query %r", ccId, idxD["type-counts"], typeRangeD) rL.append( MatchResults(ccId=ccId, searchType="formula", formula=idxD["formula"])) except Exception as e: logger.exception("Failing for %r with %s", typeRangeD, str(e)) return rL def filterMinimumMolecularFormula(self, typeCountD): """Find molecules with the minimum formula composition for the input atom type range query (evaluates min <= ff). Args: typeCountD (dict): dictionary of element minimum values {'<element_name>: #} Returns: (list): chemical component identifiers """ rL = [] try: if not typeCountD: return list(self.__ccIdxD.keys()) typeQueryS = set(typeCountD.keys()) for ccId, idxD in self.__ccIdxD.items(): tD = idxD["type-counts"] # if not typeQueryS.issubset(tD): continue match = True for atomType, minCount in typeCountD.items(): try: if minCount > tD[atomType]: match = False break except Exception: match = False break if match: rL.append(ccId) except Exception as e: logger.exception("Failing for %r with %s", typeCountD, str(e)) return rL def filterMinimumFormulaAndFeatures(self, typeCountD, featureCountD): """Find molecules with the minimum formula and feature composition. 
Args: typeCountD (dict): dictionary of element minimum values {'<element_name>: #} featureCountD (dict): dictionary of feature minimum values {'<element_name>: #} Returns: (list): chemical component identifiers """ rL = [] try: if not typeCountD or not featureCountD: return list(self.__ccIdxD.keys()) # ---- featureQueryS = set(featureCountD.keys()) typeQueryS = set(typeCountD.keys()) # for ccId, idxD in self.__ccIdxD.items(): tD = idxD["type-counts"] fD = idxD["feature-counts"] # if not typeQueryS.issubset(tD) or not featureQueryS.issubset( fD): continue match = True for atomType, minCount in typeCountD.items(): try: if minCount > tD[atomType]: match = False break except Exception: match = False break if not match: continue # for featureType, minCount in featureCountD.items(): try: if minCount > fD[featureType]: match = False break except Exception: match = False break # if match: rL.append(ccId) except Exception as e: logger.exception("Failing for %r with %s", typeCountD, str(e)) return rL def getIndex(self): return self.__ccIdxD def getIdList(self): return list(self.__ccIdxD.keys()) if self.__ccIdxD else [] def getMol(self, ccId): try: return self.__ccIdxD[ccId] except Exception as e: logger.debug("Get molecule %r failing with %s", ccId, str(e)) return None def getSMILES(self, ccId, smiTypeList=None): smiTypeList = smiTypeList if smiTypeList else [ "oe-iso-smiles", "oe-smiles", "acdlabs-smiles", "cactvs-iso-smiles", "cactvs-smiles" ] try: sL = [] for smilesType in smiTypeList: if smilesType in self.__ccIdxD[ccId]: sL.append(self.__ccIdxD[ccId][smilesType]) return sL except Exception as e: logger.debug("Get SMILES for %r failing with %s", ccId, str(e)) return [] def __reload(self, **kwargs): """Reload or created index of PDB chemical components. Args: cachePath (str): path to the directory containing cache files ccIdxFileName (str): serialized chemical component data index file name Returns: (list): chemical component data containers """ # logger.debug("kwargs %r", kwargs.items()) ccIdxD = {} useCache = kwargs.get("useCache", True) molLimit = kwargs.get("molLimit", 0) ccIdxFilePath = self.getIndexFilePath() # if useCache and self.__mU.exists(ccIdxFilePath): _, fExt = os.path.splitext(ccIdxFilePath) ccIdxFormat = "json" if fExt == ".json" else "pickle" rdCcIdxD = self.__mU.doImport(ccIdxFilePath, fmt=ccIdxFormat) ccIdxD = { k: rdCcIdxD[k] for k in sorted(rdCcIdxD.keys())[:molLimit] } if molLimit else rdCcIdxD else: cmpKwargs = { k: v for k, v in kwargs.items() if k not in ["cachePath", "useCache", "molLimit"] } ccmP = ChemCompMoleculeProvider(cachePath=self.__cachePath, useCache=useCache, molLimit=molLimit, **cmpKwargs) ok = ccmP.testCache(minCount=molLimit, logSizes=True) if ok: molBuildType = cmpKwargs.get("molBuildType", "model-xyz") ccIdxD = self.__updateChemCompIndex(ccmP.getMolD(), ccIdxFilePath, molBuildType=molBuildType) # for idxD in ccIdxD.values(): idxD["atom-types"] = set(idxD["type-counts"].keys() ) if "type-counts" in idxD else set() idxD["feature-types"] = set(idxD["feature-counts"].keys( )) if "feature-counts" in idxD else set() # return ccIdxD def __updateChemCompIndex(self, ccObjD, filePath, molBuildType="model-xyz"): idxD = {} try: # Serialized chemical component data index file startTime = time.time() _, fExt = os.path.splitext(filePath) fileFormat = "json" if fExt == ".json" else "pickle" idxD = self.__buildChemCompIndex(ccObjD, molBuildType=molBuildType) ok = self.__mU.doExport(filePath, idxD, fmt=fileFormat) endTime = time.time() logger.info( "Storing %s with %d raw 
indexed definitions (status=%r) (%.4f seconds)", filePath, len(idxD), ok, endTime - startTime) # except Exception as e: logger.exception("Failing with %s", str(e)) # return idxD def __buildChemCompIndex(self, cD, molBuildType="model-xyz", doFeatures=True): """Internal method return a dictionary of extracted chemical component descriptors and formula.""" rD = {} try: quietFlag = True for _, dataContainer in cD.items(): ccIt = iter(PdbxChemCompIt(dataContainer)) cc = next(ccIt, None) ccId = cc.getId() formula = str(cc.getFormula()).replace(" ", "") ambiguousFlag = cc.getAmbiguousFlag().upper() in ["Y", "YES"] tch = cc.getFormalCharge() fcharge = int(tch) if tch and tch not in [".", "?"] else 0 # logger.debug("ccId %r formula %r ambiguous %r fcharge %r", ccId, formula, ambiguousFlag, fcharge) if fcharge: sign = "+" if fcharge > 0 else "-" mag = str(abs(fcharge)) if abs(fcharge) > 1 else "" formula = formula + sign + mag # atIt = PdbxChemCompAtomIt(dataContainer) typeCounts = defaultdict(int) for at in atIt: aType = at.getType().upper() typeCounts[aType] += 1 # rD[ccId] = { "formula": formula, "type-counts": typeCounts, "ambiguous": ambiguousFlag, "feature-counts": {} } desIt = PdbxChemCompDescriptorIt(dataContainer) for des in desIt: desBuildType = des.getMolBuildType() tS = des.getDescriptor() descr = tS.strip() if tS else None if not descr: continue if desBuildType in [ "oe-iso-smiles", "oe-smiles", "acdlabs-smiles", "cactvs-iso-smiles", "cactvs-smiles", "inchi", "inchikey" ]: rD[ccId][desBuildType] = descr else: logger.error("%s unexpected descriptor build type %r", ccId, desBuildType) if doFeatures: oemf = OeMoleculeFactory() if quietFlag: oemf.setQuiet() tId = oemf.setChemCompDef(dataContainer) if tId != ccId: logger.error( "%s chemical component definition import error", ccId) continue ok = oemf.build(molBuildType=molBuildType) if ok: rD[ccId]["feature-counts"] = oemf.getFeatureCounts() except Exception as e: logger.exception("Failing with %s", str(e)) return rD
class GlycanUtils: """Utilities for fetching and mapping glycan annotations.""" def __init__(self, cfgOb, dirPath): self.__cfgOb = cfgOb self.__dirPath = dirPath self.__mU = MarshalUtil(workPath=self.__dirPath) # def __getRawGlycanDetailsPath(self): return os.path.join(self.__dirPath, "pdb-raw-branched-entity-details.json") def getBranchedEntityDetails(self): """For branched entities, get BIRD mapping and WURCS details""" ok = False try: bEx = BranchedEntityExtractor(self.__cfgOb) branchedEntityD = bEx.getBranchedDetails() logger.info("Branched entity descriptor details count %d", len(branchedEntityD)) detailsPath = self.__getRawGlycanDetailsPath() ok = bEx.exportBranchedEntityDetails(detailsPath, fmt="json") logger.info("Store raw branched entity data (%r) %s", ok, detailsPath) except Exception as e: logger.exception("Failing with %s", str(e)) # return branchedEntityD def __getGlycanAccessionMapPath(self): return os.path.join(self.__dirPath, "accession-wurcs-mapping.json") def fetchGlycanAccessionMap(self): mapD = {} accessionMapPath = self.__getGlycanAccessionMapPath() if self.__mU.exists(accessionMapPath): mapD = self.__mU.doImport(accessionMapPath, fmt="json") return mapD def storeGlycanAccessionMap(self, mapD): accessionMapPath = self.__getGlycanAccessionMapPath() ok = self.__mU.doExport(accessionMapPath, mapD, fmt="json", indent=3) return ok def updateEntityAccessionMap(self): """Update entity to glycan accession mapping Returns: dict: {entityId: {'glyTouCanId':... , 'prdId': ..., }, ... } """ entityAccessionMapD = {} wurcsTupL = [] uniqueWurcsD = {} accessionMapD = self.fetchGlycanAccessionMap() branchedEntityD = self.getBranchedEntityDetails() for entityId, iD in branchedEntityD.items(): if iD["wurcs"] and iD["wurcs"] not in accessionMapD and iD["wurcs"] not in uniqueWurcsD: wurcsTupL.append((entityId, iD["wurcs"])) uniqueWurcsD.setdefault(iD["wurcs"], []).append(entityId) if wurcsTupL: tMap = self.getAccessionMapping(wurcsTupL) accessionMapD.update(tMap) self.storeGlycanAccessionMap(accessionMapD) # for entityId, iD in branchedEntityD.items(): if iD["wurcs"] in accessionMapD: prdId = iD["prdId"] if iD["wurcs"] else None entityAccessionMapD[entityId] = {"glyTouCanId": accessionMapD[iD["wurcs"]][0], "prdId": prdId} return entityAccessionMapD def getAccessionMapping(self, wurcsTupL): """Fetch GlyTouCan accessions for the input WURCS desriptor list""" accessionMapD = {} logger.info("Fetching (%d) WURCS descriptors", len(wurcsTupL)) baseUrl = "https://api.glycosmos.org" endPoint = "glytoucan/sparql/wurcs2gtcids" numDescriptors = len(wurcsTupL) for ii, (entityId, wurcs) in enumerate(wurcsTupL, 1): try: pD = {} pD["wurcs"] = wurcs uR = UrlRequestUtil() rDL, retCode = uR.post(baseUrl, endPoint, pD, returnContentType="JSON") logger.debug(" %r wurcs fetch result (%r) %r", entityId, retCode, rDL) if rDL: for rD in rDL: if "id" in rD: accessionMapD.setdefault(wurcs, []).append(rD["id"]) else: logger.info("%r fetch fails (%r) (%r) %r", entityId, retCode, wurcs, rDL) if ii % 5 == 0: logger.info("Fetched %d/%d", ii, numDescriptors) except Exception as e: logger.exception("Failing for (%r) wurcs (%r) with %s", entityId, wurcs, str(e)) return accessionMapD
class EcodClassificationProvider(StashableBase): """Extract ECOD domain assignments, term descriptions and ECOD classification hierarchy from ECOD flat files. http://prodata.swmed.edu/ecod/ See: H. Cheng, R. D. Schaeffer, Y. Liao, L. N. Kinch, J. Pei, S. Shi, B. H. Kim, N. V. Grishin. (2014) ECOD: An evolutionary classification of protein domains. PLoS Comput Biol 10(12): e1003926. Linking details: http://prodata.swmed.edu/ecod/complete/domain/<domainId> http://prodata.swmed.edu/ecod/complete/domain/e6sl5G1 """ # # -- def __init__(self, cachePath, useCache, **kwargs): self.__cachePath = cachePath self.__useCache = useCache dirName = "ecod" super(EcodClassificationProvider, self).__init__(self.__cachePath, [dirName]) self.__dirPath = os.path.join(cachePath, "ecod") self.__version = None # urlTarget = kwargs.get("ecodTargetUrl", "http://prodata.swmed.edu/ecod/distributions/ecod.latest.domains.txt") urlBackup = kwargs.get("ecodUrlBackupPath", "https://raw.githubusercontent.com/rcsb/py-rcsb_exdb_assets/master/fall_back/ECOD/ecod.latest.domains.txt.gz") # self.__mU = MarshalUtil(workPath=self.__dirPath) self.__pD, self.__nD, self.__ntD, self.__pdbD = self.__reload(urlTarget, urlBackup, self.__dirPath, useCache=useCache) def testCache(self): logger.info("ECOD Lengths nD %d pdbD %d", len(self.__nD), len(self.__pdbD)) if (len(self.__nD) > 100) and (len(self.__pdbD) > 5000): return True return False def getVersion(self): return self.__version # -- def getFamilyIds(self, pdbId, authAsymId): try: return list(set([tup[1] for tup in self.__pdbD[(pdbId.lower(), authAsymId)]])) except Exception as e: logger.exception("Failing for %r %r with %s", pdbId, authAsymId, str(e)) return [] def getDomainIds(self, pdbId, authAsymId): try: return list(set([tup[0] for tup in self.__pdbD[(pdbId.lower(), authAsymId)]])) except Exception as e: logger.exception("Failing for %r %r with %s", pdbId, authAsymId, str(e)) return [] def getFamilyNames(self, pdbId, authAsymId): try: return list(set([self.getName(tup[1]) for tup in self.__pdbD[(pdbId.lower(), authAsymId)]])) except Exception as e: logger.exception("Failing for %r %r with %s", pdbId, authAsymId, str(e)) return [] def getFamilyResidueRanges(self, pdbId, authAsymId): try: # pdbD.setdefault((pdbId, authAsymId), []).append((domId, fId, authAsymId, authSeqBeg, authSeqEnd)) return [(tup[0], tup[1], tup[2], tup[3], tup[4]) for tup in self.__pdbD[(pdbId.lower(), authAsymId)]] except Exception as e: logger.debug("Failing for %r %r with %s", pdbId, authAsymId, str(e)) return [] def getName(self, domId): try: return self.__nD[domId].split("|")[0] except Exception: logger.debug("Undefined ECOD id %r", domId) return None def getNameType(self, domId): qD = {"A": "Architecture", "X": "Possible Homology", "H": "Homology", "T": "Topology", "F": "Family"} try: return qD[self.__ntD[domId]] except Exception: logger.debug("Undefined ECOD id %r", domId) return None def getIdLineage(self, domId): pList = [] try: pList.append(domId) if domId == 0: return pList pt = self.__pD[domId] while (pt is not None) and (pt != 0): pList.append(pt) pt = self.__pD[pt] except Exception as e: logger.exception("Failing for %r with %s", domId, str(e)) # pList.reverse() return pList def getNameLineage(self, domId): try: nL = [] for dId in self.getIdLineage(domId): tN = self.getName(dId) tN = tN if tN else "Unnamed" nL.append(tN) return nL except Exception as e: logger.exception("Failing for %r with %s", domId, str(e)) return None def getTreeNodeList(self): return self.__exportTreeNodeList(self.__pD) def 
__getDomainFileName(self): pyVersion = sys.version_info[0] fn = "ecod_domains-py%s.pic" % str(pyVersion) return fn def __reload(self, urlTarget, urlBackup, ecodDirPath, useCache=True): pD = nD = ntD = pdbD = {} fn = self.__getDomainFileName() ecodDomainPath = os.path.join(ecodDirPath, fn) self.__mU.mkdir(ecodDirPath) # if useCache and self.__mU.exists(ecodDomainPath): sD = self.__mU.doImport(ecodDomainPath, fmt="pickle") logger.debug("ECOD domain length %d", len(sD)) nD = sD["names"] ntD = sD["nametypes"] pD = sD["parents"] pdbD = sD["assignments"] self.__version = sD["version"] elif not useCache: minLen = 1000 logger.info("Fetch ECOD name and domain assignment data from primary data source %s", urlTarget) nmL = self.__fetchFromSource(urlTarget) if not nmL: nmL = self.__fetchFromSource(urlBackup) # logger.info("ECOD raw file length (%d)", len(nmL)) ok = False pD, nD, ntD, pdbD = self.__extractDomainHierarchy(nmL) # tS = datetime.datetime.now().isoformat() vS = self.__version sD = {"version": vS, "created": tS, "names": nD, "nametypes": ntD, "parents": pD, "assignments": pdbD} if (len(nD) > minLen) and (len(pD) > minLen): ok = self.__mU.doExport(ecodDomainPath, sD, fmt="pickle") logger.debug("Cache save status %r", ok) # return pD, nD, ntD, pdbD def __fetchFromSource(self, urlTarget): """Fetch the classification names and domain assignments from the ECOD repo.""" fU = FileUtil() fn = fU.getFileName(urlTarget) fp = os.path.join(self.__dirPath, fn) if not fU.exists(fp): fU.get(urlTarget, fp) # with open(fp, "r", encoding="utf-8") as ifh: line = ifh.readline() line = ifh.readline() line = ifh.readline() ff = line[:-1].split() self.__version = ff[-1] # nmL = self.__mU.doImport(fp, fmt="list", uncomment=True) fU.remove(fp) # return nmL def __extractDomainHierarchy(self, nmL): """ #/data/ecod/database_versions/v280/ecod.develop280.domains.txt #ECOD version develop280 #Domain list version 1.6 #Grishin lab (http://prodata.swmed.edu/ecod) #uid ecod_domain_id manual_rep f_id pdb chain pdb_range seqid_range unp_acc arch_name x_name h_name t_name f_name asm_status ligand 002728551 e7d2xA1 AUTO_NONREP 1.1.1 7d2x A A:-3-183 A:20-206 NO_UNP beta barrels "cradle loop barrel" "RIFT-related" "acid protease" F_UNCLASSIFIED 002728572 e7d5aA2 AUTO_NONREP 1.1.1 7d5a A A:-3-183 A:20-206 NO_UNP beta barrels "cradle loop barrel" "RIFT-related" "acid protease" F_UNCLASSIFIED 002726563 e7b1eA1 AUTO_NONREP 1.1.1 7b1e A A:46P-183 A:14-199 NO_UNP beta barrels "cradle loop barrel" "RIFT-related" "acid protease" F_UNCLASSIFIED 002726573 e7b1pA2 AUTO_NONREP 1.1.1 7b1p A A:47P-183 A:15-199 NO_UNP beta barrels "cradle loop barrel" "RIFT-related" "acid protease" F_UNCLASSIFIED """ assignD = {} pD = {} ntD = {} hD = {} pIdD = {} nmD = {} # logger.info("Length of input ECOD name list %d", len(nmL)) for nm in nmL: ff = nm.split("\t") # uId = ff[0] # ecodId is the linkable identifier - ecodId = ff[1] entryId = ff[4].lower() authAsymId = ff[5] resRange = ff[6] # # There are no unique identifiers published for the internal elements of the hierarchy # so these are assigned here similar to scop - There are also many unnamed nodes # that are conventionally filled in from the leaf levels of the tree... 
# {"A": "Architecture", "X": "Possible Homology", "H": "Homology", "T": "Topology", "F": "Family"} aGroupOrg = "A: " + ff[9].replace('"', "") xGroupOrg = "X: " + ff[10].replace('"', "") hGroupOrg = "H: " + ff[11].replace('"', "") tGroupOrg = "T: " + ff[12].replace('"', "") fGroupOrg = "F: " + ff[13].replace('"', "") if hGroupOrg == "H: NO_H_NAME": # hGroupOrg = tGroupOrg + "|(NO_H)" hGroupOrg = "H: " + ff[12].replace('"', "") + " (From Topology)" + "|(NO_H)" if xGroupOrg == "X: NO_X_NAME": if ff[11].replace('"', "") == "NO_H_NAME": # xGroupOrg = hGroupOrg + "|(NO_X)" xGroupOrg = "X: " + ff[12].replace('"', "") + " (From Topology)" + "|(NO_X)" else: xGroupOrg = "X: " + ff[11].replace('"', "") + " (From Homology)" + "|(NO_X)" # fGroupOrg = fGroupOrg if fGroupOrg != "F_UNCLASSIFIED" else "Unmapped domain of " + tGroupOrg # # Remove redundancy in names and assign unique ids # aGroup = aGroupOrg xGroup = xGroupOrg + "|" + aGroupOrg hGroup = hGroupOrg + "|" + xGroupOrg + "|" + aGroupOrg tGroup = tGroupOrg + "|" + hGroupOrg + "|" + xGroupOrg fGroup = fGroupOrg + "|" + tGroupOrg # hD.setdefault("A", set()).add(aGroup) hD.setdefault("X", set()).add(xGroup) hD.setdefault("H", set()).add(hGroup) hD.setdefault("T", set()).add(tGroup) hD.setdefault("F", set()).add(fGroup) aId = 100000 + len(hD["A"]) xId = 200000 + len(hD["X"]) hId = 300000 + len(hD["H"]) tId = 400000 + len(hD["T"]) fId = 500000 + len(hD["F"]) # # if xGroup in pD and pD[xGroup] != aGroup: logger.error("skipping %r multiple parents for xGroup %r %r and %r ", ecodId, xGroup, pD[xGroup], aGroup) continue # if hGroup in pD and pD[hGroup] != xGroup: logger.error("skipping %r multiple parents for hGroup %r %r and %r ", ecodId, hGroup, pD[hGroup], xGroup) continue # if tGroup in pD and pD[tGroup] != hGroup: logger.error("skipping %r multiple parents for tGroup %r %r and %r ", ecodId, tGroup, pD[tGroup], hGroup) continue # if fGroup in pD and pD[fGroup] != tGroup: logger.error("skipping %r multiple parents for fGroup %r %r and %r ", ecodId, fGroup, pD[fGroup], tGroup) continue if xId in pIdD and pIdD[xId] != aId: logger.error("skipped %r multiple parents for xId %r %r and %r ", ecodId, xId, pIdD[xId], aId) # if hId in pIdD and pIdD[hId] != xId: logger.error("skipped %r multiple parents for hId %r %r and %r ", ecodId, hId, pIdD[hId], xId) # if tId in pIdD and pIdD[tId] != hId: logger.error("skipped %r multiple parents for tId %r %r and %r ", ecodId, tId, pIdD[tId], hId) # if fId in pIdD and pIdD[fId] != tId: logger.error("skipped %r multiple parents for fId %r %r and %r ", ecodId, fId, pIdD[fId], tId) # pIdD[aId] = 0 pIdD[xId] = aId pIdD[hId] = xId pIdD[tId] = hId pIdD[fId] = tId # nmD[aId] = aGroupOrg nmD[xId] = xGroupOrg nmD[hId] = hGroupOrg nmD[tId] = tGroupOrg nmD[fId] = fGroupOrg # ntD[aId] = "A" ntD[xId] = "X" ntD[hId] = "H" ntD[tId] = "T" ntD[fId] = "F" rL = self.__parseRanges(resRange) if (entryId, authAsymId) not in assignD: assignD[(entryId, authAsymId)] = [(ecodId, fId, t[0], t[1], t[2]) for t in rL] else: for t in rL: assignD[(entryId, authAsymId)].append((ecodId, fId, t[0], t[1], t[2])) # return pIdD, nmD, ntD, assignD def __parseRanges(self, rS): rL = [] authAsymId = authSeqBeg = authSeqEnd = None try: tSL = rS.split(",") for tS in tSL: fL = tS.split(":") authAsymId = fL[0] rS = fL[1] if rS[0] == "-": authSeqBeg = -int(rS[1:].split("-")[0]) authSeqEnd = int(rS[1:].split("-")[1]) else: authSeqBeg = int(rS.split("-")[0]) authSeqEnd = int(rS.split("-")[1]) rL.append((authAsymId, authSeqBeg, authSeqEnd)) except Exception: pass return 
rL def __exportTreeNodeList(self, pD): """Create node list from name dictionary and lineage dictionaries.""" # rootId = 0 pL = [rootId] # logger.info("pD %d pL %r", len(pD), pL) # -- # # create child dictionary cD = {} for ctId, ptId in pD.items(): cD.setdefault(ptId, []).append(ctId) # logger.info("cD %d", len(cD)) # idL = [] for rootId in sorted(pL): visited = set([rootId]) queue = collections.deque(visited) while queue: tId = queue.popleft() idL.append(tId) if tId not in cD: # logger.debug("No children for Ecod tId %s", tId) continue for childId in cD[tId]: if childId not in visited: queue.append(childId) visited.add(childId) # dL = [] for tId in idL: displayName = self.getName(tId) ptId = pD[tId] if tId in pD else None lL = self.getIdLineage(tId)[1:] # if tId == rootId: continue elif ptId == rootId: dD = {"id": str(tId), "name": displayName, "depth": 0} else: dD = {"id": str(tId), "name": displayName, "parents": [str(ptId)], "depth": len(lL)} dL.append(dD) return dL
def search(self, dataList, procName, optionsD, workingDir): """Worker method to execute a shell to search CCDC for the input mol2 path list. Args: dataList (list): list of mol2 file paths to be searched procName (str): processName optionsD (dict): dictionary of options workingDir (str): path to working directory (not used) Returns: (successList, resultList, []): success and result lists of mol2 paths with CCDC matches """ resultPath = optionsD["resultPath"] searchType = optionsD["searchType"] pythonRootPath = optionsD["pythonRootPath"] csdHome = optionsD["csdHome"] _ = workingDir resultList = [] startTime = time.time() logger.info("starting %s at %s", procName, time.strftime("%Y %m %d %H:%M:%S", time.localtime())) # try: stopPath = os.path.join(resultPath, "STOP") logger.info("%s search list length %d", procName, len(dataList)) if self.__checkStop(stopPath): logger.info("%s stopping", procName) return resultList, resultList, [] # queryListFilePath = os.path.join(resultPath, procName, "queryFileList.list") mU = MarshalUtil() ok = mU.doExport(queryListFilePath, dataList, fmt="list") if not ok: return resultList, resultList, [] # exU = ExecUtils() logger.info("%s executing shell for %s", procName, queryListFilePath) cmdPath = os.path.join(pythonRootPath, "bin", "ccdc_search_cli") hitListPath = os.path.join(resultPath, procName, "hitList.list") logPath = os.path.join(resultPath, procName, "execlog.log") logger.info("cmdPath %r", cmdPath) ok = exU.runShell( "%s --mol_list_path %s --result_path %s --search_type %s --csdhome %s --hit_list_path %s" % (cmdPath, queryListFilePath, resultPath, searchType, csdHome, hitListPath), outPath=logPath, outAppend=False, timeOut=60, suppressStderr=False, ) # if ok and mU.exists(hitListPath): resultList = mU.doImport(hitListPath, fmt="list") except Exception as e: logger.exception("Failing with %s", str(e)) endTime = time.time() logger.info("%s (result len %d) completed at %s (%.2f seconds)", procName, len(resultList), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - startTime) return resultList, resultList, []
class LigandNeighborMappingProvider(StashableBase): """Accessors for essential ligand neighbor mapping details associated with polymer and branched entity instances.""" def __init__(self, cachePath, useCache=True): # self.__cachePath = cachePath self.__useCache = useCache self.__dirName = "ligand-neighbor-mapping" super(LigandNeighborMappingProvider, self).__init__(self.__cachePath, [self.__dirName]) self.__dirPath = os.path.join(self.__cachePath, self.__dirName) # self.__mU = MarshalUtil(workPath=self.__dirPath) self.__mapD = self.__reload(self.__dirPath, useCache) # def testCache(self, minCount=0): logger.info("Cached ligand neighbor mapping count %d", len(self.__mapD["mapping"]) if "mapping" in self.__mapD else 0) if minCount == 0 or self.__mapD and "mapping" in self.__mapD and len(self.__mapD["mapping"]) >= minCount: return True else: return False def getLigandNeighbors(self, rcsbEntityId): """Get the unique list of ligand neighbors for the input polymer or branched entity instance. Args: rcsbEntityId (str): entryId '_' entityId Returns: list: [chem_comp_id, ... ] """ try: return list(set([t[0] for t in self.__mapD["mapping"][rcsbEntityId.upper()]])) except Exception: return [] def __reload(self, dirPath, useCache): startTime = time.time() retD = {} ok = False mappingPath = self.__getMappingDataPath() # logger.info("useCache %r mappingPath %r", useCache, mappingPath) if useCache and self.__mU.exists(mappingPath): retD = self.__mU.doImport(mappingPath, fmt="json") ok = True else: fU = FileUtil() fU.mkdir(dirPath) # --- num = len(retD["mapping"]) if "mapping" in retD else 0 logger.info("Completed ligand mapping reload (%d) with status (%r) at %s (%.4f seconds)", num, ok, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime) return retD def __getMappingDataPath(self): return os.path.join(self.__dirPath, "ligand-neighbor-mapping-data.json") def fetchLigandNeighborMapping(self, cfgOb): """Fetch ligand neighbor mapping details Args: cfgOb (obj): instance configuration class ConfigUtil() Returns: bool: True for success or False otherwise """ try: lnmEx = LigandNeighborMappingExtractor(cfgOb) lnD = lnmEx.getLigandNeighbors() fp = self.__getMappingDataPath() tS = datetime.datetime.now().isoformat() vS = datetime.datetime.now().strftime("%Y-%m-%d") ok = self.__mU.doExport(fp, {"version": vS, "created": tS, "mapping": lnD}, fmt="json", indent=3) return ok except Exception as e: logger.exception("Failing with %s", str(e)) return False
class SAbDabTargetFeatureProvider(StashableBase): """Accessors for Thera-SAbDab(Therapeutic Structural Antibody Database) target features.""" # Link out using the INN therapeutic name - # http://opig.stats.ox.ac.uk/webapps/newsabdab/therasabdab/search/?therapeutic=Coltuximab def __init__(self, **kwargs): # self.__cachePath = kwargs.get("cachePath", ".") useCache = kwargs.get("useCache", True) self.__dirName = "SAbDab-features" super(SAbDabTargetFeatureProvider, self).__init__(self.__cachePath, [self.__dirName]) self.__dirPath = os.path.join(self.__cachePath, self.__dirName) # self.__mU = MarshalUtil(workPath=self.__dirPath) self.__fD = self.__reload(self.__dirPath, useCache) # def testCache(self, minCount=500): logger.info( "Therapeutic SAbDab feature count %d Assignment count %d", len(self.__fD["features"]) if "features" in self.__fD else 0, len(self.__fD["assignments"]) if "assignments" in self.__fD else 0, ) if self.__fD and "features" in self.__fD and len(self.__fD["features"]) > minCount and self.__fD and "assignments" in self.__fD and len(self.__fD["assignments"]): return True else: return False def getVersion(self): try: return self.__fD["version"] except Exception: pass return None def hasFeatures(self, rcsbEntityId): return rcsbEntityId.upper() in self.__fD["features"] def getFeatures(self, rcsbEntityId): try: return self.__fD["features"][rcsbEntityId.upper()] except Exception: return [] def getAssignment(self, instanceId, featureKey): """Return the value of the key feature for the input instance identifier. Args: instanceId (str): instance identifier '<pdbId>.<authAsymId>' featureKey (str): assignment feature key: one of pdb|Hchain|Lchain|model|antigen_chain|antigen_type| antigen_het_name|antigen_name|heavy_subclass|light_subclass|light_ctype) Returns: str: feature value or None """ fVal = None try: fVal = self.__fD["assignments"][instanceId][featureKey] except Exception: fVal = None return fVal def hasAssignment(self, instanceId): """Return if assignment data is available for the input instance. Args: instanceId (str): instance identifier '<pdbId>.<authAsymId>' Returns: bool: True for success or False otherwise """ return instanceId in self.__fD["assignments"] def __getFeatureDataPath(self): return os.path.join(self.__dirPath, "sabdab-feature-data.json") def reload(self): self.__fD = self.__reload(self.__dirPath, True) return True def __reload(self, dirPath, useCache): startTime = time.time() fD = {} ok = False featurePath = self.__getFeatureDataPath() # logger.info("useCache %r featurePath %r", useCache, featurePath) if useCache and self.__mU.exists(featurePath): fD = self.__mU.doImport(featurePath, fmt="json") else: fU = FileUtil() fU.mkdir(dirPath) # --- logger.info("Completed reload (%r) at %s (%.4f seconds)", ok, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime) return fD def buildFeatureList(self, sequenceMatchFilePath): """Build polymer entity feature list for the matching entities in the input sequence match file. 
Args: sequenceMatchFilePath (str): sequence match output file path Returns: bool: True for success or False otherwise """ rDL = [] stP = SAbDabTargetProvider(cachePath=self.__cachePath, useCache=False) mD = self.__mU.doImport(sequenceMatchFilePath, fmt="json") # provenanceSource = "SAbDab" refScheme = "PDB entity" assignVersion = stP.getAssignmentVersion() # # - sort out if we match light and heavy chains # iD = {} fullMatchD = {} for queryId, matchDL in mD.items(): qCmtD = self.__decodeComment(queryId) # Tanezumab|therapeutic|light|chain thName = qCmtD["therapeutic"] chainType = qCmtD["chain"] for matchD in matchDL: tCmtD = self.__decodeComment(matchD["target"]) entryId = tCmtD["entityId"].split("_")[0] entityId = tCmtD["entityId"].split("_")[1] iD[(thName, chainType, entryId)] = entityId logger.info("Match index length (%d)", len(iD)) for (thName, chainType, entryId), entityId in iD.items(): if chainType == "light": continue if (thName, "light", entryId) in iD: fullMatchD[(thName, "heavy", entryId, entityId)] = True lEntityId = iD[(thName, "light", entryId)] fullMatchD[(thName, "light", entryId, lEntityId)] = True logger.info("Antibody entity match length (%d)", len(fullMatchD)) # # - Add features for full matches - for queryId, matchDL in mD.items(): qCmtD = self.__decodeComment(queryId) # Tanezumab|therapeutic|light|chain thName = qCmtD["therapeutic"] chainType = qCmtD["chain"] # for matchD in matchDL: fpL = [] if "alignedRegions" in matchD: fpL = [{"beg_seq_id": arD["targetBegin"], "end_seq_id": arD["targetEnd"]} for arD in matchD["alignedRegions"]] else: fpL = [{"beg_seq_id": matchD["targetBegin"], "end_seq_id": matchD["targetEnd"]}] # tCmtD = self.__decodeComment(matchD["target"]) entryId = tCmtD["entityId"].split("_")[0] entityId = tCmtD["entityId"].split("_")[1] if (thName, chainType, entryId, entityId) not in fullMatchD: continue ii = 1 for fType, fKy in [ ("SABDAB_ANTIBODY_NAME", "antibodyName"), ("SABDAB_ANTIBODY_FORMAT", "antibodyFormat"), ("SABDAB_ANTIBODY_CH1_ISOTYPE", "ch1IsoType"), ("SABDAB_ANTIBODY_LIGHT_CHAIN_TYPE", "VD_LC"), ("SABDAB_ANTIBODY_TARGET", "target"), ]: if fType == "Antibody_Light_Chain_Type" and chainType == "heavy": continue fVL = stP.getFeatures(thName, fKy) if not fVL: continue for fV in fVL: rD = { "entry_id": entryId, "entity_id": entityId, "type": fType, "feature_id": thName + "_" + chainType + "_" + str(ii), "name": fV, "provenance_source": provenanceSource, "reference_scheme": refScheme, "assignment_version": assignVersion, "feature_positions": fpL, } rDL.append(rD) ii += 1 # qD = {} for rD in rDL: eId = rD["entry_id"] + "_" + rD["entity_id"] qD.setdefault(eId, []).append(rD) # logger.info("Antibody matches (%d)", len(qD)) # fp = self.__getFeatureDataPath() tS = datetime.datetime.now().isoformat() vS = datetime.datetime.now().strftime("%Y-%m-%d") ok = self.__mU.doExport(fp, {"version": vS, "created": tS, "features": qD, "assignments": stP.getAssignments()}, fmt="json", indent=3) return ok def __decodeComment(self, comment, separator="|"): dD = {} try: ti = iter(comment.split(separator)) dD = {tup[1]: tup[0] for tup in zip(ti, ti)} except Exception: pass return dD
class GlyGenProvider(StashableBase): """Fetch glycans and glycoproteins available in the GlyGen.org resource. GlyGen glycan link template - https://glygen.org/glycan/G28882EF Glycoprotein link template - https://www.glygen.org/protein/Q658T7 """ def __init__(self, **kwargs): # dirName = "glygen" cachePath = kwargs.get("cachePath", ".") self.__dirPath = os.path.join(cachePath, dirName) super(GlyGenProvider, self).__init__(cachePath, [dirName]) useCache = kwargs.get("useCache", True) # baseUrl = kwargs.get( "glygenBasetUrl", "https://data.glygen.org/ln2data/releases/data/v-1.12.3/reviewed/") fallbackUrl = kwargs.get( "glygenFallbackUrl", "https://raw.githubusercontent.com/rcsb/py-rcsb_exdb_assets/master/fall_back/glygen/" ) # self.__mU = MarshalUtil(workPath=self.__dirPath) self.__glycanD = self.__reloadGlycans(baseUrl, fallbackUrl, self.__dirPath, useCache=useCache) self.__glycoproteinD = self.__reloadGlycoproteins(baseUrl, fallbackUrl, self.__dirPath, useCache=useCache) def testCache(self, minGlycanCount=20000, minGlycoproteinCount=64000): # logger.info("GlyGen glycan list (%d) glycoprotein list (%d)", len(self.__glycanD), len(self.__glycoproteinD)) if self.__glycanD and len( self.__glycanD ) > minGlycanCount and self.__glycoproteinD and len( self.__glycoproteinD) > minGlycoproteinCount: return True return False def hasGlycan(self, glyTouCanId): try: return glyTouCanId in self.__glycanD except Exception: return False def hasGlycoprotein(self, uniProtId): try: return uniProtId in self.__glycoproteinD except Exception: return False def getGlycans(self): return self.__glycanD def getGlycoproteins(self): return self.__glycoproteinD def __reloadGlycans(self, baseUrl, fallbackUrl, dirPath, useCache=True): gD = {} logger.debug("Using dirPath %r", dirPath) self.__mU.mkdir(dirPath) # myDataPath = os.path.join(dirPath, "glygen-glycan-list.json") if useCache and self.__mU.exists(myDataPath): gD = self.__mU.doImport(myDataPath, fmt="json") logger.debug("GlyGen glycan data length %d", len(gD)) elif not useCache: logger.debug( "Fetch GlyGen glycan data from primary data source %s", baseUrl) endPoint = os.path.join(baseUrl, "glycan_masterlist.csv") # logger.info("Fetch GlyGen glycan data from primary data source %s", endPoint) rawPath = os.path.join(dirPath, "glycan_masterlist.csv") fU = FileUtil() ok = fU.get(endPoint, rawPath) logger.debug("Fetch GlyGen glycan data status %r", ok) if not ok: endPoint = os.path.join(fallbackUrl, "glycan_masterlist.csv") ok = fU.get(endPoint, rawPath) logger.info("Fetch fallback GlyGen glycan data status %r", ok) # if ok: gD = self.__parseGlycanList(rawPath) ok = self.__mU.doExport(myDataPath, gD, fmt="json") logger.info("Exported GlyGen glycan list (%d) (%r) %s", len(gD), ok, myDataPath) # return gD def __parseGlycanList(self, filePath): gD = {} row = None try: rowL = self.__mU.doImport(filePath, fmt="csv", rowFormat="list") logger.debug("Glycan list length (%d)", len(rowL)) logger.debug("Row 0 %r", rowL[0]) for row in rowL[1:]: gD[row[0]] = row[1] except Exception as e: logger.exception("Failing for %r (%r) with %s", filePath, row, str(e)) return gD def __reloadGlycoproteins(self, baseUrl, fallbackUrl, dirPath, useCache=True): gD = {} logger.debug("Using dirPath %r", dirPath) self.__mU.mkdir(dirPath) # myDataPath = os.path.join(dirPath, "glygen-glycoprotein-list.json") if useCache and self.__mU.exists(myDataPath): gD = self.__mU.doImport(myDataPath, fmt="json") logger.debug("GlyGen glycoprotein data length %d", len(gD)) else: for fn in [ 
"sarscov1_protein_masterlist.csv", "sarscov2_protein_masterlist.csv", "hcv1b_protein_masterlist.csv", "hcv1a_protein_masterlist.csv", "human_protein_masterlist.csv", "mouse_protein_masterlist.csv", "rat_protein_masterlist.csv", ]: logger.debug( "Fetch GlyGen glycoprotein data from primary data source %s", baseUrl) endPoint = os.path.join(baseUrl, fn) # logger.debug( "Fetch GlyGen glycoprotein data from primary data source %s", endPoint) rawPath = os.path.join(dirPath, fn) fU = FileUtil() ok = fU.get(endPoint, rawPath) logger.debug("Fetch GlyGen glycoprotein data status %r", ok) if not ok: endPoint = os.path.join(fallbackUrl, fn) ok = fU.get(endPoint, rawPath) logger.info("Fetch fallback GlyGen data status %r", ok) # if ok: tD = self.__parseGlycoproteinList(rawPath) gD.update(tD) # ok = self.__mU.doExport(myDataPath, gD, fmt="json") logger.info("Exported GlyGen glycoprotein list (%d) (%r) %s", len(gD), ok, myDataPath) # return gD def __parseGlycoproteinList(self, filePath): gD = {} try: rowL = self.__mU.doImport(filePath, fmt="csv", rowFormat="list") for row in rowL[1:]: ff = row[0].split("-") gD[ff[0]] = ff[1] except Exception as e: logger.exception("Failing for %r with %s", filePath, str(e)) return gD
def search(self, queryTargetId, queryTargetPath, resultPath, normalizeFlag=True, maxHits=50, searchType="similarity", suppressMetals=False): """Search the CCDC database for similar or substructure matches for the input query molecule. Args: queryTargetId (str): query identifier queryTargetPath (str): path to the query molfile (mol, sdf, mol2) resultPath (str): output path to match results normalizeFlag (bool, optional): do standard perceptions on matching molecules. Defaults to True. maxHits (int, optional): maximum number of matches to return. Defaults to 50. searchType (str, optional): search mode (substructure, similarity). Defaults to "similarity". suppressMetals (bool, optional): filter structures containing metals. Defaults to False. Returns: (int): number of matches """ mU = MarshalUtil() logger.info("Start search for target %s path %s result path %s", queryTargetId, queryTargetPath, resultPath) # summaryList = [] # targetDirPath = os.path.dirname(queryTargetPath) cifTargetPath = os.path.join(targetDirPath, queryTargetId + ".cif") # targetStructures = EntryReader(queryTargetPath) dirPath = os.path.join(resultPath, queryTargetId) numHits = 0 for ii, e in enumerate(targetStructures, 1): numHits = 0 startTime = time.time() targetMol = e.molecule if normalizeFlag: targetMol.assign_bond_types(which="unknown") targetMol.standardise_aromatic_bonds() targetMol.standardise_delocalised_bonds() # logger.info("(%d) begin %s search - query id %s", ii, searchType, queryTargetId) if searchType == "similarity": hits = self.__similaritySearch(targetMol, suppressMetals=suppressMetals) elif searchType == "substructure": hits = self.__moleculeSubstructureSearch( targetMol, suppressMetals=suppressMetals) else: hits = [] logger.info("(%d) completed search query id %s in %.3f seconds", ii, queryTargetId, time.time() - startTime) if hits: numHits += len(hits) logger.info("(%d) search for %s matched %d: %r", ii, queryTargetId, numHits, [targetHit.identifier for targetHit in hits]) # for targetHit in hits[:maxHits]: # hI = CcdcMatchIndexInst() hI.setCsdVersion(csd_version()) hI.setCsdDirectory(csd_directory()) hI.setTargetId(queryTargetId) hI.setTargetPath(queryTargetPath) if mU.exists(cifTargetPath): hI.setTargetCcPath(cifTargetPath) hI.setIdentifier(targetHit.identifier) hI.setMatchType(searchType) try: hI.setRFactor(targetHit.entry.r_factor) hI.setChemicalName(targetHit.entry.chemical_name) hI.setTemperature(targetHit.entry.temperature) hI.setRadiationSource(targetHit.entry.radiation_source) hI.setHasDisorder("N") cit = targetHit.entry.publication if cit.doi is not None: hI.setCitationDOI(cit.doi) if searchType == "similarity": hI.setSimilarityScore(targetHit.similarity) elif searchType == "substructure": hI.setMatchedAtomLength( len(targetHit.match_atoms())) except Exception as e: logger.exception("Failing with %s", str(e)) # # mU.mkdir(dirPath) mol2L = [] if searchType == "substructure": for jj, mc in enumerate(targetHit.match_components(), 1): fp = os.path.join( dirPath, queryTargetId + "_" + targetHit.identifier + "_%03d" % jj + ".mol2") mol2L.append(fp) with MoleculeWriter(fp) as ofh: ofh.write(mc) # Replace the title line with open(fp) as fin: lines = fin.readlines() lines[1] = lines[1].replace( "00", targetHit.identifier) # with open(fp, "w") as fout: fout.write("".join(lines)) # fp = os.path.join( dirPath, queryTargetId + "_" + targetHit.identifier + "_%03d" % jj + ".sdf") with MoleculeWriter(fp) as ofh: ofh.write(mc) # Replace the title line with open(fp) as fin: lines = fin.readlines() lines[0] 
= lines[0].replace( "00", targetHit.identifier) # with open(fp, "w") as fout: fout.write("".join(lines)) # # Check for multiple generated result files - # for jj, fp in enumerate(mol2L, 1): logger.debug("(%d) adding component fp %s", jj, fp) hI.setMatchNumber(jj) hI.setMol2Path(fp) tt = fp[:-4] + "sdf" hI.setMolPath(tt) summaryList.append(copy.deepcopy(hI.get())) # else: hI.setMatchNumber(1) summaryList.append(copy.deepcopy(hI.get())) else: logger.info("(%d) search for %s returns no matches", ii, targetMol.identifier) hits = None # if numHits > 0: mU.mkdir(dirPath) fp = os.path.join(dirPath, queryTargetId + "-index.json") cmI = CcdcMatchIndex(indexFilePath=fp, verbose=self.__verbose) cmI.load(summaryList) cmI.writeIndex() return numHits
def build(self, alignType="relaxed-stereo", numProc=4, chunkSize=10, verbose=False, doFigures=True): """Run the model build step in the chemical component model workflow. Args: alignType (str): "relaxed"|"strict"| relaxed-stereo". Default: relaxed-stereo numProc (int, optional): number of processes to invoke. Defaults to 4. chunkSize (int, optional): work chunksize. Defaults to 10. verbose (bool, optional): verbose logging. Defaults to False. Returns: (dict): {searchId: [{"targetId": , "modelId": , "modelPath": ,"matchId": , "parentId": , "rFactor": , }] """ retD = {} try: mU = MarshalUtil(workPath=self.__cachePath) ccms = CODModelSearch(self.__cachePath, prefix=self.__prefix) modelDirPath = self.getModelDirFilePath() imageDirPath = self.getModelImageDirFilePath() # tD = ccms.getResultIndex() # Make parent index --- idxIdD = {} for idxId, iDL in tD.items(): pId = idxId.split("|")[0] idxIdD.setdefault(pId, []).extend(iDL) # idxIdL = list(idxIdD.keys()) midxIdL = [] for pId in idxIdL: fp = os.path.join(modelDirPath, pId, "model-index.json") if mU.exists(fp): # Skip empty indices fst = os.stat(fp) if fst.st_size > 10: continue midxIdL.append(pId) # logger.info( "Starting COD model build using (%d) from a total of results length (%d)", len(midxIdL), len(idxIdD)) # cmbw = CODModelBuildWorker(self.__cachePath, verbose=verbose, timeOut=self.__timeOut) mpu = MultiProcUtil(verbose=True) mpu.setWorkingDir(modelDirPath) mpu.setOptions( optionsD={ "modelDirPath": modelDirPath, "imageDirPath": imageDirPath, "alignType": alignType, "ccSIdxP": self.__ccSIdxP, "idxIdD": idxIdD, "oesmP": self.__oesmP, "ccmP": self.__ccmP, "doFigures": doFigures, }) # mpu.set(workerObj=cmbw, workerMethod="build") ok, failList, resultList, _ = mpu.runMulti(dataList=midxIdL, numProc=numProc, numResults=1, chunkSize=chunkSize) logger.info( "Run ended with status %r success count %d failures %r", ok, len(resultList[0]), len(failList)) successList = copy.copy(resultList[0]) # if successList: logger.info("Completed build with %d models ", len(successList)) else: logger.info("No models built") # # Build full index - # logger.info("Building full model index") for pId in idxIdL: fp = os.path.join(modelDirPath, pId, "model-index.json") if mU.exists(fp): tDL = mU.doImport(fp, fmt="json") for tD in tDL: retD.setdefault(tD["parentId"], []).append(tD) # retD = dict(sorted(retD.items())) logger.info("Storing models for %d parent components", len(retD)) ok = self.storeModelIndex(retD) except Exception as e: logger.exception("Failing with %s", str(e)) return retD
class RepositoryProvider(object): def __init__(self, cfgOb, cachePath=None, numProc=8, fileLimit=None, verbose=False): self.__fileLimit = fileLimit self.__numProc = numProc self.__verbose = verbose self.__cfgOb = cfgOb self.__configName = self.__cfgOb.getDefaultSectionName() self.__topCachePath = cachePath if cachePath else "." self.__cachePath = os.path.join( self.__topCachePath, self.__cfgOb.get("REPO_UTIL_CACHE_DIR", sectionName=self.__configName)) # self.__mU = MarshalUtil(workPath=self.__cachePath) # self.__ccPathD = None # self.__mpFormat = "[%(levelname)s] %(asctime)s %(processName)s-%(module)s.%(funcName)s: %(message)s" def getLocatorObjList(self, contentType, inputPathList=None, mergeContentTypes=None): """Convenience method to get the data path list for the input repository content type. Args: contentType (str): Repository content type (e.g. pdbx, chem_comp, bird, ...) inputPathList (list, optional): path list that will be returned if provided. mergeContentTypes (list, optional): repository content types to combined with the primary content type. Returns: Obj list: data file paths or tuple of file paths """ inputPathList = inputPathList if inputPathList else [] if inputPathList: return self.getLocatorObjListWithInput( contentType, inputPathList=inputPathList, mergeContentTypes=mergeContentTypes) # if mergeContentTypes and "vrpt" in mergeContentTypes and contentType in [ "pdbx", "pdbx_core" ]: dictPath = os.path.join( self.__topCachePath, self.__cfgOb.get( "DICTIONARY_CACHE_DIR", sectionName=self.__cfgOb.getDefaultSectionName())) os.environ["_RP_DICT_PATH_"] = dictPath locatorList = self.getEntryLocatorObjList( mergeContentTypes=mergeContentTypes) else: locatorList = self.__getLocatorList(contentType, inputPathList=inputPathList) return locatorList def getLocatorObjListWithInput(self, contentType, inputPathList=None, mergeContentTypes=None): """Convenience method to get the data path list for the input repository content type. Args: contentType (str): Repository content type (e.g. pdbx, chem_comp, bird, ...) inputPathList (list, optional): path list that will be returned if provided. mergeContentTypes (list, optional): repository content types to combined with the primary content type. 
Returns: Obj list: data file paths or tuple of file paths """ inputPathList = inputPathList if inputPathList else [] locatorList = self.__getLocatorList(contentType, inputPathList=inputPathList) # JDW move the following to config if mergeContentTypes and "vrpt" in mergeContentTypes and contentType in [ "pdbx", "pdbx_core" ]: dictPath = os.path.join( self.__topCachePath, self.__cfgOb.get( "DICTIONARY_CACHE_DIR", sectionName=self.__cfgOb.getDefaultSectionName())) os.environ["_RP_DICT_PATH_"] = dictPath # locObjL = [] for locator in locatorList: if isinstance(locator, str): kwD = HashableDict({}) oL = [ HashableDict({ "locator": locator, "fmt": "mmcif", "kwargs": kwD }) ] for mergeContentType in mergeContentTypes: _, fn = os.path.split(locator) idCode = fn[:4] if fn and len(fn) >= 8 else None mergeLocator = self.__getLocator( mergeContentType, idCode, checkExists=True) if idCode else None if mergeLocator: # kwD = HashableDict({"marshalHelper": vrd.toCif}) kwD = HashableDict({"marshalHelper": toCifWrapper}) oL.append( HashableDict({ "locator": mergeLocator, "fmt": "xml", "kwargs": kwD })) lObj = tuple(oL) else: logger.error("Unexpected output locator type %r", locator) lObj = locator locObjL.append(lObj) # locatorList = locObjL # - return locatorList def getContainerList(self, locatorObjList): """ Return the data container list obtained by parsing the input locator object list. """ cL = [] for locatorObj in locatorObjList: myContainerList = self.__mergeContainers(locatorObj, fmt="mmcif", mergeTarget=0) for cA in myContainerList: cL.append(cA) return cL def __mergeContainers(self, locatorObj, fmt="mmcif", mergeTarget=0): """ Consolidate content in auxiliary files locatorObj[1:] into locatorObj[0] container index 'mergeTarget'. """ # cL = [] try: if isinstance(locatorObj, str): cL = self.__mU.doImport(locatorObj, fmt=fmt) return cL if cL else [] elif isinstance(locatorObj, (list, tuple)) and locatorObj: dD = locatorObj[0] kw = dD["kwargs"] cL = self.__mU.doImport(dD["locator"], fmt=dD["fmt"], **kw) if cL: for dD in locatorObj[1:]: kw = dD["kwargs"] rObj = self.__mU.doImport(dD["locator"], fmt=dD["fmt"], **kw) mergeL = rObj if rObj else [] for mc in mergeL: cL[mergeTarget].merge(mc) # return cL else: return [] except Exception as e: logger.exception("Failing for %r with %s", locatorObj, str(e)) return cL def getLocatorsFromPaths(self, locatorObjList, pathList, locatorIndex=0): """ Return locator objects with paths (locatorObjIndex) matching the input pathList. 
""" # index the input locatorObjList rL = [] try: if locatorObjList and isinstance(locatorObjList[0], str): return pathList # locIdx = {} for ii, locatorObj in enumerate(locatorObjList): if "locator" in locatorObj[locatorIndex]: locIdx[locatorObj[locatorIndex]["locator"]] = ii # for pth in pathList: jj = locIdx[pth] if pth in locIdx else None if jj is not None: rL.append(locatorObjList[jj]) except Exception as e: logger.exception("Failing with %s", str(e)) # return rL def getLocatorPaths(self, locatorObjList, locatorIndex=0): try: if locatorObjList and isinstance(locatorObjList[0], str): return locatorObjList else: return [ locatorObj[locatorIndex]["locator"] for locatorObj in locatorObjList ] except Exception as e: logger.exception("Failing with %s", str(e)) return [] def __getLocatorList(self, contentType, inputPathList=None): """ Internal convenience method to return repository path list by content type: """ outputPathList = [] inputPathList = inputPathList if inputPathList else [] try: if contentType in ["bird", "bird_core"]: outputPathList = inputPathList if inputPathList else self.getBirdPathList( ) elif contentType == "bird_family": outputPathList = inputPathList if inputPathList else self.getBirdFamilyPathList( ) elif contentType in ["chem_comp"]: outputPathList = inputPathList if inputPathList else self.getChemCompPathList( ) elif contentType in ["bird_chem_comp"]: outputPathList = inputPathList if inputPathList else self.getBirdChemCompPathList( ) elif contentType in ["pdbx", "pdbx_core"]: outputPathList = inputPathList if inputPathList else self.getEntryPathList( ) elif contentType in [ "chem_comp_core", "bird_consolidated", "bird_chem_comp_core" ]: outputPathList = inputPathList if inputPathList else self.mergeBirdAndChemCompRefData( ) elif contentType in ["ihm_dev", "ihm_dev_core", "ihm_dev_full"]: outputPathList = inputPathList if inputPathList else self.getIhmDevPathList( ) elif contentType in [ "pdb_distro", "da_internal", "status_history" ]: outputPathList = inputPathList if inputPathList else [] else: logger.warning("Unsupported contentType %s", contentType) except Exception as e: logger.exception("Failing with %s", str(e)) if self.__fileLimit: outputPathList = outputPathList[:self.__fileLimit] return sorted(outputPathList) def __getLocator(self, contentType, idCode, version="v1-0", checkExists=False): """ Convenience method to return repository path for a content type and cardinal identifier. 
""" pth = None try: idCodel = idCode.lower() if contentType == "bird": pth = os.path.join(self.__getRepoTopPath(contentType), idCode[-1], idCode + ".cif") elif contentType == "bird_family": pth = os.path.join(self.__getRepoTopPath(contentType), idCode[-1], idCode + ".cif") elif contentType in ["chem_comp", "chem_comp_core"]: pth = os.path.join(self.__getRepoTopPath(contentType), idCode[0], idCode, idCode + ".cif") elif contentType in ["bird_chem_comp"]: pth = os.path.join(self.__getRepoTopPath(contentType), idCode[-1], idCode + ".cif") elif contentType in ["pdbx", "pdbx_core"]: pth = os.path.join(self.__getRepoTopPath(contentType), idCodel[1:3], idCodel + ".cif.gz") elif contentType in ["bird_consolidated", "bird_chem_comp_core"]: pth = os.path.join(self.__getRepoTopPath(contentType), idCode + ".cif") elif contentType in ["ihm_dev", "ihm_dev_core", "ihm_dev_full"]: pth = os.path.join(self.__getRepoTopPath(contentType), idCode, idCode + "_model_%s.cif.gz" % version) elif contentType in [ "pdb_distro", "da_internal", "status_history" ]: pass elif contentType in ["vrpt"]: pth = os.path.join(self.__getRepoTopPath(contentType), idCodel[1:3], idCodel, idCodel + "_validation.xml.gz") else: logger.warning("Unsupported contentType %s", contentType) except Exception as e: logger.exception("Failing with %s", str(e)) if checkExists: pth = pth if self.__mU.exists(pth) else None return pth def __getRepoTopPath(self, contentType): """ Convenience method to return repository top path from configuration data. """ pth = None try: if contentType == "bird": pth = self.__cfgOb.getPath("BIRD_REPO_PATH", sectionName=self.__configName) elif contentType == "bird_family": pth = self.__cfgOb.getPath("BIRD_FAMILY_REPO_PATH", sectionName=self.__configName) elif contentType in ["chem_comp", "chem_comp_core"]: pth = self.__cfgOb.getPath("CHEM_COMP_REPO_PATH", sectionName=self.__configName) elif contentType in ["bird_chem_comp"]: pth = self.__cfgOb.getPath("BIRD_CHEM_COMP_REPO_PATH", sectionName=self.__configName) elif contentType in ["pdbx", "pdbx_core"]: pth = self.__cfgOb.getPath("PDBX_REPO_PATH", sectionName=self.__configName) elif contentType in ["bird_consolidated", "bird_chem_comp_core"]: pth = self.__cachePath elif contentType in ["ihm_dev", "ihm_dev_core", "ihm_dev_full"]: pth = self.__cfgOb.getPath("IHM_DEV_REPO_PATH", sectionName=self.__configName) elif contentType in [ "pdb_distro", "da_internal", "status_history" ]: pass elif contentType in ["vrpt"]: pth = self.__cfgOb.getEnvValue("VRPT_REPO_PATH_ENV", sectionName=self.__configName, default=None) if pth is None: pth = self.__cfgOb.getPath("VRPT_REPO_PATH", sectionName=self.__configName) else: logger.debug( "Using validation report path from environment assignment %s", pth) else: logger.warning("Unsupported contentType %s", contentType) except Exception as e: logger.exception("Failing with %s", str(e)) return pth def _chemCompPathWorker(self, dataList, procName, optionsD, workingDir): """ Return the list of chemical component definition file paths in the current repository. 
""" _ = procName _ = workingDir topRepoPath = optionsD["topRepoPath"] pathList = [] for subdir in dataList: dd = os.path.join(topRepoPath, subdir) for root, _, files in walk(dd, topdown=False): if "REMOVE" in root: continue for name in files: if name.endswith(".cif") and len(name) <= 7: pathList.append(os.path.join(root, name)) return dataList, pathList, [] def getChemCompPathList(self): return self.__getChemCompPathList(self.__getRepoTopPath("chem_comp"), numProc=self.__numProc) def __getChemCompPathList(self, topRepoPath, numProc=8): """Get the path list for the chemical component definition repository """ ts = time.strftime("%Y %m %d %H:%M:%S", time.localtime()) logger.debug("Starting at %s", ts) startTime = time.time() pathList = [] try: dataS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" dataList = [a for a in dataS] optD = {} optD["topRepoPath"] = topRepoPath mpu = MultiProcUtil(verbose=self.__verbose) mpu.setOptions(optionsD=optD) mpu.set(workerObj=self, workerMethod="_chemCompPathWorker") _, _, retLists, _ = mpu.runMulti(dataList=dataList, numProc=numProc, numResults=1) pathList = retLists[0] endTime0 = time.time() logger.debug("Path list length %d in %.4f seconds", len(pathList), endTime0 - startTime) except Exception as e: logger.exception("Failing with %s", str(e)) return self.__applyFileLimit(pathList) def _entryLocatorObjWithMergeWorker(self, dataList, procName, optionsD, workingDir): """ Return the list of entry locator objects including merge content in the current repository. """ _ = procName _ = workingDir topRepoPath = optionsD["topRepoPath"] mergeContentTypes = optionsD["mergeContentTypes"] locatorObjList = [] for subdir in dataList: dd = os.path.join(topRepoPath, subdir) for root, _, files in walk(dd, topdown=False): if "REMOVE" in root: continue for fn in files: if (fn.endswith(".cif.gz") and len(fn) == 11) or (fn.endswith(".cif") and len(fn) == 8): locator = os.path.join(root, fn) kwD = HashableDict({}) oL = [ HashableDict({ "locator": locator, "fmt": "mmcif", "kwargs": kwD }) ] for mergeContentType in mergeContentTypes: idCode = fn[:4] if fn and len(fn) >= 8 else None mergeLocator = self.__getLocator( mergeContentType, idCode, checkExists=True) if idCode else None if mergeLocator: kwD = HashableDict( {"marshalHelper": toCifWrapper}) oL.append( HashableDict({ "locator": mergeLocator, "fmt": "xml", "kwargs": kwD })) lObj = tuple(oL) locatorObjList.append(lObj) return dataList, locatorObjList, [] def getEntryLocatorObjList(self, mergeContentTypes=None): return self.__getEntryLocatorObjList( self.__getRepoTopPath("pdbx"), numProc=self.__numProc, mergeContentTypes=mergeContentTypes) def __getEntryLocatorObjList(self, topRepoPath, numProc=8, mergeContentTypes=None): """Get the path list for structure entries in the input repository """ ts = time.strftime("%Y %m %d %H:%M:%S", time.localtime()) logger.debug("Starting at %s", ts) startTime = time.time() pathList = [] try: dataList = [] anL = "abcdefghijklmnopqrstuvwxyz0123456789" for a1 in anL: for a2 in anL: hc = a1 + a2 dataList.append(hc) hc = a2 + a1 dataList.append(hc) dataList = list(set(dataList)) # optD = {} optD["topRepoPath"] = topRepoPath optD["mergeContentTypes"] = mergeContentTypes mpu = MultiProcUtil(verbose=self.__verbose) mpu.setOptions(optionsD=optD) mpu.set(workerObj=self, workerMethod="_entryLocatorObjWithMergeWorker") _, _, retLists, _ = mpu.runMulti(dataList=dataList, numProc=numProc, numResults=1) pathList = retLists[0] endTime0 = time.time() logger.debug("Locator object list length %d in %.4f seconds", 
len(pathList), endTime0 - startTime) except Exception as e: logger.exception("Failing with %s", str(e)) return self.__applyFileLimit(pathList) def _entryPathWorker(self, dataList, procName, optionsD, workingDir): """ Return the list of entry file paths in the current repository. """ _ = procName _ = workingDir topRepoPath = optionsD["topRepoPath"] pathList = [] for subdir in dataList: dd = os.path.join(topRepoPath, subdir) for root, _, files in walk(dd, topdown=False): if "REMOVE" in root: continue for name in files: if (name.endswith(".cif.gz") and len(name) == 11) or (name.endswith(".cif") and len(name) == 8): pathList.append(os.path.join(root, name)) return dataList, pathList, [] def getEntryPathList(self): return self.__getEntryPathList(self.__getRepoTopPath("pdbx"), numProc=self.__numProc) def __getEntryPathList(self, topRepoPath, numProc=8): """Get the path list for structure entries in the input repository """ ts = time.strftime("%Y %m %d %H:%M:%S", time.localtime()) logger.debug("Starting at %s", ts) startTime = time.time() pathList = [] try: dataList = [] anL = "abcdefghijklmnopqrstuvwxyz0123456789" for a1 in anL: for a2 in anL: hc = a1 + a2 dataList.append(hc) hc = a2 + a1 dataList.append(hc) dataList = list(set(dataList)) # optD = {} optD["topRepoPath"] = topRepoPath mpu = MultiProcUtil(verbose=self.__verbose) mpu.setOptions(optionsD=optD) mpu.set(workerObj=self, workerMethod="_entryPathWorker") _, _, retLists, _ = mpu.runMulti(dataList=dataList, numProc=numProc, numResults=1) pathList = retLists[0] endTime0 = time.time() logger.debug("Path list length %d in %.4f seconds", len(pathList), endTime0 - startTime) except Exception as e: logger.exception("Failing with %s", str(e)) return self.__applyFileLimit(pathList) def getBirdPathList(self): return self.__getBirdPathList(self.__getRepoTopPath("bird")) def __getBirdPathList(self, topRepoPath): """ Return the list of definition file paths in the current repository. List is ordered in increasing PRD ID numerical code. """ pathList = [] try: sd = {} for root, _, files in os.walk(topRepoPath, topdown=False): if "REMOVE" in root: continue for name in files: if name.startswith("PRD_") and name.endswith( ".cif") and len(name) <= 14: pth = os.path.join(root, name) sd[int(name[4:-4])] = pth # for k in sorted(sd.keys()): pathList.append(sd[k]) except Exception as e: logger.exception("Failing with %s", str(e)) # return self.__applyFileLimit(pathList) def getBirdFamilyPathList(self): return self.__getBirdFamilyPathList( self.__getRepoTopPath("bird_family")) def __getBirdFamilyPathList(self, topRepoPath): """ Return the list of definition file paths in the current repository. List is ordered in increasing PRD ID numerical code. """ pathList = [] try: sd = {} for root, _, files in os.walk(topRepoPath, topdown=False): if "REMOVE" in root: continue for name in files: if name.startswith("FAM_") and name.endswith( ".cif") and len(name) <= 14: pth = os.path.join(root, name) sd[int(name[4:-4])] = pth # for k in sorted(sd.keys()): pathList.append(sd[k]) except Exception as e: logger.exception("Failing with %s", str(e)) # return self.__applyFileLimit(pathList) def getBirdChemCompPathList(self): return self.__getBirdChemCompPathList( self.__getRepoTopPath("bird_chem_comp")) def __getBirdChemCompPathList(self, topRepoPath): """ Return the list of definition file paths in the current repository. List is ordered in increasing PRD ID numerical code. 
""" pathList = [] try: sd = {} for root, _, files in os.walk(topRepoPath, topdown=False): if "REMOVE" in root: continue for name in files: if name.startswith("PRDCC_") and name.endswith( ".cif") and len(name) <= 16: pth = os.path.join(root, name) sd[int(name[6:-4])] = pth # for k in sorted(sd.keys()): pathList.append(sd[k]) except Exception as e: logger.exception("Failing with %s", str(e)) # return self.__applyFileLimit(pathList) def __applyFileLimit(self, pathList): logger.debug("Length of file path list %d (limit %r)", len(pathList), self.__fileLimit) if self.__fileLimit: return pathList[:self.__fileLimit] else: return pathList def __buildFamilyIndex(self): """ Using information from the PRD family definition: # loop_ _pdbx_reference_molecule_list.family_prd_id _pdbx_reference_molecule_list.prd_id FAM_000010 PRD_000041 FAM_000010 PRD_000042 FAM_000010 PRD_000043 FAM_000010 PRD_000044 FAM_000010 PRD_000048 FAM_000010 PRD_000049 FAM_000010 PRD_000051 # """ prdD = {} try: pthL = self.__getLocatorList("bird_family") for pth in pthL: containerL = self.__mU.doImport(pth, fmt="mmcif") for container in containerL: catName = "pdbx_reference_molecule_list" if container.exists(catName): catObj = container.getObj(catName) for ii in range(catObj.getRowCount()): familyPrdId = catObj.getValue( attributeName="family_prd_id", rowIndex=ii) prdId = catObj.getValue(attributeName="prd_id", rowIndex=ii) if prdId in prdD: logger.debug( "duplicate prdId in family index %s %s", prdId, familyPrdId) prdD[prdId] = { "familyPrdId": familyPrdId, "c": container } except Exception as e: logger.exception("Failing with %s", str(e)) return prdD def __buildBirdCcIndex(self): """ Using information from the PRD pdbx_reference_molecule category to index the BIRDs corresponding small molecule correspondences """ prdD = {} ccPathD = {} prdStatusD = {} try: ccPathL = self.__getLocatorList("chem_comp") ccPathD = {} for ccPath in ccPathL: _, fn = os.path.split(ccPath) ccId, _ = os.path.splitext(fn) ccPathD[ccId] = ccPath logger.debug("ccPathD length %d", len(ccPathD)) pthL = self.__getLocatorList("bird") for pth in pthL: containerL = self.__mU.doImport(pth, fmt="mmcif") for container in containerL: catName = "pdbx_reference_molecule" if container.exists(catName): catObj = container.getObj(catName) ii = 0 prdId = catObj.getValue(attributeName="prd_id", rowIndex=ii) relStatus = catObj.getValue( attributeName="release_status", rowIndex=ii) prdStatusD[prdId] = relStatus if relStatus != "REL": continue prdRepType = catObj.getValue( attributeName="represent_as", rowIndex=ii) logger.debug("represent as %r", prdRepType) if prdRepType in ["single molecule"]: ccId = catObj.getValueOrDefault( attributeName="chem_comp_id", rowIndex=ii, defaultValue=None) # prdId = catObj.getValue(attributeName="prd_id", rowIndex=ii) logger.debug("mapping prdId %r ccId %r", prdId, ccId) if ccId and ccId in ccPathD: prdD[prdId] = { "ccId": ccId, "ccPath": ccPathD[ccId] } ccPathD[ccPathD[ccId]] = { "ccId": ccId, "prdId": prdId } else: logger.error("Bad ccId %r for BIRD %r", ccId, prdId) except Exception as e: logger.exception("Failing with %s", str(e)) return prdD, ccPathD, prdStatusD # - def mergeBirdAndChemCompRefData(self): prdSmallMolCcD, ccPathD, prdStatusD = self.__buildBirdCcIndex() logger.info("PRD to CCD index length %d CCD map path length %d", len(prdSmallMolCcD), len(ccPathD)) outputPathList = self.mergeBirdRefData(prdSmallMolCcD, prdStatusD) ccOutputPathList = [ pth for pth in self.getChemCompPathList() if pth not in ccPathD ] 
outputPathList.extend(ccOutputPathList) return outputPathList def mergeBirdRefData(self, prdSmallMolCcD, prdStatusD): """ Consolidate all of the bird reference data in a single container. If the BIRD is a 'small molecule' type then also merge with the associated CC definition. Store the merged data in the REPO_UTIL cache path and ... Return a path list for the consolidated data files - """ outPathList = [] try: birdPathList = self.__getLocatorList("bird") birdPathD = {} for birdPath in birdPathList: _, fn = os.path.split(birdPath) prdId, _ = os.path.splitext(fn) birdPathD[prdId] = birdPath # logger.debug("BIRD data length %d", len(birdPathD)) logger.debug("BIRD keys %r", list(birdPathD.keys())) birdCcPathList = self.__getLocatorList("bird_chem_comp") birdCcPathD = {} for birdCcPath in birdCcPathList: _, fn = os.path.split(birdCcPath) prdCcId, _ = os.path.splitext(fn) prdId = "PRD_" + prdCcId[6:] birdCcPathD[prdId] = birdCcPath # logger.debug("BIRD CC data length %d", len(birdCcPathD)) logger.debug("BIRD CC keys %r", list(birdCcPathD.keys())) fD = self.__buildFamilyIndex() logger.debug("Family index length %d", len(fD)) logger.debug("Family index keys %r", list(fD.keys())) logger.debug("PRD to CCD small mol index length %d", len(prdSmallMolCcD)) # for prdId in birdPathD: if prdId in prdStatusD and prdStatusD[prdId] != "REL": continue fp = os.path.join(self.__cachePath, prdId + ".cif") logger.debug("Export cache path is %r", fp) # pth2 = birdPathD[prdId] cL = self.__mU.doImport(pth2, fmt="mmcif") cFull = cL[0] logger.debug("Got Bird %r", cFull.getName()) # # ccBird = None ccD = None if prdId in prdSmallMolCcD: pthCc = prdSmallMolCcD[prdId]["ccPath"] cL = self.__mU.doImport(pthCc, fmt="mmcif") ccD = cL[0] logger.debug("Got corresponding CCD %r", ccD.getName()) elif prdId in birdCcPathD: pth1 = birdCcPathD[prdId] c1L = self.__mU.doImport(pth1, fmt="mmcif") ccBird = c1L[0] logger.debug("Got ccBird %r", ccBird.getName()) # cFam = None if prdId in fD: cFam = fD[prdId]["c"] logger.debug("Got cFam %r", cFam.getName()) # if ccD: for catName in ccD.getObjNameList(): cFull.append(ccD.getObj(catName)) # if ccBird: for catName in ccBird.getObjNameList(): cFull.append(ccBird.getObj(catName)) if cFam: for catName in cFam.getObjNameList(): cFull.append(cFam.getObj(catName)) # self.__mU.doExport(fp, [cFull], fmt="mmcif") outPathList.append(fp) except Exception as e: logger.exception("Failing with %s", str(e)) # return outPathList # def __exportConfig(self, container): """ - CATEGORY_NAME: diffrn_detector ATTRIBUTE_NAME_LIST: - pdbx_frequency - CATEGORY_NAME: pdbx_serial_crystallography_measurement ATTRIBUTE_NAME_LIST: - diffrn_id - pulse_energy - pulse_duration - xfel_pulse_repetition_rate """ for catName in container.getObjNameList(): cObj = container.getObj(catName) print("- CATEGORY_NAME: %s" % catName) print(" ATTRIBUTE_NAME_LIST:") for atName in cObj.getAttributeList(): print(" - %s" % atName) return True def getIhmDevPathList(self): return self.__getIhmDevPathList(self.__getRepoTopPath("ihm_dev")) def __getIhmDevPathList(self, topRepoPath): """ Return the list of I/HM entries in the current repository. File name template is: PDBDEV_0000 0020_model_v1-0.cif.gz List is ordered in increasing PRDDEV numerical code. 
""" pathList = [] logger.debug("Searching path %r", topRepoPath) try: sd = {} for root, _, files in os.walk(topRepoPath, topdown=False): if "REMOVE" in root: continue for name in files: if name.startswith("PDBDEV_") and name.endswith( ".cif.gz") and len(name) <= 50: pth = os.path.join(root, name) sd[int(name[7:15])] = pth # for k in sorted(sd.keys()): pathList.append(sd[k]) except Exception as e: logger.exception("Failing search in %r with %s", topRepoPath, str(e)) # return self.__applyFileLimit(pathList)
class Scop2ClassificationProvider(StashableBase): """Extract SCOP2 domain assignments, term descriptions and SCOP classification hierarchy from SCOP and SCOP2B flat files. """ def __init__(self, cachePath, useCache, **kwargs): # _ = kwargs self.__cachePath = cachePath dirName = "scop2" self.__dirPath = os.path.join(self.__cachePath, dirName) self.__useCache = useCache super(Scop2ClassificationProvider, self).__init__(self.__cachePath, [dirName]) # self.__version = "latest" self.__fmt = "pickle" self.__mU = MarshalUtil(workPath=self.__dirPath) self.__nD, self.__ntD, self.__pAD, self.__pBD, self.__pBRootD, self.__fD, self.__sfD, self.__sf2bD = self.__reload( useCache=self.__useCache, fmt=self.__fmt) # if not useCache and not self.testCache(): ok = self.__fetchFromBackup() if ok: self.__nD, self.__ntD, self.__pAD, self.__pBD, self.__pBRootD, self.__fD, self.__sfD, self.__sf2bD = self.__reload( useCache=True, fmt=self.__fmt) # def testCache(self): logger.info( "SCOP2 lengths nD %d pAD %d pBD %d pBRootD %d fD %d sfD %d sf2bD %d", len(self.__nD), len(self.__pAD), len(self.__pBD), len(self.__pBRootD), len(self.__fD), len(self.__sfD), len(self.__sf2bD)) if (len(self.__nD) > 9000) and (len(self.__pAD) > 70000): return True return False def getVersion(self): """Returns the SCOP2 version""" return self.__version def getFamilyIds(self, pdbId, authAsymId): try: return list( set([tup[1] for tup in self.__fD[(pdbId.upper(), authAsymId)]])) except Exception as e: logger.debug("Failing for %r %r with %s", pdbId, authAsymId, str(e)) return [] def getSuperFamilyIds(self, pdbId, authAsymId): try: return list( set([ tup[1] for tup in self.__sfD[(pdbId.upper(), authAsymId)] ])) except Exception as e: logger.debug("Failing for %r %r with %s", pdbId, authAsymId, str(e)) return [] def getFamilyNames(self, pdbId, authAsymId): try: return list( set([ self.__nD[tup[1]] for tup in self.__fD[(pdbId.upper(), authAsymId)] ])) except Exception as e: logger.debug("Failing for %r %r with %s", pdbId, authAsymId, str(e)) return [] def getSuperFamilyNames(self, pdbId, authAsymId): try: return list( set([ self.__nD[tup[1]] for tup in self.__sfD[(pdbId.upper(), authAsymId)] ])) except Exception as e: logger.debug("Failing for %r %r with %s", pdbId, authAsymId, str(e)) return [] def getFamilyResidueRanges(self, pdbId, authAsymId): try: # s/fD.setdefault((pdbId, authAsymId), []).append((domSuperFamilyId, authAsymId, authSeqBeg, authSeqEnd)) return [(tup[0], tup[1], tup[2], tup[3], tup[4]) for tup in self.__fD[(pdbId.upper(), authAsymId)]] except Exception as e: logger.debug("Failing for %r %r with %s", pdbId, authAsymId, str(e)) return [] def getSuperFamilyResidueRanges(self, pdbId, authAsymId): try: return [(tup[0], tup[1], tup[2], tup[3], tup[4]) for tup in self.__sfD[(pdbId.upper(), authAsymId)]] except Exception as e: logger.debug("Failing for %r %r with %s", pdbId, authAsymId, str(e)) return [] def getSuperFamilyNames2B(self, pdbId, authAsymId): try: return list( set([ self.__nD[tup[1]] for tup in self.__sf2bD[(pdbId.upper(), authAsymId)] ])) except Exception as e: logger.debug("Failing for %r %r with %s", pdbId, authAsymId, str(e)) return [] def getSuperFamilyIds2B(self, pdbId, authAsymId): try: return list( set([ tup[1] for tup in self.__sf2bD[(pdbId.upper(), authAsymId)] ])) except Exception as e: logger.debug("Failing for %r %r with %s", pdbId, authAsymId, str(e)) return [] def getSuperFamilyResidueRanges2B(self, pdbId, authAsymId): try: return [(tup[0], tup[1], tup[2], tup[3], tup[4]) for tup in 
                    self.__sf2bD[(pdbId.upper(), authAsymId)]]
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId, str(e))
        return []

    def getName(self, domId):
        try:
            return self.__nD[domId]
        except Exception:
            logger.debug("Undefined SCOP2 id %r", domId)
        return None

    def getNameType(self, domId):
        qD = {"TP": "Protein Type", "CL": "Protein Class", "CF": "Fold", "SF": "Superfamily", "FA": "Family"}
        try:
            return qD[self.__ntD[domId]]
        except Exception:
            logger.debug("Undefined SCOP2 id %r", domId)
        return None

    def getIdLineage(self, domId):
        pS = set()
        try:
            pS.add(domId)
            pt = self.__pAD[domId]
            while (pt is not None) and (pt != 0):
                pS.add(pt)
                pt = self.__pAD[pt]
            #
            pt = self.__pBD[domId]
            while (pt is not None) and (pt != 0):
                pS.add(pt)
                pt = self.__pBD[pt]
        except Exception as e:
            logger.debug("Failing for %r with %s", domId, str(e))
        #
        return sorted(pS)

    def getNameLineage(self, domId):
        try:
            nL = []
            for dId in self.getIdLineage(domId):
                tN = self.getName(dId)
                tN = tN if tN else "Unnamed"
                nL.append(tN)
            return nL
        except Exception as e:
            logger.debug("Failing for %r with %s", domId, str(e))
        return None

    def getTreeNodeList(self):
        tnL = self.__exportTreeNodeList(self.__nD, self.__pAD, self.__pBRootD)
        return tnL

    def __getAssignmentFileName(self, fmt="json"):
        ext = "json" if fmt == "json" else "pic"
        fn = "scop2_domain_assignments.%s" % ext
        return fn

    def __reload(self, useCache=True, fmt="json"):
        nD = ntD = pAD = pBD = pBRootD = fD = sfD = sf2bD = {}
        fn = self.__getAssignmentFileName(fmt=fmt)
        assignmentPath = os.path.join(self.__dirPath, fn)
        self.__mU.mkdir(self.__dirPath)
        #
        if useCache and self.__mU.exists(assignmentPath):
            sD = self.__mU.doImport(assignmentPath, fmt=fmt)
            logger.debug("Domain name count %d", len(sD["names"]))
            self.__version = sD["version"]
            nD = sD["names"]
            ntD = sD["nametypes"]
            pAD = sD["parentsType"]
            pBD = sD["parentsClass"]
            pBRootD = sD["parentsClassRoot"]
            fD = sD["families"]
            sfD = sD["superfamilies"]
            sf2bD = sD["superfamilies2b"]
        elif not useCache:
            nmL, dmL, scop2bL, _ = self.__fetchFromSource()
            #
            ok = False
            nD = self.__extractNames(nmL)
            logger.info("Domain name dictionary (%d)", len(nD))
            pAD, pBD, pBRootD, ntD, fD, sfD, domToSfD = self.__extractDomainHierarchy(dmL)
            #
            logger.info("Domain node parent hierarchy (protein type) (%d)", len(pAD))
            logger.info("Domain node parent hierarchy (structural class) (%d)", len(pBD))
            logger.info("Domain node parent hierarchy (structural class root) (%d)", len(pBRootD))
            logger.info("SCOP2 core domain assignments (family %d) (sf %d)", len(fD), len(sfD))
            #
            sf2bD = self.__extractScop2bSuperFamilyAssignments(scop2bL, domToSfD)
            logger.info("SCOP2B SF domain assignments (%d)", len(sf2bD))
            #
            tS = datetime.datetime.now().isoformat()
            # vS = datetime.datetime.now().strftime("%Y-%m-%d")
            vS = self.__version
            sD = {
                "version": vS,
                "created": tS,
                "names": nD,
                "nametypes": ntD,
                "parentsType": pAD,
                "parentsClass": pBD,
                "parentsClassRoot": pBRootD,
                "families": fD,
                "superfamilies": sfD,
                "superfamilies2b": sf2bD,
            }
            ok = self.__mU.doExport(assignmentPath, sD, fmt=fmt, indent=3)
            logger.info("Cache save status %r", ok)
        #
        return nD, ntD, pAD, pBD, pBRootD, fD, sfD, sf2bD

    def __fetchFromBackup(self, fmt="json"):
        urlTarget = "https://raw.githubusercontent.com/rcsb/py-rcsb_exdb_assets/master/fall_back/SCOP2"
        #
        fn = self.__getAssignmentFileName(fmt=fmt)
        assignmentPath = os.path.join(self.__dirPath, fn)
        urlPath = os.path.join(urlTarget, fn)
        # create the cache directory (not the target file path)
        self.__mU.mkdir(self.__dirPath)
        #
        logger.info("Using backup URL %r", urlPath)
        fU = FileUtil()
        ok = fU.get(urlPath, assignmentPath)
        return ok
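
    # Illustrative sketch of the cache-or-rebuild pattern used by __reload() above,
    # with plain json standing in for MarshalUtil; the file name follows
    # __getAssignmentFileName() and "buildFunc" is a hypothetical callable.
    def _demoReloadPattern(dirPath, useCache, buildFunc):
        import json
        import os
        assignmentPath = os.path.join(dirPath, "scop2_domain_assignments.json")
        os.makedirs(dirPath, exist_ok=True)
        if useCache and os.path.exists(assignmentPath):
            # fast path - reuse the previously serialized assignment data
            with open(assignmentPath, "r", encoding="utf-8") as ifh:
                return json.load(ifh)
        # rebuild from source and serialize for subsequent runs
        sD = buildFunc()
        with open(assignmentPath, "w", encoding="utf-8") as ofh:
            json.dump(sD, ofh, indent=3)
        return sD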
    def __fetchFromSource(self):
        """Fetch the classification names and domain assignments from SCOP2 and SCOP2B resources.

        SCOP2 domain names:
            https://scop.mrc-lmb.cam.ac.uk/files/scop-des-latest.txt
        SCOP2 domain hierarchy:
            https://scop.mrc-lmb.cam.ac.uk/files/scop-cla-latest.txt
        SIFTS extrapolated SCOP2 and SCOP2B assignments:
            https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_scop2b_sf_uniprot.tsv.gz
            https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_scop2_uniprot.tsv.gz
        """
        urlTargetScop2 = "https://scop.mrc-lmb.cam.ac.uk/files"
        encoding = "utf-8-sig" if sys.version_info[0] > 2 else "ascii"
        fn = "scop-des-latest.txt"
        url = os.path.join(urlTargetScop2, fn)
        desL = self.__mU.doImport(url, fmt="list", uncomment=True, encoding=encoding)
        logger.info("Fetched URL is %s len %d", url, len(desL))
        #
        fn = "scop-cla-latest.txt"
        url = os.path.join(urlTargetScop2, fn)
        claL = self.__mU.doImport(url, fmt="list", uncomment=True, encoding=encoding)
        logger.info("Fetched URL is %s len %d", url, len(claL))
        #
        headerLines = self.__mU.doImport(url, fmt="list", uncomment=False, encoding=encoding)
        self.__version = headerLines[0].split(" ")[3] if headerLines else "2021-05-27"
        # JDW note cert issues with this site
        urlTargetSifts = "http://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv"
        fn = "pdb_chain_scop2b_sf_uniprot.tsv.gz"
        url = os.path.join(urlTargetSifts, fn)
        scop2bL = self.__mU.doImport(url, fmt="tdd", rowFormat="dict", uncomment=True, encoding=encoding)
        logger.info("Fetched URL is %s len %d", url, len(scop2bL))
        #
        fn = "pdb_chain_scop2_uniprot.tsv.gz"
        url = os.path.join(urlTargetSifts, fn)
        scop2L = self.__mU.doImport(url, fmt="tdd", rowFormat="dict", uncomment=True, encoding=encoding)
        logger.info("Fetched URL is %s len %d", url, len(scop2L))
        #
        return desL, claL, scop2bL, scop2L

    def __extractNames(self, nmL):
        """Extract SCOP2 domain identifier and description mappings from the name list."""
        rD = {}
        logger.info("Length of input name list %d", len(nmL))
        for nm in nmL:
            ff = nm.split(" ")
            rD[ff[0]] = " ".join(ff[1:])
        # self.__mU.doExport(os.path.join(self.__dirPath, "scop2-names.json"), rD, fmt="json", indent=3)
        return rD

    def __extractDomainHierarchy(self, dmL):
        """Extract the domain node identifier hierarchy from the SCOP2 representative assignment file ...
Returns: dict, dict, dict, dict, dict: parent and name type dictionaries, family and superfamily assignments, and domain to superfamily mapping ntD[domainId] = name type TP=protein type, CL=protein class, CF=fold, SF=superfamily, FA=family pD[child domain identifier] = parent domain identifier fD[(pdbId, authAsymId)] = [(faDomId, faId, authAsymId, resBeg, resEnd),] sfD[(pdbId, authAsymId)] = [(sfDomId, sfId, authAsymId, resBeg, resEnd),] domToSfD[domSfid] = sfId Example assignment file: # SCOP release 2021-05-27 # http://scop.mrc-lmb.cam.ac.uk # based on PDB release 2021-05-14 # based on UniProt realese 2021-04-08 # based on SIFTS release 2021-05-19 # FA-DOMID FA-PDBID FA-PDBREG FA-UNIID FA-UNIREG SF-DOMID SF-PDBID SF-PDBREG SF-UNIID SF-UNIREG SCOPCLA 8045703 3H8D C:1143-1264 Q64331 1143-1264 8091604 3H8D C:1143-1264 Q64331 1143-1264 TP=1,CL=1000003,CF=2001470,SF=3002524,FA=4004627 8094330 6J56 A:1158-1282 Q9UM54 1167-1291 8094331 6J56 A:1158-1282 Q9UM54 1167-1291 TP=1,CL=1000003,CF=2001470,SF=3002524,FA=4004627 # """ # Build the parent dictionary and name node type ntD = {} pAD = {} pBD = {} pBRootD = {} fD = {} sfD = {} domToSfD = {} # logger.info("Length of input domain assignment list %d", len(dmL)) for dm in dmL: try: ff = dm.split(" ") domFamilyId = ff[0] domSuperFamilyId = ff[5] rngL = ff[10].split(",") tD = {} for rng in rngL: tL = rng.split("=") tD[tL[0]] = tL[1] # # - # pD[tD["TP"]] = 0 # pD[tD["CL"]] = tD["TP"] # pD[tD["CF"]] = tD["CL"] # pD[tD["SF"]] = tD["CF"] # pD[tD["FA"]] = tD["SF"] # pD[domFamilyId] = tD["FA"] # pD[domSuperFamilyId] = tD["SF"] # # Represent as two trees separately rooted in protein type and structural class pAD[tD["TP"]] = 0 pAD[tD["CF"]] = tD["TP"] pAD[tD["SF"]] = tD["CF"] pAD[tD["FA"]] = tD["SF"] pAD[domFamilyId] = tD["FA"] pAD[domSuperFamilyId] = tD["SF"] # # Use this complete pBD here only for generating ID lineages, but NOT for merging with pAD pBD[tD["CL"]] = 0 pBD[tD["CF"]] = tD["CL"] pBD[tD["SF"]] = tD["CF"] pBD[tD["FA"]] = tD["SF"] pBD[domFamilyId] = tD["FA"] pBD[domSuperFamilyId] = tD["SF"] # # Use pBRootD for creating tree node lists; Don't capture any lower branches to avoid re-creating redundant key:values already in pAD pBRootD[tD["CL"]] = 0 pBRootD[tD["CF"]] = tD["CL"] # ntD[tD["FA"]] = "FA" ntD[tD["SF"]] = "SF" ntD[tD["CF"]] = "CF" ntD[tD["CL"]] = "CL" ntD[tD["TP"]] = "TP" # pdbId = ff[1] authAsymId, authSeqBeg, authSeqEnd = self.__parseAssignment( ff[2]) if authAsymId is not None: fD.setdefault((pdbId, authAsymId), []).append( (domFamilyId, tD["FA"], authAsymId, authSeqBeg, authSeqEnd)) pdbId = ff[6] authAsymId, authSeqBeg, authSeqEnd = self.__parseAssignment( ff[7]) if authAsymId is not None: sfD.setdefault((pdbId, authAsymId), []).append( (domSuperFamilyId, tD["SF"], authAsymId, authSeqBeg, authSeqEnd)) # domToSfD[domSuperFamilyId] = tD["SF"] except Exception as e: logger.exception("Failing for case %r: %s", dm, str(e)) # logger.info("pAD (%d) pBD (%d) pBRootD (%d) ntD (%d)", len(pAD), len(pBD), len(pBRootD), len(ntD)) logger.info("fD (%d) sfD (%d)", len(fD), len(sfD)) return pAD, pBD, pBRootD, ntD, fD, sfD, domToSfD def __parseAssignment(self, tS): authAsymId = authSeqBeg = authSeqEnd = None try: fL = tS.split(":") authAsymId = fL[0] rS = fL[1] if rS[0] == "-": authSeqBeg = -int(rS[1:].split("-")[0]) authSeqEnd = int(rS[1:].split("-")[1]) else: authSeqBeg = int(rS.split("-")[0]) authSeqEnd = int(rS.split("-")[1]) except Exception: pass return authAsymId, authSeqBeg, authSeqEnd def __extractScop2bSuperFamilyAssignments(self, scop2bL, 
domToSfD): """ Extract the SCOP2B SIFTS superfamily domain assignments for PDB structure entries. Returns: aD[(pdbId, authAsymId)] = [(sfDomId, sfId, authAsymId, resBeg, resEnd),] Example: # 2021/06/12 - 05:52 | PDB: 23.21 | UniProt: 2021.03 PDB CHAIN SF_DOMID SP_PRIMARY RES_BEG RES_END PDB_BEG PDB_END SP_BEG SP_END 5id7 B 8033045 P02768 197 388 197 388 221 412 1o9x A 8033045 P02768 197 388 197 388 221 412 """ sfD = {} try: for rowD in scop2bL: if rowD["SF_DOMID"] in domToSfD: sfD.setdefault( (rowD["PDB"].upper(), rowD["CHAIN"]), []).append( (rowD["SF_DOMID"], domToSfD[rowD["SF_DOMID"]], rowD["CHAIN"], rowD["PDB_BEG"], rowD["PDB_END"])) else: logger.warning("Missing SCOP2B SF ID mapping for %r", rowD["SF_DOMID"]) except Exception as e: logger.exception("Failing with %s", str(e)) return sfD def __exportTreeNodeList(self, nD, pAD, pBRootD): """Create node list from the SCOP2 parent and name/description dictionaries. Exclude the root node from the tree. """ # rootId = 0 pL = [rootId] # logger.info("nD %d pAD %d pBRootD %d pL %r", len(nD), len(pAD), len(pBRootD), pL) # create child dictionary cD = {} for ctId, ptId in pAD.items(): cD.setdefault(ptId, []).append(ctId) for ctId, ptId in pBRootD.items(): cD.setdefault(ptId, []).append(ctId) # logger.debug("cD %d", len(cD)) # idL = [] for rootId in sorted(pL): visited = set([rootId]) queue = collections.deque(visited) while queue: tId = queue.popleft() idL.append(tId) if tId not in cD: # logger.warning("No children for scop tId %r", tId) continue for childId in cD[tId]: if childId not in visited: queue.append(childId) visited.add(childId) # dL = [] for tId in idL: displayName = nD[tId] if tId in nD else None ptIdL = [] if tId in pAD: ptIdL.append(pAD[tId]) if tId in pBRootD: ptIdL.append(pBRootD[tId]) lL = self.getIdLineage(tId)[1:] # # d = {'id': str(tId), 'name': displayName, 'lineage': [str(t) for t in lL], 'parents': [str(ptId)], 'depth': len(lL)} if tId == rootId: continue elif any([ptId == rootId for ptId in ptIdL]): dD = {"id": str(tId), "name": displayName, "depth": 0} else: displayName = displayName if displayName else "Domain %s" % str( tId) dD = { "id": str(tId), "name": displayName, "parents": ptIdL, "depth": len(lL) } dL.append(dD) return dL
class IMGTTargetFeatureProvider(StashableBase): """Accessors for IMGT (The International Immunogenetic Information System) target features""" # Link out using the IMGT - # http://www.imgt.org/3Dstructure-DB/cgi/details.cgi?pdbcode=5w5m&Part=Chain # def __init__(self, **kwargs): # self.__cachePath = kwargs.get("cachePath", ".") useCache = kwargs.get("useCache", True) self.__dirName = "IMGT-features" super(IMGTTargetFeatureProvider, self).__init__(self.__cachePath, [self.__dirName]) self.__dirPath = os.path.join(self.__cachePath, self.__dirName) # self.__mU = MarshalUtil(workPath=self.__dirPath) self.__fD = self.__reload(self.__dirPath, useCache) # def testCache(self, minCount=20000): logger.info( "IMGT feature count %d", len(self.__fD["features"]) if "features" in self.__fD else 0) if self.__fD and "features" in self.__fD and len( self.__fD["features"]) > minCount: return True else: return False def hasFeatures(self, rcsbInstanceId): """Return if features exist for the input instance identifier (auth_asym_id) Args: rcsbInstanceId (str): <pdbId (lower case)>.<auth_asym_id (case sensitive)> Returns: bool: True for success or False otherwise """ return rcsbInstanceId in self.__fD["features"] def getFeatures(self, rcsbInstanceId): """Return features for the instance identifier (auth_asym_id) Args: rcsbInstanceId (str): <pdbId (lower case)>.<auth_asym_id (case sensitive)> Returns: list: list of feature dictionaries """ try: return self.__fD["features"][rcsbInstanceId] except Exception: return [] def __getFeatureDataPath(self): return os.path.join(self.__dirPath, "IMGT-feature-data.json") def reload(self): self.__fD = self.__reload(self.__dirPath, True) return True def __reload(self, dirPath, useCache): startTime = time.time() fD = {} featurePath = self.__getFeatureDataPath() # logger.info("useCache %r featurePath %r", useCache, featurePath) if useCache and self.__mU.exists(featurePath): fD = self.__mU.doImport(featurePath, fmt="json") else: fU = FileUtil() fU.mkdir(dirPath) # --- logger.info("Completed reload (useCache %r) at %s (%.4f seconds)", useCache, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime) return fD def buildFeatureList(self, useCache=True): """Build polymer instance feature list for IMGT annotations. 
        Returns:
            bool: True for success or False otherwise

        Example:
            "5w5m_B": {
                "description": "FUSION-TNFRSF1B-GAMMA-1",
                "domains": {
                    "C-DOMAIN|CH2|1": {
                        "geneAlleles": [
                            {"taxName": "Homo sapiens", "geneAllele": "IGHG4*01"},
                            {"taxName": "Homo sapiens", "geneAllele": "IGHG4*03"},
                            {"taxName": "Homo sapiens", "geneAllele": "IGHG4*04"},
                        ],
                        "alignment": {"begEntitySeqId": 7, "endEntitySeqId": 116, "begIMGTSeqId": "1", "endIMGTSeqId": "105"},
                    },
                    "C-DOMAIN|CH3|2": {
                        "geneAlleles": [
                            {"taxName": "Homo sapiens", "geneAllele": "IGHG4*01"},
                            {"taxName": "Homo sapiens", "geneAllele": "IGHG4*04"},
                        ],
                        "alignment": {"begEntitySeqId": 117, "endEntitySeqId": 221, "begIMGTSeqId": "106", "endIMGTSeqId": "209"},
                    },
                },
                "proteinName": "IgG4 Sigma1 Fc",
                "receptorType": "IG",
                "receptorDescription": "FUSION-[TNFRSF1B]2-FC-GAMMA-1",
                "species": "Homo sapiens (human)"
            },
        """
        rDL = []
        imgtP = IMGTTargetProvider(cachePath=self.__cachePath, useCache=useCache)
        #
        provenanceSource = "IMGT"
        refScheme = "PDB entity"
        assignVersion = imgtP.getVersion()
        #
        chainD = imgtP.getChains()
        #
        fTupL = [
            ("description", "IMGT_ANTIBODY_DESCRIPTION"),
            ("proteinName", "IMGT_ANTIBODY_PROTEIN_NAME"),
            ("receptorType", "IMGT_ANTIBODY_RECEPTOR_TYPE"),
            ("receptorDescription", "IMGT_ANTIBODY_RECEPTOR_DESCRIPTION"),
            ("species", "IMGT_ANTIBODY_ORGANISM_NAME"),
        ]
        ii = 1
        #
        for chainId, chD in chainD.items():
            entryId = chainId[:4]
            authAsymId = chainId.split("_")[1]
            # descriptive features -
            for fTup in fTupL:
                rD = {
                    "entry_id": entryId,
                    "auth_asym_id": authAsymId,
                    "type": fTup[1],
                    "feature_id": "IMGT_" + str(ii),
                    "name": chD[fTup[0]] if fTup[0] in chD else None,
                    "provenance_source": provenanceSource,
                    "reference_scheme": refScheme,
                    "assignment_version": assignVersion,
                    "feature_positions": [],
                }
                rDL.append(rD)
                ii += 1
            # domain features -
            if "domains" not in chD:
                continue
            for domainId, dD in chD["domains"].items():
                dIdL = domainId.split("|")
                domainName = dIdL[0] + " " + dIdL[1]
                begSeqId = endSeqId = None
                if "alignment" in dD:
                    begSeqId = dD["alignment"]["begEntitySeqId"]
                    endSeqId = dD["alignment"]["endEntitySeqId"]
                else:
                    logger.debug("%r missing alignment in %r", chainId, dD)
                #
                gaL = []
                if "geneAlleles" in dD:
                    for gD in dD["geneAlleles"]:
                        gaL.append(gD["geneAllele"])
                else:
                    logger.debug("%r missing gene and allele details in %r", chainId, dD)
                #
                rD = {
                    "entry_id": entryId,
                    "auth_asym_id": authAsymId,
                    "type": "IMGT_ANTIBODY_DOMAIN_NAME",
                    "feature_id": "IMGT_" + str(ii),
                    "name": domainName,
                    "provenance_source": provenanceSource,
                    "reference_scheme": refScheme,
                    "assignment_version": assignVersion,
                    "feature_positions": [{"beg_seq_id": begSeqId, "end_seq_id": endSeqId}],
                }
                rDL.append(rD)
                ii += 1
                #
                for ga in gaL:
                    rD = {
                        "entry_id": entryId,
                        "auth_asym_id": authAsymId,
                        "type": "IMGT_ANTIBODY_GENE_ALLELE_NAME",
                        "feature_id": "IMGT_" + str(ii),
                        "name": ga,
                        "provenance_source": provenanceSource,
                        "reference_scheme": refScheme,
                        "assignment_version": assignVersion,
                        "feature_positions": [{"beg_seq_id": begSeqId, "end_seq_id": endSeqId}],
                    }
                    rDL.append(rD)
                    ii += 1
        #
        qD = {}
        for rD in rDL:
            eId = rD["entry_id"] + "." + rD["auth_asym_id"]
            qD.setdefault(eId, []).append(rD)
        #
        logger.info("IMGT antibody chain features (%d)", len(qD))
        #
        fp = self.__getFeatureDataPath()
        tS = datetime.datetime.now().isoformat()
        vS = assignVersion
        ok = self.__mU.doExport(fp, {"version": vS, "created": tS, "features": qD}, fmt="json", indent=3)
        return ok
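
# Illustrative sketch: buildFeatureList() above groups its feature records under a
# "<entry_id>.<auth_asym_id>" key before export, which is the same key shape that
# hasFeatures()/getFeatures() accept. Record values below are invented for the example.
def _demoGroupFeatures(rDL):
    qD = {}
    for rD in rDL:
        eId = rD["entry_id"] + "." + rD["auth_asym_id"]
        qD.setdefault(eId, []).append(rD)
    return qD

# e.g. _demoGroupFeatures([{"entry_id": "5w5m", "auth_asym_id": "B", "type": "IMGT_ANTIBODY_DOMAIN_NAME", "name": "C-DOMAIN CH2"}])
# -> {"5w5m.B": [{...}]}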
class RepoHoldingsDataPrep(object): """Consolidate legacy data describing repository content updates and repository entry status.""" def __init__(self, **kwargs): self.__cfgOb = kwargs.get("cfgOb", None) self.__cachePath = kwargs.get("cachePath", None) self.__sandboxPath = kwargs.get("sandboxPath", None) self.__filterType = kwargs.get("filterType", "") self.__assignDates = "assign-dates" in self.__filterType # self.__mU = MarshalUtil(workPath=self.__cachePath) self.__currentCacheD = None # def getHoldingsCombinedEntry(self, updateId, dirPath=None): dList = [] retD = self.__getHoldingsCombined(dirPath=dirPath) for entryId, qD in retD.items(): tD = { "rcsb_id": entryId, "entry_id": entryId, "update_id": updateId } rD = { "rcsb_id": entryId, "rcsb_repository_holdings_combined_entry_container_identifiers": tD, "rcsb_repository_holdings_combined": qD, } dList.append(rD) return dList def __getHoldingsCombined(self, dirPath=None): retD = {} dirPath = dirPath if dirPath else self.__sandboxPath currentD = self.__currentCacheD if self.__currentCacheD else self.__getHoldingsCurrent( dirPath=dirPath) for entryId, tD in currentD.items(): retD[entryId] = {"status": "CURRENT", "status_code": "REL"} logger.debug("Released entries %d", len(retD)) # unRelD = self.__getHoldingsUnreleased(dirPath=dirPath) # logger.info("@@@ unRelD %r", unRelD) for entryId, tD in unRelD.items(): if entryId not in retD and tD["status_code"] in [ "AUCO", "AUTH", "HOLD", "HPUB", "POLC", "PROC", "REFI", "REPL", "WAIT", "WDRN" ]: retD[entryId] = { "status": "UNRELEASED", "status_code": tD["status_code"] } logger.debug("Released & unreleased entries %d", len(retD)) # trfD, _ = self.__getHoldingsTransferred(dirPath=dirPath) for entryId, tD in trfD.items(): if entryId not in retD and tD["status_code"] in ["TRSF"]: retD[entryId] = { "status": "REMOVED", "status_code": tD["status_code"] } # logger.debug("Released & unreleased & transferred entries %d", len(retD)) # rmvD, _, replacesD = self.__getHoldingsRemoved(dirPath=dirPath) # # for entryId in rmvD: # if entryId not in retD: # retD[entryId] = {"status": "REMOVED", "status_code": "OBS"} # replacedByD = {} for entryId, tD in replacesD.items(): for sId in tD["id_codes_superseded"]: replacedByD[sId.strip().upper()] = entryId.strip().upper() # logger.info("replacedbyD (%d) rmvD (%d) currentD (%d) retD (%d)", len(replacedByD), len(rmvD), len(currentD), len(retD)) for entryId in rmvD: if entryId in currentD: continue tId = entryId if tId in replacedByD: if tId == replacedByD[tId]: logger.info("Inconsistent obsolete entry info for %r", tId) while tId in replacedByD and tId != replacedByD[tId]: # logger.debug("tId %r replacedByD[tId] %r", tId, replacedByD[tId]) tId = replacedByD[tId] if tId in currentD: retD[entryId] = { "status": "REMOVED", "status_code": "OBS", "id_code_replaced_by_latest": tId } else: logger.debug("%r missing replacedby entry %r", entryId, tId) else: retD[entryId] = {"status": "REMOVED", "status_code": "OBS"} # logger.debug( "Released & unreleased & transferred & removed entries %d", len(retD)) return retD def getHoldingsCurrentEntry(self, updateId, dirPath=None): dList = [] retD = self.__currentCacheD if self.__currentCacheD else self.__getHoldingsCurrent( dirPath=dirPath) self.__currentCacheD = retD for entryId, qD in retD.items(): tD = ({ "rcsb_id": entryId, "entry_id": entryId, "update_id": updateId, "assembly_ids": qD["assembly_ids"] } if "assembly_ids" in qD else { "rcsb_id": entryId, "entry_id": entryId, "update_id": updateId }) rD = { "rcsb_id": entryId, 
"rcsb_repository_holdings_current_entry_container_identifiers": tD, "rcsb_repository_holdings_current": { "repository_content_types": qD["repository_content_types"] }, } dList.append(rD) return dList def getHoldingsUpdateEntry(self, updateId, dirPath=None): dList = [] retD = self.__getHoldingsUpdate(dirPath=dirPath) for entryId, qD in retD.items(): tD = { "rcsb_id": entryId, "entry_id": entryId, "update_id": updateId } rD = { "rcsb_id": entryId, "rcsb_repository_holdings_update_entry_container_identifiers": tD, "rcsb_repository_holdings_update": qD, } dList.append(rD) return dList def getHoldingsUnreleasedEntry(self, updateId, dirPath=None): dList = [] retD = self.__getHoldingsUnreleased(dirPath=dirPath) prD = self.__getHoldingsPrerelease(dirPath=dirPath) currentD = self.__currentCacheD if self.__currentCacheD else self.__getHoldingsCurrent( dirPath=dirPath) self.__currentCacheD = currentD for entryId, qD in retD.items(): if entryId in currentD: continue rD = {"rcsb_id": entryId} rD["rcsb_repository_holdings_unreleased_entry_container_identifiers"] = { "rcsb_id": entryId, "entry_id": entryId, "update_id": updateId } if entryId in prD: rD["rcsb_repository_holdings_prerelease"] = prD[entryId] qD["prerelease_sequence_available_flag"] = "Y" else: qD["prerelease_sequence_available_flag"] = "N" rD["rcsb_repository_holdings_unreleased"] = qD # dList.append(rD) return dList def getHoldingsRemovedEntry(self, updateId, dirPath=None): dList = [] rmvD, aaD, spsD = self.__getHoldingsRemoved(dirPath=dirPath) trfD, insD = self.__getHoldingsTransferred(dirPath=dirPath) currentD = self.__currentCacheD if self.__currentCacheD else self.__getHoldingsCurrent( dirPath=dirPath) self.__currentCacheD = currentD # # Get the list of candidate keys for removed entries - # entryIdL = sorted(set(list(insD.keys()) + list(rmvD.keys()))) for entryId in entryIdL: if entryId in currentD: continue rD = {"rcsb_id": entryId} rD["rcsb_repository_holdings_removed_entry_container_identifiers"] = { "rcsb_id": entryId, "entry_id": entryId, "update_id": updateId } # if entryId in rmvD: rD["rcsb_repository_holdings_removed"] = rmvD[entryId] if entryId in aaD: rD["rcsb_repository_holdings_removed_audit_author"] = aaD[ entryId] if entryId in spsD: rD["rcsb_repository_holdings_superseded"] = spsD[entryId] if entryId in trfD: rD["rcsb_repository_holdings_transferred"] = trfD[entryId] if entryId in insD: rD["rcsb_repository_holdings_insilico_models"] = insD[entryId] dList.append(rD) return dList def __getHoldingsTransferred(self, dirPath=None): """Parse legacy lists defining the repository contents transferred to alternative repositories Args: updateId (str): update identifier (e.g. 2018_32) dirPath (str): directory path containing update list files **kwargs: unused Returns: (dict): dictionaries containing data for rcsb_repository_holdings_transferred (dict): dictionaries containing data for rcsb_repository_holdings_insilico_models Example input data: ma-czyyf : 262D - TITLE A THREE-DIMENSIONAL MODEL OF THE REV BINDING ELEMENT OF HIV- TITLE 2 1 DERIVED FROM ANALYSES OF IN VITRO SELECTED VARIANTS ma-cfqla : 163D - TITLE A THREE-DIMENSIONAL MODEL OF THE REV BINDING ELEMENT OF HIV- TITLE 2 1 DERIVED FROM ANALYSES OF IN VITRO SELECTED VARIANTS and - 1DX2 REL 1999-12-16 2000-12-15 Tumour Targetting Human ... Beiboer, S.H.W., Reurs, A., Roovers, R.C., Arends, J., Whitelegg, N.R.J., Rees, A.R., Hoogenboom, H.R. 
and - 1APD OBSLTE 1992-10-15 2APD 1BU0 OBSLTE 1998-10-07 2BU0 1CLJ OBSLTE 1998-03-04 2CLJ 1DU8 OBSLTE 2001-01-31 1GIE 1I2J OBSLTE 2001-01-06 1JA5 """ trsfD = {} insD = {} dirPath = dirPath if dirPath else self.__sandboxPath try: fp = os.path.join(dirPath, "status", "theoretical_model_obsolete.tsv") lineL = self.__mU.doImport(fp, "list") # pylint: disable=no-member # obsDateD = {} obsIdD = {} for line in lineL: fields = line.split("\t") if len(fields) < 3: continue entryId = str(fields[0]).strip().upper() obsDateD[entryId] = dateutil.parser.parse( fields[2]) if self.__assignDates else fields[2] if len(fields) > 3 and len(fields[3]) > 3: obsIdD[entryId] = str(fields[3]).strip().upper() logger.debug("Read %d obsolete insilico id codes", len(obsDateD)) # --------- --------- --------- --------- --------- --------- --------- fp = os.path.join(dirPath, "status", "model-archive-PDB-insilico-mapping.list") lineL = self.__mU.doImport(fp, "list") # trD = {} for line in lineL: fields = line.split(":") if len(fields) < 2: continue entryId = str(fields[1]).strip().upper()[:4] maId = str(fields[0]).strip() trD[entryId] = maId logger.debug("Read %d model archive id codes", len(trD)) # # --------- --------- --------- --------- --------- --------- --------- fp = os.path.join(dirPath, "status", "theoretical_model_v2.tsv") lineL = self.__mU.doImport(fp, "list") # logger.debug("Read %d insilico id codes", len(lineL)) for line in lineL: fields = str(line).split("\t") if len(fields) < 6: continue depDate = dateutil.parser.parse( fields[2]) if self.__assignDates else fields[2] relDate = None if len(fields[3]) >= 10 and not fields[3].startswith("0000"): relDate = dateutil.parser.parse( fields[3]) if self.__assignDates else fields[3] statusCode = "TRSF" if fields[1] == "REL" else fields[1] entryId = str(fields[0]).upper() title = fields[4] # auditAuthors = [t.strip() for t in fields[5].split(";")] repId = None if entryId in trD: repName = "Model Archive" repId = trD[entryId] # dD = { "status_code": statusCode, "deposit_date": depDate, "repository_content_types": ["coordinates"], "title": title, "audit_authors": auditAuthors, } # if relDate: dD["release_date"] = relDate # if repId: dD["remote_accession_code"] = repId dD["remote_repository_name"] = repName if statusCode == "TRSF": trsfD[entryId] = dD # # dD = { "status_code": statusCode, "deposit_date": depDate, "title": title, "audit_authors": auditAuthors } # if relDate: dD["release_date"] = relDate # if entryId in obsDateD: dD["remove_date"] = relDate # if entryId in obsIdD: dD["id_codes_replaced_by"] = [obsIdD[entryId]] # insD[entryId] = dD # logger.info("Transferred entries %d - insilico models %d", len(trsfD), len(insD)) # except Exception as e: logger.exception("Failing with %s", str(e)) return trsfD, insD def __getHoldingsUpdate(self, dirPath=None): """Parse legacy lists defining the contents of the repository update Args: updateId (str): update identifier (e.g. 
2018_32) dirPath (str): directory path containing update list files **kwargs: unused Returns: list: List of dictionaries containing rcsb_repository_holdings_update """ retD = {} dirPath = dirPath if dirPath else self.__sandboxPath try: updateTypeList = ["added", "modified", "obsolete"] contentTypeList = ["entries", "mr", "cs", "sf", "nef", "nmr-str"] contentNameD = { "entries": "coordinates", "mr": "NMR restraints", "cs": "NMR chemical shifts", "sf": "structure factors", "nef": "Combined NMR data (NEF)", "nmr-str": "Combined NMR data (NMR-STAR)", } # for updateType in updateTypeList: uD = {} for contentType in contentTypeList: fp = os.path.join(dirPath, "update-lists", updateType + "-" + contentType) if not self.__mU.exists(fp): continue entryIdL = self.__mU.doImport(fp, "list") # for entryId in entryIdL: entryId = entryId.strip().upper() uD.setdefault(entryId, []).append(contentNameD[contentType]) for entryId in uD: uType = "removed" if updateType == "obsolete" else updateType # retD[entryId] = {"update_id": updateId, "entry_id": entryId, "update_type": uType, "repository_content_types": uD[entryId]} retD[entryId] = { "update_type": uType, "repository_content_types": uD[entryId] } return retD except Exception as e: logger.exception("Failing with %s", str(e)) return retD def __getHoldingsCurrent(self, dirPath=None): """Parse legacy lists defining the current contents of the repository update Args: updateId (str): update identifier (e.g. 2018_32) dirPath (str): directory path containing update list files **kwargs: unused Returns: list: List of dictionaries containing data for rcsb_repository_holdings_current """ rD = {} retD = {} dirPath = dirPath if dirPath else self.__sandboxPath try: updateTypeList = ["all"] contentTypeList = [ "pdb", "pdb-format", "mr", "cs", "sf", "nef", "nmr-str" ] contentNameD = { "pdb": "coordinates", "pdb-format": "PDB format coordinates", "mr": "NMR restraints", "cs": "NMR chemical shifts", "sf": "structure factors", "nef": "Combined NMR data (NEF)", "nmr-str": "Combined NMR data (NMR-STAR)", } # tD = {} for updateType in updateTypeList: for contentType in contentTypeList: fp = os.path.join(dirPath, "update-lists", updateType + "-" + contentType + "-list") if not self.__mU.exists(fp): continue entryIdL = self.__mU.doImport(fp, "list") # for entryId in entryIdL: entryId = entryId.strip().upper() if entryId not in tD: tD[entryId.upper()] = {} tD[entryId.upper()][contentNameD[contentType]] = True # fp = os.path.join(dirPath, "status", "biounit_file_list.tsv") lines = self.__mU.doImport(fp, "list") assemD = {} for line in lines: fields = line.split("\t") entryId = fields[0].strip().upper() assemId = fields[1].strip() if entryId not in assemD: assemD[entryId.upper()] = [] assemD[entryId.upper()].append(assemId) # # fp = os.path.join(dirPath, "status", "pdb_bundle_index_list.tsv") bundleIdList = self.__mU.doImport(fp, "list") bundleD = {} for entryId in bundleIdList: bundleD[entryId.strip().upper()] = True # fp = os.path.join(dirPath, "status", "validation_report_list_new.tsv") vList = self.__mU.doImport(fp, "list") valD = {} valImageD = {} valCifD = {} for line in vList: fields = line.split("\t") entryId = fields[0].strip().upper() imageFlag = fields[1].strip().upper() valD[entryId] = True valImageD[entryId] = imageFlag == "Y" if len(fields) > 2: valCifD[entryId] = fields[2].strip().upper() == "Y" # # fp = os.path.join(dirPath, "status", "entries_without_polymers.tsv") pList = self.__mU.doImport(fp, "list") pD = {} for entryId in pList: pD[entryId.strip().upper()] = 
False # # fp = os.path.join(dirPath, "status", "nmr_restraints_v2_list.tsv") nmrV2List = self.__mU.doImport(fp, "list") nmrV2D = {} for entryId in nmrV2List: nmrV2D[entryId.strip().upper()] = False # if self.__cfgOb: configName = self.__cfgOb.getDefaultSectionName() fp = self.__cfgOb.getPath("RCSB_EDMAP_LIST_PATH", sectionName=configName) else: fp = os.path.join(dirPath, "status", "edmaps.json") qD = self.__mU.doImport(fp, "json") edD = {} for entryId in qD: edD[entryId.upper()] = qD[entryId] # fp = os.path.join(dirPath, "status", "obsolete_entry.json_2") oL = self.__mU.doImport(fp, "json") obsD = {} for dD in oL: obsD[dD["entryId"].upper()] = True logger.info("Removed entry length %d", len(obsD)) # # # Revise content types bundles and assemblies # for qId, dD in tD.items(): entryId = qId.strip().upper() if entryId in obsD: continue rD[entryId] = [] if entryId in bundleD: rD[entryId].append("entry PDB bundle") if "coordinates" in dD: rD[entryId].append("entry mmCIF") rD[entryId].append("entry PDBML") if "PDB format coordinates" in dD: rD[entryId].append("entry PDB") if entryId in assemD: if entryId in bundleD: rD[entryId].append("assembly mmCIF") else: rD[entryId].append("assembly PDB") # for cType in dD: if cType not in [ "coordinates", "PDB format coordinates", "NMR restraints" ]: rD[entryId].append(cType) if cType == "NMR restraints": rD[entryId].append("NMR restraints V1") if entryId in nmrV2D: rD[entryId].append("NMR restraints V2") # if entryId in valD: rD[entryId].append("validation report") if entryId in valImageD and valImageD[entryId]: rD[entryId].append("validation slider image") if entryId in valCifD and valCifD[entryId]: rD[entryId].append("validation data mmCIF") if entryId in edD: rD[entryId].append("2fo-fc Map") rD[entryId].append("fo-fc Map") rD[entryId].append("Map Coefficients") if entryId not in pD: rD[entryId].append("FASTA sequence") # for entryId in rD: if entryId in assemD: retD[entryId] = { "assembly_ids": assemD[entryId], "repository_content_types": rD[entryId] } else: retD[entryId] = {"repository_content_types": rD[entryId]} return retD except Exception as e: logger.exception("Failing with %s", str(e)) return retD def __getHoldingsUnreleased(self, dirPath=None): """Parse the legacy exchange status file containing details for unreleased entries: Args: updateId (str): update identifier (e.g. 
2018_32) dirPath (str): directory path containing update list files **kwargs: unused Returns: list: List of dictionaries containing data for rcsb_repository_holdings_unreleased """ retD = {} fields = [] dirPath = dirPath if dirPath else self.__sandboxPath try: # fp = os.path.join(dirPath, "status", "status_v2.txt") lines = self.__mU.doImport(fp, "list") for line in lines: fields = line.split("\t") if len(fields) < 15: continue entryId = fields[1] dD = { "status_code": fields[2] # 'sg_project_name': fields[14], # 'sg_project_abbreviation_': fields[15]} } if fields[11] and fields[11].strip(): dD["title"] = fields[11] if fields[10] and fields[10].strip(): dD["audit_authors"] = [ t.strip() for t in fields[10].split(";") ] # d['audit_authors'] = fields[10] if fields[12] and fields[12].strip(): dD["author_prerelease_sequence_status"] = str( fields[12]).strip().replace("REALEASE", "RELEASE") dTupL = [ ("deposit_date", 3), ("deposit_date_coordinates", 4), ("deposit_date_structure_factors", 5), ("hold_date_structure_factors", 6), ("deposit_date_nmr_restraints", 7), ("hold_date_nmr_restraints", 8), ("release_date", 9), ("hold_date_coordinates", 13), ] for dTup in dTupL: fN = dTup[1] if fields[fN] and len(fields[fN]) >= 4: dD[dTup[0]] = dateutil.parser.parse( fields[fN]) if self.__assignDates else fields[fN] # retD[entryId] = {k: v for k, v in dD.items() if v} except Exception as e: logger.error("Fields: %r", fields) logger.exception("Failing with %s", str(e)) return retD def __getHoldingsRemoved(self, dirPath=None): """Parse the legacy exchange file containing details of removed entries: { "entryId": "125D", "obsoletedDate": "1998-04-15", "title": "SOLUTION STRUCTURE OF THE DNA-BINDING DOMAIN OF CD=2=-GAL4 FROM S. CEREVISIAE", "details": "", "depositionAuthors": [ "Baleja, J.D.", "Wagner, G." 
], "depositionDate": "1993-05-05", "releaseDate": "1994-01-31", "obsoletedBy": [ "1AW6" ], "content_type": [ "entry mmCIF", "entry PDB", "entry PDBML", "structure factors" ]}, Returns; (dict) : dictionaries for rcsb_repository_holdings_removed (dict) : dictionaries for rcsb_repository_holdings_removed_audit_authors (dict) : dictionaries for rcsb_repository_holdings_superseded """ # rcsb_repository_holdings_removed rL1D = {} # rcsb_repository_holdings_removed_audit_authors rL2D = {} # rcsb_repository_holdings_superseded rL3D = {} # sD = {} dirPath = dirPath if dirPath else self.__sandboxPath try: fp = os.path.join(dirPath, "status", "obsolete_entry.json_2") dD = self.__mU.doImport(fp, "json") for dT in dD: # --- ctL = dT["content_type"] if "content_type" in dT else [] # --- rbL = dT["obsoletedBy"] if "obsoletedBy" in dT else [] d1 = { "title": dT["title"], "details": dT["details"], "audit_authors": dT["depositionAuthors"] } if rbL: d1["id_codes_replaced_by"] = [t.upper() for t in rbL] if ctL: d1["repository_content_types"] = ctL dTupL = [("deposit_date", "depositionDate"), ("remove_date", "obsoletedDate"), ("release_date", "releaseDate")] for dTup in dTupL: fN = dTup[1] if dT[fN] and len(dT[fN]) > 4: d1[dTup[0]] = dateutil.parser.parse( dT[fN]) if self.__assignDates else dT[fN] rL1D[dT["entryId"]] = {k: v for k, v in d1.items() if v} # for ii, author in enumerate(dT["depositionAuthors"]): d2 = {"ordinal_id": ii + 1, "audit_author": author} rL2D.setdefault(dT["entryId"], []).append(d2) if "obsoletedBy" in dT: for pdbId in dT["obsoletedBy"]: if pdbId not in sD: sD[pdbId] = [] sD[pdbId].append(dT["entryId"]) # for pdbId in sD: if sD[pdbId]: rL3D[pdbId] = {"id_codes_superseded": sD[pdbId]} logger.debug("Computed data lengths %d %d %d", len(rL1D), len(rL2D), len(rL3D)) except Exception as e: logger.exception("Failing with %s", str(e)) return rL1D, rL2D, rL3D def __getHoldingsPrerelease(self, dirPath=None): """Parse the legacy exchange status file containing prerelease sequence data. Args: updateId (str): update identifier (e.g. 2018_32) dirPath (str): directory path containing update list files **kwargs: unused Returns: list: List of dictionaries containing data for rcsb_repository_holdings_prerelease >6I99 Entity 1 HHHHHHENLYFQGELKREEITLLKELGSGQFGVVKLGKWKGQYDVAVKMIKEG.... >6JKE Entity 1 GRVTNQLQYLHKVVMKALWKHQFAWPFRQPVDAVKLGLPDYHKIIKQPMDMGTI.... """ retD = {} fields = [] dirPath = dirPath if dirPath else self.__sandboxPath try: # Get prerelease sequence data fp = os.path.join(dirPath, "sequence", "pdb_seq_prerelease.fasta") sD = self.__mU.doImport(fp, "fasta", commentStyle="prerelease") seqD = {} for sid in sD: fields = sid.split("_") entryId = str(fields[0]).upper() entityId = str(fields[1]) if entryId not in seqD: seqD[entryId] = [] seqD[entryId].append((entityId, sD[sid]["sequence"])) logger.debug("Loaded prerelease sequences for %d entries", len(seqD)) # for entryId, seqTupL in seqD.items(): # dD = {"seq_one_letter_code": seqL} logger.debug("Adding prerelease sequences for %s", entryId) for entityId, seqS in seqTupL: if not seqS: continue retD.setdefault(entryId, []).append({ "entity_id": entityId, "seq_one_letter_code": seqS }) # # retD[entryId] = {k: v for k, vTup in dD.items() if vTup[1]} except Exception as e: logger.error("Fields: %r", fields) logger.exception("Failing with %s", str(e)) return retD
class IMGTTargetProvider(StashableBase): """Accessors for IMGT target annotations.""" def __init__(self, cachePath, useCache, **kwargs): # self.__cachePath = cachePath self.__dirName = "IMGT-targets" imgtDumpUrl = kwargs.get("IMGTDumpUrl", None) super(IMGTTargetProvider, self).__init__(self.__cachePath, [self.__dirName]) self.__dirPath = os.path.join(self.__cachePath, self.__dirName) self.__version = None # self.__mU = MarshalUtil(workPath=self.__dirPath) self.__imgtD = self.__reload(self.__dirPath, useCache=useCache, imgtDumpUrl=imgtDumpUrl) # def testCache(self, minCount=1000): if self.__imgtD and "chains" in self.__imgtD and len( self.__imgtD["chains"]) > minCount: return True else: return False def getVersion(self): return self.__version def getChains(self): return self.__imgtD["chains"] def __reload(self, dirPath, useCache=False, imgtDumpUrl=None, testList=None, maxCount=None): imgtD = {} startTime = time.time() fU = FileUtil() fU.mkdir(dirPath) # imgtDataPath = os.path.join(self.__dirPath, "imgt-data.json") # logger.info("useCache %r imgtFeaturePath %r", useCache, imgtDataPath) if useCache and self.__mU.exists(imgtDataPath): imgtD = self.__mU.doImport(imgtDataPath, fmt="json") self.__version = imgtD["version"] else: imgtDumpUrl = imgtDumpUrl if imgtDumpUrl else "http://www.imgt.org/download/3Dstructure-DB/IMGT3DFlatFiles.tgz" imgtReadmeUrl = "http://www.imgt.org/download/3Dstructure-DB/RELEASE" imgtDumpFileName = fU.getFileName(imgtDumpUrl) imgtDumpPath = os.path.join(dirPath, imgtDumpFileName) imgtReleasePath = os.path.join(dirPath, "IMGT-release.txt") _, fn = os.path.split(imgtDumpUrl) imgtFlatFilePath = os.path.join(self.__dirPath, fn[:-4]) # logger.info("Fetching url %s path %s", imgtDumpUrl, imgtDumpPath) ok1 = fU.get(imgtDumpUrl, imgtDumpPath) ok2 = fU.get(imgtReadmeUrl, imgtReleasePath) fU.unbundleTarfile(imgtDumpPath, dirPath=dirPath) logger.info("Completed fetch (%r) at %s (%.4f seconds)", ok1 and ok2, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime) # --- readmeLines = self.__mU.doImport(imgtReleasePath, fmt="list") self.__version = readmeLines[0].strip() if readmeLines else None logger.info("IMGT version %r", self.__version) # --- chainD, rawD = self.__imgtFlatFileProcessor(imgtFlatFilePath, maxCount=maxCount, testList=testList) # --- tS = datetime.datetime.now().isoformat() # vS = datetime.datetime.now().strftime("%Y-%m-%d") if testList: imgtD = { "version": self.__version, "date": tS, "chains": chainD, "raw": rawD } else: imgtD = { "version": self.__version, "date": tS, "chains": chainD } ok = self.__mU.doExport(imgtDataPath, imgtD, fmt="json", indent=3) logger.info("Completed flatfile prep (%r) at %s (%.4f seconds)", ok, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime) return imgtD def exportFasta(self, withGaps=False): """ Example: The IMGT/GENE-DB FASTA header contains 15 fields separated by '|': 1. IMGT/LIGM-DB accession number(s) 2. IMGT gene and allele name 3. species (may be followed by an "_" and the name of the strain, breed or isolate, if defined) 4. IMGT gene and allele functionality 5. exon(s), region name(s), or extracted label(s) 6. start and end positions in the IMGT/LIGM-DB accession number(s) 7. number of nucleotides in the IMGT/LIGM-DB accession number(s) 8. codon start, or 'NR' (not relevant) for non coding labels 9. +n: number of nucleotides (nt) added in 5' compared to the corresponding label extracted from IMGT/LIGM-DB 10. 
            +n or -n: number of nucleotides (nt) added or removed in 3' compared to the corresponding label extracted from IMGT/LIGM-DB
        11. +n, -n, and/or nS: number of added, deleted, and/or substituted nucleotides to correct sequencing errors, or 'not corrected' if non corrected sequencing errors
        12. number of amino acids (AA): this field indicates that the sequence is in amino acids
        13. number of characters in the sequence: nt (or AA)+IMGT gaps=total
        14. partial (if it is)
        15. reverse complementary (if it is)
        """
        # --
        fU = FileUtil()
        fU.mkdir(self.__dirPath)
        if withGaps:
            imgtTargetUrl = "http://www.imgt.org/download/GENE-DB/IMGTGENEDB-ReferenceSequences.fasta-AA-WithGaps-F+ORF+inframeP"
        else:
            imgtTargetUrl = "http://www.imgt.org/download/GENE-DB/IMGTGENEDB-ReferenceSequences.fasta-AA-WithoutGaps-F+ORF+inframeP"
        imgtTargetFileName = fU.getFileName(imgtTargetUrl)
        rawFastaPath = os.path.join(self.__dirPath, imgtTargetFileName)
        # --
        logger.debug("Fetching url %s path %s", imgtTargetUrl, rawFastaPath)
        ok = fU.get(imgtTargetUrl, rawFastaPath)
        logger.info("Fetch status (%r) url %s path %s", ok, imgtTargetUrl, rawFastaPath)
        # --
        fastaPath = os.path.join(self.__dirPath, "imgt-reference.fa")
        taxonPath = os.path.join(self.__dirPath, "imgt-reference-taxon.tdd")
        tP = TaxonomyProvider(cachePath=self.__cachePath, useCache=True)
        ok = tP.testCache()
        if not ok:
            tP = TaxonomyProvider(cachePath=self.__cachePath, useCache=False)
        rawQD = self.__mU.doImport(rawFastaPath, fmt="fasta", commentStyle="default")
        oD = {}
        taxonL = []
        for queryId, sD in rawQD.items():
            qL = queryId.split("|")
            tL = qL[2].split("_")
            taxName = tL[0]
            taxVar = tL[1].replace(" ", "_") if len(tL) > 1 else None
            taxId = tP.getTaxId(taxName)
            if taxId:
                tD = {"seqId": qL[0], "imgtGene": qL[1], "functionality": qL[3], "labels": qL[4], "taxId": taxId}
                if taxVar:
                    tD["taxVar"] = taxVar
                sD.update(tD)
            else:
                logger.info("Unknown taxonomy %r (taxName=%r)", queryId, taxName)
            # replace IMGT gap characters; the result must be assigned (str.replace returns a new string)
            sD["sequence"] = sD["sequence"].replace(".", "-")
            seqId = ""
            cL = []
            for k, v in sD.items():
                if k in ["sequence"]:
                    continue
                cL.append(str(v))
                cL.append(str(k))
            seqId = "|".join(cL)
            oD[seqId] = sD
            taxonL.append("%s\t%s" % (seqId, taxId))
        #
        ok1 = self.__mU.doExport(taxonPath, taxonL, fmt="list")
        ok2 = self.__mU.doExport(fastaPath, oD, fmt="fasta", makeComment=True)
        return ok1 and ok2

    def __imgtFlatFileProcessor(self, flatFilePath, maxCount=None, testList=None):
        chainD = {}
        rawD = {}
        failures = []
        idList = []
        ic = 0
        filePattern = os.path.join(flatFilePath, "*.pdb.gz")
        logger.info("Collecting flat files with pattern %r", filePattern)
        for fp in glob.glob(filePattern):
            ic += 1
            if maxCount and ic > maxCount:
                break
            logger.debug("Processing file %r", fp)
            _, fn = os.path.split(fp)
            pdbId = fn[5:9].lower()
            if testList and pdbId not in testList:
                continue
            idList.append(pdbId)
            cD = {}
            tmpD = {}
            with gzip.open(fp, "rb") as ifh:
                try:
                    cD, tmpD = self.__imgtRemarkParser(pdbId, ifh)
                except Exception as e:
                    failures.append(pdbId)
                    logger.exception("Failing for %r with %s", pdbId, str(e))
                    continue
            # --
            chainD.update(cD)
            rawD[pdbId] = tmpD
        #
        logger.info("ID List (%d)", len(set(idList)))
        sL = list(rawD.keys())
        logger.info("Successes (%d) chains (%d)", len(sL), len(chainD))
        logger.info("Exceptions (%d) %r", len(failures), failures)
        mL = list(set(idList) - set(sL))
        logger.info("Missing (%d) %r", len(mL), mL)
        #
        return chainD, rawD

    def __imgtRemarkParser(self, pdbId, ifh):
        """IMGT REMARK 410 Parser

        Args:
            pdbId (str): input PDB ID
            ifh (obj): input file handle

        Returns:
            dict: content dictionary of parsed details
        """
        sD = { "IMGT
protein name": { "section": "proteins" }, "ligand(s)": { "section": "ligands" }, "Chain ID ": { "section": "chains" }, } pD = { "Chain ID ": { "ky": "chain_data", "action": "appendAll" }, # "ligand(s)": { "ky": "ligands", "action": "appendLine" }, "IMGT protein name": { "ky": "proteinName", "action": "appendLine" }, "IMGT receptor type": { "ky": "receptorType", "action": "appendLine" }, "IMGT receptor description": { "ky": "receptorDescription", "action": "appendLine" }, "Species": { "ky": "species", "action": "appendLine" }, "Chain ID": { "ky": "chain_ids", "action": "appendLine" }, # } cD = {} oD = {} curSection = None action = None curKy = None curSection = None curChain = None for ul in ifh.readlines(): line = ul.decode("utf-8") if not line.startswith("REMARK 410 "): continue # curLine = line[11:-1] for section, sectionD in sD.items(): if curLine.startswith(section): logger.debug("%r Detected section %r", pdbId, section) curSection = sectionD["section"] first = True break # for label, labelD in pD.items(): if curLine.startswith(label): logger.debug("%r detected label %r", pdbId, label) curKy = labelD["ky"] action = labelD["action"] first = True break # if action == "appendLine": if first: first = False logger.debug("Skipped %r", curKy) continue logger.debug(">> SECTION %r KEY %r Adding %r", curSection, curKy, curLine.strip()) oD.setdefault(curSection, {}).setdefault(curKy, []).append(curLine.strip()) elif action == "appendAll": if first: tL = [t for t in curLine.split(" ") if t] curChain = tL[2] first = False logger.debug("%r current chain key %r", pdbId, curChain) continue oD.setdefault(curSection, {}).setdefault(curChain, []).append(curLine) # -- rD raw extracted REM 410 content # Post-process the domain annotations and alignments for chId, cL in oD["chains"].items() if "chains" in oD else {}: logger.debug("%r chainId %r (%d)", pdbId, chId, len(cL)) tD = {} tD["description"] = self.__getField( cL, label="IMGT chain description ") tD["domains"] = self.__splitDomains(pdbId, cL) # aD = self.__getAlignment(pdbId, cL) if aD and (len(aD["alignMapDL"]) == len(tD["domains"])): aL = aD["alignMapDL"] for ii, dD in enumerate(tD["domains"].values()): if aL and len(aL) > ii: dD["alignment"] = aL[ii] # -- # Integrate raw "proteins" content if "proteins" in oD: paD = self.__getProteinAnnotations(chId, oD["proteins"]) logger.debug("paD %r", paD) tD.update(paD) # -- cD[chId] = tD # return cD, oD # def __getProteinAnnotations(self, chainId, pLD): """ Example: "proteins": { "proteinName": [ "IgG4 Sigma1 Fc" ], "receptorType": [ "IG" ], "receptorDescription": [ "FUSION-[TNFRSF1B]2-FC-GAMMA-1" ], "species": [ "H**o sapiens (human)" ], "chain_ids": [ "5w5m_A,5w5m_B" ] }, """ retD = {} try: ind = -1 if "chain_ids" in pLD: for ii, chS in enumerate(pLD["chain_ids"]): if chainId in chS: ind = ii break if ind >= 0: for ky in [ "proteinName", "receptorType", "receptorDescription", "species" ]: if ky in pLD and len(pLD[ky]) > ind: retD[ky] = pLD[ky][ind] else: logger.info("missing chain %r in %r", chainId, pLD["chain_ids"]) else: logger.info("missing chain details for %r in %r", chainId, pLD) except Exception as e: logger.exception("Failing for %r with %s", chainId, str(e)) return retD def __getField(self, lineList, label): label = "IMGT chain description " ret = None for line in lineList: if line.startswith(label): ret = line[len(label):] break return ret def __splitDomains(self, pdbId, lineList): retD = {} startLabel1 = "-DOMAIN IMGT domain description " startLabel2 = "-LIKE-DOMAIN IMGT domain description " 
startLabel3 = "-LIKE-DOMAIN IMGT domain description " # geneLabel1 = "-DOMAIN IMGT gene and allele " geneLabel2 = "-LIKE-DOMAIN IMGT gene and allele " geneLabel3 = "-LIKE-DOMAIN IMGT gene and allele " tD = {} domain = None numD = 0 for line in lineList: if line[1:].startswith(startLabel1): numD += 1 domain = line.split(" ")[0].strip() description = line[len(startLabel1) + 1:].strip() continue if line[1:].startswith(startLabel2): numD += 1 domain = line.split(" ")[0].strip() description = line[len(startLabel2) + 1:].strip() continue if line[2:].startswith(startLabel3): numD += 1 domain = line.split(" ")[0].strip() description = line[len(startLabel3) + 1:].strip() continue if domain and line.startswith(domain): tD.setdefault((domain, description, numD), []).append(line) # qD = {} for (domain, description, numD), cL in tD.items(): for line in cL: if line[1:].startswith(geneLabel1): qD.setdefault( (domain + "|" + description + "|" + str(numD)), []).append(line[len(geneLabel1) + 1:]) if line[1:].startswith(geneLabel2): qD.setdefault( (domain + "|" + description + "|" + str(numD)), []).append(line[len(geneLabel2) + 1:]) if line[2:].startswith(geneLabel3): qD.setdefault( (domain + "|" + description + "|" + str(numD)), []).append(line[len(geneLabel3) + 1:]) # # "H**o sapiens IGHG4*01 (96.4%), H**o sapiens IGHG4*03 (96.4%), H**o sapiens IGHG4*04 (96.4%)", for ky, cL in qD.items(): logger.debug("cL %r", cL) tS = "".join(cL) tS = " ".join(tS.split()) logger.debug("tS %r", tS) # handle some missing commas in the raw data - tS = tS.replace(") ", "),") logger.debug("TAX> %r tS %r", pdbId, tS) gnSL = tS.split(",") gDL = [] for gnS in gnSL: tL = gnS.strip().split() logger.debug("tL %r", tL) geneAllele = tL[-2] taxName = " ".join(tL[:-2]) gDL.append({"taxName": taxName, "geneAllele": geneAllele}) retD[ky] = {"geneAlleles": gDL} return retD def __getAlignment(self, pdbId, lineList): try: startPat = "Chain amino acid sequence" endPat1 = "-DOMAIN" endPat2 = "-LIKE-DOMAIN" aL = [] keep = False for line in lineList: if line.startswith(startPat): keep = True continue if line[1:].startswith(endPat1): break if line[1:].startswith(endPat2): break if line[2:].startswith(endPat2): break if keep: aL.append(line) # sLine = "".join(aL[1::2]) mLine = "".join(aL[0::2]) # Lots of cases with (UNK) sequences where REM 410 format is corrupt -- # if "(UNK)" in sLine: # logger.error("%r unknown or modified residue in one-letter-code sequence %r", pdbId, sLine[:30] + "...") # return {} ok, indD = self.__findMatchingGroups(mLine, startGroup="[", endGroup="]") if not ok: logger.error("%r determining alignment boundaries fails", pdbId) return {} pdbRangeL = [] for iBeg, iEnd in indD.items(): if iEnd - iBeg <= 3: continue pdbRangeL.append({ "begEntitySeqId": iBeg + 1, "endEntitySeqId": iEnd + 1 }) ok, indD = self.__findMatchingGroups(mLine, startGroup="(", endGroup=")") if not ok: logger.error("%r determining alignment boundaries fails", pdbId) return {} imgtRangeL = [] try: for k, v in indD.items(): tS = mLine[k + 1:v] tL = tS.split("-") iBeg = tL[0] iEnd = tL[1] imgtRangeL.append({ "begIMGTSeqId": iBeg, "endIMGTSeqId": iEnd }) except Exception as e: logger.error("%r parsing boundaries fails with %r for %r", pdbId, str(e), mLine[:30] + "...") return {} # alignMapDL = [] for pdbD, imgtD in zip(pdbRangeL, imgtRangeL): dD = pdbD dD.update(imgtD) alignMapDL.append(dD) return { "mapping": mLine, "pdbSeq": sLine, "alignMapDL": alignMapDL } except Exception as e: logger.exception("Failing %r with %r with %s", pdbId, mLine, str(e)) return 
{} def __findMatchingGroups(self, strIn, startGroup="[", endGroup="]"): retD = {} dStack = [] ok = True try: for i, cS in enumerate(strIn): if cS == startGroup: dStack.append(i) elif cS == endGroup: if len(dStack) == 0: logger.error( "No matching opening group for closing group at position: %r", str(i)) ok = False continue retD[dStack.pop()] = i if len(dStack) > 0: logger.error("No matching closing group for opening group at position: %r", str(dStack.pop())) ok = False except Exception: pass return ok, retD
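# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the alignment parser
# above pairs "[...]" groups (PDB entity ranges) and "(...)" groups (IMGT
# numbering ranges) on the REMARK 410 mapping line. The standalone helper
# below mirrors the stack-based matching used by __findMatchingGroups() so
# the idea can be exercised without IMGT data. The sample mapping string is
# invented for demonstration only.

def find_matching_groups(str_in, start_group="[", end_group="]"):
    """Return (ok, {openIndex: closeIndex}) for balanced group delimiters."""
    ret_d = {}
    stack = []
    ok = True
    for i, ch in enumerate(str_in):
        if ch == start_group:
            stack.append(i)
        elif ch == end_group:
            if not stack:
                ok = False  # closing delimiter with no matching opener
                continue
            ret_d[stack.pop()] = i
    if stack:
        ok = False  # unmatched opening delimiter(s) remain
    return ok, ret_d


if __name__ == "__main__":
    # Hypothetical mapping fragment in the style of a REMARK 410 line
    mapping = "[1-98](1-104)[99-210](105-216)"
    ok1, sqD = find_matching_groups(mapping, "[", "]")
    ok2, imD = find_matching_groups(mapping, "(", ")")
    print(ok1, sqD)  # True {0: 5, 13: 20}
    print(ok2, imD)  # True {6: 12, 21: 29}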
class ScopClassificationProvider(StashableBase): """Extract SCOPe assignments, term descriptions and SCOP classifications from SCOP flat files. """ def __init__(self, **kwargs): # self.__dirName = "scop" if "cachePath" in kwargs: self.__cachePath = os.path.abspath(kwargs.get("cachePath", None)) self.__scopDirPath = os.path.join(self.__cachePath, self.__dirName) else: self.__scopDirPath = kwargs.get("scopDirPath", ".") self.__cachePath, self.__dirName = os.path.split( os.path.abspath(self.__scopDirPath)) super(ScopClassificationProvider, self).__init__(self.__cachePath, [self.__dirName]) # useCache = kwargs.get("useCache", True) # urlTarget = kwargs.get("scopTargetUrl", "http://scop.berkeley.edu/downloads/update") # self.__version = kwargs.get("scopVersion", "2.07-2019-07-23") # self.__version = kwargs.get("scopVersion", "2.07-2020-01-23") # self.__version = kwargs.get("scopVersion", "2.07-2020-05-07") # self.__version = kwargs.get("scopVersion", "2.07-2021-07-07") urlTarget = kwargs.get("scopTargetUrl", "http://scop.berkeley.edu/downloads/parse") self.__version = kwargs.get("scopVersion", "2.08-stable") # urlBackupPath = kwargs.get( "scopUrlBackupPath", "https://raw.githubusercontent.com/rcsb/py-rcsb_exdb_assets/master/fall_back/SCOP" ) # self.__mU = MarshalUtil(workPath=self.__scopDirPath) self.__nD, self.__pD, self.__pdbD = self.__reload( urlTarget, self.__scopDirPath, useCache=useCache, version=self.__version) # if not useCache and not self.testCache(): ok = self.__fetchFromBackup(urlBackupPath, self.__scopDirPath) if ok: self.__nD, self.__pD, self.__pdbD = self.__reload( urlTarget, self.__scopDirPath, useCache=True, version=self.__version) def testCache(self): logger.info("SCOP lengths nD %d pD %d pdbD %d", len(self.__nD), len(self.__pD), len(self.__pdbD)) if (len(self.__nD) > 100) and (len(self.__pD) > 100) and (len( self.__pdbD) > 100): return True return False def __fetchFromBackup(self, urlBackupPath, scopDirPath): pyVersion = sys.version_info[0] fn = "scop_domains-py%s.pic" % str(pyVersion) scopDomainPath = os.path.join(scopDirPath, fn) self.__mU.mkdir(scopDirPath) # backupUrl = urlBackupPath + "/" + fn logger.info("Using backup URL %r", backupUrl) fU = FileUtil() ok = fU.get(backupUrl, scopDomainPath) return ok def getScopVersion(self): return self.__version def getScopSunIds(self, pdbId, authAsymId): """ Get the sunIds of the domain assignments for the input entry and chain - aD[(pdbId, authAsymId)] = [(domSunId, domainId, sccs, (authAsymId, resBeg, resEnd))] """ try: return list( set([tup[0] for tup in self.__pdbD[(pdbId, authAsymId)]])) except Exception as e: logger.debug("Failing for %r %r with %s", pdbId, authAsymId, str(e)) return [] def getScopDomainNames(self, pdbId, authAsymId): try: return list( set([tup[1] for tup in self.__pdbD[(pdbId, authAsymId)]])) except Exception as e: logger.debug("Failing for %r %r with %s", pdbId, authAsymId, str(e)) return [] def getScopSccsNames(self, pdbId, authAsymId): try: return list( set([tup[2] for tup in self.__pdbD[(pdbId, authAsymId)]])) except Exception as e: logger.debug("Failing for %r %r with %s", pdbId, authAsymId, str(e)) return [] def getScopResidueRanges(self, pdbId, authAsymId): try: return [(tup[0], tup[1], tup[2], tup[3][0], tup[3][1], tup[3][2]) for tup in self.__pdbD[(pdbId, authAsymId)]] except Exception as e: logger.debug("Failing for %r %r with %s", pdbId, authAsymId, str(e)) return [] def getScopName(self, sunId): try: return self.__nD[sunId] except
Exception: logger.debug("Undefined SCOP sunId %r", sunId) return None def getIdLineage(self, sunId): pList = [] try: pList.append(sunId) pt = self.__pD[sunId] while (pt is not None) and (pt != 0): pList.append(pt) pt = self.__pD[pt] except Exception as e: logger.exception("Failing for %r with %s", sunId, str(e)) # pList.reverse() return pList def getNameLineage(self, sunId): try: return [self.getScopName(cId) for cId in self.getIdLineage(sunId)] except Exception as e: logger.exception("Failing for %r with %s", sunId, str(e)) return None def getTreeNodeList(self): return self.__exportTreeNodeList(self.__nD, self.__pD) # ### ### # def __reload(self, urlTarget, scopDirPath, useCache=True, version=None): nD = pD = pdbD = {} pyVersion = sys.version_info[0] scopDomainPath = os.path.join(scopDirPath, "scop_domains-py%s.pic" % str(pyVersion)) self.__mU.mkdir(scopDirPath) # # scopDomainPath = os.path.join(scopDirPath, "scop_domains.json") # if useCache and self.__mU.exists(scopDomainPath): sD = self.__mU.doImport(scopDomainPath, fmt="pickle") logger.debug( "SCOPe name length %d parent length %d assignments %d", len(sD["names"]), len(sD["parents"]), len(sD["assignments"])) nD = sD["names"] pD = sD["parents"] pdbD = sD["assignments"] elif not useCache: ok = False minLen = 1000 logger.info( "Fetch SCOPe name and domain assignment data using target URL %s", urlTarget) desL, claL, hieL = self.__fetchFromSource(urlTarget, version=version) # nD = self.__extractDescription(desL) dmD = self.__extractAssignments(claL) pD = self.__extractHierarchy(hieL, nD) pdbD = self.__buildAssignments(dmD) logger.info("nD %d dmD %d pD %d", len(nD), len(dmD), len(pD)) scopD = {"names": nD, "parents": pD, "assignments": pdbD} if (len(nD) > minLen) and (len(pD) > minLen) and (len(pD) > minLen): ok = self.__mU.doExport(scopDomainPath, scopD, fmt="pickle") logger.debug("Cache save status %r", ok) # return nD, pD, pdbD def __fetchFromSource(self, urlTarget, version="2.07-2019-07-23"): """Fetch the classification names and domain assignments from SCOPe repo. 
# dir.des.scope.2.07-2019-03-07.txt dir.cla.scope.2.07-2019-03-07.txt dir.hie.scope.2.07-2019-03-07.txt """ encoding = "utf-8-sig" if sys.version_info[0] > 2 else "ascii" fn = "dir.des.scope.%s.txt" % version url = os.path.join(urlTarget, fn) desL = self.__mU.doImport(url, fmt="tdd", rowFormat="list", uncomment=True, encoding=encoding) logger.info("Fetched URL is %s len %d", url, len(desL)) # fn = "dir.cla.scope.%s.txt" % version url = os.path.join(urlTarget, fn) claL = self.__mU.doImport(url, fmt="tdd", rowFormat="list", uncomment=True, encoding=encoding) logger.info("Fetched URL is %s len %d", url, len(claL)) # fn = "dir.hie.scope.%s.txt" % version url = os.path.join(urlTarget, fn) hieL = self.__mU.doImport(url, fmt="tdd", rowFormat="list", uncomment=True, encoding=encoding) logger.info("Fetched URL is %s len %d", url, len(hieL)) # return desL, claL, hieL def __extractDescription(self, desL): """ From dir.des.scope.2.07-2019-03-07.txt: # dir.des.scope.txt # SCOPe release 2.07 (2018-03-02, last updated 2019-03-07) [File format version 1.02] # http://scop.berkeley.edu/ # Copyright (c) 1994-2019 the SCOP and SCOPe authors; see http://scop.berkeley.edu/about 46456 cl a - All alpha proteins 46457 cf a.1 - Globin-like 46458 sf a.1.1 - Globin-like 46459 fa a.1.1.1 - Truncated hemoglobin 46460 dm a.1.1.1 - Protozoan/bacterial hemoglobin 116748 sp a.1.1.1 - Bacillus subtilis [TaxId: 1423] 113449 px a.1.1.1 d1ux8a_ 1ux8 A: 46461 sp a.1.1.1 - Ciliate (Paramecium caudatum) [TaxId: 5885] 14982 px a.1.1.1 d1dlwa_ 1dlw A: 100068 px a.1.1.1 d1uvya_ 1uvy A: 46462 sp a.1.1.1 - Green alga (Chlamydomonas eugametos) [TaxId: 3054] 14983 px a.1.1.1 d1dlya_ 1dly A: 100067 px a.1.1.1 d1uvxa_ 1uvx A: 63437 sp a.1.1.1 - Mycobacterium tuberculosis, HbN [TaxId: 1773] 164742 px a.1.1.1 d2gkma_ 2gkm A: 164743 px a.1.1.1 d2gkmb_ 2gkm B: """ nD = {} for fields in desL: if fields[1] in ["cl", "cf", "sf", "fa", "dm"]: nD[int(fields[0])] = str(fields[4]).strip() logger.debug("Length of name dictionary %d", len(nD)) nD[0] = "root" if 0 not in nD else nD[0] return nD def __extractAssignments(self, claL): """ returns: aD[sunId] = [(), ... 
] From dir.cla.scope.2.07-2019-03-07.txt: # dir.cla.scope.txt # SCOPe release 2.07 (2018-03-02, last updated 2019-03-07) [File format version 1.02] # http://scop.berkeley.edu/ # Copyright (c) 1994-2019 the SCOP and SCOPe authors; see http://scop.berkeley.edu/about # old_sunId sccs sunid d1ux8a_ 1ux8 A: a.1.1.1 113449 cl=46456,cf=46457,sf=46458,fa=46459,dm=46460,sp=116748,px=113449 d1dlwa_ 1dlw A: a.1.1.1 14982 cl=46456,cf=46457,sf=46458,fa=46459,dm=46460,sp=46461,px=14982 d1uvya_ 1uvy A: a.1.1.1 100068 cl=46456,cf=46457,sf=46458,fa=46459,dm=46460,sp=46461,px=100068 d1dlya_ 1dly A: a.1.1.1 14983 cl=46456,cf=46457,sf=46458,fa=46459,dm=46460,sp=46462,px=14983 d1uvxa_ 1uvx A: a.1.1.1 100067 cl=46456,cf=46457,sf=46458,fa=46459,dm=46460,sp=46462,px=100067 d2gkma_ 2gkm A: a.1.1.1 164742 cl=46456,cf=46457,sf=46458,fa=46459,dm=46460,sp=63437,px=164742 d2gkmb_ 2gkm B: a.1.1.1 164743 cl=46456,cf=46457,sf=46458,fa=46459,dm=46460,sp=63437,px=164743 d2gl3a_ 2gl3 A: a.1.1.1 164754 cl=46456,cf=46457,sf=46458,fa=46459,dm=46460,sp=63437,px=164754 d2gl3b_ 2gl3 B: a.1.1.1 164755 cl=46456,cf=46457,sf=46458,fa=46459,dm=46460,sp=63437,px=164755 d1idra_ 1idr A: a.1.1.1 62301 cl=46456,cf=46457,sf=46458,fa=46459,dm=46460,sp=63437,px=62301 d1idrb_ 1idr B: a.1.1.1 62302 cl=46456,cf=46457,sf=46458,fa=46459,dm=46460,sp=63437,px=62302 d1rtea_ 1rte A: a.1.1.1 105096 cl=46456,cf=46457,sf=46458,fa=46459,dm=46460,sp=63437,px=105096 """ dmD = {} logger.info("Length of class list %d", len(claL)) rng = rngL = tL = None for fields in claL: try: rngL = str(fields[2]).strip().split(",") # dmTupL = [(tt[0], tt[1]) for tt in for rng.split(":") in rngL] # dmTupL = [] for rng in rngL: tL = [t for t in str(rng).strip().split(":") if len(t)] if len(tL) > 1: rL = tL[1].split("-") tt = (tL[0], rL[0], rL[1]) else: tt = (tL[0], None, None) dmTupL.append(tt) # # Get the sid of the domain - # sfL = str(fields[5]).strip().split(",") dmfL = sfL[4].split("=") dmf = int(dmfL[1]) # old domid sccs sunid for domain assignment dmD[int(fields[4])] = (fields[1], dmTupL, fields[0], fields[3], dmf) # except Exception as e: logger.exception( "Failing fields %r rngL %r rng %r tL %r with %s", fields, rngL, rng, tL, str(e)) # # logger.info("Length of domain assignments %d", len(dmD)) return dmD def __buildAssignments(self, dmD): """ Input internal data structure with domain assignments - dmD[sunId] = (pdbId, [(authAsymId, begRes, endRes), ...], domain_name, sccs, sid_domain_assigned) Returns: aD[(pdbId, authAsymId)] = [(domSunId, domainId, sccs, (authAsymId, resBeg, resEnd))] """ pdbD = {} for _, dTup in dmD.items(): for rTup in dTup[1]: pdbD.setdefault((dTup[0], rTup[0]), []).append( (dTup[4], dTup[2], dTup[3], rTup)) return pdbD def __extractHierarchy(self, hieL, nD): """ From dir.hie.scope.2.07-2019-03-07.txt: # dir.hie.scope.txt # SCOPe release 2.07 (2018-03-02, last updated 2019-03-07) [File format version 1.01] # http://scop.berkeley.edu/ # Copyright (c) 1994-2019 the SCOP and SCOPe authors; see http://scop.berkeley.edu/about 0 - 46456,48724,51349,53931,56572,56835,56992,57942,58117,58231,58788,310555 46456 0 46457,46556,46625,46688,46928,46954,46965,46996,47004,47013,47026,47039,47044,47049,47054,47059,47071,...,... 
46457 46456 46458,46548 46458 46457 46459,46463,46532,74660,191420 46459 46458 46460,190322 """ pD = {} logger.debug("Length of input hierarchy list %d", len(hieL)) for fields in hieL: chId = int(fields[0]) # if chId not in nD: continue pId = int(fields[1]) if fields[1].isdigit() else None pD[chId] = pId # logger.info("Length of domain parent dictionary %d", len(pD)) return pD def __exportTreeNodeList(self, nD, pD): """Create node list from the SCOPe (sunid) parent and name/description dictionaries. Exclude the root node from the tree. """ # rootId = 0 pL = [rootId] logger.info("nD %d pD %d", len(nD), len(pD)) # create child dictionary cD = {} for ctId, ptId in pD.items(): cD.setdefault(ptId, []).append(ctId) # logger.debug("cD %d", len(cD)) # idL = [] for rootId in sorted(pL): visited = set([rootId]) queue = collections.deque(visited) while queue: tId = queue.popleft() idL.append(tId) if tId not in cD: # logger.warning("No children for scop tId %r", tId) continue for childId in cD[tId]: if childId not in visited: queue.append(childId) visited.add(childId) # dL = [] for tId in idL: displayName = nD[tId] if tId in nD else None ptId = pD[tId] if tId in pD else None lL = self.getIdLineage(tId)[1:] # # d = {'id': str(tId), 'name': displayName, 'lineage': [str(t) for t in lL], 'parents': [str(ptId)], 'depth': len(lL)} if tId == rootId: continue elif ptId == rootId: dD = {"id": str(tId), "name": displayName, "depth": 0} else: dD = { "id": str(tId), "name": displayName, "parents": [str(ptId)], "depth": len(lL) } dL.append(dD) return dL
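# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module): load (or fetch
# and cache) the SCOPe classification data and query the domain assignments
# for an entry chain. The PDB ID and chain below come from the sample records
# in the docstrings above; the first pass with an empty cache downloads and
# parses the SCOPe flat files.

if __name__ == "__main__":
    scopP = ScopClassificationProvider(cachePath=".", useCache=True)
    if scopP.testCache():
        pdbId, authAsymId = "1ux8", "A"  # example identifiers
        for sunId in scopP.getScopSunIds(pdbId, authAsymId):
            # Walk the classification names from class down to this sunId
            print(sunId, scopP.getNameLineage(sunId))
        # (domSunId, domainId, sccs, authAsymId, resBeg, resEnd) tuples
        print(scopP.getScopResidueRanges(pdbId, authAsymId))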
class EntryInfoProvider(StashableBase): """Accessors (only) for entry-level annotations.""" def __init__(self, **kwargs): # self.__version = "0.50" cachePath = kwargs.get("cachePath", ".") useCache = kwargs.get("useCache", True) self.__dirName = "rcsb_entry_info" self.__dirPath = os.path.join(cachePath, self.__dirName) super(EntryInfoProvider, self).__init__(cachePath, [self.__dirName]) # self.__mU = MarshalUtil(workPath=self.__dirPath) self.__entryInfoD = self.__reload(fmt="json", useCache=useCache) # def testCache(self, minCount=1): if minCount == 0: return True if self.__entryInfoD and minCount and "entryInfo" in self.__entryInfoD and len( self.__entryInfoD["entryInfo"]) > minCount: logger.info("Entry annotations for (%d) entries", len(self.__entryInfoD["entryInfo"])) return True return False def getEntryInfo(self, entryId): """Return a dictionary of entry-level annotations. Returns: (dict): dictionary of entry-level annotations """ try: return self.__entryInfoD["entryInfo"][entryId.upper()] if entryId.upper() in self.__entryInfoD["entryInfo"] else {} except Exception as e: logger.error("Failing with %r", str(e)) return {} def getEntriesByPolymerEntityCount(self, count): oL = [] try: for entryId, eD in self.__entryInfoD["entryInfo"].items(): if eD["polymer_entity_count"] == count: oL.append(entryId) except Exception as e: logger.error("Failing with %r", str(e)) return oL def __getEntryInfoFilePath(self, fmt="json"): baseFileName = "entry_info_details" fExt = ".json" if fmt == "json" else ".pic" fp = os.path.join(self.__dirPath, baseFileName + fExt) return fp def reload(self): """Reload from the current cache file.""" ok = False try: self.__entryInfoD = self.__reload(fmt="json", useCache=True) ok = self.__entryInfoD is not None except Exception as e: logger.exception("Failing with %s", str(e)) return ok def __reload(self, fmt="json", useCache=True): entryInfoFilePath = self.__getEntryInfoFilePath(fmt=fmt) tS = time.strftime("%Y %m %d %H:%M:%S", time.localtime()) pcD = {"version": self.__version, "created": tS, "entryInfo": {}} if useCache and self.__mU.exists(entryInfoFilePath): logger.info("Reading entry-info cached path %r", entryInfoFilePath) pcD = self.__mU.doImport(entryInfoFilePath, fmt=fmt) return pcD
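# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module): read the cached
# entry-level annotations and select entries by polymer entity count. This
# assumes a previously populated cache file under <cachePath>/rcsb_entry_info;
# "4HHB" is an example entry identifier.

if __name__ == "__main__":
    eiP = EntryInfoProvider(cachePath=".", useCache=True)
    if eiP.testCache(minCount=1):
        print(eiP.getEntryInfo("4HHB"))  # example entry ID
        monomers = eiP.getEntriesByPolymerEntityCount(count=1)
        print("entries with a single polymer entity:", len(monomers))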
class OeMoleculeProvider(object): """Utilities to build and deliver OE molecule databases from PDB chemical component definition data""" def __init__(self, **kwargs): """Utilities to build and deliver OE molecule databases from PDB chemical component definition data Args: cachePath (str, optional): path to the directory containing cache files (default: '.') molBuildType (str, optional): data source for building OE molecules (default: "model-xyz") oeFileNamePrefix (str, optional): file name prefix for all generated databases (default: "oe") """ # Database file names will be prefixed with the base prefix plus the molecular build type and perception options oeFileNamePrefixBase = kwargs.get("oeFileNamePrefix", "oe") limitPerceptions = kwargs.get("limitPerceptions", False) molBuildType = kwargs.get("molBuildType", "model-xyz") if limitPerceptions and molBuildType in [ "oe-smiles", "oe-iso-smiles", "inchi" ]: self.__oeFileNamePrefix = oeFileNamePrefixBase + "-" + molBuildType + "-limit" else: self.__oeFileNamePrefix = oeFileNamePrefixBase + "-" + molBuildType # cachePath = kwargs.get("cachePath", ".") self.__dirPath = os.path.join(cachePath, "oe_mol") # self.__fpDbD = {} self.__ssDb = None self.__oeMolD = {} self.__oeMolDb = None self.__oeMolDbTitleD = None # self.__mU = MarshalUtil(workPath=self.__dirPath) self.__molCount = self.__reload(**kwargs) def testCache(self): return self.__mU.exists( os.path.join(self.__dirPath, self.__getOeMolFileName())) and self.__mU.exists( os.path.join(self.__dirPath, self.__getOeMolDbFileName())) def getSubSearchDb(self, screenType="SMARTS", numProc=1, forceRefresh=False): if not self.__ssDb or forceRefresh: oeIo = OeIoUtils() fp = os.path.join(self.__dirPath, self.__getSubSearchFileName(screenType)) logger.info("Opening screened substructure search database %r", fp) self.__ssDb = oeIo.loadOeSubSearchDatabase(fp, screenType, numProc=numProc) return self.__ssDb def getFingerPrintDb(self, fpType, fpDbType="STANDARD", rebuild=False): if fpType not in self.__fpDbD or rebuild: oeIo = OeIoUtils() fastFpDbPath = os.path.join(self.__dirPath, self.__getFastFpDbFileName(fpType)) oeMolDbFilePath = os.path.join(self.__dirPath, self.__getOeMolDbFileName()) fpDb = oeIo.loadOeFingerPrintDatabase(oeMolDbFilePath, fastFpDbPath, inMemory=True, fpType=fpType, fpDbType=fpDbType) if fpDb: self.__fpDbD[fpType] = fpDb # return self.__fpDbD[fpType] def __getOeMolDbTitleIndex(self): oeMolDbTitleD = {} try: for idx in range(self.__oeMolDb.GetMaxMolIdx()): oeMolDbTitleD[self.__oeMolDb.GetTitle(idx)] = idx except Exception as e: logger.exception("Failing with %s", str(e)) return oeMolDbTitleD def getOeMolDatabase(self): if not self.__oeMolDb: oeIo = OeIoUtils() self.__oeMolDb = oeIo.loadOeBinaryDatabaseAndIndex( os.path.join(self.__dirPath, self.__getOeMolDbFileName())) self.__oeMolDbTitleD = self.__getOeMolDbTitleIndex() return self.__oeMolDb, self.__oeMolDbTitleD def getOeMolD(self): try: if not self.__oeMolD: oeIo = OeIoUtils() self.__oeMolD = oeIo.readOeBinaryMolCache( os.path.join(self.__dirPath, self.__getOeMolFileName())) logger.info("Loading OE binary molecule cache length %d", len(self.__oeMolD)) return self.__oeMolD except Exception as e: logger.exception("Failing with %s", str(e)) return None def getMol(self, ccId): try: if not self.__oeMolD: oeIo = OeIoUtils() self.__oeMolD = oeIo.readOeBinaryMolCache( os.path.join(self.__dirPath, self.__getOeMolFileName())) logger.info("Loading OE binary molecule cache length %d", len(self.__oeMolD)) return self.__oeMolD[ccId] except Exception as e:
logger.exception("Get molecule %r failing with %s", ccId, str(e)) return None def __getFastFpDbFileName(self, fpType): return "%s-fast-fp-database-%s.fpbin" % (self.__oeFileNamePrefix, fpType) def __getSubSearchFileName(self, screenType): return "%s-ss-database-%s.oeb" % (self.__oeFileNamePrefix, screenType) def __getOeMolDbFileName(self): return "%s-mol-db-components.oeb" % self.__oeFileNamePrefix def __getOeMolFileName(self): return "%s-mol-components.oeb" % self.__oeFileNamePrefix def __reload(self, **kwargs): """Reload the dictionary of OE molecules and related data artifacts for chemical component definitions. Args: molBuildType (str): coordinates to use in building OE molecules from CIF components (model, ideal or None) limitPerceptions(bool): process input descriptors in essentially verbatim mode (default: True) fpTypeList (list): fingerprint type (TREE,PATH,MACCS,CIRCULAR,LINGO) screenTypeList (list): fast sub search screen type (MOLECULE, SMARTS, MDL, ... ) useCache (bool, optional): flag to use cached files. Defaults to True. cachePath (str): path to the top cache directory. Defaults to '.'. numProc (int): number processors to engage in screen substructure search database generation. molLimit (int, optional): limiting number of molecules in data store (default: 0 no limit) suppressHydrogens (bool, optional): flag to suppress explicit hydrogens in the OE data store. Returns: (dict): dictionary of constructed OE molecules """ useCache = kwargs.get("useCache", True) cachePath = kwargs.get("cachePath", ".") numProc = kwargs.get("numProc", 2) molLimit = kwargs.get("molLimit", 0) fpTypeList = kwargs.get("fpTypeList", ["TREE", "PATH", "MACCS", "CIRCULAR", "LINGO"]) # screenTypeList = kwargs.get("screenTypeList", ["SMARTS"]) screenTypeList = kwargs.get("screenTypeList", []) molBuildType = kwargs.get("molBuildType", "model-xyz") limitPerceptions = kwargs.get("limitPerceptions", False) quietFlag = kwargs.get("quietFlag", True) suppressHydrogens = kwargs.get("suppressHydrogens", False) logSizes = kwargs.get("logSizes", False) fpDbType = "STANDARD" # ccCount = 0 oeCount = 0 errCount = 0 failIdList = [] oeIo = OeIoUtils(quietFlag=quietFlag) # -------- oeMolFilePath = os.path.join(self.__dirPath, self.__getOeMolFileName()) if not useCache or (useCache and not self.__mU.exists(oeMolFilePath)): cmpKwargs = { k: v for k, v in kwargs.items() if k not in ["cachePath", "useCache", "molLimit"] } ccmP = ChemCompMoleculeProvider(cachePath=cachePath, useCache=True, molLimit=molLimit, **cmpKwargs) ok = ccmP.testCache(minCount=molLimit, logSizes=logSizes) ccObjD = ccmP.getMolD() if ok else {} ccCount = len(ccObjD) # ------- startTime = time.time() oeCount, errCount, failIdList = oeIo.buildOeBinaryMolCache( oeMolFilePath, ccObjD, molBuildType=molBuildType, quietFlag=quietFlag, fpTypeList=fpTypeList, limitPerceptions=limitPerceptions, suppressHydrogens=suppressHydrogens) logger.info( "Stored %d/%d OeMols (suppressH = %r) created with molBuildType %r (unconverted %d)", oeCount, ccCount, suppressHydrogens, molBuildType, errCount) if failIdList: logger.info("%r failures %r", molBuildType, failIdList) endTime = time.time() logger.info("Constructed %d/%d cached oeMols (%.4f seconds)", oeCount, ccCount, endTime - startTime) # -------- oeMolDbFilePath = os.path.join(self.__dirPath, self.__getOeMolDbFileName()) if not useCache or (useCache and not self.__mU.exists(oeMolDbFilePath)): startTime = time.time() molCount = oeIo.createOeBinaryDatabaseAndIndex( oeMolFilePath, oeMolDbFilePath) endTime = time.time() 
logger.info( "Created and stored %d indexed OeMols in OE database format (%.4f seconds)", molCount, endTime - startTime) # -------- if fpDbType == "FAST": for fpType in fpTypeList: startTime = time.time() # Fast FP search database file names fpPath = os.path.join(self.__dirPath, self.__getFastFpDbFileName(fpType)) if not useCache or (useCache and not self.__mU.exists(fpPath)): ok = oeIo.createOeFingerPrintDatabase(oeMolDbFilePath, fpPath, fpType=fpType) endTime = time.time() logger.info( "Created and stored %s fingerprint database (%.4f seconds)", fpType, endTime - startTime) # -------- if molBuildType in ["oe-iso-smiles"]: for screenType in screenTypeList: startTime = time.time() fp = os.path.join(self.__dirPath, self.__getSubSearchFileName(screenType)) if not useCache or (useCache and not self.__mU.exists(fp)): ok = oeIo.createOeSubSearchDatabase(oeMolFilePath, fp, screenType=screenType, numProc=numProc) endTime = time.time() logger.info( "Constructed screened substructure database (status %r) with screenType %s (%.4f seconds)", ok, screenType, endTime - startTime) # --------- ssDb = oeIo.loadOeSubSearchDatabase(fp, screenType=screenType, numProc=numProc) ok = ssDb.NumMolecules() == oeCount # ---------- return oeCount
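# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module): build (or load
# from cache) the OE molecule store from chemical component definitions, fetch
# one molecule, and load a fingerprint database. Requires the OpenEye toolkits
# and, on the first pass, network access to the component dictionary; "ATP" is
# an example chemical component ID.

if __name__ == "__main__":
    oemP = OeMoleculeProvider(cachePath=".", molBuildType="model-xyz", useCache=True)
    if oemP.testCache():
        oeMol = oemP.getMol("ATP")
        if oeMol:
            print("ATP atom count", oeMol.NumAtoms())
        # Default fpDbType="STANDARD" builds an in-memory OEFPDatabase
        fpDb = oemP.getFingerPrintDb(fpType="TREE")
        print("fingerprints", fpDb.NumFingerPrints() if fpDb else 0)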
class OeIoUtils(object): """Utility methods to manage OE specific IO and format conversion operations.""" def __init__(self, **kwargs): self.__dirPath = kwargs.get("dirPath", ".") self.__mU = MarshalUtil(workPath=self.__dirPath) self.__oeErrorLevel = oechem.OEErrorLevel_Info if kwargs.get("quietFlag", False): self.setQuiet() # def setQuiet(self): """Suppress OE warnings and processing errors""" oechem.OEThrow.SetLevel(oechem.OEErrorLevel_Quiet) self.__oeErrorLevel = oechem.OEErrorLevel_Quiet def getComponentDefinitions(self, ccdFilePath): rdCcObjL = [] try: rdCcObjL = self.__mU.doImport(ccdFilePath, fmt="mmcif") logger.info("Read %s with %d definitions", ccdFilePath, len(rdCcObjL)) except Exception as e: logger.exception("Loading %s failing with %s", ccdFilePath, str(e)) return rdCcObjL def suppressHydrogens(self, oeMol): tMol = oechem.OEMol(oeMol) if oeMol else None if tMol: oechem.OESuppressHydrogens(tMol) return tMol def chemCompToMol(self, ccdFilePath, molBuildType="model-xyz", quietFlag=False): retMolL = [] try: rdCcObjL = self.__mU.doImport(ccdFilePath, fmt="mmcif") logger.info("Read %s with %d definitions", ccdFilePath, len(rdCcObjL)) oemf = OeMoleculeFactory() if quietFlag: oemf.setQuiet() for ccObj in rdCcObjL: ccId = oemf.setChemCompDef(ccObj) if ccId: ok = oemf.build(molBuildType=molBuildType) if ok: oeMol = oemf.getMol() retMolL.append(oeMol) except Exception as e: logger.exception("Loading %s failing with %s", ccdFilePath, str(e)) return retMolL def descriptorToSmiles(self, descr, descrType, limitPerceptions=False, messageTag=None): """Parse the input descriptor string and return an OE smiles. Args: descr (str): descriptor descrType (str): descriptor type limitPerceptions (bool): flag to limit the perceptions/transformations of input descriptor messageTag (str, optional): prefix string for error messages. Defaults to None. Returns: str: SMILES string """ try: if "SMILES" in descrType.upper() and "ISO" in descrType.upper(): oeMol = self.smilesToMol(descr, limitPerceptions=limitPerceptions, messageTag=messageTag) if oeMol: return oechem.OECreateIsoSmiString(oeMol) else: return None if "SMILES" in descrType.upper(): oeMol = self.smilesToMol(descr, limitPerceptions=limitPerceptions, messageTag=messageTag) if oeMol: return oechem.OECreateCanSmiString(oeMol) else: return None elif "INCHI" in descrType.upper(): oeMol = self.inchiToMol(descr, limitPerceptions=limitPerceptions, messageTag=messageTag) if oeMol: return oechem.OECreateIsoSmiString(oeMol) else: return None except Exception as e: logger.exception("Failing with %s", str(e)) return None def descriptorToMol(self, descr, descrType, limitPerceptions=False, messageTag=None): """Parse the input descriptor string and return a molecule object (OeGraphMol/OeQMol). Args: descr (str): descriptor descrType (str): descriptor type limitPerceptions (bool): flag to limit the perceptions/transformations of input descriptor messageTag (str, optional): prefix string for error messages. Defaults to None.
Returns: object: OeGraphMol()/OeQmol() object or None for failure """ try: if "SMILES" in descrType.upper() and "ISO" in descrType.upper(): oeMol = self.smilesToMol(descr, limitPerceptions=limitPerceptions, messageTag=messageTag) if oeMol: isoSmiles = oechem.OECreateIsoSmiString(oeMol) return self.smilesToMol(isoSmiles, limitPerceptions=limitPerceptions, messageTag=messageTag) else: return None if "SMILES" in descrType.upper(): oeMol = self.smilesToMol(descr, limitPerceptions=limitPerceptions, messageTag=messageTag) if oeMol: smiles = oechem.OECreateCanSmiString(oeMol) return self.smilesToMol(smiles, limitPerceptions=limitPerceptions, messageTag=messageTag) else: return None elif "INCHI" in descrType.upper(): oeMol = self.inchiToMol(descr, limitPerceptions=limitPerceptions, messageTag=messageTag) if oeMol: isoSmiles = oechem.OECreateIsoSmiString(oeMol) return self.smilesToMol(isoSmiles, limitPerceptions=limitPerceptions, messageTag=messageTag) elif "SMARTS" in descrType.upper(): return self.smartsToQmol(descr, messageTag=messageTag) else: return None except Exception as e: logger.exception("Failing with %s", str(e)) return None def smilesToMol(self, smiles, limitPerceptions=False, messageTag=None): """Parse the input SMILES string and return a molecule object (OeGraphMol). Args: smiles (str): SMILES string limitPerceptions (bool): flag to limit the perceptions/transformations of input SMILES Returns: object: OeGraphMol() object or None for failure """ try: label = messageTag if messageTag else "" mol = oechem.OEGraphMol() smiles = smiles.strip() if limitPerceptions: # convert the SMILES string into a molecule if oechem.OEParseSmiles(mol, smiles, False, False): return mol else: logger.debug( "%s parsing failed for input SMILES string %s", label, smiles) logger.error("%s parsing failed for input SMILES string", label) else: if oechem.OESmilesToMol(mol, smiles): return mol else: logger.debug( "%s converting failed for input SMILES string %s", label, smiles) logger.error( "%s converting failed for input SMILES string", label) except Exception as e: logger.exception("Failing with %s", str(e)) return None def inchiToMol(self, inchi, limitPerceptions=False, messageTag=None): """Parse the input InChI string and return a molecule object (OeGraphMol). Args: inchi (str): InChI string Returns: object: OeGraphMol() object or None for failure """ try: label = messageTag if messageTag else "" mol = oechem.OEGraphMol() inchi = inchi.strip() if limitPerceptions: if oechem.OEParseInChI(mol, inchi): return mol else: logger.debug("%s parsing failed for InChI string %r", label, inchi) logger.error("%s parsing failed for InChI string", label) else: if oechem.OEInChIToMol(mol, inchi): return mol else: logger.debug("%s converting failed for InChI string %r", label, inchi) logger.error("%s converting failed for InChI string", label) except Exception as e: logger.exception("Failing with %s", str(e)) return None def descriptorToQMol(self, descr, descrType, limitPerceptions=False, messageTag=None): """Parse the input descriptor string and return a query molecule object (OeQMol). Args: descr (str): descriptor descrType (str): descriptor type limitPerceptions (bool): flag to limit the perceptions/transformations of input descriptor messageTag (str, optional): prefix string for error messages. Defaults to None.
Returns: object: OeQmol() object or None for failure """ oeQMol = label = None try: label = messageTag if messageTag else "" tMol = self.descriptorToMol(descr, descrType, limitPerceptions=limitPerceptions, messageTag=messageTag) if tMol: oeQMol = oechem.OEQMol(tMol) except Exception as e: logger.error("%s Failing with %s", label, str(e)) return oeQMol if oeQMol else None def smartsToQmol(self, smarts, messageTag=None): """Parse the input SMARTS query string and return a query molecule object (OeQMol). Args: smarts (str): SMARTS query string Returns: object : OeQMol() object or None for failure """ try: label = messageTag if messageTag else "" qmol = oechem.OEQMol() if oechem.OEParseSmarts(qmol, smarts): return qmol else: logger.debug("%s parsing failed for SMARTS string %s", label, smarts) logger.error("%s parsing failed for SMARTS string", label) except Exception as e: logger.exception("Failing with %s", str(e)) return None def fileToMols(self, filePath, use3D=False, largestPart=False): """Parse the input path returning a list of molecule objects (OeGraphMol). Args: filePath (str): file path must have a standard recognized extension ('mol', 'sdf', 'smi', 'oeb'). Returns: list : list of OeGraphMol() objects """ mL = [] oemf = OeMoleculeFactory() try: ifs = oechem.oemolistream() if ifs.open(filePath): for tMol in ifs.GetOEGraphMols(): oeMol = oechem.OEGraphMol(tMol) # if oechem.OEReadMolecule(ifs, oeMol): if largestPart: molL = oemf.getParts(oeMol) if len(molL) > 0: oeMol = molL[0] logger.info( "Using largest bonded molecule part (%d/%d)", len(molL), oeMol.NumAtoms()) if use3D: mL.append( oemf.updateOePerceptions3D( oeMol, aromaticModel=oechem.OEAroModelOpenEye)) else: mL.append( oemf.updateOePerceptions2D( oeMol, aromaticModel=oechem.OEAroModelOpenEye)) except Exception as e: logger.exception("Failing with %s", str(e)) return mL def stringToMols(self, txt, sType="mol2", use3D=False): """Parse the input string as input format type (sType) returning a list of molecule objects (OeGraphMol) Args: txt (str): string text of molecule data sType (str, optional): string data format (mol2, sdf, smiles). Defaults to "mol2". Returns: list: list of OeGraphMol() objects """ # mL = [] oemf = OeMoleculeFactory() try: if sType not in ["mol2", "sdf", "smiles"]: logger.error("Unsupported string data format") return None fD = { "mol2": oechem.OEFormat_MOL2, "sdf": oechem.OEFormat_SDF, "smiles": oechem.OEFormat_SMI } ifs = oechem.oemolistream() ifs.SetFormat(fD[sType]) if not ifs.openstring(txt): logger.error("Unable to open string data for molecule reader") return None for tMol in ifs.GetOEGraphMols(): oeMol = oechem.OEGraphMol(tMol) if use3D: mL.append( oemf.updateOePerceptions3D( oeMol, aromaticModel=oechem.OEAroModelOpenEye)) else: mL.append( oemf.updateOePerceptions2D( oeMol, aromaticModel=oechem.OEAroModelOpenEye)) except Exception as e: logger.exception("Failing with %s", str(e)) return mL def readOeBinaryMolCache(self, filePath): """Return a dictionary of OeGraphMol() objects read from the cached binary file. Args: filePath (str): file path for the binary OeMol cache Returns: dict: dictionary of OeGraphMol()'s {<ccId>: OeGraphMol(), ...
} """ retD = {} startTime = time.time() try: ifs = oechem.oemolistream() if ifs.open(filePath): for oeMol in ifs.GetOEGraphMols(): tMol = oechem.OEGraphMol(oeMol) retD[tMol.GetTitle()] = tMol except Exception as e: logger.exception("Failing with %s", str(e)) endTime = time.time() logger.info("Completed operation at %s (%.4f seconds)", time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - startTime) return retD def createOeFingerPrintDatabase(self, oeMolDbFilePath, oeFpDbFilePath, fpType="TREE", dbType="FAST"): if dbType == "FAST": return self.__createOeFastFingerPrintDatabase(oeMolDbFilePath, oeFpDbFilePath, fpType=fpType) else: return True def __createOeFastFingerPrintDatabase(self, oeMolDbFilePath, oeFpDbFilePath, fpType="TREE"): """Create fast search fingerprint database from the input molecular database. Args: oeMolDbFilePath (str): path to the input molecular database oeFpDbFilePath (str): path to the output fingerprint database fpType (str): finger print type Returns: bool: True for success or False otherwise Supports: OEFPType_Circular OEFPType_Path OEFPType_Tree Not currently supported by OE fp search - OEFPType_MACCS166 OEFPType_Lingo """ startTime = time.time() ok = False try: _ = fpType fpD = { "TREE": oegraphsim.OEFPType_Tree, "CIRCULAR": oegraphsim.OEFPType_Circular, "PATH": oegraphsim.OEFPType_Path } myFpType = fpD[ fpType] if fpType in fpD else oegraphsim.OEFPType_Tree opts = oegraphsim.OECreateFastFPDatabaseOptions( oegraphsim.OEGetFPType(myFpType)) ok = oegraphsim.OECreateFastFPDatabaseFile(oeFpDbFilePath, oeMolDbFilePath, opts) except Exception as e: logger.exception("Failing with %s", str(e)) endTime = time.time() logger.info("Completed operation at %s (%.4f seconds)", time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - startTime) return ok def loadOeFingerPrintDatabase(self, oeMolDbFilePath, oeFpDbFilePath, inMemory=False, fpType="TREE", fpDbType="FAST"): if fpDbType == "FAST": return self.__loadOeFastFingerPrintDatabase(oeFpDbFilePath, inMemory=inMemory, fpType=fpType) else: return self.__loadOeFingerPrintDatabase(oeMolDbFilePath, fpType=fpType) def __loadOeFingerPrintDatabase(self, oeMolDbFilePath, fpType="TREE"): """Create conventional search fingerprint database from the input molecular database. 
Args: oeMolDbFilePath (str): path to the input molecular database oeFpDbFilePath (str): path to the output fingerprint database fpType (str): finger print type Returns: bool: True for success or False otherwise Supports: OEFPType_Circular OEFPType_Path OEFPType_Tree OEFPType_MACCS166 OEFPType_Lingo """ fpDb = None ok = False startTime = time.time() try: fpD = { "TREE": oegraphsim.OEFPType_Tree, "CIRCULAR": oegraphsim.OEFPType_Circular, "PATH": oegraphsim.OEFPType_Path, "MACCS": oegraphsim.OEFPType_MACCS166, "LINGO": oegraphsim.OEFPType_Lingo, } fpType = fpType if fpType and fpType in fpD else "TREE" tag = "FP_" + fpType oeFpType = fpD[ fpType] if fpType in fpD else oegraphsim.OEFPType_Tree oeMolDb = self.loadOeBinaryDatabaseAndIndex(oeMolDbFilePath) # fpDb = oegraphsim.OEFPDatabase(oeFpType) numMols = oeMolDb.GetMaxMolIdx() logger.debug("fpType %r tag %r oeFpType %r", fpType, tag, oeFpType) oeMol = oechem.OEGraphMol() for idx in range(0, numMols): if oeMolDb.GetMolecule(oeMol, idx): if oeMol.HasData(tag): tfp = oeMol.GetData(tag) fpDb.AddFP(tfp) else: fpDb.AddFP(oeMol) else: logger.info("Missing molecule at index %r", idx) numFp = fpDb.NumFingerPrints() ok = numMols == numFp logger.info( "Loaded molecules %d %s fingerprints %d (%.4f seconds)", numMols, fpType, numFp, time.time() - startTime) except Exception as e: logger.exception("Failing with %s", str(e)) fpDb = None endTime = time.time() logger.debug("Completed with status %r operation at %s (%.4f seconds)", ok, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - startTime) return fpDb def __loadOeFastFingerPrintDatabase(self, oeFpDbFilePath, inMemory=False, fpType="TREE"): # _ = fpType startTime = time.time() if inMemory: memType = oegraphsim.OEFastFPDatabaseMemoryType_InMemory else: memType = oegraphsim.OEFastFPDatabaseMemoryType_MemoryMapped if not self.__mU.exists(oeFpDbFilePath): logger.error("Missing fingerprint database file %r", oeFpDbFilePath) fpDb = oegraphsim.OEFastFPDatabase(oeFpDbFilePath, memType) if not fpDb.IsValid(): logger.error("Cannot open fingerprint database %r", oeFpDbFilePath) # lenFp = fpDb.NumFingerPrints() memTypeStr = fpDb.GetMemoryTypeString() endTime = time.time() logger.info( "Read fingerprint database length %d loaded %s (%.4f seconds)", lenFp, memTypeStr, endTime - startTime) return fpDb def loadOeBinaryDatabaseAndIndex(self, oeMolDbFilePath): molDb = None try: molDb = oechem.OEMolDatabase() if not molDb.Open(oeMolDbFilePath): logger.error("Unable to open %r", oeMolDbFilePath) molCount = molDb.NumMols() logger.info("Loaded OE database file containing %d molecules", molCount) except Exception as e: logger.exception("Loading %r failing with %s", oeMolDbFilePath, str(e)) return molDb def createOeBinaryDatabaseAndIndex(self, oebMolFilePath, oeMolDbFilePath): """Create OE binary database file and associated index from the input serial binary data file. Args: oebMolFilePath (str): input OeMol stream binary file path oeMolDbFilePath (str): output OeMolDatabase file path Returns: int: number of molecules processed in the database. 
""" molCount = 0 try: startTime = time.time() moldb = oechem.OEMolDatabase() if not moldb.Open(oebMolFilePath): logger.error("Read fails for %r", oebMolFilePath) return molCount # logger.info( "Opened database in format %r num mols %d max index %d", moldb.GetFormat(), moldb.NumMols(), moldb.GetMaxMolIdx()) moldb.Save(oeMolDbFilePath) tL = list(moldb.GetTitles()) logger.info("First and last titles: %r %r", tL[0], tL[-1]) molCount = moldb.NumMols() endTime = time.time() logger.info("Completed operation at %s (%.4f seconds)", time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - startTime) except Exception as e: logger.exception("Failing with %s", str(e)) return molCount def buildOeBinaryMolCache(self, filePath, ccObjD, molBuildType="model-xyz", quietFlag=False, fpTypeList=None, limitPerceptions=False, suppressHydrogens=False): """Build cache of OEMol() objects from the input chemical component definition list. Args: filePath (str): output cache file path ccObjD (dict): chemical component object dictionary molBuildType (str, optional): [description]. Defaults to "model-xyz". quietFlag (bool, optional): [description]. Defaults to False. fpTypeList (list, optional): fingerprint type list. Defaults to None. limitPerceptions (bool, optional): suppress automatic chemical perceptions. Defaults to False. suppressHydrogens (bool, optional): suppress explicit hydrogen count. Defaults to False. Returns: (int, int, list): chem comp success count, error count, chem comp identifier failure list """ ok = False startTime = time.time() failIdList = [] ccCount = 0 errCount = 0 try: ofs = oechem.oemolostream() ofs.SetFormat(oechem.OEFormat_OEB) if ofs.open(filePath): oemf = OeMoleculeFactory() if quietFlag: oemf.setQuiet() for ccId, ccObj in ccObjD.items(): tId = oemf.setChemCompDef(ccObj) if tId and tId == ccId: ok = oemf.build(molBuildType=molBuildType, limitPerceptions=limitPerceptions) if ok and fpTypeList: fpOk = oemf.addFingerPrints(fpTypeList) if not fpOk: logger.info( "Fingerprint generation fails for %r", ccId) if ok: oeMol = oemf.getMol( suppressHydrogens=suppressHydrogens) oechem.OEWriteMolecule(ofs, oeMol) ccCount += 1 if not ok or not tId: # build failed incomplete component (e.g. missing atoms or bonds) errCount += 1 failIdList.append(ccId) else: logger.error("Unable to open cache database %s", filePath) errCount += 1 except Exception as e: logger.exception("Failing with %s", str(e)) # endTime = time.time() logger.info("Completed operation at %s (%.4f seconds)", time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - startTime) return ccCount, errCount, failIdList # def buildOeBinaryMolCacheFromIndex(self, filePath, ccIdxD, quietFlag=False, fpTypeList=None, limitPerceptions=False, suppressHydrogens=False): """Build cache of OEGraphMol() objects from the input chemical component search index. Args: filePath (str): output cache file path ccIdxD (dict): search index dictionary quietFlag (bool, optional): suppress OE output. Defaults to False. fpTypeList (list, optional): list of fingerprint types. Defaults to None. limitPerceptions (bool, optional): suppress automatic chemical perceptions. Defaults to False. suppressHydrogens (bool, optional): suppress explicit hydrogen count. Defaults to False. 
Returns: (int, int, list): chem comp success count, error count, chem comp identifier failure list """ failIdList = [] ccCount = 0 errCount = 0 startTime = time.time() try: ofs = oechem.oemolostream() ofs.SetFormat(oechem.OEFormat_OEB) if ofs.open(filePath): oemf = OeMoleculeFactory() if quietFlag: oemf.setQuiet() for searchCcId, ccIdx in ccIdxD.items(): oemf.setDescriptor(ccIdx["smiles"], "oe-iso-smiles", searchCcId) ok = oemf.build(molBuildType="oe-iso-smiles", limitPerceptions=limitPerceptions) if ok and fpTypeList: fpOk = oemf.addFingerPrints(fpTypeList) if not fpOk: logger.info("Fingerprint generation fails for %r", searchCcId) if ok: if not suppressHydrogens: oemf.addExplicitHydrogens() oemf.setSimpleAtomNames() oeMol = oemf.getMol( suppressHydrogens=suppressHydrogens) oechem.OEWriteMolecule(ofs, oeMol) ccCount += 1 if not ok: # build failed incomplete component (e.g. missing atoms or bonds) errCount += 1 failIdList.append(searchCcId) else: logger.error("Unable to open cache database %s", filePath) errCount += 1 except Exception as e: logger.exception("Failing with %s", str(e)) # endTime = time.time() logger.info("Completed operation at %s (%.4f seconds)", time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - startTime) return ccCount, errCount, failIdList def createOeSubSearchDatabase(self, oebMolFilePath, oeSubSearchFilePath, screenType="SMARTS", numProc=2): sort = True keepTitle = True myScreenType = None if screenType == "MOLECULE": myScreenType = oechem.OEGetSubSearchScreenType( oechem.OESubSearchScreenType_Molecule) elif screenType == "MDL": myScreenType = oechem.OEGetSubSearchScreenType( oechem.OESubSearchScreenType_MDL) elif screenType == "SMARTS": myScreenType = oechem.OEGetSubSearchScreenType( oechem.OESubSearchScreenType_SMARTS) opts = oechem.OECreateSubSearchDatabaseOptions(myScreenType) opts.SetSortByBitCounts(sort) opts.SetKeepTitle(keepTitle) opts.SetNumProcessors(numProc) screenStr = myScreenType.GetName() logger.info("Using %d processor(s) to generate database with %s", numProc, screenStr) tracer = oechem.OEConsoleProgressTracer() ok = oechem.OECreateSubSearchDatabaseFile(oeSubSearchFilePath, oebMolFilePath, opts, tracer) return ok def loadOeSubSearchDatabase(self, oeSubSearchFilePath, screenType=None, numProc=1): ssDb = None try: _ = screenType ssDb = oechem.OESubSearchDatabase( oechem.OESubSearchDatabaseType_Default, numProc) tracer = oechem.OEConsoleProgressTracer() if not ssDb.Open(oeSubSearchFilePath, tracer): logger.error("Unable to open %r", oeSubSearchFilePath) logger.info("Opened %r with %r molecules", oeSubSearchFilePath, ssDb.NumMolecules()) except Exception as e: logger.exception("Loading %r failing with %s", oeSubSearchFilePath, str(e)) return ssDb def write(self, filePath, oeMol, constantMol=False, addSdTags=True): """Write an oeMol with format type inferred from the filePath extension (e.g. 
.mol) Args: filePath (str): filepath with a chemical type extension constantMol (bool, optional): copies molecule before performing format specific perceptions Returns: bool: True for success or False otherwise """ try: molId = os.path.splitext(os.path.basename(filePath))[0] fmt = os.path.splitext(os.path.basename(filePath))[1][1:].lower() # if addSdTags: oemf = OeMoleculeFactory() oemf.setOeMol(oeMol, molId) oemf.addSdTags() oeMol = oemf.getMol() # self.__mU.mkdir(os.path.dirname(filePath)) ofs = oechem.oemolostream() ofs.open(filePath) logger.debug("Writing (fmt=%s) molId %s path %s title %s", fmt, molId, filePath, oeMol.GetTitle()) # if constantMol: oechem.OEWriteConstMolecule(ofs, oeMol) else: oechem.OEWriteMolecule(ofs, oeMol) # # If this is a mol2 file, we need to replace the resname if fmt.startswith("mol2"): # If this is a mol2/mol2h substitute the default substructure id with open(filePath, "r", encoding="utf-8") as ifh: lines = ifh.readlines() lines = [line.replace("<0>", molId) for line in lines] with open(filePath, "w", encoding="utf-8") as ofh: ofh.writelines(lines) return True except Exception as e: logger.exception("Failing for %s with %s", filePath, str(e)) return False def serializeOe(self, oeMol): """Create a string representing the content of the current OE molecule. This serialization uses the OE internal binary format. """ try: oms = oechem.oemolostream() oms.SetFormat(oechem.OEFormat_OEB) oms.openstring() oechem.OEWriteMolecule(oms, oeMol) logger.debug("SMILES %s", oechem.OECreateCanSmiString(oeMol)) logger.debug("Atoms = %d", oeMol.NumAtoms()) return oms.GetString() except Exception as e: logger.exception("Failing with %s", str(e)) def deserializeOe(self, oeS): """Reconstruct an OE molecule from the input string serialization (OE binary). The deserialized molecule is used to initialize the internal OE molecule within this object. Returns: list: OE GraphMol list """ molList = [] try: ims = oechem.oemolistream() ims.SetFormat(oechem.OEFormat_OEB) ims.openstring(oeS) for mol in ims.GetOEGraphMols(): logger.debug("SMILES %s", oechem.OECreateCanSmiString(mol)) logger.debug("title %s", mol.GetTitle()) logger.debug("atoms %d", mol.NumAtoms()) molList.append(oechem.OEGraphMol(mol)) except Exception as e: logger.exception("Failing with %s", str(e)) return molList
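# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module): round-trip a
# SMILES descriptor through OeIoUtils -- parse it to an OE molecule, serialize
# it to the OE internal binary string form, and deserialize it back. The
# aspirin SMILES is example input; requires the OpenEye toolkits.

if __name__ == "__main__":
    oeIo = OeIoUtils(quietFlag=True)
    oeMol = oeIo.descriptorToMol("CC(=O)Oc1ccccc1C(=O)O", "oe-smiles")
    if oeMol:
        oeS = oeIo.serializeOe(oeMol)
        for mol in oeIo.deserializeOe(oeS):
            print(mol.GetTitle(), mol.NumAtoms())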
class EntryInfoProvider(StashableBase): """Accessors and generators for entry-level annotations.""" def __init__(self, **kwargs): # self.__version = "0.50" cachePath = kwargs.get("cachePath", ".") useCache = kwargs.get("useCache", True) self.__dirName = "rcsb_entry_info" self.__dirPath = os.path.join(cachePath, self.__dirName) super(EntryInfoProvider, self).__init__(cachePath, [self.__dirName]) # self.__mU = MarshalUtil(workPath=self.__dirPath) self.__entryInfoD = self.__reload(fmt="json", useCache=useCache) # def testCache(self, minCount=1): if minCount == 0: return True if self.__entryInfoD and minCount and "entryInfo" in self.__entryInfoD and len( self.__entryInfoD["entryInfo"]) > minCount: logger.info("Entry annotations for (%d) entries", len(self.__entryInfoD["entryInfo"])) return True return False def getEntryInfo(self, entryId): """Return a dictionary of entry-level annotations. Returns: (dict): dictionary of entry-level annotations """ try: return self.__entryInfoD["entryInfo"][entryId.upper()] if entryId.upper() in self.__entryInfoD["entryInfo"] else {} except Exception as e: logger.error("Failing with %r", str(e)) return {} def getEntriesByPolymerEntityCount(self, count): oL = [] try: for entryId, eD in self.__entryInfoD["entryInfo"].items(): if eD["polymer_entity_count"] == count: oL.append(entryId) except Exception as e: logger.error("Failing with %r", str(e)) return oL def __getEntryInfoFilePath(self, fmt="json"): baseFileName = "entry_info_details" fExt = ".json" if fmt == "json" else ".pic" fp = os.path.join(self.__dirPath, baseFileName + fExt) return fp def update(self, cfgOb, fmt="json", indent=3): """Update the cache of entry-level annotations. Args: cfgOb (object): ConfigInfo() object instance Returns: (bool): True for success or False otherwise """ ok = False try: entryInfoD = self.__updateEntryInfo(cfgOb) logger.info("Got entry_info for (%d)", len(entryInfoD)) # tS = time.strftime("%Y %m %d %H:%M:%S", time.localtime()) self.__entryInfoD = { "version": self.__version, "created": tS, "entryInfo": entryInfoD } # infoFilePath = self.__getEntryInfoFilePath(fmt=fmt) kwargs = {"indent": indent} if fmt == "json" else {} ok = self.__mU.doExport(infoFilePath, self.__entryInfoD, fmt=fmt, **kwargs) except Exception as e: logger.exception("Failing with %s", str(e)) return ok def reload(self): """Reload from the current cache file.""" ok = False try: self.__entryInfoD = self.__reload(fmt="json", useCache=True) ok = self.__entryInfoD is not None except Exception as e: logger.exception("Failing with %s", str(e)) return ok def __reload(self, fmt="json", useCache=True): entryInfoFilePath = self.__getEntryInfoFilePath(fmt=fmt) tS = time.strftime("%Y %m %d %H:%M:%S", time.localtime()) pcD = {"version": self.__version, "created": tS, "entryInfo": {}} if useCache and self.__mU.exists(entryInfoFilePath): logger.info("Reading entry-info cached path %r", entryInfoFilePath) pcD = self.__mU.doImport(entryInfoFilePath, fmt=fmt) return pcD def __updateEntryInfo(self, cfgOb): """Get entry_info data""" rD = {} try: obEx = ObjectExtractor( cfgOb, databaseName="pdbx_core", collectionName="pdbx_core_entry", useCache=False, keyAttribute="entry", uniqueAttributes=["rcsb_id"], selectionQuery={}, selectionList=[ "rcsb_id", "rcsb_entry_info.polymer_entity_count" ], ) # eCount = obEx.getCount() logger.info("Entry count is %d", eCount) objD = obEx.getObjects() for _, eD in objD.items(): rcsbId = eD["rcsb_id"] try: rD[rcsbId] = eD["rcsb_entry_info"] except Exception: pass except Exception as e:
logger.exception("Failing with %s", str(e)) return rD
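# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the shape of the
# cache file that EntryInfoProvider.__reload() consumes. Writing a minimal
# "entry_info_details.json" under <cachePath>/rcsb_entry_info lets the
# accessors above be exercised without a MongoDB/ObjectExtractor update pass.
# The entry data below are invented placeholders.

if __name__ == "__main__":
    import json
    import os
    import time

    cachePath = "."
    dirPath = os.path.join(cachePath, "rcsb_entry_info")
    os.makedirs(dirPath, exist_ok=True)
    doc = {
        "version": "0.50",
        "created": time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
        "entryInfo": {"1ABC": {"polymer_entity_count": 1}},  # placeholder entry
    }
    with open(os.path.join(dirPath, "entry_info_details.json"), "w", encoding="utf-8") as ofh:
        json.dump(doc, ofh, indent=3)

    eiP = EntryInfoProvider(cachePath=cachePath, useCache=True)
    print(eiP.getEntryInfo("1abc"))  # -> {"polymer_entity_count": 1}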