class ChEMBLTargetCofactorProvider(StashableBase):
    """Accessors for ChEMBL target cofactors."""

    def __init__(self, **kwargs):
        """Load any cached cofactor data.

        Args:
            cachePath (str, optional): top-level cache directory. Defaults to ".".
            useCache (bool, optional): read the stored cofactor data file if it exists. Defaults to True.
        """
        self.__cachePath = kwargs.get("cachePath", ".")
        self.__dirName = "ChEMBL-cofactors"
        super(ChEMBLTargetCofactorProvider, self).__init__(self.__cachePath, [self.__dirName])
        self.__dirPath = os.path.join(self.__cachePath, self.__dirName)
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__fD = self.__reload(self.__dirPath, **kwargs)

    def __getCofactorD(self):
        """Return the loaded cofactor dictionary, or an empty dictionary when nothing is loaded."""
        return (self.__fD or {}).get("cofactors") or {}

    def testCache(self, minCount=1):
        """Return True if more than minCount entities have cofactor data loaded."""
        cofactorCount = len(self.__getCofactorD())
        logger.info("ChEMBL cofactor count %d", cofactorCount)
        return cofactorCount > minCount

    def hasTarget(self, rcsbEntityId):
        """Return True if cofactor data is loaded for the input entity identifier (case insensitive)."""
        return rcsbEntityId.upper() in self.__getCofactorD()

    def getTargets(self, rcsbEntityId):
        """Return the list of cofactor records for the input entity identifier, or an empty list."""
        try:
            return self.__fD["cofactors"][rcsbEntityId.upper()]
        except Exception:
            # Best effort accessor: a missing cache, missing key or bad identifier all yield an empty list.
            return []

    def __getCofactorDataPath(self):
        return os.path.join(self.__dirPath, "ChEMBL-cofactor-data.json")

    def reload(self):
        """Re-read the stored cofactor data file."""
        self.__fD = self.__reload(self.__dirPath, useCache=True)
        return True

    def __reload(self, dirPath, **kwargs):
        """Read the stored cofactor data file if allowed and present; otherwise only create the data directory.

        Args:
            dirPath (str): directory holding the cofactor data file
            useCache (bool, optional): read the stored data file if it exists. Defaults to True.

        Returns:
            dict: imported cofactor data, or an empty dictionary when nothing was read
        """
        startTime = time.time()
        fD = {}
        useCache = kwargs.get("useCache", True)
        ok = False
        cofactorPath = self.__getCofactorDataPath()
        #
        logger.info("useCache %r cofactorPath %r", useCache, cofactorPath)
        if useCache and self.__mU.exists(cofactorPath):
            fD = self.__mU.doImport(cofactorPath, fmt="json")
            ok = True
        else:
            fU = FileUtil()
            fU.mkdir(dirPath)
        # ---
        logger.info("Completed reload with status (%r) at %s (%.4f seconds)", ok, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime)
        return fD

    def buildCofactorList(self, sequenceMatchFilePath, crmpObj=None, lnmpObj=None, maxActivity=5):
        """Build target cofactor list for the matching entities in the input sequence match file.

        Args:
            sequenceMatchFilePath (str): sequence match output file path
            crmpObj (obj, optional): instance of ChemRefMappingProviderObj()
            lnmpObj (obj, optional): instance of LigandNeighborMappingProviderObj(). Defaults to None.
            maxActivity (int, optional): maximum number of prioritized activity records per target

        Returns:
            bool: True for success or False otherwise

            Example activity record -

                        "CHEMBL3243": [
                    {
                        "assay_chembl_id": "CHEMBL655768",
                        "assay_description": "In vitro inhibitory activity against recombinant human CD45 using fluorescein diphosphate (FDP) as a substrate",
                        "assay_type": "B",
                        "canonical_smiles": "COC(=O)c1ccc(C2=CC(=O)C(=O)c3ccccc32)cc1",
                        "ligand_efficiency": {
                        "bei": "19.78",
                        "le": "0.36",
                        "lle": "3.11",
                        "sei": "9.57"
                        },
                        "molecule_chembl_id": "CHEMBL301254",
                        "parent_molecule_chembl_id": "CHEMBL301254",
                        "pchembl_value": "5.78",
                        "standard_relation": "=",
                        "standard_type": "IC50",
                        "standard_units": "nM",
                        "standard_value": "1650.0",
                        "target_chembl_id": "CHEMBL3243"
                    },
        """
        rDL = []
        mD = self.__mU.doImport(sequenceMatchFilePath, fmt="json")
        if mD is None:
            logger.error("Unable to read sequence match data from %r", sequenceMatchFilePath)
            return False
        #
        chP = ChEMBLTargetProvider(cachePath=self.__cachePath, useCache=False)
        # ---
        chaP = ChEMBLTargetActivityProvider(cachePath=self.__cachePath, useCache=True)
        #
        provenanceSource = "ChEMBL"
        refScheme = "PDB entity"
        assignVersion = chP.getAssignmentVersion()
        for queryId, matchDL in mD.items():
            qCmtD = self.__decodeComment(queryId)
            unpId = qCmtD["uniprotId"]
            queryTaxId = qCmtD["taxId"] if "taxId" in qCmtD else None
            chemblIdL = qCmtD["chemblId"].split(",")
            if queryTaxId == "-1":
                logger.info("Skipping target with missing taxonomy %r (%r)", unpId, chemblIdL)
                continue
            queryName = chP.getTargetDescription(unpId)
            # The neighbor components depend only on the matched entities, so compute them once per query.
            chemCompNeighborsD = self.__getChemCompNeighbors(matchDL, lnmpObj)
            for chemblId in chemblIdL:
                if not chaP.hasTargetActivity(chemblId):
                    # Not skipped: processing continues, so a record is still produced for this target.
                    logger.debug("Skipping target %r (%r)", unpId, chemblId)
                if not matchDL:
                    continue
                # --
                # The activity selection depends only on the ChEMBL target, so compute it once per target.
                taDL = chaP.getTargetActivity(chemblId)
                logger.debug("Target %r has (%d) activity records", chemblId, len(taDL))
                actL = self.__buildActivityList(taDL, crmpObj=crmpObj)
                actL = self.__activityListSelect(actL, chemCompNeighborsD, maxActivity=maxActivity)
                if not actL:
                    logger.debug("No ChEMBL cofactors for %s %s", chemblId, unpId)
                # --
                for matchD in matchDL:
                    tCmtD = self.__decodeComment(matchD["target"])
                    entryId = tCmtD["entityId"].split("_")[0]
                    entityId = tCmtD["entityId"].split("_")[1]
                    rD = {
                        "entry_id": entryId,
                        "entity_id": entityId,
                        "query_uniprot_id": unpId,
                        "query_id": chemblId,
                        "query_id_type": "ChEMBL",
                        "query_name": queryName,
                        "provenance_source": provenanceSource,
                        "reference_scheme": refScheme,
                        "assignment_version": assignVersion,
                        "query_taxonomy_id": int(queryTaxId) if queryTaxId else None,
                        "target_taxonomy_id": int(matchD["targetTaxId"]) if "targetTaxId" in matchD else None,
                        #
                        "aligned_target": self.__getAlignedRegions(matchD),
                        #
                        "taxonomy_match_status": matchD["taxonomyMatchStatus"] if "taxonomyMatchStatus" in matchD else None,
                        "lca_taxonomy_id": matchD["lcaTaxId"] if "lcaTaxId" in matchD else None,
                        "lca_taxonomy_name": matchD["lcaTaxName"] if "lcaTaxName" in matchD else None,
                        "lca_taxonomy_rank": matchD["lcaRank"] if "lcaRank" in matchD else None,
                        "cofactors": actL,
                    }
                    rDL.append(rD)
        #
        # Group the records by PDB entity identifier (<entry_id>_<entity_id>)
        qD = {}
        for rD in rDL:
            eId = rD["entry_id"] + "_" + rD["entity_id"]
            qD.setdefault(eId, []).append(rD)
        #
        fp = self.__getCofactorDataPath()
        tS = datetime.datetime.now().isoformat()
        vS = assignVersion
        ok = self.__mU.doExport(fp, {"version": vS, "created": tS, "cofactors": qD}, fmt="json", indent=3)
        return ok

    def __getChemCompNeighbors(self, matchDL, lnmpObj):
        """Return a dictionary keyed by the chemical component identifiers neighboring any matched entity."""
        chemCompNeighborsD = {}
        if lnmpObj:
            for matchD in matchDL:
                tCmtD = self.__decodeComment(matchD["target"])
                entryId = tCmtD["entityId"].split("_")[0]
                entityId = tCmtD["entityId"].split("_")[1]
                rcsbEntityId = entryId + "_" + entityId
                chemCompIdList = lnmpObj.getLigandNeighbors(rcsbEntityId)
                chemCompNeighborsD.update({k: True for k in chemCompIdList})
        return chemCompNeighborsD

    def __buildActivityList(self, taDL, crmpObj=None):
        """Convert binding/functional ("B"/"F") activity records reported in nM into cofactor records."""
        actL = []
        for taD in taDL:
            if taD["assay_type"] not in ["B", "F"]:
                continue
            try:
                if taD["standard_units"] == "nM" and taD["standard_value"] and float(taD["standard_value"]) > 0.0:
                    # p-value = -log10(molar value); 1 nM = 1.0e-9 M (e.g. 1650 nM -> 5.78)
                    pV = -math.log10(float(taD["standard_value"]) * 1.0e-9)
                    actD = {
                        "cofactor_id": taD["molecule_chembl_id"],
                        "assay_id": taD["assay_chembl_id"],
                        "assay_description": taD["assay_description"],
                        "measurement_type": "p" + taD["standard_type"],
                        "measurement_value": round(pV, 2),
                        "smiles": taD["canonical_smiles"],
                        "molecule_name": taD["molecule_name"],
                        "inchi_key": taD["inchi_key"],
                        "action": taD["action"],
                        "moa": taD["moa"],
                        "max_phase": taD["max_phase"],
                    }
                    actD = self.__addLocalIds(actD, crmpObj=crmpObj)
                    actL.append(actD)
            except Exception as e:
                # Incomplete or malformed activity records are dropped
                logger.debug("Failing for tAD %r with %s", taD, str(e))
        return actL

    def __getAlignedRegions(self, matchD):
        """Return the aligned region list for a match record.

        In the sequence match data the "target" is the PDB entity and the "query" is the
        ChEMBL target sequence, hence the crossed naming below.
        """
        if "alignedRegions" in matchD:
            return [
                {
                    "entity_beg_seq_id": arD["targetBegin"],
                    "target_beg_seq_id": arD["queryBegin"],
                    "length": arD["targetEnd"] - arD["targetBegin"],
                }
                for arD in matchD["alignedRegions"]
            ]
        return [
            {
                "entity_beg_seq_id": matchD["targetBegin"],
                "target_beg_seq_id": matchD["queryBegin"],
                "length": matchD["alignLen"],
            }
        ]

    def __addLocalIds(self, cfD, crmpObj=None):
        """Add the first locally mapped identifier (prd_id or chem_comp_id) to the cofactor record."""
        if crmpObj:
            localIdL = crmpObj.getLocalIds("CHEMBL", cfD["cofactor_id"])
            if localIdL:
                localId = localIdL[0]
                if localId.startswith("PRD_"):
                    cfD["prd_id"] = localId
                else:
                    cfD["chem_comp_id"] = localId
        return cfD

    def __activityListSelect(self, activityDL, chemCompNeighborsD, maxActivity=5):
        """Select up to maxActivity records ordered by decreasing measurement value.

        Records mapped to a neighboring chemical component are always kept (even if they exceed
        maxActivity); any remaining slots are filled with the highest valued unmapped records.
        """
        retL = []
        mappedNeighborL = []
        unmappedL = activityDL
        #
        if chemCompNeighborsD:
            unmappedL = []
            # Select out any cases for molecules that map to a neighbor chemical component.
            for activityD in activityDL:
                if "chem_comp_id" in activityD and activityD["chem_comp_id"] in chemCompNeighborsD:
                    activityD["neighbor_in_pdb"] = "Y"
                    mappedNeighborL.append(activityD)
                else:
                    unmappedL.append(activityD)
                    activityD["neighbor_in_pdb"] = "N"
        #
        numLeft = maxActivity - len(mappedNeighborL)
        if numLeft > 0:
            unmappedL = sorted(unmappedL, key=lambda k: k["measurement_value"], reverse=True)
            retL = mappedNeighborL
            retL.extend(unmappedL[:numLeft])
            retL = sorted(retL, key=lambda k: k["measurement_value"], reverse=True)
        else:
            logger.debug("Mapped neighbor cofactors (%d) excluded unmapped (%d)", len(mappedNeighborL), len(unmappedL))
            retL = sorted(mappedNeighborL, key=lambda k: k["measurement_value"], reverse=True)
        return retL

    def __decodeComment(self, comment, separator="|"):
        """Decode a comment of alternating value/key fields (<value>|<key>|<value>|<key>...) into a dictionary."""
        dD = {}
        try:
            ti = iter(comment.split(separator))
            # zip() over the same iterator consumes the fields pairwise as (value, key)
            dD = {tup[1]: tup[0] for tup in zip(ti, ti)}
        except Exception:
            # Undecodable comments yield an empty dictionary
            pass
        return dD
class DictMethodRunnerTests(unittest.TestCase):
    """Run the dictionary method runner over the mock repository content."""

    def setUp(self):
        self.__export = True
        self.__numProc = 2
        self.__fileLimit = 200
        self.__configName = "site_info_configuration"
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        configPath = os.path.join(mockTopPath, "config", "dbload-setup-example.yml")
        self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=self.__configName, mockTopPath=mockTopPath)
        self.__mU = MarshalUtil(workPath=self.__cachePath)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb, numProc=self.__numProc, fileLimit=self.__fileLimit, cachePath=self.__cachePath)
        # Each case: content type, minimum expected number of locators, and content types to merge
        self.__testCaseList = [
            dict(contentType="pdbx_core", mockLength=50, mergeContent=["vrpt"]),
            dict(contentType="bird_chem_comp_core", mockLength=17, mergeContent=None),
        ]
        self.__modulePathMap = self.__cfgOb.get("DICT_METHOD_HELPER_MODULE_PATH_MAP", sectionName=self.__configName)
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        elapsed = time.time() - self.__startTime
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), elapsed)

    def __makeRunner(self, apiName):
        """Return a DictMethodRunner configured with the dictionary API of the input name."""
        apiProvider = DictionaryApiProviderWrapper(self.__cfgOb, self.__cachePath, useCache=True)
        dictApi = apiProvider.getApiByName(apiName)
        resourceProvider = DictMethodResourceProvider(self.__cfgOb, configName=self.__configName, cachePath=self.__cachePath, siftsAbbreviated="TEST")
        return DictMethodRunner(dictApi, modulePathMap=self.__modulePathMap, resourceProvider=resourceProvider)

    def __runContentType(self, contentType, mockLength, mergeContent):
        """Read and process test fixture data files from the input content type."""
        try:
            runner = self.__makeRunner(contentType)
            locatorObjList = self.__rpP.getLocatorObjList(contentType=contentType, mergeContentTypes=mergeContent)
            containerList = self.__rpP.getContainerList(locatorObjList)
            #
            numLocators = len(locatorObjList)
            logger.debug("Length of locator list %d\n", numLocators)
            self.assertGreaterEqual(numLocators, mockLength)
            for container in containerList:
                containerName = container.getName()
                logger.debug("Processing container %s", containerName)
                runner.apply(container)
                if not self.__export:
                    continue
                savePath = os.path.join(HERE, "test-output", containerName + "-with-method.cif")
                self.__mU.doExport(savePath, [container], fmt="mmcif")
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testMethodRunner(self):
        """Test method runner for multiple content types."""
        for caseD in self.__testCaseList:
            self.__runContentType(**caseD)

    def testMethodRunnerSetup(self):
        """Test the setup methods for method runner class"""
        try:
            runner = self.__makeRunner("pdbx")
            self.assertTrue(runner is not None)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
def assemble(self, maxRFactor=10.0):
    """Concatenate the selected CCDC and COD models into a single dated model file.

    Models are filtered by the R value constraint, by variant type and by duplicate match
    identifiers, and are then relabeled sequentially for each parent chemical component
    (models with a prior model identifier keep that identifier).

    Args:
        maxRFactor (float, optional): limiting R-value. Defaults to 10.0.

    Returns:
        (int): number of model data containers written to the assembled model file
    """
    containerL = []
    marshal = MarshalUtil(workPath=self.__cachePath)
    # Fold the COD model build index into the CCDC model build index
    indexD = self.__ccdcmb.fetchModelIndex()
    for parentId, codModelL in self.__codmb.fetchModelIndex().items():
        if parentId in indexD:
            indexD[parentId] += codModelL
        else:
            indexD[parentId] = codModelL
    #
    indexD = self.__addPriorMatchDetails(indexD)
    indexD = self.__updateVariantDetails(indexD)
    priorMapD = {}
    for modelDL in indexD.values():
        try:
            orderedL = sorted(modelDL, key=itemgetter("priorModelId", "variantType", "rFactor"))
        except Exception:
            # Fall back to an ordering that ignores the R value when the full key cannot be compared
            orderedL = sorted(modelDL, key=itemgetter("priorModelId", "variantType"))
        stdCount = 0
        seenMatchIds = set()
        for modelD in orderedL:
            isStd = modelD["variantType"].startswith("A")
            if isStd:
                stdCount += 1
            #
            rFactor = modelD["rFactor"] if "rFactor" in modelD else None
            if rFactor and rFactor > maxRFactor:
                logger.info("Skipping model %s isStd (%r) rValue (%r)", modelD["modelId"], isStd, modelD["rFactor"])
                continue
            if stdCount and not isStd:
                logger.info("Skipping model %s isStd (%r) numStd (%d)", modelD["modelId"], isStd, stdCount)
                continue
            # Exclude duplicate matches in priority order ...
            matchId = modelD["matchId"]
            if matchId in seenMatchIds:
                logger.info("Skipping duplicate matchId %r in %r", modelD["matchId"], modelD["modelId"])
                continue
            seenMatchIds.add(matchId)
            #
            readL = marshal.doImport(modelD["modelPath"], fmt="mmcif")
            logger.debug("Read %d from %s", len(readL), modelD["modelPath"])
            containerL.extend(readL)
            if not modelD["priorModelId"].startswith("Z"):
                priorMapD[modelD["modelId"]] = (modelD["priorModelId"], modelD["priorMatchDate"])
    #
    logger.debug("priorMapD %r", priorMapD)
    fileName = "chem_comp_models-%s.cif" % self.__getToday()
    assembleModelPath = os.path.join(self.__ccdcmb.getModelDirFilePath(), fileName)
    # -- relabel
    parentModelCountD = defaultdict(int)
    priorCountLD = {}
    for container in containerL:
        curModelId = container.getName()
        parentId = self.__parseId(curModelId)[0].split("|")[0]
        priorTup = priorMapD.get(curModelId)
        if priorTup is None:
            # No prior model: assign the next sequential public identifier for this parent
            parentModelCountD[parentId] += 1
            publicModelId = self.__makePublicModelId(parentId, parentModelCountD[parentId])
            self.__replaceModelId(container, curModelId, publicModelId)
            continue
        priorModelId, priorMatchDate = priorTup
        priorCount = self.__parseId(priorModelId)[1]
        priorCountLD.setdefault(parentId, []).append(priorCount)
        self.__replaceModelId(container, curModelId, priorModelId)
        self.__updateAuditDate(container, priorMatchDate)
        # Continue the numbering after the largest prior model count seen for this parent
        parentModelCountD[parentId] = max(priorCountLD[parentId])
        logger.debug("%s current model %r prior model %r count %d", parentId, curModelId, priorMapD[curModelId][0], parentModelCountD[parentId])
    #
    ok = marshal.doExport(assembleModelPath, containerL, fmt="mmcif")
    logger.info("Assembled %d models status %r", len(containerL), ok)
    self.__checkAssembledModels(assembleModelPath)
    return len(containerL)
class ChemCompSearchIndexProvider(object):
    """Utilities to read and process the index of chemical component definitions search targets"""

    def __init__(self, **kwargs):
        """Load (or build and store) the search index.

        Args:
            cachePath (str, optional): top-level cache directory. Defaults to ".".
            ccFileNamePrefix (str, optional): file name prefix of the index file. Defaults to "cc".

        Remaining keyword options are described in __reload().
        """
        self.__cachePath = kwargs.get("cachePath", ".")
        self.__dirPath = os.path.join(self.__cachePath, "chem_comp")
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__ccFileNamePrefix = kwargs.get("ccFileNamePrefix", "cc")
        self.__searchIdx = self.__reload(**kwargs)

    def testCache(self, minCount=None, logSizes=False):
        """Return True if the index holds at least minCount entries.

        When minCount is not set (None or 0) the test only verifies that an index object exists.
        """
        if logSizes and self.__searchIdx:
            logger.info("searchIdxD (%.2f MB)", getObjSize(self.__searchIdx) / 1000000.0)
        if minCount:
            # bool() guards against leaking an empty index (or None) as the status value
            return bool(self.__searchIdx) and len(self.__searchIdx) >= minCount
        return self.__searchIdx is not None

    def getIndex(self):
        return self.__searchIdx

    def getIndexEntry(self, searchCcId):
        """Return the index entry for the input search identifier or None if it is not found."""
        try:
            return self.__searchIdx[searchCcId]
        except Exception as e:
            logger.debug("Get index entry %r failing with %s", searchCcId, str(e))
        return None

    def getIndexFilePath(self):
        return os.path.join(self.__dirPath, "%s-search-idx-chemical-components.json" % self.__ccFileNamePrefix)

    def __reload(self, **kwargs):
        """Reload or create the index of PDB chemical component search targets.

        Args:
            useCache (bool, optional): read the stored index file if it exists. Defaults to True.
            useChemAxon (bool, optional): include ChemAxon descriptors when building. Defaults to True.
            molLimit (int, optional): limit the number of index entries (0 for no limit). Defaults to 0.
            numProc (int, optional): number of processes used to build the index. Defaults to 1.
            maxChunkSize (int, optional): chunk size for multi-process builds. Defaults to 20.
            limitPerceptions (bool, optional): passed through to the molecule builder. Defaults to True.
            quietFlag (bool, optional): passed through to the multi-process worker. Defaults to True.
            skipObsolete (bool, optional): passed through to ChemCompMoleculeProvider. Defaults to True.

        Returns:
            (dict): search index dictionary (each entry is extended with an "atom-types" set)
        """
        searchIdxD = {}
        useChemAxon = kwargs.get("useChemAxon", True)
        useCache = kwargs.get("useCache", True)
        molLimit = kwargs.get("molLimit", 0)
        numProc = kwargs.get("numProc", 1)
        maxChunkSize = kwargs.get("maxChunkSize", 20)
        limitPerceptions = kwargs.get("limitPerceptions", True)
        quietFlag = kwargs.get("quietFlag", True)
        skipObsolete = kwargs.get("skipObsolete", True)
        searchIdxFilePath = self.getIndexFilePath()
        #
        if useCache and self.__mU.exists(searchIdxFilePath):
            _, fExt = os.path.splitext(searchIdxFilePath)
            searchIdxFormat = "json" if fExt == ".json" else "pickle"
            rdCcIdxD = self.__mU.doImport(searchIdxFilePath, fmt=searchIdxFormat)
            searchIdxD = {k: rdCcIdxD[k] for k in sorted(rdCcIdxD.keys())[:molLimit]} if molLimit else rdCcIdxD
        else:
            cmpKwargs = {k: v for k, v in kwargs.items() if k not in ["cachePath", "useCache", "molLimit"]}
            ccmP = ChemCompMoleculeProvider(cachePath=self.__cachePath, useCache=True, molLimit=molLimit, skipObsolete=skipObsolete, **cmpKwargs)
            ok1 = ccmP.testCache(minCount=molLimit, logSizes=True)
            #
            descrD = {}
            ok2 = True
            if useChemAxon:
                caxP = ChemAxonDescriptorProvider(cachePath=self.__cachePath, useCache=True, **cmpKwargs)
                ok2 = caxP.testCache(minCount=molLimit)
                descrD = caxP.getDescriptorIndex()
            # Logical (not bitwise) test so that non-boolean status values cannot raise a TypeError
            ok = bool(ok1 and ok2)
            if ok:
                searchIdxD = self.__updateChemCompSearchIndex(ccmP.getMolD(), descrD, searchIdxFilePath, molLimit, limitPerceptions, numProc, maxChunkSize, quietFlag)
                logger.info("Storing %s with data for %d search candidates (status=%r) ", searchIdxFilePath, len(searchIdxD), ok)
        #
        # The atom type sets are added after any export because sets cannot be serialized as JSON
        for idxD in searchIdxD.values():
            idxD["atom-types"] = set(idxD["type-counts"].keys()) if "type-counts" in idxD else set()
        return searchIdxD

    def __updateChemCompSearchIndex(self, ccObjD, descrD, filePath, molLimit, limitPerceptions, numProc, maxChunkSize, quietFlag):
        """Build the search index (in one or multiple processes) and store it in the input file path.

        Returns:
            (dict): search index dictionary (empty on failure)
        """
        searchIdxD = {}
        try:
            # Serialized index of chemical component search targets
            startTime = time.time()
            _, fExt = os.path.splitext(filePath)
            fileFormat = "json" if fExt == ".json" else "pickle"
            if numProc <= 1:
                searchIdxD = self.__buildChemCompSearchIndex(ccObjD, descrD, limitPerceptions=limitPerceptions, molLimit=molLimit)
            else:
                searchIdxD = self.__buildChemCompSearchIndexMulti(
                    ccObjD, descrD, limitPerceptions=limitPerceptions, molLimit=molLimit, numProc=numProc, maxChunkSize=maxChunkSize, quietFlag=quietFlag
                )

            ok = self.__mU.doExport(filePath, searchIdxD, fmt=fileFormat)
            endTime = time.time()
            logger.info("Storing %s (%s) with %d search definitions (status=%r) (%.4f seconds)", filePath, fileFormat, len(searchIdxD), ok, endTime - startTime)
        #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        return searchIdxD

    def __buildChemCompSearchIndex(self, ccObjD, descrD, limitPerceptions=False, molLimit=None):
        """Internal method return a dictionary of extracted chemical component descriptors and formula."""
        rD = {}
        try:
            for ii, ccId in enumerate(ccObjD, 1):
                if molLimit and ii > molLimit:
                    break
                # ----
                oemf = OeMoleculeFactory()
                oemf.setQuiet()
                tId = oemf.setChemCompDef(ccObjD[ccId])
                if tId != ccId:
                    logger.error("%s chemical component definition import error", ccId)
                # ----
                oemf.clearExternalDescriptors()
                for smi in descrD.get(ccId, []):
                    oemf.addExternalDescriptor("smiles", smi, "chemaxon-smiles")
                # ----
                smiD = oemf.buildRelated(limitPerceptions=limitPerceptions)
                logger.debug("%s related molecular forms %d", ccId, len(smiD))
                rD.update(smiD)
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return rD

    def __buildChemCompSearchIndexMulti(self, ccObjD, descrD, limitPerceptions=False, molLimit=None, numProc=2, maxChunkSize=20, quietFlag=False):
        """Internal method to build the search index with multiple worker processes."""
        ccIdList = sorted(ccObjD.keys())[:molLimit] if molLimit else sorted(ccObjD.keys())
        logger.info("Input definition length %d numProc %d limitPerceptions %r", len(ccIdList), numProc, limitPerceptions)
        #
        rWorker = ChemCompSearchIndexWorker(ccObjD)
        mpu = MultiProcUtil(verbose=True)
        optD = {"maxChunkSize": maxChunkSize, "limitPerceptions": limitPerceptions, "quietFlag": quietFlag, "descrD": descrD}
        mpu.setOptions(optD)
        mpu.set(workerObj=rWorker, workerMethod="buildRelatedList")
        ok, failList, resultList, _ = mpu.runMulti(dataList=ccIdList, numProc=numProc, numResults=1, chunkSize=maxChunkSize)
        if failList:
            logger.info("Index definitions with failures (%d): %r", len(failList), failList)
        logger.info("Multi-proc status %r failures %r result length %r", ok, len(failList), len(resultList[0]))
        # Re-key the worker results by their search names
        rD = {vD["name"]: vD for vD in resultList[0]}
        return rD

    @staticmethod
    def __hasMinimumCounts(countD, minCountD):
        """Return True if every count in countD meets the corresponding minimum in minCountD."""
        try:
            return not any(minCount > countD[key] for key, minCount in minCountD.items())
        except Exception:
            # Missing or non-comparable counts are treated as a failed match
            return False

    def matchMolecularFormulaRange(self, typeRangeD, matchSubset=False):
        """Find matching formula for the input atom type range query (evaluates min <= ff <= max).

        Args:
            typeRangeD (dict): dictionary of element ranges {'<element_name>: {'min': <int>, 'max': <int>}}
            matchSubset (bool, optional): test for formula subset (default: False)

        Returns:
            (list):  chemical component identifiers with matching formula (MatchResults)
        """
        rL = []
        try:
            if not typeRangeD:
                return rL
            # Element names are compared in upper case
            myTypeRangeD = {k.upper(): v for k, v in typeRangeD.items()}
            queryTypeS = set(myTypeRangeD.keys())
            for ccId, idxD in self.__searchIdx.items():
                tD = idxD["type-counts"]
                # Unless matching a subset, the element sets of the query and the formula must be identical
                if not matchSubset and idxD["atom-types"] != queryTypeS:
                    continue
                #
                if not queryTypeS.issubset(idxD["atom-types"]):
                    continue
                #
                match = True
                for atomType, rangeD in myTypeRangeD.items():
                    try:
                        if ("min" in rangeD and rangeD["min"] > tD[atomType]) or ("max" in rangeD and rangeD["max"] < tD[atomType]):
                            match = False
                            break
                    except Exception:
                        match = False
                        break
                if match:
                    rL.append(MatchResults(ccId=ccId, searchType="formula", formula=idxD["formula"]))
        except Exception as e:
            logger.exception("Failing for %r with %s", typeRangeD, str(e))
        return rL

    def filterMinimumMolecularFormula(self, typeCountD):
        """Find molecules with the minimum formula composition for the input atom type query (evaluates min <= ff).

        Args:
            typeCountD (dict): dictionary of element minimum values {'<element_name>: #}

        Returns:
            (list):  chemical component identifiers (all identifiers if the query is empty)
        """
        rL = []
        try:
            if not typeCountD:
                return list(self.__searchIdx.keys())

            queryTypeS = set(typeCountD.keys())
            for ccId, idxD in self.__searchIdx.items():
                tD = idxD["type-counts"]
                if not queryTypeS.issubset(tD):
                    continue
                if self.__hasMinimumCounts(tD, typeCountD):
                    rL.append(ccId)
        except Exception as e:
            logger.exception("Failing for %r with %s", typeCountD, str(e))
        return rL

    def filterMinimumFormulaAndFeatures(self, typeCountD, featureCountD):
        """Find molecules with the minimum formula and feature composition.

        An empty (or None) dictionary places no constraint; when both are empty all
        identifiers are returned.

        Args:
            typeCountD (dict): dictionary of element minimum values {'<element_name>: #}
            featureCountD (dict): dictionary of feature minimum values {'<element_name>: #}

        Returns:
            (list):  chemical component identifiers
        """
        rL = []
        try:
            typeCountD = typeCountD or {}
            featureCountD = featureCountD or {}
            # Only an entirely empty query selects everything; a single empty dictionary
            # must not disable the constraint given by the other one.
            if not typeCountD and not featureCountD:
                return list(self.__searchIdx.keys())
            # ----
            featureQueryS = set(featureCountD.keys())
            typeQueryS = set(typeCountD.keys())
            #
            for ccId, idxD in self.__searchIdx.items():
                tD = idxD["type-counts"]
                fD = idxD["feature-counts"]
                #
                if not typeQueryS.issubset(tD) or not featureQueryS.issubset(fD):
                    continue
                if self.__hasMinimumCounts(tD, typeCountD) and self.__hasMinimumCounts(fD, featureCountD):
                    rL.append(ccId)
        except Exception as e:
            logger.exception("Failing for %r with %s", typeCountD, str(e))
        return rL
class ChemCompDepictWrapper(SingletonClass):
    """Wrapper for chemical component depiction operations."""

    def __init__(self):
        self.__startTime = time.time()
        # ---
        self.__workPath = "."
        self.__mU = MarshalUtil(workPath=self.__workPath)
        self.__configD = None
        self.__cachePath = None
        # ---
        self.__statusDescriptorError = -100
        self.__searchError = -200
        self.__searchSuccess = 0
        self.__imageCount = 0

    def readConfig(self, resetImagePath=True):
        """Read the depiction configuration file located through the environment.

        The cache path is taken from CHEM_DEPICT_CACHE_PATH (default ".") and the file name from
        CHEM_DEPICT_CONFIG_FILE_NAME (default "depict-config.json") within the "config" subdirectory.
        A default configuration is used if the file is missing or not usable.

        Args:
            resetImagePath (bool, optional): recompute the image directory path relative to the
                                             current cache path. Defaults to True.

        Returns:
            bool: True for success or False otherwise
        """
        ok = False
        try:
            self.__cachePath = os.environ.get("CHEM_DEPICT_CACHE_PATH", ".")
            configFileName = os.environ.get("CHEM_DEPICT_CONFIG_FILE_NAME", "depict-config.json")
            #
            configFilePath = os.path.join(self.__cachePath, "config", configFileName)
            configD = {}
            if self.__mU.exists(configFilePath):
                configD = self.__mU.doImport(configFilePath, fmt="json")
            logger.debug("configD: %r", configD)
            if configD and (len(configD) >= 2) and float(configD["versionNumber"]) > 0.1:
                logger.info("Read version %r sections %r from %s", configD["versionNumber"], list(configD.keys()), configFilePath)
                ok = True
                #
                if resetImagePath:
                    # Allow the configuration to be relocatable.
                    tS = configD["imageDir"] if "imageDir" in configD else "images"
                    configD["imageDirPath"] = os.path.join(self.__cachePath, tS)
                    configD["versionNumber"] = "0.2"
            else:
                # Handle missing config for now
                configD["imageDir"] = "images"
                configD["imageDirPath"] = os.path.join(self.__cachePath, configD["imageDir"])
                logger.warning("Reading config file fails from path %r", configFilePath)
                logger.warning("Using config %r", configD)
                ok = True
            #
            self.__configD = configD
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            ok = False
        return ok

    def setConfig(self, cachePath, **kwargs):
        """Provide dependencies for rebuilding depict file dependencies.

        Args:
            cachePath (str): path to cache data files.

            Other options are propagated to configurations of the wrapped classes in __makeBootstrapDepictConfig()

        Returns:
            bool: True if a configuration with at least two sections was created
        """
        self.__configD = self.__makeBootstrapDepictConfig(cachePath, **kwargs)
        return len(self.__configD) >= 2

    def __makeBootstrapDepictConfig(self, cachePath, **kwargs):
        """Create depict configuration bootstrap file

        Args:
            cachePath (str): path to cache data files (also exported as CHEM_DEPICT_CACHE_PATH)
            storeConfig (bool, optional): write the configuration file. Defaults to True.

        Returns:
            dict: configuration dictionary (empty on failure)
        """
        configD = {}
        try:
            storeConfig = kwargs.get("storeConfig", True)
            os.environ["CHEM_DEPICT_CACHE_PATH"] = os.path.join(cachePath)
            configDirPath = os.path.join(cachePath, "config")
            configFilePath = os.path.join(configDirPath, "depict-config.json")
            #
            logger.info("Updating depict configuration using %s", configFilePath)
            #
            imageDirPath = os.path.join(cachePath, "images")
            self.__mU.mkdir(imageDirPath)
            configD = {"versionNumber": 0.20, "imageDir": "images"}
            if storeConfig:
                self.__mU.mkdir(configDirPath)
                self.__mU.doExport(configFilePath, configD, fmt="json", indent=3)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return configD

    def setImageCount(self, imageCount):
        self.__imageCount = imageCount

    def getImageCount(self):
        return self.__imageCount

    def __makeRotatingPath(self, fileNameTemplate, *extraArgs):
        """Return an output path in the image directory using a rotating file index.

        The index is the current image count modulo the configured "fileRotateIncrement"
        (default 50), so output files are overwritten cyclically.
        """
        configD = self.__configD if self.__configD else {}
        imageDirPath = configD["imageDirPath"] if "imageDirPath" in configD else "."
        fileRotateIncrement = configD["fileRotateIncrement"] if "fileRotateIncrement" in configD else 50
        ic = self.__imageCount % fileRotateIncrement
        return os.path.join(imageDirPath, fileNameTemplate % ((ic,) + extraArgs))

    def __makeImagePath(self):
        return self.__makeRotatingPath("image-%s.svg")

    def __makeMolfilePath(self, fmt="mol"):
        return self.__makeRotatingPath("molfile-%s.%s", fmt)

    def __getMol(self, identifier, identifierType):
        """Return an OE molecule for the input SMILES, InChI or PDB chemical component identifier.

        Raises:
            ValueError: if the identifier type is not one of smiles, inchi or identifierpdb
        """
        idType = identifierType.lower()
        if idType == "smiles":
            return OeIoUtils().smilesToMol(identifier)
        if idType == "inchi":
            return OeIoUtils().inchiToMol(identifier)
        if idType == "identifierpdb":
            ccsw = ChemCompSearchWrapper()
            oesmP = ccsw.getSearchMoleculeProvider()
            return oesmP.getMol(identifier)
        raise ValueError("Unsupported identifier type %r" % identifierType)

    @staticmethod
    def __getBondDisplayWidth(numAtoms):
        """Return a bond display width that decreases for larger molecules."""
        if numAtoms > 200:
            return 4.0
        if numAtoms > 100:
            return 6.0
        return 10.0

    @staticmethod
    def __getDisplayOptions(bondDisplayWidth, **kwargs):
        """Return the display options shared by the single and aligned pair depictions."""
        imageSizeX = kwargs.get("imageSizeX", 2500)
        return {
            "imageSizeX": imageSizeX,
            # Default to the X size so that callers providing only imageSizeX still get a square image
            "imageSizeY": kwargs.get("imageSizeY", imageSizeX),
            "labelAtomName": kwargs.get("labelAtomName", False),
            "labelAtomCIPStereo": kwargs.get("labelAtomCIPStereo", True),
            "labelAtomIndex": kwargs.get("labelAtomIndex", False),
            "labelBondIndex": kwargs.get("labelBondIndex", False),
            "labelBondCIPStereo": kwargs.get("labelBondCIPStereo", True),
            "cellBorders": kwargs.get("cellBorders", True),
            "bondDisplayWidth": bondDisplayWidth,
        }

    def depictMolecule(self, identifier, identifierType, imagePath=None, **kwargs):
        """Create depiction from InChI, SMILES descriptors or PDB identifier.

        Args:
            identifier (str): SMILES, InChI or PDB chemical component identifier
            identifierType (str): one of smiles, inchi or identifierpdb (case insensitive)
            imagePath (str, optional): output image path. Defaults to a rotating path in the image directory.

            Other options (title and display options) are passed to the depiction.

        Returns:
            str: image path on success or None otherwise
        """
        try:
            imagePath = imagePath if imagePath else self.__makeImagePath()
            oeMol = self.__getMol(identifier, identifierType)
            #
            ok = self.__depictOne(oeMol, imagePath, **kwargs)
            return imagePath if ok else None
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return None

    def __depictOne(self, oeMol, imagePath, **kwargs):
        """Depict a single molecule in the input image file.

        Args:
            oeMol (object): instance of an OE graph molecule
            imagePath (string): file path for image

        Returns:
            bool: True for success or False otherwise
        """
        try:
            title = kwargs.get("title", None)
            oed = OeDepict()
            oed.setMolTitleList([("Target", oeMol, title)])
            # ---
            bondDisplayWidth = self.__getBondDisplayWidth(oeMol.NumAtoms())
            # ---
            oed.setDisplayOptions(**self.__getDisplayOptions(bondDisplayWidth, **kwargs))
            oed.setGridOptions(rows=1, cols=1, cellBorders=False)
            oed.prepare()
            oed.write(imagePath)
            self.__imageCount += 1
            return True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False

    def status(self):
        """Log the maximum resident memory size and the elapsed time since instantiation."""
        # ru_maxrss is scaled by 10**6 below; the unit label differs on macOS (Darwin) vs. other platforms
        unitS = "MB" if platform.system() == "Darwin" else "GB"
        rusageMax = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        logger.info("Maximum resident memory size %.4f %s", rusageMax / 10**6, unitS)
        endTime = time.time()
        logger.info("Status at %s (up %.4f seconds)", time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def alignMoleculePair(self, refIdentifier, refIdentifierType, fitIdentifier, fitIdentifierType, imagePath=None, **kwargs):
        """Create aligned depiction for a target molecule InChI, SMILES descriptors or PDB identifier.

        Args:
            refIdentifier (str): reference molecule identifier
            refIdentifierType (str): one of smiles, inchi or identifierpdb (case insensitive)
            fitIdentifier (str): fit molecule identifier
            fitIdentifierType (str): one of smiles, inchi or identifierpdb (case insensitive)
            imagePath (str, optional): output image path. Defaults to a rotating path in the image directory.

        Returns:
            str: image path on success or None otherwise
        """
        try:
            imagePath = imagePath if imagePath else self.__makeImagePath()
            # ---
            oeMolRef = self.__getMol(refIdentifier, refIdentifierType)
            oeMolFit = self.__getMol(fitIdentifier, fitIdentifierType)
            # ---
            logger.info("oeMolRef atoms %r", oeMolRef.NumAtoms())
            logger.info("oeMolFit atoms %r", oeMolFit.NumAtoms())

            displayIdRef = "Ref"
            displayIdFit = "Fit"
            ok = self.__depictAlignedPair(oeMolRef, displayIdRef, oeMolFit, displayIdFit, imagePath, **kwargs)
            return imagePath if ok else None
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return None

    def __depictAlignedPair(self, oeMolRef, displayIdRef, oeMolFit, displayIdFit, imagePath, **kwargs):
        """Depict pairwise MCSS alignment

        Returns:
            bool: True for success or False otherwise
        """
        try:
            #
            oed = OeDepictMCSAlignPage()
            oed.setSearchType(sType="relaxed")
            #
            oed.setRefMol(oeMolRef, displayIdRef)
            oed.setFitMol(oeMolFit, displayIdFit)
            # --- the bond width is chosen from the size of the reference molecule
            bondDisplayWidth = self.__getBondDisplayWidth(oeMolRef.NumAtoms())
            # ---
            oed.setDisplayOptions(
                highlightStyleFit=kwargs.get("highlightStyleFit", "ballAndStickInverse"),
                **self.__getDisplayOptions(bondDisplayWidth, **kwargs)
            )
            #
            aML = oed.alignPair(imagePath=imagePath)
            logger.info("Aligned atom count %d", len(aML))
            return True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False

    def toMolFile(self, identifier, identifierType, molfilePath=None, fmt="mol", **kwargs):
        """Create molfile (fmt) from InChI, SMILES descriptors or PDB identifier.

        Args:
            identifier (str): SMILES, InChI or PDB chemical component identifier
            identifierType (str): one of smiles, inchi or identifierpdb (case insensitive)
            molfilePath (str, optional): output file path. Defaults to a rotating path in the image directory.
            fmt (str, optional): file extension used for the default output path. Defaults to "mol".

        Returns:
            str: molfile path on success or None otherwise
        """
        try:
            molfilePath = molfilePath if molfilePath else self.__makeMolfilePath(fmt=fmt)
            oeMol = self.__getMol(identifier, identifierType)
            # Molecules built from descriptors are titled with their origin
            idType = identifierType.lower()
            if idType == "smiles":
                oeMol.SetTitle("From SMILES")
            elif idType == "inchi":
                oeMol.SetTitle("From InChI")
            #
            ok = self.__toMolFile(oeMol, molfilePath, **kwargs)
            return molfilePath if ok else None
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return None

    def __toMolFile(self, oeMol, molfilePath, **kwargs):
        """Write the input molecule to the input file path.

        Args:
            oeMol (object): instance of an OE graph molecule
            molfilePath (string): file path for molfile (type determined by extension)

        Returns:
            bool: True for success or False otherwise
        """
        try:
            _ = kwargs
            oeio = OeIoUtils()
            oeio.write(molfilePath, oeMol, constantMol=True)
            return True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False
class MarshalUtilTests(unittest.TestCase):
    """Import/export tests for MarshalUtil covering the supported file formats.

    Local cases read fixtures below ``TOPDIR/rcsb/mock-data`` and write their
    results below ``HERE/test-output``.  The URL cases pass remote ftp/http
    targets to ``doImport()``.  Every case logs the underlying exception and
    reports it (with its message) as a test failure.
    """

    def setUp(self):
        """Define fixture, output and remote target locations and start the per-test timer."""
        self.__verbose = True
        self.__pathPdbxDictionaryFile = os.path.join(TOPDIR, "rcsb", "mock-data", "dictionaries", "mmcif_pdbx_v5_next.dic")
        self.__pathJsonTestFile = os.path.join(TOPDIR, "rcsb", "mock-data", "dictionaries", "vrpt_dictmap.json")
        self.__pathIndexFile = os.path.join(TOPDIR, "rcsb", "mock-data", "MOCK_EXCHANGE_SANDBOX", "update-lists", "all-pdb-list")
        self.__pathCifFile = os.path.join(TOPDIR, "rcsb", "mock-data", "MOCK_BIRD_CC_REPO", "0", "PRDCC_000010.cif")
        #
        self.__workPath = os.path.join(HERE, "test-output")
        self.__pathSaveDictionaryFile = os.path.join(self.__workPath, "mmcif_pdbx_v5_next.dic")
        self.__pathSaveJsonTestFile = os.path.join(self.__workPath, "json-content.json")
        self.__pathSaveIndexFile = os.path.join(self.__workPath, "all-pdb-list")
        self.__pathSaveCifFile = os.path.join(self.__workPath, "cif-content.cif")
        #
        self.__pathFastaFile = os.path.join(TOPDIR, "rcsb", "mock-data", "MOCK_EXCHANGE_SANDBOX", "sequence", "pdb_seq_prerelease.fasta")
        self.__pathSaveFastaFile = os.path.join(self.__workPath, "test-pre-release.fasta")
        #
        self.__urlTarget = "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz"
        self.__urlTargetBad = "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump-missing.tar.gz"
        #
        self.__mU = MarshalUtil()
        self.__startTime = time.time()
        logger.debug("Running tests on version %s", __version__)
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        """Log the elapsed wall-clock time of the test that just finished."""
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def testReadWriteInParts(self):
        """Test the case reading and writing in parts."""
        try:
            # List payload: every element references the same five-item list.
            lenL = 12013
            aL = [100, 200, 300, 400, 500]
            dL = [aL for _ in range(lenL)]
            numParts = 4
            sPath = os.path.join(self.__workPath, "list-m-data.json")
            ok = self.__mU.doExport(sPath, dL, numParts=numParts, fmt="json", indent=3)
            self.assertTrue(ok)
            rL = self.__mU.doImport(sPath, numParts=numParts, fmt="json")
            logger.info("Reading %d parts with total length %d", numParts, len(rL))
            self.assertEqual(dL, rL)
            #
            # Dictionary payload split over the same number of parts.
            lenD = 23411
            qD = OrderedDict([("a", 100), ("b", 100), ("c", 100)])
            dD = OrderedDict([(str(ii), qD) for ii in range(lenD)])
            numParts = 4
            sPath = os.path.join(self.__workPath, "dict-m-data.json")
            ok = self.__mU.doExport(sPath, dD, numParts=numParts, fmt="json", indent=3)
            self.assertTrue(ok)
            # The exported parts are imported twice and must compare equal both times.
            for _ in range(2):
                rD = self.__mU.doImport(sPath, numParts=numParts, fmt="json")
                logger.info("Reading %d parts with total length %d", numParts, len(rD))
                self.assertEqual(dD, rD)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail("Failing with %s" % str(e))

    def testReadDictionaryFile(self):
        """Test the case read PDBx/mmCIF dictionary text file"""
        try:
            cL = self.__mU.doImport(self.__pathPdbxDictionaryFile, fmt="mmcif-dict")
            logger.debug("Dictionary container list %d", len(cL))
            self.assertGreaterEqual(len(cL), 1)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail("Failing with %s" % str(e))

    def testReadCifFile(self):
        """Test the case read PDBx/mmCIF text file"""
        try:
            cL = self.__mU.doImport(self.__pathCifFile, fmt="mmcif")
            logger.debug("Container list %d", len(cL))
            self.assertGreaterEqual(len(cL), 1)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail("Failing with %s" % str(e))

    def testReadListFile(self):
        """Test the case read list text file"""
        try:
            cL = self.__mU.doImport(self.__pathIndexFile, fmt="list")
            logger.debug("List length %d", len(cL))
            self.assertGreaterEqual(len(cL), 1000)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail("Failing with %s" % str(e))

    def testReadJsonFile(self):
        """Test the case read JSON file"""
        try:
            rObj = self.__mU.doImport(self.__pathJsonTestFile, fmt="json")
            logger.debug("Object length %d", len(rObj))
            self.assertGreaterEqual(len(rObj), 1)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail("Failing with %s" % str(e))

    def testReadWriteDictionaryFiles(self):
        """Test the case read and write PDBx/mmCIF dictionary text file"""
        try:
            cL = self.__mU.doImport(self.__pathPdbxDictionaryFile, fmt="mmcif-dict")
            logger.debug("Dictionary container list %d", len(cL))
            self.assertGreaterEqual(len(cL), 1)
            ok = self.__mU.doExport(self.__pathSaveDictionaryFile, cL, fmt="mmcif-dict")
            self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail("Failing with %s" % str(e))

    def testReadWriteCifFile(self):
        """Test the case read and write PDBx/mmCIF text file"""
        try:
            cL = self.__mU.doImport(self.__pathCifFile, fmt="mmcif")
            logger.debug("Container list %d", len(cL))
            self.assertGreaterEqual(len(cL), 1)
            ok = self.__mU.doExport(self.__pathSaveCifFile, cL, fmt="mmcif")
            self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail("Failing with %s" % str(e))

    def testReadWriteJsonFile(self):
        """Test the case read and write JSON file"""
        try:
            rObj = self.__mU.doImport(self.__pathJsonTestFile, fmt="json")
            logger.debug("Object length %d", len(rObj))
            self.assertGreaterEqual(len(rObj), 1)
            ok = self.__mU.doExport(self.__pathSaveJsonTestFile, rObj, fmt="json")
            self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail("Failing with %s" % str(e))

    def testReadWriteListFile(self):
        """Test the case read and write list text file"""
        try:
            cL = self.__mU.doImport(self.__pathIndexFile, fmt="list")
            logger.debug("List element %r length %d", cL[0], len(cL))
            # Splitting every row has no asserted result; it only fails the
            # test if an imported element does not support split().
            count = sum(len(cV.split()) for cV in cL)
            _ = count
            self.assertGreaterEqual(len(cL), 1000)
            ok = self.__mU.doExport(self.__pathSaveIndexFile, cL, fmt="list")
            self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail("Failing with %s" % str(e))

    def testReadWriteFastaFile(self):
        """Test the case read and write FASTA sequence file"""
        try:
            sD = self.__mU.doImport(self.__pathFastaFile, fmt="fasta", commentStyle="prerelease")
            logger.debug("Sequence length %d", len(sD))
            self.assertGreaterEqual(len(sD), 500)
            ok = self.__mU.doExport(self.__pathSaveFastaFile, sD, fmt="fasta")
            self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail("Failing with %s" % str(e))

    def testReadUrlTarfile(self):
        """Test the case to read URL target and extract a member"""
        try:
            mU = MarshalUtil(workPath=self.__workPath)
            _, fn = os.path.split(self.__urlTarget)
            #
            nmL = mU.doImport(self.__urlTarget, fmt="tdd", rowFormat="list", tarMember="names.dmp")
            self.assertGreater(len(nmL), 2000000)
            logger.info("Names %d", len(nmL))
            # NOTE(review): the second member is read from a file of the same name
            # in the work path - presumably the copy left by the URL import above.
            ndL = mU.doImport(os.path.join(self.__workPath, fn), fmt="tdd", rowFormat="list", tarMember="nodes.dmp")
            self.assertGreater(len(ndL), 2000000)
            logger.info("Nodes %d", len(ndL))
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail("Failing with %s" % str(e))

    def testReadUrlTddfile(self):
        """Test the case to read URL target of a tdd"""
        try:
            mU = MarshalUtil(workPath=self.__workPath)
            version = "2.07-2019-07-23"
            urlTarget = "http://scop.berkeley.edu/downloads/update"
            encoding = "utf-8-sig" if sys.version_info[0] > 2 else "ascii"
            fn = "dir.des.scope.%s.txt" % version
            # URLs always use "/" - os.path.join() would insert the platform
            # separator (a backslash on Windows) and corrupt the address.
            url = "%s/%s" % (urlTarget, fn)
            logger.info("Fetch url %r", url)
            desL = mU.doImport(url, fmt="tdd", rowFormat="list", uncomment=True, encoding=encoding)
            logger.info("Fetched URL is %s len %d", url, len(desL))
            self.assertGreater(len(desL), 100)
            logger.info("Lines %d", len(desL))
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail("Failing with %s" % str(e))

    def testReadUrlTarfileFail(self):
        """Test the case to read URL target and extract a member (failing case)"""
        try:
            mU = MarshalUtil(workPath=self.__workPath)
            # A missing remote target is expected to yield an empty result, not an exception.
            rL = mU.doImport(self.__urlTargetBad, fmt="tdd", rowFormat="list", tarMember="names.dmp")
            logger.info("Return is %r", rL)
            self.assertEqual(len(rL), 0)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail("Failing with %s" % str(e))
class PharosTargetProvider(StashableBase):
    """Accessors for Pharos target assignments."""

    def __init__(self, **kwargs):
        """Configure cache locations and optionally reload/export the Pharos database.

        Args:
            cachePath (str, optional): top cache directory. Defaults to ".".
            reloadDb (bool, optional): fetch/filter the SQL dump and restore it into MySQL. Defaults to False.
            fromDb (bool, optional): export the selected tables from MySQL to .tdd files. Defaults to False.
            useCache (bool, optional): reuse a previously downloaded dump file. Defaults to False.
            pharosDumpUrl (str, optional): URL of the SQL dump (a default is used when None).
            mysqlUser (str, optional): MySQL user name.
            mysqlPassword (str, optional): MySQL password.
        """
        #
        self.__cachePath = kwargs.get("cachePath", ".")
        self.__dirName = "Pharos-targets"
        super(PharosTargetProvider, self).__init__(self.__cachePath, [self.__dirName])
        self.__dirPath = os.path.join(self.__cachePath, self.__dirName)
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        reloadDb = kwargs.get("reloadDb", False)
        fromDb = kwargs.get("fromDb", False)
        useCache = kwargs.get("useCache", False)
        pharosDumpUrl = kwargs.get("pharosDumpUrl", None)
        mysqlUser = kwargs.get("mysqlUser", None)
        mysqlPassword = kwargs.get("mysqlPassword", None)
        self.__version = None
        # Nothing is fetched or loaded unless one of the database options is requested.
        if reloadDb or fromDb:
            self.__reload(self.__dirPath, reloadDb=reloadDb, fromDb=fromDb, useCache=useCache, pharosDumpUrl=pharosDumpUrl, mysqlUser=mysqlUser, mysqlPassword=mysqlPassword)
        #

    def testCache(self):
        """Return True unconditionally (no cache validation is performed here)."""
        return True

    def getVersion(self):
        """Return the version parsed from the Pharos README during reload, or None if not reloaded."""
        return self.__version

    def __reload(self, dirPath, reloadDb=False, fromDb=False, useCache=False, pharosDumpUrl=None, mysqlUser=None, mysqlPassword=None):
        """Fetch, filter and restore the Pharos SQL dump and/or export selected tables.

        Args:
            dirPath (str): working directory for the dump, filtered SQL, README, log and .tdd files
            reloadDb (bool, optional): fetch (or reuse) the dump, filter it to the selected tables and
                                       restore it into the MySQL database "tcrd6". Defaults to False.
            fromDb (bool, optional): export each selected table to "<table>.tdd" in dirPath. Defaults to False.
            useCache (bool, optional): skip the download when the dump file already exists. Defaults to False.
            pharosDumpUrl (str, optional): dump URL (a default is used when None)
            mysqlUser (str, optional): MySQL user name
            mysqlPassword (str, optional): MySQL password

        Returns:
            bool: status of the last operation performed (False if neither option is set)
        """
        startTime = time.time()
        pharosSelectedTables = ["drug_activity", "cmpd_activity", "target", "protein", "t2tc"]
        pharosDumpUrl = pharosDumpUrl if pharosDumpUrl else "http://juniper.health.unm.edu/tcrd/download/latest.sql.gz"
        pharosReadmeUrl = "http://juniper.health.unm.edu/tcrd/download/latest.README"
        ok = False
        fU = FileUtil()
        pharosDumpFileName = fU.getFileName(pharosDumpUrl)
        pharosDumpPath = os.path.join(dirPath, pharosDumpFileName)
        pharosUpdatePath = os.path.join(dirPath, "pharos-update.sql")
        pharosReadmePath = os.path.join(dirPath, "pharos-readme.txt")
        logPath = os.path.join(dirPath, "pharosLoad.log")
        #
        fU.mkdir(dirPath)
        #
        exU = ExecUtils()
        #
        if reloadDb:
            logger.info("useCache %r pharosDumpPath %r", useCache, pharosDumpPath)
            if useCache and self.__mU.exists(pharosDumpPath):
                ok = True
            else:
                logger.info("Fetching url %s path %s", pharosDumpUrl, pharosDumpPath)
                ok1 = fU.get(pharosDumpUrl, pharosDumpPath)
                ok2 = fU.get(pharosReadmeUrl, pharosReadmePath)
                logger.info("Completed fetch (%r) at %s (%.4f seconds)", ok1 and ok2, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime)
            # ---
            # Version is the second blank-separated token of the first README line
            # with its leading character removed; "6" is used when the README is empty.
            readmeLines = self.__mU.doImport(pharosReadmePath, fmt="list")
            self.__version = readmeLines[0].split(" ")[1][1:] if readmeLines else "6"
            # ---
            logger.info("Filtering SQL dump %r for selected tables %r", pharosDumpFileName, pharosSelectedTables)
            doWrite = True
            # Note: the pharos dump file latest.sql.gz is not gzipped
            with open(pharosDumpPath, "r", encoding="utf-8") as ifh, open(pharosUpdatePath, "w", encoding="utf-8") as ofh:
                for line in ifh:
                    # Each table section header switches copying on or off; lines
                    # before the first header are always copied.
                    if line.startswith("-- Table structure for table"):
                        # Last token with its first and final two characters removed
                        # (the surrounding quote characters and the newline).
                        tN = line.split(" ")[-1][1:-2]
                        doWrite = tN in pharosSelectedTables
                    if doWrite:
                        ofh.write(line)
            # ---
            ok = exU.run(
                "mysql",
                execArgList=["-v", "-u", mysqlUser, "--password=%s" % mysqlPassword, "-e", "create database if not exists tcrd6;"],
                outPath=logPath,
                outAppend=False,
                timeOut=None,
            )
            # NOTE(review): the restore command is assembled by string formatting and run
            # through a shell, so credentials or paths containing shell metacharacters or
            # blanks would break (or alter) the command - confirm these inputs are trusted.
            shellCmd = 'trap "" SIGHUP SIGINT SIGTERM; nohup mysql -u %s --password=%s tcrd6 < %s >& %s' % (mysqlUser, mysqlPassword, pharosUpdatePath, logPath)
            ok = exU.runShell(
                shellCmd,
                outPath=None,
                inpPath=None,
                outAppend=True,
                timeOut=None,
            )
            logger.info("SQL dump restore status %r", ok)
        # --
        if fromDb:
            for tbl in pharosSelectedTables:
                outPath = os.path.join(dirPath, "%s.tdd" % tbl)
                ok = exU.run(
                    "mysql",
                    execArgList=["-u", mysqlUser, "--password=%s" % mysqlPassword, "-e", "use tcrd6; select * from %s;" % tbl],
                    outPath=outPath,
                    outAppend=False,
                    timeOut=None,
                    suppressStderr=True,
                )
                logger.info("SQL table %s export status %r", tbl, ok)
        return ok

    @staticmethod
    def __makeSequenceId(cD):
        """Return the '|'-joined value/key pairs of cD, in key order, omitting the "sequence" entry."""
        cL = []
        for k, v in cD.items():
            if k != "sequence":
                cL.extend((str(v), str(k)))
        return "|".join(cL)

    def exportProteinFasta(self, fastaPath, taxonPath, addTaxonomy=False):
        """Export the Pharos protein table ("protein.tdd") as a FASTA file.

        Each sequence identifier is built from the record's value/key pairs
        (e.g. "<uniprot>|uniprotId|<id>|proteinId[|<taxId>|taxId]").

        Args:
            fastaPath (str): output FASTA file path
            taxonPath (str): output path for the "<sequence id>\\t<taxId>" list (written only when addTaxonomy is True)
            addTaxonomy (bool, optional): look up the NCBI taxonomy id for each UniProt id
                                          ("-1" when no mapping is found). Defaults to False.

        Returns:
            bool: True for success or False otherwise (including a failed taxonomy list export)
        """
        # Initialized up front so that a failure inside the try block returns False
        # instead of raising UnboundLocalError at the return statement.
        ok = False
        try:
            proteinFilePath = os.path.join(self.__dirPath, "protein.tdd")
            pDL = self.__mU.doImport(proteinFilePath, fmt="tdd", rowFormat="dict")
            fD = {}
            taxonL = []
            umP = None
            if addTaxonomy:
                umP = UniProtIdMappingProvider(self.__cachePath)
                umP.reload(useCache=True)
            #
            for pD in pDL:
                unpId = pD["uniprot"]
                proteinId = pD["id"]
                seq = pD["seq"]
                # Key order matters: it fixes the field order of the sequence identifier.
                cD = {"sequence": seq, "uniprotId": unpId, "proteinId": proteinId}
                if addTaxonomy:
                    taxId = umP.getMappedId(unpId, mapName="NCBI-taxon")
                    cD["taxId"] = taxId if taxId else "-1"
                seqId = self.__makeSequenceId(cD)
                fD[seqId] = cD
                if addTaxonomy:
                    taxonL.append("%s\t%s" % (seqId, cD["taxId"]))
            #
            okTaxon = True
            if addTaxonomy:
                okTaxon = self.__mU.doExport(taxonPath, taxonL, fmt="list")
                if not okTaxon:
                    logger.error("Taxonomy list export to %r failed", taxonPath)
            #
            logger.info("Writing %d pharos targets to %s", len(fD), fastaPath)
            okFasta = self.__mU.doExport(fastaPath, fD, fmt="fasta", makeComment=True)
            # Report failure if either output could not be written.
            ok = bool(okTaxon and okFasta)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok
class RepositoryProvider(object): def __init__(self, cfgOb, cachePath=None, numProc=8, fileLimit=None, verbose=False): self.__fileLimit = fileLimit self.__numProc = numProc self.__verbose = verbose self.__cfgOb = cfgOb self.__configName = self.__cfgOb.getDefaultSectionName() self.__topCachePath = cachePath if cachePath else "." self.__cachePath = os.path.join( self.__topCachePath, self.__cfgOb.get("REPO_UTIL_CACHE_DIR", sectionName=self.__configName)) # self.__mU = MarshalUtil(workPath=self.__cachePath) # self.__ccPathD = None # self.__mpFormat = "[%(levelname)s] %(asctime)s %(processName)s-%(module)s.%(funcName)s: %(message)s" def getLocatorObjList(self, contentType, inputPathList=None, mergeContentTypes=None, excludeIds=None): """Convenience method to get the data path list for the input repository content type. Args: contentType (str): Repository content type (e.g. pdbx, chem_comp, bird, ...) inputPathList (list, optional): path list that will be returned if provided. mergeContentTypes (list, optional): repository content types to combined with the primary content type. 
excludeIds (list or dict): exclude any locators for idCodes in this list or dictionary Returns: Obj list: data file paths or tuple of file paths """ inputPathList = inputPathList if inputPathList else [] if inputPathList: return self.getLocatorObjListWithInput( contentType, inputPathList=inputPathList, mergeContentTypes=mergeContentTypes) # if mergeContentTypes and "vrpt" in mergeContentTypes and contentType in [ "pdbx", "pdbx_core" ]: dictPath = os.path.join( self.__topCachePath, self.__cfgOb.get( "DICTIONARY_CACHE_DIR", sectionName=self.__cfgOb.getDefaultSectionName())) os.environ["_RP_DICT_PATH_"] = dictPath locatorList = self.getEntryLocatorObjList( mergeContentTypes=mergeContentTypes) else: locatorList = self.__getLocatorList(contentType, inputPathList=inputPathList) # if excludeIds: fL = [] for locator in locatorList: if isinstance(locator, str): pth = locator else: pth = locator[0]["locator"] # idCode = self.__getIdcodeFromLocatorPath(contentType, pth) if idCode in excludeIds: continue fL.append(locator) locatorList = fL return locatorList def getLocatorObjListWithInput(self, contentType, inputPathList=None, mergeContentTypes=None): """Convenience method to get the data path list for the input repository content type. Args: contentType (str): Repository content type (e.g. pdbx, chem_comp, bird, ...) inputPathList (list, optional): path list that will be returned if provided. mergeContentTypes (list, optional): repository content types to combined with the primary content type. 
Returns: Obj list: data file paths or tuple of file paths """ inputPathList = inputPathList if inputPathList else [] locatorList = self.__getLocatorList(contentType, inputPathList=inputPathList) # JDW move the following to config if mergeContentTypes and "vrpt" in mergeContentTypes and contentType in [ "pdbx", "pdbx_core" ]: dictPath = os.path.join( self.__topCachePath, self.__cfgOb.get( "DICTIONARY_CACHE_DIR", sectionName=self.__cfgOb.getDefaultSectionName())) os.environ["_RP_DICT_PATH_"] = dictPath # locObjL = [] for locator in locatorList: if isinstance(locator, str): kwD = HashableDict({}) oL = [ HashableDict({ "locator": locator, "fmt": "mmcif", "kwargs": kwD }) ] for mergeContentType in mergeContentTypes: _, fn = os.path.split(locator) idCode = fn[:4] if fn and len(fn) >= 8 else None mergeLocator = self.__getLocator( mergeContentType, idCode, checkExists=True) if idCode else None if mergeLocator: # kwD = HashableDict({"marshalHelper": vrd.toCif}) kwD = HashableDict({"marshalHelper": toCifWrapper}) oL.append( HashableDict({ "locator": mergeLocator, "fmt": "xml", "kwargs": kwD })) lObj = tuple(oL) else: logger.error("Unexpected output locator type %r", locator) lObj = locator locObjL.append(lObj) # locatorList = locObjL # - return locatorList def getContainerList(self, locatorObjList): """Return the data container list obtained by parsing the input locator object list.""" cL = [] for locatorObj in locatorObjList: myContainerList = self.__mergeContainers(locatorObj, fmt="mmcif", mergeTarget=0) for cA in myContainerList: cL.append(cA) return cL def __mergeContainers(self, locatorObj, fmt="mmcif", mergeTarget=0): """Consolidate content in auxiliary files locatorObj[1:] into locatorObj[0] container index 'mergeTarget'. 
""" # cL = [] try: if isinstance(locatorObj, str): cL = self.__mU.doImport(locatorObj, fmt=fmt) return cL if cL else [] elif isinstance(locatorObj, (list, tuple)) and locatorObj: dD = locatorObj[0] kw = dD["kwargs"] cL = self.__mU.doImport(dD["locator"], fmt=dD["fmt"], **kw) if cL: for dD in locatorObj[1:]: kw = dD["kwargs"] rObj = self.__mU.doImport(dD["locator"], fmt=dD["fmt"], **kw) mergeL = rObj if rObj else [] for mc in mergeL: cL[mergeTarget].merge(mc) # return cL else: return [] except Exception as e: logger.exception("Failing for %r with %s", locatorObj, str(e)) return cL def getLocatorsFromPaths(self, locatorObjList, pathList, locatorIndex=0): """Return locator objects with paths (locatorObjIndex) matching the input pathList.""" # index the input locatorObjList rL = [] try: if locatorObjList and isinstance(locatorObjList[0], str): return pathList # locIdx = {} for ii, locatorObj in enumerate(locatorObjList): if "locator" in locatorObj[locatorIndex]: locIdx[locatorObj[locatorIndex]["locator"]] = ii # for pth in pathList: jj = locIdx[pth] if pth in locIdx else None if jj is not None: rL.append(locatorObjList[jj]) except Exception as e: logger.exception("Failing with %s", str(e)) # return rL def getLocatorIdcodes(self, contentType, locatorObjList, locatorIndex=0): try: if locatorObjList and isinstance(locatorObjList[0], str): return [ self.__getIdcodeFromLocatorPath(contentType, pth) for pth in locatorObjList ] else: return [ self.__getIdcodeFromLocatorPath( contentType, locatorObj[locatorIndex]["locator"]) for locatorObj in locatorObjList ] except Exception as e: logger.exception("Failing with %s", str(e)) return [] def getLocatorPaths(self, locatorObjList, locatorIndex=0): try: if locatorObjList and isinstance(locatorObjList[0], str): return locatorObjList else: return [ locatorObj[locatorIndex]["locator"] for locatorObj in locatorObjList ] except Exception as e: logger.exception("Failing with %s", str(e)) return [] def __getLocatorList(self, contentType, 
inputPathList=None): """Internal convenience method to return repository path list by content type:""" outputPathList = [] inputPathList = inputPathList if inputPathList else [] try: if contentType in ["bird", "bird_core"]: outputPathList = inputPathList if inputPathList else self.getBirdPathList( ) elif contentType == "bird_family": outputPathList = inputPathList if inputPathList else self.getBirdFamilyPathList( ) elif contentType in ["chem_comp"]: outputPathList = inputPathList if inputPathList else self.getChemCompPathList( ) elif contentType in ["bird_chem_comp"]: outputPathList = inputPathList if inputPathList else self.getBirdChemCompPathList( ) elif contentType in ["pdbx", "pdbx_core"]: outputPathList = inputPathList if inputPathList else self.getEntryPathList( ) elif contentType in [ "chem_comp_core", "bird_consolidated", "bird_chem_comp_core" ]: outputPathList = inputPathList if inputPathList else self.mergeBirdAndChemCompRefData( ) elif contentType in ["ihm_dev", "ihm_dev_core", "ihm_dev_full"]: outputPathList = inputPathList if inputPathList else self.getIhmDevPathList( ) elif contentType in [ "pdb_distro", "da_internal", "status_history" ]: outputPathList = inputPathList if inputPathList else [] else: logger.warning("Unsupported contentType %s", contentType) except Exception as e: logger.exception("Failing with %s", str(e)) if self.__fileLimit: outputPathList = outputPathList[:self.__fileLimit] return sorted(outputPathList) def __getLocator(self, contentType, idCode, version="v1-0", checkExists=False): """Convenience method to return repository path for a content type and cardinal identifier.""" pth = None try: idCodel = idCode.lower() if contentType == "bird": pth = os.path.join(self.__getRepoTopPath(contentType), idCode[-1], idCode + ".cif") elif contentType == "bird_family": pth = os.path.join(self.__getRepoTopPath(contentType), idCode[-1], idCode + ".cif") elif contentType in ["chem_comp", "chem_comp_core"]: pth = 
os.path.join(self.__getRepoTopPath(contentType), idCode[0], idCode, idCode + ".cif") elif contentType in ["bird_chem_comp"]: pth = os.path.join(self.__getRepoTopPath(contentType), idCode[-1], idCode + ".cif") elif contentType in ["pdbx", "pdbx_core"]: pth = os.path.join(self.__getRepoTopPath(contentType), idCodel[1:3], idCodel + ".cif.gz") elif contentType in ["bird_consolidated", "bird_chem_comp_core"]: pth = os.path.join(self.__getRepoTopPath(contentType), idCode + ".cif") elif contentType in ["ihm_dev", "ihm_dev_core", "ihm_dev_full"]: pth = os.path.join(self.__getRepoTopPath(contentType), idCode, idCode + "_model_%s.cif.gz" % version) elif contentType in [ "pdb_distro", "da_internal", "status_history" ]: pass elif contentType in ["vrpt"]: pth = os.path.join(self.__getRepoTopPath(contentType), idCodel[1:3], idCodel, idCodel + "_validation.xml.gz") else: logger.warning("Unsupported contentType %s", contentType) except Exception as e: logger.exception("Failing with %s", str(e)) if checkExists: pth = pth if self.__mU.exists(pth) else None return pth def __getIdcodeFromLocatorPath(self, contentType, pth): """Convenience method to return the idcode from the locator path.""" idCode = None try: bn = os.path.basename(pth) if contentType in [ "pdbx", "pdbx_core", "bird", "bird_family", "chem_comp", "chem_comp_core", "bird_consolidated", "bird_chem_comp_core" ]: idCode = bn.split(".")[0] elif contentType in ["ihm_dev", "ihm_dev_core", "ihm_dev_full"]: tC = bn.split(".")[0] idCode = "_".join(tC.split("_")[:2]) elif contentType in [ "pdb_distro", "da_internal", "status_history" ]: idCode = None elif contentType in ["vrpt"]: tC = bn.split(".")[0] idCode = tC.split("_")[0] else: logger.warning("Unsupported contentType %s", contentType) idCode = idCode.upper() if idCode else None except Exception as e: logger.exception("Failing for %r %r with %s", contentType, pth, str(e)) return idCode def __getRepoTopPath(self, contentType): """Convenience method to return repository top 
path from configuration data.""" pth = None try: if contentType == "bird": pth = self.__cfgOb.getPath("BIRD_REPO_PATH", sectionName=self.__configName) elif contentType == "bird_family": pth = self.__cfgOb.getPath("BIRD_FAMILY_REPO_PATH", sectionName=self.__configName) elif contentType in ["chem_comp", "chem_comp_core"]: pth = self.__cfgOb.getPath("CHEM_COMP_REPO_PATH", sectionName=self.__configName) elif contentType in ["bird_chem_comp"]: pth = self.__cfgOb.getPath("BIRD_CHEM_COMP_REPO_PATH", sectionName=self.__configName) elif contentType in ["pdbx", "pdbx_core"]: pth = self.__cfgOb.getPath("PDBX_REPO_PATH", sectionName=self.__configName) elif contentType in ["bird_consolidated", "bird_chem_comp_core"]: pth = self.__cachePath elif contentType in ["ihm_dev", "ihm_dev_core", "ihm_dev_full"]: pth = self.__cfgOb.getPath("IHM_DEV_REPO_PATH", sectionName=self.__configName) elif contentType in [ "pdb_distro", "da_internal", "status_history" ]: pass elif contentType in ["vrpt"]: pth = self.__cfgOb.getEnvValue("VRPT_REPO_PATH_ENV", sectionName=self.__configName, default=None) if pth is None: pth = self.__cfgOb.getPath("VRPT_REPO_PATH", sectionName=self.__configName) else: logger.debug( "Using validation report path from environment assignment %s", pth) else: logger.warning("Unsupported contentType %s", contentType) except Exception as e: logger.exception("Failing with %s", str(e)) return pth def _chemCompPathWorker(self, dataList, procName, optionsD, workingDir): """Return the list of chemical component definition file paths in the current repository.""" _ = procName _ = workingDir topRepoPath = optionsD["topRepoPath"] pathList = [] for subdir in dataList: dd = os.path.join(topRepoPath, subdir) for root, _, files in os.walk(dd, topdown=False): if "REMOVE" in root: continue for name in files: if name.endswith(".cif") and len(name) <= 7: pathList.append(os.path.join(root, name)) return dataList, pathList, [] def getChemCompPathList(self): return 
self.__getChemCompPathList(self.__getRepoTopPath("chem_comp"), numProc=self.__numProc) def __getChemCompPathList(self, topRepoPath, numProc=8): """Get the path list for the chemical component definition repository""" ts = time.strftime("%Y %m %d %H:%M:%S", time.localtime()) logger.debug("Starting at %s", ts) startTime = time.time() pathList = [] try: dataS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" dataList = [a for a in dataS] optD = {} optD["topRepoPath"] = topRepoPath mpu = MultiProcUtil(verbose=self.__verbose) mpu.setOptions(optionsD=optD) mpu.set(workerObj=self, workerMethod="_chemCompPathWorker") _, _, retLists, _ = mpu.runMulti(dataList=dataList, numProc=numProc, numResults=1) pathList = retLists[0] endTime0 = time.time() logger.debug("Path list length %d in %.4f seconds", len(pathList), endTime0 - startTime) except Exception as e: logger.exception("Failing with %s", str(e)) return self.__applyFileLimit(pathList) def _entryLocatorObjWithMergeWorker(self, dataList, procName, optionsD, workingDir): """Return the list of entry locator objects including merge content in the current repository.""" _ = procName _ = workingDir topRepoPath = optionsD["topRepoPath"] mergeContentTypes = optionsD["mergeContentTypes"] locatorObjList = [] for subdir in dataList: dd = os.path.join(topRepoPath, subdir) for root, _, files in os.walk(dd, topdown=False): if "REMOVE" in root: continue for fn in files: if (fn.endswith(".cif.gz") and len(fn) == 11) or (fn.endswith(".cif") and len(fn) == 8): locator = os.path.join(root, fn) kwD = HashableDict({}) oL = [ HashableDict({ "locator": locator, "fmt": "mmcif", "kwargs": kwD }) ] for mergeContentType in mergeContentTypes: idCode = fn[:4] if fn and len(fn) >= 8 else None mergeLocator = self.__getLocator( mergeContentType, idCode, checkExists=True) if idCode else None if mergeLocator: kwD = HashableDict( {"marshalHelper": toCifWrapper}) oL.append( HashableDict({ "locator": mergeLocator, "fmt": "xml", "kwargs": kwD })) lObj = tuple(oL) 
locatorObjList.append(lObj) return dataList, locatorObjList, [] def getEntryLocatorObjList(self, mergeContentTypes=None): return self.__getEntryLocatorObjList( self.__getRepoTopPath("pdbx"), numProc=self.__numProc, mergeContentTypes=mergeContentTypes) def __getEntryLocatorObjList(self, topRepoPath, numProc=8, mergeContentTypes=None): """Get the path list for structure entries in the input repository""" ts = time.strftime("%Y %m %d %H:%M:%S", time.localtime()) logger.debug("Starting at %s", ts) startTime = time.time() pathList = [] try: dataList = [] anL = "abcdefghijklmnopqrstuvwxyz0123456789" for a1 in anL: for a2 in anL: hc = a1 + a2 dataList.append(hc) hc = a2 + a1 dataList.append(hc) dataList = list(set(dataList)) # optD = {} optD["topRepoPath"] = topRepoPath optD["mergeContentTypes"] = mergeContentTypes mpu = MultiProcUtil(verbose=self.__verbose) mpu.setOptions(optionsD=optD) mpu.set(workerObj=self, workerMethod="_entryLocatorObjWithMergeWorker") _, _, retLists, _ = mpu.runMulti(dataList=dataList, numProc=numProc, numResults=1) pathList = retLists[0] endTime0 = time.time() logger.debug("Locator object list length %d in %.4f seconds", len(pathList), endTime0 - startTime) except Exception as e: logger.exception("Failing with %s", str(e)) return self.__applyFileLimit(pathList) def _entryPathWorker(self, dataList, procName, optionsD, workingDir): """Return the list of entry file paths in the current repository.""" _ = procName _ = workingDir topRepoPath = optionsD["topRepoPath"] pathList = [] for subdir in dataList: dd = os.path.join(topRepoPath, subdir) for root, _, files in os.walk(dd, topdown=False): if "REMOVE" in root: continue for name in files: if (name.endswith(".cif.gz") and len(name) == 11) or (name.endswith(".cif") and len(name) == 8): pathList.append(os.path.join(root, name)) return dataList, pathList, [] def getEntryPathList(self): return self.__getEntryPathList(self.__getRepoTopPath("pdbx"), numProc=self.__numProc) def __getEntryPathList(self, 
topRepoPath, numProc=8): """Get the path list for structure entries in the input repository""" ts = time.strftime("%Y %m %d %H:%M:%S", time.localtime()) logger.debug("Starting at %s", ts) startTime = time.time() pathList = [] try: dataList = [] anL = "abcdefghijklmnopqrstuvwxyz0123456789" for a1 in anL: for a2 in anL: hc = a1 + a2 dataList.append(hc) hc = a2 + a1 dataList.append(hc) dataList = list(set(dataList)) # optD = {} optD["topRepoPath"] = topRepoPath mpu = MultiProcUtil(verbose=self.__verbose) mpu.setOptions(optionsD=optD) mpu.set(workerObj=self, workerMethod="_entryPathWorker") _, _, retLists, _ = mpu.runMulti(dataList=dataList, numProc=numProc, numResults=1) pathList = retLists[0] endTime0 = time.time() logger.debug("Path list length %d in %.4f seconds", len(pathList), endTime0 - startTime) except Exception as e: logger.exception("Failing with %s", str(e)) return self.__applyFileLimit(pathList) def getBirdPathList(self): return self.__getBirdPathList(self.__getRepoTopPath("bird")) def __getBirdPathList(self, topRepoPath): """Return the list of definition file paths in the current repository. List is ordered in increasing PRD ID numerical code. """ pathList = [] try: sd = {} for root, _, files in os.walk(topRepoPath, topdown=False): if "REMOVE" in root: continue for name in files: if name.startswith("PRD_") and name.endswith( ".cif") and len(name) <= 14: pth = os.path.join(root, name) sd[int(name[4:-4])] = pth # for k in sorted(sd.keys()): pathList.append(sd[k]) except Exception as e: logger.exception("Failing with %s", str(e)) # return self.__applyFileLimit(pathList) def getBirdFamilyPathList(self): return self.__getBirdFamilyPathList( self.__getRepoTopPath("bird_family")) def __getBirdFamilyPathList(self, topRepoPath): """Return the list of definition file paths in the current repository. List is ordered in increasing PRD ID numerical code. 
""" pathList = [] try: sd = {} for root, _, files in os.walk(topRepoPath, topdown=False): if "REMOVE" in root: continue for name in files: if name.startswith("FAM_") and name.endswith( ".cif") and len(name) <= 14: pth = os.path.join(root, name) sd[int(name[4:-4])] = pth # for k in sorted(sd.keys()): pathList.append(sd[k]) except Exception as e: logger.exception("Failing with %s", str(e)) # return self.__applyFileLimit(pathList) def getBirdChemCompPathList(self): return self.__getBirdChemCompPathList( self.__getRepoTopPath("bird_chem_comp")) def __getBirdChemCompPathList(self, topRepoPath): """Return the list of definition file paths in the current repository. List is ordered in increasing PRD ID numerical code. """ pathList = [] try: sd = {} for root, _, files in os.walk(topRepoPath, topdown=False): if "REMOVE" in root: continue for name in files: if name.startswith("PRDCC_") and name.endswith( ".cif") and len(name) <= 16: pth = os.path.join(root, name) sd[int(name[6:-4])] = pth # for k in sorted(sd.keys()): pathList.append(sd[k]) except Exception as e: logger.exception("Failing with %s", str(e)) # return self.__applyFileLimit(pathList) def __applyFileLimit(self, pathList): logger.debug("Length of file path list %d (limit %r)", len(pathList), self.__fileLimit) if self.__fileLimit: return pathList[:self.__fileLimit] else: return pathList def __buildFamilyIndex(self): """Using information from the PRD family definition: # loop_ _pdbx_reference_molecule_list.family_prd_id _pdbx_reference_molecule_list.prd_id FAM_000010 PRD_000041 FAM_000010 PRD_000042 FAM_000010 PRD_000043 FAM_000010 PRD_000044 FAM_000010 PRD_000048 FAM_000010 PRD_000049 FAM_000010 PRD_000051 # """ prdD = {} try: pthL = self.__getLocatorList("bird_family") for pth in pthL: containerL = self.__mU.doImport(pth, fmt="mmcif") for container in containerL: catName = "pdbx_reference_molecule_list" if container.exists(catName): catObj = container.getObj(catName) for ii in range(catObj.getRowCount()): 
familyPrdId = catObj.getValue( attributeName="family_prd_id", rowIndex=ii) prdId = catObj.getValue(attributeName="prd_id", rowIndex=ii) if prdId in prdD: logger.debug( "duplicate prdId in family index %s %s", prdId, familyPrdId) prdD[prdId] = { "familyPrdId": familyPrdId, "c": container } except Exception as e: logger.exception("Failing with %s", str(e)) return prdD def __buildBirdCcIndex(self): """Using information from the PRD pdbx_reference_molecule category to index the BIRDs corresponding small molecule correspondences """ prdD = {} ccPathD = {} prdStatusD = {} try: ccPathL = self.__getLocatorList("chem_comp") ccPathD = {} for ccPath in ccPathL: _, fn = os.path.split(ccPath) ccId, _ = os.path.splitext(fn) ccPathD[ccId] = ccPath logger.info("Chemical component path list (%d)", len(ccPathD)) pthL = self.__getLocatorList("bird") logger.info("BIRD path list (%d)", len(pthL)) for pth in pthL: containerL = self.__mU.doImport(pth, fmt="mmcif") for container in containerL: catName = "pdbx_reference_molecule" if container.exists(catName): catObj = container.getObj(catName) ii = 0 prdId = catObj.getValue(attributeName="prd_id", rowIndex=ii) relStatus = catObj.getValue( attributeName="release_status", rowIndex=ii) prdStatusD[prdId] = relStatus if relStatus != "REL": continue prdRepType = catObj.getValue( attributeName="represent_as", rowIndex=ii) logger.debug("represent as %r", prdRepType) if prdRepType in ["single molecule"]: ccId = catObj.getValueOrDefault( attributeName="chem_comp_id", rowIndex=ii, defaultValue=None) # prdId = catObj.getValue(attributeName="prd_id", rowIndex=ii) logger.debug("mapping prdId %r ccId %r", prdId, ccId) if ccId and ccId in ccPathD: prdD[prdId] = { "ccId": ccId, "ccPath": ccPathD[ccId] } ccPathD[ccPathD[ccId]] = { "ccId": ccId, "prdId": prdId } else: logger.error("Bad ccId %r for BIRD %r", ccId, prdId) except Exception as e: logger.exception("Failing with %s", str(e)) logger.info( "Candidate Chemical Components (%d) BIRDS (%d) BIRD status 
details (%d)", len(prdD), len(ccPathD), len(prdStatusD)) return prdD, ccPathD, prdStatusD # - def mergeBirdAndChemCompRefData(self): prdSmallMolCcD, ccPathD, prdStatusD = self.__buildBirdCcIndex() logger.info("PRD to CCD index length %d CCD map path length %d", len(prdSmallMolCcD), len(ccPathD)) outputPathList = self.mergeBirdRefData(prdSmallMolCcD, prdStatusD) ccOutputPathList = [ pth for pth in self.getChemCompPathList() if pth not in ccPathD ] outputPathList.extend(ccOutputPathList) return outputPathList def mergeBirdRefData(self, prdSmallMolCcD, prdStatusD): """Consolidate all of the bird reference data in a single container. If the BIRD is a 'small molecule' type then also merge with the associated CC definition. Store the merged data in the REPO_UTIL cache path and ... Return a path list for the consolidated data files - """ outPathList = [] try: birdPathList = self.__getLocatorList("bird") birdPathD = {} for birdPath in birdPathList: _, fn = os.path.split(birdPath) prdId, _ = os.path.splitext(fn) birdPathD[prdId] = birdPath # logger.info("BIRD path length %d", len(birdPathD)) logger.debug("BIRD keys %r", list(birdPathD.keys())) birdCcPathList = self.__getLocatorList("bird_chem_comp") birdCcPathD = {} for birdCcPath in birdCcPathList: _, fn = os.path.split(birdCcPath) prdCcId, _ = os.path.splitext(fn) prdId = "PRD_" + prdCcId[6:] birdCcPathD[prdId] = birdCcPath # logger.info("BIRDCC path length %d", len(birdCcPathD)) logger.debug("BIRD CC keys %r", list(birdCcPathD.keys())) fD = self.__buildFamilyIndex() logger.info("BIRD Family index length %d", len(fD)) logger.debug("Family index keys %r", list(fD.keys())) logger.info("PRD to CCD small mol index length %d", len(prdSmallMolCcD)) # iSkip = 0 for prdId in birdPathD: if prdId in prdStatusD and prdStatusD[prdId] != "REL": logger.debug("Skipping BIRD with non-REL status %s", prdId) iSkip += 1 continue fp = os.path.join(self.__cachePath, prdId + ".cif") logger.debug("Export cache path is %r", fp) # pth2 = 
birdPathD[prdId] cL = self.__mU.doImport(pth2, fmt="mmcif") cFull = cL[0] logger.debug("Got Bird %r", cFull.getName()) # # ccBird = None ccD = None if prdId in prdSmallMolCcD: pthCc = prdSmallMolCcD[prdId]["ccPath"] cL = self.__mU.doImport(pthCc, fmt="mmcif") ccD = cL[0] logger.debug("Got corresponding CCD %r", ccD.getName()) elif prdId in birdCcPathD: pth1 = birdCcPathD[prdId] c1L = self.__mU.doImport(pth1, fmt="mmcif") ccBird = c1L[0] logger.debug("Got ccBird %r", ccBird.getName()) # cFam = None if prdId in fD: cFam = fD[prdId]["c"] logger.debug("Got cFam %r", cFam.getName()) # if ccD: for catName in ccD.getObjNameList(): cFull.append(ccD.getObj(catName)) # if ccBird: for catName in ccBird.getObjNameList(): cFull.append(ccBird.getObj(catName)) if cFam: for catName in cFam.getObjNameList(): cFull.append(cFam.getObj(catName)) # self.__mU.doExport(fp, [cFull], fmt="mmcif") outPathList.append(fp) except Exception as e: logger.exception("Failing with %s", str(e)) # logger.info( "Merged BIRD/Family/CC path length %d (skipped non-released %d)", len(outPathList), iSkip) return outPathList # def __exportConfig(self, container): """ - CATEGORY_NAME: diffrn_detector ATTRIBUTE_NAME_LIST: - pdbx_frequency - CATEGORY_NAME: pdbx_serial_crystallography_measurement ATTRIBUTE_NAME_LIST: - diffrn_id - pulse_energy - pulse_duration - xfel_pulse_repetition_rate """ for catName in container.getObjNameList(): cObj = container.getObj(catName) print("- CATEGORY_NAME: %s" % catName) print(" ATTRIBUTE_NAME_LIST:") for atName in cObj.getAttributeList(): print(" - %s" % atName) return True def getIhmDevPathList(self): return self.__getIhmDevPathList(self.__getRepoTopPath("ihm_dev")) def __getIhmDevPathList(self, topRepoPath): """Return the list of I/HM entries in the current repository. File name template is: PDBDEV_0000 0020_model_v1-0.cif.gz List is ordered in increasing PRDDEV numerical code. 
""" pathList = [] logger.debug("Searching path %r", topRepoPath) try: sd = {} for root, _, files in os.walk(topRepoPath, topdown=False): if "REMOVE" in root: continue for name in files: if name.startswith("PDBDEV_") and name.endswith( ".cif.gz") and len(name) <= 50: pth = os.path.join(root, name) sd[int(name[7:15])] = pth # for k in sorted(sd.keys()): pathList.append(sd[k]) except Exception as e: logger.exception("Failing search in %r with %s", topRepoPath, str(e)) # return self.__applyFileLimit(pathList)
class ChemRefMappingProvider(StashableBase):
    """Accessors for chemical reference identifier mapping data.

    The mapping document is read from and written to
    ``<cachePath>/chemref-mapping/chemref-mapping-data.json`` and has the form::

        {"version": <str>, "created": <str>, "mapping": {RESOURCE: {refId: [localId, ...]}}}

    where RESOURCE is the upper-cased reference resource name.
    """

    def __init__(self, cachePath, useCache=True):
        """Load any cached mapping data.

        Args:
            cachePath (str): top-level cache directory
            useCache (bool, optional): read the cached mapping file if it exists. Defaults to True.
        """
        #
        self.__cachePath = cachePath
        self.__useCache = useCache
        self.__dirName = "chemref-mapping"
        super(ChemRefMappingProvider, self).__init__(self.__cachePath, [self.__dirName])
        self.__dirPath = os.path.join(self.__cachePath, self.__dirName)
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        # Lazily built reverse index {RESOURCE: {localId: [refId, ...]}} (see getReferenceIds()).
        self.__rD = {}
        self.__mapD = self.__reload(self.__dirPath, useCache)
        #

    def testCache(self, minCount=0):
        """Return True if at least minCount reference resources are loaded.

        Note that the count is the number of top-level resource keys in the mapping
        (e.g. DRUGBANK, CHEMBL), not the number of mapped identifiers.

        Args:
            minCount (int, optional): minimum number of resources required; 0 always succeeds. Defaults to 0.

        Returns:
            bool: True if the cache satisfies the minimum count or False otherwise
        """
        # Guard against an empty or failed import so that logging never raises.
        hasMapping = bool(self.__mapD) and "mapping" in self.__mapD
        mappingD = self.__mapD["mapping"] if hasMapping else {}
        logger.info("Mapping count %d", len(mappingD))
        return bool(minCount == 0 or (hasMapping and len(mappingD) >= minCount))

    def getReferenceIds(self, referenceResourceName, localId):
        """Get the identifiers in the reference resource corresponding to input local identifiers (Chemical Component or BIRD).

        Args:
            referenceResourceName (str): chemical reference resource name (DrugBank, ChEMBL, ChEBI, PubChem, ...)
            localId (str): local identifier for a Chemical Component or BIRD definition

        Returns:
            list: list of reference identifiers (empty if there is no mapping or no data is loaded)
        """
        if not self.__rD:
            # Invert the forward mapping on first use.  A missing "mapping" section (no cache
            # file loaded) is treated as empty so this method returns [] like getLocalIds().
            hasMapping = bool(self.__mapD) and "mapping" in self.__mapD
            mappingD = self.__mapD["mapping"] if hasMapping else {}
            for rN, forwardD in mappingD.items():
                # {refId :[lId, lId, ...], ...}
                reverseD = {}
                for refId, rcsbIdL in forwardD.items():
                    for rId in rcsbIdL:
                        reverseD.setdefault(rId, []).append(refId)
                self.__rD[rN] = reverseD
        #
        try:
            return self.__rD[referenceResourceName.upper()][localId]
        except Exception:
            return []

    def getLocalIds(self, referenceResourceName, referenceId):
        """Get the local identifiers (Chemical Component or BIRD) corresponding to identifiers in chemical reference resource.

        Args:
            referenceResourceName (str): chemical reference resource name (DrugBank, ChEMBL, ChEBI, PubChem, ...)
            referenceId (str): identifier in the chemical reference resource

        Returns:
            list: list of local Chemical Component or BIRD identifiers (empty if unmapped)
        """
        try:
            return self.__mapD["mapping"][referenceResourceName.upper()][referenceId]
        except Exception:
            return []

    def __getMappingDataPath(self):
        """Return the path of the cached mapping JSON file."""
        return os.path.join(self.__dirPath, "chemref-mapping-data.json")

    def __reload(self, dirPath, useCache):
        """Read the cached mapping file, or create the cache directory when no cache is used/found.

        Args:
            dirPath (str): directory holding the mapping file
            useCache (bool): read the mapping file if it exists

        Returns:
            dict: the imported mapping document, or {} when nothing was read
        """
        startTime = time.time()
        fD = {}
        ok = False
        mappingPath = self.__getMappingDataPath()
        #
        logger.info("useCache %r mappingPath %r", useCache, mappingPath)
        if useCache and self.__mU.exists(mappingPath):
            fD = self.__mU.doImport(mappingPath, fmt="json")
            ok = True
        else:
            fU = FileUtil()
            fU.mkdir(dirPath)
        # ---
        logger.info("Completed reload with status (%r) at %s (%.4f seconds)", ok, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime)
        return fD

    def fetchChemRefMapping(self, cfgOb, referenceResourceNameList=None):
        """Fetch reference resource mapping for chemical component and BIRD definitions

        The fetched mapping is written to the cache file only; the data already held
        in memory by this instance is not refreshed.

        Args:
            cfgOb (obj): instance configuration class ConfigUtil()
            referenceResourceNameList (list, optional): list of chemical reference resources. Defaults to [DrugBank, ChEMBL].

        Returns:
            bool: True for success or False otherwise
        """
        try:
            rnL = referenceResourceNameList if referenceResourceNameList is not None else ["DrugBank", "ChEMBL"]
            mD = {}
            crExt = ChemRefExtractor(cfgOb)
            for referenceResourceName in rnL:
                idD = crExt.getChemCompAccessionMapping(referenceResourceName=referenceResourceName)
                logger.info("%s mapping dictionary (%d)", referenceResourceName, len(idD))
                # Resource names are stored upper-cased; the accessors upper-case their lookups.
                mD[referenceResourceName.upper()] = idD
            #
            fp = self.__getMappingDataPath()
            # Use a single timestamp so "version" and "created" always describe the same instant.
            now = datetime.datetime.now()
            tS = now.isoformat()
            vS = now.strftime("%Y-%m-%d")
            ok = self.__mU.doExport(fp, {"version": vS, "created": tS, "mapping": mD}, fmt="json", indent=3)
            return ok
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False
class PharosTargetCofactorProvider(StashableBase):
    """Accessors for Pharos target cofactors.

    Cofactor data is cached as JSON in ``<cachePath>/Pharos-cofactors/Pharos-cofactor-data.json``
    with the form ``{"version": ..., "created": ..., "cofactors": {rcsbEntityId: [record, ...]}}``.
    """

    def __init__(self, **kwargs):
        """Load any cached cofactor data.

        Keyword Args:
            cachePath (str): top-level cache directory. Defaults to ".".
            useCache (bool): read the cached cofactor file if it exists. Defaults to True.
        """
        #
        self.__cachePath = kwargs.get("cachePath", ".")
        self.__dirName = "Pharos-cofactors"
        super(PharosTargetCofactorProvider, self).__init__(self.__cachePath, [self.__dirName])
        self.__dirPath = os.path.join(self.__cachePath, self.__dirName)
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__fD = self.__reload(self.__dirPath, **kwargs)
        #

    def testCache(self, minCount=1):
        """Return True if strictly more than minCount entities with cofactors are loaded.

        Args:
            minCount (int, optional): threshold that the cofactor count must exceed. Defaults to 1.

        Returns:
            bool: True if the loaded data exceeds the threshold or False otherwise
        """
        # Guard against an empty or failed import so that logging never raises.
        hasData = bool(self.__fD) and "cofactors" in self.__fD
        cofactorD = self.__fD["cofactors"] if hasData else {}
        logger.info("Pharos cached cofactor count %d", len(cofactorD))
        return bool(hasData and len(cofactorD) > minCount)

    def hasTarget(self, rcsbEntityId):
        """Return True if cofactor data is stored for the input entity (False if no data is loaded).

        Args:
            rcsbEntityId (str): entity identifier (<entryId>_<entityId>); matched upper-cased

        Returns:
            bool: True if the entity has cofactor data or False otherwise
        """
        hasData = bool(self.__fD) and "cofactors" in self.__fD
        cofactorD = self.__fD["cofactors"] if hasData else {}
        return rcsbEntityId.upper() in cofactorD

    def getTargets(self, rcsbEntityId):
        """Return the list of cofactor records for the input entity, or [] if there are none.

        Args:
            rcsbEntityId (str): entity identifier (<entryId>_<entityId>); matched upper-cased

        Returns:
            list: cofactor records as written by buildCofactorList()
        """
        try:
            return self.__fD["cofactors"][rcsbEntityId.upper()]
        except Exception:
            return []

    def __getCofactorDataPath(self):
        """Return the path of the cached cofactor JSON file."""
        return os.path.join(self.__dirPath, "Pharos-cofactor-data.json")

    def reload(self):
        """Re-read the cached cofactor file into this instance. Always returns True."""
        self.__fD = self.__reload(self.__dirPath, useCache=True)
        return True

    def __reload(self, dirPath, **kwargs):
        """Read the cached cofactor file, or create the cache directory when no cache is used/found.

        Args:
            dirPath (str): directory holding the cofactor file

        Keyword Args:
            useCache (bool): read the cofactor file if it exists. Defaults to True.

        Returns:
            dict: the imported cofactor document, or {} when nothing was read
        """
        startTime = time.time()
        fD = {}
        useCache = kwargs.get("useCache", True)
        ok = False
        cofactorPath = self.__getCofactorDataPath()
        #
        logger.info("useCache %r cofactorPath %r", useCache, cofactorPath)
        if useCache and self.__mU.exists(cofactorPath):
            fD = self.__mU.doImport(cofactorPath, fmt="json")
            ok = True
        else:
            fU = FileUtil()
            fU.mkdir(dirPath)
        # ---
        numCofactors = len(fD["cofactors"]) if fD and "cofactors" in fD else 0
        logger.info(
            "Completed reload of (%d) cofactors with status (%r) at %s (%.4f seconds)",
            numCofactors,
            ok,
            time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
            time.time() - startTime,
        )
        return fD

    def buildCofactorList(self, sequenceMatchFilePath, crmpObj=None, lnmpObj=None, maxActivity=5):
        """Build target cofactor list for the matching entities in the input sequence match file.

        One record is produced for every (query target, matched PDB entity) pair. The records
        are grouped by "<entryId>_<entityId>" and exported to the cofactor cache file.

        Args:
            sequenceMatchFilePath (str): sequence match output file path
            crmpObj (obj, optional): instance of ChemRefMappingProviderObj(). Defaults to None.
            lnmpObj (obj, optional): instance of LigandNeighborMappingProviderObj(). Defaults to None.
            maxActivity (int, optional): maximum number of prioritized activity records per target. Defaults to 5.

        Returns:
            bool: True for success or False otherwise

        Example Pharos activity record -

        {
            "version": "2021-06-17",
            "created": "2021-06-17T11:10:54.563394",
            "activity": {
                "2232": [
                    {
                    "smiles": "CC(=CCC\\\\C(=C/Cc1c(O)cc(O)c(C(=O)CCc2ccc(O)cc2)c1O)\\\\C)C",
                    "chemblId": "CHEMBL3360923",
                    "pubChemId": "118724585",
                    "activity": 6.0,
                    "activityType": "IC50",
                    "activityUnits": "nM",
                    "name": "1-[3-(3,7-dimethylocta-2,6-dien-1-yl)-2,4,6-trihydroxyphenyl]-3-(4-hydroxyphenyl)propan-1-one",
                    "pubmedId": "25375026",
                    "patent": "USxxxxxx",
                    },
                    ...
        """
        rDL = []
        mD = self.__mU.doImport(sequenceMatchFilePath, fmt="json")
        # ---
        chaP = PharosTargetActivityProvider(cachePath=self.__cachePath, useCache=True)
        #
        provenanceSource = "Pharos"
        refScheme = "PDB entity"
        assignVersion = chaP.getAssignmentVersion()
        for queryId, matchDL in mD.items():
            # "O43508|uniprotId|7987|proteinId|9606|taxId"
            qCmtD = self.__decodeComment(queryId)
            unpId = qCmtD["uniprotId"]
            queryTaxId = qCmtD["taxId"] if "taxId" in qCmtD else None
            pharosId = qCmtD["proteinId"]
            if queryTaxId == "-1":
                logger.debug("Skipping target with missing taxonomy %r (%r)", unpId, pharosId)
                continue
            #
            if not chaP.hasTargetActivity(pharosId):
                # Targets without activity data are deliberately retained (with an empty cofactor list).
                logger.debug("Skipping target with no activities %r (%r)", unpId, pharosId)
                # continue
            # --
            # Index of the chemical components neighboring any entity matched by this query.
            chemCompNeighborsD = {}
            if lnmpObj:
                for matchD in matchDL:
                    tCmtD = self.__decodeComment(matchD["target"])
                    entryId = tCmtD["entityId"].split("_")[0]
                    entityId = tCmtD["entityId"].split("_")[1]
                    rcsbEntityId = entryId + "_" + entityId
                    chemCompIdList = lnmpObj.getLigandNeighbors(rcsbEntityId)
                    chemCompNeighborsD.update({k: True for k in chemCompIdList})
            # --
            queryName = chaP.getTargetInfo(pharosId, "name")
            # --
            for matchD in matchDL:
                tCmtD = self.__decodeComment(matchD["target"])
                entryId = tCmtD["entityId"].split("_")[0]
                entityId = tCmtD["entityId"].split("_")[1]
                rcsbEntityId = entryId + "_" + entityId
                #
                taDL = chaP.getTargetActivity(pharosId)
                logger.debug("Target %r has (%d) activity records", pharosId, len(taDL))
                actL = []
                # Keep only the first activity record for each ChEMBL identifier.
                chD = {}
                for taD in taDL:
                    if taD["chemblId"] in chD:
                        continue
                    chD[taD["chemblId"]] = True
                    actD = {
                        "cofactor_id": taD["chemblId"],
                        # Activity records carry the molecule name under "name" (see the example
                        # above); "molecule_name" is honored first when it is present.
                        "cofactor_name": taD["molecule_name"] if "molecule_name" in taD else taD.get("name"),
                        "measurement_type": "p" + taD["activityType"],
                        "measurement_value": taD["activity"],
                        "pubmed_ids": [taD["pubmedId"]] if "pubmedId" in taD else None,
                        # NOTE(review): the example record above uses the key "patent" - confirm
                        # which key the activity provider really emits.
                        "patent_nos": taD["patents"] if "patents" in taD else None,
                        "smiles": taD["smiles"] if "smiles" in taD else None,
                        "action": taD["action"] if "action" in taD else None,
                        "pharmacology": taD["pharmacology"] if "pharmacology" in taD else None,
                    }
                    actD = self.__addLocalIds(actD, crmpObj=crmpObj)
                    actL.append(actD)
                #
                actL = self.__activityListSelect(actL, chemCompNeighborsD, maxActivity=maxActivity)
                if not actL:
                    logger.debug("No Pharos cofactors for %s %s", pharosId, unpId)
                # ---
                # aligned_target.entity_beg_seq_id (current target is PDB entity in json)
                # aligned_target.target_beg_seq_id (current query is target seq in json)
                # aligned_target.length
                fpL = []
                if "alignedRegions" in matchD:
                    fpL = [
                        {
                            "entity_beg_seq_id": arD["targetBegin"],
                            "target_beg_seq_id": arD["queryBegin"],
                            "length": arD["targetEnd"] - arD["targetBegin"],
                        }
                        for arD in matchD["alignedRegions"]
                    ]
                else:
                    fpL = [
                        {
                            "entity_beg_seq_id": matchD["targetBegin"],
                            "target_beg_seq_id": matchD["queryBegin"],
                            "length": matchD["alignLen"],
                        }
                    ]
                # ---
                rD = {
                    "entry_id": entryId,
                    "entity_id": entityId,
                    "query_uniprot_id": unpId,
                    "query_id": pharosId,
                    "query_id_type": "Pharos",
                    "query_name": queryName,
                    "provenance_source": provenanceSource,
                    "reference_scheme": refScheme,
                    "assignment_version": assignVersion,
                    "query_taxonomy_id": int(queryTaxId) if queryTaxId else None,
                    "target_taxonomy_id": int(matchD["targetTaxId"]) if "targetTaxId" in matchD else None,
                    "aligned_target": fpL,
                    "taxonomy_match_status": matchD["taxonomyMatchStatus"] if "taxonomyMatchStatus" in matchD else None,
                    "lca_taxonomy_id": matchD["lcaTaxId"] if "lcaTaxId" in matchD else None,
                    "lca_taxonomy_name": matchD["lcaTaxName"] if "lcaTaxName" in matchD else None,
                    "lca_taxonomy_rank": matchD["lcaRank"] if "lcaRank" in matchD else None,
                    "cofactors": actL,
                }
                rDL.append(rD)
        #
        # Group the records by "<entryId>_<entityId>" for export.
        qD = {}
        for rD in rDL:
            eId = rD["entry_id"] + "_" + rD["entity_id"]
            qD.setdefault(eId, []).append(rD)
        #
        fp = self.__getCofactorDataPath()
        tS = datetime.datetime.now().isoformat()
        # vS = datetime.datetime.now().strftime("%Y-%m-%d")
        vS = assignVersion
        ok = self.__mU.doExport(fp, {"version": vS, "created": tS, "cofactors": qD}, fmt="json", indent=3)
        return ok

    def __addLocalIds(self, cfD, crmpObj=None):
        """Add the local identifier mapped to the cofactor's ChEMBL identifier, if any.

        The first local identifier returned by crmpObj.getLocalIds("CHEMBL", ...) is stored
        as "prd_id" when it starts with "PRD_" and as "chem_comp_id" otherwise.

        Args:
            cfD (dict): cofactor record with a "cofactor_id" key (updated in place)
            crmpObj (obj, optional): mapping provider exposing getLocalIds(). Defaults to None.

        Returns:
            dict: the input cofactor record
        """
        #
        if crmpObj:
            localIdL = crmpObj.getLocalIds("CHEMBL", cfD["cofactor_id"])
            if localIdL:
                localId = localIdL[0]
                if localId.startswith("PRD_"):
                    cfD["prd_id"] = localId
                else:
                    cfD["chem_comp_id"] = localId
        return cfD

    def __activityListSelect(self, activityDL, chemCompNeighborsD, maxActivity=5):
        """Prioritizing the activity data for locally mapped neighbor ligands and the best binding examples.

        All records mapped to a neighbor chemical component are kept (even when there are
        more than maxActivity of them); any remaining slots are filled with the unmapped
        records having the largest measurement values. The "neighbor_in_pdb" flag is only
        assigned when chemCompNeighborsD is not empty.

        Args:
            activityDL (list): full list of activity objects
            chemCompNeighborsD (dict): index of all chemical components with neighbor interactions to the query target.
            maxActivity (int, optional): maximum number of activity object returned. Defaults to 5.

        Returns:
            list: prioritized and trimmed list of activity objects (sorted by decreasing measurement value)
        """
        retL = []
        mappedNeighborL = []
        unmappedL = activityDL
        if chemCompNeighborsD:
            unmappedL = []
            # Select out the any cases for molecules that map to a neighbor chemical component.
            for activityD in activityDL:
                if "chem_comp_id" in activityD and activityD["chem_comp_id"] in chemCompNeighborsD:
                    activityD["neighbor_in_pdb"] = "Y"
                    mappedNeighborL.append(activityD)
                else:
                    unmappedL.append(activityD)
                    activityD["neighbor_in_pdb"] = "N"
        #
        numLeft = maxActivity - len(mappedNeighborL)
        if numLeft > 0:
            unmappedL = sorted(unmappedL, key=lambda k: k["measurement_value"], reverse=True)
            retL = mappedNeighborL
            retL.extend(unmappedL[:numLeft])
            retL = sorted(retL, key=lambda k: k["measurement_value"], reverse=True)
        else:
            logger.debug("Mapped neighbor cofactors (%d) excluded unmapped (%d)", len(mappedNeighborL), len(unmappedL))
            retL = sorted(mappedNeighborL, key=lambda k: k["measurement_value"], reverse=True)
        return retL

    def __decodeComment(self, comment, separator="|"):
        """Decode a "value|key|value|key..." comment string into a {key: value} dictionary.

        For example "O43508|uniprotId|7987|proteinId|9606|taxId" yields
        {"uniprotId": "O43508", "proteinId": "7987", "taxId": "9606"}.

        Args:
            comment (str): separator-delimited comment string
            separator (str, optional): field separator. Defaults to "|".

        Returns:
            dict: decoded dictionary, or {} if the comment cannot be decoded
        """
        dD = {}
        try:
            # Consuming the same iterator twice in zip() pairs up consecutive fields.
            ti = iter(comment.split(separator))
            dD = {tup[1]: tup[0] for tup in zip(ti, ti)}
        except Exception:
            pass
        return dD
def search(self, dataList, procName, optionsD, workingDir):
    """Multiprocessing worker that shells out to the CCDC search CLI for a list of mol2 paths.

    The input paths are written to <resultPath>/<procName>/queryFileList.list, the
    "ccdc_search_cli" command found under <pythonRootPath>/bin is run on that list, and the
    paths it reports in <resultPath>/<procName>/hitList.list are returned.  Nothing is run
    if the STOP check on <resultPath>/STOP succeeds or if the query list cannot be written.

    Args:
        dataList (list): list of mol2 file paths to be searched
        procName (str): processName
        optionsD (dict): options with keys resultPath, searchType, pythonRootPath, csdHome and
                         timeOut (a missing or non-positive timeOut falls back to 120)
        workingDir (str): path to working directory (not used)

    Returns:
        (successList, resultList, []): success and result lists of mol2 paths with CCDC matches
    """
    resultPath = optionsD["resultPath"]
    searchType = optionsD["searchType"]
    pythonRootPath = optionsD["pythonRootPath"]
    csdHome = optionsD["csdHome"]
    timeOut = optionsD["timeOut"]
    if not (timeOut and timeOut > 0):
        timeOut = 120
    _ = workingDir
    hitPathL = []
    beginTime = time.time()
    logger.info("starting %s at %s", procName, time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
    #
    try:
        stopFilePath = os.path.join(resultPath, "STOP")
        logger.info("%s starting search data length %d", procName, len(dataList))
        if self.__checkStop(stopFilePath):
            logger.info("%s stopping", procName)
            return hitPathL, hitPathL, []
        #
        # All per-process files live in a subdirectory named for the worker process.
        procDirPath = os.path.join(resultPath, procName)
        queryListFilePath = os.path.join(procDirPath, "queryFileList.list")
        marshal = MarshalUtil()
        if not marshal.doExport(queryListFilePath, dataList, fmt="list"):
            return hitPathL, hitPathL, []
        #
        shell = ExecUtils()
        logger.debug("%s executing shell for %s", procName, queryListFilePath)
        cmdPath = os.path.join(pythonRootPath, "bin", "ccdc_search_cli")
        hitListPath = os.path.join(procDirPath, "hitList.list")
        logPath = os.path.join(procDirPath, "execlog.log")
        logger.debug("cmdPath %r", cmdPath)
        cmdTemplate = "%s --mol_list_path %s --result_path %s --search_type %s --csdhome %s --hit_list_path %s"
        cmdArgs = (cmdPath, queryListFilePath, resultPath, searchType, csdHome, hitListPath)
        status = shell.runShell(
            cmdTemplate % cmdArgs,
            outPath=logPath,
            outAppend=True,
            timeOut=timeOut,
            suppressStderr=False,
        )
        #
        if status and marshal.exists(hitListPath):
            hitPathL = marshal.doImport(hitListPath, fmt="list")
    except Exception as e:
        logger.exception("Failing with %s", str(e))
    finishTime = time.time()
    logger.info(
        "%s (result length %d) completed at %s (%.2f seconds)",
        procName,
        len(hitPathL),
        time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
        finishTime - beginTime,
    )
    # The hit list serves as both the success list and the result list.
    return hitPathL, hitPathL, []
class NeighborInteractionProvider(object):
    """Generators and accessors for non-polymer instance target interactions.

    Interaction data is held as {"version": ..., "created": ..., "entries": {entryId: {...}}}
    and cached in <cachePath>/neighbor-interactions/ligand-target-neighbors/neighbor-data.<pic|json>.
    """

    def __init__(self, cfgOb, configName, cachePath, **kwargs):
        """Set up the provider and load any cached interaction data (pickle format).

        Args:
            cfgOb (obj): configuration object (used for repository access and stash settings)
            configName (str): configuration section name holding the stash settings
            cachePath (str): top-level cache directory

        Keyword Args:
            fileLimit (int): passed to the repository provider. Defaults to None.
            numProc (int): number of worker processes. Defaults to 2.
            chunkSize (int): multiprocessing chunk size. Defaults to 10.
            useCache (bool): read the cached interaction file if it exists. Defaults to True.
        """
        #
        self.__version = __version__
        self.__cfgOb = cfgOb
        self.__configName = configName
        self.__cachePath = cachePath
        self.__fileLimit = kwargs.get("fileLimit", None)
        self.__dirPath = os.path.join(cachePath, "neighbor-interactions")
        self.__numProc = kwargs.get("numProc", 2)
        self.__chunkSize = kwargs.get("chunkSize", 10)
        useCache = kwargs.get("useCache", True)
        #
        #  - Configuration for stash services -
        #    Local target directory name to be stashed.  (subdir of dirPath)
        #
        self.__stashDir = "ligand-target-neighbors"
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb, numProc=self.__numProc, fileLimit=self.__fileLimit, cachePath=self.__cachePath)
        self.__neighborD = self.__reload(fmt="pickle", useCache=useCache)
        #

    def testCache(self, minCount=0):
        """Return True if at least minCount entries are loaded (always True when minCount is 0).

        Args:
            minCount (int, optional): minimum number of entries required. Defaults to 0.

        Returns:
            bool: True if the cache satisfies the minimum count or False otherwise
        """
        try:
            if minCount == 0:
                return True
            if self.__neighborD and minCount and len(self.__neighborD["entries"]) >= minCount:
                logger.info(
                    "Target neighbor data for (%d) entries created %r version %r",
                    len(self.__neighborD["entries"]),
                    self.__neighborD["created"],
                    self.__neighborD["version"],
                )
                return True
        except Exception:
            pass
        return False

    def getLigandNeighborIndex(self, entryId):
        """Return the target neighbors for the non-polymer instances for the input entry.

        Args:
            entryId (str): entry identifier

        Returns:
            (dict): {ligandAsymId: {(targetAsymId, targetAuthSeqId): nnIndex1, (): nnIndex2}
        """
        try:
            return self.__neighborD["entries"][entryId.upper()]["ligandNeighborIndexD"]
        except Exception:
            pass
        return {}

    def getTargetNeighborIndex(self, entryId):
        """Return the ligand neighbors for the polymer or branched entity instances in the input entry.

        Args:
            entryId (str): entry identifier

        Returns:
            (dict): {(targetAsymId, targetAuthSeqId): {(ligandAsymId): nnIndex1, (): nnIndex2}
        """
        try:
            return self.__neighborD["entries"][entryId.upper()]["targetNeighborIndexD"]
        except Exception:
            pass
        return {}

    def getNearestNeighborList(self, entryId):
        """Return the list of nearest neighbors for the entry.

        Args:
            entryId (str): entry identifier

        Returns:
            list: [LigandTargetInstance(), ...]
        """
        try:
            return self.__neighborD["entries"][entryId.upper()]["nearestNeighbors"]
        except Exception:
            pass
        return []

    def getLigandNeighborBoundState(self, entryId):
        """Return the dictionary of ligand instances with isBound boolean status.

        Args:
            entryId (str): entry identifier

        Returns:
            (dict): {ligandAsymId: True if isBound,  ... }
        """
        try:
            return self.__neighborD["entries"][entryId.upper()]["ligandIsBoundD"]
        except Exception:
            pass
        return {}

    def getAtomCounts(self, entryId):
        """Return the non-polymer instance atom counts for the input entry (all reported atoms).

        Args:
            entryId (str): entry identifier

        Returns:
            (dict): {asymId: {'FL': count, 'altA': count, 'altB': count, ... }}
        """
        try:
            return self.__neighborD["entries"][entryId.upper()]["ligandAtomCountD"]
        except Exception:
            pass
        return {}

    def getHydrogenAtomCounts(self, entryId):
        """Return the non-polymer instance hydrogen atom counts for the input entry.

        Args:
            entryId (str): entry identifier

        Returns:
            (dict): {asymId: {'FL': count, 'altA': count, 'altB': count, ... }}
        """
        try:
            return self.__neighborD["entries"][entryId.upper()]["ligandHydrogenAtomCountD"]
        except Exception:
            pass
        return {}

    def hasEntry(self, entryId):
        """Return if the input entry is stored in the cache of non-polymer instance target interactions.

        The identifier is matched as given and also upper-cased, consistent with the other
        accessors which all look up entryId.upper().

        Args:
            entryId (str): entry identifier

        Returns:
            (bool): True if entry is in the cache or False otherwise
        """
        try:
            entryD = self.__neighborD["entries"]
            return entryId in entryD or entryId.upper() in entryD
        except Exception:
            pass
        return False

    def getEntries(self):
        """Return a list of entry identifier for which non-polymer instance target interactions are stored.

        Returns:
            (list): [entryId, entryId, ... ]
        """
        try:
            return list(self.__neighborD["entries"].keys())
        except Exception:
            pass
        return []

    def generate(self, distLimit=5.0, updateOnly=False, fmt="pickle", indent=0):
        """Generate and export non-polymer target interactions for all of the structures in the repository.

        Args:
            distLimit (float, optional): interaction distance. Defaults to 5.0.
            updateOnly (bool):  only calculate interactions for new entries.  Defaults to False.
            fmt (str, optional): export file format. Defaults to "pickle".
            indent (int, optional): json format indent. Defaults to 0.

        Returns:
            bool: True for success or False otherwise
        """
        ok = False
        try:
            tS = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
            tD = self.__calculateNeighbors(distLimit=distLimit, numProc=self.__numProc, chunkSize=self.__chunkSize, updateOnly=updateOnly)
            self.__neighborD = {"version": self.__version, "created": tS, "entries": tD}
            # Serializer options depend on the export format.
            kwargs = {"indent": indent} if fmt == "json" else {"pickleProtocol": 4}
            targetFilePath = self.__getTargetFilePath(fmt=fmt)
            ok = self.__mU.doExport(targetFilePath, self.__neighborD, fmt=fmt, **kwargs)
            logger.info("Wrote %r status %r", targetFilePath, ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok

    def reload(self, fmt="pickle"):
        """Re-read the cached interaction file.

        Args:
            fmt (str, optional): cache file format. Defaults to "pickle".

        Returns:
            bool: True if interaction data (possibly empty) is available or False otherwise
        """
        self.__neighborD = self.__reload(fmt=fmt, useCache=True)
        return self.__neighborD is not None

    def __reload(self, fmt="pickle", useCache=True):
        """Reload from the current cache file.

        Args:
            fmt (str, optional): cache file format. Defaults to "pickle".
            useCache (bool, optional): read the cache file if it exists. Defaults to True.

        Returns:
            dict: interaction data; an empty data set if no cache file is read, or possibly
                  None if the reload fails
        """
        # Bound before the try block so that an early failure cannot leave it undefined.
        neighborD = None
        try:
            targetFilePath = self.__getTargetFilePath(fmt=fmt)
            tS = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
            neighborD = {"version": self.__version, "created": tS, "entries": {}}
            logger.debug("useCache %r targetFilePath %r", useCache, targetFilePath)
            #
            if useCache and self.__mU.exists(targetFilePath):
                # NOTE(review): the default format is pickle and the cache file may have been restored
                # by fromStash() - only restore bundles from a trusted stash server.
                neighborD = self.__mU.doImport(targetFilePath, fmt=fmt)
                if fmt != "pickle":
                    # Non-pickle formats store plain sequences; rebuild the named tuples.
                    for _, nD in neighborD["entries"].items():
                        nD["nearestNeighbors"] = [LigandTargetInstance(*neighbor) for neighbor in nD["nearestNeighbors"]]
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        return neighborD

    def __getTargetFilePath(self, fmt="pickle"):
        """Return the cache file path for the input format ("pic" extension for pickle, "json" otherwise)."""
        ext = "pic" if fmt == "pickle" else "json"
        pth = os.path.join(self.__dirPath, "ligand-target-neighbors", "neighbor-data." + ext)
        return pth

    def __calculateNeighbors(self, distLimit=5.0, numProc=2, chunkSize=10, updateOnly=False):
        """Calculate non-polymer target interactions for all repository structure files.

        Args:
            distLimit (float, optional): interaction distance limit. Defaults to 5.0.
            numProc (int, optional): number of processes to use. Defaults to 2.
            chunkSize (int, optional): incremental chunk size used for distribute work processes. Defaults to 10.
            updateOnly (bool, optional): reuse the entries already loaded and only process entries
                                         that are not in the cache. Defaults to False.

        Returns:
            (dict): {entryId: {asymId: [TargetLigandInteraction()], ...}, ...}
        """
        contentType = "pdbx"
        mergeContent = None
        rD = {}
        exD = {}
        #
        # updateOnly - will reuse any existing data loaded when this is instantiated
        #              otherwise the cache context is cleared before the calculation.
        if updateOnly:
            exD = {k: True for k in self.getEntries()}
            rD = self.__neighborD["entries"] if "entries" in self.__neighborD else {}
        #
        locatorObjList = self.__rpP.getLocatorObjList(contentType=contentType, mergeContentTypes=mergeContent, excludeIds=exD)
        logger.info("Starting with %d numProc %d updateOnly (%r)", len(locatorObjList), self.__numProc, updateOnly)
        #
        rWorker = TargetInteractionWorker(self.__rpP)
        mpu = MultiProcUtil(verbose=True)
        optD = {"distLimit": distLimit}
        mpu.setOptions(optD)
        mpu.set(workerObj=rWorker, workerMethod="build")
        ok, failList, resultList, _ = mpu.runMulti(dataList=locatorObjList, numProc=numProc, numResults=1, chunkSize=chunkSize)
        if failList:
            logger.info("Target interaction build failures (%d): %r", len(failList), failList)
        #
        # New results are merged over any reused entries.
        for (entryId, nD) in resultList[0]:
            rD[entryId] = nD
        #
        logger.info("Completed with multi-proc status %r failures %r total entries with data (%d)", ok, len(failList), len(rD))
        return rD

    def toStash(self):
        """Bundle the local cache directory and store it on the primary and fallback stash servers.

        Returns:
            bool: status of the store operation on the fallback server, or False on failure
        """
        ok = False
        try:
            userName = self.__cfgOb.get("_STASH_AUTH_USERNAME", sectionName=self.__configName)
            password = self.__cfgOb.get("_STASH_AUTH_PASSWORD", sectionName=self.__configName)
            basePath = self.__cfgOb.get("_STASH_SERVER_BASE_PATH", sectionName=self.__configName)
            url = self.__cfgOb.get("STASH_SERVER_URL", sectionName=self.__configName)
            urlFallBack = self.__cfgOb.get("STASH_SERVER_FALLBACK_URL", sectionName=self.__configName)
            # NOTE(review): the primary status is overwritten by the fallback status below, so a
            # failed primary store is not reported - confirm which status callers expect.
            ok = self.__toStash(url, basePath, userName=userName, password=password)
            ok = self.__toStash(urlFallBack, basePath, userName=userName, password=password)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok

    def __toStash(self, url, stashRemoteDirPath, userName=None, password=None, remoteStashPrefix=None):
        """Copy tar and gzipped bundled cache data to remote server/location.

        Args:
            url (str): server URL (e.g. sftp://hostname.domain) None for local host
            stashRemoteDirPath (str): path to target directory on remote server
            userName (str, optional): server username. Defaults to None.
            password (str, optional): server password. Defaults to None.
            remoteStashPrefix (str, optional): channel prefix. Defaults to None.

        Returns:
            (bool): True for success or False otherwise
        """
        ok = False
        try:
            stU = StashUtil(os.path.join(self.__dirPath, "stash"), "ligand-target-neighbors")
            ok = stU.makeBundle(self.__dirPath, [self.__stashDir])
            if ok:
                ok = stU.storeBundle(url, stashRemoteDirPath, remoteStashPrefix=remoteStashPrefix, userName=userName, password=password)
        except Exception as e:
            logger.error("Failing with url %r stashDirPath %r: %s", url, stashRemoteDirPath, str(e))
        return ok

    def fromStash(self):
        """Restore the local cache from the primary stash server, or from the fallback server
        if the data restored from the primary server does not hold the minimum number of
        entries (10).

        After each fetch the cache file is reloaded and then tested, so the returned status
        always describes the data that is currently loaded.

        Returns:
            bool: True if the reloaded cache holds at least the minimum number of entries or False otherwise
        """
        # Bound before the try block so that a configuration failure returns False
        # instead of raising on an undefined name.
        ok = False
        try:
            minCount = 10
            userName = self.__cfgOb.get("_STASH_AUTH_USERNAME", sectionName=self.__configName)
            password = self.__cfgOb.get("_STASH_AUTH_PASSWORD", sectionName=self.__configName)
            basePath = self.__cfgOb.get("_STASH_SERVER_BASE_PATH", sectionName=self.__configName)
            url = self.__cfgOb.get("STASH_SERVER_URL", sectionName=self.__configName)
            #
            self.__fromStash(url, basePath, userName=userName, password=password)
            self.reload()
            ok = self.testCache(minCount=minCount)
            if not ok:
                urlFallBack = self.__cfgOb.get("STASH_SERVER_FALLBACK_URL", sectionName=self.__configName)
                self.__fromStash(urlFallBack, basePath, userName=userName, password=password)
                # Reload before testing so the test sees the data fetched from the fallback server.
                self.reload()
                ok = self.testCache(minCount=minCount)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok

    def __fromStash(self, url, stashRemoteDirPath, userName=None, password=None, remoteStashPrefix=None):
        """Restore local cache from a tar and gzipped bundle to fetched from a remote server/location.

        Args:
            url (str): server URL (e.g. sftp://hostname.domain) None for local host
            stashRemoteDirPath (str): path to target directory on remote server
            userName (str, optional): server username. Defaults to None.
            password (str, optional): server password. Defaults to None.
            remoteStashPrefix (str, optional): channel prefix. Defaults to None.

        Returns:
            (bool): True for success or False otherwise
        """
        ok = False
        try:
            stU = StashUtil(os.path.join(self.__dirPath, "stash"), "ligand-target-neighbors")
            ok = stU.fetchBundle(self.__dirPath, url, stashRemoteDirPath, remoteStashPrefix=remoteStashPrefix, userName=userName, password=password)
        except Exception as e:
            logger.error("Failing with url %r stashDirPath %r: %s", url, stashRemoteDirPath, str(e))
        return ok

    def convert(self, fmt1="json", fmt2="pickle"):
        """Read the cache file in format fmt1 and rewrite it in format fmt2.

        The imported data also replaces the data held by this instance.

        Args:
            fmt1 (str, optional): input cache file format. Defaults to "json".
            fmt2 (str, optional): output cache file format. Defaults to "pickle".

        Returns:
            bool: export status
        """
        #
        targetFilePath = self.__getTargetFilePath(fmt=fmt1)
        self.__neighborD = self.__mU.doImport(targetFilePath, fmt=fmt1)
        #
        targetFilePath = self.__getTargetFilePath(fmt=fmt2)
        ok = self.__mU.doExport(targetFilePath, self.__neighborD, fmt=fmt2, pickleProtocol=4)
        return ok
class EntityInstanceExtractor(object):
    """Selected utilities to extract data from entity instance collections.

    Implementation notes on idioms used by the analysis methods.

    Grouping a sorted integer sequence into runs of consecutive values::

        from itertools import groupby
        from operator import itemgetter

        seq2 = [1, 2, 4, 5, 6, 8, 9, 10]
        runs = [list(map(itemgetter(1), g)) for _, g in groupby(enumerate(seq2), lambda x: x[0] - x[1])]
        # runs -> [[1, 2], [4, 5, 6], [8, 9, 10]]

    Reporting contiguous regions of an array satisfying a condition::

        x = np.cumsum(np.random.random(1000) - 0.5)
        condition = np.abs(x) < 1
        for start, stop in contiguous_regions(condition):
            segment = x[start:stop]
            print(start, stop, segment.min(), segment.max())

    Normalizing ragged samples::

        samples = [[1, 2, 3], [1, 2]]
        c = np.hstack(samples)  # gives [1, 2, 3, 1, 2]
        mean, std = np.mean(c), np.std(c)
        newSamples = [(np.array(xi) - mean) / std for xi in samples]
    """

    def __init__(self, cfgOb):
        """Store the configuration object and create an empty reference sequence cache.

        Args:
            cfgOb (obj): configuration object handed to Connection()
        """
        self.__cfgOb = cfgOb
        self.__resourceName = "MONGO_DB"
        # Reference sequences keyed by database accession (populated in getPolymerEntities())
        self.__seqCache = {}
        self.__mU = MarshalUtil()

    def getEntryInfo(self, **kwargs):
        """Return a dictionary of PDB entries satisfying the input conditions (e.g. method, resolution limit).

        Args:
            resLimit (float, optional): upper limit for refine.ls_d_res_high (default: 3.5)
            expMethod (str, optional): value for rcsb_entry_info.experimental_method (default: "X-ray")
            dbName (str, optional): database name (default: "pdbx_core")
            collectionName (str, optional): collection name (default: "pdbx_core_entry")

        Returns:
            dict: {entryId: {"polymer_composition": ..., "experimental_method": ..., "ls_d_res_high": ...}, ...}
                  (empty on failure - exceptions are logged and not raised)
        """
        resLimit = kwargs.get("resLimit", 3.5)
        expMethod = kwargs.get("expMethod", "X-ray")
        #
        dbName = kwargs.get("dbName", "pdbx_core")
        collectionName = kwargs.get("collectionName", "pdbx_core_entry")
        #
        entryD = {}
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(dbName, collectionName):
                    logger.info("%s %s document count is %d", dbName, collectionName, mg.count(dbName, collectionName))
                    qD = {"rcsb_entry_info.experimental_method": expMethod, "refine.0.ls_d_res_high": {"$lte": resLimit}}
                    selectL = ["rcsb_entry_container_identifiers", "rcsb_entry_info", "refine"]
                    dL = mg.fetch(dbName, collectionName, selectL, queryD=qD)
                    logger.info("Selection %r fetch result count %d", selectL, len(dL))
                    #
                    for dV in dL:
                        if "rcsb_entry_container_identifiers" not in dV:
                            continue
                        entryId = dV["rcsb_entry_container_identifiers"]["entry_id"]
                        entryD[entryId] = {}
                        if "rcsb_entry_info" in dV and "polymer_composition" in dV["rcsb_entry_info"]:
                            entryD[entryId] = {
                                "polymer_composition": dV["rcsb_entry_info"]["polymer_composition"],
                                "experimental_method": dV["rcsb_entry_info"]["experimental_method"],
                            }
                        if "refine" in dV and dV["refine"] and "ls_d_res_high" in dV["refine"][0]:
                            entryD[entryId]["ls_d_res_high"] = dV["refine"][0]["ls_d_res_high"]
                            logger.debug("Got res %r", dV["refine"][0]["ls_d_res_high"])
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return entryD

    def getEntityIds(self, entryIdList):
        """Return the polymer entity container identifiers for each entry in the input list.

        Args:
            entryIdList (list): entry identifiers

        Returns:
            dict: {entryId: [rcsb_polymer_entity_container_identifiers, ...], ...}
                  (possibly partial on failure - exceptions are logged and not raised)
        """
        dbName = "pdbx_core"
        collectionName = "pdbx_core_polymer_entity"
        docD = {}
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(dbName, collectionName):
                    logger.info("%s %s document count is %d", dbName, collectionName, mg.count(dbName, collectionName))
                    selectL = ["rcsb_polymer_entity_container_identifiers"]
                    for entryId in entryIdList:
                        qD = {"rcsb_polymer_entity_container_identifiers.entry_id": entryId}
                        tL = mg.fetch(dbName, collectionName, selectL, queryD=qD)
                        #
                        logger.debug("Selection %r fetch result count %d", selectL, len(tL))
                        docD[entryId] = [vv["rcsb_polymer_entity_container_identifiers"] for vv in tL]
            logger.debug("docD is %r", docD)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return docD

    def getPolymerEntities(self, entryD, **kwargs):
        """Add the polymer entities satisfying the selection conditions to each entry of the input dictionary.

        Entries that already carry the result key are skipped, so a saved intermediate
        result can be passed back in to resume processing.

        Args:
            entryD (dict): entry dictionary (e.g. from getEntryInfo()), updated in place
            dbName (str, optional): database name (default: "pdbx_core")
            collectionName (str, optional): collection name (default: "pdbx_core_polymer_entity")
            resultKey (str, optional): key added to each entry (default: "selected_polymer_entities")
            savePath (str, optional): path for intermediate and final saved results (default: "entry-data.pic")
            entryLimit (int, optional): stop after processing this number of entries (default: None)
            saveKwargs (dict, optional): keyword arguments for MarshalUtil.doExport() (default: {"fmt": "pickle"})

        Returns:
            dict: the updated input dictionary
        """
        dbName = kwargs.get("dbName", "pdbx_core")
        collectionName = kwargs.get("collectionName", "pdbx_core_polymer_entity")
        resultKey = kwargs.get("resultKey", "selected_polymer_entities")
        savePath = kwargs.get("savePath", "entry-data.pic")
        entryLimit = kwargs.get("entryLimit", None)
        saveKwargs = kwargs.get("saveKwargs", {"fmt": "pickle"})
        # Bound before the query so that the final save/log is valid even if the collection is missing
        iCount = 0
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(dbName, collectionName):
                    logger.info("%s %s document count is %d", dbName, collectionName, mg.count(dbName, collectionName))
                    selectL = [
                        "rcsb_polymer_entity_container_identifiers",
                        "entity_poly.type",
                        "entity_poly.pdbx_seq_one_letter_code_can",
                        "rcsb_entity_source_organism.ncbi_taxonomy_id",
                        "rcsb_entity_source_organism.ncbi_scientific_name",
                        "struct_ref.pdbx_seq_one_letter_code",
                        "struct_ref.pdbx_db_accession",
                        "struct_ref.db_name",
                        "struct_ref.entity_id",
                    ]
                    for entryId in entryD:
                        #
                        if resultKey in entryD[entryId]:
                            continue
                        #
                        qD = {
                            "rcsb_polymer_entity_container_identifiers.entry_id": entryId,
                            "entity_poly.rcsb_entity_polymer_type": "Protein",
                            "entity.rcsb_multiple_source_flag": "N",
                        }
                        #
                        dL = mg.fetch(dbName, collectionName, selectL, queryD=qD)
                        logger.debug("%s query %r fetch result count %d", entryId, qD, len(dL))
                        eD = {}
                        for ii, dV in enumerate(dL, 1):
                            rD = {}
                            logger.debug("%s (%4d) d is %r", entryId, ii, dV)
                            idD = dV.get("rcsb_polymer_entity_container_identifiers")
                            if idD is not None and "asym_ids" in idD:
                                rD["asym_ids"] = idD["asym_ids"]
                                rD["entity_id"] = idD["entity_id"]
                            if "entity_poly" in dV and "type" in dV["entity_poly"]:
                                rD["type"] = dV["entity_poly"]["type"]
                                # A missing canonical sequence is recorded as None rather than aborting the run
                                rD["seq_one_letter_code_can"] = dV["entity_poly"].get("pdbx_seq_one_letter_code_can")
                            # Guard against an empty source organism list before taking the first element
                            if dV.get("rcsb_entity_source_organism"):
                                soD = dV["rcsb_entity_source_organism"][0]
                                rD["ncbi_taxonomy_id"] = soD.get("ncbi_taxonomy_id")
                                rD["ncbi_scientific_name"] = soD.get("ncbi_scientific_name")
                            if "struct_ref" in dV and len(dV["struct_ref"]) == 1:
                                srD = dV["struct_ref"][0]
                                rD["seq_one_letter_code_ref"] = srD.get("pdbx_seq_one_letter_code")
                                rD["db_accession"] = srD.get("pdbx_db_accession")
                                rD["db_name"] = srD.get("db_name")
                                #
                                refDbName = rD["db_name"]
                                dbAccession = rD["db_accession"]
                                dbRefSeq = self.__seqCache.get(dbAccession)
                                if refDbName in ["UNP"] and dbAccession and not dbRefSeq:
                                    dbRefSeq = self.__fetchUniprot(dbAccession)
                                    self.__seqCache[dbAccession] = dbRefSeq
                                    logger.debug("Fetch uniprot %r", dbRefSeq)
                                rD["ref_db_seq"] = dbRefSeq
                            else:
                                rD["seq_one_letter_code_ref"] = rD["db_accession"] = rD["db_name"] = None
                            #
                            if "entity_id" in rD:
                                eD[rD["entity_id"]] = copy.copy(rD)
                        entryD[entryId][resultKey] = copy.copy(eD)
                        iCount += 1
                        if iCount % 10 == 0:
                            logger.info("Completed polymer entities fetch %d/%d entries", iCount, len(entryD))
                        if iCount % 2000 == 0:
                            # periodic checkpoint of the accumulated results
                            ok = self.__mU.doExport(savePath, entryD, **saveKwargs)
                            logger.info("Saved polymer entity results (%d) status %r in %s", iCount, ok, savePath)
                        if entryLimit and iCount >= entryLimit:
                            logger.info("Quitting after %d", iCount)
                            break
            #
            ok = self.__mU.doExport(savePath, entryD, **saveKwargs)
            logger.info("Saved polymer entity results (%d) entries %d status %r in %s", iCount, len(entryD), ok, savePath)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return entryD

    def getEntityInstances(self, entryD, **kwargs):
        """Get the selected validation data for the instances in the input entry dictionary.

        For each selected polymer entity the per-instance validation records are fetched,
        analyzed with analEntity(), and the result is stored as:

            entryD[entryId]['selected_polymer_entities'][entityId]['anal_instances'] = {asymId: {...}, ...}

        Entries without a 'selected_polymer_entities' key (e.g. those not reached because of an
        entryLimit in getPolymerEntities()) are passed over rather than aborting the run.

        Args:
            entryD (dict): entry dictionary from getPolymerEntities(), updated in place
            dbName (str, optional): database name (default: "pdbx_core")
            collectionName (str, optional): collection name (default: "pdbx_core_polymer_entity_instance")
            savePath (str, optional): path for intermediate and final saved results (default: "entry-data.pic")
            saveKwargs (dict, optional): keyword arguments for MarshalUtil.doExport() (default: {"fmt": "pickle"})
            entryLimit (int, optional): stop after processing this number of entries (default: None)

        Returns:
            dict: the updated input dictionary
        """
        dbName = kwargs.get("dbName", "pdbx_core")
        collectionName = kwargs.get("collectionName", "pdbx_core_polymer_entity_instance")
        savePath = kwargs.get("savePath", "entry-data.pic")
        saveKwargs = kwargs.get("saveKwargs", {"fmt": "pickle"})
        entryLimit = kwargs.get("entryLimit", None)
        #
        try:
            # optF is a fixed local switch - when False the raw validation records are not retained
            optF = False
            iCount = 0
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(dbName, collectionName):
                    logger.info("%s %s total document count is %d", dbName, collectionName, mg.count(dbName, collectionName))
                    #
                    selectL = ["pdbx_vrpt_instance_results"]
                    for entryId, entD in entryD.items():
                        for entityId, peD in entD.get("selected_polymer_entities", {}).items():
                            vD = {}
                            for asymId in peD["asym_ids"]:
                                qD = {
                                    "rcsb_polymer_entity_instance_container_identifiers.entry_id": entryId,
                                    "rcsb_polymer_entity_instance_container_identifiers.asym_id": asymId,
                                }
                                tL = mg.fetch(dbName, collectionName, selectL, queryD=qD)
                                if not tL:
                                    logger.info("No validation data for %s %s %s(%s)", dbName, collectionName, entryId, asymId)
                                    continue
                                #
                                logger.debug(">>> %s %s (%s) dict key length %d ", collectionName, entryId, asymId, len(tL[0]))
                                #
                                # Per-instance result (named distinctly from the enclosing loop variables)
                                instD = {}
                                if optF:
                                    instD["pdbx_vrpt_instance_results"] = tL[0].get("pdbx_vrpt_instance_results", [])
                                    urdL = tL[0].get("pdbx_unobs_or_zero_occ_residues", [])
                                    instD["pdbx_unobs_or_zero_occ_residues"] = [
                                        {"label_seq_id": urd["label_seq_id"], "label_comp_id": urd["label_comp_id"]} for urd in urdL
                                    ]
                                #
                                try:
                                    irdL = tL[0].get("pdbx_vrpt_instance_results", [])
                                    instD["pdbx_vrpt_instance_results_seq"] = [
                                        {"label_seq_id": ird["label_seq_id"], "label_comp_id": ird["label_comp_id"]} for ird in irdL
                                    ]
                                except Exception as e:
                                    logger.error("Failing with entryId %s entityId %s asymId %s bad validation data %s", entryId, entityId, asymId, str(e))
                                #
                                try:
                                    irdL = tL[0].get("pdbx_vrpt_instance_results", [])
                                    instD["pdbx_vrpt_instance_results_occ"] = [
                                        {"OWAB": ird["OWAB"], "label_seq_id": ird["label_seq_id"], "label_comp_id": ird["label_comp_id"]} for ird in irdL
                                    ]
                                except Exception as e:
                                    # logged at debug level only (unlike the sequence selection above)
                                    logger.debug("Failing with entryId %s entityId %s asymId %s bad validation data %s", entryId, entityId, asymId, str(e))
                                vD[asymId] = copy.copy(instD)
                            #
                            analD = self.analEntity(entryId, peD, vD)
                            entryD[entryId]["selected_polymer_entities"][entityId]["anal_instances"] = copy.copy(analD)
                        iCount += 1
                        if iCount % 500 == 0:
                            logger.info("Completed %d/%d entries", iCount, len(entryD))
                        if iCount % 2000 == 0:
                            # periodic checkpoint of the accumulated results
                            ok = self.__mU.doExport(savePath, entryD, **saveKwargs)
                            logger.info("Saved polymer entity instance results (%d) status %r in %s", iCount, ok, savePath)
                        if entryLimit and iCount >= entryLimit:
                            break
            ok = self.__mU.doExport(savePath, entryD, **saveKwargs)
            logger.info("Saved polymer instance results (%d) entries %d status %r in %s", iCount, len(entryD), ok, savePath)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return entryD

    def analEntity(self, entryId, entityD, vD, **kwargs):
        """Compute sequence coverage, modeled sequence gaps and high OWAB regions for each instance of an entity.

        Args:
            entryId (str): entry identifier (used in log messages)
            entityD (dict): selected polymer entity dictionary, for example:

                {'asym_ids': ['D', 'C', 'E', 'A', 'B', 'F'],
                 'entity_id': '1',
                 'type': 'polypeptide(L)',
                 'seq_one_letter_code_can': 'MAKGQSLQDPFLNALRRERVPVSIYLVNGIKLQGQIESFDQFVILLKNTVSQMVYKHAISTVVPS',
                 'ncbi_taxonomy_id': 511693,
                 'ncbi_scientific_name': 'Escherichia coli BL21',
                 'seq_one_letter_code_ref': 'MAKGQSLQDPFLNALRRERVPVSIYLVNGIKLQGQIESFDQFVILLKNTVSQMVYKHAISTVVPS',
                 'db_accession': 'C5W5L7',
                 'db_name': 'UNP',
                 'ref_db_seq': '...'}

            vD (dict): validation data by instance as assembled in getEntityInstances(), for example:

                {'D': {'pdbx_vrpt_instance_results_seq': [{'label_seq_id': 5, 'label_comp_id': 'GLN'}, ...],
                       'pdbx_vrpt_instance_results_occ': [{'OWAB': 29.45, 'label_seq_id': 5, 'label_comp_id': 'GLN'}, ...]}}

            **kwargs: unused

        Returns:
            dict: {asymId: {"coverage_inst_refdb": float|None, "coverage_inst_entity": float|None,
                            "gapD": {index: gap length}, "owabRegiond": {index: {"length", "occ_min", "occ_max"}}}, ...}
                  (possibly partial on failure - exceptions are logged and not raised)
        """
        _ = kwargs
        analD = {}
        try:
            entityId = entityD["entity_id"]
            asymIdL = entityD["asym_ids"]
            refSeq = entityD.get("seq_one_letter_code_ref")
            entitySeq = entityD.get("seq_one_letter_code_can")
            # -------
            # Reference database (e.g. UniProt) details
            dbName = entityD.get("db_name")
            dbAccession = entityD.get("db_accession")
            dbRefSeq = entityD.get("ref_db_seq")
            # --
            if dbRefSeq:
                logger.debug("%s (%s) ref db %4d:  %r", dbAccession, dbName, len(dbRefSeq), dbRefSeq)
            if refSeq:
                logger.debug("%s (%s) seq ref pdb %4d:  %r", dbAccession, dbName, len(refSeq), refSeq)
            if entitySeq:
                logger.debug("%s (%s) entity sample %4d:  %r", dbAccession, dbName, len(entitySeq), entitySeq)
            #
            lenRefDbSeq = len(dbRefSeq) if dbRefSeq else None
            # A missing or empty sample sequence yields an undefined (None) sample coverage
            lenEntitySeq = len(entitySeq) if entitySeq else 0
            #
            for asymId in asymIdL:
                if asymId not in vD:
                    logger.error("Missing validation data for %s %s %s", entryId, entityId, asymId)
                    continue
                #
                irDL = vD[asymId].get("pdbx_vrpt_instance_results_seq", [])
                # Sorted explicitly - the segment and gap detection below requires ascending positions
                # and set iteration order is not guaranteed to be ascending.
                lsL = sorted(set(dV["label_seq_id"] for dV in irDL))
                lenInstanceSeq = len(lsL)

                instRefDbSeqCov = 1.0 - float(lenRefDbSeq - lenInstanceSeq) / float(lenRefDbSeq) if lenRefDbSeq else None
                instSampleSeqCov = 1.0 - float(lenEntitySeq - lenInstanceSeq) / float(lenEntitySeq) if lenEntitySeq else None
                #
                occDL = vD[asymId].get("pdbx_vrpt_instance_results_occ", [])
                # average the OWAB values reported for each sequence position
                owabRegD = {}
                if occDL:
                    owabD = {}
                    for dV in occDL:
                        owabD.setdefault(dV["label_seq_id"], []).append(dV["OWAB"])
                    #
                    meanOwabD = {k: mean(v) for k, v in owabD.items()}
                    meanOwabL = list(meanOwabD.values())
                    meanOwab = mean(meanOwabL)
                    # stdev() requires at least two data points
                    stdevOwab = stdev(meanOwabL) if len(meanOwabL) > 1 else 0.0
                    #
                    logger.debug(">> Length of B values list %d mean %.3f stdev %.3f", len(meanOwabD), meanOwab, stdevOwab)
                    #
                    meanOwabA = np.array(meanOwabL)
                    # NOTE(review): threshold is twice the mean; stdevOwab is computed but only logged -
                    # confirm whether (meanOwab + stdevOwab) was intended.
                    condition = meanOwabA > (meanOwab + meanOwab)
                    regL = self.__contiguousRegions(condition)
                    for ii, (start, stop) in enumerate(regL, 1):
                        segment = meanOwabA[start:stop]
                        logger.debug("B value range =  start %d stop %d min %.3f max %.3f", start, stop, segment.min(), segment.max())
                        # stop is exclusive so the region holds (stop - start) positions
                        owabRegD[ii] = {"length": int(stop - start), "occ_min": float(segment.min()), "occ_max": float(segment.max())}
                #
                # Runs of consecutive modeled positions
                segL = [list(map(itemgetter(1), g)) for _, g in groupby(enumerate(lsL), lambda x: x[0] - x[1])]
                logger.debug("Modeled sequence length %d segments %d", len(lsL), len(segL))
                #
                gapD = {}
                for ii in range(1, len(segL)):
                    bG = segL[ii - 1][-1]
                    eG = segL[ii][0]
                    gapD[ii] = eG - bG - 1
                    logger.debug("Gap %d length %d", ii, gapD[ii])
                #
                # Undefined sample coverage is logged as nan to satisfy the numeric format
                logSampleCov = instSampleSeqCov if instSampleSeqCov is not None else float("nan")
                if instRefDbSeqCov:
                    logger.debug(
                        "Summary %s %s %s refcov %.2f sampleCov %.2f - gaps (%d) %r owabs seqments (%d) %r",
                        entryId,
                        entityId,
                        asymId,
                        instRefDbSeqCov,
                        logSampleCov,
                        len(gapD),
                        list(gapD.values()),
                        len(owabRegD),
                        list(owabRegD.values()),
                    )
                else:
                    logger.debug(
                        "Summary %s %s %s sampleCov %.2f - gaps (%d) %r owabs seqments (%d) %r",
                        entryId,
                        entityId,
                        asymId,
                        logSampleCov,
                        len(gapD),
                        list(gapD.values()),
                        len(owabRegD),
                        list(owabRegD.values()),
                    )
                #
                analD[asymId] = {
                    "coverage_inst_refdb": instRefDbSeqCov,
                    "coverage_inst_entity": instSampleSeqCov,
                    "gapD": copy.copy(gapD),
                    "owabRegiond": copy.copy(owabRegD),
                }
                logger.debug("entry %s entity %s analD %r", entryId, entityId, analD)
        except Exception as e:
            logger.exception("%s failing with %s", entryId, str(e))
        #
        return analD

    def __getSegments(self, values):
        """Print the bounds and the value range of each contiguous region where abs(value) < 1.

        Args:
            values (iterable): numeric values
        """
        xV = np.asarray(values)
        condition = np.abs(xV) < 1
        for start, stop in self.__contiguousRegions(condition):
            segment = xV[start:stop]
            print(start, stop)
            print(segment.min(), segment.max())

    def __contiguousRegions(self, condition):
        """Find the contiguous True regions of the boolean array "condition".

        Args:
            condition (array-like): one-dimensional boolean sequence

        Returns:
            numpy.ndarray: two-column array where the first column is the start index of each
                           region and the second column is the (exclusive) end index.
                           An empty input gives an array of shape (0, 2).
        """
        condition = np.asarray(condition, dtype=bool)
        if condition.size == 0:
            # Nothing to index - avoids an IndexError on condition[0] below
            return np.empty((0, 2), dtype=int)
        # Find the indices of changes in "condition"
        (idx,) = np.diff(condition).nonzero()
        # We need to start things after the change in "condition". Therefore,
        # we'll shift the index by 1 to the right.
        idx += 1
        if condition[0]:
            # If the start of condition is True prepend a 0
            idx = np.r_[0, idx]
        if condition[-1]:
            # If the end of condition is True, append the length of the array
            idx = np.r_[idx, condition.size]
        # Reshape the result into two columns
        return idx.reshape(-1, 2)

    def __window(self, seq, num=2):
        """Return a sliding window (of width num) over data from the iterable.

        s -> (s0,s1,...s[n-1]), (s1,s2,...,sn), ...
        """
        it = iter(seq)
        result = tuple(islice(it, num))
        if len(result) == num:
            yield result
        for elem in it:
            result = result[1:] + (elem,)
            yield result

    def missingElements(self, lV):
        """Return the integers missing between successive elements of the input sequence.

        Args:
            lV (iterable): integers in ascending order

        Returns:
            list: the values strictly between each successive pair that differs by more than one
        """
        missing = chain.from_iterable(range(x + 1, y) for x, y in self.__window(lV) if (y - x) > 1)
        return list(missing)

    def __fetchUniprot(self, uniProtId):
        """Fetch the sequence for the input UniProt accession from the FASTA web service.

        Args:
            uniProtId (str): UniProt accession

        Returns:
            str: one-letter-code sequence, or an empty string on any failure
        """
        baseUrl = "http://www.uniprot.org"
        wsEndPoint = "/uniprot/"
        timeOutSeconds = 30
        fS = ""
        try:
            fullUrl = baseUrl + wsEndPoint + uniProtId + ".fasta"
            # An explicit timeout prevents an unresponsive server from stalling the extraction indefinitely
            result = requests.get(fullUrl, timeout=timeOutSeconds)
            if result.ok:
                # drop the FASTA header line and join the sequence lines
                fL = result.text.split("\n")
                fS = "".join(fL[1:])
            else:
                logger.error("UniProt Fasta request for %s returns status %r", uniProtId, result.status_code)
        except Exception as e:
            logger.error("Failing request for %s with %s", uniProtId, str(e))
        return fS
class ChEMBLTargetMechanismProvider(StashableBase):
    """Accessors for ChEMBL target mechanism data."""

    def __init__(self, cachePath, useCache):
        """Load any cached mechanism data from the provider directory under the cache path.

        Args:
            cachePath (str): top-level cache directory path
            useCache (bool): read the existing mechanism data file if it is present
        """
        self.__cachePath = cachePath
        self.__dirName = "ChEMBL-target-mechanism"
        super(ChEMBLTargetMechanismProvider, self).__init__(self.__cachePath, [self.__dirName])
        self.__dirPath = os.path.join(self.__cachePath, self.__dirName)
        self.__mU = MarshalUtil(workPath=self.__cachePath)
        baseVersion = 28
        self.__version = baseVersion
        logger.info("ChEMBL API MAX_LIMIT %r", Settings.Instance().MAX_LIMIT)  # pylint: disable=no-member
        # Mechanism records keyed by ChEMBL target identifier
        self.__aD = self.__reload(self.__dirPath, useCache)

    def testCache(self, minCount=0):
        """Return True if mechanism data is loaded for more than minCount targets (always True for minCount=0)."""
        if minCount == 0:
            return True
        if self.__aD and (len(self.__aD) > minCount):
            logger.info("Mechanism data for (%d) targets", len(self.__aD))
            return True
        return False

    def getAssignmentVersion(self):
        """Return the assignment version (the base version set at construction)."""
        return self.__version

    def getTargetMechanismDataPath(self):
        """Return the path of the JSON file holding the mechanism data."""
        return os.path.join(self.__dirPath, "chembl-target-mechanism-data.json")

    def __reload(self, dirPath, useCache):
        """Create the provider directory if needed and read the cached mechanism data.

        Args:
            dirPath (str): provider directory path
            useCache (bool): read the existing mechanism data file if it is present

        Returns:
            dict: {targetChEMBLId: [mechanism record, ...], ...} (empty if nothing is cached)
        """
        startTime = time.time()
        aD = {}
        fU = FileUtil()
        fU.mkdir(dirPath)
        targetMechanismFilePath = self.getTargetMechanismDataPath()
        #
        if useCache and fU.exists(targetMechanismFilePath):
            logger.info("useCache %r using %r", useCache, targetMechanismFilePath)
            qD = self.__mU.doImport(targetMechanismFilePath, fmt="json")
            aD = qD["mechanism"] if "mechanism" in qD else {}
        #
        logger.info("Completed reload of (%d) at %s (%.4f seconds)", len(aD), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime)
        #
        return aD

    def getTargetMechanisms(self, targetChEMBLId):
        """Return the list of mechanism records for the input target (empty list if none)."""
        try:
            return self.__aD[targetChEMBLId] if targetChEMBLId in self.__aD else []
        except Exception:
            return []

    def hasTargetMechanism(self, targetChEMBLId):
        """Return True if mechanism data is held for the input target."""
        try:
            return targetChEMBLId in self.__aD
        except Exception:
            return False

    def fetchTargetMechanismData(self, targetChEMBLIdList, skipExisting=True, chunkSize=50):
        """Get cofactor mechanism data for the input ChEMBL target list.

        The accumulated data is written to the mechanism data file after every chunk so that
        an interrupted fetch can be resumed with skipExisting=True.

        Args:
            targetChEMBLIdList (list): list of ChEMBL target identifiers
            skipExisting (bool, optional): reuse any existing cached data (default: True)
            chunkSize(int, optional): ChEMBL API batch size for fetches (default: 50)

        Returns:
            bool: True for success (including the case where nothing needs to be fetched) or False otherwise
        """
        atL = [
            "action_type",
            "molecule_chembl_id",
            "mechanism_of_action",
            "max_phase",
            "target_chembl_id",
        ]
        targetD = self.__aD if self.__aD else {}
        # Keep the in-memory cache bound to the dictionary being updated so that the accessors
        # reflect the fetched data even when the cache started out empty.
        self.__aD = targetD
        # Drop repeated identifiers preserving the input order
        uniqueIdL = list(dict.fromkeys(targetChEMBLIdList))
        if skipExisting:
            idList = [tId for tId in uniqueIdL if tId not in targetD]
        else:
            idList = uniqueIdL
        numToProcess = len(idList)
        logger.info("Fetching mechanism data for (%d/%d)", numToProcess, len(targetChEMBLIdList))
        # With nothing left to fetch the cached data is already complete
        ok = numToProcess == 0
        try:
            for ii in range(0, numToProcess, chunkSize):
                logger.info("Begin chunk at ii %d/%d", ii, numToProcess)
                chunkIdL = idList[ii : ii + chunkSize]
                mch = new_client.mechanism  # pylint: disable=no-member
                mch.set_format("json")
                mDL = mch.filter(target_chembl_id__in=chunkIdL).only(atL)
                logger.info("Results (%d)", len(mDL))
                if not skipExisting:
                    # Replace rather than append to the records of any target being refetched
                    for tId in chunkIdL:
                        targetD.pop(tId, None)
                if mDL:
                    for mD in mDL:
                        targetD.setdefault(mD["target_chembl_id"], []).append(self.__mechanismSelect(atL, mD))
                #
                logger.info("Completed chunk starting at (%d)", ii)
                tS = datetime.datetime.now().isoformat()
                vS = datetime.datetime.now().strftime("%Y-%m-%d")
                ok = self.__mU.doExport(self.getTargetMechanismDataPath(), {"version": vS, "created": tS, "mechanism": targetD}, fmt="json", indent=3)
                logger.info("Wrote completed chunk starting at (%d) (%r)", ii, ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok

    def __mechanismSelect(self, atL, aD):
        """Return the subset of the input record with the selected attributes (None for missing attributes)."""
        return {at: aD[at] if at in aD else None for at in atL}
class ValidationReportSchemaUtilsTests(unittest.TestCase):
    """Tests for building dictionary and mapping files from the validation report XSD schema."""

    def setUp(self):
        self.__dirPath = os.path.join(os.path.dirname(TOPDIR), "rcsb", "mock-data")
        self.__xsdPath = os.path.join(HERE, "test-data", "wwpdb_validation_v004.xsd")
        self.__dictPath = os.path.join(HERE, "test-output", "vrpt_mmcif_ext_v4.dic")
        self.__dictStaticPath = os.path.join(HERE, "test-data", "em_validation_ext_v4.dic")
        #
        # This schema mapping file is used by the XML report data file reader.
        self.__dictionaryMapPath = os.path.join(HERE, "test-output", "vrpt_dictmap_v4.json")
        self.__dictionaryMapCsvPath = os.path.join(HERE, "test-output", "vrpt_dictmap_v4.csv")
        self.__mU = MarshalUtil()

    def tearDown(self):
        pass

    def testProcessXsdSchema(self):
        """Read the XSD schema and export the schema object, the generated dictionary and the dictionary map."""
        vrsu = ValidationReportSchemaUtils()
        sObj = vrsu.readSchema(self.__xsdPath, verbose=False)
        logger.debug("Returns type %r", type(sObj))
        logger.debug("Schema category length %d", len(sObj))
        ok = self.__mU.doExport(os.path.join(HERE, "test-output", "schema-object.json"), sObj, fmt="json", indent=3)
        # check this status before it is reused for the exports below
        self.assertTrue(ok)
        # import static definitions -
        scL = self.__mU.doImport(self.__dictStaticPath, fmt="mmcif-dict")
        logger.info("Static definition count %d", len(scL))
        #
        cL = vrsu.buildDictionary(sObj)
        logger.info("Generated definition count %d", len(cL))
        # the exported dictionary holds the generated definitions followed by the static ones
        cL.extend(scL)
        ok = self.__mU.doExport(self.__dictPath, cL, fmt="mmcif-dict")
        self.assertTrue(ok)
        #
        dictionaryMap = vrsu.getDictionaryMap(sObj)
        ok = self.__mU.doExport(self.__dictionaryMapPath, dictionaryMap, fmt="json")
        self.assertTrue(ok)
        #
        self.assertTrue("attributes" in dictionaryMap)
        self.assertTrue(len(dictionaryMap["attributes"]) > 420)

    def testExportMapping(self):
        """Export schema correspondences as CSV."""
        vrsu = ValidationReportSchemaUtils()
        sObj = vrsu.readSchema(self.__xsdPath)
        dictionaryMap = vrsu.getDictionaryMap(sObj)
        logger.info("Attribute count %d", len(dictionaryMap["attributes"]))
        rL = []
        for ky, dD in dictionaryMap["attributes"].items():
            # keys are split on "|" into the XML element and attribute names
            kyL = ky.split("|")
            catN = kyL[0]
            atN = kyL[1]
            rL.append({"xml_el": catN, "xml_at": atN, "mmcif_cat": dD["cat"], "mmcif_at": dD["at"]})
        #
        # the test previously made no assertion - verify that rows were produced and exported
        self.assertTrue(rL)
        ok = self.__mU.doExport(self.__dictionaryMapCsvPath, rL, fmt="csv")
        self.assertTrue(ok)
def buildSearchFiles(self, **kwargs):
    """Build cif, sdf (optional), and mol2 files for components in the chemical component search index.
    Exclude ions or other extraneous molecules lacking bonds.

    Args:
        ccUrlTarget (str): locator for source chemical component dictionary (default: full public dictionary)
        birdUrlTarget (str): locator for source BIRD dictionary (default: full public dictionary)
        limitPerceptions (bool): restrict automatic perceptions in OE molecular build operations (default: False)
        numProc (int): number of processors (default: 2)
        useCache (bool): use existing resource file where possible (default: True)
        molLimit (str): limit the number of ingested chemical components (default: None)
        quietFlag (bool): suppress output in OE library operations (default: True)
        fpTypeList (list): fingerprint types passed to the OE search molecule provider (default: [])
        screenTypeList (list): screen types passed to the OE search molecule provider (default: [])
        minCount (int): minimum count used to test the chemical component provider cache (default: 0)
        useSdf (bool): write an SDF file for each search index molecule (default: True)
        useMol2 (bool): write a mol2 file for each search index molecule (default: False)

    Returns:
        (int): number of search index molecules processed
    """
    cachePath = self.__cachePath
    ccUrlTarget = kwargs.get("ccUrlTarget", None)
    birdUrlTarget = kwargs.get("birdUrlTarget", None)
    molLimit = kwargs.get("molLimit", None)
    quietFlag = kwargs.get("quietFlag", True)
    fpTypeList = kwargs.get("fpTypeList", [])
    screenTypeList = kwargs.get("screenTypeList", [])
    ccFileNamePrefix = "cc-%s" % self.__prefix if self.__prefix else "cc-full"
    oeFileNamePrefix = "oe-%s" % self.__prefix if self.__prefix else "oe-cc-full"
    numProc = kwargs.get("numProc", 2)
    minCount = kwargs.get("minCount", 0)
    useCache = kwargs.get("useCache", True)
    useSdf = kwargs.get("useSdf", True)
    useMol2 = kwargs.get("useMol2", False)
    limitPerceptions = kwargs.get("limitPerceptions", False)
    logSizes = False
    #
    startTime = time.time()
    ccmP = ChemCompMoleculeProvider(
        cachePath=cachePath, useCache=useCache, ccFileNamePrefix=ccFileNamePrefix, ccUrlTarget=ccUrlTarget, birdUrlTarget=birdUrlTarget, molLimit=molLimit
    )
    ok = ccmP.testCache(minCount=minCount, logSizes=logSizes)
    logger.info("Completed chemical component provider load %r (%.4f seconds)", ok, time.time() - startTime)
    #
    startTime = time.time()
    oesmp = OeSearchMoleculeProvider(
        ccUrlTarget=ccUrlTarget,
        birdUrlTarget=birdUrlTarget,
        cachePath=cachePath,
        ccFileNamePrefix=ccFileNamePrefix,
        oeFileNamePrefix=oeFileNamePrefix,
        useCache=useCache,
        quietFlag=quietFlag,
        fpTypeList=fpTypeList,
        screenTypeList=screenTypeList,
        numProc=numProc,
        molLimit=molLimit,
        limitPerceptions=limitPerceptions,
    )
    ok = oesmp.testCache()
    logger.info("Completed OE molecule provider load %r (%.4f seconds)", ok, time.time() - startTime)
    #
    startTime = time.time()
    ccSIdxP = ChemCompSearchIndexProvider(cachePath=cachePath, useCache=useCache, ccFileNamePrefix=ccFileNamePrefix, limitPerceptions=limitPerceptions, numProc=numProc)
    ok = ccSIdxP.testCache()
    logger.info("Completed chemical component search index load %r (%.4f seconds)", ok, time.time() - startTime)
    #
    ccSIdx = ccSIdxP.getIndex() if ccSIdxP and ok else {}
    logger.info("Search index status %r index length %d", ok, len(ccSIdx))
    #
    # ccId -> True if the standard CIF definition is available (cached or exported) and
    # False if the definition failed its check. Evaluated once per chemical component.
    ccIdD = {}
    mU = MarshalUtil()
    oeU = OeIoUtils(dirPath=cachePath)
    numMols = 0
    searchFileDirPath = self.getSearchDirFilePath()
    pathTupList = []
    for sId in ccSIdx:
        # the component identifier is the leading "|" delimited field of the search identifier
        ccId = sId.split("|")[0]
        ccDirPath = os.path.join(searchFileDirPath, ccId[0], ccId)
        # standard CIF definition
        if ccId not in ccIdD:
            cifPath = os.path.join(ccDirPath, ccId + ".cif")
            if useCache and mU.exists(cifPath):
                ccIdD[ccId] = True
            else:
                ccMol = ccmP.getMol(ccId)
                ccIdD[ccId] = bool(self.__checkCif(ccMol))
                if ccIdD[ccId]:
                    mU.doExport(cifPath, [ccMol], fmt="mmcif")
        if not ccIdD[ccId]:
            continue
        #
        # Sanity checks on the generated OE molecule
        oeMol = oesmp.getMol(sId)
        if not self.__checkOeMol(oeMol):
            continue
        #
        # CIF definition for search identifiers differing from the component identifier
        cifPath = os.path.join(ccDirPath, sId + ".cif")
        if sId != ccId and not (useCache and mU.exists(cifPath)):
            oeccU = OeChemCompUtils()
            ok = oeccU.addOeMol(sId, oeMol, missingModelXyz=True, writeIdealXyz=False)
            if ok:
                oeccU.write(cifPath)

        if useSdf:
            molFilePath = os.path.join(ccDirPath, sId + ".sdf")
            if not (useCache and mU.exists(molFilePath)):
                ok = oeU.write(molFilePath, oeMol, constantMol=False, addSdTags=True)
                if ok:
                    pathTupList.append((sId, molFilePath, "sdf"))
        #
        if useMol2:
            mol2FilePath = os.path.join(ccDirPath, sId + ".mol2")
            if not (useCache and mU.exists(mol2FilePath)):
                # capture the status of this write (a stale status from a prior step was tested before)
                ok = oeU.write(mol2FilePath, oeMol, constantMol=False, addSdTags=True)
                if ok:
                    pathTupList.append((sId, mol2FilePath, "mol2"))
        numMols += 1
    #
    self.__storePathList(pathTupList)
    return numMols
class ChemAxonDescriptorProvider(StashableBase):
    """Utilities to deliver ChemAxon rendered chemical descriptors for chemical component definitions."""

    def __init__(self, **kwargs):
        """Configure the cache locations and load the descriptor index.

        Keyword Args:
            cachePath (str, optional): top-level cache directory. Defaults to the current directory.
            molLimit (int, optional): molecule limit passed to the component index provider. Defaults to 0.
            ccUrlTarget (str, optional): chemical component definition source passed to the index provider.
            birdUrlTarget (str, optional): BIRD definition source passed to the index provider.
            useCache (bool, optional): reuse an existing descriptor index file. Defaults to True.
            chunkSize (int, optional): number of SMILES per web service request. Defaults to 100.
            ccFileNamePrefix (str, optional): file name prefix of the index file. Defaults to "cc-full".
        """
        dirName = "chemaxon"
        # Fall back to the current directory (as the other providers do) so that
        # the path attributes are always defined even when cachePath is omitted.
        self.__cachePath = os.path.abspath(kwargs.get("cachePath") or ".")
        self.__dirPath = os.path.join(self.__cachePath, dirName)
        super(ChemAxonDescriptorProvider, self).__init__(self.__cachePath, [dirName])
        #
        self.__molLimit = kwargs.get("molLimit", 0)
        self.__ccUrlTarget = kwargs.get("ccUrlTarget", None)
        self.__birdUrlTarget = kwargs.get("birdUrlTarget", None)
        useCache = kwargs.get("useCache", True)
        self.__chunkSize = kwargs.get("chunkSize", 100)
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__ccFileNamePrefix = kwargs.get("ccFileNamePrefix", "cc-full")
        self.__version = None
        self.__descrD = self.__reload(useCache)

    def testCache(self, minCount=None):
        """Report whether the descriptor index is loaded.

        Args:
            minCount (int, optional): minimum number of indexed components required. When
                omitted (or zero) any non-None index is accepted.

        Returns:
            bool: True if the index satisfies the requirement or False otherwise
        """
        if minCount:
            # bool() keeps an empty index from leaking out as the return value
            ok = bool(self.__descrD) and len(self.__descrD) >= minCount
        else:
            ok = self.__descrD is not None
        logger.info("Loaded ChemAxon descriptors for (%d) components (success %r)", len(self.__descrD) if self.__descrD else 0, ok)
        return ok

    def getDescriptorIndex(self):
        """Return the descriptor index dictionary {<ccId>: [<SMILES>, ...], ...}."""
        return self.__descrD

    def getIndexFilePath(self):
        """Return the path of the descriptor index file in the cache directory."""
        return os.path.join(self.__dirPath, "%s-chemaxon-descriptors.json" % self.__ccFileNamePrefix)

    def getVersion(self):
        """Return the version string read from (or assigned to) the descriptor index."""
        return self.__version

    def __reload(self, useCache):
        """Reload or create the ChemAxon descriptor mapping index.

        When the cached index is not used or does not exist, a fallback copy is
        fetched into the cache directory before the index file is read.

        Args:
            useCache (bool): reuse an existing index file when one is present

        Returns:
            (dict): descriptor index, empty if no index file could be read
        """
        descrD = {}
        descrFilePath = self.getIndexFilePath()
        #
        if not (useCache and self.__mU.exists(descrFilePath)):
            url = "https://raw.githubusercontent.com/rcsb/py-rcsb_exdb_assets/master/fall_back/CHEMAXON/cc-full-chemaxon-descriptors.json"
            # NOTE(review): the fetched file keeps the "cc-full" name from the URL, so it is
            # only picked up below when ccFileNamePrefix is "cc-full" - confirm this is intended.
            _ = self.__fetchUrl(url, self.__dirPath)
        #
        _, fExt = os.path.splitext(descrFilePath)
        descrFormat = "json" if fExt == ".json" else "pickle"
        if self.__mU.exists(descrFilePath):
            dD = self.__mU.doImport(descrFilePath, fmt=descrFormat)
            descrD = dD["smiles"]
            self.__version = dD["version"]
        #
        return descrD

    def __fetchUrl(self, urlTarget, dirPath, useCache=False):
        """Fetch the target URL into the input directory.

        Args:
            urlTarget (str): URL of the resource
            dirPath (str): directory receiving the file
            useCache (bool, optional): skip the fetch if the file already exists. Defaults to False.

        Returns:
            str: local file path of the resource (returned whether or not the fetch succeeded)
        """
        fU = FileUtil()
        fn = fU.getFileName(urlTarget)
        filePath = os.path.join(dirPath, fn)
        if not (useCache and fU.exists(filePath)):
            startTime = time.time()
            ok2 = fU.get(urlTarget, filePath)
            endTime = time.time()
            if ok2:
                logger.info("Fetched %s for resource file %s (status = %r) (%.4f seconds)", urlTarget, filePath, ok2, endTime - startTime)
            else:
                logger.error("Failing fetch for %s for resource file %s (status = %r) (%.4f seconds)", urlTarget, filePath, ok2, endTime - startTime)
        #
        return filePath

    def buildDescriptors(self):
        """Build and store descriptors for all components in the chemical component index.

        Returns:
            bool: True if the index was built and exported or False otherwise
        """
        descrFilePath = self.getIndexFilePath()
        ccidxP = ChemCompIndexProvider(
            ccUrlTarget=self.__ccUrlTarget,
            birdUrlTarget=self.__birdUrlTarget,
            cachePath=self.__cachePath,
            useCache=True,
            molLimit=self.__molLimit,
            ccFileNamePrefix=self.__ccFileNamePrefix,
        )
        ok = ccidxP.testCache()
        if ok:
            ccIdList = ccidxP.getIdList()
            self.__descrD = self.__fetchDescriptors(ccIdList, ccidxP, chunkSize=self.__chunkSize)
            tS = datetime.datetime.now().isoformat()
            vS = datetime.datetime.now().strftime("%Y-%m-%d")
            self.__version = vS
            dD = {"created": tS, "version": vS, "smiles": self.__descrD}
            ok = self.__mU.doExport(descrFilePath, dD, fmt="json", indent=3)
            logger.info("Stored %s descriptors for %d components (status=%r) ", descrFilePath, len(self.__descrD), ok)
        # Report the outcome in the same way as updateDescriptors()
        return ok

    def updateDescriptors(self, useCache=True):
        """Fetch descriptors for components missing from the current index and store the result.

        Args:
            useCache (bool, optional): use the cached chemical component index. Defaults to True.

        Returns:
            bool: status of the component index check, or of the export when an update was written
        """
        ccidxP = ChemCompIndexProvider(
            ccUrlTarget=self.__ccUrlTarget,
            birdUrlTarget=self.__birdUrlTarget,
            cachePath=self.__cachePath,
            useCache=useCache,
            molLimit=None,
            ccFileNamePrefix=self.__ccFileNamePrefix,
        )
        ok = ccidxP.testCache()
        if ok:
            ccIdList = ccidxP.getIdList()
            updIdList = list(set(ccIdList) - set(self.__descrD))
            if updIdList:
                logger.info("Updating Chemaxon descriptors for (%d) components", len(updIdList))
                uD = self.__fetchDescriptors(updIdList, ccidxP, chunkSize=self.__chunkSize)
                self.__descrD.update(uD)
                descrFilePath = self.getIndexFilePath()
                tS = datetime.datetime.now().isoformat()
                vS = datetime.datetime.now().strftime("%Y-%m-%d")
                self.__version = vS
                dD = {"created": tS, "version": vS, "smiles": self.__descrD}
                ok = self.__mU.doExport(descrFilePath, dD, fmt="json", indent=3)
        #
        return ok

    def __fetchDescriptors(self, ccIdList, ccidxP, chunkSize=100):
        """Fetch transformed SMILES descriptors from the ChemAxon webservice.

        Only transformed SMILES that differ from the submitted SMILES and from the
        SMILES already known for a component are recorded.

        Args:
            ccIdList (list, str): chemical component identifier list
            ccidxP (object): instance of the ChemCompIndexProvider()
            chunksize (int, optional): number of SMILES per request. Defaults to 100.

        Returns:
            (dict): dictionary {<ccId>: [<transformed SMILES>, ...], ...}

        Example API parameter data:
            {
              "errorHandlingMode": "FAIL_ON_ERROR",
              "inputParams": "smiles",
              "outputParams": "smiles",
              "structures": [
                "CC(C)[C@H](N)C=O",
                "CC[C@H](C)[C@H](N)C=O",
                "CC(C)C[C@H](N)C=O"
              ]
            }

        Example query:
            curl -X POST "https://jchem-microservices.chemaxon.com/jwsio/rest-v1/molconvert/batch" -H "accept: */*"
                -H "Content-Type: application/json"
                -d "{ \"errorHandlingMode\": \"FAIL_ON_ERROR\", \"inputParams\": \"smiles\", \"outputParams\": \"mrv\",
                \"structures\": [ \"CC(C)[C@H](N)C=O\", \"CC[C@H](C)[C@H](N)C=O\", \"CC(C)C[C@H](N)C=O\" ]}"
        """
        descrD = {}
        smilesCcIdD = {}
        smilesD = {}
        for ccId in ccIdList:
            smiL = list(set(ccidxP.getSMILES(ccId, smiTypeList=["oe-iso-smiles", "oe-smiles", "cactvs-iso-smiles", "cactvs-smiles"])))
            smilesCcIdD.setdefault(ccId, []).extend(smiL)
            for smi in smiL:
                smilesD.setdefault(smi, []).append(ccId)
        #
        logger.info("Translating (%d) SMILES for components (%d)", len(smilesD), len(smilesCcIdD))
        # ---- materialize the key list once rather than once per chunk
        smiKeyL = list(smilesD)
        smiLL = [smiKeyL[i:i + chunkSize] for i in range(0, len(smiKeyL), chunkSize)]
        # ---
        baseUrl = "https://jchem-microservices.chemaxon.com"
        endPoint = "jwsio/rest-v1/molconvert/batch"
        hD = {"Accept": "application/json", "Content-Type": "application/json"}
        try:
            pD = {"errorHandlingMode": "SKIP_ERROR", "inputParams": "smiles", "outputParams": "smiles"}
            #
            iCount = 0
            for smiL in smiLL:
                iCount += 1
                ureq = UrlRequestUtil()
                pD["structures"] = smiL
                logger.debug("pD %r", pD)
                rDL, retCode = ureq.postUnWrapped(baseUrl, endPoint, pD, headers=hD, sendContentType="application/json", returnContentType="application/json")
                logger.debug("API result (%r) %r", retCode, rDL)
                if rDL and len(rDL) == len(smiL):
                    for ii, rD in enumerate(rDL):
                        if "structure" in rD and "successful" in rD and rD["successful"]:
                            # Skip results that are unchanged by the transformation
                            if smiL[ii] == rD["structure"]:
                                continue
                            for ccId in smilesD[smiL[ii]]:
                                if ccId in descrD and rD["structure"] in descrD[ccId]:
                                    continue
                                if rD["structure"] in smilesCcIdD[ccId]:
                                    continue
                                descrD.setdefault(ccId, []).append(rD["structure"])
                else:
                    # The response may be None here; len(None) would raise and abort
                    # the processing of every remaining chunk.
                    logger.info("Chunk %d failed (%d)", iCount, len(rDL) if rDL else 0)
                if iCount % 10 == 0:
                    logger.info("Completed processing chunk (%d/%d)", iCount, len(smiLL))
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return descrD
class ChEMBLTargetProviderTests(unittest.TestCase):
    """Exercise target, activity and mechanism retrieval with ChEMBLTargetProvider."""

    # Gate for the long-running taxonomy export test
    skipFull = True

    def setUp(self):
        """Define the cache, output and data paths shared by the tests."""
        outputPath = os.path.join(HERE, "test-output")
        self.__cachePath = os.path.join(outputPath, "CACHE")
        self.__fastaPath = os.path.join(outputPath, "chembl-targets.fa")
        self.__taxonPath = os.path.join(outputPath, "chembl-targets-taxon.tdd")
        self.__dataPath = os.path.join(HERE, "test-data")
        self.__mU = MarshalUtil(workPath=self.__cachePath)

    def tearDown(self):
        """No cleanup is performed."""

    def testFetchChEMBLTargets(self):
        """Rebuild the target cache and export a FASTA file without taxonomy."""
        try:
            provider = ChEMBLTargetProvider(cachePath=self.__cachePath, useCache=False)
            self.assertTrue(provider.testCache())
            status = provider.exportFasta(self.__fastaPath, self.__taxonPath, addTaxonomy=False)
            self.assertTrue(status)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testFetchActivityData(self):
        """Fetch activity data for two targets and export it as JSON."""
        try:
            logger.info("MAX_LIMIT %r", Settings.Instance().MAX_LIMIT)  # pylint: disable=no-member
            provider = ChEMBLTargetProvider(cachePath=self.__cachePath, useCache=True)
            self.assertTrue(provider.testCache())
            # P43088|CHEMBL1987|9606
            # P08243|uniprotId|CHEMBL3120|chemblId|9606|taxId
            targetIdL = ["CHEMBL1987", "CHEMBL3120"]
            activityD = provider.getActivityData(targetIdL)
            exportPath = os.path.join(self.__cachePath, "ChEMBL-targets", "chembl-target-activity.json")
            status = self.__mU.doExport(exportPath, activityD, fmt="json", indent=3)
            self.assertTrue(status)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testFetchMechanismData(self):
        """Fetch mechanism data for two targets and export it as JSON."""
        mechanismD = {}
        try:
            logger.info("MAX_LIMIT %r", Settings.Instance().MAX_LIMIT)  # pylint: disable=no-member
            provider = ChEMBLTargetProvider(cachePath=self.__cachePath, useCache=True)
            self.assertTrue(provider.testCache())
            # P43088|CHEMBL1987|9606
            # P08243|uniprotId|CHEMBL3120|chemblId|9606|taxId
            targetIdL = ["CHEMBL1987", "CHEMBL3120"]
            mechanismD.update(provider.getMechanismData(targetIdL))
            #
            exportPath = os.path.join(self.__cachePath, "ChEMBL-targets", "chembl-target-mechanism.json")
            status = self.__mU.doExport(exportPath, mechanismD, fmt="json", indent=3)
            self.assertTrue(status)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    @unittest.skipIf(skipFull, "Very long test")
    def testFetchChEMBLTargetsWithTax(self):
        """Export a FASTA file with taxonomy from the cached targets."""
        try:
            provider = ChEMBLTargetProvider(cachePath=self.__cachePath, useCache=True)
            self.assertTrue(provider.testCache())
            status = provider.exportFasta(self.__fastaPath, self.__taxonPath, addTaxonomy=True)
            self.assertTrue(status)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
def __writeModel(self, targetId, targetPath, fitFD, fitXyzMapD, fitAtomUnMappedL, matchObj, modelId, modelPath):
    """Write the chemical component model for the input chemical component Id and associated
    atom mapping and feature details --

        ComponentAtomDetails = namedtuple("ComponentAtomDetails", "index atNo name aType x y z fCharge")
        AlignAtomMap = namedtuple("AlignAtomMap", "refId refAtIdx refAtNo refAtName fitId fitAtIdx fitAtNo fitAtName")
        AlignAtomUnMapped = namedtuple("AlignAtomUnMapped", "fitId fitAtIdx fitAtNo fitAtType fitAtName fitAtFormalCharge x y z fitNeighbors")

    Args:
        targetId (str): identifier of the target component (the text before any "|" is used as comp_id)
        targetPath (str): path of the mmCIF definition for the target component
        fitFD (dict): fit feature dictionary with keys xyz, SMILES, SMILES_STEREO, InChI and InChIKey
        fitXyzMapD (dict): mapped fit atom details keyed by reference atom name
        fitAtomUnMappedL (list): AlignAtomUnMapped tuples for the unmapped fit atoms
        matchObj (obj): match details (identifier, R factor, temperature, DOI, ...)
        modelId (str): identifier of the model to write
        modelPath (str): output mmCIF file path

    Returns:
        (bool, str): export status and build variant type ("" on an unexpected failure)
    """
    try:
        unMappedTypeD = defaultdict(int)
        hAtomPrefix = "HEX"
        variantType = self.__getBuildVariant(targetId)
        #
        if not self.__testUnMappedProtonation(fitAtomUnMappedL):
            logger.info("Unmapped non-hydrogen atoms target %r model %r unMapped count (%d)", targetId, modelId, len(fitAtomUnMappedL))
            return False, variantType
        # Get atom partners for the unmapped atoms
        fitAtMapD = {}
        for refAtName, fAtTup in fitXyzMapD.items():
            fitAtMapD[fAtTup.atName] = refAtName
        if fitAtomUnMappedL:
            # Check if neighbors are all mapped
            ok = True
            for fitUnTup in fitAtomUnMappedL:
                for nAtName in fitUnTup.fitNeighbors:
                    if nAtName not in fitAtMapD:
                        ok = False
                        logger.info("Missing mapped neighbor for %r target %r model %r", nAtName, targetId, modelId)
                        break
            if not ok:
                return False, variantType
            logger.debug("%s match has unmapped protonation", modelId)
            variantType = "tautomer_protomer"
        #
        kList = ["xyz", "SMILES", "SMILES_STEREO", "InChI", "InChIKey"]
        for k in kList:
            if k not in fitFD:
                logger.error("Fit feature dictionary for %s missing key %s", targetId, k)
                return False, variantType
        # ------------
        dataContainer = DataContainer(modelId)
        #
        mU = MarshalUtil(workPath=self.__cachePath)
        myContainerList = mU.doImport(targetPath, fmt="mmcif")
        myContainer = myContainerList[0]
        dbName = myContainer.getName()
        if dbName.upper() != targetId.upper():
            logger.info("mismatch datablock (%r) and targetId (%r)", dbName, targetId)
        #
        catName = "pdbx_chem_comp_model"
        if not dataContainer.exists(catName):
            dataContainer.append(DataCategory(catName, attributeNameList=["id", "comp_id"]))
        #
        parentId = targetId.split("|")[0]
        wObj = dataContainer.getObj(catName)
        wObj.setValue(modelId, "id", 0)
        wObj.setValue(parentId, "comp_id", 0)
        #
        # --------  ---------
        catName = "pdbx_chem_comp_model_atom"
        if not dataContainer.exists(catName):
            dataContainer.append(
                DataCategory(catName, attributeNameList=["model_id", "atom_id", "type_symbol", "charge", "model_Cartn_x", "model_Cartn_y", "model_Cartn_z", "ordinal_id"])
            )
        wObj = dataContainer.getObj(catName)
        #
        # A model cannot be written without the reference atom list; fail explicitly
        # rather than reading atom attributes from an unrelated category.
        if not myContainer.exists("chem_comp_atom"):
            logger.error("Missing chem_comp_atom category for target %r model %r", targetId, modelId)
            return False, variantType
        cObj = myContainer.getObj("chem_comp_atom")
        #
        #  Only write the mapped atoms in case we are missing hydrogens in the mapping
        #
        jj = 0
        for ii in range(cObj.getRowCount()):
            atName = cObj.getValue("atom_id", ii)
            atType = cObj.getValue("type_symbol", ii)
            if atName not in fitXyzMapD:
                unMappedTypeD[atType] += 1
                continue
            fitXyz = fitXyzMapD[atName]
            #
            wObj.setValue(modelId, "model_id", jj)
            wObj.setValue(atName, "atom_id", jj)
            wObj.setValue(atType, "type_symbol", jj)
            #
            wObj.setValue(fitXyz.atFormalCharge, "charge", jj)
            wObj.setValue("%.4f" % fitXyz.x, "model_Cartn_x", jj)
            wObj.setValue("%.4f" % fitXyz.y, "model_Cartn_y", jj)
            wObj.setValue("%.4f" % fitXyz.z, "model_Cartn_z", jj)
            wObj.setValue(jj + 1, "ordinal_id", jj)
            jj += 1
        #
        # Add the unmapped atoms ...
        # AlignAtomUnMapped = namedtuple("AlignAtomUnMapped", "fitId fitAtIdx fitAtNo fitAtType fitAtName fitNeighbors")
        ii = wObj.getRowCount()
        for jj, uTup in enumerate(fitAtomUnMappedL):
            refAtomName = hAtomPrefix + str(jj)
            wObj.setValue(modelId, "model_id", ii)
            wObj.setValue(refAtomName, "atom_id", ii)
            wObj.setValue(uTup.fitAtType, "type_symbol", ii)
            wObj.setValue(uTup.fitAtFormalCharge, "charge", ii)
            wObj.setValue("%.4f" % uTup.x, "model_Cartn_x", ii)
            wObj.setValue("%.4f" % uTup.y, "model_Cartn_y", ii)
            wObj.setValue("%.4f" % uTup.z, "model_Cartn_z", ii)
            wObj.setValue(ii + 1, "ordinal_id", ii)
            # Advance to the next row so each unmapped atom gets its own record
            ii += 1
        # --------  ---------
        catName = "pdbx_chem_comp_model_bond"
        if not dataContainer.exists(catName):
            dataContainer.append(DataCategory(catName, attributeNameList=["model_id", "atom_id_1", "atom_id_2", "value_order", "ordinal_id"]))
        wObj = dataContainer.getObj(catName)
        #
        # The bond category may be absent (nothing to copy); never fall back to
        # the atom category left over from the section above.
        if myContainer.exists("chem_comp_bond"):
            cObj = myContainer.getObj("chem_comp_bond")
            #
            jj = 0
            for ii in range(cObj.getRowCount()):
                at1 = cObj.getValue("atom_id_1", ii)
                if at1 not in fitXyzMapD:
                    continue
                at2 = cObj.getValue("atom_id_2", ii)
                if at2 not in fitXyzMapD:
                    continue
                bType = cObj.getValue("value_order", ii)
                #
                wObj.setValue(modelId, "model_id", jj)
                wObj.setValue(at1, "atom_id_1", jj)
                wObj.setValue(at2, "atom_id_2", jj)
                wObj.setValue(bType, "value_order", jj)
                wObj.setValue(jj + 1, "ordinal_id", jj)
                jj += 1
        #
        ii = wObj.getRowCount()
        for jj, uTup in enumerate(fitAtomUnMappedL):
            at1 = hAtomPrefix + str(jj)
            for nAt in uTup.fitNeighbors:
                at2 = fitAtMapD[nAt]
                wObj.setValue(modelId, "model_id", ii)
                wObj.setValue(at1, "atom_id_1", ii)
                wObj.setValue(at2, "atom_id_2", ii)
                wObj.setValue("SING", "value_order", ii)
                wObj.setValue(ii + 1, "ordinal_id", ii)
                # Advance to the next row so each added bond gets its own record
                ii += 1
        # --------  ---------
        catName = "pdbx_chem_comp_model_descriptor"
        if not dataContainer.exists(catName):
            dataContainer.append(DataCategory(catName, attributeNameList=["model_id", "type", "descriptor"]))
        wObj = dataContainer.getObj(catName)
        #
        ii = 0
        wObj.setValue(modelId, "model_id", ii)
        wObj.setValue("SMILES", "type", ii)
        wObj.setValue(fitFD["SMILES"], "descriptor", ii)
        ii += 1
        wObj.setValue(modelId, "model_id", ii)
        wObj.setValue("SMILES_CANONICAL", "type", ii)
        wObj.setValue(fitFD["SMILES_STEREO"], "descriptor", ii)
        ii += 1
        wObj.setValue(modelId, "model_id", ii)
        wObj.setValue("InChI", "type", ii)
        wObj.setValue(fitFD["InChI"], "descriptor", ii)
        ii += 1
        wObj.setValue(modelId, "model_id", ii)
        wObj.setValue("InChIKey", "type", ii)
        wObj.setValue(fitFD["InChIKey"], "descriptor", ii)
        #
        # --------  ---------
        if matchObj.getIdentifier() is not None:
            catName = "pdbx_chem_comp_model_reference"
            if not dataContainer.exists(catName):
                dataContainer.append(DataCategory(catName, attributeNameList=["model_id", "db_name", "db_code"]))
            wObj = dataContainer.getObj(catName)
            ii = 0
            wObj.setValue(modelId, "model_id", ii)
            wObj.setValue("CSD", "db_name", ii)
            wObj.setValue(matchObj.getIdentifier(), "db_code", ii)
        #
        featureD = {}
        v = matchObj.getRFactor()
        vS = str(v)
        if v is not None and len(vS) > 0:
            featureD["r_factor"] = "%.3f" % float(v)
        #
        v = matchObj.getTemperature()
        vS = str(v)
        # remove string artifacts from temperature string ...
        if v is not None and len(vS) > 0:
            tV = vS.upper()
            try:
                if tV.endswith("DEG.C"):
                    tV = tV.replace("AT", "")
                    tV = tV.replace("DEG.C", "")
                    tV = float(tV.strip())
                    # Celsius values are stored in Kelvin
                    tV = tV + 273.15
                else:
                    tV = tV.replace("AT", "")
                    tV = tV.replace("K", "")
                    tV = float(tV.strip())
                featureD["experiment_temperature"] = tV
            except Exception as e:
                logger.exception("Temperature conversion fails for %s (%r) with %s", modelId, vS, str(e))
        #
        v = matchObj.getCitationDOI()
        vS = str(v)
        if v is not None and len(vS) > 0:
            featureD["publication_doi"] = v
        #
        v = matchObj.getCsdVersion()
        vS = str(v)
        if v is not None and len(vS) > 0:
            featureD["csd_version"] = v
        #
        if matchObj.getRadiationSource() in ["Neutron"]:
            featureD["neutron_radiation_experiment"] = True
        if matchObj.getHasDisorder() in ["Y"]:
            featureD["has_disorder"] = True
        #
        # Only hydrogens missing from the mapping -> heavy atom model
        if len(unMappedTypeD) == 1 and "H" in unMappedTypeD:
            logger.info("model %r heavy_atoms_only", modelId)
            featureD["heavy_atoms_only"] = True
        else:
            featureD["all_atoms_have_sites"] = True
        # --------  ---------
        catName = "pdbx_chem_comp_model_feature"
        if not dataContainer.exists(catName):
            dataContainer.append(DataCategory(catName, attributeNameList=["model_id", "feature_name", "feature_value"]))
        wObj = dataContainer.getObj(catName)
        #
        fKeyList = ["experiment_temperature", "publication_doi", "r_factor", "csd_version"]
        ii = 0
        for fKey in fKeyList:
            if fKey in featureD:
                wObj.setValue(modelId, "model_id", ii)
                wObj.setValue(fKey, "feature_name", ii)
                wObj.setValue(str(featureD[fKey]), "feature_value", ii)
                ii += 1
        #
        boolKeyList = ["has_disorder", "neutron_radiation_experiment", "heavy_atoms_only", "all_atoms_have_sites"]
        for fKey in boolKeyList:
            if fKey in featureD:
                if featureD[fKey]:
                    wObj.setValue(modelId, "model_id", ii)
                    wObj.setValue(fKey, "feature_name", ii)
                    wObj.setValue("Y", "feature_value", ii)
                    ii += 1
        #
        if variantType:
            wObj.setValue(modelId, "model_id", ii)
            wObj.setValue(variantType + "_match", "feature_name", ii)
            wObj.setValue("Y", "feature_value", ii)
            ii += 1
        # --------  ---------
        catName = "pdbx_chem_comp_model_audit"
        if not dataContainer.exists(catName):
            dataContainer.append(DataCategory(catName, attributeNameList=["model_id", "action_type", "date"]))
        wObj = dataContainer.getObj(catName)
        #
        ii = 0
        wObj.setValue(modelId, "model_id", ii)
        wObj.setValue("Initial release", "action_type", ii)
        wObj.setValue(self.__getToday(), "date", ii)
        #
        ok = mU.doExport(modelPath, [dataContainer], fmt="mmcif")
        return ok, variantType
    except Exception as e:
        logger.exception("Failing for %r %r with %s", targetId, targetPath, str(e))
    return False, ""
class PfamProvider(StashableBase):
    """Manage an index of Pfam identifier to description mappings."""

    def __init__(self, **kwargs):
        """Load (or rebuild) the Pfam description index and the PDB to Pfam mapping index.

        Keyword Args:
            urlTargetPfam (str, optional): source of the Pfam clan/description data
            urlTargetMapPfam (str, optional): source of the PDB to Pfam region mapping data
            cachePath (str, optional): top-level cache directory. Defaults to ".".
            useCache (bool, optional): reuse cached index files. Defaults to True.
        """
        cachePath = kwargs.get("cachePath", ".")
        useCache = kwargs.get("useCache", True)
        dirName = "pfam"
        dirPath = os.path.join(cachePath, dirName)
        self.__version = "34.0"
        super(PfamProvider, self).__init__(cachePath, [dirName])
        #
        self.__mU = MarshalUtil(workPath=dirPath)
        #
        descrUrl = kwargs.get("urlTargetPfam", "ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.clans.tsv.gz")
        descrUrlFB = "https://github.com/rcsb/py-rcsb_exdb_assets/raw/master/fall_back/Pfam/Pfam-A.clans.tsv.gz"
        self.__pfamD = self.__rebuildCache(descrUrl, descrUrlFB, dirPath, useCache)
        #
        mapUrl = kwargs.get("urlTargetMapPfam", "ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/database_files/pdb_pfamA_reg.txt.gz")
        mapUrlFB = "https://github.com/rcsb/py-rcsb_exdb_assets/raw/master/fall_back/Pfam/pdb_pfamA_reg.txt.gz"
        self.__pfamMapD = self.__rebuildMappingCache(mapUrl, mapUrlFB, dirPath, useCache)

    def getVersion(self):
        """Return the Pfam release version string recorded by this provider."""
        return self.__version

    def getDescription(self, pfamId):
        """Return the description for the input Pfam identifier

        Args:
            pfamId (str): Pfam identifier

        Returns:
            str: text description of the Pfam domain (None if the lookup fails)
        """
        try:
            return self.__pfamD[pfamId]
        except Exception:
            return None

    def getMapping(self, pdbId):
        """Return the list of Pfam domain assignments for the input PDB identifer along
        with residue level mapping information

        Args:
            pdbId (str): PDB identifier

        Returns:
            list: [{'pfamId': , 'authAsymId": , 'authSeqBeg': , 'authSeqEnd': 'insertBeg': , 'insertEnd': }, {}, ]
                  (empty if the lookup fails)
        """
        try:
            return self.__pfamMapD[pdbId.upper()]
        except Exception:
            return []

    def testCache(self):
        """Return True if both indices exceed their minimum expected lengths."""
        numPfam = len(self.__pfamD)
        logger.info("Length PfamD %d", numPfam)
        return numPfam > 19000 and len(self.__pfamMapD) > 150000

    def __fetchSource(self, urlTarget, urlTargetFB, dirPath):
        """Fetch the primary source, or the fallback source if that fails, and return the local path."""
        fU = FileUtil()
        logger.info("Fetch data from source %s in %s", urlTarget, dirPath)
        localPath = os.path.join(dirPath, fU.getFileName(urlTarget))
        if not fU.get(urlTarget, localPath):
            localPath = os.path.join(dirPath, fU.getFileName(urlTargetFB))
            status = fU.get(urlTargetFB, localPath)
            logger.info("Fetch data fallback fetch status is %r", status)
        return localPath

    def __rebuildCache(self, urlTargetPfam, urlTargetPfamFB, dirPath, useCache):
        """Return the Pfam description index from the cache file, or rebuild it when useCache is False.

        Nothing is fetched when useCache is True and the cache file is absent;
        an empty dictionary is returned in that case.
        """
        cacheFilePath = os.path.join(dirPath, "pfam-data.json")
        #
        logger.debug("Using cache data path %s", dirPath)
        self.__mU.mkdir(dirPath)
        #
        if useCache and self.__mU.exists(cacheFilePath):
            indexD = self.__mU.doImport(cacheFilePath, fmt="json")
            logger.debug("Pfam data length %d", len(indexD))
            return indexD
        if useCache:
            return {}
        # ------
        sourcePath = self.__fetchSource(urlTargetPfam, urlTargetPfamFB, dirPath)
        indexD = self.__getPfamIndex(sourcePath)
        status = self.__mU.doExport(cacheFilePath, indexD, fmt="json")
        logger.info("Caching %d in %s status %r", len(indexD), cacheFilePath, status)
        # ------
        return indexD

    def __getPfamIndex(self, filePath):
        """Parse annotation classifications into {<pfamId>: "<description> (<idCode>)"}.

        Rows that cannot be parsed are skipped.
        """
        indexD = {}
        optD = {"encoding": "ascii"} if sys.version_info[0] < 3 else {}
        for fieldL in self.__mU.doImport(filePath, fmt="tdd", rowFormat="list", **optD):
            try:
                accession = fieldL[0].strip().upper()
                shortName = fieldL[3].strip()
                text = fieldL[4].strip()
                indexD[accession] = "%s (%s)" % (text, shortName)
            except Exception:
                pass
        #
        return indexD

    def __rebuildMappingCache(self, urlTargetPfam, urlTargetPfamFB, dirPath, useCache):
        """Return the PDB to Pfam mapping index from the cache file, or rebuild it from source."""
        cacheFilePath = os.path.join(dirPath, "pfam-mapping-data.json")
        #
        logger.debug("Using cache data path %s", dirPath)
        self.__mU.mkdir(dirPath)
        #
        if useCache and self.__mU.exists(cacheFilePath):
            mapD = self.__mU.doImport(cacheFilePath, fmt="json")
            logger.debug("Pfam mapping data length %d", len(mapD))
            return mapD
        # ------
        sourcePath = self.__fetchSource(urlTargetPfam, urlTargetPfamFB, dirPath)
        mapD = self.__getPfamMapping(sourcePath)
        status = self.__mU.doExport(cacheFilePath, mapD, fmt="json")
        logger.info("Caching %d in %s status %r", len(mapD), cacheFilePath, status)
        # ------
        return mapD

    def __getPfamMapping(self, filePath):
        """Parse mapping data into {<pdbId>: [{region details}, ...]}; unparsable rows are logged."""

        def nullable(token):
            # "NULL" marks a missing insertion code
            token = token.strip()
            return token if token != "NULL" else None

        mapD = {}
        optD = {"encoding": "ascii"} if sys.version_info[0] < 3 else {}
        for row in self.__mU.doImport(filePath, fmt="tdd", rowFormat="list", **optD):
            try:
                entryId = row[2].strip().upper()
                regionD = {"pfamId": row[3].strip().upper()}
                regionD["authAsymId"] = row[5].strip()
                regionD["authSeqBeg"] = int(row[6].strip())
                seqBegInsert = nullable(row[7])
                regionD["authSeqEnd"] = int(row[8].strip())
                regionD["insertBeg"] = seqBegInsert
                regionD["insertEnd"] = nullable(row[9])
                mapD.setdefault(entryId, []).append(regionD)
            except Exception as e:
                logger.exception("Failing with %r %s", row, str(e))
        #
        logger.info("Pfam mapping data for (%d) entries", len(mapD))
        return mapD
class CARDTargetProvider:
    """Accessors for CARD target assignments."""

    def __init__(self, **kwargs):
        """Load the selected CARD data from the cache or from the CARD data dump.

        Keyword Args:
            cachePath (str, optional): top-level cache directory. Defaults to ".".
            useCache (bool, optional): reuse the cached selected data file. Defaults to True.
            CARDDumpUrl (str, optional): URL of the CARD data dump
        """
        self.__cachePath = kwargs.get("cachePath", ".")
        self.__dirPath = os.path.join(self.__cachePath, "CARD-targets")
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__oD, self.__version = self.__reload(self.__dirPath, **kwargs)

    def testCache(self, minCount=3000):
        """Return True if more than minCount models are loaded or False otherwise."""
        return bool(self.__oD) and len(self.__oD) > minCount

    def hasFeature(self, modelId):
        """Return True if the input model identifier is in the loaded data (False if nothing is loaded)."""
        return bool(self.__oD) and modelId in self.__oD

    def getFeature(self, modelId, featureKey):
        """Return the value of the feature for the input model or None if it is not available."""
        try:
            return self.__oD[modelId][featureKey]
        except Exception:
            return None

    def getAssignmentVersion(self):
        """Return the version string of the loaded CARD data."""
        return self.__version

    def getTargetDataPath(self):
        """Return the path of the CARD target data file in the cache directory."""
        return os.path.join(self.__dirPath, "card-target-data.json")

    def getCofactorDataPath(self):
        """Return None - no cofactor data file is defined for this provider."""
        return None

    def __reload(self, dirPath, **kwargs):
        """Load the selected CARD data from the cache file, or fetch and parse the CARD dump.

        Args:
            dirPath (str): directory holding the cached and downloaded CARD files

        Returns:
            (dict, str): selected CARD data dictionary and CARD version string
        """
        oD = None
        version = None
        startTime = time.time()
        useCache = kwargs.get("useCache", True)
        #
        # CARDDumpUrl = kwargs.get("CARDDumpUrl", "https://card.mcmaster.ca/latest/data/broadstreet-v3.1.0.tar.bz2")
        cardDumpUrl = kwargs.get("CARDDumpUrl", "https://card.mcmaster.ca/latest/data")
        ok = False
        fU = FileUtil()
        cardDumpFileName = "card-data.tar.bz2"
        cardDumpPath = os.path.join(dirPath, cardDumpFileName)
        cardDumpDirPath = os.path.join(dirPath, "dump")
        #
        fU.mkdir(dirPath)
        cardDataPath = os.path.join(dirPath, "card-select-data.json")
        #
        logger.info("useCache %r CARDDumpPath %r", useCache, cardDumpPath)
        if useCache and self.__mU.exists(cardDataPath):
            qD = self.__mU.doImport(cardDataPath, fmt="json")
            version = qD["version"]
            oD = qD["data"]
        else:
            logger.info("Fetching url %s path %s", cardDumpUrl, cardDumpPath)
            ok = fU.get(cardDumpUrl, cardDumpPath)
            fU.mkdir(cardDumpDirPath)
            fU.uncompress(cardDumpPath, outputDir=cardDumpDirPath)
            # Dropping the ".bz2" suffix gives the name of the uncompressed tar file
            fU.unbundleTarfile(os.path.join(cardDumpDirPath, cardDumpFileName[:-4]), dirPath=cardDumpDirPath)
            logger.info("Completed fetch (%r) at %s (%.4f seconds)", ok, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime)
            oD, version = self.__parseCardData(os.path.join(cardDumpDirPath, "card.json"))
            tS = datetime.datetime.now().isoformat()
            qD = {"version": version, "created": tS, "data": oD}
            ok = self.__mU.doExport(cardDataPath, qD, fmt="json", indent=3)
            logger.info("Export CARD data (%d) status %r", len(oD), ok)
        # ---
        return oD, version

    def exportCardFasta(self, fastaPath, taxonPath):
        """Export the loaded CARD sequences as a FASTA file with a companion taxonomy list.

        Args:
            fastaPath (str): fasta output file path
            taxonPath (str): taxonomy list output file path

        Returns:
            (bool): True for success or False otherwise
        """
        return self.__exportCardFasta(fastaPath, taxonPath, self.__oD)

    def __exportCardFasta(self, fastaPath, taxonPath, cardD):
        """Export a CARD sequence target fasta file

        Args:
            fastaPath (str): fasta output file path
            taxonPath (str): taxonomy list output file path
            cardD (dict): card selected data dictionary

        Returns:
            (bool): True if both files are exported or False otherwise
        """
        sD = {}
        taxonL = []
        # Defined up front so the failure path can always report and return them
        ok = False
        modelId = None
        tD = None
        try:
            for modelId, tD in cardD.items():
                aroId = tD["id"]
                if "sequences" not in tD:
                    continue
                modelBitScore = tD["modelBitScore"] if "modelBitScore" in tD else None
                for qD in tD["sequences"]:
                    sId = qD["seqId"]
                    seq = qD["sequence"]
                    taxId = qD["taxId"]
                    cD = {"sequence": seq, "modelId": modelId, "aroId": aroId, "seqId": sId, "taxId": taxId}
                    cD["bitScore"] = modelBitScore if modelBitScore else "-1.0"
                    # The comment identifier is a "|" separated series of value|key pairs
                    cL = []
                    for k, v in cD.items():
                        if k in ["sequence"]:
                            continue
                        cL.append(str(v))
                        cL.append(str(k))
                    cId = "|".join(cL)
                    sD[cId] = cD
                    taxonL.append("%s\t%s" % (cId, taxId))
            okF = self.__mU.doExport(fastaPath, sD, fmt="fasta", makeComment=True)
            logger.info("Export CARD fasta (%d) status %r", len(sD), okF)
            okT = self.__mU.doExport(taxonPath, taxonL, fmt="list")
            logger.info("Export Taxon (%d) status %r", len(taxonL), okT)
            # A failed fasta export must not be masked by a successful taxon export
            ok = bool(okF and okT)
        except Exception as e:
            logger.exception("Failing for model %r tD %r with %s", modelId, tD, str(e))
        return ok

    def __parseCardData(self, filePath):
        """Parse CARD target data

        Args:
            filePath (str): card json data file

        Returns:
            (dict, string): card selected data dictionary, card version string
        """
        oD = {}
        version = None
        try:
            cD = self.__mU.doImport(filePath, fmt="json")
            logger.info("CARD model count (%d)", len(cD))
            for modelId, mD in cD.items():
                # Keys with a leading underscore are metadata rather than models
                if modelId.startswith("_"):
                    if modelId == "_version":
                        version = mD
                    continue
                oD[modelId] = {}
                for kTup in [
                    ("ARO_accession", "accession"),
                    ("ARO_id", "id"),
                    ("ARO_name", "name"),
                    ("ARO_description", "descr"),
                    ("model_name", "modelName"),
                    ("model_type", "modelType"),
                ]:
                    if kTup[0] in mD:
                        oD[modelId][kTup[1]] = mD[kTup[0]]
                try:
                    if "model_sequences" in mD:
                        for seqId, tD in mD["model_sequences"]["sequence"].items():
                            oD[modelId].setdefault("sequences", []).append(
                                {"seqId": seqId, "sequence": tD["protein_sequence"]["sequence"], "taxId": tD["NCBI_taxonomy"]["NCBI_taxonomy_id"]}
                            )
                except Exception as e:
                    logger.exception("Failing with %s", str(e))
                try:
                    if "model_param" in mD and "blastp_bit_score" in mD["model_param"] and "param_value" in mD["model_param"]["blastp_bit_score"]:
                        oD[modelId]["modelBitScore"] = mD["model_param"]["blastp_bit_score"]["param_value"]
                except Exception as e:
                    logger.exception("Failing with %s", str(e))
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return oD, version
def main():
    """Command line entry point: run CCDC searches for each molecule file in the input list.

    The CSD and CCDC library environment variables are set before the CCDC search
    module is imported. Query identifiers with hits are optionally written to a list file.
    """
    parser = argparse.ArgumentParser()
    #
    parser.add_argument("--mol_list_path", default=None, help="Molecule file list path")
    parser.add_argument("--result_path", default=None, help="Search result output path")
    parser.add_argument("--search_type", default=None, help="Search type (similarity|substructure)")
    parser.add_argument("--start_record", default=None, help="Starting record")
    parser.add_argument("--end_record", default=None, help="End record")
    parser.add_argument("--csdhome", default=None, help="Path to the CSD release (path to CSD_202x)")
    parser.add_argument("--python_lib_path", default=None, help="Path to Python library")
    parser.add_argument("--python_version", default=None, help="Python library version (default: 3.7)")
    parser.add_argument("--hit_list_path", default=None, help="Path to list of molecule identifers with search results")
    #
    args = parser.parse_args()
    #
    try:
        # PYENV_ROOT is only consulted when no library path is given; a missing
        # variable is reported as an argument problem below.
        pyLib = args.python_lib_path if args.python_lib_path else os.path.join(os.environ["PYENV_ROOT"], "versions", "3.7.9", "lib")
        pyVer = args.python_version if args.python_version else "3.7"
        csdHome = args.csdhome
        molFilePath = args.mol_list_path
        resultPath = args.result_path
        searchType = args.search_type
        startRecord = args.start_record
        endRecord = args.end_record
        hitListPath = args.hit_list_path
    except Exception as e:
        logger.exception("Argument processing problem %s", str(e))
        parser.print_help(sys.stderr)
        sys.exit(1)
    #
    try:
        os.environ["CSDHOME"] = csdHome
        ccdcLibPath = "%s/python%s/site-packages/ccdc/_lib" % (pyLib, pyVer)
        # os.environ performs no shell expansion, so the current value is appended
        # explicitly instead of writing a literal "$LD_LIBRARY_PATH" entry.
        ldPathL = [pyLib, ccdcLibPath]
        curLdPath = os.environ.get("LD_LIBRARY_PATH")
        if curLdPath:
            ldPathL.append(curLdPath)
        os.environ["LD_LIBRARY_PATH"] = ":".join(ldPathL)
        os.environ["DYLD_LIBRARY_PATH"] = ccdcLibPath
        os.environ["DYLD_FRAMEWORK_PATH"] = ccdcLibPath

        logger.info("Using CSDHOME %s", os.environ["CSDHOME"])
        logger.info("Using DYLD_LIBRARY_PATH %s", os.environ["DYLD_LIBRARY_PATH"])
        logger.info("Using DYLD_FRAMEWORK_PATH %s", os.environ["DYLD_FRAMEWORK_PATH"])

        # Imported here so that the environment above is in place first
        from rcsb.utils.ccdc.CcdcSearch import CcdcSearch  # pylint: disable=import-outside-toplevel

        ccdcS = CcdcSearch(verbose=True)
        pL = ccdcS.getList(molFilePath, startRecord=startRecord, endRecord=endRecord)
        # Treat a missing list as empty so the summary below is still reported
        pL = pL if pL else []
        logger.info("Search file %s record length %r", molFilePath, len(pL))
        #
        hitL = []
        for ii, queryTargetPath in enumerate(pL, 1):
            _, fn = os.path.split(queryTargetPath)
            queryTargetId, _ = os.path.splitext(fn)
            #
            logger.info("(%d/%d) Start search for %r %r", ii, len(pL), queryTargetId, queryTargetPath)
            numHits = ccdcS.search(queryTargetId, queryTargetPath, resultPath, searchType=searchType)
            if numHits:
                hitL.append(queryTargetId)
        logger.info("%d searches completed - matched %d", len(pL), len(hitL))
        if hitListPath:
            mU = MarshalUtil()
            ok = mU.doExport(hitListPath, hitL, fmt="list")
            logger.info("Wrote hit list (%r) to %s", ok, hitListPath)
    except Exception as e:
        logger.exception("Failing with %s", str(e))
class DrugBankTargetCofactorProvider(StashableBase):
    """Accessors for DrugBank target cofactors.

    Cofactor data are cached in the "DrugBank-cofactors" directory below the cache
    path and are organized as a dictionary with the keys "version", "created" and
    "cofactors" (the latter keyed by upper-case "<entryId>_<entityId>").
    """

    def __init__(self, **kwargs):
        """
        Args:
            cachePath (str, optional): top-level cache directory path. Defaults to ".".
            useCache (bool, optional): load any existing cached data file. Defaults to True.
            fmt (str, optional): cache file format ("json" or "pickle"). Defaults to "pickle".
        """
        #
        self.__cachePath = kwargs.get("cachePath", ".")
        self.__useCache = kwargs.get("useCache", True)
        self.__fmt = kwargs.get("fmt", "pickle")
        self.__dirName = "DrugBank-cofactors"
        super(DrugBankTargetCofactorProvider, self).__init__(self.__cachePath, [self.__dirName])
        self.__dirPath = os.path.join(self.__cachePath, self.__dirName)
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__fD = self.__reload(self.__dirPath, self.__useCache, self.__fmt)
        #

    def __getCofactorD(self):
        """Return the loaded cofactor dictionary or an empty dictionary if no data are loaded."""
        return self.__fD.get("cofactors", {}) if self.__fD else {}

    def testCache(self, minCount=590):
        """Return True if the loaded cache holds more than minCount cofactor entries."""
        cofactorCount = len(self.__getCofactorD())
        logger.info("DrugBank feature count %d", cofactorCount)
        return cofactorCount > minCount

    def hasTarget(self, rcsbEntityId):
        """Return True if cofactor data are loaded for the input entity identifier (case insensitive)."""
        return rcsbEntityId.upper() in self.__getCofactorD()

    def getTargets(self, rcsbEntityId):
        """Return the list of cofactor records for the input entity identifier or an empty list."""
        try:
            return self.__fD["cofactors"][rcsbEntityId.upper()]
        except Exception:
            # Best effort accessor - missing data or a malformed identifier yields no targets.
            return []

    def __getCofactorDataPath(self, fmt="json"):
        """Return the cache data file path for the input format ('.json' for json, otherwise '.pic')."""
        fExt = "json" if fmt == "json" else "pic"
        return os.path.join(self.__dirPath, "drugbank-cofactor-data.%s" % fExt)

    def reload(self):
        """Reload the cofactor data from the cache data file. Always returns True."""
        self.__fD = self.__reload(self.__dirPath, useCache=True, fmt=self.__fmt)
        return True

    def __reload(self, dirPath, useCache, fmt):
        """Import the cached cofactor data file if it exists, otherwise create the cache directory.

        Args:
            dirPath (str): cache directory path created when no cached data are used
            useCache (bool): import the existing cache data file if present
            fmt (str): cache file format

        Returns:
            (dict): imported cofactor data or an empty dictionary
        """
        startTime = time.time()
        fD = {}
        ok = False
        cofactorPath = self.__getCofactorDataPath(fmt=fmt)
        #
        logger.info("useCache %r featurePath %r", useCache, cofactorPath)
        if useCache and self.__mU.exists(cofactorPath):
            fD = self.__mU.doImport(cofactorPath, fmt=fmt)
            # Report the successful cache import in the completion log below.
            ok = True
        else:
            fU = FileUtil()
            fU.mkdir(dirPath)
        # ---
        logger.info("Completed reload (%r) at %s (%.4f seconds)", ok, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime)
        return fD

    def buildCofactorList(self, sequenceMatchFilePath, crmpObj=None, lnmpObj=None):
        """Build target cofactor list for the matching entities in the input sequence match file.

        Args:
            sequenceMatchFilePath (str): sequence match output file path
            crmpObj (obj, optional): instance of ChemRefMappingProviderObj(). Defaults to None
            lnmpObj (obj, optional): instance of LigandNeighborMappingProviderObj(). Defaults to None.

        Returns:
            bool: True for success or False otherwise
        """
        rDL = []
        dbP = DrugBankTargetProvider(cachePath=self.__cachePath, useCache=True)
        mD = self.__mU.doImport(sequenceMatchFilePath, fmt="json")
        #
        provenanceSource = "DrugBank"
        refScheme = "PDB entity"
        assignVersion = str(dbP.getAssignmentVersion())
        for queryId, matchDL in mD.items():
            qCmtD = self.__decodeComment(queryId)
            unpId = qCmtD["uniprotId"]
            queryTaxId = qCmtD["taxId"] if "taxId" in qCmtD else None
            if not dbP.hasCofactor(unpId) or queryTaxId == "-1":
                logger.info("Skipping target %r", unpId)
                continue
            #
            # -- Collect the ligand neighbors for all of the entities matching this query
            chemCompNeighborsD = {}
            if lnmpObj:
                for matchD in matchDL:
                    tCmtD = self.__decodeComment(matchD["target"])
                    entryId = tCmtD["entityId"].split("_")[0]
                    entityId = tCmtD["entityId"].split("_")[1]
                    rcsbEntityId = entryId + "_" + entityId
                    chemCompIdList = lnmpObj.getLigandNeighbors(rcsbEntityId)
                    chemCompNeighborsD.update({k: True for k in chemCompIdList})
            # --
            # The cofactor lookup depends only on the query accession, so fetch it once per query.
            dbDL = dbP.getCofactors(unpId)
            #
            for matchD in matchDL:
                tCmtD = self.__decodeComment(matchD["target"])
                entryId = tCmtD["entityId"].split("_")[0]
                entityId = tCmtD["entityId"].split("_")[1]
                # -- A separate cofactor list is built for each matching entity record
                cfDL = []
                for dbD in dbDL:
                    cfD = {}
                    cfD["cofactor_id"] = dbD["drugbank_id"]
                    cfD["molecule_name"] = dbD["name"]
                    cfD["target_name"] = dbD["target_name"]
                    # cfD["description"] = dbD["description"]
                    cfD["moa"] = dbD["moa"]
                    # cfD["pharmacology"] = dbD["pharmacology"]
                    cfD["inchi_key"] = dbD["inchi_key"]
                    cfD["smiles"] = dbD["smiles"]
                    cfD["pubmed_ids"] = dbD["pubmed_ids"]
                    cfD = self.__addLocalIds(cfD, crmpObj)
                    #
                    if "chem_comp_id" in cfD and cfD["chem_comp_id"] in chemCompNeighborsD:
                        cfD["neighbor_in_pdb"] = "Y"
                    else:
                        cfD["neighbor_in_pdb"] = "N"
                    #
                    cfDL.append(cfD)
                # ---
                queryName = cfDL[0]["target_name"] if cfDL and "target_name" in cfDL[0] else None
                # ---
                # aligned_target.entity_beg_seq_id (current target is PDB entity in json)
                # aligned_target.target_beg_seq_id (current query is target seq in json)
                # aligned_target.length
                fpL = []
                if "alignedRegions" in matchD:
                    # NOTE(review): length is computed as end - begin here - confirm whether targetEnd is inclusive.
                    fpL = [
                        {
                            "entity_beg_seq_id": arD["targetBegin"],
                            "target_beg_seq_id": arD["queryBegin"],
                            "length": arD["targetEnd"] - arD["targetBegin"],
                        }
                        for arD in matchD["alignedRegions"]
                    ]
                else:
                    fpL = [
                        {
                            "entity_beg_seq_id": matchD["targetBegin"],
                            "target_beg_seq_id": matchD["queryBegin"],
                            "length": matchD["alignLen"],
                        }
                    ]
                # ---
                rD = {
                    "entry_id": entryId,
                    "entity_id": entityId,
                    "query_uniprot_id": unpId,
                    "query_id": unpId,
                    "query_id_type": "DrugBank",
                    "query_name": queryName,
                    "provenance_source": provenanceSource,
                    "reference_scheme": refScheme,
                    "assignment_version": assignVersion,
                    "query_taxonomy_id": int(queryTaxId) if queryTaxId else None,
                    "target_taxonomy_id": int(matchD["targetTaxId"]) if "targetTaxId" in matchD else None,
                    "aligned_target": fpL,
                    "taxonomy_match_status": matchD["taxonomyMatchStatus"] if "taxonomyMatchStatus" in matchD else None,
                    "lca_taxonomy_id": matchD["lcaTaxId"] if "lcaTaxId" in matchD else None,
                    "lca_taxonomy_name": matchD["lcaTaxName"] if "lcaTaxName" in matchD else None,
                    "lca_taxonomy_rank": matchD["lcaRank"] if "lcaRank" in matchD else None,
                    "cofactors": cfDL,
                }
                rDL.append(rD)
        #
        # Group the result records by entity identifier (<entryId>_<entityId>)
        qD = {}
        for rD in rDL:
            eId = rD["entry_id"] + "_" + rD["entity_id"]
            qD.setdefault(eId, []).append(rD)
        fp = self.__getCofactorDataPath(fmt=self.__fmt)
        tS = datetime.datetime.now().isoformat()
        # vS = datetime.datetime.now().strftime("%Y-%m-%d")
        vS = assignVersion
        ok = self.__mU.doExport(fp, {"version": vS, "created": tS, "cofactors": qD}, fmt=self.__fmt, indent=3)
        return ok

    def __addLocalIds(self, cfD, crmpOb=None):
        """Add the local identifier (prd_id or chem_comp_id) mapped to the DrugBank cofactor identifier.

        Only the first local identifier returned by the mapping provider is used.
        """
        #
        if crmpOb:
            localIdL = crmpOb.getLocalIds("DRUGBANK", cfD["cofactor_id"])
            if localIdL:
                localId = localIdL[0]
                if localId.startswith("PRD_"):
                    cfD["prd_id"] = localId
                else:
                    cfD["chem_comp_id"] = localId
        return cfD

    def __decodeComment(self, comment, separator="|"):
        """Decode a separator-delimited comment of alternating value and key tokens.

        Tokens are consumed in (value, key) pairs, and an unpaired trailing token is ignored.

        Returns:
            (dict): {key: value, ...} or an empty dictionary if the comment cannot be decoded
        """
        dD = {}
        try:
            ti = iter(comment.split(separator))
            dD = {tup[1]: tup[0] for tup in zip(ti, ti)}
        except Exception as e:
            # Best effort decoding - report the problem rather than discarding it silently.
            logger.debug("Unable to decode comment %r (%s)", comment, str(e))
        return dD
class EntityPolymerExtractorTests(unittest.TestCase):
    """Tests for extracting and accessing entity polymer data with EntityPolymerExtractor."""

    def __init__(self, methodName="runTest"):
        super(EntityPolymerExtractorTests, self).__init__(methodName)
        self.__verbose = True

    def setUp(self):
        """Configure the mock-data paths, the configuration object and the cache locations."""
        sectionName = "site_info_configuration"
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(self.__mockTopPath, "config", "dbload-setup-example.yml")
        self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=sectionName, mockTopPath=self.__mockTopPath)
        #
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__workPath = os.path.join(HERE, "test-output")
        taxonomyDirName = self.__cfgOb.get("NCBI_TAXONOMY_CACHE_DIR", sectionName=sectionName)
        self.__taxonomyDataPath = os.path.join(self.__cachePath, taxonomyDirName)
        #
        self.__cacheKwargs = {"fmt": "json", "indent": 3}
        exdbDirName = self.__cfgOb.get("EXDB_CACHE_DIR", sectionName=sectionName)
        self.__exdbCacheDirPath = os.path.join(self.__cachePath, exdbDirName)
        #
        self.__mU = MarshalUtil()
        self.__entryLimitTest = 18
        #
        self.__startTime = time.time()
        timeStamp = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        logger.debug("Starting %s at %s", self.id(), timeStamp)

    def tearDown(self):
        """Log the completion time and the elapsed time for the test."""
        finishTime = time.time()
        timeStamp = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        logger.info("Completed %s at %s (%.4f seconds)\n", self.id(), timeStamp, finishTime - self.__startTime)

    def __newExtractor(self, **extraKwargs):
        """Return an EntityPolymerExtractor built with the common test settings (useCache=False)."""
        return EntityPolymerExtractor(
            self.__cfgOb,
            exdbDirPath=self.__exdbCacheDirPath,
            useCache=False,
            cacheKwargs=self.__cacheKwargs,
            **extraKwargs
        )

    def testExtractEntityPolymers(self):
        """Test case - extract entity polymer info with a limit on the number of entries"""
        try:
            extractor = self.__newExtractor(entryLimit=self.__entryLimitTest)
            entryCount = extractor.getEntryCount()
            self.assertGreaterEqual(entryCount, self.__entryLimitTest)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testAccessEntityPolymerFeatures(self):
        """Test case - access entry counts and reference sequence accessions"""
        try:
            extractor = self.__newExtractor()
            entryCount = extractor.getEntryCount()
            logger.info("Entry count %d", entryCount)
            self.assertGreaterEqual(entryCount, self.__entryLimitTest)
            #
            accessionL = extractor.getRefSeqAccessions("UNP")
            logger.info("Ref seq count %d", len(accessionL))
            self.assertGreaterEqual(len(accessionL), 1)
            #
            for entryId, entityId in [("3RER", "1")]:
                entityAccessionL = extractor.getEntityRefSeqAccessions("UNP", entryId, entityId)
                logger.info("UNP for %s %s %r", entryId, entityId, entityAccessionL)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testTaxonomyReadCache(self):
        """Test case - export the source taxonomy assignments that differ from their merged taxonomy identifiers"""
        try:
            extractor = self.__newExtractor()
            logger.info("Cache entry count %d", extractor.getEntryCount())
            #
            obsoleteL = []
            taxonD = extractor.getOrigTaxons()
            logger.info("Taxons %d", len(taxonD))
            taxProvider = TaxonomyProvider(taxDirPath=self.__taxonomyDataPath, useCache=True)
            #
            for entryId, entityTaxL in taxonD.items():
                for entityId, origTaxId in entityTaxL:
                    mergedTaxId = taxProvider.getMergedTaxId(origTaxId)
                    if origTaxId == mergedTaxId:
                        continue
                    obsoleteL.append({"entryId": entryId, "entityId": entityId, "taxId": origTaxId, "replaceTaxId": mergedTaxId})
            logger.info("Obsolete list length %d", len(obsoleteL))
            exportPath = os.path.join(self.__workPath, "obsolete-taxons.json")
            self.__mU.doExport(exportPath, obsoleteL, fmt="json", indent=3)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testAccessEntityPolymerReadCache(self):
        """Test case - access reference sequence counts, alignment range checks and taxonomy summaries"""
        try:
            extractor = self.__newExtractor()
            logger.info("Cache entry count %d", extractor.getEntryCount())
            countD = extractor.countRefSeqAccessions("UNP")
            self.assertGreaterEqual(len(countD), 2)
            logger.info("UNP reference sequences per entity %r", dict(sorted(countD.items())))
            anyCountD = extractor.countRefSeqAccessionAny()
            logger.info("Reference sequences per entity %r", dict(sorted(anyCountD.items())))
            dbTypeCountD = extractor.countRefSeqAccessionDbType()
            logger.info("Reference sequences per ref db %r", dict(sorted(dbTypeCountD.items())))
            #
            ok = extractor.checkRefSeqAlignRange("UNP")
            self.assertTrue(ok)
            accessionL = extractor.getRefSeqAccessions("UNP")
            logger.info("Unique UNP reference sequences %d", len(accessionL))
            self.assertTrue(ok)
            taxonD = extractor.getUniqueTaxons()
            logger.info("Unique taxons %d", len(taxonD))
            taxonD = extractor.countRefSeqAccessionByTaxon("UNP")
            logger.info("Unique taxons %d", len(taxonD))
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
class ScanRepoUtil(object):
    """Tools for scanning repositories and collecting coverage and type data information."""

    def __init__(self, cfgOb, attributeDataTypeD=None, numProc=4, chunkSize=15, fileLimit=None, maxStepLength=2000, workPath=None):
        """
        Args:
            cfgOb (object): Configuration object (rcsb.utils.config.ConfigUtil)
            attributeDataTypeD (dict, optional): data type names organized as {category: {attribute: type}};
                                                 only the type "float" is treated specially in the scan
            numProc (int, optional): Number of parallel worker processes used.
            chunkSize (int, optional): Size of files processed in a single multi-proc process
            fileLimit (int, optional): maximum file scanned or None for no limit
            maxStepLength (int, optional): maximum number of paths submitted in a single multi-proc run
            workPath (str, optional): working path passed to the marshalling utility and used as
                                      the repository provider cache path
        """
        #
        self.__attributeDataTypeD = attributeDataTypeD if attributeDataTypeD else {}
        # Limit the load length of each file type for testing  -  Set to None to remove -
        self.__fileLimit = fileLimit
        self.__maxStepLength = maxStepLength
        #
        # Controls for multiprocessing execution -
        self.__numProc = numProc
        self.__chunkSize = chunkSize
        #
        self.__cfgOb = cfgOb
        #
        self.__mpFormat = "[%(levelname)s] %(asctime)s %(processName)s-%(module)s.%(funcName)s: %(message)s"
        self.__workPath = workPath
        self.__mU = MarshalUtil(workPath=self.__workPath)
        self.__rpP = RepositoryProvider(self.__cfgOb, numProc=self.__numProc, fileLimit=self.__fileLimit, cachePath=self.__workPath)

    def scanContentType(
        self, contentType, mergeContentTypes=None, scanType="full", inputPathList=None, scanDataFilePath=None, failedFilePath=None, saveInputFileListPath=None
    ):
        """Driver method for repository scan operation

        Args:
            contentType (str): one of 'bird', 'bird_family', 'bird_chem_comp', 'chem_comp', 'pdbx'
            mergeContentTypes (list, optional): content types passed to the repository provider locator search
            scanType (str, optional): 'full' [or 'incr' to be supported]
            inputPathList (list, optional): list of input file paths to scan
            scanDataFilePath (str, optional): file path for serialized scan data (Pickle format)
            failedFilePath (str, optional): file path for list of files that fail scanning operation
            saveInputFileListPath (str, optional): Path to store file path list that is scanned

        Returns:
            bool: True for success or False otherwise
        """
        try:
            startTime = self.__begin(message="scanning operation")
            #
            locatorObjList = self.__rpP.getLocatorObjList(contentType=contentType, inputPathList=inputPathList, mergeContentTypes=mergeContentTypes)
            #
            if saveInputFileListPath:
                self.__mU.doExport(saveInputFileListPath, self.__rpP.getLocatorPaths(locatorObjList), fmt="list")
                logger.debug("Saving %d paths in %s", len(locatorObjList), saveInputFileListPath)
            #
            optD = {}
            optD["contentType"] = contentType
            optD["logSize"] = True
            optD["scanType"] = scanType
            # ---------------- - ---------------- - ---------------- - ---------------- - ---------------- -
            #
            numProc = self.__numProc
            # A chunk size of zero is passed when the full list is no longer than a single chunk.
            chunkSize = self.__chunkSize if locatorObjList and self.__chunkSize < len(locatorObjList) else 0
            #
            # ---------------- - ---------------- - ---------------- - ---------------- - ---------------- -
            numPaths = len(locatorObjList)
            logger.debug("Processing %d total paths", numPaths)
            numProc = min(numProc, numPaths)
            maxStepLength = self.__maxStepLength
            if numPaths > maxStepLength:
                # Stride the full list into numLists interleaved sublists
                numLists = int(numPaths / maxStepLength)
                subLists = [locatorObjList[i::numLists] for i in range(numLists)]
            else:
                subLists = [locatorObjList]
            #
            if subLists:
                logger.debug("Starting with numProc %d outer subtask count %d subtask length ~ %d", numProc, len(subLists), len(subLists[0]))
            #
            numResults = 1
            failList = []
            retLists = [[] for _ in range(numResults)]
            diagList = []
            for ii, subList in enumerate(subLists):
                logger.info("Running outer subtask %d of %d length %d", ii + 1, len(subLists), len(subList))
                #
                mpu = MultiProcUtil(verbose=True)
                mpu.setOptions(optionsD=optD)
                mpu.set(workerObj=self, workerMethod="scanWorker")
                ok, failListT, retListsT, diagListT = mpu.runMulti(dataList=subList, numProc=numProc, numResults=numResults, chunkSize=chunkSize)
                failList.extend(failListT)
                # retLists is a list of lists -
                logger.debug("status %r fail len %r ret len %r", ok, len(failListT), len(retListsT))
                for jj in range(numResults):
                    retLists[jj].extend(retListsT[jj])
                diagList.extend(diagListT)
            logger.debug("Scan failed path list %r", failList)
            logger.debug("Scan path list success length %d load list failed length %d", len(locatorObjList), len(failList))
            logger.debug("Returned metadata length %r", len(retLists[0]))
            #
            if failedFilePath and failList:
                wOk = self.__mU.doExport(failedFilePath, self.__rpP.getLocatorPaths(failList), fmt="list")
                logger.debug("Writing scan failure path list to %s status %r", failedFilePath, wOk)
            #
            if scanType == "incr":
                # NOTE(review): if no prior scan data can be imported this presumably yields None and the
                # keys() call below fails into the enclosing handler (returning False) - confirm intent.
                scanDataD = self.__mU.doImport(scanDataFilePath, fmt="pickle", default=None)
                logger.debug("Imported scan data with keys %r", list(scanDataD.keys()))
            else:
                scanDataD = {}
            #
            if scanDataFilePath and retLists[0]:
                for ssTup in retLists[0]:
                    cId = ssTup.containerId
                    if scanType == "full" and cId in scanDataD:
                        logger.error("Duplicate container id %s in %r and %r", cId, ssTup.fromPath, scanDataD[cId].fromPath)
                    #
                    scanDataD[cId] = ssTup

                ok = self.__mU.doExport(scanDataFilePath, scanDataD, fmt="pickle")
                # Verify the export by reading the data back - this comparison sets the returned status.
                tscanDataD = self.__mU.doImport(scanDataFilePath, fmt="pickle")
                ok = tscanDataD == scanDataD

            self.__end(startTime, "scanning operation with status " + str(ok))
            #
            return ok
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return False

    def evalScan(self, scanDataFilePath, evalJsonFilePath, evalType="data_type"):
        """Evaluate the stored scan data and export the result in JSON format.

        Args:
            scanDataFilePath (str): path to the serialized scan data (Pickle format)
            evalJsonFilePath (str): output path for the evaluation (JSON format)
            evalType (str, optional): evaluation type ('data_type' or 'data_coverage')

        Returns:
            bool: export status, or False for an unknown evaluation type
        """
        scanDataD = self.__mU.doImport(scanDataFilePath, fmt="pickle")
        if evalType in ["data_type"]:
            rD = self.__evalScanDataType(scanDataD)
        elif evalType in ["data_coverage"]:
            rD, _ = self.__evalScanDataCoverage(scanDataD)
        else:
            # There is nothing to export for an unsupported evaluation type.
            logger.error("Unknown evalType %r", evalType)
            return False
        ok = self.__mU.doExport(evalJsonFilePath, rD, fmt="json")

        return ok

    def evalScanItem(self, scanDataFilePath, evalFilePath):
        """Export the per-item coverage counts for the stored scan data as a list of text lines.

        Args:
            scanDataFilePath (str): path to the serialized scan data (Pickle format)
            evalFilePath (str): output path for the coverage list

        Returns:
            bool: export status
        """
        scanDataD = self.__mU.doImport(scanDataFilePath, fmt="pickle")
        _, cL = self.__evalScanDataCoverage(scanDataD)
        ok = self.__mU.doExport(evalFilePath, cL, fmt="list")
        return ok

    def __evalScanDataType(self, scanDataD):
        """Merge the width and precision ranges for each attribute over all scanned containers.

        ScanValue = collections.namedtuple('ScanValue', 'containerId, catName, atName, minWidth, maxWidth, minPrec, maxPrec')
        ScanSummary = collections.namedtuple('ScanSummary', 'containerId, fromPath, scanDate, scanCategoryDict')

        Returns:
            (dict): sD[category][atName] -> {minWidth: , maxWidth:, minPrec:, maxPrec: , count}
        """
        # for populated sD[category] -> d[atName]->{minWidth: , maxWidth:, minPrec:, maxPrec: , count}
        sD = {}
        for ssTup in scanDataD.values():
            for catName, svTupL in ssTup.scanCategoryDict.items():
                atD = sD.setdefault(catName, {})
                for svTup in svTupL:
                    if svTup.atName not in atD:
                        atD[svTup.atName] = {"minWidth": svTup.minWidth, "maxWidth": svTup.maxWidth, "minPrec": svTup.minPrec, "maxPrec": svTup.maxPrec, "count": 1}
                        continue
                    vD = atD[svTup.atName]
                    vD["minWidth"] = min(vD["minWidth"], svTup.minWidth)
                    vD["maxWidth"] = max(vD["maxWidth"], svTup.maxWidth)
                    vD["minPrec"] = min(vD["minPrec"], svTup.minPrec)
                    vD["maxPrec"] = max(vD["maxPrec"], svTup.maxPrec)
                    vD["count"] += 1
        return sD

    def __evalScanDataCoverage(self, scanDataD):
        """Count the containers in which each attribute is populated.

        ScanValue = collections.namedtuple('ScanValue', 'containerId, catName, atName, minWidth, maxWidth, minPrec, maxPrec')
        ScanSummary = collections.namedtuple('ScanSummary', 'containerId, fromPath, scanDate, scanCategoryDict')

        Returns:
            (dict, list): sD[category][atName] -> {count: #, instances: [id,id,id]} and
                          a list of "_<category>.<attribute><tab><count>" text lines
        """

        # for populated sD[category] -> d[atName]->{count: #, instances: [id,id,id]}
        sD = {}
        for ssTup in scanDataD.values():
            for catName, svTupL in ssTup.scanCategoryDict.items():
                atD = sD.setdefault(catName, {})
                for svTup in svTupL:
                    cD = atD.setdefault(svTup.atName, {"count": 0, "instances": []})
                    cD["instances"].append(svTup.containerId)
                    cD["count"] += 1
        cL = []
        for catName, aD in sD.items():
            for atName, tD in aD.items():
                cL.append("%s\t%s" % ("_" + catName + "." + atName, tD["count"]))
        return sD, cL

    def scanWorker(self, dataList, procName, optionsD, workingDir):
        """Multi-proc worker method for scanning repository data files-

        Returns:
            (list, list, list): paths scanned successfully, the ScanSummary results and an empty
                                diagnostics list (three empty lists on failure)
        """
        try:
            _ = workingDir
            startTime = self.__begin(message=procName)
            # Recover common options
            scanType = optionsD["scanType"]
            contentType = optionsD["contentType"]
            #
            successList = []
            retList = []

            containerList = self.__getContainerList(dataList)
            for container in containerList:
                ret = self.__scanContainer(container)
                successList.append(ret.fromPath)
                retList.append(ret)
            #

            logger.debug("%s scanType %s contentType %spathlist length %d containerList length %d", procName, scanType, contentType, len(dataList), len(containerList))

            ok = len(successList) == len(dataList)
            #
            self.__end(startTime, procName + " with status " + str(ok))
            return successList, retList, []

        except Exception as e:
            logger.error("Failing with dataList %r", dataList)
            logger.exception("Failing with %s", str(e))

        return [], [], []

    def __getContainerList(self, locatorObjList):
        """Return the data containers for the input locators each with an appended 'rcsb_load_status'
        category recording the container name, a UTC timestamp and the source locator path.
        """
        utcnow = datetime.datetime.utcnow()
        ts = utcnow.strftime("%Y-%m-%d:%H:%M:%S")
        cL = []
        # Containers are read one locator at a time so that each can be tagged with its source path.
        for loc in locatorObjList:
            myContainerList = self.__rpP.getContainerList([loc])
            lPathL = self.__rpP.getLocatorPaths([loc])
            for cA in myContainerList:
                dc = DataCategory("rcsb_load_status", ["name", "load_date", "locator"], [[cA.getName(), ts, lPathL[0]]])
                logger.debug("data category %r", dc)
                cA.append(dc)
                cL.append(cA)
        return cL

    def __scanContainer(self, container):
        """Scan the input container and return the value width and precision ranges for each
        populated attribute.

        Returns:
            (ScanSummary): name, locator path and load date taken from the 'rcsb_load_status'
                           category, with a dictionary of ScanValue lists keyed by category name
        """
        cName = container.getName()
        loadStatusObj = container.getObj("rcsb_load_status")
        lName = loadStatusObj.getValue(attributeName="name", rowIndex=0)
        lFilePath = loadStatusObj.getValue(attributeName="locator", rowIndex=0)
        lDate = loadStatusObj.getValue(attributeName="load_date", rowIndex=0)
        #
        oD = {}
        for objName in container.getObjNameList():
            if objName == "rcsb_load_status":
                continue
            obj = container.getObj(objName)
            afD = self.__attributeDataTypeD[objName] if objName in self.__attributeDataTypeD else {}
            atNameList = obj.getAttributeList()
            # Sentinel limits - a maximum width of -1 marks an attribute with no populated values.
            wMin = {atName: 100000 for atName in atNameList}
            wMax = {atName: -1 for atName in atNameList}
            pMin = {atName: 100000 for atName in atNameList}
            pMax = {atName: -1 for atName in atNameList}
            for row in obj.getRowList():
                for ii, val in enumerate(row):
                    valLen = len(val)
                    # Skip empty and placeholder ('?' or '.') values
                    if (valLen == 0) or (val == "?") or (val == "."):
                        continue
                    atName = atNameList[ii]
                    wMin[atName] = min(wMin[atName], valLen)
                    wMax[atName] = max(wMax[atName], valLen)
                    if atName in afD and afD[atName] == "float":
                        vPrec = 0
                        try:
                            # Precision is the number of characters following the first decimal point
                            fields = val.split(".")
                            vPrec = len(fields[1])
                            pMin[atName] = min(pMin[atName], vPrec)
                            pMax[atName] = max(pMax[atName], vPrec)
                        except Exception as e:
                            # NOTE(review): a value without a decimal point resets both limits to zero,
                            # so the reported maximum precision depends on row order - confirm intent.
                            logger.debug("Failed to process float %s %r %r %s", atName, val, vPrec, str(e))
                            pMin[atName] = 0
                            pMax[atName] = 0
                        logger.debug("Got float for %s %r %r", atName, val, vPrec)
                    else:
                        pMin[atName] = 0
                        pMax[atName] = 0

            # ScanValue - containerId, catName, atName, minWidth, maxWidth, minPrec, maxPrec
            oD[objName] = [ScanValue(cName, objName, atN, wMin[atN], wMax[atN], pMin[atN], pMax[atN]) for atN in wMax if wMax[atN] != -1]
        # ScanSummary containerId, fromPath, scanCategoryDict
        #
        ret = ScanSummary(lName, lFilePath, lDate, oD)
        #
        return ret

    def __begin(self, message=""):
        """Log the start of an operation and return the start time (seconds since the epoch)."""
        startTime = time.time()
        ts = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        logger.debug("Starting %s at %s", message, ts)
        return startTime

    def __end(self, startTime, message=""):
        """Log the completion of an operation with the time elapsed since startTime."""
        endTime = time.time()
        ts = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        delta = endTime - startTime
        logger.debug("Completed %s at %s (%.4f seconds)", message, ts, delta)