class ChEMBLTargetCofactorProvider(StashableBase):
    """Accessors for ChEMBL target cofactors."""

    def __init__(self, **kwargs):
        #
        self.__cachePath = kwargs.get("cachePath", ".")
        self.__dirName = "ChEMBL-cofactors"
        super(ChEMBLTargetCofactorProvider, self).__init__(self.__cachePath, [self.__dirName])
        self.__dirPath = os.path.join(self.__cachePath, self.__dirName)
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__fD = self.__reload(self.__dirPath, **kwargs)
        #

    def testCache(self, minCount=1):
        logger.info("ChEMBL cofactor count %d", len(self.__fD["cofactors"]) if "cofactors" in self.__fD else 0)
        return bool(self.__fD and "cofactors" in self.__fD and len(self.__fD["cofactors"]) > minCount)

    def hasTarget(self, rcsbEntityId):
        return rcsbEntityId.upper() in self.__fD["cofactors"]

    def getTargets(self, rcsbEntityId):
        try:
            return self.__fD["cofactors"][rcsbEntityId.upper()]
        except Exception:
            return []

    def __getCofactorDataPath(self):
        return os.path.join(self.__dirPath, "ChEMBL-cofactor-data.json")

    def reload(self):
        self.__fD = self.__reload(self.__dirPath, useCache=True)
        return True

    def __reload(self, dirPath, **kwargs):
        startTime = time.time()
        fD = {}
        useCache = kwargs.get("useCache", True)
        ok = False
        cofactorPath = self.__getCofactorDataPath()
        #
        logger.info("useCache %r cofactorPath %r", useCache, cofactorPath)
        if useCache and self.__mU.exists(cofactorPath):
            fD = self.__mU.doImport(cofactorPath, fmt="json")
            ok = True
        else:
            fU = FileUtil()
            fU.mkdir(dirPath)
        # ---
        logger.info("Completed reload with status (%r) at %s (%.4f seconds)", ok, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime)
        return fD

    def buildCofactorList(self, sequenceMatchFilePath, crmpObj=None, lnmpObj=None, maxActivity=5):
        """Build target cofactor list for the matching entities in the input sequence match file.

        Args:
            sequenceMatchFilePath (str): sequence match output file path
            crmpObj (obj, optional): instance of ChemRefMappingProviderObj()
            lnmpObj (obj, optional): instance of LigandNeighborMappingProviderObj(). Defaults to None.
            maxActivity (int, optional): maximum number of prioritized activity records per target

        Returns:
            bool: True for success or False otherwise

            Example activity record -

                        "CHEMBL3243": [
                    {
                        "assay_chembl_id": "CHEMBL655768",
                        "assay_description": "In vitro inhibitory activity against recombinant human CD45 using fluorescein diphosphate (FDP) as a substrate",
                        "assay_type": "B",
                        "canonical_smiles": "COC(=O)c1ccc(C2=CC(=O)C(=O)c3ccccc32)cc1",
                        "ligand_efficiency": {
                        "bei": "19.78",
                        "le": "0.36",
                        "lle": "3.11",
                        "sei": "9.57"
                        },
                        "molecule_chembl_id": "CHEMBL301254",
                        "parent_molecule_chembl_id": "CHEMBL301254",
                        "pchembl_value": "5.78",
                        "standard_relation": "=",
                        "standard_type": "IC50",
                        "standard_units": "nM",
                        "standard_value": "1650.0",
                        "target_chembl_id": "CHEMBL3243"
                    },
        """
        rDL = []
        mD = self.__mU.doImport(sequenceMatchFilePath, fmt="json")
        #
        chP = ChEMBLTargetProvider(cachePath=self.__cachePath, useCache=False)
        # ---
        chaP = ChEMBLTargetActivityProvider(cachePath=self.__cachePath, useCache=True)
        #
        provenanceSource = "ChEMBL"
        refScheme = "PDB entity"
        assignVersion = chP.getAssignmentVersion()
        for queryId, matchDL in mD.items():
            qCmtD = self.__decodeComment(queryId)
            unpId = qCmtD["uniprotId"]
            queryTaxId = qCmtD["taxId"] if "taxId" in qCmtD else None
            chemblIdL = qCmtD["chemblId"].split(",")
            if queryTaxId == "-1":
                logger.info("Skipping target with missing taxonomy %r (%r)", unpId, chemblIdL)
                continue
            queryName = chP.getTargetDescription(unpId)
            for chemblId in chemblIdL:
                if not chaP.hasTargetActivity(chemblId):
                    logger.debug("Skipping target %r (%r)", unpId, chemblId)
                    # continue
                # --
                chemCompNeighborsD = {}
                if lnmpObj:
                    for matchD in matchDL:
                        tCmtD = self.__decodeComment(matchD["target"])
                        entryId = tCmtD["entityId"].split("_")[0]
                        entityId = tCmtD["entityId"].split("_")[1]
                        rcsbEntityId = entryId + "_" + entityId
                        chemCompIdList = lnmpObj.getLigandNeighbors(rcsbEntityId)
                        chemCompNeighborsD.update({k: True for k in chemCompIdList})
                # --
                for matchD in matchDL:
                    tCmtD = self.__decodeComment(matchD["target"])
                    entryId = tCmtD["entityId"].split("_")[0]
                    entityId = tCmtD["entityId"].split("_")[1]
                    #
                    taDL = chaP.getTargetActivity(chemblId)
                    logger.debug("Target %r has (%d) activity records", chemblId, len(taDL))
                    # ---
                    actL = []
                    for taD in taDL:
                        if taD["assay_type"] in ["B", "F"]:
                            try:
                                if taD["standard_units"] == "nM" and taD["standard_value"] and float(taD["standard_value"]) > 0.0:
                                    pV = -math.log10(float(taD["standard_value"]) * 10.0e-9)
                                    actD = {
                                        "cofactor_id": taD["molecule_chembl_id"],
                                        "assay_id": taD["assay_chembl_id"],
                                        "assay_description": taD["assay_description"],
                                        "measurement_type": "p" + taD["standard_type"],
                                        "measurement_value": round(pV, 2),
                                        "smiles": taD["canonical_smiles"],
                                        "molecule_name": taD["molecule_name"],
                                        "inchi_key": taD["inchi_key"],
                                        "action": taD["action"],
                                        "moa": taD["moa"],
                                        "max_phase": taD["max_phase"],
                                    }
                                    actD = self.__addLocalIds(actD, crmpObj=crmpObj)
                                    actL.append(actD)
                            except Exception as e:
                                logger.debug("Failing for tAD %r with %s", taD, str(e))

                    # ---
                    actL = self.__activityListSelect(actL, chemCompNeighborsD, maxActivity=maxActivity)
                    if not actL:
                        logger.debug("No ChEMBL cofactors for %s %s", chemblId, unpId)
                    # ---
                    # aligned_target.entity_beg_seq_id (current target is PDB entity in json)
                    # aligned_target.target_beg_seq_id (current query is target seq in json)
                    # aligned_target.length
                    fpL = []
                    if "alignedRegions" in matchD:
                        fpL = [
                            {
                                "entity_beg_seq_id": arD["targetBegin"],
                                "target_beg_seq_id": arD["queryBegin"],
                                "length": arD["targetEnd"] - arD["targetBegin"],
                            }
                            for arD in matchD["alignedRegions"]
                        ]
                    else:
                        fpL = [
                            {
                                "entity_beg_seq_id": matchD["targetBegin"],
                                "target_beg_seq_id": matchD["queryBegin"],
                                "length": matchD["alignLen"],
                            }
                        ]
                    # ---
                    rD = {
                        "entry_id": entryId,
                        "entity_id": entityId,
                        "query_uniprot_id": unpId,
                        "query_id": chemblId,
                        "query_id_type": "ChEMBL",
                        "query_name": queryName,
                        "provenance_source": provenanceSource,
                        "reference_scheme": refScheme,
                        "assignment_version": assignVersion,
                        "query_taxonomy_id": int(queryTaxId) if queryTaxId else None,
                        "target_taxonomy_id": int(matchD["targetTaxId"]) if "targetTaxId" in matchD else None,
                        #
                        "aligned_target": fpL,
                        #
                        "taxonomy_match_status": matchD["taxonomyMatchStatus"] if "taxonomyMatchStatus" in matchD else None,
                        "lca_taxonomy_id": matchD["lcaTaxId"] if "lcaTaxId" in matchD else None,
                        "lca_taxonomy_name": matchD["lcaTaxName"] if "lcaTaxName" in matchD else None,
                        "lca_taxonomy_rank": matchD["lcaRank"] if "lcaRank" in matchD else None,
                        "cofactors": actL,
                    }
                    rDL.append(rD)
            #
        qD = {}
        for rD in rDL:
            eId = rD["entry_id"] + "_" + rD["entity_id"]
            qD.setdefault(eId, []).append(rD)
        #
        fp = self.__getCofactorDataPath()
        tS = datetime.datetime.now().isoformat()
        # vS = datetime.datetime.now().strftime("%Y-%m-%d")
        vS = assignVersion
        ok = self.__mU.doExport(fp, {"version": vS, "created": tS, "cofactors": qD}, fmt="json", indent=3)
        return ok

    def __addLocalIds(self, cfD, crmpObj=None):
        #
        if crmpObj:
            localIdL = crmpObj.getLocalIds("CHEMBL", cfD["cofactor_id"])
            if localIdL:
                localId = localIdL[0]
                if localId.startswith("PRD_"):
                    cfD["prd_id"] = localId
                else:
                    cfD["chem_comp_id"] = localId
        return cfD

    def __activityListSelect(self, activityDL, chemCompNeighborsD, maxActivity=5):
        retL = []
        mappedNeighborL = []
        unmappedL = activityDL
        #
        if chemCompNeighborsD:
            unmappedL = []
            # Select out any cases where the molecule maps to a neighboring chemical component.
            for activityD in activityDL:
                if "chem_comp_id" in activityD and activityD["chem_comp_id"] in chemCompNeighborsD:
                    activityD["neighbor_in_pdb"] = "Y"
                    mappedNeighborL.append(activityD)
                else:
                    activityD["neighbor_in_pdb"] = "N"
                    unmappedL.append(activityD)
        #
        numLeft = maxActivity - len(mappedNeighborL)
        if numLeft > 0:
            unmappedL = sorted(unmappedL, key=lambda k: k["measurement_value"], reverse=True)
            retL = mappedNeighborL
            retL.extend(unmappedL[:numLeft])
            retL = sorted(retL, key=lambda k: k["measurement_value"], reverse=True)
        else:
            logger.debug("Mapped neighbor cofactors (%d) excluded unmapped (%d)", len(mappedNeighborL), len(unmappedL))
            retL = sorted(mappedNeighborL, key=lambda k: k["measurement_value"], reverse=True)
        return retL

    def __decodeComment(self, comment, separator="|"):
        dD = {}
        try:
            ti = iter(comment.split(separator))
            dD = {tup[1]: tup[0] for tup in zip(ti, ti)}
        except Exception:
            pass
        return dD
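
# A minimal usage sketch (not part of the original source). The cache path and the
# sequence match file path are hypothetical placeholders, and "4HHB_1" is an
# illustrative PDB entity identifier.
def _demoChemblCofactors(cachePath, seqMatchFilePath):
    cfp = ChEMBLTargetCofactorProvider(cachePath=cachePath, useCache=True)
    if not cfp.testCache(minCount=1):
        # Build the cofactor list from a prebuilt sequence match file, then reload it.
        cfp.buildCofactorList(seqMatchFilePath, maxActivity=5)
        cfp.reload()
    return cfp.getTargets("4HHB_1") if cfp.hasTarget("4HHB_1") else []
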
class DictMethodRunnerTests(unittest.TestCase):
    def setUp(self):
        self.__export = True
        self.__numProc = 2
        self.__fileLimit = 200
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        configPath = os.path.join(mockTopPath, "config",
                                  "dbload-setup-example.yml")
        configName = "site_info_configuration"
        self.__configName = configName
        self.__cfgOb = ConfigUtil(configPath=configPath,
                                  defaultSectionName=configName,
                                  mockTopPath=mockTopPath)
        self.__mU = MarshalUtil(workPath=self.__cachePath)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb,
                                        numProc=self.__numProc,
                                        fileLimit=self.__fileLimit,
                                        cachePath=self.__cachePath)
        #
        self.__testCaseList = [
            {
                "contentType": "pdbx_core",
                "mockLength": 50,
                "mergeContent": ["vrpt"]
            },
            {
                "contentType": "bird_chem_comp_core",
                "mockLength": 17,
                "mergeContent": None
            },
        ]
        #
        self.__modulePathMap = self.__cfgOb.get(
            "DICT_METHOD_HELPER_MODULE_PATH_MAP", sectionName=configName)
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                     endTime - self.__startTime)

    def __runContentType(self, contentType, mockLength, mergeContent):
        """Read and process test fixture data files from the input content type."""
        try:
            dP = DictionaryApiProviderWrapper(self.__cfgOb,
                                              self.__cachePath,
                                              useCache=True)
            dictApi = dP.getApiByName(contentType)
            rP = DictMethodResourceProvider(self.__cfgOb,
                                            configName=self.__configName,
                                            cachePath=self.__cachePath,
                                            siftsAbbreviated="TEST")
            dmh = DictMethodRunner(dictApi,
                                   modulePathMap=self.__modulePathMap,
                                   resourceProvider=rP)
            locatorObjList = self.__rpP.getLocatorObjList(
                contentType=contentType, mergeContentTypes=mergeContent)
            containerList = self.__rpP.getContainerList(locatorObjList)
            #
            logger.debug("Length of locator list %d\n", len(locatorObjList))
            self.assertGreaterEqual(len(locatorObjList), mockLength)
            for container in containerList:
                cName = container.getName()
                #
                # if cName not in ["1B5F"]:
                #    continue
                logger.debug("Processing container %s", cName)
                dmh.apply(container)
                if self.__export:
                    savePath = os.path.join(HERE, "test-output",
                                            cName + "-with-method.cif")
                    self.__mU.doExport(savePath, [container], fmt="mmcif")

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testMethodRunner(self):
        """Test method runner for multiple content types."""
        for tD in self.__testCaseList:
            self.__runContentType(tD["contentType"], tD["mockLength"],
                                  tD["mergeContent"])

    def testMethodRunnerSetup(self):
        """Test the setup methods for method runner class"""
        try:
            dP = DictionaryApiProviderWrapper(self.__cfgOb,
                                              self.__cachePath,
                                              useCache=True)
            dictApi = dP.getApiByName("pdbx")
            rP = DictMethodResourceProvider(self.__cfgOb,
                                            configName=self.__configName,
                                            cachePath=self.__cachePath,
                                            siftsAbbreviated="TEST")
            dmh = DictMethodRunner(dictApi,
                                   modulePathMap=self.__modulePathMap,
                                   resourceProvider=rP)
            ok = dmh is not None
            self.assertTrue(ok)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
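
# A minimal sketch for running the fixtures above directly via the standard unittest
# entry point (assumes the module-level imports and TOPDIR/HERE constants used above).
if __name__ == "__main__":
    unittest.main()
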
Example #3
    def assemble(self, maxRFactor=10.0):
        """Concatenate models into the input file path subject to the R value constraint.
        Relabel the models sequentially for each parent chemical component.

        Args:
            assembleModelPath (str): path for concatenated model file
            maxRFactor (float, optional): limiting R-value. Defaults to 10.0.

        Returns:
            (bool): True for success or False otherwise

        """
        dataContainerL = []
        mU = MarshalUtil(workPath=self.__cachePath)
        # combine CCDC and COD model build index files
        modelIndexD = self.__ccdcmb.fetchModelIndex()
        codD = self.__codmb.fetchModelIndex()
        for pId, mDL in codD.items():
            if pId in modelIndexD:
                modelIndexD[pId] += mDL
            else:
                modelIndexD[pId] = mDL
        #
        modelIndexD = self.__addPriorMatchDetails(modelIndexD)
        modelIndexD = self.__updateVariantDetails(modelIndexD)
        priorMapD = {}
        for _, mDL in modelIndexD.items():
            try:
                mDLS = sorted(mDL,
                              key=itemgetter("priorModelId", "variantType",
                                             "rFactor"),
                              reverse=False)
            except Exception:
                mDLS = sorted(mDL,
                              key=itemgetter("priorModelId", "variantType"),
                              reverse=False)
            numStd = 0
            matchIdD = {}
            for mD in mDLS:
                isStd = False
                if mD["variantType"].startswith("A"):
                    numStd += 1
                    isStd = True
                #
                if "rFactor" in mD and mD[
                        "rFactor"] and mD["rFactor"] > maxRFactor:
                    logger.info("Skipping model %s isStd (%r) rValue (%r)",
                                mD["modelId"], isStd, mD["rFactor"])
                    continue
                if numStd and not isStd:
                    logger.info("Skipping model %s isStd (%r) numStd (%d)",
                                mD["modelId"], isStd, numStd)
                    continue
                #
                # Exclude duplicate matches in priority order ...
                if mD["matchId"] in matchIdD:
                    logger.info("Skipping duplicate matchId %r in %r",
                                mD["matchId"], mD["modelId"])
                    continue
                #
                matchIdD[mD["matchId"]] = True

                cL = mU.doImport(mD["modelPath"], fmt="mmcif")
                logger.debug("Read %d from %s", len(cL), mD["modelPath"])
                dataContainerL.extend(cL)
                if not mD["priorModelId"].startswith("Z"):
                    priorMapD[mD["modelId"]] = (mD["priorModelId"],
                                                mD["priorMatchDate"])
        #
        logger.debug("priorMapD %r", priorMapD)
        fn = "chem_comp_models-%s.cif" % self.__getToday()
        assembleModelPath = os.path.join(self.__ccdcmb.getModelDirFilePath(),
                                         fn)
        # -- relabel
        parentModelCountD = defaultdict(int)
        priorIdLD = {}
        for dataContainer in dataContainerL:
            tModelId = dataContainer.getName()
            tId = self.__parseId(tModelId)[0]
            pId = tId.split("|")[0]
            if tModelId in priorMapD:
                pCount = self.__parseId(priorMapD[tModelId][0])[1]
                priorIdLD.setdefault(pId, []).append(pCount)
                self.__replaceModelId(dataContainer, tModelId,
                                      priorMapD[tModelId][0])
                self.__updateAuditDate(dataContainer, priorMapD[tModelId][1])
                parentModelCountD[pId] = max(priorIdLD[pId])
                logger.debug("%s current model %r prior model %r count %d",
                             pId, tModelId, priorMapD[tModelId][0],
                             parentModelCountD[pId])
            else:
                parentModelCountD[pId] += 1
                pModelId = self.__makePublicModelId(pId,
                                                    parentModelCountD[pId])
                self.__replaceModelId(dataContainer, tModelId, pModelId)

        ok = mU.doExport(assembleModelPath, dataContainerL, fmt="mmcif")
        logger.info("Assembled %d models status %r", len(dataContainerL), ok)
        self.__checkAssembledModels(assembleModelPath)
        return len(dataContainerL)
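
# A small sketch (illustrative records, not real data) of the prioritization used in
# assemble(): candidate models sort by prior model id, variant type, then R-factor,
# so previously assigned ids and standard ("A") variants with low R-factors come first.
def _demoModelPrioritySort():
    from operator import itemgetter

    mDL = [
        {"modelId": "M_2", "priorModelId": "Z_0", "variantType": "B", "rFactor": 4.0},
        {"modelId": "M_1", "priorModelId": "M_000001", "variantType": "A", "rFactor": 6.5},
    ]
    # M_1 sorts first: its prior id ("M_...") precedes the unassigned "Z_..." placeholder.
    return sorted(mDL, key=itemgetter("priorModelId", "variantType", "rFactor"))
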
Example #4
class ChemCompSearchIndexProvider(object):
    """Utilities to read and process the index of chemical component definitions search targets"""

    def __init__(self, **kwargs):
        #
        self.__cachePath = kwargs.get("cachePath", ".")
        self.__dirPath = os.path.join(self.__cachePath, "chem_comp")
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__ccFileNamePrefix = kwargs.get("ccFileNamePrefix", "cc")
        self.__searchIdx = self.__reload(**kwargs)

    def testCache(self, minCount=None, logSizes=False):
        if logSizes and self.__searchIdx:
            logger.info("searchIdxD (%.2f MB)", getObjSize(self.__searchIdx) / 1000000.0)
        if minCount:
            return bool(self.__searchIdx) and len(self.__searchIdx) >= minCount
        return self.__searchIdx is not None

    def getIndex(self):
        return self.__searchIdx

    def getIndexEntry(self, searchCcId):
        try:
            return self.__searchIdx[searchCcId]
        except Exception as e:
            logger.debug("Get index entry %r failing with %s", searchCcId, str(e))
        return None

    def getIndexFilePath(self):
        return os.path.join(self.__dirPath, "%s-search-idx-chemical-components.json" % self.__ccFileNamePrefix)

    def __reload(self, **kwargs):
        """Reload or created index of PDB chemical components.

        Args:
            cachePath (str): path to the directory containing cache files
            ccIdxFileName (str): serialized chemical component data index file name

         Returns:
            (list): chemical component data containers
        """
        #
        searchIdxD = {}
        useChemAxon = kwargs.get("useChemAxon", True)
        useCache = kwargs.get("useCache", True)
        molLimit = kwargs.get("molLimit", 0)
        numProc = kwargs.get("numProc", 1)
        maxChunkSize = kwargs.get("maxChunkSize", 20)
        limitPerceptions = kwargs.get("limitPerceptions", True)
        quietFlag = kwargs.get("quietFlag", True)
        skipObsolete = kwargs.get("skipObsolete", True)
        searchIdxFilePath = self.getIndexFilePath()
        #
        if useCache and self.__mU.exists(searchIdxFilePath):
            _, fExt = os.path.splitext(searchIdxFilePath)
            searchIdxFormat = "json" if fExt == ".json" else "pickle"
            rdCcIdxD = self.__mU.doImport(searchIdxFilePath, fmt=searchIdxFormat)
            searchIdxD = {k: rdCcIdxD[k] for k in sorted(rdCcIdxD.keys())[:molLimit]} if molLimit else rdCcIdxD
        else:
            cmpKwargs = {k: v for k, v in kwargs.items() if k not in ["cachePath", "useCache", "molLimit"]}
            ccmP = ChemCompMoleculeProvider(cachePath=self.__cachePath, useCache=True, molLimit=molLimit, skipObsolete=skipObsolete, **cmpKwargs)
            ok1 = ccmP.testCache(minCount=molLimit, logSizes=True)
            #
            descrD = {}
            ok2 = True
            if useChemAxon:
                caxP = ChemAxonDescriptorProvider(cachePath=self.__cachePath, useCache=True, **cmpKwargs)
                ok2 = caxP.testCache(minCount=molLimit)
                descrD = caxP.getDescriptorIndex()
            #
            if ok1 and ok2:
                searchIdxD = self.__updateChemCompSearchIndex(ccmP.getMolD(), descrD, searchIdxFilePath, molLimit, limitPerceptions, numProc, maxChunkSize, quietFlag)
                logger.info("Storing %s with data for %d search candidates (status=%r)", searchIdxFilePath, len(searchIdxD), ok1 and ok2)
        # logger.info("Using Chemaxon descriptors for (%d) components", descrD)
        #
        for idxD in searchIdxD.values():
            idxD["atom-types"] = set(idxD["type-counts"].keys()) if "type-counts" in idxD else set()

        return searchIdxD

    def __updateChemCompSearchIndex(self, ccObjD, descrD, filePath, molLimit, limitPerceptions, numProc, maxChunkSize, quietFlag):
        searchIdxD = {}
        try:
            # Serialized index of chemical component search targets
            startTime = time.time()
            _, fExt = os.path.splitext(filePath)
            fileFormat = "json" if fExt == ".json" else "pickle"
            if numProc <= 1:
                searchIdxD = self.__buildChemCompSearchIndex(ccObjD, descrD, limitPerceptions=limitPerceptions, molLimit=molLimit)
            else:
                searchIdxD = self.__buildChemCompSearchIndexMulti(
                    ccObjD, descrD, limitPerceptions=limitPerceptions, molLimit=molLimit, numProc=numProc, maxChunkSize=maxChunkSize, quietFlag=quietFlag
                )

            ok = self.__mU.doExport(filePath, searchIdxD, fmt=fileFormat)
            endTime = time.time()
            logger.info("Storing %s (%s) with %d search definitions (status=%r) (%.4f seconds)", filePath, fileFormat, len(searchIdxD), ok, endTime - startTime)
        #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        return searchIdxD

    def __buildChemCompSearchIndex(self, ccObjD, descrD, limitPerceptions=False, molLimit=None):
        """Internal method return a dictionary of extracted chemical component descriptors and formula."""
        rD = {}
        try:
            for ii, ccId in enumerate(ccObjD, 1):
                if molLimit and ii > molLimit:
                    break
                # ----
                oemf = OeMoleculeFactory()
                oemf.setQuiet()
                tId = oemf.setChemCompDef(ccObjD[ccId])
                if tId != ccId:
                    logger.error("%s chemical component definition import error", ccId)
                # ----
                oemf.clearExternalDescriptors()
                for smi in descrD.get(ccId, []):
                    oemf.addExternalDescriptor("smiles", smi, "chemaxon-smiles")
                # ----
                smiD = oemf.buildRelated(limitPerceptions=limitPerceptions)
                logger.debug("%s related molecular forms %d", ccId, len(smiD))
                rD.update(smiD)
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return rD

    def __buildChemCompSearchIndexMulti(self, ccObjD, descrD, limitPerceptions=False, molLimit=None, numProc=2, maxChunkSize=20, quietFlag=False):
        #
        ccIdList = sorted(ccObjD.keys())[:molLimit] if molLimit else sorted(ccObjD.keys())
        logger.info("Input definition length %d numProc %d limitPerceptions %r", len(ccIdList), numProc, limitPerceptions)
        #
        rWorker = ChemCompSearchIndexWorker(ccObjD)
        # mpu = MultiProcPoolUtil(verbose=True)
        mpu = MultiProcUtil(verbose=True)
        optD = {"maxChunkSize": maxChunkSize, "limitPerceptions": limitPerceptions, "quietFlag": quietFlag, "descrD": descrD}
        mpu.setOptions(optD)
        mpu.set(workerObj=rWorker, workerMethod="buildRelatedList")
        ok, failList, resultList, _ = mpu.runMulti(dataList=ccIdList, numProc=numProc, numResults=1, chunkSize=maxChunkSize)
        if failList:
            logger.info("Index definitions with failures (%d): %r", len(failList), failList)
        logger.info("Multi-proc status %r failures %r result length %r", ok, len(failList), len(resultList[0]))
        # JDW
        rD = {vD["name"]: vD for vD in resultList[0]}
        return rD

    def matchMolecularFormulaRange(self, typeRangeD, matchSubset=False):
        """Find matching formula for the input atom type range query (evaluates min <= ff <= max).

        Args:
            typeRangeD (dict): dictionary of element ranges {'<element_name>': {'min': <int>, 'max': <int>}}
            matchSubset (bool, optional): test for formula subset (default: False)

        Returns:
            (list):  chemical component identifiers with matching formula (MatchResults)
        """
        rL = []
        try:
            if not typeRangeD:
                return rL
            myTypeRangeD = {k.upper(): v for k, v in typeRangeD.items()}
            queryTypeS = set(myTypeRangeD.keys())
            for ccId, idxD in self.__searchIdx.items():
                tD = idxD["type-counts"]
                # targetTypeS = set(tD.keys())
                if not matchSubset and idxD["atom-types"] != queryTypeS:
                    continue
                #
                if not queryTypeS.issubset(idxD["atom-types"]):
                    continue
                match = True
                for atomType, rangeD in myTypeRangeD.items():
                    try:
                        if ("min" in rangeD and rangeD["min"] > tD[atomType]) or ("max" in rangeD and rangeD["max"] < tD[atomType]):
                            match = False
                            break
                    except Exception:
                        match = False
                        break
                if match:
                    # logger.info("%s formula %r query %r", ccId, idxD["type-counts"], typeRangeD)
                    rL.append(MatchResults(ccId=ccId, searchType="formula", formula=idxD["formula"]))
        except Exception as e:
            logger.exception("Failing for %r with %s", typeRangeD, str(e))
        return rL

    def filterMinimumMolecularFormula(self, typeCountD):
        """Find molecules with the minimum formula composition for the input atom type query (evaluates min <= ff).

        Args:
            typeCountD (dict): dictionary of element minimum values {'<element_name>': <int>}

        Returns:
            (list):  chemical component identifiers
        """
        rL = []
        try:
            if not typeCountD:
                return list(self.__searchIdx.keys())

            queryTypeS = set(typeCountD.keys())
            for ccId, idxD in self.__searchIdx.items():
                tD = idxD["type-counts"]
                if not queryTypeS.issubset(tD):
                    continue
                match = True
                for atomType, minCount in typeCountD.items():
                    try:
                        if minCount > tD[atomType]:
                            match = False
                            break
                    except Exception:
                        match = False
                        break
                if match:
                    rL.append(ccId)
        except Exception as e:
            logger.exception("Failing for %r with %s", typeCountD, str(e))
        return rL

    def filterMinimumFormulaAndFeatures(self, typeCountD, featureCountD):
        """Find molecules with the minimum formula and feature composition.

        Args:
            typeCountD (dict): dictionary of element minimum values {'<element_name>': <int>}
            featureCountD (dict): dictionary of feature minimum values {'<feature_name>': <int>}

        Returns:
            (list):  chemical component identifiers
        """
        rL = []
        try:
            if not typeCountD or not featureCountD:
                return list(self.__searchIdx.keys())
            # ----
            featureQueryS = set(featureCountD.keys())
            typeQueryS = set(typeCountD.keys())
            #
            for ccId, idxD in self.__searchIdx.items():
                tD = idxD["type-counts"]
                fD = idxD["feature-counts"]
                #
                if not typeQueryS.issubset(tD) or not featureQueryS.issubset(fD):
                    continue

                match = True
                for atomType, minCount in typeCountD.items():
                    try:
                        if minCount > tD[atomType]:
                            match = False
                            break
                    except Exception:
                        match = False
                        break

                if not match:
                    continue
                #
                for featureType, minCount in featureCountD.items():
                    try:
                        if minCount > fD[featureType]:
                            match = False
                            break
                    except Exception:
                        match = False
                        break
                #
                if match:
                    rL.append(ccId)
        except Exception as e:
            logger.exception("Failing for %r with %s", typeCountD, str(e))
        return rL
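
# A hedged usage sketch: query the search index for components whose formula falls in
# an element-count range. The cache path and ranges are illustrative; matchSubset=True
# admits formulas containing element types beyond those queried.
def _demoFormulaRangeQuery(cachePath="."):
    ccsiP = ChemCompSearchIndexProvider(cachePath=cachePath, useCache=True)
    if not ccsiP.testCache(minCount=1):
        return []
    typeRangeD = {"C": {"min": 9, "max": 12}, "N": {"min": 1, "max": 4}}
    return ccsiP.matchMolecularFormulaRange(typeRangeD, matchSubset=True)
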
Example #5
class ChemCompDepictWrapper(SingletonClass):
    """Wrapper for chemical component depiction operations."""
    def __init__(self):
        self.__startTime = time.time()
        # ---
        self.__workPath = "."
        self.__mU = MarshalUtil(workPath=self.__workPath)
        self.__configD = None
        self.__cachePath = None
        # ---
        self.__statusDescriptorError = -100
        self.__searchError = -200
        self.__searchSuccess = 0
        self.__imageCount = 0

    def readConfig(self, resetImagePath=True):
        #
        ok = False
        try:
            self.__cachePath = os.environ.get("CHEM_DEPICT_CACHE_PATH", ".")
            configFileName = os.environ.get("CHEM_DEPICT_CONFIG_FILE_NAME",
                                            "depict-config.json")
            #
            configFilePath = os.path.join(self.__cachePath, "config",
                                          configFileName)
            configD = {}
            if self.__mU.exists(configFilePath):
                configD = self.__mU.doImport(configFilePath, fmt="json")
            logger.debug("configD: %r", configD)
            if configD and (len(configD) >= 2) and float(configD["versionNumber"]) > 0.1:
                logger.info("Read version %r sections %r from %s",
                            configD["versionNumber"], list(configD.keys()),
                            configFilePath)
                ok = True
                #
                if resetImagePath:
                    # Allow the configuration to be relocatable.
                    tS = configD["imageDir"] if "imageDir" in configD else "images"
                    configD["imageDirPath"] = os.path.join(self.__cachePath, tS)
                    configD["versionNumber"] = "0.2"
            else:
                # Handle missing config for now
                configD["imageDir"] = "images"
                configD["imageDirPath"] = os.path.join(self.__cachePath,
                                                       configD["imageDir"])
                logger.warning("Reading config file fails from path %r",
                               configFilePath)
                logger.warning("Using config %r", configD)
                ok = True
            #
            self.__configD = configD
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            ok = False
        return ok

    def setConfig(self, cachePath, **kwargs):
        """Provide dependencies for rebuilding depict file dependencies.

        Args:
            cachePath (str): path to cache data files.

            Other options are propagated to configurations of the wrapped classes in __bootstrapDepictConfig()

        """
        self.__configD = self.__makeBootstrapDepictConfig(cachePath, **kwargs)
        return len(self.__configD) >= 2

    def __makeBootstrapDepictConfig(self, cachePath, **kwargs):
        """Create depict configuration bootstrap file"""
        configD = {}
        try:
            storeConfig = kwargs.get("storeConfig", True)
            os.environ["CHEM_DEPICT_CACHE_PATH"] = os.path.join(cachePath)
            configDirPath = os.path.join(cachePath, "config")
            configFilePath = os.path.join(configDirPath, "depict-config.json")
            #
            logger.info("Updating depict configuration using %s",
                        configFilePath)
            #
            imageDirPath = os.path.join(cachePath, "images")
            self.__mU.mkdir(imageDirPath)
            configD = {"versionNumber": 0.20, "imageDir": "images"}
            if storeConfig:
                self.__mU.mkdir(configDirPath)
                self.__mU.doExport(configFilePath,
                                   configD,
                                   fmt="json",
                                   indent=3)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return configD
        #

    def setImageCount(self, imageCount):
        self.__imageCount = imageCount

    def getImageCount(self):
        return self.__imageCount

    def __makeImagePath(self):
        imageDirPath = self.__configD["imageDirPath"] if self.__configD and "imageDirPath" in self.__configD else "."
        fileRotateIncrement = self.__configD["fileRotateIncrement"] if self.__configD and "fileRotateIncrement" in self.__configD else 50
        ic = self.__imageCount % fileRotateIncrement
        imagePath = os.path.join(imageDirPath, "image-%s.svg" % ic)
        return imagePath

    def depictMolecule(self,
                       identifier,
                       identifierType,
                       imagePath=None,
                       **kwargs):
        """Create depiction from InChI, SMILES descriptors or PDB identifier."""
        try:
            imagePath = imagePath if imagePath else self.__makeImagePath()
            oeio = OeIoUtils()
            if identifierType.lower() in ["smiles"]:
                oeMol = oeio.smilesToMol(identifier)
            elif identifierType.lower() in ["inchi"]:
                oeMol = oeio.inchiToMol(identifier)
            elif identifierType.lower() in ["identifierpdb"]:
                ccsw = ChemCompSearchWrapper()
                oesmP = ccsw.getSearchMoleculeProvider()
                oeMol = oesmP.getMol(identifier)
            #
            ok = self.__depictOne(oeMol, imagePath, **kwargs)
            return imagePath if ok else None
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return None

    def __depictOne(self, oeMol, imagePath, **kwargs):
        """Single

        Args:
            oeMol (object): instance of an OE graph molecule
            imagePath (string): file path for image

        Returns:
            bool: True for success or False otherwise
        """
        try:
            title = kwargs.get("title", None)
            oed = OeDepict()
            oed.setMolTitleList([("Target", oeMol, title)])

            # ---
            bondDisplayWidth = 10.0
            numAtoms = oeMol.NumAtoms()
            if numAtoms > 100 and numAtoms <= 200:
                bondDisplayWidth = 6.0
            elif numAtoms > 200:
                bondDisplayWidth = 4.0
            # ---
            oed.setDisplayOptions(
                imageSizeX=kwargs.get("imageSizeX", 2500),
                imageSizeY=kwargs.get("imageSizeX", 2500),
                labelAtomName=kwargs.get("labelAtomName", False),
                labelAtomCIPStereo=kwargs.get("labelAtomCIPStereo", True),
                labelAtomIndex=kwargs.get("labelAtomIndex", False),
                labelBondIndex=kwargs.get("labelBondIndex", False),
                labelBondCIPStereo=kwargs.get("labelBondCIPStereo", True),
                cellBorders=kwargs.get("cellBorders", True),
                bondDisplayWidth=bondDisplayWidth,
            )
            oed.setGridOptions(rows=1, cols=1, cellBorders=False)
            oed.prepare()
            oed.write(imagePath)
            self.__imageCount += 1
            return True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False

    def status(self):
        unitS = "MB" if platform.system() == "Darwin" else "GB"
        rusageMax = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        logger.info("Maximum resident memory size %.4f %s", rusageMax / 10**6,
                    unitS)
        endTime = time.time()
        logger.info("Status at %s (up %.4f seconds)",
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    endTime - self.__startTime)

    def alignMoleculePair(self,
                          refIdentifier,
                          refIdentifierType,
                          fitIdentifier,
                          fitIdentifierType,
                          imagePath=None,
                          **kwargs):
        """Create aligned depiction for a target molecule InChI, SMILES descriptors or PDB identifier."""
        try:
            imagePath = imagePath if imagePath else self.__makeImagePath()
            oeio = OeIoUtils()
            ccsw = ChemCompSearchWrapper()
            oesmP = ccsw.getSearchMoleculeProvider()
            # ---
            if refIdentifierType.lower() in ["smiles"]:
                oeMolRef = oeio.smilesToMol(refIdentifier)
            elif refIdentifierType.lower() in ["inchi"]:
                oeMolRef = oeio.inchiToMol(refIdentifier)
            elif refIdentifierType.lower() in ["identifierpdb"]:
                oeMolRef = oesmP.getMol(refIdentifier)
            #
            if fitIdentifierType.lower() in ["smiles"]:
                oeMolFit = oeio.smilesToMol(fitIdentifier)
            elif fitIdentifierType.lower() in ["inchi"]:
                oeMolFit = oeio.inchiToMol(fitIdentifier)
            elif fitIdentifierType.lower() in ["identifierpdb"]:
                oeMolFit = oesmP.getMol(fitIdentifier)
            # ---
            logger.info("oeMolRef atoms %r", oeMolRef.NumAtoms())
            logger.info("oeMolFit atoms %r", oeMolFit.NumAtoms())

            displayIdRef = "Ref"
            displayIdFit = "Fit"
            ok = self.__depictAlignedPair(oeMolRef, displayIdRef, oeMolFit,
                                          displayIdFit, imagePath, **kwargs)
            return imagePath if ok else None
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return None

    def __depictAlignedPair(self, oeMolRef, displayIdRef, oeMolFit,
                            displayIdFit, imagePath, **kwargs):
        """Depict pairwise MCSS alignment"""
        try:
            #
            oed = OeDepictMCSAlignPage()
            oed.setSearchType(sType="relaxed")
            #
            oed.setRefMol(oeMolRef, displayIdRef)
            oed.setFitMol(oeMolFit, displayIdFit)
            #
            # imagePath = self.__makeImagePath()
            # ---
            bondDisplayWidth = 10.0
            numAtomsRef = oeMolRef.NumAtoms()
            if numAtomsRef > 100 and numAtomsRef <= 200:
                bondDisplayWidth = 6.0
            elif numAtomsRef > 200:
                bondDisplayWidth = 4.0
            # ---
            oed.setDisplayOptions(
                imageSizeX=kwargs.get("imageSizeX", 2500),
                imageSizeY=kwargs.get("imageSizeX", 2500),
                labelAtomName=kwargs.get("labelAtomName", False),
                labelAtomCIPStereo=kwargs.get("labelAtomCIPStereo", True),
                labelAtomIndex=kwargs.get("labelAtomIndex", False),
                labelBondIndex=kwargs.get("labelBondIndex", False),
                labelBondCIPStereo=kwargs.get("labelBondCIPStereo", True),
                cellBorders=kwargs.get("cellBorders", True),
                bondDisplayWidth=bondDisplayWidth,
                highlightStyleFit=kwargs.get("highlightStyleFit",
                                             "ballAndStickInverse"),
            )
            #
            aML = oed.alignPair(imagePath=imagePath)
            logger.info("Aligned atom count %d", len(aML))
            #
            # self.assertGreater(len(aML), 1)
            # if aML:
            #    for (rCC, rAt, tCC, tAt) in aML:
            #        logger.debug("%5s %-5s %5s %-5s", rCC, rAt, tCC, tAt)
            return True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False

    def toMolFile(self,
                  identifier,
                  identifierType,
                  molfilePath=None,
                  fmt="mol",
                  **kwargs):
        """Create molfile (fmt) from InChI, SMILES descriptors or PDB identifier."""
        try:
            molfilePath = molfilePath if molfilePath else self.__makeMolfilePath(fmt=fmt)
            oeio = OeIoUtils()
            if identifierType.lower() in ["smiles"]:
                oeMol = oeio.smilesToMol(identifier)
                oeMol.SetTitle("From SMILES")
            elif identifierType.lower() in ["inchi"]:
                oeMol = oeio.inchiToMol(identifier)
                oeMol.SetTitle("From InChI")
            elif identifierType.lower() in ["identifierpdb"]:
                ccsw = ChemCompSearchWrapper()
                oesmP = ccsw.getSearchMoleculeProvider()
                oeMol = oesmP.getMol(identifier)
            #
            ok = self.__toMolFile(oeMol, molfilePath, **kwargs)
            return molfilePath if ok else None
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return None

    def __toMolFile(self, oeMol, molfilePath, **kwargs):
        """Write the

        Args:
            oeMol (object): instance of an OE graph molecule
            molfilePath (string): file path for molfile (type determined by extension)

        Returns:
            bool: True for success or False otherwise
        """
        try:
            _ = kwargs
            oeio = OeIoUtils()
            oeio.write(molfilePath, oeMol, constantMol=True)
            return True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False

    def __makeMolfilePath(self, fmt="mol"):
        imageDirPath = self.__configD[
            "imageDirPath"] if self.__configD and "imageDirPath" in self.__configD else "."
        fileRotateIncrement = self.__configD[
            "fileRotateIncrement"] if self.__configD and "fileRotateIncrement" in self.__configD else 50
        ic = self.__imageCount % fileRotateIncrement
        molPath = os.path.join(imageDirPath, "molfile-%s.%s" % (ic, fmt))
        return molPath
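
# A hedged usage sketch: configure the depiction wrapper from its cache/config
# environment and render a SMILES string. The SMILES string and output path are
# illustrative placeholders.
def _demoDepiction():
    ccdw = ChemCompDepictWrapper()
    ccdw.readConfig()
    return ccdw.depictMolecule("c1ccccc1O", "smiles", imagePath="./phenol.svg", labelAtomName=True)
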
Example #6
class MarshalUtilTests(unittest.TestCase):
    def setUp(self):
        self.__verbose = True
        self.__pathPdbxDictionaryFile = os.path.join(TOPDIR, "rcsb",
                                                     "mock-data",
                                                     "dictionaries",
                                                     "mmcif_pdbx_v5_next.dic")
        self.__pathJsonTestFile = os.path.join(TOPDIR, "rcsb", "mock-data",
                                               "dictionaries",
                                               "vrpt_dictmap.json")
        self.__pathIndexFile = os.path.join(TOPDIR, "rcsb", "mock-data",
                                            "MOCK_EXCHANGE_SANDBOX",
                                            "update-lists", "all-pdb-list")
        self.__pathCifFile = os.path.join(TOPDIR, "rcsb", "mock-data",
                                          "MOCK_BIRD_CC_REPO", "0",
                                          "PRDCC_000010.cif")
        #
        self.__workPath = os.path.join(HERE, "test-output")
        self.__pathSaveDictionaryFile = os.path.join(self.__workPath,
                                                     "mmcif_pdbx_v5_next.dic")
        self.__pathSaveJsonTestFile = os.path.join(self.__workPath,
                                                   "json-content.json")
        self.__pathSaveIndexFile = os.path.join(self.__workPath,
                                                "all-pdb-list")
        self.__pathSaveCifFile = os.path.join(self.__workPath,
                                              "cif-content.cif")
        #
        self.__pathFastaFile = os.path.join(TOPDIR, "rcsb", "mock-data",
                                            "MOCK_EXCHANGE_SANDBOX",
                                            "sequence",
                                            "pdb_seq_prerelease.fasta")
        self.__pathSaveFastaFile = os.path.join(self.__workPath,
                                                "test-pre-release.fasta")
        #

        self.__urlTarget = "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz"
        self.__urlTargetBad = "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump-missing.tar.gz"
        #
        self.__mU = MarshalUtil()
        self.__startTime = time.time()
        logger.debug("Running tests on version %s", __version__)
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                     endTime - self.__startTime)

    def testReadWriteInParts(self):
        """Test the case reading and writing in parts."""
        try:
            lenL = 12013
            aL = [100, 200, 300, 400, 500]
            dL = [aL for ii in range(lenL)]
            numParts = 4
            sPath = os.path.join(self.__workPath, "list-m-data.json")
            ok = self.__mU.doExport(sPath,
                                    dL,
                                    numParts=numParts,
                                    fmt="json",
                                    indent=3)
            #
            self.assertTrue(ok)
            rL = self.__mU.doImport(sPath, numParts=numParts, fmt="json")
            logger.info("Reading %d parts with total length %d", numParts,
                        len(rL))
            self.assertEqual(dL, rL)
            #
            lenD = 23411
            qD = OrderedDict([("a", 100), ("b", 100), ("c", 100)])
            dD = OrderedDict([(str(ii), qD) for ii in range(lenD)])
            numParts = 4
            sPath = os.path.join(self.__workPath, "dict-m-data.json")
            ok = self.__mU.doExport(sPath,
                                    dD,
                                    numParts=numParts,
                                    fmt="json",
                                    indent=3)
            self.assertTrue(ok)
            rD = self.__mU.doImport(sPath, numParts=numParts, fmt="json")
            logger.info("Reading %d parts with total length %d", numParts,
                        len(rD))
            self.assertEqual(dD, rD)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testReadDictionaryFile(self):
        """Test the case read PDBx/mmCIF dictionary text file"""
        try:
            cL = self.__mU.doImport(self.__pathPdbxDictionaryFile,
                                    fmt="mmcif-dict")
            logger.debug("Dictionary container list %d", len(cL))
            self.assertGreaterEqual(len(cL), 1)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testReadCifFile(self):
        """Test the case read PDBx/mmCIF text file"""
        try:
            cL = self.__mU.doImport(self.__pathCifFile, fmt="mmcif")
            logger.debug("Container list %d", len(cL))
            self.assertGreaterEqual(len(cL), 1)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testReadListFile(self):
        """Test the case read list text file"""
        try:
            cL = self.__mU.doImport(self.__pathIndexFile, fmt="list")
            logger.debug("List length %d", len(cL))
            self.assertGreaterEqual(len(cL), 1000)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testReadJsonFile(self):
        """Test the case read JSON file"""
        try:
            rObj = self.__mU.doImport(self.__pathJsonTestFile, fmt="json")
            logger.debug("Object length %d", len(rObj))
            self.assertGreaterEqual(len(rObj), 1)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testReadWriteDictionaryFiles(self):
        """Test the case read and write PDBx/mmCIF dictionary text file"""
        try:
            cL = self.__mU.doImport(self.__pathPdbxDictionaryFile,
                                    fmt="mmcif-dict")
            logger.debug("Dictionary container list %d", len(cL))
            self.assertGreaterEqual(len(cL), 1)
            ok = self.__mU.doExport(self.__pathSaveDictionaryFile,
                                    cL,
                                    fmt="mmcif-dict")
            self.assertTrue(ok)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testReadWriteCifFile(self):
        """Test the case read and write PDBx/mmCIF text file"""
        try:
            cL = self.__mU.doImport(self.__pathCifFile, fmt="mmcif")
            logger.debug("Container list %d", len(cL))
            self.assertGreaterEqual(len(cL), 1)
            ok = self.__mU.doExport(self.__pathSaveCifFile, cL, fmt="mmcif")
            self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testReadWriteJsonFile(self):
        """Test the case read and write JSON file"""
        try:
            rObj = self.__mU.doImport(self.__pathJsonTestFile, fmt="json")
            logger.debug("Object length %d", len(rObj))
            self.assertGreaterEqual(len(rObj), 1)
            ok = self.__mU.doExport(self.__pathSaveJsonTestFile,
                                    rObj,
                                    fmt="json")
            self.assertTrue(ok)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testReadWriteListFile(self):
        """Test the case read and write list text file"""
        try:
            cL = self.__mU.doImport(self.__pathIndexFile, fmt="list")
            logger.debug("List element %r length %d", cL[0], len(cL))
            count = 0
            for cV in cL:
                fields = cV.split()
                count += len(fields)
            _ = count
            self.assertGreaterEqual(len(cL), 1000)
            ok = self.__mU.doExport(self.__pathSaveIndexFile, cL, fmt="list")
            self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testReadWriteFastaFile(self):
        """Test the case read and write FASTA sequence file"""
        try:
            sD = self.__mU.doImport(self.__pathFastaFile,
                                    fmt="fasta",
                                    commentStyle="prerelease")
            logger.debug("Sequence length %d", len(sD))
            self.assertGreaterEqual(len(sD), 500)
            ok = self.__mU.doExport(self.__pathSaveFastaFile, sD, fmt="fasta")
            self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testReadUrlTarfile(self):
        """Test the case to read URL target and extract a member"""
        try:
            mU = MarshalUtil(workPath=self.__workPath)
            _, fn = os.path.split(self.__urlTarget)
            #
            nmL = mU.doImport(self.__urlTarget,
                              fmt="tdd",
                              rowFormat="list",
                              tarMember="names.dmp")
            self.assertGreater(len(nmL), 2000000)
            logger.info("Names %d", len(nmL))
            ndL = mU.doImport(os.path.join(self.__workPath, fn),
                              fmt="tdd",
                              rowFormat="list",
                              tarMember="nodes.dmp")
            self.assertGreater(len(ndL), 2000000)
            logger.info("Nodes %d", len(ndL))
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testReadUrlTddfile(self):
        """Test the case to read URL target of a tdd"""
        try:
            mU = MarshalUtil(workPath=self.__workPath)
            version = "2.07-2019-07-23"
            urlTarget = "http://scop.berkeley.edu/downloads/update"
            encoding = "utf-8-sig" if sys.version_info[0] > 2 else "ascii"
            fn = "dir.des.scope.%s.txt" % version
            url = os.path.join(urlTarget, fn)
            logger.info("Fetch url %r", url)
            desL = mU.doImport(url,
                               fmt="tdd",
                               rowFormat="list",
                               uncomment=True,
                               encoding=encoding)
            logger.info("Fetched URL is %s len %d", url, len(desL))
            self.assertGreater(len(desL), 100)
            logger.info("Lines %d", len(desL))
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testReadUrlTarfileFail(self):
        """Test the case to read URL target and extract a member (failing case)"""
        try:
            mU = MarshalUtil(workPath=self.__workPath)
            rL = mU.doImport(self.__urlTargetBad,
                             fmt="tdd",
                             rowFormat="list",
                             tarMember="names.dmp")
            logger.info("Return is %r", rL)
            self.assertEqual(len(rL), 0)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
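
# A minimal MarshalUtil round-trip sketch distilled from the tests above
# (paths are illustrative; assumes the rcsb.utils.io MarshalUtil API exercised here):
#
#   mU = MarshalUtil(workPath="./work")
#   rObj = mU.doImport("./data/example.json", fmt="json")
#   ok = mU.doExport("./work/example-out.json", rObj, fmt="json")
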
class PharosTargetProvider(StashableBase):
    """Accessors for Pharos target assignments."""

    def __init__(self, **kwargs):
        #
        self.__cachePath = kwargs.get("cachePath", ".")
        self.__dirName = "Pharos-targets"
        super(PharosTargetProvider, self).__init__(self.__cachePath, [self.__dirName])
        self.__dirPath = os.path.join(self.__cachePath, self.__dirName)
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        reloadDb = kwargs.get("reloadDb", False)
        fromDb = kwargs.get("fromDb", False)
        useCache = kwargs.get("useCache", False)
        pharosDumpUrl = kwargs.get("pharosDumpUrl", None)
        mysqlUser = kwargs.get("mysqlUser", None)
        mysqlPassword = kwargs.get("mysqlPassword", None)
        self.__version = None
        if reloadDb or fromDb:
            self.__reload(self.__dirPath, reloadDb=reloadDb, fromDb=fromDb, useCache=useCache, pharosDumpUrl=pharosDumpUrl, mysqlUser=mysqlUser, mysqlPassword=mysqlPassword)
        #

    def testCache(self):
        return True

    def getVersion(self):
        return self.__version

    def __reload(self, dirPath, reloadDb=False, fromDb=False, useCache=False, pharosDumpUrl=None, mysqlUser=None, mysqlPassword=None):
        startTime = time.time()
        pharosSelectedTables = ["drug_activity", "cmpd_activity", "target", "protein", "t2tc"]
        pharosDumpUrl = pharosDumpUrl if pharosDumpUrl else "http://juniper.health.unm.edu/tcrd/download/latest.sql.gz"
        pharosReadmeUrl = "http://juniper.health.unm.edu/tcrd/download/latest.README"
        ok = False
        fU = FileUtil()
        pharosDumpFileName = fU.getFileName(pharosDumpUrl)
        pharosDumpPath = os.path.join(dirPath, pharosDumpFileName)
        pharosUpdatePath = os.path.join(dirPath, "pharos-update.sql")
        pharosReadmePath = os.path.join(dirPath, "pharos-readme.txt")
        logPath = os.path.join(dirPath, "pharosLoad.log")
        #
        fU.mkdir(dirPath)
        #

        exU = ExecUtils()
        #
        if reloadDb:
            logger.info("useCache %r pharosDumpPath %r", useCache, pharosDumpPath)
            if useCache and self.__mU.exists(pharosDumpPath):
                ok = True
            else:
                logger.info("Fetching url %s path %s", pharosDumpUrl, pharosDumpPath)
                ok1 = fU.get(pharosDumpUrl, pharosDumpPath)
                ok2 = fU.get(pharosReadmeUrl, pharosReadmePath)
                logger.info("Completed fetch (%r) at %s (%.4f seconds)", ok1 and ok2, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime)
            # ---
            readmeLines = self.__mU.doImport(pharosReadmePath, fmt="list")
            self.__version = readmeLines[0].split(" ")[1][1:] if readmeLines else "6"
            # ---
            logger.info("Filtering SQL dump %r for selected tables %r", pharosDumpFileName, pharosSelectedTables)
            doWrite = True
            # Note: the pharos dump file latest.sql.gz is not gzipped
            with open(pharosDumpPath, "r", encoding="utf-8") as ifh, open(pharosUpdatePath, "w", encoding="utf-8") as ofh:
                for line in ifh:
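                    # Table sections in the dump open with lines of the form:
                    #   -- Table structure for table `target`
                    # so the last whitespace-delimited token carries the table
                    # name; the slice below strips the backquotes and newline.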
                    if line.startswith("-- Table structure for table"):
                        tN = line.split(" ")[-1][1:-2]
                        doWrite = True if tN in pharosSelectedTables else False
                    if doWrite:
                        ofh.write(line)
            # ---
            ok = exU.run(
                "mysql",
                execArgList=["-v", "-u", mysqlUser, "--password=%s" % mysqlPassword, "-e", "create database if not exists tcrd6;"],
                outPath=logPath,
                outAppend=False,
                timeOut=None,
            )
            # ok = exU.run(
            #     "mysql",
            #     execArgList=["-u", mysqlUser, "--password=%s" % mysqlPassword, "tcrd6"],
            #     outPath=logPath,
            #     inpPath=pharosDumpPath,
            #     outAppend=True,
            #     timeOut=None,
            # )
            shellCmd = 'trap "" SIGHUP SIGINT SIGTERM; nohup mysql -u %s --password=%s tcrd6 < %s >& %s' % (mysqlUser, mysqlPassword, pharosUpdatePath, logPath)
            ok = exU.runShell(
                shellCmd,
                outPath=None,
                inpPath=None,
                outAppend=True,
                timeOut=None,
            )
            logger.info("SQL dump restore status %r", ok)
        # --
        if fromDb:
            for tbl in pharosSelectedTables:
                outPath = os.path.join(dirPath, "%s.tdd" % tbl)
                # if useCache and self.__mU.exists(outPath):
                #   continue
                ok = exU.run(
                    "mysql",
                    execArgList=["-u", mysqlUser, "--password=%s" % mysqlPassword, "-e", "use tcrd6; select * from %s;" % tbl],
                    outPath=outPath,
                    outAppend=False,
                    timeOut=None,
                    suppressStderr=True,
                )
                logger.info("SQL table %s export status %r", tbl, ok)
        return ok

    def exportProteinFasta(self, fastaPath, taxonPath, addTaxonomy=False):
        """Export Pharos protein sequences to FASTA (optionally with NCBI taxonomy mapping)."""
        ok = False
        try:
            proteinFilePath = os.path.join(self.__dirPath, "protein.tdd")
            pDL = self.__mU.doImport(proteinFilePath, fmt="tdd", rowFormat="dict")
            fD = {}
            taxonL = []
            if addTaxonomy:
                umP = UniProtIdMappingProvider(self.__cachePath)
                umP.reload(useCache=True)
                #
                for pD in pDL:
                    unpId = pD["uniprot"]
                    proteinId = pD["id"]
                    seq = pD["seq"]
                    taxId = umP.getMappedId(unpId, mapName="NCBI-taxon")
                    taxId = taxId if taxId else "-1"
                    cD = {"sequence": seq, "uniprotId": unpId, "proteinId": proteinId, "taxId": taxId}
                    seqId = ""
                    cL = []
                    for k, v in cD.items():
                        if k in ["sequence"]:
                            continue
                        cL.append(str(v))
                        cL.append(str(k))
                    seqId = "|".join(cL)
                    fD[seqId] = cD
                    taxonL.append("%s\t%s" % (seqId, taxId))
                ok = self.__mU.doExport(taxonPath, taxonL, fmt="list")
            else:
                for pD in pDL:
                    unpId = pD["uniprot"]
                    proteinId = pD["id"]
                    seq = pD["seq"]
                    cD = {"sequence": seq, "uniprotId": unpId, "proteinId": proteinId}
                    seqId = ""
                    cL = []
                    for k, v in cD.items():
                        if k in ["sequence"]:
                            continue
                        cL.append(str(v))
                        cL.append(str(k))
                    seqId = "|".join(cL)
                    fD[seqId] = cD
            #
            logger.info("Writing %d pharos targets to %s", len(fD), fastaPath)
            ok = self.__mU.doExport(fastaPath, fD, fmt="fasta", makeComment=True)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok
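
# A hedged end-to-end sketch (credentials and paths are illustrative; the
# reloadDb/fromDb steps require a local MySQL server and substantial disk space):
#
#   ptP = PharosTargetProvider(cachePath="./CACHE", reloadDb=True, fromDb=True,
#                              mysqlUser="user", mysqlPassword="pass")
#   ok = ptP.exportProteinFasta("./pharos-targets.fa", "./pharos-taxon.tdd",
#                               addTaxonomy=False)
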
class RepositoryProvider(object):
    def __init__(self,
                 cfgOb,
                 cachePath=None,
                 numProc=8,
                 fileLimit=None,
                 verbose=False):
        self.__fileLimit = fileLimit
        self.__numProc = numProc
        self.__verbose = verbose
        self.__cfgOb = cfgOb
        self.__configName = self.__cfgOb.getDefaultSectionName()
        self.__topCachePath = cachePath if cachePath else "."
        self.__cachePath = os.path.join(
            self.__topCachePath,
            self.__cfgOb.get("REPO_UTIL_CACHE_DIR",
                             sectionName=self.__configName))
        #
        self.__mU = MarshalUtil(workPath=self.__cachePath)
        #
        self.__ccPathD = None
        #
        self.__mpFormat = "[%(levelname)s] %(asctime)s %(processName)s-%(module)s.%(funcName)s: %(message)s"

    def getLocatorObjList(self,
                          contentType,
                          inputPathList=None,
                          mergeContentTypes=None,
                          excludeIds=None):
        """Convenience method to get the data path list for the input repository content type.

        Args:
            contentType (str): Repository content type (e.g. pdbx, chem_comp, bird, ...)
            inputPathList (list, optional): path list that will be returned if provided.
            mergeContentTypes (list, optional): repository content types to be combined with the
                                primary content type.
            excludeIds (list or dict): exclude any locators for idCodes in this list or dictionary

        Returns:
            Obj list: data file paths or tuple of file paths

        """
        inputPathList = inputPathList if inputPathList else []
        if inputPathList:
            return self.getLocatorObjListWithInput(
                contentType,
                inputPathList=inputPathList,
                mergeContentTypes=mergeContentTypes)
        #
        if mergeContentTypes and "vrpt" in mergeContentTypes and contentType in [
                "pdbx", "pdbx_core"
        ]:
            dictPath = os.path.join(
                self.__topCachePath,
                self.__cfgOb.get(
                    "DICTIONARY_CACHE_DIR",
                    sectionName=self.__cfgOb.getDefaultSectionName()))
            os.environ["_RP_DICT_PATH_"] = dictPath
            locatorList = self.getEntryLocatorObjList(
                mergeContentTypes=mergeContentTypes)
        else:
            locatorList = self.__getLocatorList(contentType,
                                                inputPathList=inputPathList)
        #
        if excludeIds:
            fL = []
            for locator in locatorList:
                if isinstance(locator, str):
                    pth = locator
                else:
                    pth = locator[0]["locator"]
                #
                idCode = self.__getIdcodeFromLocatorPath(contentType, pth)
                if idCode in excludeIds:
                    continue
                fL.append(locator)
            locatorList = fL

        return locatorList
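
    # A hedged usage sketch (configuration object and identifiers are
    # illustrative; cfgOb is the ConfigUtil instance passed to __init__):
    #
    #   rP = RepositoryProvider(cfgOb, cachePath="./CACHE", numProc=4)
    #   locL = rP.getLocatorObjList("pdbx_core", mergeContentTypes=["vrpt"],
    #                               excludeIds=["4HHB"])
    #   pathL = rP.getLocatorPaths(locL)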

    def getLocatorObjListWithInput(self,
                                   contentType,
                                   inputPathList=None,
                                   mergeContentTypes=None):
        """Convenience method to get the data path list for the input repository content type.

        Args:
            contentType (str): Repository content type (e.g. pdbx, chem_comp, bird, ...)
            inputPathList (list, optional): path list that will be returned if provided.
            mergeContentTypes (list, optional): repository content types to be combined with the
                                primary content type.

        Returns:
            Obj list: data file paths or tuple of file paths

        """
        inputPathList = inputPathList if inputPathList else []
        locatorList = self.__getLocatorList(contentType,
                                            inputPathList=inputPathList)
        # JDW move the following to config
        if mergeContentTypes and "vrpt" in mergeContentTypes and contentType in [
                "pdbx", "pdbx_core"
        ]:
            dictPath = os.path.join(
                self.__topCachePath,
                self.__cfgOb.get(
                    "DICTIONARY_CACHE_DIR",
                    sectionName=self.__cfgOb.getDefaultSectionName()))
            os.environ["_RP_DICT_PATH_"] = dictPath
            #
            locObjL = []
            for locator in locatorList:
                if isinstance(locator, str):
                    kwD = HashableDict({})
                    oL = [
                        HashableDict({
                            "locator": locator,
                            "fmt": "mmcif",
                            "kwargs": kwD
                        })
                    ]
                    for mergeContentType in mergeContentTypes:
                        _, fn = os.path.split(locator)
                        idCode = fn[:4] if fn and len(fn) >= 8 else None
                        mergeLocator = self.__getLocator(
                            mergeContentType, idCode,
                            checkExists=True) if idCode else None
                        if mergeLocator:
                            # kwD = HashableDict({"marshalHelper": vrd.toCif})
                            kwD = HashableDict({"marshalHelper": toCifWrapper})
                            oL.append(
                                HashableDict({
                                    "locator": mergeLocator,
                                    "fmt": "xml",
                                    "kwargs": kwD
                                }))
                    lObj = tuple(oL)
                else:
                    logger.error("Unexpected output locator type %r", locator)
                    lObj = locator
                locObjL.append(lObj)
            #
            locatorList = locObjL
        # -
        return locatorList

    def getContainerList(self, locatorObjList):
        """Return the data container list obtained by parsing the input locator object list."""
        cL = []
        for locatorObj in locatorObjList:
            myContainerList = self.__mergeContainers(locatorObj,
                                                     fmt="mmcif",
                                                     mergeTarget=0)
            for cA in myContainerList:
                cL.append(cA)
        return cL

    def __mergeContainers(self, locatorObj, fmt="mmcif", mergeTarget=0):
        """Consolidate content in auxiliary files locatorObj[1:] into
        locatorObj[0] container index 'mergeTarget'.

        """
        #
        cL = []
        try:
            if isinstance(locatorObj, str):
                cL = self.__mU.doImport(locatorObj, fmt=fmt)
                return cL if cL else []
            elif isinstance(locatorObj, (list, tuple)) and locatorObj:
                dD = locatorObj[0]
                kw = dD["kwargs"]
                cL = self.__mU.doImport(dD["locator"], fmt=dD["fmt"], **kw)
                if cL:
                    for dD in locatorObj[1:]:
                        kw = dD["kwargs"]
                        rObj = self.__mU.doImport(dD["locator"],
                                                  fmt=dD["fmt"],
                                                  **kw)
                        mergeL = rObj if rObj else []
                        for mc in mergeL:
                            cL[mergeTarget].merge(mc)
                #
                return cL
            else:
                return []
        except Exception as e:
            logger.exception("Failing for %r with %s", locatorObj, str(e))

        return cL
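
    # For reference, a non-string locator object handled above is a tuple of
    # HashableDict entries: the first names the primary mmCIF file, and the
    # remainder name auxiliary content merged into it (paths are illustrative):
    #
    #   (HashableDict({"locator": "./hb/4hhb.cif.gz", "fmt": "mmcif", "kwargs": HashableDict({})}),
    #    HashableDict({"locator": "./hb/4hhb/4hhb_validation.xml.gz", "fmt": "xml",
    #                  "kwargs": HashableDict({"marshalHelper": toCifWrapper})}))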

    def getLocatorsFromPaths(self, locatorObjList, pathList, locatorIndex=0):
        """Return locator objects with paths (locatorObjIndex) matching the input pathList."""
        # index the input locatorObjList
        rL = []
        try:
            if locatorObjList and isinstance(locatorObjList[0], str):
                return pathList
            #
            locIdx = {}
            for ii, locatorObj in enumerate(locatorObjList):
                if "locator" in locatorObj[locatorIndex]:
                    locIdx[locatorObj[locatorIndex]["locator"]] = ii
            #
            for pth in pathList:
                jj = locIdx[pth] if pth in locIdx else None
                if jj is not None:
                    rL.append(locatorObjList[jj])
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        return rL

    def getLocatorIdcodes(self, contentType, locatorObjList, locatorIndex=0):
        try:
            if locatorObjList and isinstance(locatorObjList[0], str):
                return [
                    self.__getIdcodeFromLocatorPath(contentType, pth)
                    for pth in locatorObjList
                ]
            else:
                return [
                    self.__getIdcodeFromLocatorPath(
                        contentType, locatorObj[locatorIndex]["locator"])
                    for locatorObj in locatorObjList
                ]
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return []

    def getLocatorPaths(self, locatorObjList, locatorIndex=0):
        try:
            if locatorObjList and isinstance(locatorObjList[0], str):
                return locatorObjList
            else:
                return [
                    locatorObj[locatorIndex]["locator"]
                    for locatorObj in locatorObjList
                ]
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return []

    def __getLocatorList(self, contentType, inputPathList=None):
        """Internal convenience method to return repository path list by content type:"""
        outputPathList = []
        inputPathList = inputPathList if inputPathList else []
        try:
            if contentType in ["bird", "bird_core"]:
                outputPathList = inputPathList if inputPathList else self.getBirdPathList(
                )
            elif contentType == "bird_family":
                outputPathList = inputPathList if inputPathList else self.getBirdFamilyPathList(
                )
            elif contentType in ["chem_comp"]:
                outputPathList = inputPathList if inputPathList else self.getChemCompPathList(
                )
            elif contentType in ["bird_chem_comp"]:
                outputPathList = inputPathList if inputPathList else self.getBirdChemCompPathList(
                )
            elif contentType in ["pdbx", "pdbx_core"]:
                outputPathList = inputPathList if inputPathList else self.getEntryPathList(
                )
            elif contentType in [
                    "chem_comp_core", "bird_consolidated",
                    "bird_chem_comp_core"
            ]:
                outputPathList = inputPathList if inputPathList else self.mergeBirdAndChemCompRefData(
                )
            elif contentType in ["ihm_dev", "ihm_dev_core", "ihm_dev_full"]:
                outputPathList = inputPathList if inputPathList else self.getIhmDevPathList(
                )
            elif contentType in [
                    "pdb_distro", "da_internal", "status_history"
            ]:
                outputPathList = inputPathList if inputPathList else []
            else:
                logger.warning("Unsupported contentType %s", contentType)
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        if self.__fileLimit:
            outputPathList = outputPathList[:self.__fileLimit]

        return sorted(outputPathList)

    def __getLocator(self,
                     contentType,
                     idCode,
                     version="v1-0",
                     checkExists=False):
        """Convenience method to return repository path for a content type and cardinal identifier."""
        pth = None
        try:
            idCodel = idCode.lower()
            if contentType == "bird":
                pth = os.path.join(self.__getRepoTopPath(contentType),
                                   idCode[-1], idCode + ".cif")
            elif contentType == "bird_family":
                pth = os.path.join(self.__getRepoTopPath(contentType),
                                   idCode[-1], idCode + ".cif")
            elif contentType in ["chem_comp", "chem_comp_core"]:
                pth = os.path.join(self.__getRepoTopPath(contentType),
                                   idCode[0], idCode, idCode + ".cif")
            elif contentType in ["bird_chem_comp"]:
                pth = os.path.join(self.__getRepoTopPath(contentType),
                                   idCode[-1], idCode + ".cif")
            elif contentType in ["pdbx", "pdbx_core"]:
                pth = os.path.join(self.__getRepoTopPath(contentType),
                                   idCodel[1:3], idCodel + ".cif.gz")
            elif contentType in ["bird_consolidated", "bird_chem_comp_core"]:
                pth = os.path.join(self.__getRepoTopPath(contentType),
                                   idCode + ".cif")
            elif contentType in ["ihm_dev", "ihm_dev_core", "ihm_dev_full"]:
                pth = os.path.join(self.__getRepoTopPath(contentType), idCode,
                                   idCode + "_model_%s.cif.gz" % version)
            elif contentType in [
                    "pdb_distro", "da_internal", "status_history"
            ]:
                pass
            elif contentType in ["vrpt"]:
                pth = os.path.join(self.__getRepoTopPath(contentType),
                                   idCodel[1:3], idCodel,
                                   idCodel + "_validation.xml.gz")
            else:
                logger.warning("Unsupported contentType %s", contentType)
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        if checkExists:
            pth = pth if self.__mU.exists(pth) else None
        return pth

    def __getIdcodeFromLocatorPath(self, contentType, pth):
        """Convenience method to return the idcode from the locator path."""
        idCode = None
        try:
            bn = os.path.basename(pth)
            if contentType in [
                    "pdbx", "pdbx_core", "bird", "bird_family", "chem_comp",
                    "chem_comp_core", "bird_consolidated",
                    "bird_chem_comp_core"
            ]:
                idCode = bn.split(".")[0]
            elif contentType in ["ihm_dev", "ihm_dev_core", "ihm_dev_full"]:
                tC = bn.split(".")[0]
                idCode = "_".join(tC.split("_")[:2])
            elif contentType in [
                    "pdb_distro", "da_internal", "status_history"
            ]:
                idCode = None
            elif contentType in ["vrpt"]:
                tC = bn.split(".")[0]
                idCode = tC.split("_")[0]
            else:
                logger.warning("Unsupported contentType %s", contentType)
            idCode = idCode.upper() if idCode else None
        except Exception as e:
            logger.exception("Failing for %r %r with %s", contentType, pth,
                             str(e))
        return idCode

    def __getRepoTopPath(self, contentType):
        """Convenience method to return repository top path from configuration data."""
        pth = None
        try:
            if contentType == "bird":
                pth = self.__cfgOb.getPath("BIRD_REPO_PATH",
                                           sectionName=self.__configName)
            elif contentType == "bird_family":
                pth = self.__cfgOb.getPath("BIRD_FAMILY_REPO_PATH",
                                           sectionName=self.__configName)
            elif contentType in ["chem_comp", "chem_comp_core"]:
                pth = self.__cfgOb.getPath("CHEM_COMP_REPO_PATH",
                                           sectionName=self.__configName)
            elif contentType in ["bird_chem_comp"]:
                pth = self.__cfgOb.getPath("BIRD_CHEM_COMP_REPO_PATH",
                                           sectionName=self.__configName)
            elif contentType in ["pdbx", "pdbx_core"]:
                pth = self.__cfgOb.getPath("PDBX_REPO_PATH",
                                           sectionName=self.__configName)
            elif contentType in ["bird_consolidated", "bird_chem_comp_core"]:
                pth = self.__cachePath
            elif contentType in ["ihm_dev", "ihm_dev_core", "ihm_dev_full"]:
                pth = self.__cfgOb.getPath("IHM_DEV_REPO_PATH",
                                           sectionName=self.__configName)
            elif contentType in [
                    "pdb_distro", "da_internal", "status_history"
            ]:
                pass
            elif contentType in ["vrpt"]:
                pth = self.__cfgOb.getEnvValue("VRPT_REPO_PATH_ENV",
                                               sectionName=self.__configName,
                                               default=None)
                if pth is None:
                    pth = self.__cfgOb.getPath("VRPT_REPO_PATH",
                                               sectionName=self.__configName)
                else:
                    logger.debug(
                        "Using validation report path from environment assignment %s",
                        pth)
            else:
                logger.warning("Unsupported contentType %s", contentType)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return pth

    def _chemCompPathWorker(self, dataList, procName, optionsD, workingDir):
        """Return the list of chemical component definition file paths in the current repository."""
        _ = procName
        _ = workingDir
        topRepoPath = optionsD["topRepoPath"]
        pathList = []
        for subdir in dataList:
            dd = os.path.join(topRepoPath, subdir)
            for root, _, files in os.walk(dd, topdown=False):
                if "REMOVE" in root:
                    continue
                for name in files:
                    if name.endswith(".cif") and len(name) <= 7:
                        pathList.append(os.path.join(root, name))
        return dataList, pathList, []

    def getChemCompPathList(self):
        return self.__getChemCompPathList(self.__getRepoTopPath("chem_comp"),
                                          numProc=self.__numProc)

    def __getChemCompPathList(self, topRepoPath, numProc=8):
        """Get the path list for the chemical component definition repository"""
        ts = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        logger.debug("Starting at %s", ts)
        startTime = time.time()
        pathList = []
        try:
            dataS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
            dataList = [a for a in dataS]
            optD = {}
            optD["topRepoPath"] = topRepoPath
            mpu = MultiProcUtil(verbose=self.__verbose)
            mpu.setOptions(optionsD=optD)
            mpu.set(workerObj=self, workerMethod="_chemCompPathWorker")
            _, _, retLists, _ = mpu.runMulti(dataList=dataList,
                                             numProc=numProc,
                                             numResults=1)
            pathList = retLists[0]
            endTime0 = time.time()
            logger.debug("Path list length %d  in %.4f seconds", len(pathList),
                         endTime0 - startTime)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return self.__applyFileLimit(pathList)

    def _entryLocatorObjWithMergeWorker(self, dataList, procName, optionsD,
                                        workingDir):
        """Return the list of entry locator objects including merge content in the current repository."""
        _ = procName
        _ = workingDir
        topRepoPath = optionsD["topRepoPath"]
        mergeContentTypes = optionsD["mergeContentTypes"]
        locatorObjList = []
        for subdir in dataList:
            dd = os.path.join(topRepoPath, subdir)
            for root, _, files in os.walk(dd, topdown=False):
                if "REMOVE" in root:
                    continue
                for fn in files:
                    if (fn.endswith(".cif.gz")
                            and len(fn) == 11) or (fn.endswith(".cif")
                                                   and len(fn) == 8):
                        locator = os.path.join(root, fn)
                        kwD = HashableDict({})
                        oL = [
                            HashableDict({
                                "locator": locator,
                                "fmt": "mmcif",
                                "kwargs": kwD
                            })
                        ]
                        for mergeContentType in mergeContentTypes:
                            idCode = fn[:4] if fn and len(fn) >= 8 else None
                            mergeLocator = self.__getLocator(
                                mergeContentType, idCode,
                                checkExists=True) if idCode else None
                            if mergeLocator:
                                kwD = HashableDict(
                                    {"marshalHelper": toCifWrapper})
                                oL.append(
                                    HashableDict({
                                        "locator": mergeLocator,
                                        "fmt": "xml",
                                        "kwargs": kwD
                                    }))
                        lObj = tuple(oL)
                        locatorObjList.append(lObj)
        return dataList, locatorObjList, []

    def getEntryLocatorObjList(self, mergeContentTypes=None):
        return self.__getEntryLocatorObjList(
            self.__getRepoTopPath("pdbx"),
            numProc=self.__numProc,
            mergeContentTypes=mergeContentTypes)

    def __getEntryLocatorObjList(self,
                                 topRepoPath,
                                 numProc=8,
                                 mergeContentTypes=None):
        """Get the path list for structure entries in the input repository"""
        ts = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        logger.debug("Starting at %s", ts)
        startTime = time.time()
        pathList = []
        try:
            dataList = []
            anL = "abcdefghijklmnopqrstuvwxyz0123456789"
            for a1 in anL:
                for a2 in anL:
                    hc = a1 + a2
                    dataList.append(hc)
                    hc = a2 + a1
                    dataList.append(hc)
            dataList = list(set(dataList))
            #
            optD = {}
            optD["topRepoPath"] = topRepoPath
            optD["mergeContentTypes"] = mergeContentTypes
            mpu = MultiProcUtil(verbose=self.__verbose)
            mpu.setOptions(optionsD=optD)
            mpu.set(workerObj=self,
                    workerMethod="_entryLocatorObjWithMergeWorker")
            _, _, retLists, _ = mpu.runMulti(dataList=dataList,
                                             numProc=numProc,
                                             numResults=1)
            pathList = retLists[0]
            endTime0 = time.time()
            logger.debug("Locator object list length %d  in %.4f seconds",
                         len(pathList), endTime0 - startTime)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return self.__applyFileLimit(pathList)

    def _entryPathWorker(self, dataList, procName, optionsD, workingDir):
        """Return the list of entry file paths in the current repository."""
        _ = procName
        _ = workingDir
        topRepoPath = optionsD["topRepoPath"]
        pathList = []
        for subdir in dataList:
            dd = os.path.join(topRepoPath, subdir)
            for root, _, files in os.walk(dd, topdown=False):
                if "REMOVE" in root:
                    continue
                for name in files:
                    if (name.endswith(".cif.gz")
                            and len(name) == 11) or (name.endswith(".cif")
                                                     and len(name) == 8):
                        pathList.append(os.path.join(root, name))
        return dataList, pathList, []

    def getEntryPathList(self):
        return self.__getEntryPathList(self.__getRepoTopPath("pdbx"),
                                       numProc=self.__numProc)

    def __getEntryPathList(self, topRepoPath, numProc=8):
        """Get the path list for structure entries in the input repository"""
        ts = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        logger.debug("Starting at %s", ts)
        startTime = time.time()
        pathList = []
        try:
            dataList = []
            anL = "abcdefghijklmnopqrstuvwxyz0123456789"
            for a1 in anL:
                for a2 in anL:
                    hc = a1 + a2
                    dataList.append(hc)
                    hc = a2 + a1
                    dataList.append(hc)
            dataList = list(set(dataList))
            #
            optD = {}
            optD["topRepoPath"] = topRepoPath
            mpu = MultiProcUtil(verbose=self.__verbose)
            mpu.setOptions(optionsD=optD)
            mpu.set(workerObj=self, workerMethod="_entryPathWorker")
            _, _, retLists, _ = mpu.runMulti(dataList=dataList,
                                             numProc=numProc,
                                             numResults=1)
            pathList = retLists[0]
            endTime0 = time.time()
            logger.debug("Path list length %d  in %.4f seconds", len(pathList),
                         endTime0 - startTime)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return self.__applyFileLimit(pathList)

    def getBirdPathList(self):
        return self.__getBirdPathList(self.__getRepoTopPath("bird"))

    def __getBirdPathList(self, topRepoPath):
        """Return the list of definition file paths in the current repository.

        List is ordered in increasing PRD ID numerical code.
        """
        pathList = []
        try:
            sd = {}
            for root, _, files in os.walk(topRepoPath, topdown=False):
                if "REMOVE" in root:
                    continue
                for name in files:
                    if name.startswith("PRD_") and name.endswith(
                            ".cif") and len(name) <= 14:
                        pth = os.path.join(root, name)
                        sd[int(name[4:-4])] = pth
            #
            for k in sorted(sd.keys()):
                pathList.append(sd[k])
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        return self.__applyFileLimit(pathList)

    def getBirdFamilyPathList(self):
        return self.__getBirdFamilyPathList(
            self.__getRepoTopPath("bird_family"))

    def __getBirdFamilyPathList(self, topRepoPath):
        """Return the list of definition file paths in the current repository.

        List is ordered in increasing PRD ID numerical code.
        """
        pathList = []
        try:
            sd = {}
            for root, _, files in os.walk(topRepoPath, topdown=False):
                if "REMOVE" in root:
                    continue
                for name in files:
                    if name.startswith("FAM_") and name.endswith(
                            ".cif") and len(name) <= 14:
                        pth = os.path.join(root, name)
                        sd[int(name[4:-4])] = pth
            #
            for k in sorted(sd.keys()):
                pathList.append(sd[k])
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        return self.__applyFileLimit(pathList)

    def getBirdChemCompPathList(self):
        return self.__getBirdChemCompPathList(
            self.__getRepoTopPath("bird_chem_comp"))

    def __getBirdChemCompPathList(self, topRepoPath):
        """Return the list of definition file paths in the current repository.

        List is ordered in increasing PRD ID numerical code.
        """
        pathList = []
        try:
            sd = {}
            for root, _, files in os.walk(topRepoPath, topdown=False):
                if "REMOVE" in root:
                    continue
                for name in files:
                    if name.startswith("PRDCC_") and name.endswith(
                            ".cif") and len(name) <= 16:
                        pth = os.path.join(root, name)
                        sd[int(name[6:-4])] = pth
            #
            for k in sorted(sd.keys()):
                pathList.append(sd[k])
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        return self.__applyFileLimit(pathList)

    def __applyFileLimit(self, pathList):
        logger.debug("Length of file path list %d (limit %r)", len(pathList),
                     self.__fileLimit)
        if self.__fileLimit:
            return pathList[:self.__fileLimit]
        else:
            return pathList

    def __buildFamilyIndex(self):
        """Using information from the PRD family definition:
        #
        loop_
        _pdbx_reference_molecule_list.family_prd_id
        _pdbx_reference_molecule_list.prd_id
            FAM_000010 PRD_000041
            FAM_000010 PRD_000042
            FAM_000010 PRD_000043
            FAM_000010 PRD_000044
            FAM_000010 PRD_000048
            FAM_000010 PRD_000049
            FAM_000010 PRD_000051
        #
        """
        prdD = {}
        try:
            pthL = self.__getLocatorList("bird_family")
            for pth in pthL:
                containerL = self.__mU.doImport(pth, fmt="mmcif")
                for container in containerL:
                    catName = "pdbx_reference_molecule_list"
                    if container.exists(catName):
                        catObj = container.getObj(catName)
                        for ii in range(catObj.getRowCount()):
                            familyPrdId = catObj.getValue(
                                attributeName="family_prd_id", rowIndex=ii)
                            prdId = catObj.getValue(attributeName="prd_id",
                                                    rowIndex=ii)
                            if prdId in prdD:
                                logger.debug(
                                    "duplicate prdId in family index %s %s",
                                    prdId, familyPrdId)
                            prdD[prdId] = {
                                "familyPrdId": familyPrdId,
                                "c": container
                            }
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return prdD

    def __buildBirdCcIndex(self):
        """Using information from the PRD pdbx_reference_molecule category to
        index the BIRDs corresponding small molecule correspondences

        """
        prdD = {}
        ccPathD = {}
        prdStatusD = {}
        try:
            ccPathL = self.__getLocatorList("chem_comp")
            ccPathD = {}
            for ccPath in ccPathL:
                _, fn = os.path.split(ccPath)
                ccId, _ = os.path.splitext(fn)
                ccPathD[ccId] = ccPath
            logger.info("Chemical component path list (%d)", len(ccPathD))
            pthL = self.__getLocatorList("bird")
            logger.info("BIRD path list (%d)", len(pthL))
            for pth in pthL:
                containerL = self.__mU.doImport(pth, fmt="mmcif")
                for container in containerL:
                    catName = "pdbx_reference_molecule"
                    if container.exists(catName):
                        catObj = container.getObj(catName)
                        ii = 0
                        prdId = catObj.getValue(attributeName="prd_id",
                                                rowIndex=ii)
                        relStatus = catObj.getValue(
                            attributeName="release_status", rowIndex=ii)
                        prdStatusD[prdId] = relStatus
                        if relStatus != "REL":
                            continue
                        prdRepType = catObj.getValue(
                            attributeName="represent_as", rowIndex=ii)
                        logger.debug("represent as %r", prdRepType)
                        if prdRepType in ["single molecule"]:
                            ccId = catObj.getValueOrDefault(
                                attributeName="chem_comp_id",
                                rowIndex=ii,
                                defaultValue=None)
                            # prdId = catObj.getValue(attributeName="prd_id", rowIndex=ii)
                            logger.debug("mapping prdId %r ccId %r", prdId,
                                         ccId)
                            if ccId and ccId in ccPathD:
                                prdD[prdId] = {
                                    "ccId": ccId,
                                    "ccPath": ccPathD[ccId]
                                }
                                ccPathD[ccPathD[ccId]] = {
                                    "ccId": ccId,
                                    "prdId": prdId
                                }
                            else:
                                logger.error("Bad ccId %r for BIRD %r", ccId,
                                             prdId)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        logger.info(
            "Candidate Chemical Components (%d) BIRDS (%d) BIRD status details (%d)",
            len(prdD), len(ccPathD), len(prdStatusD))
        return prdD, ccPathD, prdStatusD

    # -
    def mergeBirdAndChemCompRefData(self):
        prdSmallMolCcD, ccPathD, prdStatusD = self.__buildBirdCcIndex()
        logger.info("PRD to CCD index length %d CCD map path length %d",
                    len(prdSmallMolCcD), len(ccPathD))
        outputPathList = self.mergeBirdRefData(prdSmallMolCcD, prdStatusD)
        ccOutputPathList = [
            pth for pth in self.getChemCompPathList() if pth not in ccPathD
        ]
        outputPathList.extend(ccOutputPathList)
        return outputPathList

    def mergeBirdRefData(self, prdSmallMolCcD, prdStatusD):
        """Consolidate all of the bird reference data in a single container.

        If the BIRD is a 'small molecule' type then also merge with the associated CC definition.

        Store the merged data in the REPO_UTIL cache path and ...

        Return a path list for the consolidated data files -

        """
        outPathList = []
        try:
            birdPathList = self.__getLocatorList("bird")
            birdPathD = {}
            for birdPath in birdPathList:
                _, fn = os.path.split(birdPath)
                prdId, _ = os.path.splitext(fn)
                birdPathD[prdId] = birdPath
            #
            logger.info("BIRD path length %d", len(birdPathD))
            logger.debug("BIRD keys %r", list(birdPathD.keys()))
            birdCcPathList = self.__getLocatorList("bird_chem_comp")
            birdCcPathD = {}
            for birdCcPath in birdCcPathList:
                _, fn = os.path.split(birdCcPath)
                prdCcId, _ = os.path.splitext(fn)
                prdId = "PRD_" + prdCcId[6:]
                birdCcPathD[prdId] = birdCcPath
            #
            logger.info("BIRDCC path length %d", len(birdCcPathD))
            logger.debug("BIRD CC keys %r", list(birdCcPathD.keys()))
            fD = self.__buildFamilyIndex()
            logger.info("BIRD Family index length %d", len(fD))
            logger.debug("Family index keys %r", list(fD.keys()))
            logger.info("PRD to CCD small mol index length %d",
                        len(prdSmallMolCcD))
            #
            iSkip = 0
            for prdId in birdPathD:
                if prdId in prdStatusD and prdStatusD[prdId] != "REL":
                    logger.debug("Skipping BIRD with non-REL status %s", prdId)
                    iSkip += 1
                    continue
                fp = os.path.join(self.__cachePath, prdId + ".cif")
                logger.debug("Export cache path is %r", fp)
                #
                pth2 = birdPathD[prdId]
                cL = self.__mU.doImport(pth2, fmt="mmcif")
                cFull = cL[0]
                logger.debug("Got Bird %r", cFull.getName())
                #
                #
                ccBird = None
                ccD = None
                if prdId in prdSmallMolCcD:
                    pthCc = prdSmallMolCcD[prdId]["ccPath"]
                    cL = self.__mU.doImport(pthCc, fmt="mmcif")
                    ccD = cL[0]
                    logger.debug("Got corresponding CCD %r", ccD.getName())
                elif prdId in birdCcPathD:
                    pth1 = birdCcPathD[prdId]
                    c1L = self.__mU.doImport(pth1, fmt="mmcif")
                    ccBird = c1L[0]
                    logger.debug("Got ccBird %r", ccBird.getName())
                    #
                cFam = None
                if prdId in fD:
                    cFam = fD[prdId]["c"]
                    logger.debug("Got cFam %r", cFam.getName())
                #
                if ccD:
                    for catName in ccD.getObjNameList():
                        cFull.append(ccD.getObj(catName))
                #
                if ccBird:
                    for catName in ccBird.getObjNameList():
                        cFull.append(ccBird.getObj(catName))
                if cFam:
                    for catName in cFam.getObjNameList():
                        cFull.append(cFam.getObj(catName))
                #
                self.__mU.doExport(fp, [cFull], fmt="mmcif")
                outPathList.append(fp)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        logger.info(
            "Merged BIRD/Family/CC path length %d (skipped non-released %d)",
            len(outPathList), iSkip)
        return outPathList
        #

    def __exportConfig(self, container):
        """
        - CATEGORY_NAME: diffrn_detector
          ATTRIBUTE_NAME_LIST:
              - pdbx_frequency
        - CATEGORY_NAME: pdbx_serial_crystallography_measurement
          ATTRIBUTE_NAME_LIST:
              - diffrn_id
              - pulse_energy
              - pulse_duration
              - xfel_pulse_repetition_rate
        """
        for catName in container.getObjNameList():
            cObj = container.getObj(catName)
            print("- CATEGORY_NAME: %s" % catName)
            print("  ATTRIBUTE_NAME_LIST:")
            for atName in cObj.getAttributeList():
                print("       - %s" % atName)
        return True

    def getIhmDevPathList(self):
        return self.__getIhmDevPathList(self.__getRepoTopPath("ihm_dev"))

    def __getIhmDevPathList(self, topRepoPath):
        """Return the list of I/HM entries in the current repository.

        File name template is: PDBDEV_00000020_model_v1-0.cif.gz

        List is ordered in increasing PDBDEV numerical code.
        """
        pathList = []
        logger.debug("Searching path %r", topRepoPath)
        try:
            sd = {}
            for root, _, files in os.walk(topRepoPath, topdown=False):
                if "REMOVE" in root:
                    continue
                for name in files:
                    if name.startswith("PDBDEV_") and name.endswith(
                            ".cif.gz") and len(name) <= 50:
                        pth = os.path.join(root, name)
                        sd[int(name[7:15])] = pth
            #
            for k in sorted(sd.keys()):
                pathList.append(sd[k])
        except Exception as e:
            logger.exception("Failing search in %r with %s", topRepoPath,
                             str(e))
        #
        return self.__applyFileLimit(pathList)
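
# A hedged sketch of the BIRD/CC consolidation entry point above (cfgOb is a
# ConfigUtil instance; merged files land in the REPO_UTIL cache path):
#
#   rP = RepositoryProvider(cfgOb, cachePath="./CACHE")
#   mergedPathL = rP.mergeBirdAndChemCompRefData()
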
class ChemRefMappingProvider(StashableBase):
    """Accessors for chemical reference identifier mapping data."""
    def __init__(self, cachePath, useCache=True):
        #
        self.__cachePath = cachePath
        self.__useCache = useCache
        self.__dirName = "chemref-mapping"
        super(ChemRefMappingProvider, self).__init__(self.__cachePath,
                                                     [self.__dirName])
        self.__dirPath = os.path.join(self.__cachePath, self.__dirName)
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__rD = {}
        self.__mapD = self.__reload(self.__dirPath, useCache)
        #

    def testCache(self, minCount=0):
        logger.info(
            "Mapping count %d",
            len(self.__mapD["mapping"]) if "mapping" in self.__mapD else 0)
        if minCount == 0 or (self.__mapD and "mapping" in self.__mapD
                             and len(self.__mapD["mapping"]) >= minCount):
            return True
        else:
            return False

    def getReferenceIds(self, referenceResourceName, localId):
        """Get the identifiers in the reference resource corresponding to input local
        identifiers (Chemical Component or BIRD).

        Args:
            referenceResourceName (str): chemical reference resource name (DrugBank, ChEMBL, ChEBI, PubChem, ...)
            localId (str): local identifier for a Chemical Component or BIRD definition

        Returns:
            list: list of reference identifiers
        """
        if not self.__rD:
            for rN, forwardD in self.__mapD["mapping"].items():
                # {refId :[lId, lId, ...], ...}
                reverseD = {}
                for refId, rcsbIdL in forwardD.items():
                    for rId in rcsbIdL:
                        reverseD.setdefault(rId, []).append(refId)
                self.__rD[rN] = reverseD
        #
        try:
            return self.__rD[referenceResourceName.upper()][localId]
        except Exception:
            return []

    def getLocalIds(self, referenceResourceName, referenceId):
        """Get the local identifiers (Chemical Component or BIRD) corresponding to identifiers in
        chemical reference resource.

        Args:
            referenceResourceName (str): chemical reference resource name (DrugBank, ChEMBL, ChEBI, PubChem, ...)
            referenceId (str): identifier in the chemical reference resource

        Returns:
            list: list of local Chemical Component or BIRD identifiers
        """
        try:
            return self.__mapD["mapping"][
                referenceResourceName.upper()][referenceId]
        except Exception:
            return []
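
    # A minimal usage sketch for the mapping accessors above (cache path is
    # illustrative; assumes a previously exported chemref-mapping-data.json;
    # DB00171 is the DrugBank accession for ATP):
    #
    #   crmP = ChemRefMappingProvider(cachePath="./CACHE", useCache=True)
    #   if crmP.testCache(minCount=1):
    #       refIdL = crmP.getReferenceIds("DrugBank", "ATP")      # e.g. ["DB00171"]
    #       localIdL = crmP.getLocalIds("DrugBank", "DB00171")    # e.g. ["ATP"]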

    def __getMappingDataPath(self):
        return os.path.join(self.__dirPath, "chemref-mapping-data.json")

    def __reload(self, dirPath, useCache):
        startTime = time.time()
        fD = {}
        ok = False
        mappingPath = self.__getMappingDataPath()
        #
        logger.info("useCache %r mappingPath %r", useCache, mappingPath)
        if useCache and self.__mU.exists(mappingPath):
            fD = self.__mU.doImport(mappingPath, fmt="json")
            ok = True
        else:
            fU = FileUtil()
            fU.mkdir(dirPath)
        # ---
        logger.info("Completed reload with status (%r) at %s (%.4f seconds)",
                    ok, time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    time.time() - startTime)
        return fD

    def fetchChemRefMapping(self, cfgOb, referenceResourceNameList=None):
        """Fetch reference resource mapping for chemical component and BIRD definitions

        Args:
            cfgOb (obj): instance configuration class ConfigUtil()
            referenceResourceNameList (list, optional): list of chemical reference resources. Defaults to [DrugBank, ChEMBL].

        Returns:
            bool: True for success or False otherwise
        """
        try:
            rnL = referenceResourceNameList if referenceResourceNameList is not None else [
                "DrugBank", "ChEMBL"
            ]
            mD = {}
            crExt = ChemRefExtractor(cfgOb)
            for referenceResourceName in rnL:
                idD = crExt.getChemCompAccessionMapping(
                    referenceResourceName=referenceResourceName)
                logger.info("%s mapping dictionary (%d)",
                            referenceResourceName, len(idD))
                mD[referenceResourceName.upper()] = idD
            #
            fp = self.__getMappingDataPath()
            tS = datetime.datetime.now().isoformat()
            vS = datetime.datetime.now().strftime("%Y-%m-%d")
            ok = self.__mU.doExport(fp, {
                "version": vS,
                "created": tS,
                "mapping": mD
            },
                                    fmt="json",
                                    indent=3)
            return ok
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False


class PharosTargetCofactorProvider(StashableBase):
    """Accessors for Pharos target cofactors."""
    def __init__(self, **kwargs):
        #
        self.__cachePath = kwargs.get("cachePath", ".")
        self.__dirName = "Pharos-cofactors"
        super(PharosTargetCofactorProvider,
              self).__init__(self.__cachePath, [self.__dirName])
        self.__dirPath = os.path.join(self.__cachePath, self.__dirName)
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__fD = self.__reload(self.__dirPath, **kwargs)
        #

    def testCache(self, minCount=1):
        logger.info(
            "Pharos cached cofactor count %d",
            len(self.__fD["cofactors"]) if "cofactors" in self.__fD else 0)
        if self.__fD and "cofactors" in self.__fD and len(
                self.__fD["cofactors"]) > minCount:
            return True
        else:
            return False

    def hasTarget(self, rcsbEntityId):
        return rcsbEntityId.upper() in self.__fD["cofactors"]

    def getTargets(self, rcsbEntityId):
        try:
            return self.__fD["cofactors"][rcsbEntityId.upper()]
        except Exception:
            return []

    def __getCofactorDataPath(self):
        return os.path.join(self.__dirPath, "Pharos-cofactor-data.json")

    def reload(self):
        self.__fD = self.__reload(self.__dirPath, useCache=True)
        return True

    def __reload(self, dirPath, **kwargs):
        startTime = time.time()
        fD = {}
        useCache = kwargs.get("useCache", True)
        ok = False
        cofactorPath = self.__getCofactorDataPath()
        #
        logger.info("useCache %r cofactorPath %r", useCache, cofactorPath)
        if useCache and self.__mU.exists(cofactorPath):
            fD = self.__mU.doImport(cofactorPath, fmt="json")
            ok = True
        else:
            fU = FileUtil()
            fU.mkdir(dirPath)
        # ---
        numCofactors = len(fD["cofactors"]) if fD and "cofactors" in fD else 0
        logger.info(
            "Completed reload of (%d) cofactors with status (%r) at %s (%.4f seconds)",
            numCofactors, ok,
            time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
            time.time() - startTime)
        return fD

    def buildCofactorList(self,
                          sequenceMatchFilePath,
                          crmpObj=None,
                          lnmpObj=None,
                          maxActivity=5):
        """Build target cofactor list for the matching entities in the input sequence match file.

        Args:
            sequenceMatchFilePath (str): sequence match output file path
            crmpObj (obj, optional): instance of ChemRefMappingProviderObj(). Defaults to None.
            lnmpObj (obj, optional): instance of LigandNeighborMappingProviderObj(). Defaults to None.
            maxActivity (int, optional): maximum number of prioritized activity records per target. Defaults to 5.

        Returns:
            bool: True for success or False otherwise

            Example Pharos activity record -

            {
            "version": "2021-06-17",
            "created": "2021-06-17T11:10:54.563394",
            "activity": {
                "2232": [
                    {
                        "smiles": "CC(=CCC\\\\C(=C/Cc1c(O)cc(O)c(C(=O)CCc2ccc(O)cc2)c1O)\\\\C)C",
                        "chemblId": "CHEMBL3360923",
                        "pubChemId": "118724585",
                        "activity": 6.0,
                        "activityType": "IC50",
                        "activityUnits": "nM",
                        "name": "1-[3-(3,7-dimethylocta-2,6-dien-1-yl)-2,4,6-trihydroxyphenyl]-3-(4-hydroxyphenyl)propan-1-one",
                        "pubmedId": "25375026",
                        "patent": "USxxxxxx",
                    }, ...
        """
        rDL = []
        mD = self.__mU.doImport(sequenceMatchFilePath, fmt="json")
        # ---
        chaP = PharosTargetActivityProvider(cachePath=self.__cachePath,
                                            useCache=True)
        #
        provenanceSource = "Pharos"
        refScheme = "PDB entity"
        assignVersion = chaP.getAssignmentVersion()
        for queryId, matchDL in mD.items():
            # "O43508|uniprotId|7987|proteinId|9606|taxId"
            qCmtD = self.__decodeComment(queryId)
            unpId = qCmtD["uniprotId"]
            queryTaxId = qCmtD["taxId"] if "taxId" in qCmtD else None
            pharosId = qCmtD["proteinId"]
            if queryTaxId == "-1":
                logger.debug("Skipping target with missing taxonomy %r (%r)",
                             unpId, pharosId)
                continue
            #
            if not chaP.hasTargetActivity(pharosId):
                logger.debug("Target with no activity data %r (%r)", unpId, pharosId)
                # continue (targets without activities are currently retained)
            # --
            chemCompNeighborsD = {}
            if lnmpObj:
                for matchD in matchDL:
                    tCmtD = self.__decodeComment(matchD["target"])
                    entryId = tCmtD["entityId"].split("_")[0]
                    entityId = tCmtD["entityId"].split("_")[1]
                    rcsbEntityId = entryId + "_" + entityId
                    chemCompIdList = lnmpObj.getLigandNeighbors(rcsbEntityId)
                    chemCompNeighborsD.update(
                        {k: True
                         for k in chemCompIdList})
            # --
            queryName = chaP.getTargetInfo(pharosId, "name")
            # --
            for matchD in matchDL:
                tCmtD = self.__decodeComment(matchD["target"])
                entryId = tCmtD["entityId"].split("_")[0]
                entityId = tCmtD["entityId"].split("_")[1]
                rcsbEntityId = entryId + "_" + entityId
                #
                taDL = chaP.getTargetActivity(pharosId)
                logger.debug("Target %r has (%d) activity records", pharosId,
                             len(taDL))
                actL = []
                # cfDL = []
                chD = {}
                for taD in taDL:
                    if taD["chemblId"] in chD:
                        chD[taD["chemblId"]] = True
                        continue

                    actD = {
                        "cofactor_id": taD["chemblId"],
                        "cofactor_name": taD["name"] if "name" in taD else None,
                        # The measurement type is exported with a "p" prefix (e.g. IC50 -> pIC50).
                        "measurement_type": "p" + taD["activityType"],
                        "measurement_value": taD["activity"],
                        "pubmed_ids": [taD["pubmedId"]] if "pubmedId" in taD else None,
                        "patent_nos": taD["patents"] if "patents" in taD else None,
                        "smiles": taD["smiles"] if "smiles" in taD else None,
                        "action": taD["action"] if "action" in taD else None,
                        "pharmacology": taD["pharmacology"] if "pharmacology" in taD else None,
                    }
                    actD = self.__addLocalIds(actD, crmpObj=crmpObj)
                    actL.append(actD)
                #
                actL = self.__activityListSelect(actL,
                                                 chemCompNeighborsD,
                                                 maxActivity=maxActivity)
                if not actL:
                    logger.debug("No Pharos cofactors for %s %s", pharosId,
                                 unpId)
                # ---
                # aligned_target.entity_beg_seq_id (current target is PDB entity in json)
                # aligned_target.target_beg_seq_id (current query is target seq in json)
                # aligned_target.length
                fpL = []
                if "alignedRegions" in matchD:
                    fpL = [{
                        "entity_beg_seq_id": arD["targetBegin"],
                        "target_beg_seq_id": arD["queryBegin"],
                        "length": arD["targetEnd"] - arD["targetBegin"],
                    } for arD in matchD["alignedRegions"]]
                else:
                    fpL = [{
                        "entity_beg_seq_id": matchD["targetBegin"],
                        "target_beg_seq_id": matchD["queryBegin"],
                        "length": matchD["alignLen"],
                    }]
                # ---
                rD = {
                    "entry_id": entryId,
                    "entity_id": entityId,
                    "query_uniprot_id": unpId,
                    "query_id": pharosId,
                    "query_id_type": "Pharos",
                    "query_name": queryName,
                    "provenance_source": provenanceSource,
                    "reference_scheme": refScheme,
                    "assignment_version": assignVersion,
                    "query_taxonomy_id": int(queryTaxId) if queryTaxId else None,
                    "target_taxonomy_id": int(matchD["targetTaxId"]) if "targetTaxId" in matchD else None,
                    "aligned_target": fpL,
                    "taxonomy_match_status": matchD["taxonomyMatchStatus"] if "taxonomyMatchStatus" in matchD else None,
                    "lca_taxonomy_id": matchD["lcaTaxId"] if "lcaTaxId" in matchD else None,
                    "lca_taxonomy_name": matchD["lcaTaxName"] if "lcaTaxName" in matchD else None,
                    "lca_taxonomy_rank": matchD["lcaRank"] if "lcaRank" in matchD else None,
                    "cofactors": actL,
                }
                rDL.append(rD)
        #
        qD = {}
        for rD in rDL:
            eId = rD["entry_id"] + "_" + rD["entity_id"]
            qD.setdefault(eId, []).append(rD)
        #
        fp = self.__getCofactorDataPath()
        tS = datetime.datetime.now().isoformat()
        # vS = datetime.datetime.now().strftime("%Y-%m-%d")
        vS = assignVersion
        ok = self.__mU.doExport(fp, {
            "version": vS,
            "created": tS,
            "cofactors": qD
        },
                                fmt="json",
                                indent=3)
        return ok
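
    # Illustration (added note): the exported document groups cofactor records
    # by "entryId_entityId", e.g.
    #   {"version": ..., "created": ..., "cofactors": {"1ABC_1": [rD, ...]}}
    # where "1ABC" is a placeholder entry id.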

    def __addLocalIds(self, cfD, crmpObj=None):
        #
        if crmpObj:
            localIdL = crmpObj.getLocalIds("CHEMBL", cfD["cofactor_id"])
            if localIdL:
                localId = localIdL[0]
                if localId.startswith("PRD_"):
                    cfD["prd_id"] = localId
                else:
                    cfD["chem_comp_id"] = localId
        return cfD

    def __activityListSelect(self,
                             activityDL,
                             chemCompNeighborsD,
                             maxActivity=5):
        """Prioritizing the activity data for locally mapped neighbor ligands and the best binding examples.

        Args:
            activityDL (list): full list of activity objects
            chemCompNeighborsD (dict, optional): index of all chemical components with neighbor interactions to the query target. Defaults {}.
            maxCount (int, optional): maximum number of activity object returned. Defaults to 5.

        Returns:
            list: prioritized and trimmed list of activity objects
        """
        retL = []
        mappedNeighborL = []
        unmappedL = activityDL

        if chemCompNeighborsD:
            unmappedL = []
            # Select out any molecules that map to a neighbor chemical component.
            for activityD in activityDL:
                if "chem_comp_id" in activityD and activityD[
                        "chem_comp_id"] in chemCompNeighborsD:
                    activityD["neighbor_in_pdb"] = "Y"
                    mappedNeighborL.append(activityD)
                else:
                    activityD["neighbor_in_pdb"] = "N"
                    unmappedL.append(activityD)
        #
        numLeft = maxActivity - len(mappedNeighborL)
        if numLeft > 0:
            unmappedL = sorted(unmappedL,
                               key=lambda k: k["measurement_value"],
                               reverse=True)
            retL = mappedNeighborL
            retL.extend(unmappedL[:numLeft])
            retL = sorted(retL,
                          key=lambda k: k["measurement_value"],
                          reverse=True)
        else:
            logger.debug(
                "Mapped neighbor cofactors (%d) excluded unmapped (%d)",
                len(mappedNeighborL), len(unmappedL))
            retL = sorted(mappedNeighborL,
                          key=lambda k: k["measurement_value"],
                          reverse=True)

        return retL
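
    # Illustration (added note): with chemCompNeighborsD = {"ATP": True} and
    # maxActivity = 2, toy activity records A (chem_comp_id "ATP", value 5.0),
    # B (value 8.5), and C (value 6.1) select as follows: A is kept as a
    # mapped neighbor, B (the best remaining value) fills the single open
    # slot, and the result is sorted by measurement_value as [B, A].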

    def __decodeComment(self, comment, separator="|"):
        dD = {}
        try:
            ti = iter(comment.split(separator))
            dD = {tup[1]: tup[0] for tup in zip(ti, ti)}
        except Exception:
            pass
        return dD
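
# Standalone sketch (added for illustration): the pairwise decoding trick used
# by __decodeComment() above. zip(ti, ti) draws two items at a time from a
# single iterator, so "value|key|value|key|..." decodes to {key: value, ...}.
def _decodeCommentDemo(comment, separator="|"):
    ti = iter(comment.split(separator))
    return {tup[1]: tup[0] for tup in zip(ti, ti)}

# _decodeCommentDemo("O43508|uniprotId|7987|proteinId|9606|taxId")
# -> {"uniprotId": "O43508", "proteinId": "7987", "taxId": "9606"}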
# Example #11
    def search(self, dataList, procName, optionsD, workingDir):
        """Worker method to execute a shell to search CCDC for the input mol2 path list.

        Args:
            dataList (list): list of mol2 file paths to be searched
            procName (str): processName
            optionsD (dict): dictionary of options
            workingDir (str): path to working directory (not used)

        Returns:
            (successList, resultList, []): success and result lists of mol2 paths with CCDC matches
        """
        resultPath = optionsD["resultPath"]
        searchType = optionsD["searchType"]
        pythonRootPath = optionsD["pythonRootPath"]
        csdHome = optionsD["csdHome"]
        timeOut = optionsD["timeOut"]
        timeOut = timeOut if timeOut and timeOut > 0 else 120
        _ = workingDir
        resultList = []
        startTime = time.time()
        logger.info("starting %s at %s", procName,
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
        #
        try:
            stopPath = os.path.join(resultPath, "STOP")
            logger.info("%s starting search data length %d", procName,
                        len(dataList))
            if self.__checkStop(stopPath):
                logger.info("%s stopping", procName)
                return resultList, resultList, []
            #
            queryListFilePath = os.path.join(resultPath, procName,
                                             "queryFileList.list")
            mU = MarshalUtil()
            ok = mU.doExport(queryListFilePath, dataList, fmt="list")
            if not ok:
                return resultList, resultList, []
            #
            exU = ExecUtils()
            logger.debug("%s executing shell for %s", procName,
                         queryListFilePath)
            cmdPath = os.path.join(pythonRootPath, "bin", "ccdc_search_cli")
            hitListPath = os.path.join(resultPath, procName, "hitList.list")
            logPath = os.path.join(resultPath, procName, "execlog.log")

            logger.debug("cmdPath %r", cmdPath)
            ok = exU.runShell(
                "%s --mol_list_path %s --result_path %s --search_type %s --csdhome %s --hit_list_path %s"
                % (cmdPath, queryListFilePath, resultPath, searchType, csdHome,
                   hitListPath),
                outPath=logPath,
                outAppend=True,
                timeOut=timeOut,
                suppressStderr=False,
            )
            #
            if ok and mU.exists(hitListPath):
                resultList = mU.doImport(hitListPath, fmt="list")
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        endTime = time.time()
        logger.info("%s (result length %d) completed at %s (%.2f seconds)",
                    procName, len(resultList),
                    time.strftime("%Y %m %d %H:%M:%S",
                                  time.localtime()), endTime - startTime)
        return resultList, resultList, []
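
# Sketch (added for illustration): a worker method like search() above is
# typically driven through the MultiProcUtil pattern used in
# __calculateNeighbors() later in this document; the worker instance name
# here is hypothetical.
#
#   mpu = MultiProcUtil(verbose=True)
#   mpu.setOptions({"resultPath": resultPath, "searchType": "similarity",
#                   "pythonRootPath": pythonRootPath, "csdHome": csdHome,
#                   "timeOut": 120})
#   mpu.set(workerObj=ccdcSearchWorker, workerMethod="search")
#   ok, failList, resultList, _ = mpu.runMulti(dataList=mol2PathList,
#                                              numProc=4, numResults=1,
#                                              chunkSize=10)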
# Example #12
class NeighborInteractionProvider(object):
    """Generators and accessors for non-polymer instance target interactions."""
    def __init__(self, cfgOb, configName, cachePath, **kwargs):
        #
        self.__version = __version__
        self.__cfgOb = cfgOb
        self.__configName = configName
        self.__cachePath = cachePath
        self.__fileLimit = kwargs.get("fileLimit", None)
        self.__dirPath = os.path.join(cachePath, "neighbor-interactions")
        self.__numProc = kwargs.get("numProc", 2)
        self.__chunkSize = kwargs.get("chunkSize", 10)
        useCache = kwargs.get("useCache", True)
        #
        #  - Configuration for stash services -
        #    Local target directory name to be stashed.  (subdir of dirPath)
        #
        self.__stashDir = "ligand-target-neighbors"
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb,
                                        numProc=self.__numProc,
                                        fileLimit=self.__fileLimit,
                                        cachePath=self.__cachePath)
        self.__neighborD = self.__reload(fmt="pickle", useCache=useCache)
        #

    def testCache(self, minCount=0):
        try:
            if minCount == 0:
                return True
            if self.__neighborD and minCount and len(
                    self.__neighborD["entries"]) >= minCount:
                logger.info(
                    "Target neighbor data for (%d) entries created %r version %r",
                    len(self.__neighborD["entries"]),
                    self.__neighborD["created"], self.__neighborD["version"])
                return True
        except Exception:
            pass
        return False

    def getLigandNeighborIndex(self, entryId):
        """Return the target neighbors for the non-polymer instances for the input entry.

        Args:
            entryId (str): entry identifier

        Returns:
            (dict): {ligandAsymId: {(targetAsymId, targetAuthSeqId): nnIndex1, ...}, ...}
        """
        try:
            return self.__neighborD["entries"][
                entryId.upper()]["ligandNeighborIndexD"]
        except Exception:
            pass
        return {}

    def getTargetNeighborIndex(self, entryId):
        """Return the ligand neighbors for the polymer or branched entity instances in the input entry.

        Args:
            entryId (str): entry identifier

        Returns:
            (dict): {(targetAsymId, targetAuthSeqId): {ligandAsymId: nnIndex1, ...}, ...}

        """
        try:
            return self.__neighborD["entries"][
                entryId.upper()]["targetNeighborIndexD"]
        except Exception:
            pass
        return {}

    def getNearestNeighborList(self, entryId):
        """Return the list of neares neighbors for the entry.

        Args:
            entryId (str): entry identifier

        Returns:
            list: [LigandTargetInstance(), ...]

        """
        try:
            return self.__neighborD["entries"][
                entryId.upper()]["nearestNeighbors"]
        except Exception:
            pass
        return []

    def getLigandNeighborBoundState(self, entryId):
        """Return the dicitonary of ligand instances with isBound boolean status.

        Args:
            entryId (str): entry identifier

        Returns:
            (dict): {ligandAsymId: True if isBound,  ...  }
        """
        try:
            return self.__neighborD["entries"][
                entryId.upper()]["ligandIsBoundD"]
        except Exception:
            pass
        return {}

    def getAtomCounts(self, entryId):
        """Return the non-polymer instance atom counts for the input entry (all reported atoms).

        Args:
            entryId (str): entry identifier

        Returns:
            (dict): {asymId: {'FL': count, 'altA': count, 'altB': count, ... }}
        """
        try:
            return self.__neighborD["entries"][
                entryId.upper()]["ligandAtomCountD"]
        except Exception:
            pass
        return {}

    def getHydrogenAtomCounts(self, entryId):
        """Return the non-polymer instance hydrogen atom counts for the input entry.

        Args:
            entryId (str): entry identifier

        Returns:
            (dict): {asymId: {'FL': count, 'altA': count, 'altB': count, ... }}
        """
        try:
            return self.__neighborD["entries"][
                entryId.upper()]["ligandHydrogenAtomCountD"]
        except Exception:
            pass
        return {}

    def hasEntry(self, entryId):
        """Return if the input entry is stored in the cache of non-polymer instance target interactions.

        Args:
            entryId (str): entry identifier

        Returns:
            (bool): True if entry is in the cache or False otherwise
        """
        try:
            return entryId in self.__neighborD["entries"]
        except Exception:
            pass
        return False

    def getEntries(self):
        """Return a list of entry identifier for which non-polymer instance target interactions are stored.

        Returns:
            (list): [entryId, entryId, ... ]
        """
        try:
            return list(self.__neighborD["entries"].keys())
        except Exception:
            pass
        return []
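
    # Illustration (added note): typical read-only use of the accessors above,
    # assuming a previously generated cache:
    #   niP = NeighborInteractionProvider(cfgOb, configName, cachePath)
    #   for entryId in niP.getEntries():
    #       boundD = niP.getLigandNeighborBoundState(entryId)
    #       nnL = niP.getNearestNeighborList(entryId)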

    def generate(self,
                 distLimit=5.0,
                 updateOnly=False,
                 fmt="pickle",
                 indent=0):
        """Generate and export non-polymer target interactions for all of the structures in the repository.

        Args:
            distLimit (float, optional): interaction distance. Defaults to 5.0.
            updateOnly (bool):  only calculate interactions for new entries.  Defaults to False.
            fmt (str, optional): export file format. Defaults to "pickle".
            indent (int, optional): json format indent. Defaults to 0.

        Returns:
            bool: True for success or False otherwise
        """
        ok = False
        try:
            tS = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
            tD = self.__calculateNeighbors(distLimit=distLimit,
                                           numProc=self.__numProc,
                                           chunkSize=self.__chunkSize,
                                           updateOnly=updateOnly)
            self.__neighborD = {
                "version": self.__version,
                "created": tS,
                "entries": tD
            }
            kwargs = {"indent": indent} if fmt == "json" else {"pickleProtocol": 4}
            targetFilePath = self.__getTargetFilePath(fmt=fmt)
            ok = self.__mU.doExport(targetFilePath,
                                    self.__neighborD,
                                    fmt=fmt,
                                    **kwargs)
            logger.info("Wrote %r status %r", targetFilePath, ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok
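
    # Illustration (added note): a full build followed by an incremental
    # update that only computes interactions for entries not yet cached:
    #   niP.generate(distLimit=5.0, updateOnly=False, fmt="pickle")
    #   niP.generate(distLimit=5.0, updateOnly=True, fmt="pickle")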

    def reload(self, fmt="pickle"):
        self.__neighborD = self.__reload(fmt=fmt, useCache=True)
        return self.__neighborD is not None

    def __reload(self, fmt="pickle", useCache=True):
        """Reload from the current cache file."""
        # Build the default payload first so a well-formed dictionary is
        # returned even if the cache import below fails.
        targetFilePath = self.__getTargetFilePath(fmt=fmt)
        tS = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        neighborD = {"version": self.__version, "created": tS, "entries": {}}
        try:
            logger.debug("useCache %r targetFilePath %r", useCache,
                         targetFilePath)
            #
            if useCache and self.__mU.exists(targetFilePath):
                neighborD = self.__mU.doImport(targetFilePath, fmt=fmt)
                if fmt != "pickle":
                    for _, nD in neighborD["entries"].items():
                        nD["nearestNeighbors"] = [
                            LigandTargetInstance(*neighbor)
                            for neighbor in nD["nearestNeighbors"]
                        ]
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        return neighborD

    def __getTargetFilePath(self, fmt="pickle"):
        ext = "pic" if fmt == "pickle" else "json"
        pth = os.path.join(self.__dirPath, "ligand-target-neighbors",
                           "neighbor-data." + ext)
        return pth

    def __calculateNeighbors(self,
                             distLimit=5.0,
                             numProc=2,
                             chunkSize=10,
                             updateOnly=False):
        """Calculate non-polymer target interactions for all repository structure files.

        Args:
            distLimit (float, optional): interaction distance limit. Defaults to 5.0.
            numProc (int, optional): number of processes to use. Defaults to 2.
            chunkSize (int, optional): incremental chunk size used to distribute work among processes. Defaults to 10.
            updateOnly (bool, optional): only calculate interactions for entries not already cached. Defaults to False.

        Returns:
            (dict): {entryId: {asymId: [TargetLigandInteraction()], ...}, ...}
        """
        contentType = "pdbx"
        mergeContent = None
        rD = {}
        exD = {}
        #
        # updateOnly - will reuse any existing data loaded when this is instantiated
        #              otherwise the cache context is cleared before the calculation.
        if updateOnly:
            exD = {k: True for k in self.getEntries()}
            rD = self.__neighborD[
                "entries"] if "entries" in self.__neighborD else {}
        #
        locatorObjList = self.__rpP.getLocatorObjList(
            contentType=contentType,
            mergeContentTypes=mergeContent,
            excludeIds=exD)
        logger.info("Starting with %d numProc %d updateOnly (%r)",
                    len(locatorObjList), self.__numProc, updateOnly)
        #
        rWorker = TargetInteractionWorker(self.__rpP)
        mpu = MultiProcUtil(verbose=True)
        optD = {"distLimit": distLimit}
        mpu.setOptions(optD)
        mpu.set(workerObj=rWorker, workerMethod="build")
        ok, failList, resultList, _ = mpu.runMulti(dataList=locatorObjList,
                                                   numProc=numProc,
                                                   numResults=1,
                                                   chunkSize=chunkSize)
        if failList:
            logger.info("Target interaction build failures (%d): %r",
                        len(failList), failList)
        #
        for (entryId, nD) in resultList[0]:
            rD[entryId] = nD
        #
        logger.info(
            "Completed with multi-proc status %r failures %r total entries with data (%d)",
            ok, len(failList), len(rD))
        return rD

    def toStash(self):
        ok = False
        try:
            userName = self.__cfgOb.get("_STASH_AUTH_USERNAME",
                                        sectionName=self.__configName)
            password = self.__cfgOb.get("_STASH_AUTH_PASSWORD",
                                        sectionName=self.__configName)
            basePath = self.__cfgOb.get("_STASH_SERVER_BASE_PATH",
                                        sectionName=self.__configName)
            url = self.__cfgOb.get("STASH_SERVER_URL",
                                   sectionName=self.__configName)
            urlFallBack = self.__cfgOb.get("STASH_SERVER_FALLBACK_URL",
                                           sectionName=self.__configName)
            ok = self.__toStash(url,
                                basePath,
                                userName=userName,
                                password=password)
            ok = self.__toStash(urlFallBack,
                                basePath,
                                userName=userName,
                                password=password)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok

    def __toStash(self,
                  url,
                  stashRemoteDirPath,
                  userName=None,
                  password=None,
                  remoteStashPrefix=None):
        """Copy tar and gzipped bundled cache data to remote server/location.

        Args:
            url (str): server URL (e.g. sftp://hostname.domain) None for local host
            stashRemoteDirPath (str): path to target directory on remote server
            userName (str, optional): server username. Defaults to None.
            password (str, optional): server password. Defaults to None.
            remoteStashPrefix (str, optional): channel prefix. Defaults to None.

        Returns:
            (bool): True for success or False otherwise
        """
        ok = False
        try:
            stU = StashUtil(os.path.join(self.__dirPath, "stash"),
                            "ligand-target-neighbors")
            ok = stU.makeBundle(self.__dirPath, [self.__stashDir])
            if ok:
                ok = stU.storeBundle(url,
                                     stashRemoteDirPath,
                                     remoteStashPrefix=remoteStashPrefix,
                                     userName=userName,
                                     password=password)
        except Exception as e:
            logger.error("Failing with url %r stashDirPath %r: %s", url,
                         stashRemoteDirPath, str(e))
        return ok

    def fromStash(self):
        try:
            minCount = 10
            userName = self.__cfgOb.get("_STASH_AUTH_USERNAME",
                                        sectionName=self.__configName)
            password = self.__cfgOb.get("_STASH_AUTH_PASSWORD",
                                        sectionName=self.__configName)
            basePath = self.__cfgOb.get("_STASH_SERVER_BASE_PATH",
                                        sectionName=self.__configName)
            url = self.__cfgOb.get("STASH_SERVER_URL",
                                   sectionName=self.__configName)
            #
            ok = self.__fromStash(url,
                                  basePath,
                                  userName=userName,
                                  password=password)
            ok = self.reload()
            ok = self.testCache(minCount=minCount)
            if not ok:
                urlFallBack = self.__cfgOb.get("STASH_SERVER_FALLBACK_URL",
                                               sectionName=self.__configName)
                ok = self.__fromStash(urlFallBack,
                                      basePath,
                                      userName=userName,
                                      password=password)
                ok = self.reload()
                ok = self.testCache(minCount=minCount)
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return ok

    def __fromStash(self,
                    url,
                    stashRemoteDirPath,
                    userName=None,
                    password=None,
                    remoteStashPrefix=None):
        """Restore local cache from a tar and gzipped bundle to fetched from a remote server/location.

        Args:
            url (str): server URL (e.g. sftp://hostname.domain) None for local host
            stashRemoteDirPath (str): path to target directory on remote server
            userName (str, optional): server username. Defaults to None.
            password (str, optional): server password. Defaults to None.
            remoteStashPrefix (str, optional): channel prefix. Defaults to None.

        Returns:
            (bool): True for success or False otherwise
        """
        ok = False
        try:
            stU = StashUtil(os.path.join(self.__dirPath, "stash"),
                            "ligand-target-neighbors")
            ok = stU.fetchBundle(self.__dirPath,
                                 url,
                                 stashRemoteDirPath,
                                 remoteStashPrefix=remoteStashPrefix,
                                 userName=userName,
                                 password=password)
        except Exception as e:
            logger.error("Failing with url %r stashDirPath %r: %s", url,
                         stashRemoteDirPath, str(e))
        return ok

    def convert(self, fmt1="json", fmt2="pickle"):
        #
        targetFilePath = self.__getTargetFilePath(fmt=fmt1)
        self.__neighborD = self.__mU.doImport(targetFilePath, fmt=fmt1)
        #
        targetFilePath = self.__getTargetFilePath(fmt=fmt2)
        ok = self.__mU.doExport(targetFilePath,
                                self.__neighborD,
                                fmt=fmt2,
                                pickleProtocol=4)
        return ok
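
# Sketch (added for illustration): round-tripping the cached neighbor data
# with the helpers above, e.g. to inspect a pickle cache as JSON and to
# mirror it through the configured stash server:
#   niP.convert(fmt1="pickle", fmt2="json")
#   niP.toStash()    # bundle and push the cache
#   niP.fromStash()  # fetch, restore, and reload it elsewhere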
# Example #13
class EntityInstanceExtractor(object):
    """Selected utilities to extract data from entity instance collections.

    >>> from itertools import groupby
    >>> from operator import itemgetter
    >>>
    >>> seq2 = [1, 2, 4, 5, 6, 8, 9, 10]
    >>> groups = []
    >>> for k, g in groupby(enumerate(seq2), lambda ix: ix[0] - ix[1]):
    ...     groups.append(list(map(itemgetter(1), g)))
    ...
    >>> print(groups)
    [[1, 2], [4, 5, 6], [8, 9, 10]]
    Or as a list comprehension:

    >>> [list(map(itemgetter(1), g)) for k, g in groupby(enumerate(seq2), lambda ix: ix[0] - ix[1])]
    [[1, 2], [4, 5, 6], [8, 9, 10]]


    ##
    ##

    import numpy as np

    def main():
        # Generate some random data.
        x = np.cumsum(np.random.random(1000) - 0.5)
        condition = np.abs(x) < 1

        # Print the start and stop indices of each region where the absolute
        # values of x are below 1, and the min and max of each such region.
        # contiguous_regions() stands in for the private helper
        # __contiguousRegions() used by analEntity() below.
        for start, stop in contiguous_regions(condition):
            segment = x[start:stop]
            print(start, stop)
            print(segment.min(), segment.max())

    import numpy as np

    # dtype=object is required for ragged (unequal-length) rows.
    Samples = np.array([[1, 2, 3],
                        [1, 2]], dtype=object)
    c = np.hstack(Samples)  # gives [1, 2, 3, 1, 2]
    mean, std = np.mean(c), np.std(c)
    newSamples = np.asarray([(np.array(xi) - mean) / std for xi in Samples], dtype=object)
    print(newSamples)

    """
    def __init__(self, cfgOb):
        self.__cfgOb = cfgOb
        self.__resourceName = "MONGO_DB"
        #
        self.__seqCache = {}
        self.__mU = MarshalUtil()
        #

    def getEntryInfo(self, **kwargs):
        """Return a dictionary of PDB entries satifying the input conditions (e.g. method, resolution limit)"""

        resLimit = kwargs.get("resLimit", 3.5)
        expMethod = kwargs.get("expMethod", "X-ray")
        #
        dbName = kwargs.get("dbName", "pdbx_core")
        collectionName = kwargs.get("collectionName", "pdbx_core_entry")
        #
        entryD = {}
        try:
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(dbName, collectionName):
                    logger.info("%s %s document count is %d",
                                dbName, collectionName,
                                mg.count(dbName, collectionName))
                    qD = {
                        "rcsb_entry_info.experimental_method": expMethod,
                        "refine.0.ls_d_res_high": {
                            "$lte": resLimit
                        }
                    }
                    selectL = [
                        "rcsb_entry_container_identifiers", "rcsb_entry_info",
                        "refine"
                    ]
                    dL = mg.fetch(dbName, collectionName, selectL, queryD=qD)
                    logger.info("Selection %r fetch result count %d", selectL,
                                len(dL))
                    #
                    for dV in dL:
                        if "rcsb_entry_container_identifiers" not in dV:
                            continue
                        entryId = dV["rcsb_entry_container_identifiers"][
                            "entry_id"]
                        entryD[entryId] = {}
                        if "rcsb_entry_info" in dV and "polymer_composition" in dV[
                                "rcsb_entry_info"]:
                            entryD[entryId] = {
                                "polymer_composition":
                                dV["rcsb_entry_info"]["polymer_composition"],
                                "experimental_method":
                                dV["rcsb_entry_info"]["experimental_method"],
                            }
                        if "refine" in dV and dV[
                                "refine"] and "ls_d_res_high" in dV["refine"][
                                    0]:
                            entryD[entryId]["ls_d_res_high"] = dV["refine"][0][
                                "ls_d_res_high"]
                            logger.debug("Got res %r",
                                         dV["refine"][0]["ls_d_res_high"])

        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return entryD
        #
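    # Illustration (added note): the selection in getEntryInfo() above restated
    # as a raw pymongo query, assuming a reachable MongoDB instance; the
    # connection details are placeholders.
    #   from pymongo import MongoClient
    #   client = MongoClient("mongodb://localhost:27017")
    #   qD = {"rcsb_entry_info.experimental_method": "X-ray",
    #         "refine.0.ls_d_res_high": {"$lte": 3.5}}
    #   docs = client["pdbx_core"]["pdbx_core_entry"].find(qD, ["rcsb_entry_info"])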

    def getEntityIds(self, entryIdList):
        """ """
        dbName = "pdbx_core"
        collectionName = "pdbx_core_polymer_entity"
        docD = {}
        try:
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(dbName, collectionName):
                    logger.info("%s %s document count is %d",
                                dbName, collectionName,
                                mg.count(dbName, collectionName))
                    for entryId in entryIdList:
                        qD = {
                            "rcsb_polymer_entity_container_identifiers.entry_id":
                            entryId
                        }
                        selectL = ["rcsb_polymer_entity_container_identifiers"]
                        tL = mg.fetch(dbName,
                                      collectionName,
                                      selectL,
                                      queryD=qD)
                        #
                        logger.debug("Selection %r fetch result count %d",
                                     selectL, len(tL))
                        docD[entryId] = [
                            vv["rcsb_polymer_entity_container_identifiers"]
                            for vv in tL
                        ]
            logger.debug("docD is %r", docD)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return docD

    def getPolymerEntities(self, entryD, **kwargs):
        """Add 'selected_polymer_entities' satisfying the input contiditions and add this to the input entry dictionary."""
        dbName = kwargs.get("dbName", "pdbx_core")
        collectionName = kwargs.get("collectionName",
                                    "pdbx_core_polymer_entity")
        resultKey = kwargs.get("resultKey", "selected_polymer_entities")
        savePath = kwargs.get("savePath", "entry-data.pic")
        entryLimit = kwargs.get("entryLimit", None)
        saveKwargs = kwargs.get("saveKwargs", {"fmt": "pickle"})
        #
        try:
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(dbName, collectionName):
                    logger.info("%s %s document count is %d",
                                dbName, collectionName,
                                mg.count(dbName, collectionName))
                    selectL = [
                        "rcsb_polymer_entity_container_identifiers",
                        "entity_poly.type",
                        "entity_poly.pdbx_seq_one_letter_code_can",
                        "rcsb_entity_source_organism.ncbi_taxonomy_id",
                        "rcsb_entity_source_organism.ncbi_scientific_name",
                        "struct_ref.pdbx_seq_one_letter_code",
                        "struct_ref.pdbx_db_accession",
                        "struct_ref.db_name",
                        "struct_ref.entity_id",
                    ]
                    iCount = 0
                    for entryId in entryD:
                        #
                        if resultKey in entryD[entryId]:
                            continue
                        #
                        qD = {
                            "rcsb_polymer_entity_container_identifiers.entry_id":
                            entryId,
                            "entity_poly.rcsb_entity_polymer_type": "Protein",
                            "entity.rcsb_multiple_source_flag": "N",
                        }
                        #
                        dL = mg.fetch(dbName,
                                      collectionName,
                                      selectL,
                                      queryD=qD)
                        logger.debug("%s query %r fetch result count %d",
                                     entryId, qD, len(dL))
                        eD = {}
                        for ii, dV in enumerate(dL, 1):
                            rD = {}
                            logger.debug("%s (%4d) d is %r", entryId, ii, dV)
                            if "rcsb_polymer_entity_container_identifiers" in dV and "asym_ids" in dV[
                                    "rcsb_polymer_entity_container_identifiers"]:
                                rD["asym_ids"] = dV[
                                    "rcsb_polymer_entity_container_identifiers"][
                                        "asym_ids"]
                                rD["entity_id"] = dV[
                                    "rcsb_polymer_entity_container_identifiers"][
                                        "entity_id"]
                            if "entity_poly" in dV and "type" in dV[
                                    "entity_poly"]:
                                rD["type"] = dV["entity_poly"]["type"]
                                rD["seq_one_letter_code_can"] = dV[
                                    "entity_poly"][
                                        "pdbx_seq_one_letter_code_can"]

                            if "rcsb_entity_source_organism" in dV:
                                rD["ncbi_taxonomy_id"] = dV[
                                    "rcsb_entity_source_organism"][0][
                                        "ncbi_taxonomy_id"] if "ncbi_taxonomy_id" in dV[
                                            "rcsb_entity_source_organism"][
                                                0] else None
                                rD["ncbi_scientific_name"] = (
                                    dV["rcsb_entity_source_organism"][0]
                                    ["ncbi_scientific_name"]
                                    if "ncbi_scientific_name"
                                    in dV["rcsb_entity_source_organism"][0]
                                    else None)

                            if "struct_ref" in dV and len(
                                    dV["struct_ref"]) == 1:
                                rD["seq_one_letter_code_ref"] = dV["struct_ref"][
                                    0]["pdbx_seq_one_letter_code"] if "pdbx_seq_one_letter_code" in dV[
                                        "struct_ref"][0] else None
                                rD["db_accession"] = dV["struct_ref"][0][
                                    "pdbx_db_accession"] if "pdbx_db_accession" in dV[
                                        "struct_ref"][0] else None
                                rD["db_name"] = dV["struct_ref"][0][
                                    "db_name"] if "db_name" in dV[
                                        "struct_ref"][0] else None
                                #
                                refDbName = rD["db_name"]
                                dbAccession = rD["db_accession"]
                                dbRefSeq = self.__seqCache[
                                    dbAccession] if dbAccession in self.__seqCache else None

                                if refDbName in ["UNP"] and not dbRefSeq:
                                    dbRefSeq = self.__fetchUniprot(dbAccession)
                                    self.__seqCache[dbAccession] = dbRefSeq
                                    logger.debug("Fetch uniprot %r", dbRefSeq)
                                rD["ref_db_seq"] = dbRefSeq
                            else:
                                rD["seq_one_letter_code_ref"] = rD[
                                    "db_accession"] = rD["db_name"] = None
                            #
                            if "entity_id" in rD:
                                eD[rD["entity_id"]] = copy.copy(rD)

                        entryD[entryId][resultKey] = copy.copy(eD)

                        iCount += 1
                        if iCount % 10 == 0:
                            logger.info(
                                "Completed polymer entities fetch %d/%d entries",
                                iCount, len(entryD))
                        if iCount % 2000 == 0:
                            ok = self.__mU.doExport(savePath, entryD,
                                                    **saveKwargs)
                            logger.info(
                                "Saved polymer entity results (%d) status %r in %s",
                                iCount, ok, savePath)
                        if entryLimit and iCount >= entryLimit:
                            logger.info("Quitting after %d", iCount)
                            break
            #
            # for entryId in entryD:
            #    logger.debug(">>  %s docD  %r" % (entryId, entryD[entryId]))
            ok = self.__mU.doExport(savePath, entryD, **saveKwargs)
            logger.info(
                "Saved polymer entity results (%d) entries %d status %r in %s",
                iCount, len(entryD), ok, savePath)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return entryD

    def getEntityInstances(self, entryD, **kwargs):
        """Get the selected validation data for the instances in the input entry dictionary.

        entryD[entryId]['selected_polymer_entities'][entityId]['validation'] = {}

        Add keys: 'pdbx_vrpt_instance_results'  and  'pdbx_unobs_or_zero_occ_residues' to the validation dictionary above.

        Args:
            entryD (dict): entry dictionary keyed by entry identifier
            **kwargs: dbName, collectionName, savePath, saveKwargs, and entryLimit options

        Returns:
            entryD (dict): updated entry dictionary
        """
        dbName = kwargs.get("dbName", "pdbx_core")
        collectionName = kwargs.get("collectionName",
                                    "pdbx_core_polymer_entity_instance")
        savePath = kwargs.get("savePath", "entry-data.pic")
        saveKwargs = kwargs.get("saveKwargs", {"fmt": "pickle"})
        entryLimit = kwargs.get("entryLimit", None)
        #
        try:
            optF = False
            iCount = 0
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(dbName, collectionName):
                    logger.info("%s %s total document count is %d",
                                dbName, collectionName,
                                mg.count(dbName, collectionName))
                    #
                    for entryId, dV in entryD.items():
                        for entityId, peD in dV[
                                "selected_polymer_entities"].items():
                            # if 'anal_instances' in peD:
                            #    continue
                            vD = {}
                            for asymId in peD["asym_ids"]:
                                qD = {
                                    "rcsb_polymer_entity_instance_container_identifiers.entry_id":
                                    entryId,
                                    "rcsb_polymer_entity_instance_container_identifiers.asym_id":
                                    asymId,
                                }
                                # qD = {'rcsb_entity_instance_container_validation_identifiers.entity_type': 'polymer'}
                                # selectL = ['pdbx_vrpt_instance_results', 'pdbx_unobs_or_zero_occ_residues']
                                selectL = ["pdbx_vrpt_instance_results"]
                                tL = mg.fetch(dbName,
                                              collectionName,
                                              selectL,
                                              queryD=qD)
                                dV = {}  # note: rebinds dV, shadowing the outer entry document
                                if not tL:
                                    logger.info(
                                        "No validation data for %s %s %s(%s)",
                                        dbName, collectionName, entryId,
                                        asymId)
                                    continue
                                #
                                logger.debug(
                                    ">>> %s %s (%s) dict key length %d ",
                                    collectionName, entryId, asymId,
                                    len(tL[0]))

                                #
                                if optF:
                                    dV["pdbx_vrpt_instance_results"] = tL[0][
                                        "pdbx_vrpt_instance_results"] if "pdbx_vrpt_instance_results" in tL[
                                            0] else []
                                    dV["pdbx_unobs_or_zero_occ_residues"] = tL[0][
                                        "pdbx_unobs_or_zero_occ_residues"] if "pdbx_unobs_or_zero_occ_residues" in tL[
                                            0] else []
                                #
                                if optF:
                                    urdL = tL[0][
                                        "pdbx_unobs_or_zero_occ_residues"] if "pdbx_unobs_or_zero_occ_residues" in tL[
                                            0] else []
                                    oL = [{
                                        "label_seq_id": urd["label_seq_id"],
                                        "label_comp_id": urd["label_comp_id"]
                                    } for urd in urdL]
                                    dV["pdbx_unobs_or_zero_occ_residues"] = oL
                                #
                                try:
                                    irdL = tL[0][
                                        "pdbx_vrpt_instance_results"] if "pdbx_vrpt_instance_results" in tL[
                                            0] else []
                                    oL = [{
                                        "label_seq_id": ird["label_seq_id"],
                                        "label_comp_id": ird["label_comp_id"]
                                    } for ird in irdL]
                                    dV["pdbx_vrpt_instance_results_seq"] = oL
                                except Exception as e:
                                    logger.error(
                                        "Failing with entryId %s entityId %s asymId %s bad validation data %s",
                                        entryId, entityId, asymId, str(e))

                                #
                                try:
                                    irdL = tL[0][
                                        "pdbx_vrpt_instance_results"] if "pdbx_vrpt_instance_results" in tL[
                                            0] else []
                                    oL = [{
                                        "OWAB": ird["OWAB"],
                                        "label_seq_id": ird["label_seq_id"],
                                        "label_comp_id": ird["label_comp_id"]
                                    } for ird in irdL]
                                    dV["pdbx_vrpt_instance_results_occ"] = oL
                                except Exception as e:
                                    logger.debug(
                                        "Failing with entryId %s entityId %s asymId %s bad validation data %s",
                                        entryId, entityId, asymId, str(e))

                                vD[asymId] = copy.copy(dV)
                                #
                            analD = self.analEntity(entryId, peD, vD)
                            entryD[entryId]["selected_polymer_entities"][
                                entityId]["anal_instances"] = copy.copy(analD)
                        iCount += 1
                        if iCount % 500 == 0:
                            logger.info("Completed %d/%d entries", iCount,
                                        len(entryD))
                        if iCount % 2000 == 0:
                            ok = self.__mU.doExport(savePath, entryD,
                                                    **saveKwargs)
                            logger.info(
                                "Saved polymer entity instance results (%d) status %r in %s",
                                iCount, ok, savePath)
                        if entryLimit and iCount >= entryLimit:
                            break
            ok = self.__mU.doExport(savePath, entryD, **saveKwargs)
            logger.info(
                "Saved polymer instance results (%d) entries %d status %r in %s",
                iCount, len(entryD), ok, savePath)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return entryD

    def analEntity(self, entryId, entityD, vD, **kwargs):
        """

        {'polymer_composition': 'protein/NA', 'experimental_method': 'X-ray',
        'selected_polymer_entities': {'1': {'asym_ids': ['D', 'C', 'E', 'A', 'B', 'F'],
                   'entity_id': '1', 'type': 'polypeptide(L)',
                   'seq_one_letter_code_can': 'MAKGQSLQDPFLNALRRERVPVSIYLVNGIKLQGQIESFDQFVILLKNTVSQMVYKHAISTVVPS',
                   'ncbi_taxonomy_id': 511693,
                    'ncbi_scientific_name': 'Escherichia coli BL21',
                    'seq_one_letter_code_ref': 'MAKGQSLQDPFLNALRRERVPVSIYLVNGIKLQGQIESFDQFVILLKNTVSQMVYKHAISTVVPS',
                    'db_accession': 'C5W5L7',
                    'db_name': 'UNP',
                    'validation': {'D': {'pdbx_vrpt_instance_results': [{'OWAB': 29.45, 'label_seq_id': 5, 'label_comp_id': 'GLN'},
                                                                            {'OWAB': 26.12, 'label_seq_id': 6, 'label_comp_id': 'SER'},
                                                                            {'OWAB': 22.72, 'label_seq_id': 7, 'label_comp_id': 'LEU'},
                                                                            {'OWAB': 14.56, 'label_seq_id': 8, 'label_comp_id': 'GLN'},
                                                                            {'OWAB': 19.18, 'label_seq_id': 9, 'label_comp_id': 'ASP'},
                                                                            {'OWAB': 16.56, 'label_seq_id': 10, 'label_comp_id': 'PRO'},
                                                                            {'OWAB': 14.78, 'label_seq_id': 11, 'label_comp_id': 'PHE'},
                                                                            {'OWAB': 11.2, 'label_seq_id': 12, 'label_comp_id': 'LEU'}, ...],

                                        'pdbx_unobs_or_zero_occ_residues': [{'label_seq_id': 1, 'label_comp_id': 'MET'},
                                               {'label_seq_id': 2, 'label_comp_id': 'ALA'},
                                                {'label_seq_id': 3, 'label_comp_id': 'LYS'},
                                                 {'label_seq_id': 4, 'label_comp_id': 'GLY'}]}

        """
        _ = kwargs
        analD = {}
        try:
            entityId = entityD["entity_id"]
            asymIdL = entityD["asym_ids"]

            refSeq = entityD.get("seq_one_letter_code_ref")
            entitySeq = entityD.get("seq_one_letter_code_can")
            # -------
            # Get UniProt reference sequence details
            #
            dbName = entityD.get("db_name")
            dbAccession = entityD.get("db_accession")
            dbRefSeq = entityD.get("ref_db_seq")
            # --
            if dbRefSeq:
                logger.debug("%s (%s) ref db %4d:  %r", dbAccession, dbName,
                             len(dbRefSeq), dbRefSeq)
            if refSeq:
                logger.debug("%s (%s) seq ref pdb %4d:  %r", dbAccession,
                             dbName, len(refSeq), refSeq)
            if entitySeq:
                logger.debug("%s (%s) entity sample %4d:  %r", dbAccession,
                             dbName, len(entitySeq), entitySeq)
            #
            lenRefDbSeq = len(dbRefSeq) if dbRefSeq else None
            lenEntitySeq = len(entitySeq)
            # sampleSeqCov = 1.0 - float(lenRefDbSeq - lenEntitySeq) / float(lenRefDbSeq) if lenRefDbSeq else None
            #

            # -
            for asymId in asymIdL:
                if asymId not in vD:
                    logger.error("Missing validation data for %s %s %s",
                                 entryId, entityId, asymId)
                    continue
                #
                irDL = vD[asymId].get("pdbx_vrpt_instance_results_seq", [])
                # Sort the unique modeled positions to preserve residue order
                # for the segment and gap calculations below
                lsL = sorted({dV["label_seq_id"] for dV in irDL})
                lenInstanceSeq = len(lsL)

                instRefDbSeqCov = 1.0 - float(
                    lenRefDbSeq - lenInstanceSeq) / float(
                        lenRefDbSeq) if lenRefDbSeq else None
                instSampleSeqCov = 1.0 - float(
                    lenEntitySeq - lenInstanceSeq) / float(lenEntitySeq)
                #
                occDL = vD[asymId].get("pdbx_vrpt_instance_results_occ", [])
                # Average the OWAB values for each residue position
                owabRegD = {}
                if occDL:
                    owabD = {}
                    for dV in occDL:
                        owabD.setdefault(dV["label_seq_id"],
                                         []).append(dV["OWAB"])
                    #
                    # logger.info("owabD %r" % owabD)
                    meanOwabD = {k: mean(v) for k, v in owabD.items()}
                    meanOwab = mean(meanOwabD.values())
                    stdevOwab = stdev(meanOwabD.values())
                    #
                    logger.debug(
                        ">> Length of B values list %d mean %.3f stdev %.3f",
                        len(meanOwabD), meanOwab, stdevOwab)
                    #
                    meanOwabA = np.array(list(meanOwabD.values()))
                    #
                    # Flag residues whose mean OWAB exceeds twice the chain mean
                    condition = meanOwabA > (2.0 * meanOwab)
                    regL = self.__contiguousRegions(condition)
                    for ii, (start, stop) in enumerate(regL, 1):
                        segment = meanOwabA[start:stop]
                        logger.debug(
                            "B value range =  start %d stop %d min %.3f max %.3f",
                            start, stop, segment.min(), segment.max())
                        owabRegD[ii] = {
                            # half-open interval [start, stop)
                            "length": stop - start,
                            "occ_min": segment.min(),
                            "occ_max": segment.max()
                        }


                #
                # Group the modeled residue positions into contiguous segments
                segL = [
                    list(map(itemgetter(1), g))
                    for _, g in groupby(enumerate(lsL), lambda x: x[0] - x[1])
                ]
                logger.debug("Modeled sequence length %d segments %d",
                             len(lsL), len(segL))
                #
                gapD = {}
                for ii in range(1, len(segL)):
                    bG = segL[ii - 1][-1]
                    eG = segL[ii][0]
                    gapD[ii] = eG - bG - 1
                    logger.debug("Gap %d length %d", ii, gapD[ii])
                #
                #
                if instRefDbSeqCov:
                    logger.debug(
                        "Summary %s %s %s refCov %.2f sampleCov %.2f - gaps (%d) %r owab segments (%d) %r",
                        entryId,
                        entityId,
                        asymId,
                        instRefDbSeqCov,
                        instSampleSeqCov,
                        len(gapD),
                        list(gapD.values()),
                        len(owabRegD),
                        list(owabRegD.values()),
                    )
                else:
                    logger.debug(
                        "Summary %s %s %s sampleCov %.2f - gaps (%d) %r owab segments (%d) %r",
                        entryId,
                        entityId,
                        asymId,
                        instSampleSeqCov,
                        len(gapD),
                        list(gapD.values()),
                        len(owabRegD),
                        list(owabRegD.values()),
                    )
                #
                analD[asymId] = {
                    "coverage_inst_refdb": instRefDbSeqCov,
                    "coverage_inst_entity": instSampleSeqCov,
                    "gapD": copy.copy(gapD),
                    "owabRegiond": copy.copy(owabRegD)
                }
                logger.debug("entry %s entity %s analD %r", entryId, entityId,
                             analD)
        except Exception as e:
            logger.exception("%s failing with %s", entryId, str(e))
        #
        return analD
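
    # Gap computation sketch (illustrative values, not from the original source):
    # for modeled label_seq_ids [2, 3, 4, 8, 9, 15] the contiguous segments are
    # [2-4], [8-9], and [15], giving inter-segment gaps of 3 (positions 5-7)
    # and 5 (positions 10-14), i.e. gapD = {1: 3, 2: 5}.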

    def __getSegments(self, values):
        """Log the start/stop indices and extrema of regions where the absolute
        value of the input series is below 1 (diagnostic helper)."""
        xV = np.asarray(values)
        condition = np.abs(xV) < 1
        for start, stop in self.__contiguousRegions(condition):
            segment = xV[start:stop]
            logger.debug("start %d stop %d min %.3f max %.3f", start, stop,
                         segment.min(), segment.max())

    def __contiguousRegions(self, condition):
        """Find contiguous True regions of the boolean array "condition".

        Returns a 2D array where the first column is the start index of the region and the
        second column is the end index (half-open).

        """

        # Find the indices of changes in "condition"; cast to int first because
        # modern numpy does not support subtracting boolean arrays
        dV = np.diff(condition.astype(np.int8))
        (idx, ) = dV.nonzero()

        # We need to start things after the change in "condition". Therefore,
        # we'll shift the index by 1 to the right.
        idx += 1

        if condition[0]:
            # If the start of condition is True prepend a 0
            idx = np.r_[0, idx]

        if condition[-1]:
            # If the end of condition is True, append the length of the array
            idx = np.r_[idx, condition.size]

        # Reshape the result into two columns
        idx.shape = (-1, 2)
        return idx

    def __window(self, seq, num=2):
        """Returns a sliding window (of width n) over data from the iterable
        s -> (s0,s1,...s[n-1]), (s1,s2,...,sn), ...
        """
        it = iter(seq)
        result = tuple(islice(it, num))
        if len(result) == num:
            yield result
        for elem in it:
            result = result[1:] + (elem, )
            yield result

    def missingElements(self, lV):
        missing = chain.from_iterable(
            range(x + 1, y) for x, y in self.__window(lV) if (y - x) > 1)
        return list(missing)
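
    # A doctest-style sketch of the two helpers above (illustrative, not executed):
    #   list(self.__window([3, 4, 7, 8, 11]))   -> [(3, 4), (4, 7), (7, 8), (8, 11)]
    #   self.missingElements([3, 4, 7, 8, 11])  -> [5, 6, 9, 10]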

    def __fetchUniprot(self, uniProtId):
        baseUrl = "http://www.uniprot.org"
        wsEndPoint = "/uniprot/"
        fS = ""
        try:
            fullUrl = baseUrl + wsEndPoint + uniProtId + ".fasta"
            result = requests.get(fullUrl)
            if result.ok:
                fL = result.text.split("\n")
                fS = "".join(fL[1:])
            else:
                logger.error("UniProt Fasta request for %s returns status %r",
                             uniProtId, result.status_code)
        except Exception as e:
            logger.error("Failing request for %s with %s", uniProtId, str(e))
        return fS
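
# A self-contained sketch of the contiguous-region technique used by
# __contiguousRegions above; the function name here is illustrative and only
# numpy is assumed.
import numpy as np

def contiguous_regions(condition):
    """Return half-open (start, stop) index pairs for runs of True in condition."""
    dV = np.diff(condition.astype(np.int8))  # int cast avoids boolean subtraction
    (idx,) = dV.nonzero()
    idx += 1  # shift to the element after each change
    if condition[0]:
        idx = np.r_[0, idx]  # run starts at the first element
    if condition[-1]:
        idx = np.r_[idx, condition.size]  # run extends through the last element
    return idx.reshape(-1, 2)

# Runs of True in [F, T, T, F, T] -> [[1, 3], [4, 5]]
print(contiguous_regions(np.array([False, True, True, False, True])))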
Example #14
class ChEMBLTargetMechanismProvider(StashableBase):
    """Accessors for ChEMBL target mechanism data."""
    def __init__(self, cachePath, useCache):
        #
        self.__cachePath = cachePath
        self.__dirName = "ChEMBL-target-mechanism"
        super(ChEMBLTargetMechanismProvider,
              self).__init__(self.__cachePath, [self.__dirName])
        self.__dirPath = os.path.join(self.__cachePath, self.__dirName)
        self.__mU = MarshalUtil(workPath=self.__cachePath)
        baseVersion = 28
        self.__version = baseVersion
        logger.info("ChEMBL API MAX_LIMIT %r", Settings.Instance().MAX_LIMIT)  # pylint: disable=no-member
        self.__aD = self.__reload(self.__dirPath, useCache)

    def testCache(self, minCount=0):
        if minCount == 0:
            return True
        if self.__aD and (len(self.__aD) > minCount):
            logger.info("Mechanism data for (%d) targets", len(self.__aD))
            return True
        return False

    def getAssignmentVersion(self):
        return self.__version

    def getTargetMechanismDataPath(self):
        return os.path.join(self.__dirPath,
                            "chembl-target-mechanism-data.json")

    def __reload(self, dirPath, useCache):
        startTime = time.time()
        aD = {}
        fU = FileUtil()
        fU.mkdir(dirPath)
        targetMechanismFilePath = self.getTargetMechanismDataPath()
        #
        if useCache and fU.exists(targetMechanismFilePath):
            logger.info("useCache %r using %r", useCache,
                        targetMechanismFilePath)
            qD = self.__mU.doImport(targetMechanismFilePath, fmt="json")
            aD = qD["mechanism"] if "mechanism" in qD else {}
        #
        logger.info("Completed reload of (%d) at %s (%.4f seconds)", len(aD),
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    time.time() - startTime)
        #
        return aD

    def getTargetMechanisms(self, targetChEMBLId):
        try:
            return self.__aD.get(targetChEMBLId, [])
        except Exception:
            return []

    def hasTargetMechanism(self, targetChEMBLId):
        try:
            return targetChEMBLId in self.__aD
        except Exception:
            return False

    def fetchTargetMechanismData(self,
                                 targetChEMBLIdList,
                                 skipExisting=True,
                                 chunkSize=50):
        """Get cofactor mechanism data for the input ChEMBL target list.

        Args:
            targetChEMBLIdList (list): list of ChEMBL target identifiers
            skipExisting (bool, optional): reuse any existing cached data (default: True)
            chunkSize(int, optional): ChEMBL API batch size for fetches (default: 50)

        Returns:
          bool:  True for success or False otherwise

        """
        atL = [
            "action_type",
            "molecule_chembl_id",
            "mechanism_of_action",
            "max_phase",
            "target_chembl_id",
        ]
        targetD = self.__aD if self.__aD else {}
        if skipExisting:
            idList = [tId for tId in targetChEMBLIdList if tId not in self.__aD]
        else:
            idList = targetChEMBLIdList

        numToProcess = len(idList)
        logger.info("Fetching mechanism data for (%d/%d)", numToProcess,
                    len(targetChEMBLIdList))
        ok = False
        try:
            for ii in range(0, len(idList), chunkSize):
                logger.info("Begin chunk at ii %d/%d", ii, numToProcess)
                mch = new_client.mechanism  # pylint: disable=no-member
                mch.set_format("json")
                mDL = mch.filter(
                    target_chembl_id__in=idList[ii:ii + chunkSize]).only(atL)

                logger.info("Results (%d)", len(mDL))
                if mDL:
                    for mD in mDL:
                        targetD.setdefault(mD["target_chembl_id"], []).append(
                            self.__mechanismSelect(atL, mD))
                #
                logger.info("Completed chunk starting at (%d)", ii)
                tS = datetime.datetime.now().isoformat()
                vS = datetime.datetime.now().strftime("%Y-%m-%d")
                ok = self.__mU.doExport(self.getTargetMechanismDataPath(), {
                    "version": vS,
                    "created": tS,
                    "mechanism": targetD
                },
                                        fmt="json",
                                        indent=3)
                logger.info("Wrote completed chunk starting at (%d) (%r)", ii,
                            ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok

    def __mechanismSelect(self, atL, aD):
        return {at: aD.get(at) for at in atL}
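
# A hedged usage sketch for the provider above; the cache path and ChEMBL
# target identifiers are illustrative, and live ChEMBL API access is assumed.
if __name__ == "__main__":
    ctmP = ChEMBLTargetMechanismProvider(cachePath=".", useCache=True)
    if ctmP.fetchTargetMechanismData(["CHEMBL1987", "CHEMBL3120"]):
        for mD in ctmP.getTargetMechanisms("CHEMBL1987"):
            print(mD["mechanism_of_action"], mD["max_phase"])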
Example #15
class ValidationReportSchemaUtilsTests(unittest.TestCase):
    def setUp(self):
        self.__dirPath = os.path.join(os.path.dirname(TOPDIR), "rcsb",
                                      "mock-data")
        self.__xsdPath = os.path.join(HERE, "test-data",
                                      "wwpdb_validation_v004.xsd")
        self.__dictPath = os.path.join(HERE, "test-output",
                                       "vrpt_mmcif_ext_v4.dic")
        self.__dictStaticPath = os.path.join(HERE, "test-data",
                                             "em_validation_ext_v4.dic")
        #
        # This schema mapping file is used by the XML report data file reader.
        self.__dictionaryMapPath = os.path.join(HERE, "test-output",
                                                "vrpt_dictmap_v4.json")
        self.__dictionaryMapCsvPath = os.path.join(HERE, "test-output",
                                                   "vrpt_dictmap_v4.csv")
        self.__mU = MarshalUtil()

    def tearDown(self):
        pass

    def testProcessXsdSchema(self):
        vrsu = ValidationReportSchemaUtils()
        sObj = vrsu.readSchema(self.__xsdPath, verbose=False)
        logger.debug("Returns type %r", type(sObj))
        logger.debug("Schema category length %d", len(sObj))
        ok = self.__mU.doExport(os.path.join(HERE, "test-output",
                                             "schema-object.json"),
                                sObj,
                                fmt="json",
                                indent=3)
        self.assertTrue(ok)

        # import static definitions -
        scL = self.__mU.doImport(self.__dictStaticPath, fmt="mmcif-dict")
        logger.info("Static definition count %d", len(scL))
        #
        cL = vrsu.buildDictionary(sObj)
        logger.info("Generated definition count %d", len(cL))
        #
        cL.extend(scL)
        ok = self.__mU.doExport(self.__dictPath, cL, fmt="mmcif-dict")
        self.assertTrue(ok)
        #
        dictionaryMap = vrsu.getDictionaryMap(sObj)
        ok = self.__mU.doExport(self.__dictionaryMapPath,
                                dictionaryMap,
                                fmt="json")
        self.assertTrue(ok)
        #
        self.assertTrue("attributes" in dictionaryMap)
        self.assertTrue(len(dictionaryMap["attributes"]) > 420)

    def testExportMapping(self):
        """Export schema correspondences as CSV."""
        vrsu = ValidationReportSchemaUtils()
        sObj = vrsu.readSchema(self.__xsdPath)
        dictionaryMap = vrsu.getDictionaryMap(sObj)
        logger.info("Attribute count %d", len(dictionaryMap["attributes"]))
        rL = []
        for ky, dD in dictionaryMap["attributes"].items():
            kyL = ky.split("|")
            catN = kyL[0]
            atN = kyL[1]
            row = {
                "xml_el": catN,
                "xml_at": atN,
                "mmcif_cat": dD["cat"],
                "mmcif_at": dD["at"]
            }
            rL.append(row)
        #
        ok = self.__mU.doExport(self.__dictionaryMapCsvPath, rL, fmt="csv")
        self.assertTrue(ok)
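
# A minimal sketch of the "category|attribute" key convention unpacked in
# testExportMapping above (the key value is illustrative):
ky = "bond_outliers|atom0"
catN, atN = ky.split("|")[:2]
print(catN, atN)  # -> bond_outliers atom0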
Example #16
    def buildSearchFiles(self, **kwargs):
        """Build cif, sdf (optional), and mol2 files for components in the chemical component search index.
           Exclude ions or other extraneous molecules lacking bonds.

        Args:
            ccUrlTarget (str): locator for source chemical component dictionary (default: full public dictionary)
            birdUrlTarget (str): locator for source BIRD dictionary (default: full public dictionary)
            limitPerceptions (bool): restrict automatic perceptions in OE molecular build operations (default: False)
            numProc (int): number of processors
            useCache (bool): use existing resource file where possible (default: True)
            molLimit (str):  limit the number to ingested chemical compont (default: None)
            quietFlag (bool): suppress output in OE library operations (default: True)

        Returns:
            (int): number molfiles generated
        """
        cachePath = self.__cachePath
        ccUrlTarget = kwargs.get("ccUrlTarget", None)
        birdUrlTarget = kwargs.get("birdUrlTarget", None)
        molLimit = kwargs.get("molLimit", None)
        quietFlag = kwargs.get("quietFlag", True)
        fpTypeList = kwargs.get("fpTypeList", [])
        screenTypeList = kwargs.get("screenTypeList", [])
        ccFileNamePrefix = "cc-%s" % self.__prefix if self.__prefix else "cc-full"
        oeFileNamePrefix = "oe-%s" % self.__prefix if self.__prefix else "oe-cc-full"
        numProc = kwargs.get("numProc", 2)
        minCount = kwargs.get("minCount", 0)
        useCache = kwargs.get("useCache", True)
        useSdf = kwargs.get("useSdf", True)
        useMol2 = kwargs.get("useMol2", False)
        limitPerceptions = kwargs.get("limitPerceptions", False)
        logSizes = False
        #
        startTime = time.time()
        ccmP = ChemCompMoleculeProvider(cachePath=cachePath,
                                        useCache=useCache,
                                        ccFileNamePrefix=ccFileNamePrefix,
                                        ccUrlTarget=ccUrlTarget,
                                        birdUrlTarget=birdUrlTarget,
                                        molLimit=molLimit)
        ok = ccmP.testCache(minCount=minCount, logSizes=logSizes)
        logger.info(
            "Completed chemical component provider load %r (%.4f seconds)", ok,
            time.time() - startTime)
        #
        startTime = time.time()
        oesmp = OeSearchMoleculeProvider(
            ccUrlTarget=ccUrlTarget,
            birdUrlTarget=birdUrlTarget,
            cachePath=cachePath,
            ccFileNamePrefix=ccFileNamePrefix,
            oeFileNamePrefix=oeFileNamePrefix,
            useCache=useCache,
            quietFlag=quietFlag,
            fpTypeList=fpTypeList,
            screenTypeList=screenTypeList,
            numProc=numProc,
            molLimit=molLimit,
            limitPerceptions=limitPerceptions,
        )
        ok = oesmp.testCache()
        logger.info("Completed OE molecule provider load %r (%.4f seconds)",
                    ok,
                    time.time() - startTime)
        #
        startTime = time.time()
        ccSIdxP = ChemCompSearchIndexProvider(
            cachePath=cachePath,
            useCache=useCache,
            ccFileNamePrefix=ccFileNamePrefix,
            limitPerceptions=limitPerceptions,
            numProc=numProc)
        ok = ccSIdxP.testCache()
        logger.info(
            "Completed chemical component search index load %r (%.4f seconds)",
            ok,
            time.time() - startTime)
        #
        ccSIdx = ccSIdxP.getIndex() if ccSIdxP and ok else {}
        logger.info("Search index status %r index length %d", ok, len(ccSIdx))
        #
        ccIdD = {}
        mU = MarshalUtil()
        oeU = OeIoUtils(dirPath=cachePath)
        numMols = 0
        searchFileDirPath = self.getSearchDirFilePath()
        pathTupList = []
        for sId in ccSIdx:
            ccId = sId.split("|")[0]
            # standard CIF definition
            if ccId not in ccIdD:
                cifPath = os.path.join(searchFileDirPath, ccId[0], ccId,
                                       ccId + ".cif")
                if not (useCache and mU.exists(cifPath)):
                    ccMol = ccmP.getMol(ccId)
                    if not self.__checkCif(ccMol):
                        continue
                    mU.doExport(cifPath, [ccMol], fmt="mmcif")
            #
            #
            # Sanity check the generated OE molecule
            #
            oeMol = oesmp.getMol(sId)
            if not self.__checkOeMol(oeMol):
                continue
            cifPath = os.path.join(searchFileDirPath, ccId[0], ccId,
                                   sId + ".cif")
            if sId != ccId and not (useCache and mU.exists(cifPath)):
                oeccU = OeChemCompUtils()
                ok = oeccU.addOeMol(sId,
                                    oeMol,
                                    missingModelXyz=True,
                                    writeIdealXyz=False)
                if ok:
                    oeccU.write(cifPath)

            if useSdf:
                molFilePath = os.path.join(searchFileDirPath, ccId[0], ccId,
                                           sId + ".sdf")
                if not (useCache and mU.exists(molFilePath)):
                    ok = oeU.write(molFilePath,
                                   oeMol,
                                   constantMol=False,
                                   addSdTags=True)
                    if ok:
                        pathTupList.append((sId, molFilePath, "sdf"))
            #
            if useMol2:
                mol2FilePath = os.path.join(searchFileDirPath, ccId[0], ccId,
                                            sId + ".mol2")
                if not (useCache and mU.exists(mol2FilePath)):
                    # Capture the write status so the path is only recorded on success
                    ok = oeU.write(mol2FilePath,
                                   oeMol,
                                   constantMol=False,
                                   addSdTags=True)
                    if ok:
                        pathTupList.append((sId, mol2FilePath, "mol2"))
            numMols += 1
        #
        self.__storePathList(pathTupList)
        return numMols
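
# A hedged invocation sketch for buildSearchFiles(); the enclosing provider
# class is not shown in this excerpt, so "provider" stands in for an instance:
#
#   numMols = provider.buildSearchFiles(numProc=4, useCache=True, molLimit=50)
#   print("Generated %d search molfiles" % numMols)
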
class ChemAxonDescriptorProvider(StashableBase):
    """Utilities to deliver ChemAxon rendered chemical descriptors for chemical component definitions."""
    def __init__(self, **kwargs):
        #
        dirName = "chemaxon"
        # Resolve the cache paths unconditionally so the base class always
        # receives a valid directory
        self.__cachePath = os.path.abspath(kwargs.get("cachePath", "."))
        self.__dirPath = os.path.join(self.__cachePath, dirName)
        super(ChemAxonDescriptorProvider,
              self).__init__(self.__cachePath, [dirName])
        #
        self.__molLimit = kwargs.get("molLimit", 0)
        self.__ccUrlTarget = kwargs.get("ccUrlTarget", None)
        self.__birdUrlTarget = kwargs.get("birdUrlTarget", None)
        useCache = kwargs.get("useCache", True)
        self.__chunkSize = kwargs.get("chunkSize", 100)
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__ccFileNamePrefix = kwargs.get("ccFileNamePrefix", "cc-full")
        self.__version = None
        self.__descrD = self.__reload(useCache)

    def testCache(self, minCount=None):
        if minCount:
            ok = bool(self.__descrD) and len(self.__descrD) >= minCount
        else:
            ok = self.__descrD is not None
        logger.info(
            "Loaded ChemAxon descriptors for (%d) components (success %r)",
            len(self.__descrD) if self.__descrD else 0, ok)
        return ok

    def getDescriptorIndex(self):
        return self.__descrD

    def getIndexFilePath(self):
        return os.path.join(
            self.__dirPath,
            "%s-chemaxon-descriptors.json" % self.__ccFileNamePrefix)

    def getVersion(self):
        return self.__version

    def __reload(self, useCache):
        """Reload or created Chemaxon descriptor mapping index.

        Args:
            cachePath (str): path to the directory containing cache files
            chunkSize (int, optional): number of SMILES per request. Defaults to 100.

         Returns:
            (dict): chemical component data containers for each indexed chemical component
        """
        #
        descrD = {}
        descrFilePath = self.getIndexFilePath()
        #
        if not (useCache and self.__mU.exists(descrFilePath)):
            url = "https://raw.githubusercontent.com/rcsb/py-rcsb_exdb_assets/master/fall_back/CHEMAXON/cc-full-chemaxon-descriptors.json"
            _ = self.__fetchUrl(url, self.__dirPath)
        #
        _, fExt = os.path.splitext(descrFilePath)
        descrFormat = "json" if fExt == ".json" else "pickle"
        if self.__mU.exists(descrFilePath):
            dD = self.__mU.doImport(descrFilePath, fmt=descrFormat)
            descrD = dD["smiles"]
            self.__version = dD["version"]
        #
        return descrD

    def __fetchUrl(self, urlTarget, dirPath, useCache=False):
        fU = FileUtil()
        fn = fU.getFileName(urlTarget)
        filePath = os.path.join(dirPath, fn)
        if not (useCache and fU.exists(filePath)):
            startTime = time.time()
            ok2 = fU.get(urlTarget, filePath)
            endTime = time.time()
            if ok2:
                logger.info(
                    "Fetched %s for resource file %s (status = %r) (%.4f seconds)",
                    urlTarget, filePath, ok2, endTime - startTime)
            else:
                logger.error(
                    "Failing fetch for %s for resource file %s (status = %r) (%.4f seconds)",
                    urlTarget, filePath, ok2, endTime - startTime)
        #
        return filePath

    def buildDescriptors(self):
        descrFilePath = self.getIndexFilePath()
        ccidxP = ChemCompIndexProvider(
            ccUrlTarget=self.__ccUrlTarget,
            birdUrlTarget=self.__birdUrlTarget,
            cachePath=self.__cachePath,
            useCache=True,
            molLimit=self.__molLimit,
            ccFileNamePrefix=self.__ccFileNamePrefix,
        )
        ok = ccidxP.testCache()
        if ok:
            ccIdList = ccidxP.getIdList()
            self.__descrD = self.__fetchDescriptors(ccIdList,
                                                    ccidxP,
                                                    chunkSize=self.__chunkSize)
            tS = datetime.datetime.now().isoformat()
            vS = datetime.datetime.now().strftime("%Y-%m-%d")
            self.__version = vS
            dD = {"created": tS, "version": vS, "smiles": self.__descrD}
            ok = self.__mU.doExport(descrFilePath, dD, fmt="json", indent=3)
            logger.info("Stored %s descriptors for %d components (status=%r) ",
                        descrFilePath, len(self.__descrD), ok)

    def updateDescriptors(self, useCache=True):
        ccidxP = ChemCompIndexProvider(
            ccUrlTarget=self.__ccUrlTarget,
            birdUrlTarget=self.__birdUrlTarget,
            cachePath=self.__cachePath,
            useCache=useCache,
            molLimit=None,
            ccFileNamePrefix=self.__ccFileNamePrefix,
        )
        ok = ccidxP.testCache()
        if ok:
            ccIdList = ccidxP.getIdList()
            curIdList = list(self.__descrD.keys())
            updIdList = list(set(ccIdList) - set(curIdList))
            if updIdList:
                logger.info(
                    "Updating Chemaxon descriptors for (%d) components",
                    len(updIdList))
                uD = self.__fetchDescriptors(updIdList,
                                             ccidxP,
                                             chunkSize=self.__chunkSize)
                self.__descrD.update(uD)
                descrFilePath = self.getIndexFilePath()
                tS = datetime.datetime.now().isoformat()
                vS = datetime.datetime.now().strftime("%Y-%m-%d")
                self.__version = vS
                dD = {"created": tS, "version": vS, "smiles": self.__descrD}
                ok = self.__mU.doExport(descrFilePath,
                                        dD,
                                        fmt="json",
                                        indent=3)
        #
        return ok

    def __fetchDescriptors(self, ccIdList, ccidxP, chunkSize=100):
        """Fetch transformed SMILES descriptors from the ChemAxon webservice.

            Args:
                ccIdList (list, str): chemical component identifier list
                ccidxP (object): instance of the ChemCompIndexProvider()
                chunksize (int, optional): number of SMILES per request. Defaults to 100.

            Returns:
                (dict): dictionary {<ccId>: [<transformed SMILES>, ...], ...}

        Example API parameter data:
                            {
                            "errorHandlingMode": "FAIL_ON_ERROR",
                            "inputParams": "smiles",
                            "outputParams": "smiles",
                            "structures": [
                                "CC(C)[C@H](N)C=O",
                                "CC[C@H](C)[C@H](N)C=O",
                                "CC(C)C[C@H](N)C=O"
                            ]
                            }

        Example query:
        curl -X POST "https://jchem-microservices.chemaxon.com/jwsio/rest-v1/molconvert/batch" -H "accept: */*"
               -H "Content-Type: application/json" -d "{ \"errorHandlingMode\": \"FAIL_ON_ERROR\", \"inputParams\": \"smiles\",
               \"outputParams\": \"mrv\", \"structures\": [ \"CC(C)[C@H](N)C=O\", \"CC[C@H](C)[C@H](N)C=O\", \"CC(C)C[C@H](N)C=O\" ]}"
        """
        descrD = {}
        smilesCcIdD = {}
        smilesD = {}
        for ccId in ccIdList:
            smiL = list(
                set(
                    ccidxP.getSMILES(ccId,
                                     smiTypeList=[
                                         "oe-iso-smiles", "oe-smiles",
                                         "cactvs-iso-smiles", "cactvs-smiles"
                                     ])))
            smilesCcIdD.setdefault(ccId, []).extend(smiL)
            for smi in smiL:
                smilesD.setdefault(smi, []).append(ccId)
        #
        logger.info("Translating (%d) SMILES for components (%d)",
                    len(smilesD), len(smilesCcIdD))
        # ----
        smiKeyL = list(smilesD.keys())
        smiLL = [
            smiKeyL[i:i + chunkSize]
            for i in range(0, len(smiKeyL), chunkSize)
        ]
        # ---
        baseUrl = "https://jchem-microservices.chemaxon.com"
        endPoint = "jwsio/rest-v1/molconvert/batch"
        # hL = [("Accept", "application/json"), ("Content-Type", "application/json")]
        hD = {"Accept": "application/json", "Content-Type": "application/json"}
        try:
            pD = {
                "errorHandlingMode": "SKIP_ERROR",
                "inputParams": "smiles",
                "outputParams": "smiles"
            }
            #
            iCount = 0
            for smiL in smiLL:
                iCount += 1
                ureq = UrlRequestUtil()
                pD["structures"] = smiL
                logger.debug("pD %r", pD)
                rDL, retCode = ureq.postUnWrapped(
                    baseUrl,
                    endPoint,
                    pD,
                    headers=hD,
                    sendContentType="application/json",
                    returnContentType="application/json")
                logger.debug("API result (%r) %r", retCode, rDL)
                if rDL and len(rDL) == len(smiL):
                    for ii, rD in enumerate(rDL):
                        if "structure" in rD and rD.get("successful"):
                            if smiL[ii] == rD["structure"]:
                                continue
                            for ccId in smilesD[smiL[ii]]:
                                if ccId in descrD and rD["structure"] in descrD[ccId]:
                                    continue
                                if rD["structure"] in smilesCcIdD[ccId]:
                                    continue
                                descrD.setdefault(ccId, []).append(rD["structure"])
                else:
                    logger.info("Chunk %d failed (%d)", iCount,
                                len(rDL) if rDL else 0)
                if iCount % 10 == 0:
                    logger.info("Completed processing chunk (%d/%d)", iCount,
                                len(smiLL))

            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return descrD
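
# The chunking idiom used by __fetchDescriptors generalizes; a minimal
# standard-library sketch (the function name is illustrative):
def chunked(items, chunkSize):
    """Yield successive chunkSize-length slices of a list."""
    for i in range(0, len(items), chunkSize):
        yield items[i:i + chunkSize]

print(list(chunked(list("abcdefg"), 3)))  # -> [['a', 'b', 'c'], ['d', 'e', 'f'], ['g']]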
class ChEMBLTargetProviderTests(unittest.TestCase):
    skipFull = True

    def setUp(self):
        self.__cachePath = os.path.join(HERE, "test-output", "CACHE")
        self.__fastaPath = os.path.join(HERE, "test-output",
                                        "chembl-targets.fa")
        self.__taxonPath = os.path.join(HERE, "test-output",
                                        "chembl-targets-taxon.tdd")
        self.__dataPath = os.path.join(HERE, "test-data")
        self.__mU = MarshalUtil(workPath=self.__cachePath)

    def tearDown(self):
        pass

    def testFetchChEMBLTargets(self):
        try:
            ctP = ChEMBLTargetProvider(cachePath=self.__cachePath,
                                       useCache=False)
            ok = ctP.testCache()
            self.assertTrue(ok)
            ok = ctP.exportFasta(self.__fastaPath,
                                 self.__taxonPath,
                                 addTaxonomy=False)
            self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testFetchActivityData(self):
        try:
            logger.info("MAX_LIMIT %r", Settings.Instance().MAX_LIMIT)  # pylint: disable=no-member
            ctP = ChEMBLTargetProvider(cachePath=self.__cachePath,
                                       useCache=True)
            ok = ctP.testCache()
            self.assertTrue(ok)
            # P43088|CHEMBL1987|9606
            # P08243|uniprotId|CHEMBL3120|chemblId|9606|taxId
            tL = ["CHEMBL1987", "CHEMBL3120"]
            targetD = ctP.getActivityData(tL)
            ok = self.__mU.doExport(os.path.join(
                self.__cachePath, "ChEMBL-targets",
                "chembl-target-activity.json"),
                                    targetD,
                                    fmt="json",
                                    indent=3)
            self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testFetchMechanismData(self):
        oD = {}
        try:
            logger.info("MAX_LIMIT %r", Settings.Instance().MAX_LIMIT)  # pylint: disable=no-member
            ctP = ChEMBLTargetProvider(cachePath=self.__cachePath,
                                       useCache=True)
            ok = ctP.testCache()
            self.assertTrue(ok)
            # P43088|CHEMBL1987|9606
            # P08243|uniprotId|CHEMBL3120|chemblId|9606|taxId
            tL = ["CHEMBL1987", "CHEMBL3120"]
            oD.update(ctP.getMechanismData(tL))
            #
            ok = self.__mU.doExport(os.path.join(
                self.__cachePath, "ChEMBL-targets",
                "chembl-target-mechanism.json"),
                                    oD,
                                    fmt="json",
                                    indent=3)
            self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    #
    @unittest.skipIf(skipFull, "Very long test")
    def testFetchChEMBLTargetsWithTax(self):
        try:
            ctP = ChEMBLTargetProvider(cachePath=self.__cachePath,
                                       useCache=True)
            ok = ctP.testCache()
            self.assertTrue(ok)
            ok = ctP.exportFasta(self.__fastaPath,
                                 self.__taxonPath,
                                 addTaxonomy=True)
            self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
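
# A minimal runner sketch for the tests above (standard unittest machinery;
# the suite selection is illustrative):
def suiteFetchChEMBLTargets():
    suiteSelect = unittest.TestSuite()
    suiteSelect.addTest(ChEMBLTargetProviderTests("testFetchChEMBLTargets"))
    suiteSelect.addTest(ChEMBLTargetProviderTests("testFetchActivityData"))
    return suiteSelect

if __name__ == "__main__":
    unittest.TextTestRunner(verbosity=2).run(suiteFetchChEMBLTargets())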
Example #19
    def __writeModel(self, targetId, targetPath, fitFD, fitXyzMapD, fitAtomUnMappedL, matchObj, modelId, modelPath):
        """Write the chemical component model for the input chemical component Id and associated atom mapping and
        feature details --

            ComponentAtomDetails = namedtuple("ComponentAtomDetails", "index atNo name aType x y z fCharge")
            AlignAtomMap = namedtuple("AlignAtomMap", "refId refAtIdx refAtNo refAtName fitId fitAtIdx fitAtNo fitAtName")
            AlignAtomUnMapped = namedtuple("AlignAtomUnMapped", "fitId fitAtIdx fitAtNo fitAtType fitAtName fitAtFormalCharge x y z fitNeighbors")
        """
        try:
            unMappedTypeD = defaultdict(int)
            hAtomPrefix = "HEX"
            variantType = self.__getBuildVariant(targetId)
            #
            if not self.__testUnMappedProtonation(fitAtomUnMappedL):
                logger.info("Unmapped non-hydrogen atoms target %r model %r unMapped count (%d)", targetId, modelId, len(fitAtomUnMappedL))
                return False, variantType
            # Get atom partners for the unmapped atoms
            fitAtMapD = {}
            for refAtName, fAtTup in fitXyzMapD.items():
                fitAtMapD[fAtTup.atName] = refAtName
            if fitAtomUnMappedL:
                #  Check if neighbors are all mapped
                ok = True
                for fitUnTup in fitAtomUnMappedL:
                    for nAtName in fitUnTup.fitNeighbors:
                        if nAtName not in fitAtMapD:
                            ok = False
                            logger.info("Missing mapped neighbor for %r target %r model %r", nAtName, targetId, modelId)
                            break
                if not ok:
                    return False, variantType
                else:
                    logger.debug("%s match has unmapped protonation", modelId)
                    variantType = "tautomer_protomer"
            #
            #
            kList = ["xyz", "SMILES", "SMILES_STEREO", "InChI", "InChIKey"]
            for k in kList:
                if k not in fitFD:
                    logger.error("Fit feature dictionary for %s missing key %s", targetId, k)
                    return False, variantType
            # ------------
            dataContainer = DataContainer(modelId)
            #
            mU = MarshalUtil(workPath=self.__cachePath)
            myContainerList = mU.doImport(targetPath, fmt="mmcif")
            myContainer = myContainerList[0]
            dbName = myContainer.getName()
            if dbName.upper() != targetId.upper():
                logger.info("mismatch datablock (%r) and targetId (%r)", dbName, targetId)
            cObj = None
            if myContainer.exists("chem_comp"):
                cObj = myContainer.getObj("chem_comp")
            #
            #
            catName = "pdbx_chem_comp_model"
            if not dataContainer.exists(catName):
                dataContainer.append(DataCategory(catName, attributeNameList=["id", "comp_id"]))
            #
            parentId = targetId.split("|")[0]
            wObj = dataContainer.getObj(catName)
            wObj.setValue(modelId, "id", 0)
            wObj.setValue(parentId, "comp_id", 0)
            #
            # --------  ---------
            catName = "pdbx_chem_comp_model_atom"
            if not dataContainer.exists(catName):
                dataContainer.append(
                    DataCategory(catName, attributeNameList=["model_id", "atom_id", "type_symbol", "charge", "model_Cartn_x", "model_Cartn_y", "model_Cartn_z", "ordinal_id"])
                )
            wObj = dataContainer.getObj(catName)
            #
            if myContainer.exists("chem_comp_atom"):
                cObj = myContainer.getObj("chem_comp_atom")
            #
            #  Only write the mapped atoms in case we are missing hydrogens in the mapping
            #
            jj = 0
            for ii in range(cObj.getRowCount()):
                atName = cObj.getValue("atom_id", ii)
                atType = cObj.getValue("type_symbol", ii)
                if atName not in fitXyzMapD:
                    unMappedTypeD[atType] += 1
                    continue
                fitXyz = fitXyzMapD[atName]
                #
                # fCharge = cObj.getValue("charge", ii)
                #
                wObj.setValue(modelId, "model_id", jj)
                wObj.setValue(atName, "atom_id", jj)
                wObj.setValue(atType, "type_symbol", jj)
                #
                wObj.setValue(fitXyz.atFormalCharge, "charge", jj)
                wObj.setValue("%.4f" % fitXyz.x, "model_Cartn_x", jj)
                wObj.setValue("%.4f" % fitXyz.y, "model_Cartn_y", jj)
                wObj.setValue("%.4f" % fitXyz.z, "model_Cartn_z", jj)
                wObj.setValue(jj + 1, "ordinal_id", jj)
                jj += 1
            #
            # Add the unmapped atoms ...
            # AlignAtomUnMapped = namedtuple("AlignAtomUnMapped", "fitId fitAtIdx fitAtNo fitAtType fitAtName fitNeighbors")
            ii = wObj.getRowCount()
            for jj, uTup in enumerate(fitAtomUnMappedL):
                refAtomName = hAtomPrefix + str(jj)
                wObj.setValue(modelId, "model_id", ii)
                wObj.setValue(refAtomName, "atom_id", ii)
                wObj.setValue(uTup.fitAtType, "type_symbol", ii)
                wObj.setValue(uTup.fitAtFormalCharge, "charge", ii)
                wObj.setValue("%.4f" % uTup.x, "model_Cartn_x", ii)
                wObj.setValue("%.4f" % uTup.y, "model_Cartn_y", ii)
                wObj.setValue("%.4f" % uTup.z, "model_Cartn_z", ii)
                wObj.setValue(ii + 1, "ordinal_id", ii)
                # Advance the row index so each unmapped atom gets its own row
                ii += 1
            # --------  ---------
            catName = "pdbx_chem_comp_model_bond"
            if not dataContainer.exists(catName):
                dataContainer.append(DataCategory(catName, attributeNameList=["model_id", "atom_id_1", "atom_id_2", "value_order", "ordinal_id"]))
            wObj = dataContainer.getObj(catName)
            #
            if myContainer.exists("chem_comp_bond"):
                cObj = myContainer.getObj("chem_comp_bond")
            #
            jj = 0
            for ii in range(cObj.getRowCount()):
                at1 = cObj.getValue("atom_id_1", ii)
                if at1 not in fitXyzMapD:
                    continue
                at2 = cObj.getValue("atom_id_2", ii)
                if at2 not in fitXyzMapD:
                    continue
                bType = cObj.getValue("value_order", ii)
                #
                wObj.setValue(modelId, "model_id", jj)
                wObj.setValue(at1, "atom_id_1", jj)
                wObj.setValue(at2, "atom_id_2", jj)
                wObj.setValue(bType, "value_order", jj)
                wObj.setValue(jj + 1, "ordinal_id", jj)
                jj += 1
            #
            ii = wObj.getRowCount()
            for jj, uTup in enumerate(fitAtomUnMappedL):
                at1 = hAtomPrefix + str(jj)
                for nAt in uTup.fitNeighbors:
                    at2 = fitAtMapD[nAt]
                    wObj.setValue(modelId, "model_id", ii)
                    wObj.setValue(at1, "atom_id_1", ii)
                    wObj.setValue(at2, "atom_id_2", ii)
                    wObj.setValue("SING", "value_order", ii)
                    wObj.setValue(ii + 1, "ordinal_id", ii)
                    # Advance the row index for each added bond
                    ii += 1

            # --------  ---------
            catName = "pdbx_chem_comp_model_descriptor"
            if not dataContainer.exists(catName):
                dataContainer.append(DataCategory(catName, attributeNameList=["model_id", "type", "descriptor"]))
            wObj = dataContainer.getObj(catName)
            #
            ii = 0
            wObj.setValue(modelId, "model_id", ii)
            wObj.setValue("SMILES", "type", ii)
            wObj.setValue(fitFD["SMILES"], "descriptor", ii)
            ii += 1
            wObj.setValue(modelId, "model_id", ii)
            wObj.setValue("SMILES_CANONICAL", "type", ii)
            wObj.setValue(fitFD["SMILES_STEREO"], "descriptor", ii)
            ii += 1
            wObj.setValue(modelId, "model_id", ii)
            wObj.setValue("InChI", "type", ii)
            wObj.setValue(fitFD["InChI"], "descriptor", ii)
            ii += 1
            wObj.setValue(modelId, "model_id", ii)
            wObj.setValue("InChIKey", "type", ii)
            wObj.setValue(fitFD["InChIKey"], "descriptor", ii)
            #
            # --------  ---------
            if matchObj.getIdentifier() is not None:
                catName = "pdbx_chem_comp_model_reference"
                if not dataContainer.exists(catName):
                    dataContainer.append(DataCategory(catName, attributeNameList=["model_id", "db_name", "db_code"]))
                wObj = dataContainer.getObj(catName)
                ii = 0
                wObj.setValue(modelId, "model_id", ii)
                wObj.setValue("CSD", "db_name", ii)
                wObj.setValue(matchObj.getIdentifier(), "db_code", ii)
            #
            featureD = {}
            v = matchObj.getRFactor()
            vS = str(v)
            if v is not None and len(vS) > 0:
                featureD["r_factor"] = "%.3f" % float(v)
            #
            v = matchObj.getTemperature()
            vS = str(v)
            # remove string artifacts from temperature string ...
            if v is not None and len(vS) > 0:
                tV = vS.upper()
                try:
                    if tV.endswith("DEG.C"):
                        tV = tV.replace("AT", "")
                        tV = tV.replace("DEG.C", "")
                        tV = float(tV.strip())
                        tV = tV + 273.15
                    else:
                        tV = tV.replace("AT", "")
                        tV = tV.replace("K", "")
                        tV = float(tV.strip())
                    featureD["experiment_temperature"] = tV
                except Exception as e:
                    logger.exception("Temperature conversion fails for %s (%r): %s", modelId, vS, str(e))
            #
            v = matchObj.getCitationDOI()
            vS = str(v)
            if v is not None and len(vS) > 0:
                featureD["publication_doi"] = v
            #
            v = matchObj.getCsdVersion()
            vS = str(v)
            if v is not None and len(vS) > 0:
                featureD["csd_version"] = v
            #
            if matchObj.getRadiationSource() in ["Neutron"]:
                featureD["neutron_radiation_experiment"] = True
            if matchObj.getHasDisorder() in ["Y"]:
                featureD["has_disorder"] = True
            #
            if len(unMappedTypeD) == 1 and "H" in unMappedTypeD:
                logger.info("model %r heavy_atoms_only", modelId)
                featureD["heavy_atoms_only"] = True
            else:
                featureD["all_atoms_have_sites"] = True
            # --------  ---------
            catName = "pdbx_chem_comp_model_feature"
            if not dataContainer.exists(catName):
                dataContainer.append(DataCategory(catName, attributeNameList=["model_id", "feature_name", "feature_value"]))
            wObj = dataContainer.getObj(catName)
            #
            fKeyList = ["experiment_temperature", "publication_doi", "r_factor", "csd_version"]
            ii = 0
            for fKey in fKeyList:
                if fKey in featureD:
                    wObj.setValue(modelId, "model_id", ii)
                    wObj.setValue(fKey, "feature_name", ii)
                    wObj.setValue(str(featureD[fKey]), "feature_value", ii)
                    ii += 1

            #
            boolKeyList = ["has_disorder", "neutron_radiation_experiment", "heavy_atoms_only", "all_atoms_have_sites"]
            for fKey in boolKeyList:
                if fKey in featureD:
                    if featureD[fKey]:
                        wObj.setValue(modelId, "model_id", ii)
                        wObj.setValue(fKey, "feature_name", ii)
                        wObj.setValue("Y", "feature_value", ii)
                        ii += 1
            #

            if variantType:
                wObj.setValue(modelId, "model_id", ii)
                wObj.setValue(variantType + "_match", "feature_name", ii)
                wObj.setValue("Y", "feature_value", ii)
                ii += 1

            # --------  ---------
            catName = "pdbx_chem_comp_model_audit"
            if not dataContainer.exists(catName):
                dataContainer.append(DataCategory(catName, attributeNameList=["model_id", "action_type", "date"]))
            wObj = dataContainer.getObj(catName)
            #
            ii = 0
            wObj.setValue(modelId, "model_id", ii)
            wObj.setValue("Initial release", "action_type", ii)
            wObj.setValue(self.__getToday(), "date", ii)
            # wObj.setValue('RCSB', 'processing_site',  ii)
            # wObj.setValue('JDW', 'annotator', ii)
            # wObj.setValue('?', 'details', ii)
            #
            ok = mU.doExport(modelPath, [dataContainer], fmt="mmcif")
            return ok, variantType
        except Exception as e:
            logger.exception("Failing for %r %r with %s", targetId, targetPath, str(e))
        return False, ""
Example #20
class PfamProvider(StashableBase):
    """Manage an index of Pfam identifier to description mappings."""
    def __init__(self, **kwargs):
        urlTargetPfam = kwargs.get(
            "urlTargetPfam",
            "ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.clans.tsv.gz"
        )
        urlTargetPfamFB = "https://github.com/rcsb/py-rcsb_exdb_assets/raw/master/fall_back/Pfam/Pfam-A.clans.tsv.gz"
        self.__version = "34.0"
        dirName = "pfam"
        cachePath = kwargs.get("cachePath", ".")
        dirPath = os.path.join(cachePath, dirName)
        super(PfamProvider, self).__init__(cachePath, [dirName])
        useCache = kwargs.get("useCache", True)
        #
        self.__mU = MarshalUtil(workPath=dirPath)
        self.__pfamD = self.__rebuildCache(urlTargetPfam, urlTargetPfamFB,
                                           dirPath, useCache)

        urlTargetMapPfam = kwargs.get(
            "urlTargetMapPfam",
            "ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/database_files/pdb_pfamA_reg.txt.gz"
        )
        urlTargetMapPfamFB = "https://github.com/rcsb/py-rcsb_exdb_assets/raw/master/fall_back/Pfam/pdb_pfamA_reg.txt.gz"
        self.__pfamMapD = self.__rebuildMappingCache(urlTargetMapPfam,
                                                     urlTargetMapPfamFB,
                                                     dirPath, useCache)

    def getVersion(self):
        return self.__version

    def getDescription(self, pfamId):
        """Return the description for the input Pfam identifier

        Args:
            pfamId (str): Pfam identifier

        Returns:
            str: text description of the Pfam domain
        """
        descr = None
        try:
            descr = self.__pfamD[pfamId]
        except Exception:
            pass
        return descr

    def getMapping(self, pdbId):
        """Return the list of Pfam domain assignments for the input PDB identifer along with
        residue level mapping information

        Args:
            pdbId (str): PDB identifier

        Returns:
            list: [{'pfamId': , 'authAsymId":  , 'authSeqBeg': , 'authSeqEnd': 'insertBeg': , 'insertEnd': }, {}, ]
        """
        mapL = []
        try:
            mapL = self.__pfamMapD[pdbId.upper()]
        except Exception:
            pass
        return mapL

    def testCache(self):
        # Check length ...
        logger.info("Length PfamD %d", len(self.__pfamD))
        return (len(self.__pfamD) > 19000) and (len(self.__pfamMapD) > 150000)

    #
    def __rebuildCache(self, urlTargetPfam, urlTargetPfamFB, dirPath,
                       useCache):
        pfamD = {}
        fmt = "json"
        ext = fmt if fmt == "json" else "pic"
        pfamDataPath = os.path.join(dirPath, "pfam-data.%s" % ext)
        #
        logger.debug("Using cache data path %s", dirPath)
        self.__mU.mkdir(dirPath)
        #
        if useCache and self.__mU.exists(pfamDataPath):
            pfamD = self.__mU.doImport(pfamDataPath, fmt=fmt)
            logger.debug("Pfam data length %d", len(pfamD))
        else:
            # ------
            fU = FileUtil()
            logger.info("Fetch data from source %s in %s", urlTargetPfam,
                        dirPath)
            fp = os.path.join(dirPath, fU.getFileName(urlTargetPfam))
            ok = fU.get(urlTargetPfam, fp)
            if not ok:
                fp = os.path.join(dirPath, fU.getFileName(urlTargetPfamFB))
                ok = fU.get(urlTargetPfamFB, fp)
                logger.info("Fetch data fallback fetch status is %r", ok)
            pfamD = self.__getPfamIndex(fp)
            ok = self.__mU.doExport(pfamDataPath, pfamD, fmt=fmt)
            logger.info("Caching %d in %s status %r", len(pfamD), pfamDataPath,
                        ok)
            # ------
        #
        return pfamD

    def __getPfamIndex(self, filePath):
        """Parse Pfam annotation classifications into an identifier-to-description index."""
        pfamD = {}
        encodingD = {"encoding": "ascii"} if sys.version_info[0] < 3 else {}
        rowL = self.__mU.doImport(filePath,
                                  fmt="tdd",
                                  rowFormat="list",
                                  **encodingD)
        for row in rowL:
            try:
                pfamId = row[0].strip().upper()
                idCode = row[3].strip()
                descr = row[4].strip()
                pfamD[pfamId] = descr + " (" + idCode + ")"
            except Exception:
                pass
        #
        return pfamD

    def __rebuildMappingCache(self, urlTargetPfam, urlTargetPfamFB, dirPath,
                              useCache):
        fmt = "json"
        ext = fmt if fmt == "json" else "pic"
        pfamDataPath = os.path.join(dirPath, "pfam-mapping-data.%s" % ext)
        #
        logger.debug("Using cache data path %s", dirPath)
        self.__mU.mkdir(dirPath)
        #
        if useCache and self.__mU.exists(pfamDataPath):
            pfamD = self.__mU.doImport(pfamDataPath, fmt=fmt)
            logger.debug("Pfam mapping data length %d", len(pfamD))
        else:
            # ------
            fU = FileUtil()
            logger.info("Fetch data from source %s in %s", urlTargetPfam,
                        dirPath)
            fp = os.path.join(dirPath, fU.getFileName(urlTargetPfam))
            ok = fU.get(urlTargetPfam, fp)
            if not ok:
                fp = os.path.join(dirPath, fU.getFileName(urlTargetPfamFB))
                ok = fU.get(urlTargetPfamFB, fp)
                logger.info("Fetch data fallback fetch status is %r", ok)
            pfamD = self.__getPfamMapping(fp)
            ok = self.__mU.doExport(pfamDataPath, pfamD, fmt=fmt)
            logger.info("Caching %d in %s status %r", len(pfamD), pfamDataPath,
                        ok)
            # ------
        #
        return pfamD

    def __getPfamMapping(self, filePath):
        """Parse mapping data"""
        pFamMapD = {}
        encodingD = {"encoding": "ascii"} if sys.version_info[0] < 3 else {}
        rowL = self.__mU.doImport(filePath,
                                  fmt="tdd",
                                  rowFormat="list",
                                  **encodingD)
        for row in rowL:
            try:
                pdbId = row[2].strip().upper()
                pfamId = row[3].strip().upper()
                authAsymId = row[5].strip()
                authSeqBeg = int(row[6].strip())
                insertBeg = row[7].strip() if row[7].strip() != "NULL" else None
                authSeqEnd = int(row[8].strip())
                insertEnd = row[9].strip() if row[9].strip() != "NULL" else None
                pFamMapD.setdefault(pdbId, []).append({
                    "pfamId": pfamId,
                    "authAsymId": authAsymId,
                    "authSeqBeg": authSeqBeg,
                    "authSeqEnd": authSeqEnd,
                    "insertBeg": insertBeg,
                    "insertEnd": insertEnd,
                })
            except Exception as e:
                logger.exception("Failing with %r %s", row, str(e))
        #
        logger.info("Pfam mapping data for (%d) entries", len(pFamMapD))
        return pFamMapD
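
    # Example (hypothetical): a parsed mapping record for one PDB entry -
    #
    #   pFamMapD["4HHB"] == [{"pfamId": "PF00042", "authAsymId": "A",
    #                         "authSeqBeg": 6, "authSeqEnd": 112,
    #                         "insertBeg": None, "insertEnd": None}, ...]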


class CARDTargetProvider:
    """Accessors for CARD target assignments."""
    def __init__(self, **kwargs):
        #
        self.__cachePath = kwargs.get("cachePath", ".")
        self.__dirPath = os.path.join(self.__cachePath, "CARD-targets")
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__oD, self.__version = self.__reload(self.__dirPath, **kwargs)
        #

    def testCache(self, minCount=3000):
        if self.__oD and len(self.__oD) > minCount:
            return True
        else:
            return False

    def hasFeature(self, modelId):
        return modelId in self.__oD

    def getFeature(self, modelId, featureKey):
        try:
            return self.__oD[modelId][featureKey]
        except Exception:
            return None

    def getAssignmentVersion(self):
        return self.__version

    def getTargetDataPath(self):
        return os.path.join(self.__dirPath, "card-target-data.json")

    def getCofactorDataPath(self):
        return None

    def __reload(self, dirPath, **kwargs):
        oD = None
        version = None
        startTime = time.time()
        useCache = kwargs.get("useCache", True)
        #
        # CARDDumpUrl = kwargs.get("CARDDumpUrl", "https://card.mcmaster.ca/latest/data/broadstreet-v3.1.0.tar.bz2")
        cardDumpUrl = kwargs.get("CARDDumpUrl",
                                 "https://card.mcmaster.ca/latest/data")
        ok = False
        fU = FileUtil()
        cardDumpFileName = "card-data.tar.bz2"
        cardDumpPath = os.path.join(dirPath, cardDumpFileName)
        cardDumpDirPath = os.path.join(dirPath, "dump")
        #
        fU.mkdir(dirPath)
        cardDataPath = os.path.join(dirPath, "card-select-data.json")
        #
        logger.info("useCache %r CARDDumpPath %r", useCache, cardDumpPath)
        if useCache and self.__mU.exists(cardDataPath):
            qD = self.__mU.doImport(cardDataPath, fmt="json")
            version = qD["version"]
            oD = qD["data"]
        else:
            logger.info("Fetching url %s path %s", cardDumpUrl, cardDumpPath)
            ok = fU.get(cardDumpUrl, cardDumpPath)
            fU.mkdir(cardDumpDirPath)
            fU.uncompress(cardDumpPath, outputDir=cardDumpDirPath)
            fU.unbundleTarfile(os.path.join(cardDumpDirPath,
                                            cardDumpFileName[:-4]),
                               dirPath=cardDumpDirPath)
            logger.info("Completed fetch (%r) at %s (%.4f seconds)", ok,
                        time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                        time.time() - startTime)
            oD, version = self.__parseCardData(
                os.path.join(cardDumpDirPath, "card.json"))
            tS = datetime.datetime.now().isoformat()
            qD = {"version": version, "created": tS, "data": oD}
            oD = qD["data"]
            ok = self.__mU.doExport(cardDataPath, qD, fmt="json", indent=3)
            logger.info("Export CARD data (%d) status %r", len(oD), ok)
        # ---
        return oD, version

    def exportCardFasta(self, fastaPath, taxonPath):
        ok = self.__exportCardFasta(fastaPath, taxonPath, self.__oD)
        return ok

    def __exportCardFasta(self, fastaPath, taxonPath, cardD):
        """Export a CARD sequence target fasta file

        Args:
            fastaPath (str): fasta output file path
            taxonPath (str): taxon mapping (identifier/taxId) output file path
            cardD (dict): card selected data dictionary

        Returns:
            (bool): True for success or False otherwise
        """
        sD = {}
        taxonL = []
        ok = False
        try:
            for modelId, tD in cardD.items():
                modelBitScore = None
                # aroAcc = tD["accession"]
                aroId = tD["id"]
                if "sequences" not in tD:
                    continue
                modelBitScore = tD["modelBitScore"] if "modelBitScore" in tD else None
                for qD in tD["sequences"]:
                    sId = qD["seqId"]
                    seq = qD["sequence"]
                    taxId = qD["taxId"]
                    cD = {
                        "sequence": seq,
                        "modelId": modelId,
                        "aroId": aroId,
                        "seqId": sId,
                        "taxId": taxId
                    }
                    cD["bitScore"] = modelBitScore if modelBitScore else "-1.0"
                    #
                    cId = ""
                    cL = []
                    for k, v in cD.items():
                        if k in ["sequence"]:
                            continue
                        cL.append(str(v))
                        cL.append(str(k))
                    cId = "|".join(cL)
                    sD[cId] = cD
                    taxonL.append("%s\t%s" % (cId, taxId))

            ok = self.__mU.doExport(fastaPath,
                                    sD,
                                    fmt="fasta",
                                    makeComment=True)
            logger.info("Export CARD fasta (%d) status %r", len(sD), ok)
            ok = self.__mU.doExport(taxonPath, taxonL, fmt="list")
            logger.info("Export Taxon (%d) status %r", len(taxonL), ok)
        except Exception as e:
            logger.exception("Failing for model %r tD %r with %s", modelId, tD,
                             str(e))
        return ok
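
    # Example (hypothetical): the fasta comment identifier (cId) is a "|"-joined
    # run of value|key pairs in dictionary insertion order, resembling -
    #
    #   "37|modelId|36005|aroId|1767|seqId|1280|taxId|300|bitScore"
    #
    # A matching __decodeComment() in downstream providers inverts this encoding.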

    def __parseCardData(self, filePath):
        """Parse CARD target data

        Args:
            filePath (str): card json data file

        Returns:
            (dict, string): card selected data dictionary, card version string
        """
        try:
            oD = {}
            version = None
            cD = self.__mU.doImport(filePath, fmt="json")
            logger.info("CARD model count (%d)", len(cD))
            for modelId, mD in cD.items():
                if modelId.startswith("_"):
                    if modelId == "_version":
                        version = mD
                    continue
                oD[modelId] = {}
                for kTup in [
                    ("ARO_accession", "accession"),
                    ("ARO_id", "id"),
                    ("ARO_name", "name"),
                    ("ARO_description", "descr"),
                    ("model_name", "modelName"),
                    ("model_type", "modelType"),
                ]:
                    if kTup[0] in mD:
                        oD[modelId][kTup[1]] = mD[kTup[0]]

                try:
                    if "model_sequences" in mD:
                        for seqId, tD in mD["model_sequences"][
                                "sequence"].items():
                            oD[modelId].setdefault("sequences", []).append({
                                "seqId":
                                seqId,
                                "sequence":
                                tD["protein_sequence"]["sequence"],
                                "taxId":
                                tD["NCBI_taxonomy"]["NCBI_taxonomy_id"]
                            })
                except Exception as e:
                    logger.exception("Failing with %s", str(e))

                try:
                    if "model_param" in mD and "blastp_bit_score" in mD[
                            "model_param"] and "param_value" in mD[
                                "model_param"]["blastp_bit_score"]:
                        oD[modelId]["modelBitScore"] = mD["model_param"][
                            "blastp_bit_score"]["param_value"]

                except Exception as e:
                    logger.exception("Failing with %s", str(e))

        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return oD, version
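
    # Example (hypothetical): minimal shape of the card.json input parsed above -
    #
    #   {"_version": "3.1.0",
    #    "37": {"ARO_accession": "3000000", "ARO_id": "36005", "ARO_name": "...",
    #           "model_sequences": {"sequence": {"1767": {
    #               "protein_sequence": {"sequence": "MKK..."},
    #               "NCBI_taxonomy": {"NCBI_taxonomy_id": "1280"}}}},
    #           "model_param": {"blastp_bit_score": {"param_value": "300"}}}}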


def main():
    parser = argparse.ArgumentParser()
    #
    parser.add_argument("--mol_list_path",
                        default=None,
                        help="Molecule file list path")
    parser.add_argument("--result_path",
                        default=None,
                        help="Molecule file list path")
    parser.add_argument("--search_type",
                        default=None,
                        help="Search type (similarity|substructure)")
    parser.add_argument("--start_record", default=None, help="Starting record")
    parser.add_argument("--end_record", default=None, help="End record")
    parser.add_argument("--csdhome",
                        default=None,
                        help="Path to the CSD release (path to CSD_202x)")
    parser.add_argument("--python_lib_path",
                        default=None,
                        help="Path to Python library")
    parser.add_argument("--python_version",
                        default=None,
                        help="Python library version (default: 3.7)")
    parser.add_argument(
        "--hit_list_path",
        default=None,
        help="Path to list of molecule identifers with search results")
    #
    args = parser.parse_args()
    #
    try:
        pyLib = args.python_lib_path if args.python_lib_path else os.path.join(
            os.environ["PYENV_ROOT"], "versions", "3.7.9", "lib")
        pyVer = args.python_version if args.python_version else "3.7"

        csdHome = args.csdhome
        molFilePath = args.mol_list_path
        resultPath = args.result_path
        searchType = args.search_type
        startRecord = args.start_record
        endRecord = args.end_record
        hitListPath = args.hit_list_path
    except Exception as e:
        logger.exception("Argument processing problem %s", str(e))
        parser.print_help(sys.stderr)
        sys.exit(1)
    #
    try:
        os.environ["CSDHOME"] = csdHome
        os.environ[
            "LD_LIBRARY_PATH"] = "%s:%s/python%s/site-packages/ccdc/_lib:$LD_LIBRARY_PATH" % (
                pyLib, pyLib, pyVer)
        os.environ[
            "DYLD_LIBRARY_PATH"] = "%s/python%s/site-packages/ccdc/_lib" % (
                pyLib, pyVer)
        os.environ[
            "DYLD_FRAMEWORK_PATH"] = "%s/python%s/site-packages/ccdc/_lib" % (
                pyLib, pyVer)

        logger.info("Using CSDHOME %s", os.environ["CSDHOME"])
        logger.info("Using DYLD_LIBRARY_PATH %s",
                    os.environ["DYLD_LIBRARY_PATH"])
        logger.info("Using DYLD_FRAMEWORK_PATH %s",
                    os.environ["DYLD_FRAMEWORK_PATH"])

        from rcsb.utils.ccdc.CcdcSearch import CcdcSearch  # pylint: disable=import-outside-toplevel

        ccdcS = CcdcSearch(verbose=True)
        pL = ccdcS.getList(molFilePath,
                           startRecord=startRecord,
                           endRecord=endRecord)
        logger.info("Search file %s record length %r", molFilePath,
                    len(pL) if pL else [])
        #
        hitL = []
        for ii, queryTargetPath in enumerate(pL, 1):
            _, fn = os.path.split(queryTargetPath)
            queryTargetId, _ = os.path.splitext(fn)
            #
            logger.info("(%d/%d) Start search for %r %r", ii, len(pL),
                        queryTargetId, queryTargetPath)
            numHits = ccdcS.search(queryTargetId,
                                   queryTargetPath,
                                   resultPath,
                                   searchType=searchType)
            if numHits:
                hitL.append(queryTargetId)
        logger.info("%d searches completed - matched %d", len(pL), len(hitL))
        if hitListPath:
            mU = MarshalUtil()
            ok = mU.doExport(hitListPath, hitL, fmt="list")
            logger.info("Wrote hit list (%r) to %s", ok, hitListPath)
    except Exception as e:
        logger.exception("Failing with %s", str(e))
class DrugBankTargetCofactorProvider(StashableBase):
    """Accessors for DrugBank target cofactors."""
    def __init__(self, **kwargs):
        #
        self.__cachePath = kwargs.get("cachePath", ".")
        self.__useCache = kwargs.get("useCache", True)
        self.__fmt = kwargs.get("fmt", "pickle")
        self.__dirName = "DrugBank-cofactors"
        super(DrugBankTargetCofactorProvider,
              self).__init__(self.__cachePath, [self.__dirName])
        self.__dirPath = os.path.join(self.__cachePath, self.__dirName)
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__fD = self.__reload(self.__dirPath, self.__useCache, self.__fmt)
        #

    def testCache(self, minCount=590):
        logger.info("DrugBank feature count %d", len(self.__fD["cofactors"]) if "cofactors" in self.__fD else 0)
        if self.__fD and "cofactors" in self.__fD and len(self.__fD["cofactors"]) > minCount:
            return True
        else:
            return False

    def hasTarget(self, rcsbEntityId):
        return rcsbEntityId.upper() in self.__fD["cofactors"]

    def getTargets(self, rcsbEntityId):
        try:
            return self.__fD["cofactors"][rcsbEntityId.upper()]
        except Exception:
            return []

    def __getCofactorDataPath(self, fmt="json"):
        fExt = "json" if fmt == "json" else "pic"
        return os.path.join(self.__dirPath, "drugbank-cofactor-data.%s" % fExt)

    def reload(self):
        self.__fD = self.__reload(self.__dirPath,
                                  useCache=True,
                                  fmt=self.__fmt)
        return True

    def __reload(self, dirPath, useCache, fmt):
        startTime = time.time()
        fD = {}

        ok = False
        cofactorPath = self.__getCofactorDataPath(fmt=fmt)
        #
        logger.info("useCache %r featurePath %r", useCache, cofactorPath)
        if useCache and self.__mU.exists(cofactorPath):
            fD = self.__mU.doImport(cofactorPath, fmt=fmt)
        else:
            fU = FileUtil()
            fU.mkdir(dirPath)
        # ---
        logger.info("Completed reload (%r) at %s (%.4f seconds)", ok,
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    time.time() - startTime)
        return fD

    def buildCofactorList(self,
                          sequenceMatchFilePath,
                          crmpObj=None,
                          lnmpObj=None):
        """Build target cofactor list for the matching entities in the input sequence match file.

        Args:
            sequenceMatchFilePath (str): sequence match output file path
            crmpObj (obj, optional): instance of ChemRefMappingProviderObj(). Defaults to None
            lnmpObj (obj, optional): instance of LigandNeighborMappingProviderObj(). Defaults to None.

        Returns:
            bool: True for success or False otherwise
        """
        rDL = []
        dbP = DrugBankTargetProvider(cachePath=self.__cachePath, useCache=True)
        mD = self.__mU.doImport(sequenceMatchFilePath, fmt="json")
        #
        provenanceSource = "DrugBank"
        refScheme = "PDB entity"
        assignVersion = str(dbP.getAssignmentVersion())
        for queryId, matchDL in mD.items():
            qCmtD = self.__decodeComment(queryId)
            unpId = qCmtD["uniprotId"]
            queryTaxId = qCmtD["taxId"] if "taxId" in qCmtD else None
            if not dbP.hasCofactor(unpId) or queryTaxId == "-1":
                logger.info("Skipping target %r", unpId)
                continue
            #
            # --
            chemCompNeighborsD = {}
            if lnmpObj:
                for matchD in matchDL:
                    tCmtD = self.__decodeComment(matchD["target"])
                    entryId = tCmtD["entityId"].split("_")[0]
                    entityId = tCmtD["entityId"].split("_")[1]
                    rcsbEntityId = entryId + "_" + entityId
                    chemCompIdList = lnmpObj.getLigandNeighbors(rcsbEntityId)
                    chemCompNeighborsD.update({k: True for k in chemCompIdList})
            # --
            #
            for matchD in matchDL:
                tCmtD = self.__decodeComment(matchD["target"])
                entryId = tCmtD["entityId"].split("_")[0]
                entityId = tCmtD["entityId"].split("_")[1]
                # --
                dbDL = dbP.getCofactors(unpId)
                # --
                cfDL = []
                for dbD in dbDL:
                    cfD = {}
                    cfD["cofactor_id"] = dbD["drugbank_id"]
                    cfD["molecule_name"] = dbD["name"]
                    cfD["target_name"] = dbD["target_name"]
                    # cfD["description"] = dbD["description"]
                    cfD["moa"] = dbD["moa"]
                    # cfD["pharmacology"] = dbD["pharmacology"]
                    cfD["inchi_key"] = dbD["inchi_key"]
                    cfD["smiles"] = dbD["smiles"]
                    cfD["pubmed_ids"] = dbD["pubmed_ids"]
                    cfD = self.__addLocalIds(cfD, crmpObj)
                    #
                    if "chem_comp_id" in cfD and cfD[
                            "chem_comp_id"] in chemCompNeighborsD:
                        cfD["neighbor_in_pdb"] = "Y"
                    else:
                        cfD["neighbor_in_pdb"] = "N"
                    #
                    cfDL.append(cfD)
                # ---
                queryName = cfDL[0]["target_name"] if cfDL and "target_name" in cfDL[0] else None
                # ---
                # aligned_target.entity_beg_seq_id (current target is PDB entity in json)
                # aligned_target.target_beg_seq_id (current query is target seq in json)
                # aligned_target.length
                fpL = []
                if "alignedRegions" in matchD:
                    fpL = [{
                        "entity_beg_seq_id": arD["targetBegin"],
                        "target_beg_seq_id": arD["queryBegin"],
                        "length": arD["targetEnd"] - arD["targetBegin"],
                    } for arD in matchD["alignedRegions"]]
                else:
                    fpL = [{
                        "entity_beg_seq_id": matchD["targetBegin"],
                        "target_beg_seq_id": matchD["queryBegin"],
                        "length": matchD["alignLen"],
                    }]
                # ---
                rD = {
                    "entry_id": entryId,
                    "entity_id": entityId,
                    "query_uniprot_id": unpId,
                    "query_id": unpId,
                    "query_id_type": "DrugBank",
                    "query_name": queryName,
                    "provenance_source": provenanceSource,
                    "reference_scheme": refScheme,
                    "assignment_version": assignVersion,
                    "query_taxonomy_id": int(queryTaxId) if queryTaxId else None,
                    "target_taxonomy_id": int(matchD["targetTaxId"]) if "targetTaxId" in matchD else None,
                    "aligned_target": fpL,
                    "taxonomy_match_status": matchD["taxonomyMatchStatus"] if "taxonomyMatchStatus" in matchD else None,
                    "lca_taxonomy_id": matchD["lcaTaxId"] if "lcaTaxId" in matchD else None,
                    "lca_taxonomy_name": matchD["lcaTaxName"] if "lcaTaxName" in matchD else None,
                    "lca_taxonomy_rank": matchD["lcaRank"] if "lcaRank" in matchD else None,
                    "cofactors": cfDL,
                }
                rDL.append(rD)
        #
        qD = {}
        for rD in rDL:
            eId = rD["entry_id"] + "_" + rD["entity_id"]
            qD.setdefault(eId, []).append(rD)
        fp = self.__getCofactorDataPath(fmt=self.__fmt)
        tS = datetime.datetime.now().isoformat()
        # vS = datetime.datetime.now().strftime("%Y-%m-%d")
        vS = assignVersion
        ok = self.__mU.doExport(fp, {"version": vS, "created": tS, "cofactors": qD}, fmt=self.__fmt, indent=3)
        return ok

    def __addLocalIds(self, cfD, crmpOb=None):
        #
        if crmpOb:
            localIdL = crmpOb.getLocalIds("DRUGBANK", cfD["cofactor_id"])
            if localIdL:
                localId = localIdL[0]
                if localId.startswith("PRD_"):
                    cfD["prd_id"] = localId
                else:
                    cfD["chem_comp_id"] = localId
        return cfD

    def __decodeComment(self, comment, separator="|"):
        dD = {}
        try:
            ti = iter(comment.split(separator))
            dD = {tup[1]: tup[0] for tup in zip(ti, ti)}
        except Exception:
            pass
        return dD
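
    # Example (hypothetical): comments are encoded as value|key pairs, so
    #
    #   __decodeComment("P12345|uniprotId|9606|taxId")
    #
    # returns {"uniprotId": "P12345", "taxId": "9606"}.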
class EntityPolymerExtractorTests(unittest.TestCase):
    def __init__(self, methodName="runTest"):
        super(EntityPolymerExtractorTests, self).__init__(methodName)
        self.__verbose = True

    def setUp(self):
        #
        #
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(TOPDIR, "rcsb", "mock-data", "config",
                                  "dbload-setup-example.yml")
        #
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath,
                                  defaultSectionName=configName,
                                  mockTopPath=self.__mockTopPath)
        #
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__workPath = os.path.join(HERE, "test-output")
        self.__taxonomyDataPath = os.path.join(
            self.__cachePath,
            self.__cfgOb.get("NCBI_TAXONOMY_CACHE_DIR",
                             sectionName=configName))
        #
        self.__cacheKwargs = {"fmt": "json", "indent": 3}
        self.__exdbCacheDirPath = os.path.join(
            self.__cachePath,
            self.__cfgOb.get("EXDB_CACHE_DIR", sectionName=configName))
        #
        self.__mU = MarshalUtil()
        self.__entryLimitTest = 18
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.info("Completed %s at %s (%.4f seconds)\n", self.id(),
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    endTime - self.__startTime)

    def testExtractEntityPolymers(self):
        """Test case - extract entity polymer info"""
        try:
            epe = EntityPolymerExtractor(self.__cfgOb,
                                         exdbDirPath=self.__exdbCacheDirPath,
                                         useCache=False,
                                         cacheKwargs=self.__cacheKwargs,
                                         entryLimit=self.__entryLimitTest)
            eCount = epe.getEntryCount()
            self.assertGreaterEqual(eCount, self.__entryLimitTest)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testAccessEntityPolymerFeatures(self):
        """Test case - access cached entity polymer info from test cache"""
        try:
            epe = EntityPolymerExtractor(self.__cfgOb,
                                         exdbDirPath=self.__exdbCacheDirPath,
                                         useCache=False,
                                         cacheKwargs=self.__cacheKwargs)
            eCount = epe.getEntryCount()
            logger.info("Entry count %d", eCount)
            self.assertGreaterEqual(eCount, self.__entryLimitTest)
            #
            unpL = epe.getRefSeqAccessions("UNP")
            logger.info("Ref seq count %d", len(unpL))
            self.assertGreaterEqual(len(unpL), 1)
            #
            for entryId in ["3RER"]:
                for entityId in ["1"]:
                    uL = epe.getEntityRefSeqAccessions("UNP", entryId,
                                                       entityId)
                    logger.info("UNP for %s %s %r", entryId, entityId, uL)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testTaxonomyReadCache(self):
        """Test case - access cached entity polymer info from test cache"""
        try:
            epe = EntityPolymerExtractor(self.__cfgOb,
                                         exdbDirPath=self.__exdbCacheDirPath,
                                         useCache=False,
                                         cacheKwargs=self.__cacheKwargs)
            logger.info("Cache entry count %d", epe.getEntryCount())
            #
            obsL = []
            tD = epe.getOrigTaxons()
            logger.info("Taxons %d", len(tD))

            tU = TaxonomyProvider(taxDirPath=self.__taxonomyDataPath,
                                  useCache=True)
            #
            for entryId, taxIdL in tD.items():
                for entityId, iTaxId in taxIdL:
                    # logger.info("entryId %r entityId %r taxId %r" % (entryId, entityId, taxId))
                    mTaxId = tU.getMergedTaxId(iTaxId)
                    if iTaxId != mTaxId:
                        obsL.append({
                            "entryId": entryId,
                            "entityId": entityId,
                            "taxId": iTaxId,
                            "replaceTaxId": mTaxId
                        })
            logger.info("Obsolete list length %d", len(obsL))
            self.__mU.doExport(os.path.join(self.__workPath,
                                            "obsolete-taxons.json"),
                               obsL,
                               fmt="json",
                               indent=3)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testAccessEntityPolymerReadCache(self):
        """Test case - access cached entity polymer info from test cache"""
        try:
            epe = EntityPolymerExtractor(self.__cfgOb,
                                         exdbDirPath=self.__exdbCacheDirPath,
                                         useCache=False,
                                         cacheKwargs=self.__cacheKwargs)
            logger.info("Cache entry count %d", epe.getEntryCount())
            cD = epe.countRefSeqAccessions("UNP")
            self.assertGreaterEqual(len(cD), 2)
            logger.info("UNP reference sequences per entity %r",
                        dict(sorted(cD.items())))
            logger.info("Reference sequences per entity %r",
                        dict(sorted(epe.countRefSeqAccessionAny().items())))
            logger.info("Reference sequences per ref db %r",
                        dict(sorted(epe.countRefSeqAccessionDbType().items())))
            #
            ok = epe.checkRefSeqAlignRange("UNP")
            self.assertTrue(ok)
            unpL = epe.getRefSeqAccessions("UNP")
            logger.info("Unique UNP reference sequences %d", len(unpL))
            self.assertTrue(ok)
            tD = epe.getUniqueTaxons()
            logger.info("Unique taxons %d", len(tD))
            tD = epe.countRefSeqAccessionByTaxon("UNP")
            logger.info("Unique taxons %d", len(tD))
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()


class ScanRepoUtil(object):
    """Tools for for scanning repositories and collecting coverage and type data information."""
    def __init__(self,
                 cfgOb,
                 attributeDataTypeD=None,
                 numProc=4,
                 chunkSize=15,
                 fileLimit=None,
                 maxStepLength=2000,
                 workPath=None):
        """
        Args:
            cfgOb (object): Configuration object (rcsb.utils.config.ConfigUtil)

            attributeDataTypeD
            dictPath (str): Path to supporting data dictionary

            numProc (int, optional): Number of parallel worker processes used.
            chunkSize (int, optional): Size of files processed in a single multi-proc process
            fileLimit (int, optional): maximum file scanned or None for no limit
            mockTopPath (str, optional): Path to directory containing mock repositories or None
            maxStepLength (int, optional): maximum number of multi-proc runs to perform
        """
        #
        self.__attributeDataTypeD = attributeDataTypeD if attributeDataTypeD else {}
        # Limit the load length of each file type for testing  -  Set to None to remove -
        self.__fileLimit = fileLimit
        self.__maxStepLength = maxStepLength
        #
        # Controls for multiprocessing execution -
        self.__numProc = numProc
        self.__chunkSize = chunkSize
        #
        self.__cfgOb = cfgOb
        #
        self.__mpFormat = "[%(levelname)s] %(asctime)s %(processName)s-%(module)s.%(funcName)s: %(message)s"

        self.__workPath = workPath
        self.__mU = MarshalUtil(workPath=self.__workPath)
        self.__rpP = RepositoryProvider(self.__cfgOb,
                                        numProc=self.__numProc,
                                        fileLimit=self.__fileLimit,
                                        cachePath=self.__workPath)

    def scanContentType(self,
                        contentType,
                        mergeContentTypes=None,
                        scanType="full",
                        inputPathList=None,
                        scanDataFilePath=None,
                        failedFilePath=None,
                        saveInputFileListPath=None):
        """Driver method for repository scan operation

        Args:
            contentType (str): one of 'bird', 'bird_family', 'bird_chem_comp', 'chem_comp', 'pdbx'
            scanType (str, optional): 'full' [or 'incr' to be supported]
            inputPathList (list, optional):  list of input file paths to scan
            scanDataFilePath (str, optional): file path for serialized scan data (Pickle format)
            failedFilePath (str, optional): file path for list of files that fail scanning operation
            saveInputFileListPath (str, optional): Path to store the file path list that is scanned

        Returns:
            bool: True for success or False otherwise

        """
        try:
            startTime = self.__begin(message="scanning operation")
            #
            locatorObjList = self.__rpP.getLocatorObjList(
                contentType=contentType,
                inputPathList=inputPathList,
                mergeContentTypes=mergeContentTypes)
            #
            if saveInputFileListPath:
                self.__mU.doExport(saveInputFileListPath,
                                   self.__rpP.getLocatorPaths(locatorObjList),
                                   fmt="list")
                logger.debug("Saving %d paths in %s", len(locatorObjList),
                             saveInputFileListPath)
            #
            optD = {}
            optD["contentType"] = contentType
            optD["logSize"] = True
            optD["scanType"] = scanType
            # ---------------- - ---------------- - ---------------- - ---------------- - ---------------- -
            #
            numProc = self.__numProc
            chunkSize = self.__chunkSize if locatorObjList and self.__chunkSize < len(locatorObjList) else 0
            #
            # ---------------- - ---------------- - ---------------- - ---------------- - ---------------- -
            numPaths = len(locatorObjList)
            logger.debug("Processing %d total paths", numPaths)
            numProc = min(numProc, numPaths)
            maxStepLength = self.__maxStepLength
            if numPaths > maxStepLength:
                # Use ceiling division so every sublist stays within maxStepLength
                numLists = (numPaths + maxStepLength - 1) // maxStepLength
                subLists = [locatorObjList[i::numLists] for i in range(numLists)]
            else:
                subLists = [locatorObjList]
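            # Illustrative arithmetic: with numPaths=5000 and maxStepLength=2000,
            # numLists=3 and the stride slices yield sublists of ~1667 paths each,
            # keeping every outer run under the cap.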
            #
            if subLists:
                logger.debug(
                    "Starting with numProc %d outer subtask count %d subtask length ~ %d",
                    numProc, len(subLists), len(subLists[0]))
            #
            numResults = 1
            failList = []
            retLists = [[] for ii in range(numResults)]
            diagList = []
            for ii, subList in enumerate(subLists):
                logger.info("Running outer subtask %d of %d length %d", ii + 1,
                            len(subLists), len(subList))
                #
                mpu = MultiProcUtil(verbose=True)
                mpu.setOptions(optionsD=optD)
                mpu.set(workerObj=self, workerMethod="scanWorker")
                ok, failListT, retListsT, diagListT = mpu.runMulti(
                    dataList=subList,
                    numProc=numProc,
                    numResults=numResults,
                    chunkSize=chunkSize)
                failList.extend(failListT)
                # retLists is a list of lists -
                logger.debug("status %r fail len %r ret len %r", ok,
                             len(failListT), len(retListsT))
                for jj in range(numResults):
                    retLists[jj].extend(retListsT[jj])
                diagList.extend(diagListT)
            logger.debug("Scan failed path list %r", failList)
            logger.debug(
                "Scan path list success length %d load list failed length %d",
                len(locatorObjList), len(failList))
            logger.debug("Returned metadata length %r", len(retLists[0]))
            #
            if failedFilePath and failList:
                wOk = self.__mU.doExport(failedFilePath,
                                         self.__rpP.getLocatorPaths(failList),
                                         fmt="list")
                logger.debug("Writing scan failure path list to %s status %r",
                             failedFilePath, wOk)
            #
            if scanType == "incr":
                scanDataD = self.__mU.doImport(scanDataFilePath,
                                               fmt="pickle",
                                               default=None)
                logger.debug("Imported scan data with keys %r",
                             list(scanDataD.keys()))
            else:
                scanDataD = {}
            #
            if scanDataFilePath and retLists[0]:
                for ssTup in retLists[0]:
                    cId = ssTup.containerId
                    if scanType == "full" and cId in scanDataD:
                        logger.error("Duplicate container id %s in %r and %r",
                                     cId, ssTup.fromPath,
                                     scanDataD[cId].fromPath)
                    #
                    scanDataD[cId] = ssTup

                ok = self.__mU.doExport(scanDataFilePath,
                                        scanDataD,
                                        fmt="pickle")
                tscanDataD = self.__mU.doImport(scanDataFilePath, fmt="pickle")
                ok = tscanDataD == scanDataD

            self.__end(startTime, "scanning operation with status " + str(ok))

            #
            return ok
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return False

    def evalScan(self,
                 scanDataFilePath,
                 evalJsonFilePath,
                 evalType="data_type"):

        scanDataD = self.__mU.doImport(scanDataFilePath, fmt="pickle")
        if evalType in ["data_type"]:
            rD = self.__evalScanDataType(scanDataD)
        elif evalType in ["data_coverage"]:
            rD, _ = self.__evalScanDataCoverage(scanDataD)
        else:
            logger.debug("Unknown evalType %r", evalType)
        ok = self.__mU.doExport(evalJsonFilePath, rD, fmt="json")

        return ok

    def evalScanItem(self, scanDataFilePath, evalFilePath):
        scanDataD = self.__mU.doImport(scanDataFilePath, fmt="pickle")
        _, cL = self.__evalScanDataCoverage(scanDataD)
        ok = self.__mU.doExport(evalFilePath, cL, fmt="list")
        return ok

    def __evalScanDataType(self, scanDataD):
        """
        ScanValue = collections.namedtuple('ScanValue', 'containerId, catName, atName, minWidth, maxWidth, minPrec, maxPrec')
        ScanSummary = collections.namedtuple('ScanSummary', 'containerId, fromPath, scanDate, scanCategoryDict')

        """
        # for populated sD[category] -> d[atName]->{minWidth: , maxWidth:, minPrec:, maxPrec: , count}
        sD = {}
        for cId in scanDataD:
            ssTup = scanDataD[cId]
            dD = ssTup.scanCategoryDict
            for catName in dD:
                if catName not in sD:
                    sD[catName] = {}
                for svTup in dD[catName]:
                    if svTup.atName not in sD[catName]:
                        sD[catName][svTup.atName] = {
                            "minWidth": svTup.minWidth,
                            "maxWidth": svTup.maxWidth,
                            "minPrec": svTup.minPrec,
                            "maxPrec": svTup.maxPrec,
                            "count": 1
                        }
                        continue
                    sD[catName][svTup.atName]["minWidth"] = min(
                        sD[catName][svTup.atName]["minWidth"], svTup.minWidth)
                    sD[catName][svTup.atName]["maxWidth"] = max(
                        sD[catName][svTup.atName]["maxWidth"], svTup.maxWidth)
                    sD[catName][svTup.atName]["minPrec"] = min(
                        sD[catName][svTup.atName]["minPrec"], svTup.minPrec)
                    sD[catName][svTup.atName]["maxPrec"] = max(
                        sD[catName][svTup.atName]["maxPrec"], svTup.maxPrec)
                    sD[catName][svTup.atName]["count"] += 1
        return sD

    def __evalScanDataCoverage(self, scanDataD):
        """
        ScanValue = collections.namedtuple('ScanValue', 'containerId, catName, atName, minWidth, maxWidth, minPrec, maxPrec')
        ScanSummary = collections.namedtuple('ScanSummary', 'containerId, fromPath, scanDate, scanCategoryDict')

        """

        # for populated sD[category] -> d[atName]->{count: #, instances: [id,id,id]}
        sD = {}
        for cId in scanDataD:
            ssTup = scanDataD[cId]
            dD = ssTup.scanCategoryDict
            for catName in dD:
                if catName not in sD:
                    sD[catName] = {}
                for svTup in dD[catName]:
                    if svTup.atName not in sD[catName]:
                        sD[catName][svTup.atName] = {
                            "count": 0,
                            "instances": []
                        }
                    sD[catName][svTup.atName]["instances"].append(
                        svTup.containerId)
                    sD[catName][svTup.atName]["count"] += 1
        cL = []
        for catName, aD in sD.items():
            for atName, tD in aD.items():
                cL.append("%s\t%s" %
                          ("_" + catName + "." + atName, tD["count"]))
        return sD, cL
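
    # Example (hypothetical): shapes of the coverage results returned above -
    #
    #   sD["cell"]["length_a"] == {"count": 2, "instances": ["1ABC", "2XYZ"]}
    #   cL contains lines such as "_cell.length_a\t2"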

    def scanWorker(self, dataList, procName, optionsD, workingDir):
        """Multi-proc worker method for scanning repository data files-"""
        try:
            _ = workingDir
            startTime = self.__begin(message=procName)
            # Recover common options

            scanType = optionsD["scanType"]
            contentType = optionsD["contentType"]
            #
            successList = []
            retList = []

            containerList = self.__getContainerList(dataList)
            for container in containerList:
                ret = self.__scanContainer(container)
                successList.append(ret.fromPath)
                retList.append(ret)
            #

            logger.debug(
                "%s scanType %s contentType %s pathlist length %d containerList length %d",
                procName, scanType, contentType, len(dataList),
                len(containerList))

            ok = len(successList) == len(dataList)
            #
            self.__end(startTime, procName + " with status " + str(ok))
            return successList, retList, []

        except Exception as e:
            logger.error("Failing with dataList %r", dataList)
            logger.exception("Failing with %s", str(e))

        return [], [], []

    def __getContainerList(self, locatorObjList):
        """Build containers for the input locator objects and append a load status category to each."""
        utcnow = datetime.datetime.utcnow()
        ts = utcnow.strftime("%Y-%m-%d:%H:%M:%S")
        cL = []
        for loc in locatorObjList:
            myContainerList = self.__rpP.getContainerList([loc])
            lPathL = self.__rpP.getLocatorPaths([loc])
            for cA in myContainerList:
                dc = DataCategory("rcsb_load_status",
                                  ["name", "load_date", "locator"],
                                  [[cA.getName(), ts, lPathL[0]]])
                logger.debug("data category %r", dc)
                cA.append(dc)
                cL.append(cA)
        return cL

    def __scanContainer(self, container):
        """Scan the input container and summarize the width and precision of
        populated values for each category and attribute.
        """
        cName = container.getName()
        loadStatusObj = container.getObj("rcsb_load_status")
        lName = loadStatusObj.getValue(attributeName="name", rowIndex=0)
        lFilePath = loadStatusObj.getValue(attributeName="locator", rowIndex=0)
        lDate = loadStatusObj.getValue(attributeName="load_date", rowIndex=0)
        #
        oD = {}
        for objName in container.getObjNameList():
            if objName == "rcsb_load_status":
                continue
            obj = container.getObj(objName)
            afD = self.__attributeDataTypeD[objName] if objName in self.__attributeDataTypeD else {}
            atNameList = obj.getAttributeList()
            wMin = {atName: 100000 for atName in atNameList}
            wMax = {atName: -1 for atName in atNameList}
            pMin = {atName: 100000 for atName in atNameList}
            pMax = {atName: -1 for atName in atNameList}
            for row in obj.getRowList():
                for ii, val in enumerate(row):
                    valLen = len(val)
                    if (valLen == 0) or (val == "?") or (val == "."):
                        continue
                    atName = atNameList[ii]
                    wMin[atName] = min(wMin[atName], valLen)
                    wMax[atName] = max(wMax[atName], valLen)
                    if atName in afD and afD[atName] == "float":
                        vPrec = 0
                        try:
                            fields = val.split(".")
                            vPrec = len(fields[1])
                            pMin[atName] = min(pMin[atName], vPrec)
                            pMax[atName] = max(pMax[atName], vPrec)
                        except Exception as e:
                            logger.debug("Failed to process float %s %r %r %s",
                                         atName, val, vPrec, str(e))
                            pMin[atName] = 0
                            pMax[atName] = 0
                        logger.debug("Got float for %s %r %r", atName, val,
                                     vPrec)
                    else:
                        pMin[atName] = 0
                        pMax[atName] = 0

            # ScanValue - containerId, catName, atName, minWidth, maxWidth, minPrec, maxPrec
            oD[objName] = [
                ScanValue(cName, objName, atN, wMin[atN], wMax[atN], pMin[atN],
                          pMax[atN]) for atN in wMax if wMax[atN] != -1
            ]
        # ScanSummary containerId, fromPath, scanCategoryDict
        #
        ret = ScanSummary(lName, lFilePath, lDate, oD)
        #
        return ret
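
    # Illustrative example: for a float attribute value "1.234", valLen is 5, so
    # the ScanValue records width 5 and precision 3 (the length of the
    # fractional field "234").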

    def __begin(self, message=""):
        startTime = time.time()
        ts = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        logger.debug("Starting %s at %s", message, ts)
        return startTime

    def __end(self, startTime, message=""):
        endTime = time.time()
        ts = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        delta = endTime - startTime
        logger.debug("Completed %s at %s (%.4f seconds)", message, ts, delta)