def __init__(self, **kwargs):
     """Set up cache locations and load the chemical component index.

     Keyword Args:
         cachePath (str, optional): top-level cache directory (default ".")
         ccFileNamePrefix (str, optional): chemical component index file name prefix (default "cc")
     """
     self.__cachePath = kwargs.get("cachePath", ".")
     self.__dirPath = os.path.join(self.__cachePath, "chem_comp")
     self.__ccFileNamePrefix = kwargs.get("ccFileNamePrefix", "cc")
     self.__mU = MarshalUtil(workPath=self.__dirPath)
     # Load (or rebuild) the component index using the same keyword options
     self.__ccIdxD = self.__reload(**kwargs)
예제 #2
0
    def getCCDefFile(self, ccFilePath, molBuildType="model-xyz", suppressHydrogens=False):
        """Fetch the molecule definition (ccPath) and build OE molecules
        for comparison.

        Args:
            ccFilePath (str): path to a chemical component definition file (mmCIF)
            molBuildType (str, optional): source data used to build the OE molecule (default "model-xyz")
            suppressHydrogens (bool, optional): return the molecule with hydrogens suppressed (default False)

        Returns:
            (tuple): (ccId, OE molecule, dictionary of molecule descriptors/features)
        """
        #
        mU = MarshalUtil(workPath=self.__workPath)
        rdCcObjL = mU.doImport(ccFilePath, fmt="mmcif")
        oemf = OeMoleculeFactory()
        if not self.__verbose:
            oemf.setQuiet()
        # The first data container holds the component definition
        ccId = oemf.setChemCompDef(rdCcObjL[0])
        oemf.build(molBuildType=molBuildType)

        if self.__verbose:
            logger.info("  CCId               = %s", ccId)
            logger.info("  Title              = %s", oemf.getTitle())
            logger.info("  SMILES             = %s", oemf.getCanSMILES())
            logger.info("  SMILES (stereo)    = %s", oemf.getIsoSMILES())
            logger.info("  Formula (Hill)     = %s", oemf.getFormula())
            logger.info("  InChI key          = %s", oemf.getInChIKey())
            logger.info("  InChI              = %s", oemf.getInChI())

        # Fix: removed a dead "fD = {}" assignment that was immediately overwritten.
        fD = {"Formula": oemf.getFormula(), "SMILES": oemf.getCanSMILES(), "SMILES_STEREO": oemf.getIsoSMILES(), "InChI": oemf.getInChI(), "InChIKey": oemf.getInChIKey()}

        if suppressHydrogens:
            tMol = oemf.getGraphMolSuppressH()
        else:
            tMol = oemf.getMol()

        fD["OEMOL"] = tMol
        # NOTE(review): coordinates are always taken from the "model" set even when
        # molBuildType differs — confirm this is the intended behavior.
        fD["xyz"] = oemf.getAtomDetails(xyzType="model")

        return (ccId, tMol, fD)
    def __init__(self, **kwargs):
        """Utilities build and deliver OE molecules for search applications. Source molecular
           definitions are taken from SMILES descriptors generated by ChemCompSearchIndexProvider()

        Args:
            cachePath (str, optional): path to the directory containing cache files (default: '.')
            ccFileNamePrefix (str, optional) file name prefix for chemical component search index (default: "cc")
            oeFileNamePrefix (str, optional) file name prefix for all generated databases (default: "oe")

        """
        # Database file names will be prefixed with base prefix plus the molecular build type and perception options
        oeFileNamePrefixBase = kwargs.get("oeFileNamePrefix", "oe")
        # Fix: default is "cc" per the docstring and the search index provider convention;
        # the previous default of "oe" pointed at the wrong index files.
        self.__ccFileNamePrefix = kwargs.get("ccFileNamePrefix", "cc")
        limitPerceptions = kwargs.get("limitPerceptions", False)
        if limitPerceptions:
            self.__oeFileNamePrefix = oeFileNamePrefixBase + "-limit"
        else:
            self.__oeFileNamePrefix = oeFileNamePrefixBase
        #
        cachePath = kwargs.get("cachePath", ".")
        self.__dirPath = os.path.join(cachePath, "oe_mol")
        #
        # Search database handles/caches (populated by __reload())
        self.__fpDbD = {}
        self.__ssDb = None
        self.__oeMolD = {}
        self.__oeMolDb = None
        self.__oeMolDbTitleD = None
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__reload(**kwargs)
 def __init__(self, **kwargs):
     """Load current repository holdings content and identifier lists.

     Keyword Args:
         holdingsDirPath (str, optional): directory for cached holdings files (default ".")
         useCache (bool, optional): use any cached files (default True)
         baseUrl (str, optional): base URL for holdings data files
         currentTargetUrl (str, optional): URL for the current holdings content file
         fallbackTargetUrl (str, optional): fallback URL for the holdings content file
         currentIdsTargetUrl (str, optional): URL for the current PDB id list file
         fallbackIdsTargetUrl (str, optional): fallback URL for the PDB id list file
     """
     self.__dirPath = kwargs.get("holdingsDirPath", ".")
     useCache = kwargs.get("useCache", True)
     baseUrl = kwargs.get(
         "baseUrl",
         "https://raw.githubusercontent.com/rcsb/py-rcsb_exdb_assets/development/fall_back/holdings/"
     )
     # Fix: the four targets previously all read the same kwarg key ("currentTargetUrl"),
     # so a caller overriding it would also clobber the fallback and id-list URLs.
     urlTargetContent = kwargs.get(
         "currentTargetUrl",
         os.path.join(baseUrl, "current_holdings.json.gz"))
     urlFallbackTargetContent = kwargs.get(
         "fallbackTargetUrl",
         os.path.join(baseUrl, "current_holdings.json.gz"))
     #
     urlTargetIds = kwargs.get(
         "currentIdsTargetUrl", os.path.join(baseUrl,
                                             "current_pdb_ids.json.gz"))
     urlFallbackTargetIds = kwargs.get(
         "fallbackIdsTargetUrl", os.path.join(baseUrl,
                                              "current_pdb_ids.json.gz"))
     #
     self.__mU = MarshalUtil(workPath=self.__dirPath)
     self.__invD = self.__reloadEntryContent(urlTargetContent,
                                             urlFallbackTargetContent,
                                             self.__dirPath,
                                             useCache=useCache)
     self.__idD = self.__reloadEntryIds(urlTargetIds,
                                        urlFallbackTargetIds,
                                        self.__dirPath,
                                        useCache=useCache)
예제 #5
0
 def testExtractAndSerialize(self):
     """Extract an example sequence cluster data set and serialize the three result views."""
     try:
         cdp = ClusterDataPrep(workPath=self.__workPath)
         cifD, docBySequenceD, docByClusterD = cdp.extract(
             self.__dataSetId,
             clusterSetLocator=self.__pathClusterData,
             levels=self.__levels,
             clusterType="entity")
         mU = MarshalUtil(workPath=self.__workPath)
         # Export each view as JSON and verify each export succeeds
         exportPlan = [
             (self.__pathSaveStyleCif, cifD),
             (self.__pathSaveStyleDocSequence, docBySequenceD),
             (self.__pathSaveStyleDocCluster, docByClusterD),
         ]
         for savePath, obj in exportPlan:
             self.assertTrue(mU.doExport(savePath, obj, fmt="json", indent=3))
     except Exception as e:
         logger.exception("Failing with %s", str(e))
         self.fail()
 def setUp(self):
     """Configure mock data paths, the configuration object, and cache locations for the tests."""
     configName = "site_info_configuration"
     self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
     configPath = os.path.join(self.__mockTopPath, "config", "dbload-setup-example.yml")
     self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=self.__mockTopPath)
     #
     self.__cachePath = os.path.join(TOPDIR, "CACHE")
     self.__workPath = os.path.join(HERE, "test-output")
     taxonomyDirName = self.__cfgOb.get("NCBI_TAXONOMY_CACHE_DIR", sectionName=configName)
     self.__taxonomyDataPath = os.path.join(self.__cachePath, taxonomyDirName)
     #
     self.__cacheKwargs = {"fmt": "json", "indent": 3}
     exdbDirName = self.__cfgOb.get("EXDB_CACHE_DIR", sectionName=configName)
     self.__exdbCacheDirPath = os.path.join(self.__cachePath, exdbDirName)
     #
     self.__mU = MarshalUtil()
     self.__entryLimitTest = 18
     #
     self.__startTime = time.time()
     logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
 def __getApi(self, dictLocators, **kwargs):
     """Return a DictionaryApi instance assembled from the input dictionary locator list.

     Keyword Args:
         consolidate (bool, optional): consolidate dictionary content (default True)
         replaceDefinitions (bool, optional): replace duplicate definitions (default True)
         verbose (bool, optional): verbose output (default True)
     """
     consolidate = kwargs.get("consolidate", True)
     replaceDefinition = kwargs.get("replaceDefinitions", True)
     verbose = kwargs.get("verbose", True)
     #
     dApi = None
     # Refresh the local dictionary cache before assembling the API
     if self.__reload(dictLocators, self.__dirPath, useCache=self.__useCache):
         mU = MarshalUtil()
         containerList = []
         for dictLocator in dictLocators:
             cachedDictPath = os.path.join(self.__dirPath, self.__fileU.getFileName(dictLocator))
             containerList.extend(mU.doImport(cachedDictPath, fmt="mmcif-dict"))
         dApi = DictionaryApi(containerList=containerList, consolidate=consolidate, replaceDefinition=replaceDefinition, verbose=verbose)
     return dApi
예제 #8
0
    def testSearchExec(self):
        """Test case:  search cli"""
        try:
            mL = glob.glob(os.path.join(self.__molFileDirPath, "*"))
            logger.info("search list length %d", len(mL))
            mU = MarshalUtil()
            ok = mU.doExport(self.__queryListFilePath, mL, fmt="list")
            # Fix: verify the query list export before launching the search process;
            # previously this status was silently overwritten by the runShell() result.
            self.assertTrue(ok)
            exU = ExecUtils()
            logger.info("Executing shell for %s", self.__queryListFilePath)
            cmdPath = os.path.join(TOPDIR, "rcsb", "utils", "ccdc",
                                   "CcdcSearchExec.py")

            logger.info("cmdPath %r", cmdPath)
            ok = exU.runShell(
                "%s %s --mol_list_path %s --result_path %s --search_type %s --csdhome %s"
                % (self.__pythonBinPath, cmdPath, self.__queryListFilePath,
                   self.__ssResultPath, "substructure", self.__csdHome),
                outPath=self.__logPath,
                outAppend=False,
                timeOut=60,
                suppressStderr=False,
            )
            self.assertTrue(ok)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
예제 #9
0
 def __init__(self, filePath, **kwargs):
     """Initialize provider state and process the input file.

     Args:
         filePath (str): input data file path
         verbose (bool, optional): enable warnings for missing values (default False)
     """
     self.__filePath = filePath
     # Quiet by default — suppresses warnings for missing values
     self.__verbose = kwargs.get("verbose", False)
     self.__tD = {}
     self.__mU = MarshalUtil()
     # Bypass mode is entered when the input file cannot be processed
     self.__byPassMode = not self.__setup(self.__filePath)
예제 #10
0
    def __init__(self, **kwargs):
        """Utilities to build and deliver OE molecule databases from PDB chemical component definition data.

        Args:
            cachePath (str, optional): path to the directory containing cache files (default: '.')
            molBuildType (str, optional): data source for building OE molecules (default: "model-xyz")
            oeFileNamePrefix (str, optional): file name prefix for all generated databases (default: "oe")
        """
        # Database file names are prefixed with the base prefix plus the molecular
        # build type and perception options.
        prefixBase = kwargs.get("oeFileNamePrefix", "oe")
        limitPerceptions = kwargs.get("limitPerceptions", False)
        molBuildType = kwargs.get("molBuildType", "model-xyz")
        limited = limitPerceptions and molBuildType in ["oe-smiles", "oe-iso-smiles", "inchi"]
        self.__oeFileNamePrefix = prefixBase + "-" + molBuildType + ("-limit" if limited else "")
        #
        self.__dirPath = os.path.join(kwargs.get("cachePath", "."), "oe_mol")
        #
        # Search database handles/caches (populated by __reload())
        self.__fpDbD = {}
        self.__ssDb = None
        self.__oeMolD = {}
        self.__oeMolDb = None
        self.__oeMolDbTitleD = None
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__molCount = self.__reload(**kwargs)
    def setUp(self):
        """Set up configuration, schema provider, and cache paths for the test fixture."""
        self.__verbose = True
        self.__updateId = "2018_25"
        self.__export = False
        #
        configName = "site_info_configuration"
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        #
        self.__cfgOb = ConfigUtil(configPath=self.__pathConfig, defaultSectionName=configName, mockTopPath=self.__mockTopPath)
        self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=True)
        self.__sandboxPath = self.__cfgOb.getPath("RCSB_EXCHANGE_SANDBOX_PATH", sectionName=configName)
        #
        self.__mU = MarshalUtil(workPath=self.__cachePath)

        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
 def __init__(
     self,
     cfgOb,
     databaseName="pdbx_core",
     collectionName="pdbx_core_polymer_entity",
     polymerType="Protein",
     referenceDatabaseName="UniProt",
     provSource="PDB",
     maxChunkSize=100,
     fetchLimit=None,
     **kwargs
 ):
     """Assemble annotation providers and reload reference sequence assignment data."""
     self.__cfgOb = cfgOb
     self.__polymerType = polymerType
     self.__maxChunkSize = maxChunkSize
     self.__statusList = []
     self.__mU = MarshalUtil()
     #
     # Feature/annotation data providers (Pfam, InterPro, SIFTS, GO, EC)
     self.__pfP = self.__fetchPfamProvider(self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
     self.__ipP = self.__fetchInterProProvider(self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
     self.__ssP = self.__fetchSiftsSummaryProvider(self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
     self.__goP = self.__fetchGoProvider(self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
     self.__ecP = self.__fetchEcProvider(self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
     # Reload the identifier mapping, match, and reference data stores
     self.__refIdMapD, self.__matchD, self.__refD = self.__reload(databaseName, collectionName, polymerType, referenceDatabaseName, provSource, fetchLimit, **kwargs)
예제 #13
0
 def __init__(self, **kwargs):
     """Provider for GlyGen glycan and glycoprotein mapped data.

     Keyword Args:
         cachePath (str, optional): top-level cache directory (default ".")
         useCache (bool, optional): use any cached files (default True)
         glygenBaseUrl (str, optional): primary GlyGen data URL
         glygenFallbackUrl (str, optional): fallback data URL
     """
     dirName = "glygen"
     cachePath = kwargs.get("cachePath", ".")
     self.__dirPath = os.path.join(cachePath, dirName)
     super(GlyGenProvider, self).__init__(cachePath, [dirName])
     useCache = kwargs.get("useCache", True)
     #
     # Fix: accept the correctly spelled "glygenBaseUrl" key while still honoring
     # the legacy misspelled "glygenBasetUrl" key for backward compatibility.
     baseUrl = kwargs.get(
         "glygenBaseUrl",
         kwargs.get(
             "glygenBasetUrl",
             "https://data.glygen.org/ln2data/releases/data/v-1.12.3/reviewed/"))
     fallbackUrl = kwargs.get(
         "glygenFallbackUrl",
         "https://raw.githubusercontent.com/rcsb/py-rcsb_exdb_assets/master/fall_back/glygen/"
     )
     #
     self.__mU = MarshalUtil(workPath=self.__dirPath)
     self.__glycanD = self.__reloadGlycans(baseUrl,
                                           fallbackUrl,
                                           self.__dirPath,
                                           useCache=useCache)
     self.__glycoproteinD = self.__reloadGlycoproteins(baseUrl,
                                                       fallbackUrl,
                                                       self.__dirPath,
                                                       useCache=useCache)
예제 #14
0
    def makeSchemaDef(self, databaseName, dataTyping="ANY", saveSchema=False):
        """Build the schema definition for the named database (i.e., the files under 'schema_definitions').

        Args:
            databaseName (str): database name (e.g., 'pdbx_comp_model_core')
            dataTyping (str, optional): application name for the target schema (e.g. ANY, SQL, ...)
            saveSchema (bool, optional): persist the schema to schemaCachePath (default False)

        Returns:
            dict: schema definition dictionary (None on failure)
        """
        schemaDef = None
        try:
            builder = SchemaDefBuild(databaseName, self.__cfgOb, cachePath=self.__cachePath)
            schemaDef = builder.build(dataTyping=dataTyping, encodingType="rcsb")
            if schemaDef and saveSchema:
                locator = self.__getSchemaDefLocator(databaseName, dataTyping=dataTyping)
                savePath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(locator))
                mU = MarshalUtil(workPath=self.__workPath)
                mU.doExport(savePath, schemaDef, fmt="json", indent=3, enforceAscii=False)
        except Exception as e:
            logger.exception("Building schema %s failing with %s", databaseName, str(e))
        return schemaDef
예제 #15
0
 def __init__(self, **kwargs):
     """Initialize the CARD target data cache.

     Keyword Args:
         cachePath (str, optional): top-level cache directory (default ".")
     """
     self.__cachePath = kwargs.get("cachePath", ".")
     self.__dirPath = os.path.join(self.__cachePath, "CARD-targets")
     #
     self.__mU = MarshalUtil(workPath=self.__dirPath)
     # Load the target data and its version from the cache directory
     self.__oD, self.__version = self.__reload(self.__dirPath, **kwargs)
예제 #16
0
    def __init__(self, **kwargs):
        """Provider for Pfam clan data and PDB-to-Pfam mapping data with fallback URLs."""
        dirName = "pfam"
        cachePath = kwargs.get("cachePath", ".")
        dirPath = os.path.join(cachePath, dirName)
        super(PfamProvider, self).__init__(cachePath, [dirName])
        useCache = kwargs.get("useCache", True)
        self.__version = "34.0"
        #
        # Pfam clan data (primary FTP source with GitHub fallback)
        urlTargetPfam = kwargs.get(
            "urlTargetPfam",
            "ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.clans.tsv.gz"
        )
        urlTargetPfamFB = "https://github.com/rcsb/py-rcsb_exdb_assets/raw/master/fall_back/Pfam/Pfam-A.clans.tsv.gz"
        self.__mU = MarshalUtil(workPath=dirPath)
        self.__pfamD = self.__rebuildCache(urlTargetPfam, urlTargetPfamFB, dirPath, useCache)
        #
        # PDB-to-Pfam mapping data (primary FTP source with GitHub fallback)
        urlTargetMapPfam = kwargs.get(
            "urlTargetMapPfam",
            "ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/database_files/pdb_pfamA_reg.txt.gz"
        )
        urlTargetMapPfamFB = "https://github.com/rcsb/py-rcsb_exdb_assets/raw/master/fall_back/Pfam/pdb_pfamA_reg.txt.gz"
        self.__pfamMapD = self.__rebuildMappingCache(urlTargetMapPfam, urlTargetMapPfamFB, dirPath, useCache)
예제 #17
0
    def __init__(self, **kwargs):
        """Wrapper class for batch chemical search/depiction operations.

        Path and prefix data for the wrapper class may be set as keyword arguments
        or as environment variables.

        Args:
            ccUrlTarget (str, optional): path to concatenated chemical component definition file. Defaults to public data file.
            birdUrlTarget (str, optional): path to the concatenated BIRD definition file. Defaults to public data file.
            cachePath (str): path to top-level cache directory used to store search index file dependencies
                             (default environment variable CHEM_SEARCH_CACHE_PATH or ".")
            numProc (int): multi-process cores to reserve. Defaults to 6.
            chunkSize (int): multi-process batch size. Defaults to 50.
        """
        self.__startTime = time.time()
        #
        self.__useCache = kwargs.get("useCache", True)
        self.__numProc = kwargs.get("numProc", 6)
        self.__chunkSize = kwargs.get("chunkSize", 50)
        #
        self.__ccUrlTarget = kwargs.get("ccUrlTarget", None)
        self.__birdUrlTarget = kwargs.get("birdUrlTarget", None)
        # Keyword arguments take precedence over environment variables
        ccPrefixDefault = os.environ.get("CHEM_SEARCH_CC_PREFIX", "cc-full")
        self.__ccFileNamePrefix = kwargs.get("ccFileNamePrefix", ccPrefixDefault)
        cachePathDefault = os.environ.get("CHEM_SEARCH_CACHE_PATH", ".")
        self.__cachePath = kwargs.get("cachePath", cachePathDefault)
        # ---
        self.__mU = MarshalUtil(workPath=self.__cachePath)
        # ---
        self.__ccsw = self.__reload()
예제 #18
0
    def setUp(self):
        """Define input fixture paths, output paths, and remote targets used by the tests."""
        self.__verbose = True
        mockDataPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__pathPdbxDictionaryFile = os.path.join(mockDataPath, "dictionaries", "mmcif_pdbx_v5_next.dic")
        self.__pathJsonTestFile = os.path.join(mockDataPath, "dictionaries", "vrpt_dictmap.json")
        self.__pathIndexFile = os.path.join(mockDataPath, "MOCK_EXCHANGE_SANDBOX", "update-lists", "all-pdb-list")
        self.__pathCifFile = os.path.join(mockDataPath, "MOCK_BIRD_CC_REPO", "0", "PRDCC_000010.cif")
        # self.__locatorCifFile = "https://ftp.wwpdb.org/pub/pdb/data/structures/divided/mmCIF/00/100d.cif.gz"
        self.__locatorCifFileBad = "https://ftp.wwpdb.org/pub/pdb/data/structures/divided/mmCIF/00/100dx.cif.gz"
        self.__locatorCifFile = "https://ftp.wwpdb.org/pub/pdb/data/structures/divided/mmCIF/hr/6hrg.cif.gz"
        #
        self.__workPath = os.path.join(HERE, "test-output")
        self.__pathSaveDictionaryFile = os.path.join(self.__workPath, "mmcif_pdbx_v5_next.dic")
        self.__pathSaveJsonTestFile = os.path.join(self.__workPath, "json-content.json")
        self.__pathSaveIndexFile = os.path.join(self.__workPath, "all-pdb-list")
        self.__pathSaveCifFile = os.path.join(self.__workPath, "cif-content.cif")
        #
        self.__pathFastaFile = os.path.join(mockDataPath, "MOCK_EXCHANGE_SANDBOX", "sequence", "pdb_seq_prerelease.fasta")
        self.__pathSaveFastaFile = os.path.join(self.__workPath, "test-pre-release.fasta")
        #
        self.__urlTarget = "https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz"
        self.__urlTargetBad = "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump-missing.tar.gz"
        #
        self.__mU = MarshalUtil()
        self.__startTime = time.time()
        logger.debug("Running tests on version %s", __version__)
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
예제 #19
0
    def readDefaultDataTypeMap(self, locator, dataTyping="ANY"):
        """Read the data file containing the application default data type mapping.

              data_rcsb_data_type_map
                loop_
                _pdbx_data_type_application_map.application_name
                _pdbx_data_type_application_map.type_code
                _pdbx_data_type_application_map.app_type_code
                _pdbx_data_type_application_map.app_precision_default
                _pdbx_data_type_application_map.app_width_default
                # .... type mapping data ...

        Return (dict):  map[cifType] -> appType, width, precision
                    mapD['cif_type_code'] -> ['application_name', 'app_type_code', 'app_precision_default', 'app_width_default', 'type_code']
        """
        try:
            mapD = {}
            mU = MarshalUtil(workPath=self.__workPath)
            containerList = mU.doImport(locator, fmt="mmcif", enforceAscii=True, useCharRefs=True, raiseExceptions=True)
            for container in containerList:
                if container.getName() != "rcsb_data_type_map":
                    continue
                catObj = container.getObj("pdbx_data_type_application_map")
                for ii in range(catObj.getRowCount()):
                    rowD = catObj.getRowAttributeDict(ii)
                    # Retain only rows targeting the requested application
                    if rowD["application_name"] != dataTyping:
                        continue
                    tD = {k: rowD[k] for k in ["app_type_code", "application_name", "type_code"]}
                    tD.update({k: int(rowD[k]) for k in ["app_precision_default", "app_width_default"]})
                    mapD[rowD["type_code"]] = tD
            return mapD
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return {}
예제 #20
0
 def __parseFasta(self,
                  fastaPath,
                  taxonPath,
                  cachePath,
                  dirPath,
                  addTaxonomy=False):
     """Parse the raw ChEMBL target FASTA file and export a reformatted FASTA file
     (and optionally a sequence taxonomy mapping file).

     Args:
         fastaPath (str): output path for the reformatted FASTA file
         taxonPath (str): output path for the sequence taxonomy mapping file
         cachePath (str): cache directory used by the marshaling and id mapping utilities
         dirPath (str): directory containing the raw input file "chembl_targets_raw.fa.gz"
         addTaxonomy (bool, optional): include NCBI taxonomy assignments (default False)

     Returns:
         (bool): True for success or False otherwise
     """
     # input paths
     chemblTargetRawPath = os.path.join(dirPath, "chembl_targets_raw.fa.gz")
     mU = MarshalUtil(workPath=cachePath)
     oD = {}
     uD = {}
     missTax = 0
     taxonL = []
     try:
         if addTaxonomy:
             # UniProt accession -> NCBI taxon mapping provider (from cache)
             umP = UniProtIdMappingProvider(cachePath)
             umP.reload(useCache=True)
         #
         fD = mU.doImport(chemblTargetRawPath,
                          fmt="fasta",
                          commentStyle="default")
         #
         for seqId, sD in fD.items():
             # Raw comment line appears to carry "<chemblId> ... [<uniprotId>]"
             chemblId = seqId.strip().split(" ")[0].strip()
             unpId = seqId[seqId.find("[") + 1:seqId.find("]")]
             seq = sD["sequence"]
             cD = {
                 "sequence": seq,
                 "uniprotId": unpId,
                 "chemblId": chemblId
             }
             if addTaxonomy:
                 taxId = umP.getMappedId(unpId, mapName="NCBI-taxon")
                 # -1 marks a missing taxonomy assignment
                 cD["taxId"] = taxId if taxId else -1
                 if not taxId:
                     missTax += 1
             #
             # Rebuild the sequence identifier from the non-sequence fields.
             # NOTE(review): each field is emitted value-first then key
             # (e.g. "P12345|uniprotId|...") — confirm downstream consumers
             # expect this ordering.
             seqId = ""
             cL = []
             for k, v in cD.items():
                 if k in ["sequence"]:
                     continue
                 cL.append(str(v))
                 cL.append(str(k))
             seqId = "|".join(cL)
             oD[seqId] = cD
             if addTaxonomy:
                 taxonL.append("%s\t%s" % (seqId, taxId))
             #
             # uniprotId -> [chemblId, ...] accumulator (not exported here)
             uD.setdefault(unpId, []).append(chemblId)
         #
         ok1 = mU.doExport(fastaPath, oD, fmt="fasta", makeComment=True)
         ok3 = True
         if addTaxonomy:
             ok3 = mU.doExport(taxonPath, taxonL, fmt="list")
         # Bitwise AND of the two boolean export statuses
         return ok1 & ok3
     except Exception as e:
         logger.exception("Failing with %s", str(e))
     #
     return False
 def testSubsetBuildMoleculeCacheFiltered(self):
     """Build chemical component definitions restricted to a filtered id subset."""
     missedD = MarshalUtil().doImport(self.__missedIdsPath, fmt="json")
     # Map each filtered component id to True for the build filter
     filterIdD = dict.fromkeys(missedD["filteredIdList"], True)
     self.__testBuildMoleculeCacheFiles(filterIdD=filterIdD, ccFileNamePrefix="cc-filtered")
예제 #22
0
    def reloadDump(self, fmt="json"):
        """Reload PubChem reference data store from saved dump.

        Args:
            fmt (str, optional): format of the backup file (pickle or json). Defaults to "json".

        Returns:
            (int): number of objects restored.
        """
        numUpd = 0
        try:
            # Read from disk backup and update object store -
            if fmt in ["json", "pickle"]:
                # Fix: resolve the dump path for the requested format; previously the
                # path was hardwired to fmt="json" even when restoring a pickle dump.
                fp = self.__getdumpFilePath(fmt=fmt)
                logger.info("Restoring object store from %s", fp)
                mU = MarshalUtil(workPath=self.__dirPath)
                matchD = mU.doImport(fp, fmt=fmt)
                numUpd = self.__reloadDump(
                    matchD,
                    self.__databaseName,
                    self.__matchIndexCollectionName,
                    indexAttributeNames=["rcsb_id", "rcsb_last_update"])
        except Exception as e:
            logger.exception("Failing for %r with %s", self.__dirPath, str(e))
        # --
        return numUpd
예제 #23
0
    def getSchemaInfo(self, databaseName, dataTyping="ANY"):
        """Convenience method to return essential schema details for the input repository content type.

        Args:
            databaseName (str): schema name  (e.g. pdbx, bird, chem_comp, ...)
            dataTyping (str, optional): Application name for the target schema (e.g. ANY, SQL, ...)

        Returns:
            tuple: SchemaDefAccess(object), target database name, target collection name list,
                   dictionary of document indices keyed by collection name
                   (each element is None/empty on failure)
        """
        sd = None
        dbName = None
        collectionNameList = []
        docIndexD = {}
        try:
            mU = MarshalUtil(workPath=self.__workPath)
            schemaLocator = self.__getSchemaDefLocator(databaseName,
                                                       dataTyping=dataTyping)
            if self.__rebuildFlag:
                # Rebuild the schema in place and read it back from the cache path
                filePath = os.path.join(
                    self.__schemaCachePath,
                    self.__fileU.getFileName(schemaLocator))
                self.makeSchemaDef(databaseName,
                                   dataTyping=dataTyping,
                                   saveSchema=True)
            else:
                # Recover the cached schema definition file (may return a falsy path)
                filePath = self.__reload(schemaLocator,
                                         self.__schemaCachePath,
                                         useCache=self.__useCache)

            if not filePath:
                # NOTE(review): execution continues with a falsy path; the subsequent
                # doImport failure is absorbed by the except clause below.
                logger.error("Unable to recover schema %s (%s)", databaseName,
                             dataTyping)
            logger.debug("ContentType %r dataTyping %r schemaLocator %r",
                         databaseName, dataTyping, schemaLocator)
            schemaDef = mU.doImport(filePath, fmt="json")
            if schemaDef:
                logger.debug(
                    "Using cached schema definition for %s application %s",
                    databaseName, dataTyping)
                sd = SchemaDefAccess(schemaDef)
                if sd:
                    dbName = sd.getDatabaseName()
                    collectionInfoList = sd.getCollectionInfo()
                    logger.debug("Schema %s database name %s collections %r",
                                 databaseName, dbName, collectionInfoList)
                    # Collect each collection name with its document index definitions
                    for cd in collectionInfoList:
                        collectionName = cd["NAME"]
                        collectionNameList.append(collectionName)
                        docIndexD[collectionName] = sd.getDocumentIndices(
                            collectionName)

        except Exception as e:
            logger.exception("Retreiving schema %s for %s failing with %s",
                             databaseName, dataTyping, str(e))

        return sd, dbName, collectionNameList, docIndexD
예제 #24
0
 def fetch(self):
     """Return the provenance dictionary, or an empty dictionary on failure."""
     try:
         cacheFilePath = self.__reload(self.__provenanceLocator, self.__provenanceCachePath, useCache=self.__useCache)
         return MarshalUtil(workPath=self.__workPath).doImport(cacheFilePath, fmt="json")
     except Exception as e:
         logger.exception("Failed retreiving provenance with %s", str(e))
     return {}
예제 #25
0
 def exportBranchedEntityDetails(self, filePath, fmt="json"):
     """Export branched entity details (BIRD mapping and WURCS descriptors)"""
     branchedD = self.getBranchedDetails()
     # ----
     mU = MarshalUtil()
     status = mU.doExport(filePath, branchedD, fmt=fmt, indent=3)
     logger.info("Exporting (%d) branched entities status %r", len(branchedD), status)
     return status
예제 #26
0
 def __getRegistry(self, registryPath):
     """Load the dictionary registry object from the input JSON registry file.

     Returns the "mmcif_dictionary_registry" object, or None on failure.
     """
     try:
         registryD = MarshalUtil().doImport(registryPath, fmt="json")
         return registryD["mmcif_dictionary_registry"]
     except Exception as e:
         logger.exception("Failing for %r with %s", registryPath, str(e))
     return None
 def setUp(self):
     """Assign cache, output, and test-data paths for the ChEMBL target fixture."""
     outputPath = os.path.join(HERE, "test-output")
     self.__cachePath = os.path.join(outputPath, "CACHE")
     self.__fastaPath = os.path.join(outputPath, "chembl-targets.fa")
     self.__taxonPath = os.path.join(outputPath, "chembl-targets-taxon.tdd")
     self.__dataPath = os.path.join(HERE, "test-data")
     self.__mU = MarshalUtil(workPath=self.__cachePath)
예제 #28
0
 def __init__(self, cfgOb, refDbName, **kwargs):
     """Cache reference sequence assignments for the input reference database.

     Args:
         cfgOb: configuration object
         refDbName (str): reference database name
     """
     self.__cfgOb = cfgOb
     self.__refDbName = refDbName
     self.__mU = MarshalUtil()
     #
     # Identifier list feeds the cache rebuild below
     self.__refIdList = self.__getReferenceAssignments(refDbName, **kwargs)
     self.__refD, self.__matchD = self.__rebuildCache(refDbName, self.__refIdList, **kwargs)
예제 #29
0
 def __reload(self, dirPath, baseVersion, useCache, **kwargs):
     """Fetch (or reload from cache) the ChEMBL UniProt mapping and target FASTA data.

     Args:
         dirPath (str): directory for downloaded and cached files
         baseVersion (int): first ChEMBL release number to probe for the target FASTA file
         useCache (bool): reuse the cached mapping file when present

     Keyword Args:
         ChEMBLDbUrl (str, optional): base URL for the ChEMBL database downloads

     Returns:
         (dict): ChEMBL id -> mapping tuple from the UniProt mapping file
     """
     startTime = time.time()
     mU = MarshalUtil(workPath=dirPath)
     chemblDbUrl = kwargs.get(
         "ChEMBLDbUrl",
         "ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/latest/")
     ok = False
     fU = FileUtil()
     fU.mkdir(dirPath)
     #
     # ChEMBL current version <baseVersion>,...
     # template:  chembl_<baseVersion>.fa.gz
     #
     targetFileName = "chembl_" + str(baseVersion) + ".fa.gz"
     mappingFileName = "chembl_uniprot_mapping.txt"
     #
     chemblTargetPath = os.path.join(dirPath, targetFileName)
     chemblMappingPath = os.path.join(dirPath, mappingFileName)
     mappingFilePath = os.path.join(dirPath, "chembl_uniprot_mapping.json")
     #
     mapD = {}
     if useCache and fU.exists(mappingFilePath):
         # Cached JSON mapping present — no downloads required
         logger.info("useCache %r using %r and %r and %r", useCache,
                     chemblTargetPath, chemblMappingPath, mappingFilePath)
         mapD = mU.doImport(mappingFilePath, fmt="json")
     else:
         # Get the ChEMBL UniProt mapping file
         url = os.path.join(chemblDbUrl, mappingFileName)
         ok = fU.get(url, chemblMappingPath)
         logger.info("Fetched %r url %s path %s", ok, url,
                     chemblMappingPath)
         logger.info("Reading ChEMBL mapping file path %s", mappingFilePath)
         # Convert the tab-delimited mapping rows to a keyed dictionary and cache as JSON
         rowL = mU.doImport(chemblMappingPath, fmt="tdd", rowFormat="list")
         for row in rowL:
             mapD[row[0]] = (row[1], row[2], row[3])
         ok = mU.doExport(mappingFilePath, mapD, fmt="json")
         logger.info("Processed mapping path %s (%d) %r", mappingFilePath,
                     len(mapD), ok)
         #
         # Get the target FASTA files --
         # Probe successive release numbers starting at baseVersion until a
         # download succeeds; record the release that worked as self.__version.
         for vers in range(baseVersion, baseVersion + 10):
             logger.info("Now fetching version %r", vers)
             self.__version = vers
             targetFileName = "chembl_" + str(vers) + ".fa.gz"
             chemblTargetPath = os.path.join(dirPath,
                                             "chembl_targets_raw.fa.gz")
             url = os.path.join(chemblDbUrl, targetFileName)
             ok = fU.get(url, chemblTargetPath)
             logger.info("Fetched %r url %s path %s", ok, url,
                         chemblTargetPath)
             if ok:
                 break
     #
     logger.info("Completed reload at %s (%.4f seconds)",
                 time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                 time.time() - startTime)
     #
     return mapD
예제 #30
0
 def __init__(self, **kwargs):
     """Initialize configuration, cache, and filter settings.

     Keyword Args:
         cfgOb (optional): configuration object
         cachePath (str, optional): cache directory path
         sandboxPath (str, optional): sandbox directory path
         filterType (str, optional): filter option string; containing "assign-dates"
             enables date assignment
     """
     self.__cfgOb = kwargs.get("cfgOb", None)
     self.__cachePath = kwargs.get("cachePath", None)
     self.__sandboxPath = kwargs.get("sandboxPath", None)
     self.__filterType = kwargs.get("filterType", "")
     # Date assignment is switched on by a substring of the filter option string
     self.__assignDates = "assign-dates" in self.__filterType
     #
     self.__mU = MarshalUtil(workPath=self.__cachePath)
     self.__currentCacheD = None