Example No. 1
 def split(self,
           inputFilePath,
           splitDirPath,
           prefixName="part_",
           maxSizeMB=50):
     """Split the input file into parts of at most maxSizeMB (10^6 bytes per MB),
     writing each part and a MANIFEST file into splitDirPath.

     Returns the number of parts written.
     """
     chunkSize = maxSizeMB * 1000000
     partNumber = 0
     fU = FileUtil()
     fU.mkdir(splitDirPath)
     manifestPath = os.path.join(splitDirPath, "MANIFEST")
     myHash = fU.hash(inputFilePath, hashType="md5")
     with open(manifestPath, "w") as mfh:
         mfh.write("%s\t%s\n" % (inputFilePath, myHash))
         with open(inputFilePath, "rb") as ifh:
             chunk = ifh.read(chunkSize)
             while chunk:
                 partNumber += 1
                 partName = prefixName + str(partNumber)
                 fp = os.path.join(splitDirPath, partName)
                 with open(fp, "wb") as ofh:
                     ofh.write(chunk)
                 mfh.write("%s\n" % partName)
                 #
                 chunk = ifh.read(chunkSize)
     return partNumber
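A brief usage sketch for the splitter above; the owning class name is an assumption, and the rejoin logic simply replays the MANIFEST layout written by split() (source path and MD5 on the first line, then one part name per line).

import os

sjU = SplitJoinUtil()  # assumed owner class of the split() method above
numParts = sjU.split("big-archive.tar", "./parts", prefixName="part_", maxSizeMB=50)

# Rejoin by replaying the MANIFEST: the first line is "<sourcePath>\t<md5>",
# the remaining lines are the part file names in order.
with open(os.path.join("./parts", "MANIFEST")) as mfh:
    lines = [line.strip() for line in mfh if line.strip()]
srcPath, srcMd5 = lines[0].split("\t")
with open("rejoined.tar", "wb") as ofh:
    for partName in lines[1:]:
        with open(os.path.join("./parts", partName), "rb") as pfh:
            ofh.write(pfh.read())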
Example No. 2
 def __reload(self, dirPath, baseVersion, useCache, **kwargs):
     startTime = time.time()
     mU = MarshalUtil(workPath=dirPath)
     chemblDbUrl = kwargs.get(
         "ChEMBLDbUrl",
         "ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/latest/")
     ok = False
     fU = FileUtil()
     fU.mkdir(dirPath)
     #
     # ChEMBL target FASTA files are published per release; the search starts at
     # <baseVersion> using the file name template:  chembl_<baseVersion>.fa.gz
     #
     targetFileName = "chembl_" + str(baseVersion) + ".fa.gz"
     mappingFileName = "chembl_uniprot_mapping.txt"
     #
     chemblTargetPath = os.path.join(dirPath, targetFileName)
     chemblMappingPath = os.path.join(dirPath, mappingFileName)
     mappingFilePath = os.path.join(dirPath, "chembl_uniprot_mapping.json")
     #
     mapD = {}
     if useCache and fU.exists(mappingFilePath):
         logger.info("useCache %r using %r and %r and %r", useCache,
                     chemblTargetPath, chemblMappingPath, mappingFilePath)
         mapD = mU.doImport(mappingFilePath, fmt="json")
     else:
         # Get the ChEMBL UniProt mapping file
         url = os.path.join(chemblDbUrl, mappingFileName)
         ok = fU.get(url, chemblMappingPath)
         logger.info("Fetched %r url %s path %s", ok, url,
                     chemblMappingPath)
         logger.info("Reading ChEMBL mapping file path %s", mappingFilePath)
         rowL = mU.doImport(chemblMappingPath, fmt="tdd", rowFormat="list")
         for row in rowL:
             mapD[row[0]] = (row[1], row[2], row[3])
         ok = mU.doExport(mappingFilePath, mapD, fmt="json")
         logger.info("Processed mapping path %s (%d) %r", mappingFilePath,
                     len(mapD), ok)
         #
         # Get the target FASTA files --
         for vers in range(baseVersion, baseVersion + 10):
             logger.info("Now fetching version %r", vers)
             self.__version = vers
             targetFileName = "chembl_" + str(vers) + ".fa.gz"
             chemblTargetPath = os.path.join(dirPath,
                                             "chembl_targets_raw.fa.gz")
             url = os.path.join(chemblDbUrl, targetFileName)
             ok = fU.get(url, chemblTargetPath)
             logger.info("Fetched %r url %s path %s", ok, url,
                         chemblTargetPath)
             if ok:
                 break
     #
     logger.info("Completed reload at %s (%.4f seconds)",
                 time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                 time.time() - startTime)
     #
     return mapD
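A small sketch of consuming the mapD dictionary returned above; the column layout (UniProt accession mapped to ChEMBL target id, name, and target type) is an assumption inferred from the tab-delimited import.

# Assumed layout: mapD[uniprotAcc] = (chemblTargetId, targetName, targetType)
for uniprotAcc, (chemblTargetId, targetName, targetType) in mapD.items():
    if targetType == "SINGLE PROTEIN":  # example filter; the value is an assumption
        print(uniprotAcc, "->", chemblTargetId, targetName)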
Example No. 3
 def __doAquireLock(self):
     fU = FileUtil()
     mode = os.O_WRONLY | os.O_CREAT | os.O_EXCL | os.O_TRUNC
     try:
         fU.mkdir(os.path.dirname(self.__lockFilePath))
         fd = os.open(self.__lockFilePath, mode)
     except (IOError, OSError):
         pass
     else:
         self.__lockFileFileDescriptor = fd
     return None
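The O_CREAT | O_EXCL flags above make lock acquisition atomic: the open fails if another process already holds the lock file. A hypothetical release counterpart (not part of the source) might look like:

 def __doReleaseLock(self):
     # Hypothetical counterpart to __doAquireLock(): close the held descriptor
     # and remove the lock file so another process can acquire it.
     try:
         if self.__lockFileFileDescriptor is not None:
             os.close(self.__lockFileFileDescriptor)
             self.__lockFileFileDescriptor = None
         os.remove(self.__lockFilePath)
     except (IOError, OSError):
         pass
     return None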
Example No. 4
    def makeBundle(self, localParentPath, subDirList):
        """Bundle the subdirectories of the input parent directory path.

        Args:
            localParentPath (str): local parent directory path containing the bundling targets
            subDirList (list, str): list of subdirectories of the parent path to be bundled

        Returns:
            (bool): True for success or False otherwise
        """
        fileU = FileUtil()
        fileU.mkdir(self.__localBundlePath)
        dirPathList = [os.path.join(localParentPath, subDir) for subDir in subDirList]
        okT = fileU.bundleTarfile(self.__localStashTarFilePath, dirPathList, mode="w:gz", recursive=True)
        return okT
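A short usage sketch for makeBundle(); the owning class name and the private bundle/stash paths it relies on are assumptions based on the code above.

# Hypothetical caller: bundle two subdirectories of a local cache directory.
stashU = StashUtil()  # assumed owner class supplying the private bundle paths
ok = stashU.makeBundle("CACHE", ["dictionaries", "NCBI"])
print("bundle status", ok)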
Example No. 5
    def __reload(self, dirPath, useCache):
        startTime = time.time()
        fD = {}

        ok = False
        featurePath = self.__getFeatureDataPath()
        #
        logger.info("useCache %r featurePath %r", useCache, featurePath)
        if useCache and self.__mU.exists(featurePath):
            fD = self.__mU.doImport(featurePath, fmt="json")
            ok = True
        else:
            fU = FileUtil()
            fU.mkdir(dirPath)
        # ---
        logger.info("Completed reload (%r) at %s (%.4f seconds)", ok, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime)
        return fD
Example No. 6
 def __reload(self, dirPath, **kwargs):
     startTime = time.time()
     fD = {}
     useCache = kwargs.get("useCache", True)
     ok = False
     cofactorPath = self.__getCofactorDataPath()
     #
     logger.info("useCache %r cofactorPath %r", useCache, cofactorPath)
     if useCache and self.__mU.exists(cofactorPath):
         fD = self.__mU.doImport(cofactorPath, fmt="json")
         ok = True
     else:
         fU = FileUtil()
         fU.mkdir(dirPath)
     # ---
     logger.info("Completed reload with status (%r) at %s (%.4f seconds)", ok, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime)
     return fD
Example No. 7
 def __reload(self, dirPath, useCache):
     startTime = time.time()
     retD = {}
     ok = False
     mappingPath = self.__getMappingDataPath()
     #
     logger.info("useCache %r mappingPath %r", useCache, mappingPath)
     if useCache and self.__mU.exists(mappingPath):
         retD = self.__mU.doImport(mappingPath, fmt="json")
         ok = True
     else:
         fU = FileUtil()
         fU.mkdir(dirPath)
     # ---
     num = len(retD["mapping"]) if "mapping" in retD else 0
     logger.info("Completed ligand mapping reload (%d) with status (%r) at %s (%.4f seconds)", num, ok, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime)
     return retD
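The three reload variants above share one pattern: when useCache is set and the serialized JSON artifact exists, import it; otherwise create the cache directory and rebuild. A condensed sketch of that pattern (names are illustrative):

 def __reloadCached(self, dirPath, dataPath, useCache=True):
     # Generic cache-or-rebuild skeleton distilled from the variants above.
     dataD = {}
     if useCache and self.__mU.exists(dataPath):
         dataD = self.__mU.doImport(dataPath, fmt="json")
     else:
         fU = FileUtil()
         fU.mkdir(dirPath)
         # ... rebuild dataD from primary sources and export it with doExport()
     return dataD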
Example No. 8
 def __reload(self, dirPath, useCache):
     startTime = time.time()
     aD = {}
     fU = FileUtil()
     fU.mkdir(dirPath)
     targetMechanismFilePath = self.getTargetMechanismDataPath()
     #
     if useCache and fU.exists(targetMechanismFilePath):
         logger.info("useCache %r using %r", useCache,
                     targetMechanismFilePath)
         qD = self.__mU.doImport(targetMechanismFilePath, fmt="json")
         aD = qD["mechanism"] if "mechanism" in qD else {}
     #
     logger.info("Completed reload of (%d) at %s (%.4f seconds)", len(aD),
                 time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                 time.time() - startTime)
     #
     return aD
Example No. 9
 def __reload(self, dirPath, **kwargs):
     oD = None
     version = None
     startTime = time.time()
     useCache = kwargs.get("useCache", True)
     #
     # CARDDumpUrl = kwargs.get("CARDDumpUrl", "https://card.mcmaster.ca/latest/data/broadstreet-v3.1.0.tar.bz2")
     cardDumpUrl = kwargs.get("CARDDumpUrl",
                              "https://card.mcmaster.ca/latest/data")
     ok = False
     fU = FileUtil()
     cardDumpFileName = "card-data.tar.bz2"
     cardDumpPath = os.path.join(dirPath, cardDumpFileName)
     cardDumpDirPath = os.path.join(dirPath, "dump")
     #
     fU.mkdir(dirPath)
     cardDataPath = os.path.join(dirPath, "card-select-data.json")
     #
     logger.info("useCache %r CARDDumpPath %r", useCache, cardDumpPath)
     if useCache and self.__mU.exists(cardDataPath):
         qD = self.__mU.doImport(cardDataPath, fmt="json")
         version = qD["version"]
         oD = qD["data"]
     else:
         logger.info("Fetching url %s path %s", cardDumpUrl, cardDumpPath)
         ok = fU.get(cardDumpUrl, cardDumpPath)
         fU.mkdir(cardDumpDirPath)
         fU.uncompress(cardDumpPath, outputDir=cardDumpDirPath)
         fU.unbundleTarfile(os.path.join(cardDumpDirPath,
                                         cardDumpFileName[:-4]),
                            dirPath=cardDumpDirPath)
         logger.info("Completed fetch (%r) at %s (%.4f seconds)", ok,
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                     time.time() - startTime)
         oD, version = self.__parseCardData(
             os.path.join(cardDumpDirPath, "card.json"))
         tS = datetime.datetime.now().isoformat()
         qD = {"version": version, "created": tS, "data": oD}
         oD = qD["data"]
         ok = self.__mU.doExport(cardDataPath, qD, fmt="json", indent=3)
         logger.info("Export CARD data (%d) status %r", len(oD), ok)
     # ---
     return oD, version
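The fetch branch above expands the bzip2 tarball in two steps: uncompress() strips the .bz2 layer (cardDumpFileName[:-4] is the remaining .tar name) and unbundleTarfile() then unpacks the tar archive. The same two-step expansion as a standalone sketch (paths are illustrative):

fU = FileUtil()
dumpPath = "CACHE/CARD/card-data.tar.bz2"
dumpDirPath = "CACHE/CARD/dump"
fU.mkdir(dumpDirPath)
fU.uncompress(dumpPath, outputDir=dumpDirPath)  # yields card-data.tar
fU.unbundleTarfile(os.path.join(dumpDirPath, "card-data.tar"), dirPath=dumpDirPath)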
Example No. 10
 def __reload(self, dirPath, useCache):
     startTime = time.time()
     aD = {}
     allIdD = {}
     fU = FileUtil()
     fU.mkdir(dirPath)
     targetActivityFilePath = self.getTargetActivityDataPath()
     #
     if useCache and fU.exists(targetActivityFilePath):
         logger.info("useCache %r using %r", useCache, targetActivityFilePath)
         qD = self.__mU.doImport(targetActivityFilePath, fmt="json")
         aD = qD["activity"] if "activity" in qD else {}
         idL = qD["all_ids"] if "all_ids" in qD else []
         allIdD = {k: k in aD for k in idL}
     #
     logger.info(
         "Completed reload (%d activities) (%d tried identifiers) at %s (%.4f seconds)",
         len(aD),
         len(allIdD),
         time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
         time.time() - startTime,
     )
     #
     return aD, allIdD
Example No. 11
    def __reload(self, urlTarget, dirPath, useCache=True):
        """ Reload input GO OBO ontology file and return a nx graph object.
'
        Returns:
            dictionary[goId] = {'name_list': ... , 'id_list': ... 'depth_list': ... }
        """
        goGraph = None
        #
        # mU = MarshalUtil()
        fU = FileUtil()
        fn = fU.getFileName(urlTarget)
        oboFilePath = os.path.join(dirPath, fn)
        fU.mkdir(dirPath)
        #
        if not useCache:
            for fp in [oboFilePath]:
                try:
                    os.remove(fp)
                except Exception:
                    pass
        #
        if useCache and fU.exists(oboFilePath):
            goGraph = obonet.read_obo(oboFilePath)
        else:
            logger.info("Fetching url %s to resource file %s", urlTarget,
                        oboFilePath)
            ok = fU.get(urlTarget, oboFilePath)
            if ok:
                goGraph = obonet.read_obo(oboFilePath)
        if goGraph:
            logger.info("Reading %d nodes and %d edges", len(goGraph),
                        goGraph.number_of_edges())
        else:
            logger.info("Go graph construction failing")
        #
        return goGraph
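obonet.read_obo() returns a networkx graph keyed by GO identifiers, which is what the logging above relies on. A small query sketch, given the goGraph returned by the method (the GO id and the 'name' node attribute are assumptions):

if goGraph is not None and "GO:0008150" in goGraph:
    print("GO:0008150", goGraph.nodes["GO:0008150"].get("name"))
    print("terms", len(goGraph), "edges", goGraph.number_of_edges())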
Example No. 12
    def exportFasta(self, withGaps=False):
        """
        Example:
            The IMGT/GENE-DB FASTA header contains 15 fields separated by '|':

            1. IMGT/LIGM-DB accession number(s)
            2. IMGT gene and allele name
            3. species (may be followed by an "_" and the name of the strain, breed or isolate, if defined)
            4. IMGT gene and allele functionality
            5. exon(s), region name(s), or extracted label(s)
            6. start and end positions in the IMGT/LIGM-DB accession number(s)
            7. number of nucleotides in the IMGT/LIGM-DB accession number(s)
            8. codon start, or 'NR' (not relevant) for non coding labels
            9. +n: number of nucleotides (nt) added in 5' compared to the corresponding label extracted from IMGT/LIGM-DB
            10. +n or -n: number of nucleotides (nt) added or removed in 3' compared to the corresponding label extracted from IMGT/LIGM-DB
            11. +n, -n, and/or nS: number of added, deleted, and/or substituted nucleotides to correct sequencing errors, or 'not corrected' if non corrected sequencing errors
            12. number of amino acids (AA): this field indicates that the sequence is in amino acids
            13. number of characters in the sequence: nt (or AA)+IMGT gaps=total
            14. partial (if it is)
            15. reverse complementary (if it is)

        """
        # --
        fU = FileUtil()
        fU.mkdir(self.__dirPath)
        if withGaps:
            imgtTargetUrl = "http://www.imgt.org/download/GENE-DB/IMGTGENEDB-ReferenceSequences.fasta-AA-WithGaps-F+ORF+inframeP"
        else:
            imgtTargetUrl = "http://www.imgt.org/download/GENE-DB/IMGTGENEDB-ReferenceSequences.fasta-AA-WithoutGaps-F+ORF+inframeP"
        imgtTargetFileName = fU.getFileName(imgtTargetUrl)
        rawFastaPath = os.path.join(self.__dirPath, imgtTargetFileName)
        # --
        logger.debug("Fetching url %s path %s", imgtTargetUrl, rawFastaPath)
        ok = fU.get(imgtTargetUrl, rawFastaPath)
        logger.info("Fetch status (%r) url %s path %s", ok, imgtTargetUrl,
                    rawFastaPath)
        # --
        fastaPath = os.path.join(self.__dirPath, "imgt-reference.fa")
        taxonPath = os.path.join(self.__dirPath, "imgt-reference-taxon.tdd")
        tP = TaxonomyProvider(cachePath=self.__cachePath, useCache=True)
        ok = tP.testCache()
        if not ok:
            tP = TaxonomyProvider(cachePath=self.__cachePath, useCache=False)

        rawQD = self.__mU.doImport(rawFastaPath,
                                   fmt="fasta",
                                   commentStyle="default")
        oD = {}
        taxonL = []
        for queryId, sD in rawQD.items():
            qL = queryId.split("|")
            tL = qL[2].split("_")
            taxName = tL[0]
            taxVar = tL[1].replace(" ", "_") if len(tL) > 1 else None
            taxId = tP.getTaxId(taxName)
            if taxId:
                tD = {
                    "seqId": qL[0],
                    "imgtGene": qL[1],
                    "functionality": qL[3],
                    "labels": qL[4],
                    "taxId": taxId
                }
                if taxVar:
                    tD["taxVar"] = taxVar
                sD.update(tD)
            else:
                logger.info("Unknown taxonomy %r (taxName=%r)", queryId,
                            taxName)
            sD["sequence"].replace(".", "-")
            seqId = ""
            cL = []
            for k, v in sD.items():
                if k in ["sequence"]:
                    continue
                cL.append(str(v))
                cL.append(str(k))
            seqId = "|".join(cL)
            oD[seqId] = sD
            taxonL.append("%s\t%s" % (seqId, taxId))
        #
        ok1 = self.__mU.doExport(taxonPath, taxonL, fmt="list")
        ok2 = self.__mU.doExport(fastaPath, oD, fmt="fasta", makeComment=True)
        return ok1 and ok2
Example No. 13
class DataTypeApiProvider(SingletonClass):
    """ Data type application and instance information provider.
    """
    def __init__(self, cfgOb, cachePath, useCache=True, **kwargs):
        """Data type application and instance information provider.

        Args:
            cfgOb (object):  ConfigInfo() object instance
            cachePath (str): path to hold the cache directory
            useCache (bool, optional): flag to use cached files. Defaults to True.

        """
        self.__cfgOb = cfgOb
        self.__configName = self.__cfgOb.getDefaultSectionName()
        self.__useCache = useCache
        self.__cachePath = cachePath
        # self.__contentInfoConfigName = "content_info_helper_configuration"
        self.__fileU = FileUtil()
        self.__contentDefHelper = self.__cfgOb.getHelper(
            "CONTENT_DEF_HELPER_MODULE",
            sectionName=self.__configName,
            cfgOb=self.__cfgOb)
        self.__dirPath = os.path.join(
            cachePath,
            self.__cfgOb.get("DATA_TYPE_INFO_CACHE_DIR",
                             sectionName=self.__configName))
        self.__kwargs = kwargs
        #
        logger.debug("Leaving constructor")

    def getDataTypeInstanceApi(self, databaseName, **kwargs):
        """Return instance of DataTypeInstanceInfo().

        Args:
            databaseName (str): database name

        Returns:
            (object): Instance of DataTypeInstanceInfo()
        """
        _ = kwargs
        dataTypeInstanceLocatorPath = self.__cfgOb.getPath(
            "INSTANCE_DATA_TYPE_INFO_LOCATOR_PATH",
            sectionName=self.__configName)
        dataTypeInstanceFile = self.__contentDefHelper.getDataTypeInstanceFile(
            databaseName) if self.__contentDefHelper else None
        if dataTypeInstanceLocatorPath and dataTypeInstanceFile:
            loc = os.path.join(dataTypeInstanceLocatorPath,
                               dataTypeInstanceFile)
            filePath = self.__reload(loc,
                                     self.__dirPath,
                                     useCache=self.__useCache)
            dtApi = DataTypeInstanceInfo(filePath)
        else:
            # DataTypeInstanceInfo() provides an internal by-pass mode where no coverage data is available.
            dtApi = DataTypeInstanceInfo(None)
            logger.debug("No data coverage available for database %s",
                         databaseName)
        return dtApi

    def getDataTypeApplicationApi(self, appName, **kwargs):
        """Return instance of DataTypeApplicationInfo.

        Args:
            appName (str): application name (e.g., SQL, ANY)

        Returns:
            (object): Instance of DataTypeApplicationInfo()
        """
        _ = kwargs
        dataTypeApplicationLocator = self.__cfgOb.getPath(
            "APP_DATA_TYPE_INFO_LOCATOR", sectionName=self.__configName)
        filePath = self.__reload(dataTypeApplicationLocator,
                                 self.__dirPath,
                                 useCache=self.__useCache)
        dtApi = DataTypeApplicationInfo(
            filePath, dataTyping=appName,
            workPath=self.__dirPath) if filePath else None
        return dtApi

    def __reload(self, urlTarget, dirPath, useCache=True):
        #
        fn = self.__fileU.getFileName(urlTarget)
        filePath = os.path.join(dirPath, fn)
        logger.debug("Using cache path %s", dirPath)
        self.__fileU.mkdir(dirPath)
        if not useCache:
            try:
                os.remove(filePath)
            except Exception:
                pass
        #
        if useCache and self.__fileU.exists(filePath):
            ok = True
        else:
            logger.debug("Fetch data from source %s", urlTarget)
            ok = self.__fileU.get(urlTarget, os.path.join(dirPath, fn))

        return filePath if ok else None
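A hedged wiring sketch for DataTypeApiProvider; the configuration object, file name, section name, and database name are placeholders and depend on the deployment.

# Hypothetical wiring; configuration path, section name, and database name are placeholders.
cfgOb = ConfigUtil(configPath="exdb-config.yml", defaultSectionName="site_info_configuration")
dtP = DataTypeApiProvider(cfgOb, cachePath="CACHE", useCache=True)
sqlTypeApi = dtP.getDataTypeApplicationApi("SQL")
instTypeApi = dtP.getDataTypeInstanceApi("pdbx_core")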
Example No. 14
class DictionaryApiProvider(SingletonClass):
    """ Resource provider for dictionary APIs.
    """
    def __init__(self, dirPath, useCache=True):
        """Resource provider for dictionary APIs.

        Args:
            dirPath (str): path to the directory containing cache files
            useCache (bool, optional): flag to use cached files. Defaults to True.

        """
        self.__apiMap = {}
        self.__dirPath = dirPath
        self.__useCache = useCache
        #
        self.__fileU = FileUtil(workPath=self.__dirPath)
        logger.debug("Leaving constructor")

    def __reload(self, dictLocators, dirPath, useCache=True):
        """Reload local cache of dictionary resources and return a dictionary API instance.

        Args:
            dictLocators (list, str): list of locators for dictionary resource files
            dirPath (str): path to the directory containing cache files
            useCache (bool, optional): flag to use cached files. Defaults to True.

        Returns:
            (object): instance of dictionary API
        """
        #
        # verify the existence of the cache directory ...
        self.__fileU.mkdir(dirPath)
        if not useCache:
            for dictLocator in dictLocators:
                try:
                    fn = self.__fileU.getFileName(dictLocator)
                    os.remove(os.path.join(dirPath, fn))
                except Exception:
                    pass
        #
        ret = True
        for dictLocator in dictLocators:
            cacheFilePath = os.path.join(dirPath,
                                         self.__fileU.getFileName(dictLocator))
            if useCache and self.__fileU.exists(cacheFilePath):
                # nothing to do
                continue
            logger.debug("Fetching url %s caching in %s", dictLocator,
                         cacheFilePath)
            ok = self.__fileU.get(dictLocator, cacheFilePath)
            ret = ret and ok
        return ret

    def getApi(self, dictLocators, **kwargs):
        """Return a dictionary API object of the input dictioaries.

        Arguments:
            dictLocators {list str} -- list of dictionary locator paths

        Returns:
            [object] -- returns DictionaryApi() object for input dictionaries
        """
        dictFileNames = [
            self.__fileU.getFileName(dictLocator)
            for dictLocator in dictLocators
        ]
        dictTup = tuple(dictFileNames)
        dApi = self.__apiMap[
            dictTup] if dictTup in self.__apiMap else self.__getApi(
                dictLocators, **kwargs)
        self.__apiMap[dictTup] = dApi
        return dApi

    def __getApi(self, dictLocators, **kwargs):
        """ Return an instance of a dictionary API instance for the input dictionary locator list.
        """
        consolidate = kwargs.get("consolidate", True)
        replaceDefinition = kwargs.get("replaceDefinitions", True)
        verbose = kwargs.get("verbose", True)
        #
        ok = self.__reload(dictLocators,
                           self.__dirPath,
                           useCache=self.__useCache)
        #
        dApi = None
        if ok:
            mU = MarshalUtil()
            containerList = []
            for dictLocator in dictLocators:
                cacheFilePath = os.path.join(
                    self.__dirPath, self.__fileU.getFileName(dictLocator))
                containerList.extend(
                    mU.doImport(cacheFilePath, fmt="mmcif-dict"))
            #
            dApi = DictionaryApi(containerList=containerList,
                                 consolidate=consolidate,
                                 replaceDefinition=replaceDefinition,
                                 verbose=verbose)
        return dApi
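A short usage sketch for DictionaryApiProvider; the dictionary locator URL is illustrative.

dictLocators = ["https://raw.githubusercontent.com/wwpdb-dictionaries/mmcif_pdbx/master/dist/mmcif_pdbx_v50.dic"]
dP = DictionaryApiProvider(dirPath="CACHE/dictionaries", useCache=True)
dApi = dP.getApi(dictLocators)
print("dictionary API ready:", dApi is not None)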
Example No. 15
    def __reloadFasta(self, dirPath, **kwargs):
        """Reload DrugBank target FASTA data files.

        Args:
            dirPath (str): path to the DrugBank cache directory
            useCache (bool, optional): flag to use cached files. Defaults to True.

        Returns:
            (list, str): list of paths to the extracted FASTA target files
        """
        startTime = time.time()
        logger.info("Starting db reload at %s",
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
        retFilePathList = []
        urlTargetL = [
            "https://go.drugbank.com/releases/latest/downloads/target-all-polypeptide-sequences",
            "https://go.drugbank.com/releases/latest/downloads/enzyme-all-polypeptide-sequences",
            "https://go.drugbank.com/releases/latest/downloads/carrier-all-polypeptide-sequences",
            "https://go.drugbank.com/releases/latest/downloads/transporter-all-polypeptide-sequences",
        ]

        useCache = kwargs.get("useCache", True)
        username = kwargs.get("username", None)
        password = kwargs.get("password", None)
        #
        if not username or not password:
            return retFilePathList
        #
        fU = FileUtil()
        fU.mkdir(dirPath)
        #
        if not useCache:
            #  Clear any cached files
            for urlTarget in urlTargetL:
                baseFileName = fU.getFileName(urlTarget)
                zipFileName = baseFileName + ".fasta.zip"
                retFileName = baseFileName + ".fa"
                for fn in [baseFileName, zipFileName, retFileName]:
                    try:
                        fp = os.path.join(dirPath, fn)
                        os.remove(fp)
                    except Exception:
                        pass
        #
        ok = False
        if useCache:
            ok = True
            for urlTarget in urlTargetL:
                baseFileName = fU.getFileName(urlTarget)
                retFileName = baseFileName + ".fa"
                retFilePath = os.path.join(dirPath, retFileName)
                ok = fU.exists(retFilePath)
                if not ok:
                    break
                retFilePathList.append(retFilePath)
        #
        logger.info("Using cached files %r", ok)
        if not useCache or not ok:
            if not username or not password:
                logger.warning(
                    "Missing credentials for DrugBank file download...")

            for urlTarget in urlTargetL:
                baseFileName = fU.getFileName(urlTarget)
                zipFileName = baseFileName + ".fasta.zip"
                retFileName = baseFileName + ".fa"
                zipFilePath = os.path.join(dirPath, zipFileName)
                retFilePath = os.path.join(dirPath, retFileName)
                basePath = os.path.join(dirPath, baseFileName)
                logger.info("Fetching url %s for FASTA target file %s",
                            urlTarget, baseFileName)
                ok = fU.get(urlTarget,
                            zipFilePath,
                            username=username,
                            password=password)
                endTime = time.time()
                logger.info(
                    "Completed db fetch at %s (%.4f seconds)",
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    endTime - startTime)
                #
                ok = fU.unbundleZipfile(zipFilePath, dirPath=basePath)
                fU.put(os.path.join(basePath, "protein.fasta"), retFilePath)
                endTime = time.time()
                logger.info(
                    "Completed unzip at %s (%.4f seconds)",
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    endTime - startTime)
                retFilePathList.append(retFilePath)
        return retFilePathList
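The method above returns an empty list unless DrugBank credentials are supplied. A hedged call sketch, shown as if invoked from the owning provider, with credentials read from environment variables (names are placeholders):

filePathList = self.__reloadFasta(
    "CACHE/DrugBank-targets",
    useCache=True,
    username=os.environ.get("DRUGBANK_USERNAME"),
    password=os.environ.get("DRUGBANK_PASSWORD"),
)
logger.info("DrugBank FASTA files %r", filePathList)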
Example No. 16
    def __reloadAssignments(self, dirPath, **kwargs):
        """Fetch and read

        Args:
            dirPath ([type]): [description]

        Returns:
            [type]: [description]

        """
        startTime = time.time()
        aD = {}
        ok = False
        try:
            targetUrl = kwargs.get(
                "assignmentUrl",
                "http://opig.stats.ox.ac.uk/webapps/newsabdab/sabdab/summary/all"
            )
            fU = FileUtil()
            dumpFileName = "sabdab_summary_all.tsv"
            #
            fU.mkdir(dirPath)
            dumpPath = os.path.join(dirPath, dumpFileName)
            logger.info("Fetching url %s path %s", targetUrl, dumpPath)
            ok = fU.get(targetUrl, dumpPath)
            rDL = self.__mU.doImport(dumpPath, fmt="tdd", rowFormat="dict")
            logger.info("SAbDab raw records (%d)", len(rDL))
            logger.debug("rD keys %r", list(rDL[0].keys()))
            kyHL = [
                "pdb", "Hchain", "model", "antigen_chain", "antigen_type",
                "antigen_het_name", "antigen_name", "heavy_subclass"
            ]
            kyLL = [
                "pdb", "Lchain", "model", "antigen_chain", "antigen_type",
                "antigen_het_name", "antigen_name", "light_subclass",
                "light_ctype"
            ]
            #
            for rD in rDL:
                pdbId = rD["pdb"] if rD["pdb"] and rD["pdb"] != "NA" else None
                authAsymIdH = rD["Hchain"] if rD[
                    "Hchain"] and rD["Hchain"] != "NA" else None
                authAsymIdL = rD["Lchain"] if rD[
                    "Lchain"] and rD["Lchain"] != "NA" else None
                if pdbId and authAsymIdH:
                    aD[pdbId + "." + authAsymIdH] = {
                        k: v
                        for k, v in rD.items() if v and v != "NA" and k in kyHL
                    }
                if pdbId and authAsymIdL:
                    aD[pdbId + "." + authAsymIdL] = {
                        k: v
                        for k, v in rD.items() if v and v != "NA" and k in kyLL
                    }

            logger.info("Fetched (%d) SAbDab assignment records.", len(aD))
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        logger.info("Completed reload (%r) at %s (%.4f seconds)", ok,
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    time.time() - startTime)
        return aD
Example No. 17
class FileUtilTests(unittest.TestCase):
    def setUp(self):
        self.__verbose = True
        self.__pathPdbxDictionaryFile = os.path.join(TOPDIR, "rcsb",
                                                     "mock-data",
                                                     "dictionaries",
                                                     "mmcif_pdbx_v5_next.dic")

        self.__pathTaxonomyFile = os.path.join(TOPDIR, "rcsb", "mock-data",
                                               "NCBI", "names.dmp.gz")
        self.__zipFileUrl = "https://inventory.data.gov/dataset/794cd3d7-4d28-4408-8f7d-84b820dbf7f2/resource/6b78ec0c-4980-4ad8-9cbd-2d6eb9eda8e7/download/myfoodapediadata.zip"
        self.__xzFile = os.path.join(TOPDIR, "rcsb", "mock-data",
                                     "MOCK_MODBASE_MODELS",
                                     "NP_001030614.1_1.pdb.xz")
        #
        self.__ftpFileUrl = "ftp://ftp.wwpdb.org/pub/pdb/data/component-models/complete/chem_comp_model.cif.gz"
        self.__httpsFileUrl = "https://ftp.wwpdb.org/pub/pdb/data/component-models/complete/chem_comp_model.cif.gz"
        #
        self.__workPath = os.path.join(HERE, "test-output")
        self.__inpDirPath = os.path.join(HERE, "test-data")
        self.__fileU = FileUtil()
        self.__startTime = time.time()
        logger.debug("Running tests on version %s", __version__)
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                     endTime - self.__startTime)

    def testTarBundling(self):
        """Test case for tarfile bundling and unbundling"""
        try:
            tP = os.path.join(self.__workPath, "t0.tar.gz")
            dirPath = os.path.join(self.__inpDirPath, "topdir")

            ok = self.__fileU.bundleTarfile(tP, [dirPath],
                                            mode="w:gz",
                                            recursive=True)
            self.assertTrue(ok)

            numBytes = self.__fileU.size(tP)
            self.assertGreaterEqual(numBytes, 250)
            #
            md5 = self.__fileU.hash(tP, hashType="md5")
            self.assertTrue(md5 is not None)
            #
            ok = self.__fileU.unbundleTarfile(tP, dirPath=self.__workPath)
            self.assertTrue(ok)
            #
            tP = os.path.join(self.__workPath, "t1.tar.gz")
            dirPathList = [
                os.path.join(self.__inpDirPath, "topdir", "subdirA"),
                os.path.join(self.__inpDirPath, "topdir", "subdirB")
            ]

            ok = self.__fileU.bundleTarfile(tP,
                                            dirPathList,
                                            mode="w:gz",
                                            recursive=True)
            self.assertTrue(ok)
            #
            ok = self.__fileU.unbundleTarfile(tP, dirPath=self.__workPath)
            self.assertTrue(ok)

            tP = os.path.join(self.__workPath, "t2.tar")
            dirPathList = [
                os.path.join(self.__inpDirPath, "topdir", "subdirA"),
                os.path.join(self.__inpDirPath, "topdir", "subdirB")
            ]

            ok = self.__fileU.bundleTarfile(tP,
                                            dirPathList,
                                            mode="w",
                                            recursive=True)
            self.assertTrue(ok)
            #
            ok = self.__fileU.unbundleTarfile(tP, dirPath=self.__workPath)
            self.assertTrue(ok)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testGetFile(self):
        """Test case for a local files and directories"""
        try:
            remoteLocator = self.__pathPdbxDictionaryFile
            fn = self.__fileU.getFileName(remoteLocator)
            # _, fn = os.path.split(remoteLocator)
            lPath = os.path.join(self.__workPath, fn)
            ok = self.__fileU.get(remoteLocator, lPath)
            self.assertTrue(ok)
            ok = self.__fileU.exists(lPath)
            self.assertTrue(ok)
            ok = self.__fileU.isLocal(lPath)
            self.assertTrue(ok)
            tPath = self.__fileU.getFilePath(lPath)
            self.assertEqual(lPath, tPath)
            ok = self.__fileU.remove(lPath)
            self.assertTrue(ok)
            dPath = os.path.join(self.__workPath, "tdir")
            ok = self.__fileU.mkdir(dPath)
            self.assertTrue(ok)
            ok = self.__fileU.remove(dPath)
            self.assertTrue(ok)
            ok = self.__fileU.remove(";lakdjf")
            self.assertTrue(ok)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testMoveAndCopyFile(self):
        """Test case for copying ("put") and moving ("replace") local files"""
        try:
            remoteLocator = self.__pathPdbxDictionaryFile
            fn = self.__fileU.getFileName(remoteLocator)
            # _, fn = os.path.split(remoteLocator)
            lPath = os.path.join(self.__workPath, fn)
            ok = self.__fileU.get(remoteLocator, lPath)
            self.assertTrue(ok)
            # Test copy file
            dPath2 = os.path.join(self.__workPath, "tdir")
            ok = self.__fileU.mkdir(dPath2)
            self.assertTrue(ok)
            lPath2 = os.path.join(dPath2, fn)
            ok = self.__fileU.put(lPath, lPath2)
            self.assertTrue(ok)
            ok = self.__fileU.exists(lPath)
            self.assertTrue(ok)
            ok = self.__fileU.exists(lPath2)
            self.assertTrue(ok)
            # Remove copied file (to test moving file next)
            ok = self.__fileU.remove(lPath2)
            self.assertTrue(ok)
            ok = self.__fileU.exists(lPath2)
            self.assertFalse(ok)
            # Test move file
            ok = self.__fileU.replace(lPath, lPath2)
            self.assertTrue(ok)
            ok = self.__fileU.exists(lPath)
            self.assertFalse(ok)
            ok = self.__fileU.exists(lPath2)
            self.assertTrue(ok)
            # Now clean up files and dirs
            ok = self.__fileU.remove(lPath)
            self.assertTrue(ok)
            ok = self.__fileU.remove(dPath2)
            self.assertTrue(ok)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testZipUrl(self):
        """Test case for downloading remote zip file and extracting contents."""
        try:
            remoteLocator = self.__zipFileUrl
            # fn = self.__fileU.getFileName(remoteLocator)
            ok = self.__fileU.isLocal(remoteLocator)
            self.assertFalse(ok)
            #
            lPath = os.path.join(self.__workPath,
                                 self.__fileU.getFileName(self.__zipFileUrl))
            ok = self.__fileU.get(remoteLocator, lPath)
            self.assertTrue(ok)
            ok = self.__fileU.exists(lPath)
            self.assertTrue(ok)
            ok = self.__fileU.isLocal(lPath)
            self.assertTrue(ok)
            tPath = self.__fileU.getFilePath(lPath)
            self.assertEqual(lPath, tPath)
            fp = self.__fileU.uncompress(lPath, outputDir=self.__workPath)
            ok = fp.endswith("Food_Display_Table.xlsx")
            self.assertTrue(ok)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testFtpUrl(self):
        """Test case for downloading remote file ftp protocol and extracting contents."""
        try:
            remoteLocator = self.__ftpFileUrl
            # fn = self.__fileU.getFileName(remoteLocator)
            ok = self.__fileU.isLocal(remoteLocator)
            self.assertFalse(ok)
            #
            dirPath = os.path.join(self.__workPath, "chem_comp_models")
            lPath = os.path.join(dirPath,
                                 self.__fileU.getFileName(self.__ftpFileUrl))
            ok = self.__fileU.get(remoteLocator, lPath)
            self.assertTrue(ok)
            ok = self.__fileU.exists(lPath)
            self.assertTrue(ok)
            ok = self.__fileU.isLocal(lPath)
            self.assertTrue(ok)
            tPath = self.__fileU.getFilePath(lPath)
            self.assertEqual(lPath, tPath)
            fp = self.__fileU.uncompress(lPath, outputDir=dirPath)
            ok = fp.endswith("chem_comp_model.cif")
            self.assertTrue(ok)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testRemote(self):
        """Test case remote status"""
        try:
            remoteLocator = self.__httpsFileUrl
            ok = self.__fileU.isLocal(remoteLocator)
            self.assertFalse(ok)
            #
            ok = self.__fileU.exists(remoteLocator)
            self.assertTrue(ok)
            size = self.__fileU.size(remoteLocator)
            self.assertGreaterEqual(size, 1000)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    @unittest.skip("DrugBank example -- skipping")
    def testGetDrugBankUrl(self):
        """Test case for downloading drugbank master xml file"""
        try:
            remoteLocator = "https://www.drugbank.ca/releases/latest/downloads/all-full-database"
            un = "username"
            pw = "password"
            # fn = self.__fileU.getFileName(remoteLocator)
            ok = self.__fileU.isLocal(remoteLocator)
            self.assertFalse(ok)
            #
            lPath = os.path.join(self.__workPath, "db-download.zip")
            ok = self.__fileU.get(remoteLocator,
                                  lPath,
                                  username=un,
                                  password=pw)
            self.assertTrue(ok)
            ok = self.__fileU.exists(lPath)
            self.assertTrue(ok)
            ok = self.__fileU.isLocal(lPath)
            self.assertTrue(ok)
            tPath = self.__fileU.getFilePath(lPath)
            self.assertEqual(lPath, tPath)
            self.__fileU.uncompress(lPath, outputDir=self.__workPath)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testXzFile(self):
        """Test case for extracting contents from xz file"""
        try:
            remoteLocator = self.__xzFile
            fn = self.__fileU.getFileName(remoteLocator)
            lPath = os.path.join(self.__workPath, fn)
            ok = self.__fileU.get(remoteLocator, lPath)
            self.assertTrue(ok)
            ok = self.__fileU.exists(lPath)
            self.assertTrue(ok)
            ok = self.__fileU.isLocal(lPath)
            self.assertTrue(ok)
            tPath = self.__fileU.getFilePath(lPath)
            self.assertEqual(lPath, tPath)
            fp = self.__fileU.uncompress(lPath, outputDir=self.__workPath)
            ok = fp.endswith(".pdb")
            self.assertTrue(ok)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
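The test class above references module-level names (HERE, TOPDIR, logger, __version__) that fall outside the excerpt; a plausible reconstruction of that boilerplate, offered only as an assumption, is:

# Assumed module-level setup for FileUtilTests (reconstruction, not the original source).
import logging
import os
import time
import unittest

from rcsb.utils.io import __version__
from rcsb.utils.io.FileUtil import FileUtil

HERE = os.path.abspath(os.path.dirname(__file__))
TOPDIR = os.path.dirname(os.path.dirname(os.path.dirname(HERE)))  # layout is an assumption
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

if __name__ == "__main__":
    unittest.main()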
Example No. 18
    def __reload(self, dirPath, **kwargs):
        startTime = time.time()
        oD = {}
        useCache = kwargs.get("useCache", True)
        targetUrl = kwargs.get(
            "targetUrl",
            "http://opig.stats.ox.ac.uk/webapps/newsabdab/static/downloads/TheraSAbDab_SeqStruc_OnlineDownload.csv"
        )
        #
        ok = False
        fU = FileUtil()
        _, dumpFileName = os.path.split(targetUrl)
        #
        fU.mkdir(dirPath)
        dumpPath = os.path.join(dirPath, dumpFileName)
        dataPath = os.path.join(dirPath, "sabdab-data.json")
        #
        logger.info("useCache %r sabdabDumpPath %r", useCache, dumpPath)
        if useCache and self.__mU.exists(dataPath):
            oD = self.__mU.doImport(dataPath, fmt="json")
        else:
            logger.info("Fetching url %s path %s", targetUrl, dumpPath)
            ok = fU.get(targetUrl, dumpPath)
            #
            rDL = self.__mU.doImport(dumpPath, fmt="csv", rowFormat="dict")
            logger.debug("rD keys %r", list(rDL[0].keys()))
            tD = {}
            for rD in rDL:
                qD = {}
                for kTup in [
                    ("Therapeutic", "antibodyName"),
                    ("Format", "antiBodyFormat"),
                    ("CH1 Isotype", "ch1Isotype"),
                    ("VD LC", "VD_LC"),
                    ("Highest_Clin_Trial (Oct '21)", "maxClinicalPhase"),
                    ("Est. Status", "status"),
                    ("Target", "target"),
                    ("Conditions Approved", "conditionsApproved"),
                    ("Conditions Active", "conditionsActive"),
                ]:
                    if kTup[0] in rD and rD[kTup[0]] not in ["na", "na;na"]:
                        qD[kTup[1]] = rD[kTup[0]]
                    else:
                        qD[kTup[1]] = None
                    if kTup[0] not in rD:
                        logger.error(
                            "SabDab key %r missing in input dataset %r",
                            kTup[0], list(rD.keys()))
                tD[rD["Therapeutic"]] = qD
            aD = self.__reloadAssignments(dirPath, **kwargs)
            #
            tS = datetime.datetime.now().isoformat()
            vS = datetime.datetime.now().strftime("%Y-%m-%d")
            oD = {
                "version": vS,
                "created": tS,
                "identifiers": tD,
                "assignments": aD
            }
            ok = self.__mU.doExport(dataPath, oD, fmt="json", indent=3)
            logger.info(
                "Exporting (%d) Thera-SAbDab data records and (%d) SAbDab assignments in %r status %r",
                len(oD["identifiers"]), len(oD["assignments"]), dataPath, ok)

        # ---
        logger.info("Completed reload (%r) at %s (%.4f seconds)", ok,
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    time.time() - startTime)
        return oD["identifiers"], oD["assignments"], dumpPath, oD["version"]
Example No. 19
class MarshalUtil(object):
    """Wrapper for serialization and deserialization methods."""
    def __init__(self, **kwargs):
        self.__workPath = kwargs.get("workPath", ".")
        self.__workDirSuffix = kwargs.get("workDirSuffix", "marshall_")
        self.__workDirPrefix = kwargs.get("workDirSuffix", "_tempdir")
        #
        self.__fileU = FileUtil(workPath=self.__workPath)
        self.__ioU = IoUtil()

    def doExport(self,
                 locator,
                 obj,
                 fmt="list",
                 marshalHelper=None,
                 numParts=None,
                 **kwargs):
        """Serialize the input object at locator path in specified format.  The
        input object is optionally preprocessed by the helper method.

        Args:
            locator (str): target path or URI
            obj (object): data to be serialized
            fmt (str, optional): format for serialization (mmcif, tdd, csv, list). Defaults to "list".
            marshalHelper (method, optional): pre-processor method applied to input data object. Defaults to None.
            numParts (int, optional): serialize the data in parts. Defaults to None. (json and pickle formats)
        Returns:
            bool: True for success or False otherwise
        """
        try:
            ret = False
            localFlag = self.__fileU.isLocal(locator)
            if marshalHelper:
                myObj = marshalHelper(obj, **kwargs)
            else:
                myObj = obj
            #
            if localFlag and numParts and fmt in ["json", "pickle"]:
                localFilePath = self.__fileU.getFilePath(locator)
                ret = self.__ioU.serializeInParts(localFilePath,
                                                  myObj,
                                                  numParts,
                                                  fmt=fmt,
                                                  **kwargs)
            elif localFlag:
                localFilePath = self.__fileU.getFilePath(locator)
                ret = self.__ioU.serialize(localFilePath,
                                           myObj,
                                           fmt=fmt,
                                           workPath=self.__workPath,
                                           **kwargs)
            else:
                with tempfile.TemporaryDirectory(
                        suffix=self.__workDirSuffix,
                        prefix=self.__workDirPrefix,
                        dir=self.__workPath) as tmpDirName:
                    # write a local copy then copy to destination -
                    #
                    localFilePath = os.path.join(
                        self.__workPath, tmpDirName,
                        self.__fileU.getFileName(locator))
                    ok1 = self.__ioU.serialize(localFilePath,
                                               myObj,
                                               fmt=fmt,
                                               workPath=self.__workPath,
                                               **kwargs)
                    ok2 = True
                    if ok1:
                        ok2 = self.__fileU.put(localFilePath, locator,
                                               **kwargs)
                ret = ok1 and ok2
        except Exception as e:
            logger.exception("Exporting locator %r failing with %s", locator,
                             str(e))

        return ret

    def doImport(self,
                 locator,
                 fmt="list",
                 marshalHelper=None,
                 numParts=None,
                 **kwargs):
        """Deserialize data at the target locator in specified format. The deserialized
        data is optionally post-processed by the input helper method.

        Args:
            locator (str): path or URI to input data
            fmt (str, optional): format for deserialization (mmcif, tdd, csv, list). Defaults to "list".
            marshalHelper (method, optional): post-processor method applied to deserialized data object. Defaults to None.
            numParts (int, optional): deserialize the data in parts. Defaults to None. (json and pickle formats)
            tarMember (str, optional): name of a member of tar file bundle. Defaults to None. (tar file format)

        Returns:
            Any: format specific return type
        """
        try:
            tarMember = kwargs.get("tarMember", None)
            localFlag = self.__fileU.isLocal(locator) and not tarMember
            #
            if localFlag and numParts and fmt in ["json", "pickle"]:
                filePath = self.__fileU.getFilePath(locator)
                ret = self.__ioU.deserializeInParts(filePath,
                                                    numParts,
                                                    fmt=fmt,
                                                    **kwargs)
            elif localFlag:
                filePath = self.__fileU.getFilePath(locator)
                ret = self.__ioU.deserialize(filePath,
                                             fmt=fmt,
                                             workPath=self.__workPath,
                                             **kwargs)
            else:
                #
                if fmt == "mmcif":
                    ret = self.__ioU.deserialize(locator,
                                                 fmt=fmt,
                                                 workPath=self.__workPath,
                                                 **kwargs)
                else:
                    with tempfile.TemporaryDirectory(
                            suffix=self.__workDirSuffix,
                            prefix=self.__workDirPrefix,
                            dir=self.__workPath) as tmpDirName:
                        #
                        # Fetch first then read a local copy -
                        #
                        if tarMember:
                            localFilePath = os.path.join(
                                self.__workPath, tmpDirName, tarMember)
                        else:
                            localFilePath = os.path.join(
                                self.__workPath, tmpDirName,
                                self.__fileU.getFileName(locator))

                        # ---  Local copy approach ---
                        self.__fileU.get(locator, localFilePath, **kwargs)
                        ret = self.__ioU.deserialize(localFilePath,
                                                     fmt=fmt,
                                                     workPath=self.__workPath,
                                                     **kwargs)

            if marshalHelper:
                ret = marshalHelper(ret, **kwargs)
        except Exception as e:
            logger.exception("Importing locator %r failing with %s", locator,
                             str(e))
            ret = None
        return ret

    def exists(self, filePath, mode=os.R_OK):
        return self.__fileU.exists(filePath, mode=mode)

    def mkdir(self, dirPath, mode=0o755):
        return self.__fileU.mkdir(dirPath, mode=mode)

    def remove(self, pth):
        return self.__fileU.remove(pth)
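A minimal round-trip sketch for MarshalUtil (paths are illustrative):

mU = MarshalUtil(workPath="CACHE")
data = {"alpha": [1, 2, 3], "beta": {"x": 1.5}}
ok = mU.doExport("CACHE/example.json", data, fmt="json", indent=3)
readBack = mU.doImport("CACHE/example.json", fmt="json")
assert ok and readBack == data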
Example No. 20
 def __reload(self,
                 dirPath,
                 useCache=False,
                 imgtDumpUrl=None,
                 testList=None,
                 maxCount=None):
        imgtD = {}
        startTime = time.time()

        fU = FileUtil()
        fU.mkdir(dirPath)
        #
        imgtDataPath = os.path.join(self.__dirPath, "imgt-data.json")
        #
        logger.info("useCache %r imgtFeaturePath %r", useCache, imgtDataPath)
        if useCache and self.__mU.exists(imgtDataPath):
            imgtD = self.__mU.doImport(imgtDataPath, fmt="json")
            self.__version = imgtD["version"]
        else:
            imgtDumpUrl = imgtDumpUrl if imgtDumpUrl else "http://www.imgt.org/download/3Dstructure-DB/IMGT3DFlatFiles.tgz"
            imgtReadmeUrl = "http://www.imgt.org/download/3Dstructure-DB/RELEASE"
            imgtDumpFileName = fU.getFileName(imgtDumpUrl)
            imgtDumpPath = os.path.join(dirPath, imgtDumpFileName)
            imgtReleasePath = os.path.join(dirPath, "IMGT-release.txt")
            _, fn = os.path.split(imgtDumpUrl)
            imgtFlatFilePath = os.path.join(self.__dirPath, fn[:-4])
            #
            logger.info("Fetching url %s path %s", imgtDumpUrl, imgtDumpPath)
            ok1 = fU.get(imgtDumpUrl, imgtDumpPath)
            ok2 = fU.get(imgtReadmeUrl, imgtReleasePath)
            fU.unbundleTarfile(imgtDumpPath, dirPath=dirPath)
            logger.info("Completed fetch (%r) at %s (%.4f seconds)", ok1
                        and ok2,
                        time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                        time.time() - startTime)
            # ---
            readmeLines = self.__mU.doImport(imgtReleasePath, fmt="list")
            self.__version = readmeLines[0].strip() if readmeLines else None
            logger.info("IMGT version %r", self.__version)
            # ---
            chainD, rawD = self.__imgtFlatFileProcessor(imgtFlatFilePath,
                                                        maxCount=maxCount,
                                                        testList=testList)
            # ---
            tS = datetime.datetime.now().isoformat()
            # vS = datetime.datetime.now().strftime("%Y-%m-%d")
            if testList:
                imgtD = {
                    "version": self.__version,
                    "date": tS,
                    "chains": chainD,
                    "raw": rawD
                }
            else:
                imgtD = {
                    "version": self.__version,
                    "date": tS,
                    "chains": chainD
                }
            ok = self.__mU.doExport(imgtDataPath, imgtD, fmt="json", indent=3)
            logger.info("Completed flatfile prep (%r) at %s (%.4f seconds)",
                        ok, time.strftime("%Y %m %d %H:%M:%S",
                                          time.localtime()),
                        time.time() - startTime)
        return imgtD
Example No. 21
class IoUtil(object):
    def __init__(self, **kwargs):
        self.__fileU = FileUtil(**kwargs)

    def serialize(self, filePath, myObj, fmt="pickle", **kwargs):
        """Public method to serialize format appropriate objects

        Args:
            filePath (str): local file path
            myObj (object): format appropriate object to be serialized
            fmt (str, optional): one of ['mmcif', 'mmcif-dict', 'json', 'list', 'text-dump', 'fasta', 'csv', 'pickle' (default)]
            **kwargs: additional keyword arguments passed to worker methods -

        Returns:
            bool: status of serialization operation; true for success or false otherwise

        """
        ret = False
        fmt = str(fmt).lower()
        ret = self.__fileU.mkdirForFile(filePath)
        if not ret:
            return ret
        if fmt in ["mmcif"]:
            ret = self.__serializeMmCif(filePath, myObj, **kwargs)
        elif fmt in ["json"]:
            ret = self.__serializeJson(filePath, myObj, **kwargs)
        elif fmt in ["pickle"]:
            ret = self.__serializePickle(filePath, myObj, **kwargs)
        elif fmt in ["list"]:
            ret = self.__serializeList(filePath,
                                       myObj,
                                       enforceAscii=True,
                                       **kwargs)
        elif fmt in ["mmcif-dict"]:
            ret = self.__serializeMmCifDict(filePath, myObj, **kwargs)
        elif fmt in ["text-dump"]:
            ret = self.__textDump(filePath, myObj, **kwargs)
        elif fmt in ["fasta"]:
            ret = self.__serializeFasta(filePath, myObj, **kwargs)
        elif fmt in ["csv"]:
            ret = self.__serializeCsv(filePath, myObj, **kwargs)
        else:
            pass

        return ret

    def deserialize(self, filePath, fmt="pickle", **kwargs):
        """Public method to deserialize objects in supported formats.

        Args:
            filePath (str): local file path
            fmt (str, optional): one of ['mmcif', 'json', 'list', ..., 'pickle' (default)]
            **kwargs:  additional keyword arguments passed to worker methods -

        Returns:
            object: deserialized object data

        """
        fmt = str(fmt).lower()
        if fmt in ["mmcif"]:
            ret = self.__deserializeMmCif(filePath, **kwargs)  # type: ignore
        elif fmt in ["json"]:
            ret = self.__deserializeJson(filePath, **kwargs)  # type: ignore
        elif fmt in ["pickle"]:
            ret = self.__deserializePickle(filePath, **kwargs)  # type: ignore
        elif fmt in ["list"]:
            ret = self.__deserializeList(filePath, enforceAscii=True,
                                         **kwargs)  # type: ignore
        elif fmt in ["mmcif-dict"]:
            ret = self.__deserializeMmCifDict(filePath,
                                              **kwargs)  # type: ignore
        elif fmt in ["fasta"]:
            ret = self.__deserializeFasta(filePath, **kwargs)  # type: ignore
        # elif fmt in ["vrpt-xml-to-cif"]:
        #    ret = self.__deserializeVrptToCif(filePath, **kwargs)  # type: ignore
        elif fmt in ["csv", "tdd"]:
            delimiter = kwargs.get("csvDelimiter",
                                   "," if fmt == "csv" else "\t")
            ret = self.__deserializeCsv(filePath,
                                        delimiter=delimiter,
                                        **kwargs)  # type: ignore
        elif fmt in ["xml"]:
            ret = self.__deserializeXml(filePath, **kwargs)  # type: ignore
        else:
            ret = None  # type: ignore

        return ret

    def __sliceInChunks(self, myList, numChunks):
        mc = min(len(myList), numChunks)
        chunkSize = int(len(myList) / mc)
        if len(myList) % mc:
            chunkSize += 1
        for i in range(0, len(myList), chunkSize):
            yield myList[i:i + chunkSize]
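        # Worked example (illustrative): __sliceInChunks(list(range(10)), 3)
        # computes a chunk size of 4 and yields slices of length 4, 4 and 2.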

    def serializeInParts(self,
                         filePath,
                         myObj,
                         numParts,
                         fmt="json",
                         **kwargs):
        """Public method to serialize format appropriate (json, pickle) objects in multiple parts

        Args:
            filePath (str): local file path
            myObj (object): format appropriate object to be serialized
            numParts (int): divide the data into numParts segments
            fmt (str, optional): one of 'json' or 'pickle'. Defaults to 'json'.
            **kwargs: additional keyword arguments passed to worker methods -

        Returns:
            bool: True for success or False otherwise
        """
        if fmt not in ["json", "pickle"]:
            logger.error("Unsupported serialization format %r", fmt)
            return False
        pth, fn = os.path.split(filePath)
        self.__fileU.mkdirForFile(pth)
        bn, ext = os.path.splitext(fn)
        ret = True
        if isinstance(myObj, list):
            for ii, subList in enumerate(self.__sliceInChunks(myObj,
                                                              numParts)):
                fp = os.path.join(pth, bn + "_part_%d" % (ii + 1) + ext)
                ok = self.serialize(fp, subList, fmt=fmt, **kwargs)
                ret = ret and ok
        elif isinstance(myObj, dict):
            for ii, keyList in enumerate(
                    self.__sliceInChunks(list(myObj.keys()), numParts)):
                fp = os.path.join(pth, bn + "_part_%d" % (ii + 1) + ext)
                ok = self.serialize(fp,
                                    OrderedDict([(k, myObj[k])
                                                 for k in keyList]),
                                    fmt=fmt,
                                    **kwargs)
                ret = ret and ok
        else:
            logger.error("Unsupported data type for serialization in parts")
            ret = False
        #
        return ret
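    # Note (illustrative): serializeInParts("./work/data.json", obj, numParts=3, fmt="json")
    # would write ./work/data_part_1.json, data_part_2.json, data_part_3.json (fewer parts
    # if obj is short); deserializeInParts() with the same arguments reassembles the object,
    # and with numParts=0/None it globs for data_part_*.json and merges whatever it finds.
    # The "./work/data.json" path above is a placeholder.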

    def deserializeInParts(self, filePath, numParts, fmt="json", **kwargs):
        """Public method to deserialize objects in supported formats from multiple parts

        Args:
            filePath (str): local file path
            numParts (int): reconstruct the data object from numParts segments
            fmt (str, optional): one of 'json' or 'pickle'. Defaults to 'json'.
            **kwargs: additional keyword arguments passed to worker methods -

        Returns:
            object: deserialized object data
        """
        rObj = None
        if fmt not in ["json", "pickle"]:
            logger.error("Unsupported deserialization format %r", fmt)
            return rObj
        #
        pth, fn = os.path.split(filePath)
        bn, ext = os.path.splitext(fn)
        if not numParts:
            fp = os.path.join(pth, bn + "_part_*" + ext)
            numParts = len(glob.glob(fp))
        #
        for ii in range(numParts):
            fp = os.path.join(pth, bn + "_part_%d" % (ii + 1) + ext)
            tObj = self.deserialize(fp, fmt=fmt, **kwargs)
            if isinstance(tObj, list):
                if not rObj:
                    rObj = []
                rObj.extend(tObj)
            elif isinstance(tObj, dict):
                if not rObj:
                    rObj = OrderedDict()
                rObj.update(tObj)
            else:
                logger.error(
                    "Unsupported data type for deserialization in parts")
        return rObj

    def exists(self, filePath, mode=os.R_OK):
        return self.__fileU.exists(filePath, mode=mode)

    def mkdir(self, dirPath, mode=0o755):
        return self.__fileU.mkdir(dirPath, mode=mode)

    def remove(self, pth):
        return self.__fileU.remove(pth)

    def __deserializeFasta(self, filePath, **kwargs):
        try:
            commentStyle = kwargs.get("commentStyle", "uniprot")
            fau = FastaUtil()
            return fau.readFasta(filePath, commentStyle=commentStyle)
        except Exception as e:
            logger.error("Unable to deserialize %r %r ", filePath, str(e))
        return {}

    def __serializeFasta(self, filePath, myObj, **kwargs):
        try:
            maxLineLength = int(kwargs.get("maxLineLength", 70))
            makeComment = kwargs.get("makeComment", False)
            fau = FastaUtil()
            ok = fau.writeFasta(filePath,
                                myObj,
                                maxLineLength=maxLineLength,
                                makeComment=makeComment)
            return ok
        except Exception as e:
            logger.error("Unable to serialize FASTA file %r  %r", filePath,
                         str(e))
        return False

    def __textDump(self, filePath, myObj, **kwargs):
        try:
            indent = kwargs.get("indent", 1)
            width = kwargs.get("width", 120)
            sOut = pprint.pformat(myObj, indent=indent, width=width)
            with open(filePath, "w") as ofh:
                ofh.write("\n%s\n" % sOut)
            return True
        except Exception as e:
            logger.error("Unable to dump to %r  %r", filePath, str(e))
        return False

    def __serializePickle(self, filePath, myObj, **kwargs):
        try:
            pickleProtocol = kwargs.get("pickleProtocol",
                                        pickle.DEFAULT_PROTOCOL)

            with open(filePath, "wb") as outfile:
                pickle.dump(myObj, outfile, pickleProtocol)
            return True
        except Exception as e:
            logger.error("Unable to serialize %r  %r", filePath, str(e))
        return False

    def __deserializePickle(self, filePath, **kwargs):
        myDefault = kwargs.get("default", {})
        try:
            if sys.version_info[0] > 2:
                encoding = kwargs.get("encoding", "ASCII")
                errors = kwargs.get("errors", "strict")
                with open(filePath, "rb") as outfile:
                    return pickle.load(outfile,
                                       encoding=encoding,
                                       errors=errors)
            else:
                with open(filePath, "rb") as outfile:
                    return pickle.load(outfile)
        except Exception as e:
            logger.warning("Unable to deserialize %r %r", filePath, str(e))
        return myDefault

    def __serializeJson(self, filePath, myObj, **kwargs):
        """Internal method to serialize the input object as JSON.  An encoding
        helper class is included to handle selected python data types (e.g., datetime)
        """
        indent = kwargs.get("indent", 0)
        enforceAscii = kwargs.get("enforceAscii", True)
        try:
            if enforceAscii:
                with open(filePath, "w") as outfile:
                    json.dump(myObj,
                              outfile,
                              indent=indent,
                              cls=JsonTypeEncoder,
                              ensure_ascii=enforceAscii)
            else:
                with io.open(filePath, "w", encoding="utf-8") as outfile:
                    json.dump(myObj,
                              outfile,
                              indent=indent,
                              cls=JsonTypeEncoder,
                              ensure_ascii=enforceAscii)
            return True
        except Exception as e:
            logger.error("Unable to serialize %r  %r", filePath, str(e))
        return False

    def __deserializeJson(self, filePath, **kwargs):
        myDefault = kwargs.get("default", {})
        encoding = kwargs.get("encoding", "utf-8-sig")
        encodingErrors = kwargs.get("encodingErrors", "ignore")
        try:
            if filePath[-3:] == ".gz":
                if sys.version_info[0] > 2:
                    with gzip.open(filePath,
                                   "rt",
                                   encoding=encoding,
                                   errors=encodingErrors) as inpFile:
                        return json.load(inpFile,
                                         object_pairs_hook=OrderedDict)
                else:
                    # Py2: non-ascii encodings are problematic with gzip.open,
                    # so uncompress to a temporary path first.
                    tPath = self.__fileU.uncompress(filePath, outputDir=None)
                    with io.open(tPath,
                                 newline="",
                                 encoding=encoding,
                                 errors="ignore") as inpFile:
                        return json.load(inpFile,
                                         object_pairs_hook=OrderedDict)
            else:
                with open(filePath, "r") as inpFile:
                    return json.load(inpFile, object_pairs_hook=OrderedDict)
        except Exception as e:
            logger.warning("Unable to deserialize %r %r", filePath, str(e))
        return myDefault

    def __hasMinSize(self, pth, minSize):
        try:
            return os.path.getsize(pth) >= minSize
        except Exception:
            return False

    def __deserializeMmCif(self, locator, **kwargs):
        """ """
        try:
            containerList = []
            workPath = kwargs.get("workPath", None)
            enforceAscii = kwargs.get("enforceAscii", True)
            raiseExceptions = kwargs.get("raiseExceptions", True)
            useCharRefs = kwargs.get("useCharRefs", True)
            minSize = kwargs.get("minSize", 5)
            #
            if self.__fileU.isLocal(locator):
                if minSize >= 0 and not self.__hasMinSize(locator, minSize):
                    logger.warning("Minimum file size not satisfied for: %r",
                                   locator)
                myIo = IoAdapter(raiseExceptions=raiseExceptions,
                                 useCharRefs=useCharRefs)
                containerList = myIo.readFile(
                    locator, enforceAscii=enforceAscii,
                    outDirPath=workPath)  # type: ignore
            else:
                # myIo = IoAdapterPy(raiseExceptions=raiseExceptions, useCharRefs=useCharRefs)
                # containerList = myIo.readFile(locator, enforceAscii=enforceAscii, outDirPath=workPath)
                containerList = self.__deserializeMmCifRemote(
                    locator, useCharRefs, enforceAscii, workPath)

        except Exception as e:
            logger.error("Failing for %s with %s", locator, str(e))
        return containerList

    @retry((requests.exceptions.RequestException),
           maxAttempts=3,
           delaySeconds=1,
           multiplier=2,
           defaultValue=[],
           logger=logger)
    def __deserializeMmCifRemote(self, locator, useCharRefs, enforceAscii,
                                 workPath):
        containerList = []
        try:
            myIo = IoAdapterPy(raiseExceptions=True, useCharRefs=useCharRefs)
            containerList = myIo.readFile(locator,
                                          enforceAscii=enforceAscii,
                                          outDirPath=workPath)
        except Exception as e:
            raise e
        return containerList

    def __serializeMmCif(self, filePath, containerList, **kwargs):
        """ """
        try:
            ret = False
            workPath = kwargs.get("workPath", None)
            enforceAscii = kwargs.get("enforceAscii", True)
            raiseExceptions = kwargs.get("raiseExceptions", True)
            useCharRefs = kwargs.get("useCharRefs", True)
            #
            myIo = IoAdapter(raiseExceptions=raiseExceptions,
                             useCharRefs=useCharRefs)
            if filePath.endswith(".gz") and workPath:
                rfn = "".join(
                    random.choice(string.ascii_uppercase + string.digits)
                    for _ in range(10))
                tPath = os.path.join(workPath, rfn)
                ret = myIo.writeFile(tPath,
                                     containerList=containerList,
                                     enforceAscii=enforceAscii)
                ret = ret and self.__fileU.compress(tPath,
                                                    filePath,
                                                    compressType="gzip")
            else:
                ret = myIo.writeFile(filePath,
                                     containerList=containerList,
                                     enforceAscii=enforceAscii)
        except Exception as e:
            logger.error("Failing for %s with %s", filePath, str(e))
        return ret

    def __deserializeMmCifDict(self, filePath, **kwargs):
        """ """
        try:
            containerList = []
            workPath = kwargs.get("workPath", None)
            enforceAscii = kwargs.get("enforceAscii", True)
            raiseExceptions = kwargs.get("raiseExceptions", True)
            useCharRefs = kwargs.get("useCharRefs", True)
            #
            myIo = IoAdapterPy(raiseExceptions=raiseExceptions,
                               useCharRefs=useCharRefs)
            containerList = myIo.readFile(filePath,
                                          enforceAscii=enforceAscii,
                                          outDirPath=workPath)
        except Exception as e:
            logger.error("Failing for %s with %s", filePath, str(e))
        return containerList

    def __serializeMmCifDict(self, filePath, containerList, **kwargs):
        """ """
        try:
            ret = False
            # workPath = kwargs.get('workPath', None)
            enforceAscii = kwargs.get("enforceAscii", True)
            raiseExceptions = kwargs.get("raiseExceptions", True)
            useCharRefs = kwargs.get("useCharRefs", True)
            #
            myIo = IoAdapterPy(raiseExceptions=raiseExceptions,
                               useCharRefs=useCharRefs)
            ret = myIo.writeFile(filePath,
                                 containerList=containerList,
                                 enforceAscii=enforceAscii)
        except Exception as e:
            logger.error("Failing for %s with %s", filePath, str(e))
        return ret

    def __serializeList(self, filePath, aList, enforceAscii=True, **kwargs):
        """ """

        try:
            _ = kwargs
            if enforceAscii:
                encoding = "ascii"
            else:
                encoding = "utf-8"
            #
            if sys.version_info[0] > 2:
                with open(filePath, "w") as ofh:
                    if enforceAscii:
                        for st in aList:
                            ofh.write("%s\n" % st.encode(
                                "ascii", "xmlcharrefreplace").decode("ascii"))
                    else:
                        for st in aList:
                            ofh.write("%s\n" % st)
            else:
                if enforceAscii:
                    with io.open(filePath, "w", encoding=encoding) as ofh:
                        for st in aList:
                            ofh.write("%s\n" % st.encode(
                                "ascii", "xmlcharrefreplace").decode("ascii"))
                else:
                    with open(filePath, "wb") as ofh:
                        for st in aList:
                            ofh.write("%s\n" % st)
            return True
        except Exception as e:
            logger.error("Unable to serialize %r %r", filePath, str(e))
        return False

    def __processList(self, ifh, enforceAscii=True, **kwargs):
        uncomment = kwargs.get("uncomment", True)
        aList = []
        for line in ifh:
            if enforceAscii:
                pth = line[:-1].encode("ascii",
                                       "xmlcharrefreplace").decode("ascii")
            else:
                pth = line[:-1]
            if not pth or (uncomment and pth.startswith("#")):
                continue
            aList.append(pth)
        return aList

    def __deserializeList(self,
                          filePath,
                          enforceAscii=True,
                          encodingErrors="ignore",
                          **kwargs):
        aList = []
        try:
            if filePath[-3:] == ".gz":
                if sys.version_info[0] > 2:
                    with gzip.open(filePath,
                                   "rt",
                                   encoding="utf-8-sig",
                                   errors=encodingErrors) as ifh:
                        aList = self.__processList(ifh,
                                                   enforceAscii=enforceAscii,
                                                   **kwargs)
                else:
                    tPath = self.__fileU.uncompress(filePath, outputDir=None)
                    # Py2: the commented gzip.open approach below is problematic for non-ascii data
                    # with gzip.open(filePath, "rb") as ifh:
                    #    aList = self.__processList(ifh, enforceAscii=enforceAscii)
                    with io.open(tPath, encoding="utf-8-sig",
                                 errors="ignore") as ifh:
                        aList = self.__processList(ifh,
                                                   enforceAscii=enforceAscii)
            else:
                with io.open(filePath, encoding="utf-8-sig",
                             errors="ignore") as ifh:
                    aList = self.__processList(ifh,
                                               enforceAscii=enforceAscii,
                                               **kwargs)
        except Exception as e:
            logger.error("Unable to deserialize %r %s", filePath, str(e))
        #
        logger.debug("Reading list length %d", len(aList))
        return aList

    def __csvReader(self, csvFile, rowFormat, delimiter, uncomment=True):
        oL = []

        maxInt = sys.maxsize
        csv.field_size_limit(maxInt)
        if rowFormat == "dict":
            if uncomment:
                reader = csv.DictReader(uncommentFilter(csvFile),
                                        delimiter=delimiter)
            else:
                reader = csv.DictReader(csvFile, delimiter=delimiter)
            for rowD in reader:
                oL.append(rowD)
        elif rowFormat == "list":
            if uncomment:
                reader = csv.reader(uncommentFilter(csvFile),
                                    delimiter=delimiter)
            else:
                reader = csv.reader(csvFile, delimiter=delimiter)
            for rowL in reader:
                oL.append(rowL)
        return oL

    def deserializeCsvIter(self,
                           filePath,
                           delimiter=",",
                           rowFormat="dict",
                           encodingErrors="ignore",
                           uncomment=True,
                           **kwargs):
        """Return an iterator to input CSV format file.

        Args:
            filePath (str): input file path
            delimiter (str, optional): CSV delimiter. Defaults to ",".
            rowFormat (str, optional): format for each processed row ('list' or 'dict'). Defaults to "dict".
            encodingErrors (str, optional): treatment of encoding errors. Defaults to "ignore".
            uncomment (bool, optional): flag to ignore leading comments. Defaults to True.

        Returns:
            (iterator): iterator for rowwise access to processed CSV data
        """
        encoding = kwargs.get("encoding", "utf-8-sig")
        maxInt = sys.maxsize
        csv.field_size_limit(maxInt)
        try:
            if filePath[-3:] == ".gz":
                with gzip.open(filePath,
                               "rt",
                               encoding=encoding,
                               errors=encodingErrors) as csvFile:
                    startIt = itertools.dropwhile(
                        lambda x: x.startswith("#"),
                        csvFile) if uncomment else csvFile
                    if rowFormat == "dict":
                        reader = csv.DictReader(startIt, delimiter=delimiter)
                    elif rowFormat == "list":
                        reader = csv.reader(startIt, delimiter=delimiter)
                    for row in reader:
                        yield row
            else:
                with io.open(filePath,
                             newline="",
                             encoding=encoding,
                             errors="ignore") as csvFile:
                    startIt = itertools.dropwhile(
                        lambda x: x.startswith("#"),
                        csvFile) if uncomment else csvFile
                    if rowFormat == "dict":
                        reader = csv.DictReader(startIt, delimiter=delimiter)
                    elif rowFormat == "list":
                        reader = csv.reader(startIt, delimiter=delimiter)
                    for row in reader:
                        # if uncomment and row.startswith("#"):
                        #    continue
                        yield row
        except Exception as e:
            logger.error("Unable to deserialize %r %s", filePath, str(e))

    def __deserializeCsv(self,
                         filePath,
                         delimiter=",",
                         rowFormat="dict",
                         encodingErrors="ignore",
                         uncomment=True,
                         **kwargs):
        oL = []
        encoding = kwargs.get("encoding", "utf-8-sig")
        try:
            if filePath[-3:] == ".gz":
                if sys.version_info[0] > 2:
                    with gzip.open(filePath,
                                   "rt",
                                   encoding=encoding,
                                   errors=encodingErrors) as csvFile:
                        oL = self.__csvReader(csvFile,
                                              rowFormat,
                                              delimiter,
                                              uncomment=uncomment)
                else:
                    # Py2: non-ascii encodings are problematic with gzip.open
                    # with gzip.open(filePath, "rb") as csvFile:
                    #    oL = self.__csvReader(csvFile, rowFormat, delimiter)
                    tPath = self.__fileU.uncompress(filePath, outputDir=None)
                    with io.open(tPath,
                                 newline="",
                                 encoding=encoding,
                                 errors="ignore") as csvFile:
                        oL = self.__csvReader(csvFile,
                                              rowFormat,
                                              delimiter,
                                              uncomment=uncomment)
            else:
                with io.open(filePath,
                             newline="",
                             encoding=encoding,
                             errors="ignore") as csvFile:
                    oL = self.__csvReader(csvFile,
                                          rowFormat,
                                          delimiter,
                                          uncomment=uncomment)

            return oL
        except Exception as e:
            logger.error("Unable to deserialize %r %s", filePath, str(e))
        #
        logger.debug("Reading list length %d", len(oL))
        return oL

    def __serializeCsv(self, filePath, rowDictList, fieldNames=None, **kwargs):
        """ """
        _ = kwargs
        try:
            wD = {}
            ret = False
            fNames = fieldNames if fieldNames else list(rowDictList[0].keys())
            # with io.open(filePath, 'w', newline='') as csvFile:
            with open(filePath, "w") as csvFile:
                writer = csv.DictWriter(csvFile, fieldnames=fNames)
                writer.writeheader()
                for ii, rowDict in enumerate(rowDictList):
                    try:
                        wD = {k: v for k, v in rowDict.items() if k in fNames}
                        writer.writerow(wD)
                    except Exception as e:
                        logger.error(
                            "Skipping bad CSV record %d wD %r rowDict %r with %s",
                            ii + 1, wD, rowDict, str(e))
                        continue

            ret = True
        except Exception as e:
            logger.error("Failing for %s : %r with %s", filePath, wD, str(e))
        return ret

    def __csvEncoder(self,
                     csvData,
                     encoding="utf-8-sig",
                     encodingErrors="ignore"):
        """Handle encoding issues for gzipped data in Py2. (beware of the BOM chars)

        Args:
            csvData (text lines): uncompressed data from gzip open
            encoding (str, optional): character encoding. Defaults to "utf-8-sig".
            encodingErrors (str, optional): error treatment. Defaults to "ignore".
        """
        for line in csvData:
            yield line.decode("utf-8-sig", errors=encodingErrors).encode(
                encoding, errors=encodingErrors)

    def __deserializeXmlPrev(self, filePath, **kwargs):
        """Read the input XML file path and return an ElementTree data object instance.

        Args:
            filePath (str): input XML file path

        Returns:
            object: instance of an ElementTree tree object
        """
        _ = kwargs
        tree = None
        try:
            logger.debug("Parsing XML path %s", filePath)
            if filePath[-3:] == ".gz":
                with gzip.open(filePath, mode="rb") as ifh:
                    tV = time.time()
                    tree = ET.parse(ifh)
            else:
                with open(filePath, mode="rb") as ifh:
                    tV = time.time()
                    tree = ET.parse(ifh)
            logger.debug("Parsed %s in %.2f seconds", filePath,
                         time.time() - tV)
        except Exception as e:
            logger.error("Unable to deserialize %r %s", filePath, str(e))
        #
        return tree

    def __testGzip(self, filePath):
        ok = True
        with gzip.open(filePath, "r") as fh:
            try:
                fh.read(1)
            except Exception:
                # covers gzip.BadGzipFile (Python >= 3.8) and any other read error
                ok = False
        logger.debug("Gzip file check %r", ok)
        return ok

    def __deserializeXml(self, filePath, **kwargs):
        """Read the input XML file path and return an ElementTree data object instance.

        Args:
            filePath (str): input XML file path

        Returns:
            object: instance of an ElementTree tree object
        """
        tree = None
        encoding = kwargs.get("encoding", "utf-8-sig")
        encodingErrors = kwargs.get("encodingErrors", "ignore")
        #
        try:
            logger.debug("Parsing XML path %s", filePath)
            if filePath[-3:] == ".gz" and self.__testGzip(filePath):
                if sys.version_info[0] > 2:
                    with gzip.open(filePath,
                                   "rt",
                                   encoding=encoding,
                                   errors=encodingErrors) as ifh:
                        tV = time.time()
                        tree = ET.parse(ifh)
                else:
                    tPath = self.__fileU.uncompress(filePath, outputDir=None)
                    with io.open(tPath,
                                 encoding=encoding,
                                 errors=encodingErrors) as ifh:
                        tV = time.time()
                        tree = ET.parse(ifh)
            else:
                with io.open(filePath,
                             encoding=encoding,
                             errors=encodingErrors) as ifh:
                    tV = time.time()
                    tree = ET.parse(ifh)
            logger.debug("Parsed %s in %.2f seconds", filePath,
                         time.time() - tV)
        except Exception as e:
            logger.error("Unable to deserialize %r %s", filePath, str(e))
        #
        return tree
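# A minimal usage sketch for the IoUtil class above (not part of the original
# example).  It assumes the same imports/environment as the class itself; the
# file path and payload are illustrative.
if __name__ == "__main__":
    ioU = IoUtil()
    payload = {"a": 1, "b": [1, 2, 3]}
    okW = ioU.serialize("./work/example.json", payload, fmt="json", indent=3)
    dObj = ioU.deserialize("./work/example.json", fmt="json")
    logger.info("serialize status %r round-trip equal %r", okW, dObj == payload)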
Exemplo n.º 22
0
class ProvenanceProvider(SingletonClass):
    """Utilities to access and update provenance details."""
    def __init__(self, cfgOb, cachePath, useCache=True, **kwargs):
        """Utilities to access and update provenance details.

        Args:
            cfgOb (object): ConfigInfo() instance
            cachePath (str): path to the top-level cache directory
            useCache (bool, optional): use cached schema. Defaults to True.
        """

        self.__cfgOb = cfgOb
        self.__configName = self.__cfgOb.getDefaultSectionName()
        self.__cachePath = cachePath
        self.__useCache = useCache
        #
        self.__workPath = os.path.join(self.__cachePath, "work")
        self.__provenanceCachePath = os.path.join(
            self.__cachePath,
            self.__cfgOb.get("PROVENANCE_INFO_CACHE_DIR",
                             sectionName=self.__configName))
        self.__provenanceLocator = self.__cfgOb.getPath(
            "PROVENANCE_INFO_LOCATOR", sectionName=self.__configName)
        #
        self.__fileU = FileUtil(workPath=self.__workPath)
        self.__fileU.mkdir(self.__provenanceCachePath)
        self.__kwargs = kwargs
        #

    def __reload(self, locator, dirPath, useCache=True):
        #
        fn = self.__fileU.getFileName(locator)
        filePath = os.path.join(dirPath, fn)
        logger.debug("Using cache path %s", dirPath)
        self.__fileU.mkdir(dirPath)
        if not useCache:
            try:
                os.remove(filePath)
            except Exception:
                pass
        #
        if useCache and self.__fileU.exists(filePath):
            ok = True
        else:
            logger.debug("Fetch data from source %s", locator)
            ok = self.__fileU.get(locator, filePath)

        return filePath if ok else None

    def fetch(self):
        try:
            provenanceFileCachePath = self.__reload(self.__provenanceLocator,
                                                    self.__provenanceCachePath,
                                                    useCache=self.__useCache)
            mU = MarshalUtil(workPath=self.__workPath)
            return mU.doImport(provenanceFileCachePath, fmt="json")
        except Exception as e:
            logger.exception("Failed retreiving provenance with %s", str(e))
        return {}

    def update(self, provD):
        ok = False
        try:
            provenanceFileCachePath = self.__reload(self.__provenanceLocator,
                                                    self.__provenanceCachePath,
                                                    useCache=self.__useCache)
            mU = MarshalUtil(workPath=self.__workPath)
            tD = mU.doImport(provenanceFileCachePath, fmt="json")
            tD.update(provD)
            ok = mU.doExport(provenanceFileCachePath, tD, fmt="json")
        except Exception as e:
            logger.exception("Failed updating provenance with %s", str(e))
        return ok

    def store(self, provD):
        ok = False
        try:
            provenanceFileCachePath = self.__reload(self.__provenanceLocator,
                                                    self.__provenanceCachePath,
                                                    useCache=self.__useCache)
            mU = MarshalUtil(workPath=self.__workPath)
            ok = mU.doExport(provenanceFileCachePath, provD, fmt="json")
        except Exception as e:
            logger.exception("Failed storing provenance with %s", str(e))
        return ok
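# Illustrative usage (not part of the original class), assuming a configured
# ConfigInfo() instance `cfgOb` whose default section defines
# PROVENANCE_INFO_CACHE_DIR and PROVENANCE_INFO_LOCATOR:
#   pP = ProvenanceProvider(cfgOb, cachePath="./CACHE", useCache=True)
#   provD = pP.fetch()
#   ok = pP.update({"my_provenance_key": {"updated": True}})   # "my_provenance_key" is a placeholder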
Exemplo n.º 23
0
class SchemaProvider(SingletonClass):
    """ A collection of schema build and caching methods.

        Static cache workflow:

            <authoritative source>  <--   <cache dir>  <-  client API

        Compute workflow:

        <dependent resource files, config file, dictionaries> -> [schema builder] --> <schema def> --> <Json schema>

    """

    def __init__(self, cfgOb, cachePath, useCache=True, rebuildFlag=False, **kwargs):
        """A collection of schema build and caching methods.

        Args:
            cfgOb (object): ConfigInfo() instance
            cachePath (str): path to directory containing schema
            useCache (bool, optional): use cached schema. Defaults to True.
            rebuildFlag (bool, optional): on-the-fly rebuild and cache schema
        """

        self.__cfgOb = cfgOb
        self.__configName = self.__cfgOb.getDefaultSectionName()
        self.__cachePath = os.path.abspath(cachePath)
        self.__useCache = useCache
        self.__rebuildFlag = rebuildFlag
        self.__useCache = rebuildFlag if rebuildFlag else useCache
        #
        self.__workPath = os.path.join(self.__cachePath, "work")

        self.__fileU = FileUtil(workPath=os.path.join(self.__cachePath, "work"))
        self.__schemaCachePath = os.path.join(self.__cachePath, self.__cfgOb.get("SCHEMA_DEFINITION_CACHE_DIR", sectionName=self.__configName))
        self.__jsonSchemaCachePath = os.path.join(self.__cachePath, self.__cfgOb.get("JSON_SCHEMA_DEFINITION_CACHE_DIR", sectionName=self.__configName))
        self.__fileU.mkdir(self.__schemaCachePath)
        self.__fileU.mkdir(self.__jsonSchemaCachePath)
        self.__kwargs = kwargs

    def getSchemaOptions(self, schemaLevel, extraOpts=None):
        opts = extraOpts + "|" if extraOpts else ""
        if schemaLevel == "full":
            return opts + "mandatoryKeys|mandatoryAttributes|bounds|enums|rcsb"
        elif schemaLevel in ["min", "minimum"]:
            return opts + "mandatoryKeys|enums|rcsb"
        else:
            return opts
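    # For example (illustrative): getSchemaOptions("full") returns
    # "mandatoryKeys|mandatoryAttributes|bounds|enums|rcsb", and
    # getSchemaOptions("min", extraOpts="myOpt") returns "myOpt|mandatoryKeys|enums|rcsb"
    # (where "myOpt" is a placeholder option string).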

    def getSchemaInfo(self, databaseName, dataTyping="ANY"):
        """Convenience method to return essential schema details for the input repository content type.

        Args:
            databaseName (str): schema name  (e.g. pdbx, bird, chem_comp, ...)
            dataTyping (str, optional): Application name for the target schema (e.g. ANY, SQL, ...)

        Returns:
            tuple: SchemaDefAccess(object), target database name, target collection name list, primary index attribute list


        """
        sd = None
        dbName = None
        collectionNameList = []
        docIndexD = {}
        try:
            mU = MarshalUtil(workPath=self.__workPath)
            schemaLocator = self.__getSchemaDefLocator(databaseName, dataTyping=dataTyping)
            if self.__rebuildFlag:
                filePath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaLocator))
                self.makeSchemaDef(databaseName, dataTyping=dataTyping, saveSchema=True)
            else:
                filePath = self.__reload(schemaLocator, self.__schemaCachePath, useCache=self.__useCache)

            if not filePath:
                logger.error("Unable to recover schema %s (%s)", databaseName, dataTyping)
            logger.debug("ContentType %r dataTyping %r schemaLocator %r", databaseName, dataTyping, schemaLocator)
            schemaDef = mU.doImport(filePath, fmt="json")
            if schemaDef:
                logger.debug("Using cached schema definition for %s application %s", databaseName, dataTyping)
                sd = SchemaDefAccess(schemaDef)
                if sd:
                    dbName = sd.getDatabaseName()
                    collectionInfoList = sd.getCollectionInfo()
                    logger.debug("Schema %s database name %s collections %r", databaseName, dbName, collectionInfoList)
                    for cd in collectionInfoList:
                        collectionName = cd["NAME"]
                        collectionNameList.append(collectionName)
                        docIndexD[collectionName] = sd.getDocumentIndices(collectionName)

        except Exception as e:
            logger.exception("Retreiving schema %s for %s failing with %s", databaseName, dataTyping, str(e))

        return sd, dbName, collectionNameList, docIndexD
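    # Illustrative usage (not part of the original class):
    #   sd, dbName, collectionNameList, docIndexD = schemaProvider.getSchemaInfo("pdbx")
    # where `schemaProvider` is a SchemaProvider instance and "pdbx" is one of the
    # configured schema names mentioned in the docstring above.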

    def schemaDefCompare(self, databaseName, dataTyping="ANY"):
        """Compare computed schema defintion with current source/cached version.

        Args:
            databaseName (str): schema definition name for comparison
            dataTyping (str, optional): data type conventions for the schema comparison. Defaults to "ANY".

        Returns:
            (str): file path for schema difference or None
        """
        mU = MarshalUtil(workPath=self.__workPath)
        schemaDiffPath = os.path.join(self.__cachePath, "schema_diff")
        mU.mkdir(schemaDiffPath)
        schemaPath = self.__getSchemaDefLocator(databaseName, dataTyping=dataTyping)
        fn = self.__fileU.getFileName(schemaPath)
        sD = self.makeSchemaDef(databaseName, dataTyping=dataTyping)
        v2 = sD["DATABASE_VERSION"]
        # ----
        # tPath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaPath) + "-test")
        # logger.info("Exporting schema def to %s", tPath)
        # mU.doExport(tPath, sD, fmt="json", indent=3)
        # sD = mU.doImport(tPath, fmt="json")
        # ----
        cPath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaPath))
        sDCache = mU.doImport(cPath, fmt="json")
        v1 = sDCache["DATABASE_VERSION"]
        #
        numDiff, difD = self.schemaCompare(sDCache, sD)
        #
        # jD = diff(sDCache, sD, syntax="explicit", marshal=True)
        diffPath = None
        if numDiff:
            bn, _ = os.path.splitext(fn)
            diffPath = os.path.join(schemaDiffPath, bn + "-" + v1 + "-" + v2 + "-diff.json")
            # logger.info("diff for %s %s = \n%s", databaseName, dataTyping, pprint.pformat(difD, indent=3, width=100))
            mU.doExport(diffPath, difD, fmt="json", indent=3)
        #
        return diffPath

    def jsonSchemaCompare(self, databaseName, collectionName, encodingType, level, extraOpts=None):
        """Compare computed JSON schema defintion with current source/cached version.

        Args:
            databaseName (str): schema name
            collectionName (str): collection name
            encodingType (str): schema data type conventions (JSON|BSON)
            level (str): metadata level (min|full)
            extraOpts (str): extra schema construction options

        Returns:
            (str): path to the difference file or None
        """
        mU = MarshalUtil(workPath=self.__workPath)
        schemaDiffPath = os.path.join(self.__cachePath, "schema_diff")
        mU.mkdir(schemaDiffPath)
        schemaLocator = self.__getJsonSchemaLocator(databaseName, collectionName, encodingType, level)
        fn = self.__fileU.getFileName(schemaLocator)
        schemaPath = os.path.join(self.__jsonSchemaCachePath, fn)
        #
        sD = self.makeSchema(databaseName, collectionName, encodingType=encodingType, level=level, saveSchema=False, extraOpts=extraOpts)
        v2 = self.__getSchemaVersion(sD)
        # ----
        # tPath = os.path.join(self.__jsonSchemaCachePath, self.__fileU.getFileName(schemaPath) + "-test")
        # logger.info("Exporting json schema to %s", tPath)
        # mU.doExport(tPath, sD, fmt="json", indent=3)
        # ----
        #
        sDCache = mU.doImport(schemaPath, fmt="json")
        v1 = self.__getSchemaVersion(sDCache)
        if not v1:
            logger.error("no version for %s - %s %s", schemaLocator, databaseName, collectionName)
        #
        numDiff, difD = self.schemaCompare(sDCache, sD)
        # jD = diff(sDCache, sD, marshal=True, syntax="explicit")
        diffPath = None
        if numDiff:
            logger.debug("diff for %s %s %s %s = \n%s", databaseName, collectionName, encodingType, level, pprint.pformat(difD, indent=3, width=100))
            bn, _ = os.path.splitext(fn)
            diffPath = os.path.join(schemaDiffPath, bn + "-" + v1 + "-" + v2 + "-diff.json")
            mU.doExport(diffPath, difD, fmt="json", indent=3)

        return diffPath

    def __getSchemaVersion(self, jsonSchema):
        try:
            comment = jsonSchema["$comment"] if "$comment" in jsonSchema else ""
            ff = comment.split(":")
            version = ff[1].strip()
            return version
        except Exception as e:
            logger.exception("Failing for with %s", str(e))
        return ""

    def __getSchemaDefLocator(self, databaseName, dataTyping="ANY"):
        """Internal method returning schema definition path for the input content type and application.
           Defines schema definition naming convention -

           Args:
            databaseName (str): schema name (e.g. pdbx, bird, chem_comp, ...)
            dataTyping (str, optional): Application name for the target schema (e.g. ANY, SQL, ...)

            Returns:

             str: schema definition file locator

        """
        schemaLocator = None
        try:
            locPath = self.__cfgOb.get("SCHEMA_DEFINITION_LOCATOR_PATH", sectionName=self.__configName)
            fn = "schema_def-%s-%s.json" % (databaseName, dataTyping.upper())
            schemaLocator = os.path.join(locPath, fn)
        except Exception as e:
            logger.exception("Retreiving schema definition path %s for %s failing with %s", databaseName, dataTyping, str(e))
        return schemaLocator

    def __getJsonSchemaLocator(self, databaseName, collectionName, encodingType="BSON", level="full"):
        """Internal method returning JSON schema path for the input collection data type convention and level.
           Defines the JSON/BSON schema naming convention -

           Args:
            databaseName (str): database name in the document store
            collectionName (str): collection name in document store
            encodingType (str, optional): data type convention (BSON|JSON)
            level (str, optional): Completeness of the schema (e.g. min or full)

            Returns:

            str: schema file locator

        """
        schemaLocator = None
        try:
            sdType = None
            sLevel = None
            schemaLocator = None
            if encodingType.upper() in ["JSON", "BSON"]:
                sdType = encodingType.lower()
            if level.lower() in ["min", "minimun"]:
                sLevel = "min"
            elif level.lower() in ["full"]:
                sLevel = level.lower()
            #
            if sdType and sLevel:
                locPath = self.__cfgOb.get("JSON_SCHEMA_DEFINITION_LOCATOR_PATH", sectionName=self.__configName)
                fn = "%s-%s-db-%s-col-%s.json" % (sdType, sLevel, databaseName, collectionName)
                schemaLocator = os.path.join(locPath, fn)
            else:
                logger.error("Unsupported schema options:  %s level %r type %r", collectionName, level, encodingType)
                schemaLocator = None
        except Exception as e:
            logger.debug("Retreiving JSON schema definition for %s type %s failing with %s", collectionName, encodingType, str(e))
        #
        return schemaLocator

    def __reload(self, locator, dirPath, useCache=True):
        #
        fn = self.__fileU.getFileName(locator)
        filePath = os.path.join(dirPath, fn)
        logger.debug("Target cache filePath %s", filePath)
        self.__fileU.mkdir(dirPath)
        if not useCache:
            try:
                os.remove(filePath)
            except Exception:
                pass
        #
        if useCache and self.__fileU.exists(filePath):
            ok = True
        else:
            logger.info("Fetch data from source %s to %s", locator, filePath)
            ok = self.__fileU.get(locator, filePath)

        return filePath if ok else None

    def getJsonSchema(self, databaseName, collectionName, encodingType="BSON", level="full", extraOpts=None):
        """Return JSON schema (w/ BSON types) object for the input collection and level.and

        Args:
            databaseName (str): database name
            collectionName (str): collection name in document store
            encodingType (str, optional): data type convention (BSON|JSON)
            level (str, optional): Completeness of the schema (e.g. min or full)

        Returns:
            dict: Schema object

        """
        sObj = None
        schemaLocator = self.__getJsonSchemaLocator(databaseName, collectionName, encodingType=encodingType, level=level)
        #
        if self.__rebuildFlag:
            filePath = os.path.join(self.__jsonSchemaCachePath, self.__fileU.getFileName(schemaLocator))
            self.makeSchema(databaseName, collectionName, encodingType=encodingType, level=level, saveSchema=True, extraOpts=extraOpts)
        else:
            filePath = self.__reload(schemaLocator, self.__jsonSchemaCachePath, useCache=self.__useCache)
        mU = MarshalUtil(workPath=self.__workPath)
        if filePath and mU.exists(filePath):
            sObj = mU.doImport(filePath, fmt="json")
        else:
            logger.debug("Failed to read schema for %s %r", collectionName, level)
        return sObj

    def makeSchema(self, databaseName, collectionName, encodingType="BSON", level="full", saveSchema=False, extraOpts=None):
        try:
            smb = SchemaDefBuild(databaseName, self.__cfgOb, cachePath=self.__cachePath)
            #
            cD = None
            stU = encodingType.upper()
            cD = smb.build(collectionName, dataTyping=stU, encodingType=stU, enforceOpts=self.getSchemaOptions(level, extraOpts=extraOpts))
            if cD and saveSchema:
                schemaLocator = self.__getJsonSchemaLocator(databaseName, collectionName, encodingType=encodingType, level=level)
                localPath = os.path.join(self.__jsonSchemaCachePath, self.__fileU.getFileName(schemaLocator))
                mU = MarshalUtil(workPath=self.__workPath)
                mU.doExport(localPath, cD, fmt="json", indent=3, enforceAscii=False)
        except Exception as e:
            logger.exception("Building schema %s collection %s failing with %s", databaseName, collectionName, str(e))
        return cD

    def makeSchemaDef(self, databaseName, dataTyping="ANY", saveSchema=False):
        schemaDef = None
        try:
            smb = SchemaDefBuild(databaseName, self.__cfgOb, cachePath=self.__cachePath)
            schemaDef = smb.build(dataTyping=dataTyping, encodingType="rcsb")
            if schemaDef and saveSchema:
                schemaLocator = self.__getSchemaDefLocator(databaseName, dataTyping=dataTyping)
                localPath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaLocator))
                mU = MarshalUtil(workPath=self.__workPath)
                mU.doExport(localPath, schemaDef, fmt="json", indent=3, enforceAscii=False)
        except Exception as e:
            logger.exception("Building schema %s failing with %s", databaseName, str(e))
        return schemaDef

    def schemaCompare(self, orgD, newD):
        """ Compute the difference of nested dictionaries.

        """
        fOrgD = self.__flatten(orgD)
        fNewD = self.__flatten(newD)
        if len(fOrgD) != len(fNewD):
            logger.debug("Schema lengths differ: org %d new %d", len(fOrgD), len(fNewD))
        #
        addedD = {k: fNewD[k] for k in set(fNewD) - set(fOrgD)}
        removedD = {k: fOrgD[k] for k in set(fOrgD) - set(fNewD)}
        changedOrgD = {k: fOrgD[k] for k in set(fOrgD) & set(fNewD) if fOrgD[k] != fNewD[k]}
        changedNewD = {k: fNewD[k] for k in set(fOrgD) & set(fNewD) if fOrgD[k] != fNewD[k]}
        chD = {}
        for ky in changedOrgD:
            kyS = ".".join(ky)
            vOrg = changedOrgD[ky]
            vNew = changedNewD[ky]
            if isinstance(vOrg, (list, tuple)) and isinstance(vNew, (list, tuple)):
                # logger.info(" >> %r vOrg %r vNew %r", ky, vOrg, vNew)
                dV = list(set(vNew) - set(vOrg))
                if dV:
                    chD[kyS] = {"diff": dV}
            else:
                chD[kyS] = {"from": vOrg, "to": vNew}
        #
        nT = len(addedD) + len(removedD) + len(chD)
        diffD = {"added": [".".join(kk) for kk in addedD.keys()], "removed": [".".join(kk) for kk in removedD.keys()], "changed": chD}
        return nT, diffD
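    # Worked example (illustrative): schemaCompare({"a": {"b": 1}}, {"a": {"b": 2}})
    # returns (1, {"added": [], "removed": [], "changed": {"a.b": {"from": 1, "to": 2}}}).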

    def __flatten(self, inpDict, prefix=None):
        prefix = prefix[:] if prefix else []
        outDict = {}
        for key, value in inpDict.items():
            if isinstance(value, dict) and value:
                deeper = self.__flatten(value, prefix + [key])
                outDict.update({tuple(key2): val2 for key2, val2 in deeper.items()})
            elif isinstance(value, (list, tuple)) and value:
                for index, sublist in enumerate(value, start=1):
                    if isinstance(sublist, dict) and sublist:
                        deeper = self.__flatten(sublist, prefix + [key] + [str(index)])
                        outDict.update({tuple(key2): val2 for key2, val2 in deeper.items()})
                    else:
                        outDict[tuple(prefix + [key] + [str(index)])] = value
            else:
                outDict[tuple(prefix + [key])] = value
        return outDict

    def __flattenX(self, inpDict, prefix=None):
        prefix = prefix[:] if prefix else []
        # separator = "."
        outDict = {}
        for key, value in inpDict.items():
            if isinstance(value, dict) and value:
                deeper = self.__flatten(value, prefix + [key])
                outDict.update({tuple(key2): val2 for key2, val2 in deeper.items()})
            elif isinstance(value, list) and value:
                for index, sublist in enumerate(value, start=1):
                    if isinstance(sublist, dict) and sublist:
                        deeper = self.__flatten(sublist, prefix + [key] + [str(index)])
                        outDict.update({tuple(key2): val2 for key2, val2 in deeper.items()})
                    else:
                        outDict[tuple(prefix + [key] + [str(index)])] = value
            else:
                outDict[tuple(prefix + [key])] = value
        return outDict

    def __flattenOrg(self, inpDict, separator=".", prefix=""):
        outDict = {}
        for key, value in inpDict.items():
            if isinstance(value, dict) and value:
                deeper = self.__flattenOrg(value, separator, prefix + key + separator)
                outDict.update({key2: val2 for key2, val2 in deeper.items()})
            elif isinstance(value, list) and value:
                for index, sublist in enumerate(value, start=1):
                    if isinstance(sublist, dict) and sublist:
                        deeper = self.__flattenOrg(sublist, separator, prefix + key + separator + str(index) + separator)
                        outDict.update({key2: val2 for key2, val2 in deeper.items()})
                    else:
                        outDict[prefix + key + separator + str(index)] = value
            else:
                outDict[prefix + key] = value
        return outDict

    def __dictGen(self, indict, pre=None):
        pre = pre[:] if pre else []
        if isinstance(indict, dict):
            for key, value in indict.items():
                if isinstance(value, dict):
                    for dD in self.__dictGen(value, pre + [key]):
                        yield dD
                elif isinstance(value, list) or isinstance(value, tuple):
                    for v in value:
                        for dD in self.__dictGen(v, pre + [key]):
                            yield dD
                else:
                    yield pre + [key, value]
        else:
            yield indict

    def __reload(self, dirPath, reloadDb=False, fromDb=False, useCache=False, pharosDumpUrl=None, mysqlUser=None, mysqlPassword=None):
        startTime = time.time()
        pharosSelectedTables = ["drug_activity", "cmpd_activity", "target", "protein", "t2tc"]
        pharosDumpUrl = pharosDumpUrl if pharosDumpUrl else "http://juniper.health.unm.edu/tcrd/download/latest.sql.gz"
        pharosReadmeUrl = "http://juniper.health.unm.edu/tcrd/download/latest.README"
        ok = False
        fU = FileUtil()
        pharosDumpFileName = fU.getFileName(pharosDumpUrl)
        pharosDumpPath = os.path.join(dirPath, pharosDumpFileName)
        pharosUpdatePath = os.path.join(dirPath, "pharos-update.sql")
        pharosReadmePath = os.path.join(dirPath, "pharos-readme.txt")
        logPath = os.path.join(dirPath, "pharosLoad.log")
        #
        fU.mkdir(dirPath)
        #

        exU = ExecUtils()
        #
        if reloadDb:
            logger.info("useCache %r pharosDumpPath %r", useCache, pharosDumpPath)
            if useCache and self.__mU.exists(pharosDumpPath):
                ok = True
            else:
                logger.info("Fetching url %s path %s", pharosDumpUrl, pharosDumpPath)
                ok1 = fU.get(pharosDumpUrl, pharosDumpPath)
                ok2 = fU.get(pharosReadmeUrl, pharosReadmePath)
                logger.info("Completed fetch (%r) at %s (%.4f seconds)", ok1 and ok2, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime)
            # ---
            readmeLines = self.__mU.doImport(pharosReadmePath, fmt="list")
            self.__version = readmeLines[0].split(" ")[1][1:] if readmeLines else "6"
            # ---
            logger.info("Filtering SQL dump %r for selected tables %r", pharosDumpFileName, pharosSelectedTables)
            doWrite = True
            # Note: the pharos dump file latest.sql.gz is not gzipped
            with open(pharosDumpPath, "r", encoding="utf-8") as ifh, open(pharosUpdatePath, "w", encoding="utf-8") as ofh:
                for line in ifh:
                    if line.startswith("-- Table structure for table"):
                        tN = line.split(" ")[-1][1:-2]
                        doWrite = tN in pharosSelectedTables
                    if doWrite:
                        ofh.write(line)
            # ---
            ok = exU.run(
                "mysql",
                execArgList=["-v", "-u", mysqlUser, "--password=%s" % mysqlPassword, "-e", "create database if not exists tcrd6;"],
                outPath=logPath,
                outAppend=False,
                timeOut=None,
            )
            # ok = exU.run(
            #     "mysql",
            #     execArgList=["-u", mysqlUser, "--password=%s" % mysqlPassword, "tcrd6"],
            #     outPath=logPath,
            #     inpPath=pharosDumpPath,
            #     outAppend=True,
            #     timeOut=None,
            # )
            shellCmd = 'trap "" SIGHUP SIGINT SIGTERM; nohup mysql -u %s --password=%s tcrd6 < %s >& %s' % (mysqlUser, mysqlPassword, pharosUpdatePath, logPath)
            ok = exU.runShell(
                shellCmd,
                outPath=None,
                inpPath=None,
                outAppend=True,
                timeOut=None,
            )
            logger.info("SQL dump restore status %r", ok)
        # --
        if fromDb:
            for tbl in pharosSelectedTables:
                outPath = os.path.join(dirPath, "%s.tdd" % tbl)
                # if useCache and self.__mU.exists(outPath):
                #   continue
                ok = exU.run(
                    "mysql",
                    execArgList=["-u", mysqlUser, "--password=%s" % mysqlPassword, "-e", "use tcrd6; select * from %s;" % tbl],
                    outPath=outPath,
                    outAppend=False,
                    timeOut=None,
                    suppressStderr=True,
                )
                logger.info("SQL table %s export status %r", tbl, ok)
        return ok
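        # Note (illustrative): each exported <table>.tdd file is a tab-delimited dump
        # with a header row produced by the mysql client.  Under the conventions used
        # elsewhere in this document it could be read back with, e.g.,
        #   rowDL = MarshalUtil(workPath=dirPath).doImport(os.path.join(dirPath, "target.tdd"), fmt="tdd", rowFormat="dict")
        # (the rowFormat argument here is an assumption mirroring the CSV/TDD reader above).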