Example #1
 def __reload(self, dirPath, baseVersion, useCache, **kwargs):
     startTime = time.time()
     mU = MarshalUtil(workPath=dirPath)
     chemblDbUrl = kwargs.get(
         "ChEMBLDbUrl",
         "ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/latest/")
     ok = False
     fU = FileUtil()
     fU.mkdir(dirPath)
     #
     # ChEMBL current version <baseVersion>,...
     # template:  chembl_<baseVersion>.fa.gz
     #
     targetFileName = "chembl_" + str(baseVersion) + ".fa.gz"
     mappingFileName = "chembl_uniprot_mapping.txt"
     #
     chemblTargetPath = os.path.join(dirPath, targetFileName)
     chemblMappingPath = os.path.join(dirPath, mappingFileName)
     mappingFilePath = os.path.join(dirPath, "chembl_uniprot_mapping.json")
     #
     mapD = {}
     if useCache and fU.exists(mappingFilePath):
         logger.info("useCache %r using %r and %r and %r", useCache,
                     chemblTargetPath, chemblMappingPath, mappingFilePath)
         mapD = mU.doImport(mappingFilePath, fmt="json")
     else:
         # Get the ChEMBL UniProt mapping file
         url = os.path.join(chemblDbUrl, mappingFileName)
         ok = fU.get(url, chemblMappingPath)
         logger.info("Fetched %r url %s path %s", ok, url,
                     chemblMappingPath)
         logger.info("Reading ChEMBL mapping file path %s", mappingFilePath)
         rowL = mU.doImport(chemblMappingPath, fmt="tdd", rowFormat="list")
         for row in rowL:
             mapD[row[0]] = (row[1], row[2], row[3])
         ok = mU.doExport(mappingFilePath, mapD, fmt="json")
         logger.info("Processed mapping path %s (%d) %r", mappingFilePath,
                     len(mapD), ok)
         #
         # Get the target FASTA files --
         for vers in range(baseVersion, baseVersion + 10):
             logger.info("Now fetching version %r", vers)
             self.__version = vers
             targetFileName = "chembl_" + str(vers) + ".fa.gz"
             chemblTargetPath = os.path.join(dirPath,
                                             "chembl_targets_raw.fa.gz")
             url = os.path.join(chemblDbUrl, targetFileName)
             ok = fU.get(url, chemblTargetPath)
             logger.info("Fetched %r url %s path %s", ok, url,
                         chemblTargetPath)
             if ok:
                 break
     #
     logger.info("Completed reload at %s (%.4f seconds)",
                 time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                 time.time() - startTime)
     #
     return mapD
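
The snippet above probes successive ChEMBL release numbers until a FASTA download succeeds. Below is a minimal standard-library sketch of the same probing pattern; the base URL comes from the snippet, while the function name, output directory, and starting version are illustrative placeholders rather than part of the original code.

import os
import urllib.request


def fetch_latest_chembl_fasta(dirPath, baseVersion, maxTries=10):
    # Try chembl_<vers>.fa.gz for successive release numbers until a download succeeds.
    baseUrl = "ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/latest/"
    os.makedirs(dirPath, exist_ok=True)
    targetPath = os.path.join(dirPath, "chembl_targets_raw.fa.gz")
    for vers in range(baseVersion, baseVersion + maxTries):
        url = baseUrl + "chembl_%d.fa.gz" % vers
        try:
            urllib.request.urlretrieve(url, targetPath)
            return vers, targetPath
        except Exception:
            continue  # this release is not published; try the next version number
    return None, None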
Example #2
    def pushBundle(self, gitRepositoryPath, accessToken, gitHost="github.com", gitBranch="master", remoteStashPrefix="A", maxSizeMB=95):
        """Push bundle to remote stash git repository.

        Args:
            gitRepositoryPath (str): git repository path (e.g., rcsb/py-rcsb_exdb_assets_stash)
            accessToken (str): git repository access token
            gitHost (str, optional): git repository host name. Defaults to github.com.
            gitBranch (str, optional): git branch name. Defaults to master.
            remoteStashPrefix (str, optional): optional label prepended to the stashed dependency bundle artifact (default='A')
            maxSizeMB (int, optional): maximum stash bundle file size that will be committed. Defaults to 95MB.

        Returns:
          bool:  True for success or False otherwise

        """
        try:
            ok = False
            gU = GitUtil(token=accessToken, repositoryHost=gitHost)
            fU = FileUtil()
            localRepositoryPath = os.path.join(self.__localBundlePath, "stash_repository")
            fn = self.__makeBundleFileName(self.__baseBundleFileName, remoteStashPrefix=remoteStashPrefix)
            #
            # Update existing local repository, otherwise clone a new copy
            if fU.exists(localRepositoryPath):
                ok = gU.pull(localRepositoryPath, branch=gitBranch)
                logger.debug("After pull status %r", gU.status(localRepositoryPath))
            else:
                ok = gU.clone(gitRepositoryPath, localRepositoryPath, branch=gitBranch)
            #
            # Split all bundles
            mbSize = float(fU.size(self.__localStashTarFilePath)) / 1000000.0
            logger.info("Splitting bundle %r (%.3f MB/Max %d MB)", fn, mbSize, maxSizeMB)
            sj = SplitJoin()
            splitDirPath = os.path.join(localRepositoryPath, "stash", fn[:-7])
            sj.split(self.__localStashTarFilePath, splitDirPath, maxSizeMB=maxSizeMB)
            fU.remove(self.__localStashTarFilePath)
            # else:
            # fU.put(self.__localStashTarFilePath, os.path.join(localRepositoryPath, "stash", fn))

            ok = gU.addAll(localRepositoryPath, branch=gitBranch)
            ok = gU.commit(localRepositoryPath, branch=gitBranch)
            logger.debug("After commit status %r", gU.status(localRepositoryPath))
            #
            if accessToken:
                ok = gU.push(localRepositoryPath, branch=gitBranch)
                logger.info("After push status %r", gU.status(localRepositoryPath))
            #
            return ok
        except Exception as e:
            logger.exception("For %r %r failing with %s", gitHost, gitRepositoryPath, str(e))
        return False
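
Before committing, the bundle is split with SplitJoin() into parts that stay under the git file-size limit (maxSizeMB). The standalone sketch below illustrates that splitting step only; it is not the SplitJoin implementation, and the part-naming scheme is an assumption.

import os


def split_file(inputPath, splitDirPath, maxSizeMB=95):
    # Write inputPath out as sequential binary parts no larger than maxSizeMB each.
    os.makedirs(splitDirPath, exist_ok=True)
    chunkSize = maxSizeMB * 1000 * 1000
    partPaths = []
    with open(inputPath, "rb") as ifh:
        for ii, chunk in enumerate(iter(lambda: ifh.read(chunkSize), b""), start=1):
            partPath = os.path.join(splitDirPath, "part_%d" % ii)
            with open(partPath, "wb") as ofh:
                ofh.write(chunk)
            partPaths.append(partPath)
    return partPaths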
Example #3
 def __fetchUrl(self, urlTarget, dirPath, useCache=False):
     fU = FileUtil()
     fn = fU.getFileName(urlTarget)
     filePath = os.path.join(dirPath, fn)
     if not (useCache and fU.exists(filePath)):
         startTime = time.time()
         ok2 = fU.get(urlTarget, filePath)
         endTime = time.time()
         if ok2:
             logger.info("Fetched %s for resource file %s (status = %r) (%.4f seconds)", urlTarget, filePath, ok2, endTime - startTime)
         else:
             logger.error("Failing fetch for %s for resource file %s (status = %r) (%.4f seconds)", urlTarget, filePath, ok2, endTime - startTime)
     #
     return filePath
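
The fetch above is skipped when a cached copy exists and useCache is set, and the elapsed time is logged either way. A minimal standard-library sketch of the same cache-aware fetch with timing follows; the function name and logger configuration are illustrative.

import logging
import os
import time
import urllib.request

logger = logging.getLogger(__name__)


def fetch_url(urlTarget, dirPath, useCache=False):
    # Reuse a cached copy when allowed; otherwise download and log the elapsed time.
    os.makedirs(dirPath, exist_ok=True)
    filePath = os.path.join(dirPath, os.path.basename(urlTarget))
    if not (useCache and os.path.exists(filePath)):
        startTime = time.time()
        urllib.request.urlretrieve(urlTarget, filePath)
        logger.info("Fetched %s to %s (%.4f seconds)", urlTarget, filePath, time.time() - startTime)
    return filePath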
Example #4
 def __reload(self, dirPath, useCache):
     startTime = time.time()
     aD = {}
     fU = FileUtil()
     fU.mkdir(dirPath)
     targetMechanismFilePath = self.getTargetMechanismDataPath()
     #
     if useCache and fU.exists(targetMechanismFilePath):
         logger.info("useCache %r using %r", useCache,
                     targetMechanismFilePath)
         qD = self.__mU.doImport(targetMechanismFilePath, fmt="json")
         aD = qD["mechanism"] if "mechanism" in qD else {}
     #
     logger.info("Completed reload of (%d) at %s (%.4f seconds)", len(aD),
                 time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                 time.time() - startTime)
     #
     return aD
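
This reload is cache-only: when a cached JSON payload exists, it is read and the "mechanism" section extracted. A small standard-library sketch of that pattern, with an illustrative function name:

import json
import os


def load_mechanism_data(targetMechanismFilePath):
    # Return the "mechanism" section of the cached JSON payload, or an empty dict if absent.
    if not os.path.exists(targetMechanismFilePath):
        return {}
    with open(targetMechanismFilePath, "r", encoding="utf-8") as ifh:
        qD = json.load(ifh)
    return qD.get("mechanism", {})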
Example #5
    def fetchBundle(self, localRestoreDirPath, url, remoteDirPath, remoteStashPrefix="A", userName=None, password=None):
        """Restore bundled dependencies from remote storage and unbundle these in the
           current local cache directory.

        Args:
            localRestoreDirPath (str): local restore path
            url (str): remote URL
            remoteDirPath (str): remote directory path on the remote resource
            remoteStashPrefix (str, optional): optional label prepended to the stashed dependency bundle artifact (default='A')
            userName (str, optional): optional access information. Defaults to None.
            password (str, optional): optional access information. Defaults to None.
        """
        try:
            ok = False
            fileU = FileUtil()
            fn = self.__makeBundleFileName(self.__baseBundleFileName, remoteStashPrefix=remoteStashPrefix)
            if not url:
                remotePath = os.path.join(remoteDirPath, fn)
                if fileU.exists(remotePath):
                    ok = fileU.get(remotePath, self.__localStashTarFilePath)
                else:
                    ok = False
                    logger.warning("Missing bundle file %r", remotePath)

            elif url and (url.startswith("http://") or url.startswith("https://")):
                remotePath = url + os.path.join("/", remoteDirPath, fn)
                ok = fileU.get(remotePath, self.__localStashTarFilePath)

            elif url and url.startswith("sftp://"):
                sftpU = SftpUtil()
                ok = sftpU.connect(url[7:], userName, pw=password, port=22)
                if ok:
                    remotePath = os.path.join(remoteDirPath, fn)
                    ok = sftpU.get(remotePath, self.__localStashTarFilePath)
            else:
                logger.error("Unsupported protocol %r", url)
            if ok:
                ok = fileU.unbundleTarfile(self.__localStashTarFilePath, dirPath=localRestoreDirPath)
            return ok
        except Exception as e:
            logger.exception("For %r %r Failing with %s", url, remoteDirPath, str(e))
            ok = False
        return ok
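
fetchBundle() dispatches on the URL scheme: a bare remote directory path, http(s), or sftp. The sketch below shows only that scheme dispatch using urllib.parse; the returned labels are placeholders standing in for the FileUtil.get() and SftpUtil.get() calls above.

import os
from urllib.parse import urlparse


def resolve_fetch_scheme(url, remoteDirPath, fn):
    # Decide how a stash bundle would be fetched, mirroring the branching in fetchBundle().
    if not url:
        return "local", os.path.join(remoteDirPath, fn)
    scheme = urlparse(url).scheme
    if scheme in ("http", "https"):
        return "http", url.rstrip("/") + "/" + remoteDirPath.strip("/") + "/" + fn
    if scheme == "sftp":
        return "sftp", os.path.join(remoteDirPath, fn)
    return "unsupported", None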
Example #6
 def __fetchFromSource(self, urlTarget):
     """Fetch the classification names and domain assignments from the ECOD repo."""
     fU = FileUtil()
     fn = fU.getFileName(urlTarget)
     fp = os.path.join(self.__dirPath, fn)
     if not fU.exists(fp):
         fU.get(urlTarget, fp)
     #
     with open(fp, "r", encoding="utf-8") as ifh:
         line = ifh.readline()
         line = ifh.readline()
         line = ifh.readline()
         ff = line[:-1].split()
         self.__version = ff[-1]
     #
     nmL = self.__mU.doImport(fp, fmt="list", uncomment=True)
     fU.remove(fp)
     #
     return nmL
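
The ECOD release version is recovered from the third header line of the downloaded file, taking the last whitespace-separated token. A short sketch of that header parsing, assuming the file keeps this layout:

def read_ecod_version(filePath):
    # The version string is the last token on the third line of the ECOD header block.
    with open(filePath, "r", encoding="utf-8") as ifh:
        for _ in range(2):
            ifh.readline()
        thirdLine = ifh.readline()
    return thirdLine.rstrip("\n").split()[-1]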
Example #7
    def __processAppendedSections(self,
                                  appendConfigOption,
                                  cachePath,
                                  useCache=True):
        """Fetch and append configuration assets assigned to input configuration option.

        Args:
            appendConfigOption (str): reserved configuration option to hold a list of configuration asset locators
            cachePath (str): path to store cached copies of configuration assets
            useCache (bool, optional): use existing cached configuration assets. Defaults to True.

        Returns:
            bool: True for success or False otherwise
        """
        try:
            ret = True
            appendLocL = []  # ensure this is defined for the error log in the fallback path below
            appendLocL = self.getList(appendConfigOption,
                                      sectionName=self.__defaultSectionName)
            logger.debug("appendLocL is %r", appendLocL)
            if appendLocL:
                cP = os.path.join(cachePath, "config")
                fU = FileUtil(workPath=cP)
                logger.debug("Fetching append sections from %r", appendLocL)
                for appendLoc in appendLocL:
                    fn = fU.getFileName(appendLoc)
                    fp = os.path.join(cP, fn)
                    okF = True
                    if not (useCache and fU.exists(fp)):
                        # get a fresh copy from source
                        okF = fU.get(appendLoc, fp)
                        logger.debug("Fetched %r to %r", appendLoc, fp)
                    ok = self.appendConfig(fp)
                    ret = ret and ok and okF
        except Exception as e:
            logger.exception("Failing for option %r cachePath %r with %s",
                             appendConfigOption, cachePath, str(e))
            ret = False
        #
        if not ret:
            logger.error("Fetching appended sections failing %r", appendLocL)

        return ret
Example #8
    def __reload(self, urlTarget, dirPath, useCache=True):
        """Reload local cache of mapping resources to support validation report reader and translator.

        Args:
            urlTarget (str): URL for the schema mapping file
            dirPath (str): path to the directory containing cache files
            useCache (bool, optional): flag to use cached files. Defaults to True.

        Returns:
            (dict): schema mapping dictionary
        """
        mapD = {}
        #
        mU = MarshalUtil()
        fU = FileUtil()
        fn = fU.getFileName(urlTarget)
        mappingFilePath = os.path.join(dirPath, fn)
        mU.mkdir(dirPath)
        #
        # if not useCache:
        #     for fp in [mappingFilePath]:
        #         try:
        #             os.remove(fp)
        #         except Exception:
        #             pass
        # #
        logger.debug("Loading validation mapping data in %s (useCache %r)", fn,
                     useCache)
        if useCache and fU.exists(mappingFilePath):
            mapD = mU.doImport(mappingFilePath, fmt="json")
        else:
            logger.info("Fetching url %s to resource file %s", urlTarget,
                        mappingFilePath)
            tS = uuid.uuid4().hex
            tP = os.path.join(dirPath, "._" + tS)
            ok = fU.get(urlTarget, tP)
            if ok:
                mapD = mU.doImport(tP, fmt="json")
                os.replace(tP, mappingFilePath)
        return mapD
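
When the mapping file is not cached, it is downloaded under a temporary dotted name and only moved into place with os.replace() after a successful parse, so a failed or partial download never clobbers the existing cache. A standard-library sketch of that atomic-update pattern (the function name is illustrative):

import json
import os
import urllib.request
import uuid


def refresh_json_cache(urlTarget, mappingFilePath):
    # Download to a temporary name, validate by parsing, then atomically replace the cache file.
    dirPath = os.path.dirname(mappingFilePath)
    tP = os.path.join(dirPath, "._" + uuid.uuid4().hex)
    urllib.request.urlretrieve(urlTarget, tP)
    with open(tP, "r", encoding="utf-8") as ifh:
        mapD = json.load(ifh)
    os.replace(tP, mappingFilePath)
    return mapD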
Example #9
    def get(self, remotePath, localPath):
        """Get a file from a remote FTP server.

        Arguments:
            remotePath (str): remote file path
            localPath (str): local file path

        Returns:
            bool: True for success or False otherwise
        """
        try:
            fileU = FileUtil()
            fileU.mkdirForFile(localPath)
            # If provided localPath already exists and is a directory, retrieve the file using the name on the remote server
            # to avoid unintentionally overwriting an entire local directory with a single retrieved file
            if (os.path.exists(localPath) and os.path.isdir(localPath)):
                remoteFileName = FileUtil().getFileName(remotePath)
                localFilePath = os.path.join(localPath, remoteFileName)
            else:
                localFilePath = localPath
            with open(localFilePath, 'wb') as lFP:
                self.__ftpClient.retrbinary('RETR %s' % remotePath, lFP.write)
            ok = fileU.exists(localFilePath)
            if ok:
                return True
            else:
                logger.error("get failing for remotePath %s localFilePath %s",
                             remotePath, localFilePath)
                return False
        except Exception as e:
            if self.__raiseExceptions:
                raise e
            else:
                logger.error(
                    "get failing for remotePath %s localPath %s with %s",
                    remotePath, localPath, str(e))
                return False
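
The get() method above wraps an already-connected ftplib client (self.__ftpClient) and streams the remote file to disk with retrbinary(). A self-contained sketch of the same retrieval using ftplib directly is shown below; the host, credentials, and paths are placeholders.

import os
from ftplib import FTP


def ftp_get(host, remotePath, localPath, user="anonymous", password=""):
    # Connect, retrieve remotePath in binary mode, and write it to localPath.
    os.makedirs(os.path.dirname(localPath) or ".", exist_ok=True)
    with FTP(host) as ftp:
        ftp.login(user=user, passwd=password)
        with open(localPath, "wb") as ofh:
            ftp.retrbinary("RETR %s" % remotePath, ofh.write)
    return os.path.exists(localPath)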
Example #10
 def __reload(self, dirPath, useCache):
     startTime = time.time()
     aD = {}
     allIdD = {}
     fU = FileUtil()
     fU.mkdir(dirPath)
     targetActivityFilePath = self.getTargetActivityDataPath()
     #
     if useCache and fU.exists(targetActivityFilePath):
         logger.info("useCache %r using %r", useCache, targetActivityFilePath)
         qD = self.__mU.doImport(targetActivityFilePath, fmt="json")
         aD = qD["activity"] if "activity" in qD else {}
         idL = qD["all_ids"] if "all_ids" in qD else []
         allIdD = {k: k in aD for k in idL}
     #
     logger.info(
         "Completed reload (%d activities) (%d tried identifiers) at %s (%.4f seconds)",
         len(aD),
         len(allIdD),
         time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
         time.time() - startTime,
     )
     #
     return aD, allIdD
Example #11
    def __reload(self, urlTarget, dirPath, useCache=True):
        """ Reload input GO OBO ontology file and return a nx graph object.
'
        Returns:
            dictionary[goId] = {'name_list': ... , 'id_list': ... 'depth_list': ... }
        """
        goGraph = None
        #
        # mU = MarshalUtil()
        fU = FileUtil()
        fn = fU.getFileName(urlTarget)
        oboFilePath = os.path.join(dirPath, fn)
        fU.mkdir(dirPath)
        #
        if not useCache:
            for fp in [oboFilePath]:
                try:
                    os.remove(fp)
                except Exception:
                    pass
        #
        if useCache and fU.exists(oboFilePath):
            goGraph = obonet.read_obo(oboFilePath)
        else:
            logger.info("Fetching url %s to resource file %s", urlTarget,
                        oboFilePath)
            ok = fU.get(urlTarget, oboFilePath)
            if ok:
                goGraph = obonet.read_obo(oboFilePath)
        if goGraph:
            logger.info("Reading %d nodes and %d edges", len(goGraph),
                        goGraph.number_of_edges())
        else:
            logger.info("Go graph construction failing")
        #
        return goGraph
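
obonet.read_obo() builds a networkx graph keyed by GO identifiers, either from the cached OBO file or from a fresh download. A hedged usage sketch follows; the go-basic.obo URL and the GO identifier used are assumptions about the public GO release, not values taken from the snippet.

import obonet

# Assumed public GO OBO location; substitute the urlTarget actually passed to __reload().
url = "http://purl.obolibrary.org/obo/go/go-basic.obo"
goGraph = obonet.read_obo(url)
print("nodes %d edges %d" % (len(goGraph), goGraph.number_of_edges()))
# Node attributes include a human-readable name (GO:0008150 is the biological_process root).
print(goGraph.nodes["GO:0008150"]["name"])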
Example #12
class IoUtil(object):
    def __init__(self, **kwargs):
        self.__fileU = FileUtil(**kwargs)

    def serialize(self, filePath, myObj, fmt="pickle", **kwargs):
        """Public method to serialize format appropriate objects

        Args:
            filePath (str): local file path
            myObj (object): format appropriate object to be serialized
            fmt (str, optional): one of ['mmcif', 'mmcif-dict', 'json', 'list', 'text-dump', 'fasta', 'csv', 'pickle' (default)]
            **kwargs: additional keyword arguments passed to worker methods

        Returns:
            bool: status of serialization operation; true for success or false otherwise

        """
        ret = False
        fmt = str(fmt).lower()
        ret = self.__fileU.mkdirForFile(filePath)
        if not ret:
            return ret
        if fmt in ["mmcif"]:
            ret = self.__serializeMmCif(filePath, myObj, **kwargs)
        elif fmt in ["json"]:
            ret = self.__serializeJson(filePath, myObj, **kwargs)
        elif fmt in ["pickle"]:
            ret = self.__serializePickle(filePath, myObj, **kwargs)
        elif fmt in ["list"]:
            ret = self.__serializeList(filePath,
                                       myObj,
                                       enforceAscii=True,
                                       **kwargs)
        elif fmt in ["mmcif-dict"]:
            ret = self.__serializeMmCifDict(filePath, myObj, **kwargs)
        elif fmt in ["text-dump"]:
            ret = self.__textDump(filePath, myObj, **kwargs)
        elif fmt in ["fasta"]:
            ret = self.__serializeFasta(filePath, myObj, **kwargs)
        elif fmt in ["csv"]:
            ret = self.__serializeCsv(filePath, myObj, **kwargs)
        else:
            pass

        return ret

    def deserialize(self, filePath, fmt="pickle", **kwargs):
        """Public method to deserialize objects in supported formats.

        Args:
            filePath (str): local file path
            fmt (str, optional): one of ['mmcif', 'json', 'list', ..., 'pickle' (default)]
            **kwargs: additional keyword arguments passed to worker methods

        Returns:
            object: deserialized object data

        """
        fmt = str(fmt).lower()
        if fmt in ["mmcif"]:
            ret = self.__deserializeMmCif(filePath, **kwargs)  # type: ignore
        elif fmt in ["json"]:
            ret = self.__deserializeJson(filePath, **kwargs)  # type: ignore
        elif fmt in ["pickle"]:
            ret = self.__deserializePickle(filePath, **kwargs)  # type: ignore
        elif fmt in ["list"]:
            ret = self.__deserializeList(filePath, enforceAscii=True,
                                         **kwargs)  # type: ignore
        elif fmt in ["mmcif-dict"]:
            ret = self.__deserializeMmCifDict(filePath,
                                              **kwargs)  # type: ignore
        elif fmt in ["fasta"]:
            ret = self.__deserializeFasta(filePath, **kwargs)  # type: ignore
        # elif fmt in ["vrpt-xml-to-cif"]:
        #    ret = self.__deserializeVrptToCif(filePath, **kwargs)  # type: ignore
        elif fmt in ["csv", "tdd"]:
            delimiter = kwargs.get("csvDelimiter",
                                   "," if fmt == "csv" else "\t")
            ret = self.__deserializeCsv(filePath,
                                        delimiter=delimiter,
                                        **kwargs)  # type: ignore
        elif fmt in ["xml"]:
            ret = self.__deserializeXml(filePath, **kwargs)  # type: ignore
        else:
            ret = None  # type: ignore

        return ret

    def __sliceInChunks(self, myList, numChunks):
        mc = min(len(myList), numChunks)
        chunkSize = int(len(myList) / mc)
        if len(myList) % mc:
            chunkSize += 1
        for i in range(0, len(myList), chunkSize):
            yield myList[i:i + chunkSize]

    def serializeInParts(self,
                         filePath,
                         myObj,
                         numParts,
                         fmt="json",
                         **kwargs):
        """Public method to serialize format appropriate (json, pickle) objects in multiple parts

        Args:
            filePath (str): local file path
            myObj (object): format appropriate object to be serialized
            numParts (int): divide the data into numParts segments
            fmt (str, optional): one of ['json', 'pickle']. Defaults to 'json'.
            **kwargs: additional keyword arguments passed to worker methods

        Returns:
            bool: True for success or False otherwise
        """
        if fmt not in ["json", "pickle"]:
            logger.error("Unsupported format for %s", fmt)
            return False
        pth, fn = os.path.split(filePath)
        self.__fileU.mkdirForFile(pth)
        bn, ext = os.path.splitext(fn)
        ret = True
        if isinstance(myObj, list):
            for ii, subList in enumerate(self.__sliceInChunks(myObj,
                                                              numParts)):
                fp = os.path.join(pth, bn + "_part_%d" % (ii + 1) + ext)
                ok = self.serialize(fp, subList, fmt=fmt, **kwargs)
                ret = ret and ok
        elif isinstance(myObj, dict):
            for ii, keyList in enumerate(
                    self.__sliceInChunks(list(myObj.keys()), numParts)):
                fp = os.path.join(pth, bn + "_part_%d" % (ii + 1) + ext)
                ok = self.serialize(fp,
                                    OrderedDict([(k, myObj[k])
                                                 for k in keyList]),
                                    fmt=fmt,
                                    **kwargs)
                ret = ret and ok
        else:
            logger.error("Unsupported data type for serialization in parts")
            ret = False
        #
        return ret

    def deserializeInParts(self, filePath, numParts, fmt="json", **kwargs):
        """Public method to deserialize objects in supported formats from multiple parts

        Args:
            filePath (str): local file path
            numParts (int): reconstruct the data object from numParts segments
            fmt (str, optional): one of ['json', 'pickle']. Defaults to 'json'.
            **kwargs: additional keyword arguments passed to worker methods

        Returns:
            object: deserialized object data
        """
        rObj = None
        if fmt not in ["json", "pickle"]:
            logger.error("Unsupported format for %s", fmt)
            return rObj
        #
        pth, fn = os.path.split(filePath)
        bn, ext = os.path.splitext(fn)
        if not numParts:
            fp = os.path.join(pth, bn + "_part_*" + ext)
            numParts = len(glob.glob(fp))
        #
        for ii in range(numParts):
            fp = os.path.join(pth, bn + "_part_%d" % (ii + 1) + ext)
            tObj = self.deserialize(fp, fmt=fmt, **kwargs)
            if isinstance(tObj, list):
                if not rObj:
                    rObj = []
                rObj.extend(tObj)
            elif isinstance(tObj, dict):
                if not rObj:
                    rObj = OrderedDict()
                rObj.update(tObj)
            else:
                logger.error(
                    "Unsupported data type for deserialization in parts")
        return rObj

    def exists(self, filePath, mode=os.R_OK):
        return self.__fileU.exists(filePath, mode=mode)

    def mkdir(self, dirPath, mode=0o755):
        return self.__fileU.mkdir(dirPath, mode=mode)

    def remove(self, pth):
        return self.__fileU.remove(pth)

    def __deserializeFasta(self, filePath, **kwargs):
        try:
            commentStyle = kwargs.get("commentStyle", "uniprot")
            fau = FastaUtil()
            return fau.readFasta(filePath, commentStyle=commentStyle)
        except Exception as e:
            logger.error("Unable to deserialize %r %r ", filePath, str(e))
        return {}

    def __serializeFasta(self, filePath, myObj, **kwargs):
        try:
            maxLineLength = int(kwargs.get("maxLineLength", 70))
            makeComment = kwargs.get("makeComment", False)
            fau = FastaUtil()
            ok = fau.writeFasta(filePath,
                                myObj,
                                maxLineLength=maxLineLength,
                                makeComment=makeComment)
            return ok
        except Exception as e:
            logger.error("Unable to serialize FASTA file %r  %r", filePath,
                         str(e))
        return False

    def __textDump(self, filePath, myObj, **kwargs):
        try:
            indent = kwargs.get("indent", 1)
            width = kwargs.get("width", 120)
            sOut = pprint.pformat(myObj, indent=indent, width=width)
            with open(filePath, "w") as ofh:
                ofh.write("\n%s\n" % sOut)
            return True
        except Exception as e:
            logger.error("Unable to dump to %r  %r", filePath, str(e))
        return False

    def __serializePickle(self, filePath, myObj, **kwargs):
        try:
            pickleProtocol = kwargs.get("pickleProtocol",
                                        pickle.DEFAULT_PROTOCOL)

            with open(filePath, "wb") as outfile:
                pickle.dump(myObj, outfile, pickleProtocol)
            return True
        except Exception as e:
            logger.error("Unable to serialize %r  %r", filePath, str(e))
        return False

    def __deserializePickle(self, filePath, **kwargs):
        myDefault = kwargs.get("default", {})
        try:
            if sys.version_info[0] > 2:
                encoding = kwargs.get("encoding", "ASCII")
                errors = kwargs.get("errors", "strict")
                with open(filePath, "rb") as outfile:
                    return pickle.load(outfile,
                                       encoding=encoding,
                                       errors=errors)
            else:
                with open(filePath, "rb") as outfile:
                    return pickle.load(outfile)
        except Exception as e:
            logger.warning("Unable to deserialize %r %r", filePath, str(e))
        return myDefault

    def __serializeJson(self, filePath, myObj, **kwargs):
        """Internal method to serialize the input object as JSON.  An encoding
        helper class is included to handle selected python data types (e.g., datetime)
        """
        indent = kwargs.get("indent", 0)
        enforceAscii = kwargs.get("enforceAscii", True)
        try:
            if enforceAscii:
                with open(filePath, "w") as outfile:
                    json.dump(myObj,
                              outfile,
                              indent=indent,
                              cls=JsonTypeEncoder,
                              ensure_ascii=enforceAscii)
            else:
                with io.open(filePath, "w", encoding="utf-8") as outfile:
                    json.dump(myObj,
                              outfile,
                              indent=indent,
                              cls=JsonTypeEncoder,
                              ensure_ascii=enforceAscii)
            return True
        except Exception as e:
            logger.error("Unable to serialize %r  %r", filePath, str(e))
        return False

    def __deserializeJson(self, filePath, **kwargs):
        myDefault = kwargs.get("default", {})
        encoding = kwargs.get("encoding", "utf-8-sig")
        encodingErrors = kwargs.get("encodingErrors", "ignore")
        try:
            if filePath[-3:] == ".gz":
                if sys.version_info[0] > 2:
                    with gzip.open(filePath,
                                   "rt",
                                   encoding=encoding,
                                   errors=encodingErrors) as inpFile:
                        return json.load(inpFile,
                                         object_pairs_hook=OrderedDict)
                else:
                    # Py2 situation non-ascii encodings is problematic
                    # with gzip.open(filePath, "rb") as csvFile:
                    #    oL = self.__csvReader(csvFile, rowFormat, delimiter)
                    tPath = self.__fileU.uncompress(filePath, outputDir=None)
                    with io.open(tPath,
                                 newline="",
                                 encoding=encoding,
                                 errors="ignore") as inpFile:
                        return json.load(inpFile,
                                         object_pairs_hook=OrderedDict)
            else:
                with open(filePath, "r") as inpFile:
                    return json.load(inpFile, object_pairs_hook=OrderedDict)
        except Exception as e:
            logger.warning("Unable to deserialize %r %r", filePath, str(e))
        return myDefault

    def __hasMinSize(self, pth, minSize):
        try:
            return os.path.getsize(pth) >= minSize
        except Exception:
            return False

    def __deserializeMmCif(self, locator, **kwargs):
        """ """
        try:
            containerList = []
            workPath = kwargs.get("workPath", None)
            enforceAscii = kwargs.get("enforceAscii", True)
            raiseExceptions = kwargs.get("raiseExceptions", True)
            useCharRefs = kwargs.get("useCharRefs", True)
            minSize = kwargs.get("minSize", 5)
            #
            if self.__fileU.isLocal(locator):
                if minSize >= 0 and not self.__hasMinSize(locator, minSize):
                    logger.warning("Minimum file size not satisfied for: %r",
                                   locator)
                myIo = IoAdapter(raiseExceptions=raiseExceptions,
                                 useCharRefs=useCharRefs)
                containerList = myIo.readFile(
                    locator, enforceAscii=enforceAscii,
                    outDirPath=workPath)  # type: ignore
            else:
                # myIo = IoAdapterPy(raiseExceptions=raiseExceptions, useCharRefs=useCharRefs)
                # containerList = myIo.readFile(locator, enforceAscii=enforceAscii, outDirPath=workPath)
                containerList = self.__deserializeMmCifRemote(
                    locator, useCharRefs, enforceAscii, workPath)

        except Exception as e:
            logger.error("Failing for %s with %s", locator, str(e))
        return containerList

    @retry((requests.exceptions.RequestException),
           maxAttempts=3,
           delaySeconds=1,
           multiplier=2,
           defaultValue=[],
           logger=logger)
    def __deserializeMmCifRemote(self, locator, useCharRefs, enforceAscii,
                                 workPath):
        containerList = []
        try:
            myIo = IoAdapterPy(raiseExceptions=True, useCharRefs=useCharRefs)
            containerList = myIo.readFile(locator,
                                          enforceAscii=enforceAscii,
                                          outDirPath=workPath)
        except Exception as e:
            raise e
        return containerList

    def __serializeMmCif(self, filePath, containerList, **kwargs):
        """ """
        try:
            ret = False
            workPath = kwargs.get("workPath", None)
            enforceAscii = kwargs.get("enforceAscii", True)
            raiseExceptions = kwargs.get("raiseExceptions", True)
            useCharRefs = kwargs.get("useCharRefs", True)
            #
            myIo = IoAdapter(raiseExceptions=raiseExceptions,
                             useCharRefs=useCharRefs)
            if filePath.endswith(".gz") and workPath:
                rfn = "".join(
                    random.choice(string.ascii_uppercase + string.digits)
                    for _ in range(10))
                tPath = os.path.join(workPath, rfn)
                ret = myIo.writeFile(tPath,
                                     containerList=containerList,
                                     enforceAscii=enforceAscii)
                ret = self.__fileU.compress(tPath,
                                            filePath,
                                            compressType="gzip")
            else:
                ret = myIo.writeFile(filePath,
                                     containerList=containerList,
                                     enforceAscii=enforceAscii)
        except Exception as e:
            logger.error("Failing for %s with %s", filePath, str(e))
        return ret

    def __deserializeMmCifDict(self, filePath, **kwargs):
        """ """
        try:
            containerList = []
            workPath = kwargs.get("workPath", None)
            enforceAscii = kwargs.get("enforceAscii", True)
            raiseExceptions = kwargs.get("raiseExceptions", True)
            useCharRefs = kwargs.get("useCharRefs", True)
            #
            myIo = IoAdapterPy(raiseExceptions=raiseExceptions,
                               useCharRefs=useCharRefs)
            containerList = myIo.readFile(filePath,
                                          enforceAscii=enforceAscii,
                                          outDirPath=workPath)
        except Exception as e:
            logger.error("Failing for %s with %s", filePath, str(e))
        return containerList

    def __serializeMmCifDict(self, filePath, containerList, **kwargs):
        """ """
        try:
            ret = False
            # workPath = kwargs.get('workPath', None)
            enforceAscii = kwargs.get("enforceAscii", True)
            raiseExceptions = kwargs.get("raiseExceptions", True)
            useCharRefs = kwargs.get("useCharRefs", True)
            #
            myIo = IoAdapterPy(raiseExceptions=raiseExceptions,
                               useCharRefs=useCharRefs)
            ret = myIo.writeFile(filePath,
                                 containerList=containerList,
                                 enforceAscii=enforceAscii)
        except Exception as e:
            logger.error("Failing for %s with %s", filePath, str(e))
        return ret

    def __serializeList(self, filePath, aList, enforceAscii=True, **kwargs):
        """ """

        try:
            _ = kwargs
            if enforceAscii:
                encoding = "ascii"
            else:
                encoding = "utf-8"
            #
            if sys.version_info[0] > 2:
                with open(filePath, "w") as ofh:
                    if enforceAscii:
                        for st in aList:
                            ofh.write("%s\n" % st.encode(
                                "ascii", "xmlcharrefreplace").decode("ascii"))
                    else:
                        for st in aList:
                            ofh.write("%s\n" % st)
            else:
                if enforceAscii:
                    with io.open(filePath, "w", encoding=encoding) as ofh:
                        for st in aList:
                            ofh.write("%s\n" % st.encode(
                                "ascii", "xmlcharrefreplace").decode("ascii"))
                else:
                    with open(filePath, "wb") as ofh:
                        for st in aList:
                            ofh.write("%s\n" % st)
            return True
        except Exception as e:
            logger.error("Unable to serialize %r %r", filePath, str(e))
        return False

    def __processList(self, ifh, enforceAscii=True, **kwargs):
        uncomment = kwargs.get("uncomment", True)
        aList = []
        for line in ifh:
            if enforceAscii:
                pth = line[:-1].encode("ascii",
                                       "xmlcharrefreplace").decode("ascii")
            else:
                pth = line[:-1]
            if not pth or (uncomment and pth.startswith("#")):
                continue
            aList.append(pth)
        return aList

    def __deserializeList(self,
                          filePath,
                          enforceAscii=True,
                          encodingErrors="ignore",
                          **kwargs):
        aList = []
        _ = kwargs
        try:
            if filePath[-3:] == ".gz":
                if sys.version_info[0] > 2:
                    with gzip.open(filePath,
                                   "rt",
                                   encoding="utf-8-sig",
                                   errors=encodingErrors) as ifh:
                        aList = self.__processList(ifh,
                                                   enforceAscii=enforceAscii,
                                                   **kwargs)
                else:
                    tPath = self.__fileU.uncompress(filePath, outputDir=None)
                    # for py2 this commented code is problematic for non-ascii data
                    # with gzip.open(filePath, "rb") as ifh:
                    #    aList = self.__processList(ifh, enforceAscii=enforceAscii)
                    with io.open(tPath, encoding="utf-8-sig",
                                 errors="ignore") as ifh:
                        aList = self.__processList(ifh,
                                                   enforceAscii=enforceAscii)
            else:
                with io.open(filePath, encoding="utf-8-sig",
                             errors="ignore") as ifh:
                    aList = self.__processList(ifh,
                                               enforceAscii=enforceAscii,
                                               **kwargs)
        except Exception as e:
            logger.error("Unable to deserialize %r %s", filePath, str(e))
        #
        logger.debug("Reading list length %d", len(aList))
        return aList

    def __csvReader(self, csvFile, rowFormat, delimiter, uncomment=True):
        oL = []

        maxInt = sys.maxsize
        csv.field_size_limit(maxInt)
        if rowFormat == "dict":
            if uncomment:
                reader = csv.DictReader(uncommentFilter(csvFile),
                                        delimiter=delimiter)
            else:
                reader = csv.DictReader(csvFile, delimiter=delimiter)
            for rowD in reader:
                oL.append(rowD)
        elif rowFormat == "list":
            if uncomment:
                reader = csv.reader(uncommentFilter(csvFile),
                                    delimiter=delimiter)
            else:
                reader = csv.reader(csvFile, delimiter=delimiter)
            for rowL in reader:
                oL.append(rowL)
        return oL

    def deserializeCsvIter(self,
                           filePath,
                           delimiter=",",
                           rowFormat="dict",
                           encodingErrors="ignore",
                           uncomment=True,
                           **kwargs):
        """Return an iterator to input CSV format file.

        Args:
            filePath (str): input file path
            delimiter (str, optional): CSV delimiter. Defaults to ",".
            rowFormat (str, optional): format for each processed row (list or dict). Defaults to "dict".
            encodingErrors (str, optional): treatment of encoding errors. Defaults to "ignore".
            uncomment (bool, optional): flag to ignore leading comments. Defaults to True.

        Returns:
            (iterator): iterator for rowwise access to processed CSV data
        """
        encoding = kwargs.get("encoding", "utf-8-sig")
        maxInt = sys.maxsize
        csv.field_size_limit(maxInt)
        try:
            if filePath[-3:] == ".gz":
                with gzip.open(filePath,
                               "rt",
                               encoding=encoding,
                               errors=encodingErrors) as csvFile:
                    startIt = itertools.dropwhile(
                        lambda x: x.startswith("#"),
                        csvFile) if uncomment else csvFile
                    if rowFormat == "dict":
                        reader = csv.DictReader(startIt, delimiter=delimiter)
                    elif rowFormat == "list":
                        reader = csv.reader(startIt, delimiter=delimiter)
                    for row in reader:
                        yield row
            else:
                with io.open(filePath,
                             newline="",
                             encoding=encoding,
                             errors="ignore") as csvFile:
                    startIt = itertools.dropwhile(
                        lambda x: x.startswith("#"),
                        csvFile) if uncomment else csvFile
                    if rowFormat == "dict":
                        reader = csv.DictReader(startIt, delimiter=delimiter)
                    elif rowFormat == "list":
                        reader = csv.reader(startIt, delimiter=delimiter)
                    for row in reader:
                        # if uncomment and row.startswith("#"):
                        #    continue
                        yield row
        except Exception as e:
            logger.error("Unable to deserialize %r %s", filePath, str(e))

    def __deserializeCsv(self,
                         filePath,
                         delimiter=",",
                         rowFormat="dict",
                         encodingErrors="ignore",
                         uncomment=True,
                         **kwargs):
        oL = []
        encoding = kwargs.get("encoding", "utf-8-sig")
        try:
            if filePath[-3:] == ".gz":
                if sys.version_info[0] > 2:
                    with gzip.open(filePath,
                                   "rt",
                                   encoding=encoding,
                                   errors=encodingErrors) as csvFile:
                        oL = self.__csvReader(csvFile,
                                              rowFormat,
                                              delimiter,
                                              uncomment=uncomment)
                else:
                    # Py2 situation non-ascii encodings is problematic
                    # with gzip.open(filePath, "rb") as csvFile:
                    #    oL = self.__csvReader(csvFile, rowFormat, delimiter)
                    tPath = self.__fileU.uncompress(filePath, outputDir=None)
                    with io.open(tPath,
                                 newline="",
                                 encoding=encoding,
                                 errors="ignore") as csvFile:
                        oL = self.__csvReader(csvFile,
                                              rowFormat,
                                              delimiter,
                                              uncomment=uncomment)
            else:
                with io.open(filePath,
                             newline="",
                             encoding=encoding,
                             errors="ignore") as csvFile:
                    oL = self.__csvReader(csvFile,
                                          rowFormat,
                                          delimiter,
                                          uncomment=uncomment)

            return oL
        except Exception as e:
            logger.error("Unable to deserialize %r %s", filePath, str(e))
        #
        logger.debug("Reading list length %d", len(oL))
        return oL

    def __serializeCsv(self, filePath, rowDictList, fieldNames=None, **kwargs):
        """ """
        _ = kwargs
        try:
            wD = {}
            ret = False
            fNames = fieldNames if fieldNames else list(rowDictList[0].keys())
            # with io.open(filePath, 'w', newline='') as csvFile:
            with open(filePath, "w") as csvFile:
                writer = csv.DictWriter(csvFile, fieldnames=fNames)
                writer.writeheader()
                for ii, rowDict in enumerate(rowDictList):
                    try:
                        wD = {k: v for k, v in rowDict.items() if k in fNames}
                        writer.writerow(wD)
                    except Exception as e:
                        logger.error(
                            "Skipping bad CSV record %d wD %r rowDict %r with %s",
                            ii + 1, wD, rowDict, str(e))
                        continue

            ret = True
        except Exception as e:
            logger.error("Failing for %s : %r with %s", filePath, wD, str(e))
        return ret

    def __csvEncoder(self,
                     csvData,
                     encoding="utf-8-sig",
                     encodingErrors="ignore"):
        """Handle encoding issues for gzipped data in Py2. (beware of the BOM chars)

        Args:
            csvData (text lines): uncompressed data from gzip open
            encoding (str, optional): character encoding. Defaults to "utf-8-sig".
            encodingErrors (str, optional): error treatment. Defaults to "ignore".
        """
        for line in csvData:
            yield line.decode("utf-8-sig", errors=encodingErrors).encode(
                encoding, errors=encodingErrors)

    def __deserializeXmlPrev(self, filePath, **kwargs):
        """Read the input XML file path and return an ElementTree data object instance.

        Args:
            filePath (str): input XML file path

        Returns:
            object: instance of an ElementTree tree object
        """
        _ = kwargs
        tree = None
        try:
            logger.debug("Parsing XML path %s", filePath)
            if filePath[-3:] == ".gz":
                with gzip.open(filePath, mode="rb") as ifh:
                    tV = time.time()
                    tree = ET.parse(ifh)
            else:
                with open(filePath, mode="rb") as ifh:
                    tV = time.time()
                    tree = ET.parse(ifh)
            logger.debug("Parsed %s in %.2f seconds", filePath,
                         time.time() - tV)
        except Exception as e:
            logger.error("Unable to deserialize %r %s", filePath, str(e))
        #
        return tree

    def __testGzip(self, filePath):
        ok = True
        with gzip.open(filePath, "r") as fh:
            try:
                fh.read(1)
            except gzip.BadGzipFile:
                ok = False
            except Exception:
                ok = False
        logger.debug("Gzip file check %r", ok)
        return ok

    def __deserializeXml(self, filePath, **kwargs):
        """Read the input XML file path and return an ElementTree data object instance.

        Args:
            filePath (str): input XML file path

        Returns:
            object: instance of an ElementTree tree object
        """
        _ = kwargs
        tree = None
        encoding = kwargs.get("encoding", "utf-8-sig")
        encodingErrors = kwargs.get("encodingErrors", "ignore")
        #
        try:
            logger.debug("Parsing XML path %s", filePath)
            if filePath[-3:] == ".gz" and self.__testGzip(filePath):
                if sys.version_info[0] > 2:
                    with gzip.open(filePath,
                                   "rt",
                                   encoding=encoding,
                                   errors=encodingErrors) as ifh:
                        tV = time.time()
                        tree = ET.parse(ifh)
                else:
                    tPath = self.__fileU.uncompress(filePath, outputDir=None)
                    with io.open(tPath,
                                 encoding=encoding,
                                 errors=encodingErrors) as ifh:
                        tV = time.time()
                        tree = ET.parse(ifh)
            else:
                with io.open(filePath,
                             encoding=encoding,
                             errors=encodingErrors) as ifh:
                    tV = time.time()
                    tree = ET.parse(ifh)
            logger.debug("Parsed %s in %.2f seconds", filePath,
                         time.time() - tV)
        except Exception as e:
            logger.error("Unable to deserialize %r %s", filePath, str(e))
        #
        return tree
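
A brief usage sketch for the IoUtil() class defined above, assuming the class is importable in the current scope; the CACHE paths are placeholders.

from collections import OrderedDict

ioU = IoUtil()

# Round-trip a dictionary through JSON (fmt defaults to "pickle" when omitted).
data = OrderedDict([("A", 1), ("B", 2), ("C", 3)])
ok = ioU.serialize("CACHE/example.json", data, fmt="json", indent=2)
recovered = ioU.deserialize("CACHE/example.json", fmt="json")

# Split the same dictionary across two part files and reassemble it.
ok = ioU.serializeInParts("CACHE/example_parts.json", data, numParts=2, fmt="json")
reassembled = ioU.deserializeInParts("CACHE/example_parts.json", numParts=2, fmt="json")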
Example #13
class DataTypeApiProvider(SingletonClass):
    """ Data type application and instance information provider.
    """
    def __init__(self, cfgOb, cachePath, useCache=True, **kwargs):
        """Data type application and instance information provider.

        Args:
            cfgOb (object):  ConfigInfo() object instance
            cachePath (str): path to hold the cache directory
            useCache (bool, optional): flag to use cached files. Defaults to True.

        """
        self.__cfgOb = cfgOb
        self.__configName = self.__cfgOb.getDefaultSectionName()
        self.__useCache = useCache
        self.__cachePath = cachePath
        # self.__contentInfoConfigName = "content_info_helper_configuration"
        self.__fileU = FileUtil()
        self.__contentDefHelper = self.__cfgOb.getHelper(
            "CONTENT_DEF_HELPER_MODULE",
            sectionName=self.__configName,
            cfgOb=self.__cfgOb)
        self.__dirPath = os.path.join(
            cachePath,
            self.__cfgOb.get("DATA_TYPE_INFO_CACHE_DIR",
                             sectionName=self.__configName))
        self.__kwargs = kwargs
        #
        logger.debug("Leaving constructor")

    def getDataTypeInstanceApi(self, databaseName, **kwargs):
        """Return instance of DataTypeInstanceInfo().

        Args:
            databaseName (str): database name

        Returns:
            (object): Instance of DataTypeInstanceInfo()
        """
        _ = kwargs
        dataTypeInstanceLocatorPath = self.__cfgOb.getPath(
            "INSTANCE_DATA_TYPE_INFO_LOCATOR_PATH",
            sectionName=self.__configName)
        dataTypeInstanceFile = self.__contentDefHelper.getDataTypeInstanceFile(
            databaseName) if self.__contentDefHelper else None
        if dataTypeInstanceLocatorPath and dataTypeInstanceFile:
            loc = os.path.join(dataTypeInstanceLocatorPath,
                               dataTypeInstanceFile)
            filePath = self.__reload(loc,
                                     self.__dirPath,
                                     useCache=self.__useCache)
            dtApi = DataTypeInstanceInfo(filePath)
        else:
            # DataTypeInstanceInfo() provides an internal by-pass mode where no coverage data is available.
            dtApi = DataTypeInstanceInfo(None)
            logger.debug("No data coverage available for database %s",
                         databaseName)
        return dtApi

    def getDataTypeApplicationApi(self, appName, **kwargs):
        """Return instance of DataTypeApplicationInfo.

        Args:
            appName (str): application name (e.g., SQL, ANY)

        Returns:
            (object): Instance of DataTypeApplicationInfo()
        """
        _ = kwargs
        dataTypeApplicationLocator = self.__cfgOb.getPath(
            "APP_DATA_TYPE_INFO_LOCATOR", sectionName=self.__configName)
        filePath = self.__reload(dataTypeApplicationLocator,
                                 self.__dirPath,
                                 useCache=self.__useCache)
        dtApi = DataTypeApplicationInfo(
            filePath, dataTyping=appName,
            workPath=self.__dirPath) if filePath else None
        return dtApi

    def __reload(self, urlTarget, dirPath, useCache=True):
        #
        fn = self.__fileU.getFileName(urlTarget)
        filePath = os.path.join(dirPath, fn)
        logger.debug("Using cache path %s", dirPath)
        self.__fileU.mkdir(dirPath)
        if not useCache:
            try:
                os.remove(filePath)
            except Exception:
                pass
        #
        if useCache and self.__fileU.exists(filePath):
            ok = True
        else:
            logger.debug("Fetch data from source %s", urlTarget)
            ok = self.__fileU.get(urlTarget, os.path.join(dirPath, fn))

        return filePath if ok else None
Example #14
class DictionaryApiProvider(SingletonClass):
    """ Resource provider for dictionary APIs.
    """
    def __init__(self, dirPath, useCache=True):
        """Resource provider for dictionary APIs.

        Args:
            dirPath (str): path to the directory containing cache files
            useCache (bool, optional): flag to use cached files. Defaults to True.

        """
        self.__apiMap = {}
        self.__dirPath = dirPath
        self.__useCache = useCache
        #
        self.__fileU = FileUtil(workPath=self.__dirPath)
        logger.debug("Leaving constructor")

    def __reload(self, dictLocators, dirPath, useCache=True):
        """Reload local cache of dictionary resources and return a dictionary API instance.

        Args:
            dictLocators (list, str): list of locators for dictionary resource files
            dirPath (str): path to the directory containing cache files
            useCache (bool, optional): flag to use cached files. Defaults to True.

        Returns:
            (object): instance of dictionary API
        """
        #
        # verify the existence of the cache directory ...
        self.__fileU.mkdir(dirPath)
        if not useCache:
            for dictLocator in dictLocators:
                try:
                    fn = self.__fileU.getFileName(dictLocator)
                    os.remove(os.path.join(dirPath, fn))
                except Exception:
                    pass
        #
        ret = True
        for dictLocator in dictLocators:
            cacheFilePath = os.path.join(dirPath,
                                         self.__fileU.getFileName(dictLocator))
            if useCache and self.__fileU.exists(cacheFilePath):
                # nothing to do
                continue
            logger.debug("Fetching url %s caching in %s", dictLocator,
                         cacheFilePath)
            ok = self.__fileU.get(dictLocator, cacheFilePath)
            ret = ret and ok
        return ret

    def getApi(self, dictLocators, **kwargs):
        """Return a dictionary API object of the input dictioaries.

        Arguments:
            dictLocators {list str} -- list of dictionary locator paths

        Returns:
            [object] -- returns DictionaryApi() object for input dictionaries
        """
        dictFileNames = [
            self.__fileU.getFileName(dictLocator)
            for dictLocator in dictLocators
        ]
        dictTup = tuple(dictFileNames)
        dApi = self.__apiMap[
            dictTup] if dictTup in self.__apiMap else self.__getApi(
                dictLocators, **kwargs)
        self.__apiMap[dictTup] = dApi
        return dApi

    def __getApi(self, dictLocators, **kwargs):
        """ Return an instance of a dictionary API instance for the input dictionary locator list.
        """
        consolidate = kwargs.get("consolidate", True)
        replaceDefinition = kwargs.get("replaceDefinitions", True)
        verbose = kwargs.get("verbose", True)
        #
        ok = self.__reload(dictLocators,
                           self.__dirPath,
                           useCache=self.__useCache)
        #
        dApi = None
        if ok:
            mU = MarshalUtil()
            containerList = []
            for dictLocator in dictLocators:
                cacheFilePath = os.path.join(
                    self.__dirPath, self.__fileU.getFileName(dictLocator))
                containerList.extend(
                    mU.doImport(cacheFilePath, fmt="mmcif-dict"))
            #
            dApi = DictionaryApi(containerList=containerList,
                                 consolidate=consolidate,
                                 replaceDefinition=replaceDefinition,
                                 verbose=verbose)
        return dApi
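
A hedged usage sketch for the DictionaryApiProvider() class above, assuming the class and its DictionaryApi/MarshalUtil/FileUtil dependencies are importable; the dictionary locator URL and cache directory are placeholders, not values taken from the snippet.

dictLocators = ["https://mmcif.wwpdb.org/dictionaries/ascii/mmcif_pdbx_v5_next.dic"]
dP = DictionaryApiProvider(dirPath="CACHE/dictionaries", useCache=True)
dApi = dP.getApi(dictLocators)
print("DictionaryApi constructed: %r" % (dApi is not None))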
Example #15
class FileUtilTests(unittest.TestCase):
    def setUp(self):
        self.__verbose = True
        self.__pathPdbxDictionaryFile = os.path.join(TOPDIR, "rcsb",
                                                     "mock-data",
                                                     "dictionaries",
                                                     "mmcif_pdbx_v5_next.dic")

        self.__pathTaxonomyFile = os.path.join(TOPDIR, "rcsb", "mock-data",
                                               "NCBI", "names.dmp.gz")
        self.__zipFileUrl = "https://inventory.data.gov/dataset/794cd3d7-4d28-4408-8f7d-84b820dbf7f2/resource/6b78ec0c-4980-4ad8-9cbd-2d6eb9eda8e7/download/myfoodapediadata.zip"
        self.__xzFile = os.path.join(TOPDIR, "rcsb", "mock-data",
                                     "MOCK_MODBASE_MODELS",
                                     "NP_001030614.1_1.pdb.xz")
        #
        self.__ftpFileUrl = "ftp://ftp.wwpdb.org/pub/pdb/data/component-models/complete/chem_comp_model.cif.gz"
        self.__httpsFileUrl = "https://ftp.wwpdb.org/pub/pdb/data/component-models/complete/chem_comp_model.cif.gz"
        #
        self.__workPath = os.path.join(HERE, "test-output")
        self.__inpDirPath = os.path.join(HERE, "test-data")
        self.__fileU = FileUtil()
        self.__startTime = time.time()
        logger.debug("Running tests on version %s", __version__)
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                     endTime - self.__startTime)

    def testTarBundling(self):
        """Test case for tarfile bundling and unbundling"""
        try:
            tP = os.path.join(self.__workPath, "t0.tar.gz")
            dirPath = os.path.join(self.__inpDirPath, "topdir")

            ok = self.__fileU.bundleTarfile(tP, [dirPath],
                                            mode="w:gz",
                                            recursive=True)
            self.assertTrue(ok)

            numBytes = self.__fileU.size(tP)
            self.assertGreaterEqual(numBytes, 250)
            #
            md5 = self.__fileU.hash(tP, hashType="md5")
            self.assertTrue(md5 is not None)
            #
            ok = self.__fileU.unbundleTarfile(tP, dirPath=self.__workPath)
            self.assertTrue(ok)
            #
            tP = os.path.join(self.__workPath, "t1.tar.gz")
            dirPathList = [
                os.path.join(self.__inpDirPath, "topdir", "subdirA"),
                os.path.join(self.__inpDirPath, "topdir", "subdirB")
            ]

            ok = self.__fileU.bundleTarfile(tP,
                                            dirPathList,
                                            mode="w:gz",
                                            recursive=True)
            self.assertTrue(ok)
            #
            ok = self.__fileU.unbundleTarfile(tP, dirPath=self.__workPath)
            self.assertTrue(ok)

            tP = os.path.join(self.__workPath, "t2.tar")
            dirPathList = [
                os.path.join(self.__inpDirPath, "topdir", "subdirA"),
                os.path.join(self.__inpDirPath, "topdir", "subdirB")
            ]

            ok = self.__fileU.bundleTarfile(tP,
                                            dirPathList,
                                            mode="w",
                                            recursive=True)
            self.assertTrue(ok)
            #
            ok = self.__fileU.unbundleTarfile(tP, dirPath=self.__workPath)
            self.assertTrue(ok)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testGetFile(self):
        """Test case for a local files and directories"""
        try:
            remoteLocator = self.__pathPdbxDictionaryFile
            fn = self.__fileU.getFileName(remoteLocator)
            # _, fn = os.path.split(remoteLocator)
            lPath = os.path.join(self.__workPath, fn)
            ok = self.__fileU.get(remoteLocator, lPath)
            self.assertTrue(ok)
            ok = self.__fileU.exists(lPath)
            self.assertTrue(ok)
            ok = self.__fileU.isLocal(lPath)
            self.assertTrue(ok)
            tPath = self.__fileU.getFilePath(lPath)
            self.assertEqual(lPath, tPath)
            ok = self.__fileU.remove(lPath)
            self.assertTrue(ok)
            dPath = os.path.join(self.__workPath, "tdir")
            ok = self.__fileU.mkdir(dPath)
            self.assertTrue(ok)
            ok = self.__fileU.remove(dPath)
            self.assertTrue(ok)
            ok = self.__fileU.remove(";lakdjf")
            self.assertTrue(ok)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testMoveAndCopyFile(self):
        """Test case for copying ("put") and moving ("replace") local files"""
        try:
            remoteLocator = self.__pathPdbxDictionaryFile
            fn = self.__fileU.getFileName(remoteLocator)
            # _, fn = os.path.split(remoteLocator)
            lPath = os.path.join(self.__workPath, fn)
            ok = self.__fileU.get(remoteLocator, lPath)
            self.assertTrue(ok)
            # Test copy file
            dPath2 = os.path.join(self.__workPath, "tdir")
            ok = self.__fileU.mkdir(dPath2)
            self.assertTrue(ok)
            lPath2 = os.path.join(dPath2, fn)
            ok = self.__fileU.put(lPath, lPath2)
            self.assertTrue(ok)
            ok = self.__fileU.exists(lPath)
            self.assertTrue(ok)
            ok = self.__fileU.exists(lPath2)
            self.assertTrue(ok)
            # Remove copied file (to test moving file next)
            ok = self.__fileU.remove(lPath2)
            self.assertTrue(ok)
            ok = self.__fileU.exists(lPath2)
            self.assertFalse(ok)
            # Test move file
            ok = self.__fileU.replace(lPath, lPath2)
            self.assertTrue(ok)
            ok = self.__fileU.exists(lPath)
            self.assertFalse(ok)
            ok = self.__fileU.exists(lPath2)
            self.assertTrue(ok)
            # Now clean up files and dirs
            ok = self.__fileU.remove(lPath)
            self.assertTrue(ok)
            ok = self.__fileU.remove(dPath2)
            self.assertTrue(ok)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testZipUrl(self):
        """Test case for downloading remote zip file and extracting contents."""
        try:
            remoteLocator = self.__zipFileUrl
            # fn = self.__fileU.getFileName(remoteLocator)
            ok = self.__fileU.isLocal(remoteLocator)
            self.assertFalse(ok)
            #
            lPath = os.path.join(self.__workPath,
                                 self.__fileU.getFileName(self.__zipFileUrl))
            ok = self.__fileU.get(remoteLocator, lPath)
            self.assertTrue(ok)
            ok = self.__fileU.exists(lPath)
            self.assertTrue(ok)
            ok = self.__fileU.isLocal(lPath)
            self.assertTrue(ok)
            tPath = self.__fileU.getFilePath(lPath)
            self.assertEqual(lPath, tPath)
            fp = self.__fileU.uncompress(lPath, outputDir=self.__workPath)
            ok = fp.endswith("Food_Display_Table.xlsx")
            self.assertTrue(ok)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testFtpUrl(self):
        """Test case for downloading remote file ftp protocol and extracting contents."""
        try:
            remoteLocator = self.__ftpFileUrl
            # fn = self.__fileU.getFileName(remoteLocator)
            ok = self.__fileU.isLocal(remoteLocator)
            self.assertFalse(ok)
            #
            dirPath = os.path.join(self.__workPath, "chem_comp_models")
            lPath = os.path.join(dirPath,
                                 self.__fileU.getFileName(self.__ftpFileUrl))
            ok = self.__fileU.get(remoteLocator, lPath)
            self.assertTrue(ok)
            ok = self.__fileU.exists(lPath)
            self.assertTrue(ok)
            ok = self.__fileU.isLocal(lPath)
            self.assertTrue(ok)
            tPath = self.__fileU.getFilePath(lPath)
            self.assertEqual(lPath, tPath)
            fp = self.__fileU.uncompress(lPath, outputDir=dirPath)
            ok = fp.endswith("chem_comp_model.cif")
            self.assertTrue(ok)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testRemote(self):
        """Test case remote status"""
        try:
            remoteLocator = self.__httpsFileUrl
            ok = self.__fileU.isLocal(remoteLocator)
            self.assertFalse(ok)
            #
            ok = self.__fileU.exists(remoteLocator)
            self.assertTrue(ok)
            size = self.__fileU.size(remoteLocator)
            self.assertGreaterEqual(size, 1000)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    @unittest.skip("DrugBank example -- skipping")
    def testGetDrugBankUrl(self):
        """Test case for downloading drugbank master xml file"""
        try:
            remoteLocator = "https://www.drugbank.ca/releases/latest/downloads/all-full-database"
            un = "username"
            pw = "password"
            # fn = self.__fileU.getFileName(remoteLocator)
            ok = self.__fileU.isLocal(remoteLocator)
            self.assertFalse(ok)
            #
            lPath = os.path.join(self.__workPath, "db-download.zip")
            ok = self.__fileU.get(remoteLocator,
                                  lPath,
                                  username=un,
                                  password=pw)
            self.assertTrue(ok)
            ok = self.__fileU.exists(lPath)
            self.assertTrue(ok)
            ok = self.__fileU.isLocal(lPath)
            self.assertTrue(ok)
            tPath = self.__fileU.getFilePath(lPath)
            self.assertEqual(lPath, tPath)
            self.__fileU.uncompress(lPath, outputDir=self.__workPath)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testXzFile(self):
        """Test case for extracting contents from xz file"""
        try:
            remoteLocator = self.__xzFile
            fn = self.__fileU.getFileName(remoteLocator)
            lPath = os.path.join(self.__workPath, fn)
            ok = self.__fileU.get(remoteLocator, lPath)
            self.assertTrue(ok)
            ok = self.__fileU.exists(lPath)
            self.assertTrue(ok)
            ok = self.__fileU.isLocal(lPath)
            self.assertTrue(ok)
            tPath = self.__fileU.getFilePath(lPath)
            self.assertEqual(lPath, tPath)
            fp = self.__fileU.uncompress(lPath, outputDir=self.__workPath)
            ok = fp.endswith(".pdb")
            self.assertTrue(ok)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
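
A minimal sketch for running the test cases above with the standard unittest runner; the suite-builder function below is illustrative and not part of the original module.

import unittest

def suiteFileUtil():
    suiteSelect = unittest.TestSuite()
    suiteSelect.addTest(FileUtilTests("testTarBundling"))
    suiteSelect.addTest(FileUtilTests("testGetFile"))
    suiteSelect.addTest(FileUtilTests("testMoveAndCopyFile"))
    return suiteSelect

if __name__ == "__main__":
    unittest.TextTestRunner(verbosity=2).run(suiteFileUtil())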
Example #16
0
class CODModelSearch(object):
    def __init__(self, cachePath, **kwargs):
        self.__cachePath = cachePath
        #
        self.__useCache = kwargs.get("useCache", True)
        self.__ccUrlTarget = kwargs.get("ccUrlTarget", None)
        self.__birdUrlTarget = kwargs.get("birdUrlTarget", None)
        self.__descriptorUrlTarget = kwargs.get(
            "descriptorUrlTarget",
            "http://www.crystallography.net/cod/smi/allcod.smi")
        self.__prefix = kwargs.get("prefix", None)
        self.__numProc = kwargs.get("numProc", 4)
        self.__chunkSize = kwargs.get("chunkSize", 50)
        self.__ccFileNamePrefix = "cc-%s" % self.__prefix if self.__prefix else "cc-full"
        self.__fU = FileUtil()
        # self.__ccmG = ChemCompModelGen(self.__cachePath, self.__prefix)

    def getResultIndex(self):
        mU = MarshalUtil(workPath=self.__cachePath)
        cD = mU.doImport(self.getResultFilePath(), fmt="json")
        return cD

    def getResultDetails(self, codId):
        mU = MarshalUtil(workPath=self.__cachePath)
        dD = mU.doImport(self.__getCodDetailsFilePath(codId), fmt="json")
        return dD

    def storeResultIndex(self, cD):
        mU = MarshalUtil(workPath=self.__cachePath)
        ok = mU.doExport(self.getResultFilePath(), cD, fmt="json", indent=3)
        return ok

    def getResultDirFilePath(self):
        dN = "cod-%s-result-files" % self.__prefix if self.__prefix else "cod-result-files"
        return os.path.join(self.__cachePath, dN)

    def getRawResultFilePath(self):
        dN = "cod-%s-result-files" % self.__prefix if self.__prefix else "cod-search-files"
        return os.path.join(self.__cachePath, dN,
                            "cod-raw-result-file-index.json")

    def getResultFilePath(self):
        dN = "cod-%s-result-files" % self.__prefix if self.__prefix else "cod-search-files"
        return os.path.join(self.__cachePath, dN, "cod-result-file-index.json")

    def getDescriptorPath(self):
        fn = self.__fU.getFileName(self.__descriptorUrlTarget)
        dirPath = self.getResultDirFilePath()
        filePath = os.path.join(dirPath, fn)
        return filePath

    def updateDescriptors(self):
        self.__fetchUrl(self.__descriptorUrlTarget,
                        filePath=self.getDescriptorPath(),
                        useCache=False)

    def __fetchUrl(self, urlTarget, filePath, useCache=False, noRetry=False):
        ok = False
        try:
            if not (useCache and self.__fU.exists(filePath)):
                startTime = time.time()
                ok = self.__fU.get(urlTarget, filePath, noRetry=noRetry)
                endTime = time.time()
                if ok:
                    logger.debug(
                        "Fetched %s for resource file %s (status = %r) (%.4f seconds)",
                        urlTarget, filePath, ok, endTime - startTime)
                else:
                    logger.error(
                        "Failing fetch for %s for resource file %s (status = %r) (%.4f seconds)",
                        urlTarget, filePath, ok, endTime - startTime)
            else:
                ok = True
                logger.debug("Using cached data for %s", urlTarget)
            #
        except Exception as e:
            logger.exception("Failing for %r with %s", urlTarget, str(e))
        return ok

    def search(self, molLimit=None):
        try:
            bsw = BatchChemSearch(
                useCache=self.__useCache,
                ccUrlTarget=self.__ccUrlTarget,
                birdUrlTarget=self.__birdUrlTarget,
                ccFileNamePrefix=self.__ccFileNamePrefix,
                cachePath=self.__cachePath,
                numProc=self.__numProc,
                chunkSize=self.__chunkSize,
            )
            smiPath = self.getDescriptorPath()
            smiL = bsw.fetchDescriptorList(smiPath, swap=True)
            logger.info("Query length (%d)", len(smiL))
            #
            smiL = bsw.splitSmiles(smiL)
            retL = bsw.doQuery(smiL[:molLimit],
                               "SMILES",
                               matchOpts="graph-exact")
            logger.info("Result length (%d)", len(retL))
            #
            for ii, ret in enumerate(retL, 1):
                logger.debug("%5d %8s %4s (%.3f) %s: %s", ii, ret.queryId,
                             ret.ccId, ret.fpScore, ret.queryType, ret.query)
            #
            fp = self.getRawResultFilePath()
            ok = bsw.storeMatchList(fp, retL)
            return len(retL) if ok else 0
        except Exception as e:
            logger.exception("Failing with %s", str(e))

    def __getSearchResults(self):
        """Read search results and convert to a chemical component dictionary."""
        fp = self.getRawResultFilePath()
        mU = MarshalUtil(workPath=self.__cachePath)
        rawL = mU.doImport(fp, fmt="json")
        rD = {}
        for cD in rawL:
            rD.setdefault(cD["ccId"], []).append(cD)
        return rD

    def __getCodEntryUrl(self, codId):
        # Template Examples:
        # https://molecules.crystallography.net/cod/sdf/1/00/00/1000098.sdf
        # https://molecules.crystallography.net/cod/sdf/6/00/05/6000557.sdf
        #
        baseUrl = "https://molecules.crystallography.net/cod/sdf"
        url = os.path.join(baseUrl, codId[0:1], codId[1:3], codId[3:5],
                           codId + ".sdf")
        return url

    def __getCodDetailsUrl(self, codId):
        baseUrl = "http://www.crystallography.net/cod/optimade/structures"
        url = os.path.join(baseUrl, codId)
        return url

    def __getCodDetailsFilePath(self, codId):
        dirPath = self.getResultDirFilePath()
        fp = os.path.join(dirPath, "cod-data", codId[0:1], codId[1:3],
                          codId[3:5], codId + ".json")
        return fp

    def __getCodEntryFilePath(self, codId):
        dirPath = self.getResultDirFilePath()
        fp = os.path.join(dirPath, "cod-data", codId[0:1], codId[1:3],
                          codId[3:5], codId + ".sdf")
        return fp

    def fetchMatchedData(self, useCache=True):
        """Fetch COD matched entries and metadata and update the raw search index with essential COD data attrbutes.

        Args:
            useCache (bool, optional): use any cached COD data. Defaults to True.

        Returns:
            int: search result count

        """
        eCount = 0
        eSkip = 0
        rcD = {}
        cD = self.__getSearchResults()
        #
        for ccId, qDL in cD.items():
            # cifPath = self.__ccmG.getChemCompPath(ccId)
            # if not cifPath:
            #    logger.info("No CIF for %s skipping", ccId)
            #    continue
            parentId = ccId.split("|")[0]
            rqDL = []
            for qD in qDL:
                codId = qD["queryId"]
                codEntryFilePath = self.__getCodEntryFilePath(codId)
                codDetailsFilePath = self.__getCodDetailsFilePath(codId)
                ok1 = self.__fetchUrl(self.__getCodEntryUrl(codId),
                                      self.__getCodEntryFilePath(codId),
                                      useCache=useCache,
                                      noRetry=True)
                ok2 = self.__fetchUrl(self.__getCodDetailsUrl(codId),
                                      self.__getCodDetailsFilePath(codId),
                                      useCache=useCache,
                                      noRetry=True)
                tD = self.getResultDetails(codId)
                dD = tD["data"][
                    "attributes"] if "data" in tD and "attributes" in tD[
                        "data"] else {}
                mD = tD["meta"][
                    "implementation"] if "meta" in tD and "implementation" in tD[
                        "meta"] else {}
                if ok1 & ok2:
                    logger.info("Fetched COD entry and details for %s (%r)",
                                codId, ok1 & ok2)
                    eCount += 1
                    qD["codEntryFilePath"] = codEntryFilePath
                    qD["codDetailsFilePath"] = codDetailsFilePath
                    # qD["cifPath"] = cifPath
                    qD["parentId"] = parentId
                    qD["chemicalName"] = dD[
                        "_cod_commonname"] if "_cod_commonname" in dD else None
                    qD["chemicalName"] = dD[
                        "_cod_chemname"] if "_cod_chemname" in dD else qD[
                            "chemicalName"]
                    qD["rValue"] = dD[
                        "_cod_Robs"] if "_cod_Robs" in dD else None
                    qD["diffrnTemp"] = dD[
                        "_cod_diffrtemp"] if "_cod_diffrtemp" in dD else None
                    qD["radiationSource"] = dD[
                        "_cod_radType"] if "_cod_radType" in dD else None
                    qD["publicationDOI"] = dD[
                        "_cod_doi"] if "_cod_doi" in dD else None
                    qD["version"] = mD["version"] if "version" in mD else None
                    qD["hasDisorder"] = "N"
                    rqDL.append(qD)
                else:
                    logger.info("Skipping entry missing data for %r at %r",
                                codId, self.__getCodEntryUrl(codId))
                    eSkip += 1
            if rqDL:
                rcD[ccId] = rqDL
        #
        ok = self.storeResultIndex(rcD)
        logger.info(
            "Final match result (w/sdf and metadata) (%d/%d) cod hits (%d) skipped (%d)",
            len(rcD), len(cD), eCount, eSkip)
        return eCount if ok else 0

    def fetchMatchedDataMp(self, numProc=6, chunkSize=5, useCache=True):
        rcD = {}
        cD = self.__getSearchResults()
        idList = list(cD.keys())
        # ---
        mpu = MultiProcUtil(verbose=True)
        mpu.setWorkingDir(self.__cachePath)
        mpu.setOptions(optionsD={
            "resultPath": self.__cachePath,
            "cD": cD,
            "useCache": useCache
        })
        mpu.set(workerObj=self, workerMethod="fetchDataWorker")

        ok, failList, resultList, _ = mpu.runMulti(dataList=idList,
                                                   numProc=numProc,
                                                   numResults=1,
                                                   chunkSize=chunkSize)
        logger.info("Run ended with status %r success count %d failures %r",
                    ok, len(resultList[0]), len(failList))
        for rTup in resultList[0]:
            rcD[rTup[0]] = rTup[1]
        # ---
        ok = self.storeResultIndex(rcD)
        logger.info("Final match result (w/sdf and metadata) (%d/%d)",
                    len(rcD), len(cD))
        return True

    def fetchDataWorker(self, dataList, procName, optionsD, workingDir):
        """Worker method to fetch COD data for matched entries

        Args:
            dataList (list): list of chemical component identifiers to process
            procName (str): processName
            optionsD (dict): dictionary of options
            workingDir (str): path to working directory (not used)

        Returns:
            (successList, resultList, []): success and result lists of chemical component identifiers with COD matches
        """
        resultPath = optionsD["resultPath"]
        cD = optionsD["cD"]
        useCache = optionsD["useCache"]
        _ = workingDir
        resultList = []
        successList = []
        startTime = time.time()
        logger.info("starting %s at %s", procName,
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
        #
        eCount = 0
        eSkip = 0
        try:
            stopPath = os.path.join(resultPath, "STOP")
            logger.info("%s starting search data length %d", procName,
                        len(dataList))
            if self.__checkStop(stopPath):
                logger.info("%s stopping", procName)
                return resultList, resultList, []
            #
            # for ccId, qDL in cD.items():
            for ccId in dataList:
                if ccId not in cD:
                    continue
                qDL = cD[ccId]
                #
                parentId = ccId.split("|")[0]
                rqDL = []
                for qD in qDL:
                    codId = qD["queryId"]
                    codEntryFilePath = self.__getCodEntryFilePath(codId)
                    codDetailsFilePath = self.__getCodDetailsFilePath(codId)
                    ok1 = self.__fetchUrl(self.__getCodEntryUrl(codId),
                                          self.__getCodEntryFilePath(codId),
                                          useCache=useCache,
                                          noRetry=True)
                    ok2 = self.__fetchUrl(self.__getCodDetailsUrl(codId),
                                          self.__getCodDetailsFilePath(codId),
                                          useCache=useCache,
                                          noRetry=True)
                    tD = self.getResultDetails(codId)
                    dD = tD["data"][
                        "attributes"] if "data" in tD and "attributes" in tD[
                            "data"] else {}
                    mD = tD["meta"][
                        "implementation"] if "meta" in tD and "implementation" in tD[
                            "meta"] else {}
                    if ok1 & ok2:
                        logger.info(
                            "Fetched COD entry and details for %s (%r)", codId,
                            ok1 & ok2)
                        eCount += 1
                        qD["codEntryFilePath"] = codEntryFilePath
                        qD["codDetailsFilePath"] = codDetailsFilePath
                        # qD["cifPath"] = cifPath
                        qD["parentId"] = parentId
                        qD["chemicalName"] = dD[
                            "_cod_commonname"] if "_cod_commonname" in dD else None
                        qD["chemicalName"] = dD[
                            "_cod_chemname"] if "_cod_chemname" in dD else qD[
                                "chemicalName"]
                        qD["rValue"] = dD[
                            "_cod_Robs"] if "_cod_Robs" in dD else None
                        qD["diffrnTemp"] = dD[
                            "_cod_diffrtemp"] if "_cod_diffrtemp" in dD else None
                        qD["radiationSource"] = dD[
                            "_cod_radType"] if "_cod_radType" in dD else None
                        qD["publicationDOI"] = dD[
                            "_cod_doi"] if "_cod_doi" in dD else None
                        qD["version"] = mD[
                            "version"] if "version" in mD else None
                        qD["hasDisorder"] = "N"
                        rqDL.append(qD)
                    else:
                        logger.info("Skipping entry missing data for %r at %r",
                                    codId, self.__getCodEntryUrl(codId))
                        eSkip += 1
                if rqDL:
                    resultList.append((ccId, rqDL))
                    successList.append(ccId)
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        endTime = time.time()
        logger.info(
            "%s (entries %d skipped %d) (ccId result length %d) completed at %s (%.2f seconds)",
            procName,
            eCount,
            eSkip,
            len(successList),
            time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
            endTime - startTime,
        )
        return successList, resultList, []

    def __checkStop(self, path):
        try:
            if os.access(path, os.F_OK):
                return True
        except Exception:
            pass
        return False
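
An end-to-end usage sketch for CODModelSearch, built only from the constructor arguments and methods shown above; the cache path and prefix values are assumptions.

# Illustrative workflow (cachePath and prefix are assumptions).
cms = CODModelSearch("./CACHE", prefix="abc", numProc=4, chunkSize=50)
cms.updateDescriptors()              # refresh the COD SMILES descriptor file
numHits = cms.search(molLimit=None)  # run the batch chemical search
if numHits:
    numFetched = cms.fetchMatchedData(useCache=True)  # fetch SDF and metadata for matches
    resultIndexD = cms.getResultIndex()               # read the consolidated result index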
Example #17
0
    def __reloadFasta(self, dirPath, **kwargs):
        """Reload DrugBank target FASTA data files.

        Args:
            dirPath (str): path to the DrugBank cache directory
            useCache (bool, optional): flag to use cached files. Defaults to True.
            username (str, optional): DrugBank account username. Defaults to None.
            password (str, optional): DrugBank account password. Defaults to None.

        Returns:
            list: paths of the cached FASTA target files (empty if credentials are missing)

        """
        startTime = time.time()
        logger.info("Starting db reload at %s",
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
        retFilePathList = []
        urlTargetL = [
            "https://go.drugbank.com/releases/latest/downloads/target-all-polypeptide-sequences",
            "https://go.drugbank.com/releases/latest/downloads/enzyme-all-polypeptide-sequences",
            "https://go.drugbank.com/releases/latest/downloads/carrier-all-polypeptide-sequences",
            "https://go.drugbank.com/releases/latest/downloads/transporter-all-polypeptide-sequences",
        ]

        useCache = kwargs.get("useCache", True)
        username = kwargs.get("username", None)
        password = kwargs.get("password", None)
        #
        if not username or not password:
            return retFilePathList
        #
        fU = FileUtil()
        fU.mkdir(dirPath)
        #
        if not useCache:
            #  Clear any cached files
            for urlTarget in urlTargetL:
                baseFileName = fU.getFileName(urlTarget)
                zipFileName = baseFileName + ".fasta.zip"
                retFileName = baseFileName + ".fa"
                for fn in [baseFileName, zipFileName, retFileName]:
                    try:
                        fp = os.path.join(dirPath, fn)
                        os.remove(fp)
                    except Exception:
                        pass
        #
        ok = False
        if useCache:
            ok = True
            for urlTarget in urlTargetL:
                baseFileName = fU.getFileName(urlTarget)
                retFileName = baseFileName + ".fa"
                retFilePath = os.path.join(dirPath, retFileName)
                ok = fU.exists(retFilePath)
                if not ok:
                    break
                retFilePathList.append(retFilePath)
        #
        logger.info("Using cached files %r", ok)
        if not useCache or not ok:
            if not username or not password:
                logger.warning(
                    "Missing credentials for DrugBank file download...")

            for urlTarget in urlTargetL:
                baseFileName = fU.getFileName(urlTarget)
                zipFileName = baseFileName + ".fasta.zip"
                retFileName = baseFileName + ".fa"
                zipFilePath = os.path.join(dirPath, zipFileName)
                retFilePath = os.path.join(dirPath, retFileName)
                basePath = os.path.join(dirPath, baseFileName)
                logger.info("Fetching url %s for FASTA target file %s",
                            urlTarget, baseFileName)
                ok = fU.get(urlTarget,
                            zipFilePath,
                            username=username,
                            password=password)
                endTime = time.time()
                logger.info(
                    "Completed db fetch at %s (%.4f seconds)",
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    endTime - startTime)
                #
                ok = fU.unbundleZipfile(zipFilePath, dirPath=basePath)
                fU.put(os.path.join(basePath, "protein.fasta"), retFilePath)
                endTime = time.time()
                logger.info(
                    "Completed unzip at %s (%.4f seconds)",
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    endTime - startTime)
                retFilePathList.append(retFilePath)
        return retFilePathList
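
A standalone sketch of the credential-gated fetch-and-unzip step used above, assuming DrugBank credentials are supplied through environment variables; the variable names and the import path are assumptions for illustration.

import os

from rcsb.utils.io.FileUtil import FileUtil  # import path assumed

dirPath = "./CACHE/DrugBank-targets"
urlTarget = "https://go.drugbank.com/releases/latest/downloads/target-all-polypeptide-sequences"
username = os.environ.get("DRUGBANK_USERNAME")  # assumed variable name
password = os.environ.get("DRUGBANK_PASSWORD")  # assumed variable name

fU = FileUtil()
fU.mkdir(dirPath)
baseFileName = fU.getFileName(urlTarget)
zipFilePath = os.path.join(dirPath, baseFileName + ".fasta.zip")
basePath = os.path.join(dirPath, baseFileName)
if username and password and fU.get(urlTarget, zipFilePath, username=username, password=password):
    fU.unbundleZipfile(zipFilePath, dirPath=basePath)
    # the bundle contains "protein.fasta", which the method above copies to a .fa file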
Example #18
0
class MarshalUtil(object):
    """Wrapper for serialization and deserialization methods."""
    def __init__(self, **kwargs):
        self.__workPath = kwargs.get("workPath", ".")
        self.__workDirSuffix = kwargs.get("workDirSuffix", "marshall_")
        self.__workDirPrefix = kwargs.get("workDirPrefix", "_tempdir")
        #
        self.__fileU = FileUtil(workPath=self.__workPath)
        self.__ioU = IoUtil()

    def doExport(self,
                 locator,
                 obj,
                 fmt="list",
                 marshalHelper=None,
                 numParts=None,
                 **kwargs):
        """Serialize the input object at locator path in specified format.  The
        input object is optionally preprocessed by the helper method.

        Args:
            locator (str): target path or URI
            obj (object): data to be serialized
            fmt (str, optional): format for serialization (mmcif, tdd, csv, list). Defaults to "list".
            marshalHelper (method, optional): pre-processor method applied to input data object. Defaults to None.
            numParts (int, optional): serialize the data in parts. Defaults to None. (json and pickle formats)
        Returns:
            bool: True for success or False otherwise
        """
        try:
            ret = False
            localFlag = self.__fileU.isLocal(locator)
            if marshalHelper:
                myObj = marshalHelper(obj, **kwargs)
            else:
                myObj = obj
            #
            if localFlag and numParts and fmt in ["json", "pickle"]:
                localFilePath = self.__fileU.getFilePath(locator)
                ret = self.__ioU.serializeInParts(localFilePath,
                                                  myObj,
                                                  numParts,
                                                  fmt=fmt,
                                                  **kwargs)
            elif localFlag:
                localFilePath = self.__fileU.getFilePath(locator)
                ret = self.__ioU.serialize(localFilePath,
                                           myObj,
                                           fmt=fmt,
                                           workPath=self.__workPath,
                                           **kwargs)
            else:
                with tempfile.TemporaryDirectory(
                        suffix=self.__workDirSuffix,
                        prefix=self.__workDirPrefix,
                        dir=self.__workPath) as tmpDirName:
                    # write a local copy then copy to destination -
                    #
                    localFilePath = os.path.join(
                        self.__workPath, tmpDirName,
                        self.__fileU.getFileName(locator))
                    ok1 = self.__ioU.serialize(localFilePath,
                                               myObj,
                                               fmt=fmt,
                                               workPath=self.__workPath,
                                               **kwargs)
                    ok2 = True
                    if ok1:
                        ok2 = self.__fileU.put(localFilePath, locator,
                                               **kwargs)
                ret = ok1 and ok2
        except Exception as e:
            logger.exception("Exporting locator %r failing with %s", locator,
                             str(e))

        return ret

    def doImport(self,
                 locator,
                 fmt="list",
                 marshalHelper=None,
                 numParts=None,
                 **kwargs):
        """Deserialize data at the target locator in specified format. The deserialized
        data is optionally post-processed by the input helper method.

        Args:
            locator (str): path or URI to input data
            fmt (str, optional): format for deserialization (mmcif, tdd, csv, list). Defaults to "list".
            marshalHelper (method, optional): post-processor method applied to deserialized data object. Defaults to None.
            numParts (int, optional): deserialize the data in parts. Defaults to None. (json and pickle formats)
            tarMember (str, optional): name of a member of tar file bundle. Defaults to None. (tar file format)

        Returns:
            Any: format specific return type
        """
        try:
            tarMember = kwargs.get("tarMember", None)
            localFlag = self.__fileU.isLocal(locator) and not tarMember
            #
            if localFlag and numParts and fmt in ["json", "pickle"]:
                filePath = self.__fileU.getFilePath(locator)
                ret = self.__ioU.deserializeInParts(filePath,
                                                    numParts,
                                                    fmt=fmt,
                                                    **kwargs)
            elif localFlag:
                filePath = self.__fileU.getFilePath(locator)
                ret = self.__ioU.deserialize(filePath,
                                             fmt=fmt,
                                             workPath=self.__workPath,
                                             **kwargs)
            else:
                #
                if fmt == "mmcif":
                    ret = self.__ioU.deserialize(locator,
                                                 fmt=fmt,
                                                 workPath=self.__workPath,
                                                 **kwargs)
                else:
                    with tempfile.TemporaryDirectory(
                            suffix=self.__workDirSuffix,
                            prefix=self.__workDirPrefix,
                            dir=self.__workPath) as tmpDirName:
                        #
                        # Fetch first then read a local copy -
                        #
                        if tarMember:
                            localFilePath = os.path.join(
                                self.__workPath, tmpDirName, tarMember)
                        else:
                            localFilePath = os.path.join(
                                self.__workPath, tmpDirName,
                                self.__fileU.getFileName(locator))

                        # ---  Local copy approach ---
                        self.__fileU.get(locator, localFilePath, **kwargs)
                        ret = self.__ioU.deserialize(localFilePath,
                                                     fmt=fmt,
                                                     workPath=self.__workPath,
                                                     **kwargs)

            if marshalHelper:
                ret = marshalHelper(ret, **kwargs)
        except Exception as e:
            logger.exception("Importing locator %r failing with %s", locator,
                             str(e))
            ret = None
        return ret

    def exists(self, filePath, mode=os.R_OK):
        return self.__fileU.exists(filePath, mode=mode)

    def mkdir(self, dirPath, mode=0o755):
        return self.__fileU.mkdir(dirPath, mode=mode)

    def remove(self, pth):
        return self.__fileU.remove(pth)
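
A minimal round-trip sketch for the wrapper above, exporting and re-importing a small JSON object; the file paths are arbitrary examples.

mU = MarshalUtil(workPath="./CACHE")
obj = {"idList": ["1abc", "2xyz"], "count": 2}
ok = mU.doExport("./CACHE/example-index.json", obj, fmt="json", indent=3)
objR = mU.doImport("./CACHE/example-index.json", fmt="json")
assert ok and objR == obj
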
Example #19
0
    def __rebuildCache(self,
                       targetUrl,
                       mapNameL,
                       outDirPath,
                       rawDirPath,
                       fmt="pickle",
                       useCache=True):
        """Fetch the UniProt selected id mapping resource file and extract
        UniProt Acc to  'mapIndex' mapping. Serialize the mapping as required.

        Args:
            targetUrl (str): source URL of the remote index file
            mapNameL (list): list of key mapping names to extract from the index
            outDirPath (str): directory path for the processed mapping file
            rawDirPath (str): directory path for the raw downloaded mapping file
            fmt (str, optional): output format (pickle|json) . Defaults to "pickle".
            useCache (bool, optional): use cached files. Defaults to True.

        Returns:
            (list, dict): list of extracted mapping names, and dictionary od[uniprotId] = mapped value(s)

                idmapping_selected.tab

                1. UniProtKB-AC
                2. UniProtKB-ID
                3. GeneID (EntrezGene)
                4. RefSeq
                5. GI
                6. PDB
                7. GO
                8. UniRef100
                9. UniRef90
                10. UniRef50
                11. UniParc
                12. PIR
                13. NCBI-taxon
                14. MIM
                15. UniGene
                16. PubMed
                17. EMBL
                18. EMBL-CDS
                19. Ensembl
                20. Ensembl_TRS
                21. Ensembl_PRO
                22. Additional PubMed

        """
        startTime = time.time()
        nL = mapNameL
        oD = {}
        try:
            fileU = FileUtil()
            fExt = "pic" if fmt == "pickle" else "json"
            fExt = "tdd" if fmt == "tdd" else fExt
            fN, _ = os.path.splitext(fileU.getFileName(targetUrl))
            mapFileName = fN + "-map." + fExt
            idMapPath = os.path.join(outDirPath, mapFileName)
            mU = MarshalUtil()
            if useCache and mU.exists(idMapPath):
                logger.info("Reading cached serialized file %r", idMapPath)
                if fmt in ["pickle", "json"]:
                    tD = mU.doImport(idMapPath, fmt=fmt)
                    nL = list(set(tD["idNameList"]))
                    oD = tD["uniprotMapD"]
                    logger.info("keys %r", list(oD.keys())[:10])
                    logger.info("nL %r", nL)
                    ok = True
                elif fmt == "tdd":
                    ioU = IoUtil()
                    it = ioU.deserializeCsvIter(idMapPath,
                                                delimiter="\t",
                                                rowFormat="list",
                                                encodingErrors="ignore")
                    tL = next(it, [])
                    nL = tL[1:]
                    if len(nL) == 1:
                        for row in it:
                            oD[row[0]] = row[1]
                    else:
                        for row in it:
                            oD[row[0]] = row[1:]
                    ok = True
            else:
                idPath = os.path.join(rawDirPath, fileU.getFileName(targetUrl))
                if not fileU.exists(idPath):
                    logger.info(
                        "Fetching selected UniProt idmapping data from %r in %r",
                        targetUrl, outDirPath)
                    ok = fileU.get(targetUrl, idPath)
                    if not ok:
                        logger.error("Failed to downlowd %r", targetUrl)
                        return oD
                else:
                    logger.info("Using cached mapping file %r", idPath)
                # ---
                ioU = IoUtil()
                if fmt in ["pickle", "json"]:
                    if len(mapNameL) == 1:
                        for row in ioU.deserializeCsvIter(
                                idPath,
                                delimiter="\t",
                                rowFormat="list",
                                encodingErrors="ignore"):
                            oD[row[0]] = str(
                                row[self.__mapRecordD[mapNameL[0]] - 1])
                    else:
                        for row in ioU.deserializeCsvIter(
                                idPath,
                                delimiter="\t",
                                rowFormat="list",
                                encodingErrors="ignore"):
                            for mapName in mapNameL:
                                oD.setdefault(row[0], []).append(
                                    str(row[self.__mapRecordD[mapName] - 1]))
                    logger.info("Writing serialized mapping file %r",
                                idMapPath)
                    ok = mU.doExport(idMapPath, {
                        "idNameList": mapNameL,
                        "uniprotMapD": oD
                    },
                                     fmt=fmt)
                elif fmt == "tdd":
                    #
                    logger.info("Writing serialized mapping file %r",
                                idMapPath)
                    fU = FileUtil()
                    fU.mkdirForFile(idMapPath)
                    colNameL = []
                    colNameL.append("UniProtId")
                    colNameL.extend(mapNameL)
                    with open(idMapPath, "w", encoding="utf-8") as ofh:
                        ofh.write("%s\n" % "\t".join(colNameL))
                        if len(mapNameL) == 1:
                            idx = self.__mapRecordD[mapNameL[0]] - 1
                            for row in ioU.deserializeCsvIter(
                                    idPath,
                                    delimiter="\t",
                                    rowFormat="list",
                                    encodingErrors="ignore"):
                                ofh.write("%s\t%s\n" % (row[0], row[idx]))
                        else:
                            idxL = [0]
                            idxL.extend([
                                self.__mapRecordD[mapName] - 1
                                for mapName in mapNameL
                            ])
                            for row in ioU.deserializeCsvIter(
                                    idPath,
                                    delimiter="\t",
                                    rowFormat="list",
                                    encodingErrors="ignore"):
                                ofh.write(
                                    "%s\n" %
                                    "\t".join([str(row[idx]) for idx in idxL]))
                            #
                    nL, oD = self.__rebuildCache(targetUrl,
                                                 mapNameL,
                                                 outDirPath,
                                                 rawDirPath,
                                                 fmt=fmt,
                                                 useCache=True)
                    ok = True if nL and oD else False
            logger.info("Completed reload (%r) at %s (%.4f seconds)", ok,
                        time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                        time.time() - startTime)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        return nL, oD
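
The method above resolves each requested mapping name to a 1-based column position through self.__mapRecordD, which is not shown in this excerpt. The dictionary below is an assumed reconstruction from the idmapping_selected.tab column list in the docstring, included only to illustrate the indexing convention.

# Assumed shape of self.__mapRecordD (1-based positions from the docstring column list).
mapRecordD = {
    "UniProtKB-AC": 1,
    "UniProtKB-ID": 2,
    "GeneID (EntrezGene)": 3,
    "RefSeq": 4,
    "PDB": 6,
    "NCBI-taxon": 13,
    "Ensembl": 19,
}
# __rebuildCache() then selects row[mapRecordD[mapName] - 1] for each requested mapName.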
Example #20
0
class SchemaProvider(SingletonClass):
    """ A collection of schema build and caching methods.

        Static cache workflow:

            <authoritative source>  <--   <cache dir>  <-  client API

        Compute workflow:

        <dependent resource files, config file, dictionaries> -> [schema builder] --> <schema def> --> <Json schema>

    """

    def __init__(self, cfgOb, cachePath, useCache=True, rebuildFlag=False, **kwargs):
        """A collection of schema build and caching methods.

        Args:
            cfgOb (object): ConfigInfo() instance
            cachePath (str): path to directory containing schema
            useCache (bool, optional): use cached schema. Defaults to True.
            rebuildFlag (bool, optional): rebuild the schema on-the-fly and cache it. Defaults to False.
        """

        self.__cfgOb = cfgOb
        self.__configName = self.__cfgOb.getDefaultSectionName()
        self.__cachePath = os.path.abspath(cachePath)
        self.__useCache = useCache
        self.__rebuildFlag = rebuildFlag
        self.__useCache = rebuildFlag if rebuildFlag else useCache
        #
        self.__workPath = os.path.join(self.__cachePath, "work")

        self.__fileU = FileUtil(workPath=os.path.join(self.__cachePath, "work"))
        self.__schemaCachePath = os.path.join(self.__cachePath, self.__cfgOb.get("SCHEMA_DEFINITION_CACHE_DIR", sectionName=self.__configName))
        self.__jsonSchemaCachePath = os.path.join(self.__cachePath, self.__cfgOb.get("JSON_SCHEMA_DEFINITION_CACHE_DIR", sectionName=self.__configName))
        self.__fileU.mkdir(self.__schemaCachePath)
        self.__fileU.mkdir(self.__jsonSchemaCachePath)
        self.__kwargs = kwargs

    def getSchemaOptions(self, schemaLevel, extraOpts=None):
        opts = extraOpts + "|" if extraOpts else ""
        if schemaLevel == "full":
            return opts + "mandatoryKeys|mandatoryAttributes|bounds|enums|rcsb"
        elif schemaLevel in ["min", "minimum"]:
            return opts + "mandatoryKeys|enums|rcsb"
        else:
            return opts

    def getSchemaInfo(self, databaseName, dataTyping="ANY"):
        """Convenience method to return essential schema details for the input repository content type.

        Args:
            databaseName (str): schema name  (e.g. pdbx, bird, chem_comp, ...)
            dataTyping (str, optional): Application name for the target schema (e.g. ANY, SQL, ...)

        Returns:
            tuple: SchemaDefAccess() object, target database name, target collection name list, and document index dictionary

        """
        sd = None
        dbName = None
        collectionNameList = []
        docIndexD = {}
        try:
            mU = MarshalUtil(workPath=self.__workPath)
            schemaLocator = self.__getSchemaDefLocator(databaseName, dataTyping=dataTyping)
            if self.__rebuildFlag:
                filePath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaLocator))
                self.makeSchemaDef(databaseName, dataTyping=dataTyping, saveSchema=True)
            else:
                filePath = self.__reload(schemaLocator, self.__schemaCachePath, useCache=self.__useCache)

            if not filePath:
                logger.error("Unable to recover schema %s (%s)", databaseName, dataTyping)
            logger.debug("ContentType %r dataTyping %r schemaLocator %r", databaseName, dataTyping, schemaLocator)
            schemaDef = mU.doImport(filePath, fmt="json")
            if schemaDef:
                logger.debug("Using cached schema definition for %s application %s", databaseName, dataTyping)
                sd = SchemaDefAccess(schemaDef)
                if sd:
                    dbName = sd.getDatabaseName()
                    collectionInfoList = sd.getCollectionInfo()
                    logger.debug("Schema %s database name %s collections %r", databaseName, dbName, collectionInfoList)
                    for cd in collectionInfoList:
                        collectionName = cd["NAME"]
                        collectionNameList.append(collectionName)
                        docIndexD[collectionName] = sd.getDocumentIndices(collectionName)

        except Exception as e:
            logger.exception("Retreiving schema %s for %s failing with %s", databaseName, dataTyping, str(e))

        return sd, dbName, collectionNameList, docIndexD
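
    # A hedged usage sketch for getSchemaInfo() (not part of the class; construction of the
    # configuration object cfgOb is elided and the database name is an example):
    #
    #   sp = SchemaProvider(cfgOb, cachePath="./CACHE", useCache=True)
    #   sd, dbName, collectionNameList, docIndexD = sp.getSchemaInfo("pdbx_core", dataTyping="ANY")
    #   for collectionName in collectionNameList:
    #       logger.info("collection %s indices %r", collectionName, docIndexD[collectionName])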

    def schemaDefCompare(self, databaseName, dataTyping="ANY"):
        """Compare computed schema defintion with current source/cached version.

        Args:
            databaseName (str): schema definition name for comparison
            dataTyping (str, optional): data type conventions for the schema comparison. Defaults to "ANY".

        Returns:
            (str): file path for schema difference or None
        """
        mU = MarshalUtil(workPath=self.__workPath)
        schemaDiffPath = os.path.join(self.__cachePath, "schema_diff")
        mU.mkdir(schemaDiffPath)
        schemaPath = self.__getSchemaDefLocator(databaseName, dataTyping=dataTyping)
        fn = self.__fileU.getFileName(schemaPath)
        sD = self.makeSchemaDef(databaseName, dataTyping=dataTyping)
        v2 = sD["DATABASE_VERSION"]
        # ----
        # tPath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaPath) + "-test")
        # logger.info("Exporting schema def to %s", tPath)
        # mU.doExport(tPath, sD, fmt="json", indent=3)
        # sD = mU.doImport(tPath, fmt="json")
        # ----
        cPath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaPath))
        sDCache = mU.doImport(cPath, fmt="json")
        v1 = sDCache["DATABASE_VERSION"]
        #
        numDiff, difD = self.schemaCompare(sDCache, sD)
        #
        # jD = diff(sDCache, sD, syntax="explicit", marshal=True)
        diffPath = None
        if numDiff:
            bn, _ = os.path.splitext(fn)
            diffPath = os.path.join(schemaDiffPath, bn + "-" + v1 + "-" + v2 + "-diff.json")
            # logger.info("diff for %s %s = \n%s", databaseName, dataTyping, pprint.pformat(difD, indent=3, width=100))
            mU.doExport(diffPath, difD, fmt="json", indent=3)
        #
        return diffPath

    def jsonSchemaCompare(self, databaseName, collectionName, encodingType, level, extraOpts=None):
        """Compare computed JSON schema defintion with current source/cached version.

        Args:
            databaseName (str): schema name
            collectionName (str): collection name
            encodingType (str): schema data type conventions (JSON|BSON)
            level (str): metadata level (min|full)
            extraOpts (str): extra schema construction options

        Returns:
            (str): path to the difference file or None
        """
        mU = MarshalUtil(workPath=self.__workPath)
        schemaDiffPath = os.path.join(self.__cachePath, "schema_diff")
        mU.mkdir(schemaDiffPath)
        schemaLocator = self.__getJsonSchemaLocator(databaseName, collectionName, encodingType, level)
        fn = self.__fileU.getFileName(schemaLocator)
        schemaPath = os.path.join(self.__jsonSchemaCachePath, fn)
        #
        sD = self.makeSchema(databaseName, collectionName, encodingType=encodingType, level=level, saveSchema=False, extraOpts=extraOpts)
        v2 = self.__getSchemaVersion(sD)
        # ----
        # tPath = os.path.join(self.__jsonSchemaCachePath, self.__fileU.getFileName(schemaPath) + "-test")
        # logger.info("Exporting json schema to %s", tPath)
        # mU.doExport(tPath, sD, fmt="json", indent=3)
        # ----
        #
        sDCache = mU.doImport(schemaPath, fmt="json")
        v1 = self.__getSchemaVersion(sDCache)
        if not v1:
            logger.error("no version for %s - %s %s", schemaLocator, databaseName, collectionName)
        #
        numDiff, difD = self.schemaCompare(sDCache, sD)
        # jD = diff(sDCache, sD, marshal=True, syntax="explicit")
        diffPath = None
        if numDiff:
            logger.debug("diff for %s %s %s %s = \n%s", databaseName, collectionName, encodingType, level, pprint.pformat(difD, indent=3, width=100))
            bn, _ = os.path.splitext(fn)
            diffPath = os.path.join(schemaDiffPath, bn + "-" + v1 + "-" + v2 + "-diff.json")
            mU.doExport(diffPath, difD, fmt="json", indent=3)

        return diffPath

    def __getSchemaVersion(self, jsonSchema):
        try:
            comment = jsonSchema["$comment"] if "$comment" in jsonSchema else ""
            ff = comment.split(":")
            version = ff[1].strip()
            return version
        except Exception as e:
            logger.exception("Failing for with %s", str(e))
        return ""

    def __getSchemaDefLocator(self, databaseName, dataTyping="ANY"):
        """Internal method returning schema definition path for the input content type and application.
           Defines schema definition naming convention -

           Args:
            databaseName (str): schema name (e.g. pdbx, bird, chem_comp, ...)
            dataTyping (str, optional): Application name for the target schema (e.g. ANY, SQL, ...)

            Returns:

             str: schema definition file locator

        """
        schemaLocator = None
        try:
            locPath = self.__cfgOb.get("SCHEMA_DEFINITION_LOCATOR_PATH", sectionName=self.__configName)
            fn = "schema_def-%s-%s.json" % (databaseName, dataTyping.upper())
            schemaLocator = os.path.join(locPath, fn)
        except Exception as e:
            logger.exception("Retreiving schema definition path %s for %s failing with %s", databaseName, dataTyping, str(e))
        return schemaLocator

    def __getJsonSchemaLocator(self, databaseName, collectionName, encodingType="BSON", level="full"):
        """Internal method returning JSON schema path for the input collection data type convention and level.
           Defines the JSON/BSON schema naming convention -

           Args:
            databaseName (str): database name in the document store
            collectionName (str): collection name in document store
            encodingType (str, optional): data type convention (BSON|JSON)
            level (str, optional): Completeness of the schema (e.g. min or full)

            Returns:

            str: schema file locator

        """
        schemaLocator = None
        try:
            sdType = None
            sLevel = None
            schemaLocator = None
            if encodingType.upper() in ["JSON", "BSON"]:
                sdType = encodingType.lower()
            if level.lower() in ["min", "minimun"]:
                sLevel = "min"
            elif level.lower() in ["full"]:
                sLevel = level.lower()
            #
            if sdType and sLevel:
                locPath = self.__cfgOb.get("JSON_SCHEMA_DEFINITION_LOCATOR_PATH", sectionName=self.__configName)
                fn = "%s-%s-db-%s-col-%s.json" % (sdType, sLevel, databaseName, collectionName)
                schemaLocator = os.path.join(locPath, fn)
            else:
                logger.error("Unsupported schema options:  %s level %r type %r", collectionName, level, encodingType)
                schemaLocator = None
        except Exception as e:
            logger.debug("Retreiving JSON schema definition for %s type %s failing with %s", collectionName, encodingType, str(e))
        #
        return schemaLocator

    def __reload(self, locator, dirPath, useCache=True):
        #
        fn = self.__fileU.getFileName(locator)
        filePath = os.path.join(dirPath, fn)
        logger.debug("Target cache filePath %s", filePath)
        self.__fileU.mkdir(dirPath)
        if not useCache:
            try:
                os.remove(filePath)
            except Exception:
                pass
        #
        if useCache and self.__fileU.exists(filePath):
            ok = True
        else:
            logger.info("Fetch data from source %s to %s", locator, filePath)
            ok = self.__fileU.get(locator, filePath)

        return filePath if ok else None

    def getJsonSchema(self, databaseName, collectionName, encodingType="BSON", level="full", extraOpts=None):
        """Return JSON schema (w/ BSON types) object for the input collection and level.and

        Args:
            databaseName (str): database name
            collectionName (str): collection name in document store
            encodingType (str, optional): data type convention (BSON|JSON)
            level (str, optional): Completeness of the schema (e.g. min or full)

        Returns:
            dict: Schema object

        """
        sObj = None
        schemaLocator = self.__getJsonSchemaLocator(databaseName, collectionName, encodingType=encodingType, level=level)
        #
        if self.__rebuildFlag:
            # Rebuild the schema, save it into the JSON schema cache, and import it from there
            filePath = os.path.join(self.__jsonSchemaCachePath, self.__fileU.getFileName(schemaLocator))
            self.makeSchema(databaseName, collectionName, encodingType=encodingType, level=level, saveSchema=True, extraOpts=extraOpts)
        else:
            filePath = self.__reload(schemaLocator, self.__jsonSchemaCachePath, useCache=self.__useCache)
        mU = MarshalUtil(workPath=self.__workPath)
        if filePath and mU.exists(filePath):
            sObj = mU.doImport(filePath, fmt="json")
        else:
            logger.debug("Failed to read schema for %s %r", collectionName, level)
        return sObj

    def makeSchema(self, databaseName, collectionName, encodingType="BSON", level="full", saveSchema=False, extraOpts=None):
        """Build the JSON/BSON schema object for the input collection, optionally saving it in the JSON schema cache."""
        cD = None
        try:
            smb = SchemaDefBuild(databaseName, self.__cfgOb, cachePath=self.__cachePath)
            stU = encodingType.upper()
            cD = smb.build(collectionName, dataTyping=stU, encodingType=stU, enforceOpts=self.getSchemaOptions(level, extraOpts=extraOpts))
            if cD and saveSchema:
                schemaLocator = self.__getJsonSchemaLocator(databaseName, collectionName, encodingType=encodingType, level=level)
                localPath = os.path.join(self.__jsonSchemaCachePath, self.__fileU.getFileName(schemaLocator))
                mU = MarshalUtil(workPath=self.__workPath)
                mU.doExport(localPath, cD, fmt="json", indent=3, enforceAscii=False)
        except Exception as e:
            logger.exception("Building schema %s collection %s failing with %s", databaseName, collectionName, str(e))
        return cD

    def makeSchemaDef(self, databaseName, dataTyping="ANY", saveSchema=False):
        """Build the schema definition object for the input database, optionally saving it in the schema definition cache."""
        schemaDef = None
        try:
            smb = SchemaDefBuild(databaseName, self.__cfgOb, cachePath=self.__cachePath)
            schemaDef = smb.build(dataTyping=dataTyping, encodingType="rcsb")
            if schemaDef and saveSchema:
                schemaLocator = self.__getSchemaDefLocator(databaseName, dataTyping=dataTyping)
                localPath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaLocator))
                mU = MarshalUtil(workPath=self.__workPath)
                mU.doExport(localPath, schemaDef, fmt="json", indent=3, enforceAscii=False)
        except Exception as e:
            logger.exception("Building schema %s failing with %s", databaseName, str(e))
        return schemaDef

    def schemaCompare(self, orgD, newD):
        """Compute the difference of nested dictionaries.

        Returns:
            (int, dict): difference count and a dictionary of added, removed, and changed items
        """
        fOrgD = self.__flatten(orgD)
        fNewD = self.__flatten(newD)
        if len(fOrgD) != len(fNewD):
            logger.debug("Schema lengths differ: org %d new %d", len(fOrgD), len(fNewD))
        #
        addedD = {k: fNewD[k] for k in set(fNewD) - set(fOrgD)}
        removedD = {k: fOrgD[k] for k in set(fOrgD) - set(fNewD)}
        changedOrgD = {k: fOrgD[k] for k in set(fOrgD) & set(fNewD) if fOrgD[k] != fNewD[k]}
        changedNewD = {k: fNewD[k] for k in set(fOrgD) & set(fNewD) if fOrgD[k] != fNewD[k]}
        chD = {}
        for ky in changedOrgD:
            kyS = ".".join(ky)
            vOrg = changedOrgD[ky]
            vNew = changedNewD[ky]
            if isinstance(vOrg, (list, tuple)) and isinstance(vNew, (list, tuple)):
                # logger.info(" >> %r vOrg %r vNew %r", ky, vOrg, vNew)
                dV = list(set(vNew) - set(vOrg))
                if dV:
                    chD[kyS] = {"diff": dV}
            else:
                chD[kyS] = {"from": vOrg, "to": vNew}
        #
        nT = len(addedD) + len(removedD) + len(chD)
        diffD = {"added": [".".join(kk) for kk in addedD.keys()], "removed": [".".join(kk) for kk in removedD.keys()], "changed": chD}
        return nT, diffD
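    # Illustrative behavior of schemaCompare() on small hypothetical inputs (not from the source data):
    #   schemaCompare({"a": 1, "b": {"c": 2}}, {"a": 1, "b": {"c": 3}, "d": 4})
    #   returns (2, {"added": ["d"], "removed": [], "changed": {"b.c": {"from": 2, "to": 3}}})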

    def __flatten(self, inpDict, prefix=None):
        """Flatten a nested dictionary into a dictionary keyed by tuples of the nested key path."""
        prefix = prefix[:] if prefix else []
        outDict = {}
        for key, value in inpDict.items():
            if isinstance(value, dict) and value:
                deeper = self.__flatten(value, prefix + [key])
                outDict.update({tuple(key2): val2 for key2, val2 in deeper.items()})
            elif isinstance(value, (list, tuple)) and value:
                for index, sublist in enumerate(value, start=1):
                    if isinstance(sublist, dict) and sublist:
                        deeper = self.__flatten(sublist, prefix + [key] + [str(index)])
                        outDict.update({tuple(key2): val2 for key2, val2 in deeper.items()})
                    else:
                        outDict[tuple(prefix + [key] + [str(index)])] = value
            else:
                outDict[tuple(prefix + [key])] = value
        return outDict
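    # Illustrative behavior of __flatten() on a hypothetical input: {"a": {"b": 1}, "c": [5, 6]} flattens to
    #   {("a", "b"): 1, ("c", "1"): [5, 6], ("c", "2"): [5, 6]} -- list elements are keyed by their 1-based
    #   index while the full list value is stored at each index key.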

    def __flattenX(self, inpDict, prefix=None):
        """Variant of __flatten() that descends into list values only (tuples are treated as terminal values)."""
        prefix = prefix[:] if prefix else []
        # separator = "."
        outDict = {}
        for key, value in inpDict.items():
            if isinstance(value, dict) and value:
                deeper = self.__flatten(value, prefix + [key])
                outDict.update({tuple(key2): val2 for key2, val2 in deeper.items()})
            elif isinstance(value, list) and value:
                for index, sublist in enumerate(value, start=1):
                    if isinstance(sublist, dict) and sublist:
                        deeper = self.__flatten(sublist, prefix + [key] + [str(index)])
                        outDict.update({tuple(key2): val2 for key2, val2 in deeper.items()})
                    else:
                        outDict[tuple(prefix + [key] + [str(index)])] = value
            else:
                outDict[tuple(prefix + [key])] = value
        return outDict

    def __flattenOrg(self, inpDict, separator=".", prefix=""):
        """Flatten a nested dictionary into a dictionary keyed by separator-joined key path strings."""
        outDict = {}
        for key, value in inpDict.items():
            if isinstance(value, dict) and value:
                deeper = self.__flattenOrg(value, separator, prefix + key + separator)
                outDict.update({key2: val2 for key2, val2 in deeper.items()})
            elif isinstance(value, list) and value:
                for index, sublist in enumerate(value, start=1):
                    if isinstance(sublist, dict) and sublist:
                        deeper = self.__flattenOrg(sublist, separator, prefix + key + separator + str(index) + separator)
                        outDict.update({key2: val2 for key2, val2 in deeper.items()})
                    else:
                        outDict[prefix + key + separator + str(index)] = value
            else:
                outDict[prefix + key] = value
        return outDict

    def __dictGen(self, indict, pre=None):
        """Generator that walks a nested dictionary and yields [key path ..., value] lists for terminal values (non-dict inputs are yielded unchanged)."""
        pre = pre[:] if pre else []
        if isinstance(indict, dict):
            for key, value in indict.items():
                if isinstance(value, dict):
                    for dD in self.__dictGen(value, pre + [key]):
                        yield dD
                elif isinstance(value, (list, tuple)):
                    for v in value:
                        for dD in self.__dictGen(v, pre + [key]):
                            yield dD
                else:
                    yield pre + [key, value]
        else:
            yield indict
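
# --- Illustrative sketch (not part of the class above) ---
# The schemaCompare()/__flatten() methods above reduce nested schema documents to flat,
# tuple-keyed dictionaries and then set-difference the keys. A minimal self-contained
# version of the same idea is sketched below; all names are hypothetical and this is
# not the class's own implementation.


def _flatten(d, prefix=()):
    # Recursively flatten a nested dict into {key-path tuple: terminal value}
    out = {}
    for k, v in d.items():
        if isinstance(v, dict) and v:
            out.update(_flatten(v, prefix + (k,)))
        else:
            out[prefix + (k,)] = v
    return out


def _compare(orgD, newD):
    # Set-difference the flattened key paths to classify added/removed/changed items
    fOrg, fNew = _flatten(orgD), _flatten(newD)
    added = sorted(".".join(k) for k in set(fNew) - set(fOrg))
    removed = sorted(".".join(k) for k in set(fOrg) - set(fNew))
    changed = {".".join(k): {"from": fOrg[k], "to": fNew[k]} for k in set(fOrg) & set(fNew) if fOrg[k] != fNew[k]}
    return len(added) + len(removed) + len(changed), {"added": added, "removed": removed, "changed": changed}


if __name__ == "__main__":
    oldD = {"properties": {"id": {"type": "string"}}}
    newD = {"properties": {"id": {"type": "string"}, "title": {"type": "string"}}}
    print(_compare(oldD, newD))
    # -> (1, {'added': ['properties.title.type'], 'removed': [], 'changed': {}})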
Example #21
0
class ProvenanceProvider(SingletonClass):
    """Utilities to access and update provenance details."""
    def __init__(self, cfgOb, cachePath, useCache=True, **kwargs):
        """Utilities to access and update provenance details.

        Args:
            cfgOb (obj): ConfigInfo() instance
            cachePath (str): path to the top-level cache directory
            useCache (bool, optional): use cached provenance data. Defaults to True.
        """

        self.__cfgOb = cfgOb
        self.__configName = self.__cfgOb.getDefaultSectionName()
        self.__cachePath = cachePath
        self.__useCache = useCache
        #
        self.__workPath = os.path.join(self.__cachePath, "work")
        self.__provenanceCachePath = os.path.join(
            self.__cachePath,
            self.__cfgOb.get("PROVENANCE_INFO_CACHE_DIR",
                             sectionName=self.__configName))
        self.__provenanceLocator = self.__cfgOb.getPath(
            "PROVENANCE_INFO_LOCATOR", sectionName=self.__configName)
        #
        self.__fileU = FileUtil(workPath=self.__workPath)
        self.__fileU.mkdir(self.__provenanceCachePath)
        self.__kwargs = kwargs
        #

    def __reload(self, locator, dirPath, useCache=True):
        #
        fn = self.__fileU.getFileName(locator)
        filePath = os.path.join(dirPath, fn)
        logger.debug("Using cache path %s", dirPath)
        self.__fileU.mkdir(dirPath)
        if not useCache:
            try:
                os.remove(filePath)
            except Exception:
                pass
        #
        if useCache and self.__fileU.exists(filePath):
            ok = True
        else:
            logger.debug("Fetch data from source %s", locator)
            ok = self.__fileU.get(locator, filePath)

        return filePath if ok else None

    def fetch(self):
        try:
            provenanceFileCachePath = self.__reload(self.__provenanceLocator,
                                                    self.__provenanceCachePath,
                                                    useCache=self.__useCache)
            mU = MarshalUtil(workPath=self.__workPath)
            return mU.doImport(provenanceFileCachePath, fmt="json")
        except Exception as e:
            logger.exception("Failed retreiving provenance with %s", str(e))
        return {}

    def update(self, provD):
        ok = False
        try:
            provenanceFileCachePath = self.__reload(self.__provenanceLocator,
                                                    self.__provenanceCachePath,
                                                    useCache=self.__useCache)
            mU = MarshalUtil(workPath=self.__workPath)
            tD = mU.doImport(provenanceFileCachePath, fmt="json")
            tD.update(provD)
            ok = mU.doExport(provenanceFileCachePath, tD, fmt="json")
        except Exception as e:
            logger.exception("Failed updating provenance with %s", str(e))
        return ok

    def store(self, provD):
        ok = False
        try:
            provenanceFileCachePath = self.__reload(self.__provenanceLocator,
                                                    self.__provenanceCachePath,
                                                    useCache=self.__useCache)
            mU = MarshalUtil(workPath=self.__workPath)
            ok = mU.doExport(provenanceFileCachePath, provD, fmt="json")
        except Exception as e:
            logger.exception("Failed storing provenance with %s", str(e))
        return ok
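
# --- Illustrative usage sketch (assumptions noted below) ---
# ProvenanceProvider only needs a configuration object that answers getDefaultSectionName(),
# get("PROVENANCE_INFO_CACHE_DIR", ...) and getPath("PROVENANCE_INFO_LOCATOR", ...).
# The stub and the file paths below are hypothetical placeholders; a real deployment would
# pass the project's ConfigInfo()-style configuration object instead.


class _StubConfig:
    def getDefaultSectionName(self):
        return "site_info"

    def get(self, itemName, sectionName=None):
        # Relative directory (under cachePath) holding cached provenance data
        return {"PROVENANCE_INFO_CACHE_DIR": "provenance"}[itemName]

    def getPath(self, itemName, sectionName=None):
        # Source locator for the provenance JSON document (placeholder path)
        return {"PROVENANCE_INFO_LOCATOR": "./reference/provenance.json"}[itemName]


pP = ProvenanceProvider(_StubConfig(), cachePath="./CACHE", useCache=True)
provD = pP.fetch()                                                # import the cached provenance dictionary ({} if unavailable)
ok = pP.update({"example_key": {"note": "hypothetical record"}})  # merge into the cached dictionary and re-export it
ok = pP.store(provD)                                              # overwrite the cached dictionary with provD
# Note: with the placeholder locator above these calls will simply log errors and return
# falsy values until a real provenance source is configured.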