Example #1
 def __reload(self, dirPath, baseVersion, useCache, **kwargs):
     startTime = time.time()
     mU = MarshalUtil(workPath=dirPath)
     chemblDbUrl = kwargs.get(
         "ChEMBLDbUrl",
         "ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/latest/")
     ok = False
     fU = FileUtil()
     fU.mkdir(dirPath)
     #
     # ChEMBL current version <baseVersion>,...
     # template:  chembl_<baseVersion>.fa.gz
     #
     targetFileName = "chembl_" + str(baseVersion) + ".fa.gz"
     mappingFileName = "chembl_uniprot_mapping.txt"
     #
     chemblTargetPath = os.path.join(dirPath, targetFileName)
     chemblMappingPath = os.path.join(dirPath, mappingFileName)
     mappingFilePath = os.path.join(dirPath, "chembl_uniprot_mapping.json")
     #
     mapD = {}
     if useCache and fU.exists(mappingFilePath):
         logger.info("useCache %r using %r and %r and %r", useCache,
                     chemblTargetPath, chemblMappingPath, mappingFilePath)
         mapD = mU.doImport(mappingFilePath, fmt="json")
     else:
         # Get the ChEMBL UniProt mapping file
         url = os.path.join(chemblDbUrl, mappingFileName)
         ok = fU.get(url, chemblMappingPath)
         logger.info("Fetched %r url %s path %s", ok, url,
                     chemblMappingPath)
         logger.info("Reading ChEMBL mapping file path %s", mappingFilePath)
         rowL = mU.doImport(chemblMappingPath, fmt="tdd", rowFormat="list")
         for row in rowL:
             mapD[row[0]] = (row[1], row[2], row[3])
         ok = mU.doExport(mappingFilePath, mapD, fmt="json")
         logger.info("Processed mapping path %s (%d) %r", mappingFilePath,
                     len(mapD), ok)
         #
         # Get the target FASTA files --
         for vers in range(baseVersion, baseVersion + 10):
             logger.info("Now fetching version %r", vers)
             self.__version = vers
             targetFileName = "chembl_" + str(vers) + ".fa.gz"
             chemblTargetPath = os.path.join(dirPath,
                                             "chembl_targets_raw.fa.gz")
             url = os.path.join(chemblDbUrl, targetFileName)
             ok = fU.get(url, chemblTargetPath)
             logger.info("Fetched %r url %s path %s", ok, url,
                         chemblTargetPath)
             if ok:
                 break
     #
     logger.info("Completed reload at %s (%.4f seconds)",
                 time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                 time.time() - startTime)
     #
     return mapD
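
The snippet above probes successive ChEMBL release numbers until a FASTA download succeeds. Below is a minimal standard-library sketch of the same probing pattern; the base URL comes from the snippet, while the function name, output directory, and starting version are illustrative placeholders rather than part of the original code.

import os
import urllib.request


def fetch_latest_chembl_fasta(dirPath, baseVersion, maxTries=10):
    # Try chembl_<vers>.fa.gz for successive release numbers until a download succeeds.
    baseUrl = "ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/latest/"
    os.makedirs(dirPath, exist_ok=True)
    targetPath = os.path.join(dirPath, "chembl_targets_raw.fa.gz")
    for vers in range(baseVersion, baseVersion + maxTries):
        url = baseUrl + "chembl_%d.fa.gz" % vers
        try:
            urllib.request.urlretrieve(url, targetPath)
            return vers, targetPath
        except Exception:
            continue  # this release is not published; try the next version number
    return None, None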
Example #2
    def pushBundle(self, gitRepositoryPath, accessToken, gitHost="github.com", gitBranch="master", remoteStashPrefix="A", maxSizeMB=95):
        """Push bundle to remote stash git repository.

        Args:
            gitRepositoryPath (str): git repository path (e.g., rcsb/py-rcsb_exdb_assets_stash)
            accessToken (str): git repository access token
            gitHost (str, optional): git repository host name. Defaults to github.com.
            gitBranch (str, optional): git branch name. Defaults to master.
            remoteStashPrefix (str, optional): optional label prepended to the stashed dependency bundle artifact (default='A')
            maxSizeMB (int, optional): maximum stash bundle file size that will be committed. Defaults to 95MB.

        Returns:
          bool:  True for success or False otherwise

        """
        try:
            ok = False
            gU = GitUtil(token=accessToken, repositoryHost=gitHost)
            fU = FileUtil()
            localRepositoryPath = os.path.join(self.__localBundlePath, "stash_repository")
            fn = self.__makeBundleFileName(self.__baseBundleFileName, remoteStashPrefix=remoteStashPrefix)
            #
            # Update existing local repository, otherwise clone a new copy
            if fU.exists(localRepositoryPath):
                ok = gU.pull(localRepositoryPath, branch=gitBranch)
                logger.debug("After pull status %r", gU.status(localRepositoryPath))
            else:
                ok = gU.clone(gitRepositoryPath, localRepositoryPath, branch=gitBranch)
            #
            # Split all bundles
            mbSize = float(fU.size(self.__localStashTarFilePath)) / 1000000.0
            logger.info("Splitting bundle %r (%.3f MB/Max %d MB)", fn, mbSize, maxSizeMB)
            sj = SplitJoin()
            splitDirPath = os.path.join(localRepositoryPath, "stash", fn[:-7])
            sj.split(self.__localStashTarFilePath, splitDirPath, maxSizeMB=maxSizeMB)
            fU.remove(self.__localStashTarFilePath)
            # else:
            # fU.put(self.__localStashTarFilePath, os.path.join(localRepositoryPath, "stash", fn))

            ok = gU.addAll(localRepositoryPath, branch=gitBranch)
            ok = gU.commit(localRepositoryPath, branch=gitBranch)
            logger.debug("After commit status %r", gU.status(localRepositoryPath))
            #
            if accessToken:
                ok = gU.push(localRepositoryPath, branch=gitBranch)
                logger.info("After push status %r", gU.status(localRepositoryPath))
            #
            return ok
        except Exception as e:
            logger.exception("For %r %r failing with %s", gitHost, gitRepositoryPath, str(e))
        return False
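
Before committing, the bundle is split with SplitJoin() into parts that stay under the git file-size limit (maxSizeMB). The standalone sketch below illustrates that splitting step only; it is not the SplitJoin implementation, and the part-naming scheme is an assumption.

import os


def split_file(inputPath, splitDirPath, maxSizeMB=95):
    # Write inputPath out as sequential binary parts no larger than maxSizeMB each.
    os.makedirs(splitDirPath, exist_ok=True)
    chunkSize = maxSizeMB * 1000 * 1000
    partPaths = []
    with open(inputPath, "rb") as ifh:
        for ii, chunk in enumerate(iter(lambda: ifh.read(chunkSize), b""), start=1):
            partPath = os.path.join(splitDirPath, "part_%d" % ii)
            with open(partPath, "wb") as ofh:
                ofh.write(chunk)
            partPaths.append(partPath)
    return partPaths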
Example #3
 def __fetchUrl(self, urlTarget, dirPath, useCache=False):
     fU = FileUtil()
     fn = fU.getFileName(urlTarget)
     filePath = os.path.join(dirPath, fn)
     if not (useCache and fU.exists(filePath)):
         startTime = time.time()
         ok2 = fU.get(urlTarget, filePath)
         endTime = time.time()
         if ok2:
             logger.info("Fetched %s for resource file %s (status = %r) (%.4f seconds)", urlTarget, filePath, ok2, endTime - startTime)
         else:
             logger.error("Failing fetch for %s for resource file %s (status = %r) (%.4f seconds)", urlTarget, filePath, ok2, endTime - startTime)
     #
     return filePath
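
The fetch above is skipped when a cached copy exists and useCache is set, and the elapsed time is logged either way. A minimal standard-library sketch of the same cache-aware fetch with timing follows; the function name and logger configuration are illustrative.

import logging
import os
import time
import urllib.request

logger = logging.getLogger(__name__)


def fetch_url(urlTarget, dirPath, useCache=False):
    # Reuse a cached copy when allowed; otherwise download and log the elapsed time.
    os.makedirs(dirPath, exist_ok=True)
    filePath = os.path.join(dirPath, os.path.basename(urlTarget))
    if not (useCache and os.path.exists(filePath)):
        startTime = time.time()
        urllib.request.urlretrieve(urlTarget, filePath)
        logger.info("Fetched %s to %s (%.4f seconds)", urlTarget, filePath, time.time() - startTime)
    return filePath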
Example #4
 def __reload(self, dirPath, useCache):
     startTime = time.time()
     aD = {}
     fU = FileUtil()
     fU.mkdir(dirPath)
     targetMechanismFilePath = self.getTargetMechanismDataPath()
     #
     if useCache and fU.exists(targetMechanismFilePath):
         logger.info("useCache %r using %r", useCache,
                     targetMechanismFilePath)
         qD = self.__mU.doImport(targetMechanismFilePath, fmt="json")
         aD = qD["mechanism"] if "mechanism" in qD else {}
     #
     logger.info("Completed reload of (%d) at %s (%.4f seconds)", len(aD),
                 time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                 time.time() - startTime)
     #
     return aD
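
This reload is cache-only: when a cached JSON payload exists, it is read and the "mechanism" section extracted. A small standard-library sketch of that pattern, with an illustrative function name:

import json
import os


def load_mechanism_data(targetMechanismFilePath):
    # Return the "mechanism" section of the cached JSON payload, or an empty dict if absent.
    if not os.path.exists(targetMechanismFilePath):
        return {}
    with open(targetMechanismFilePath, "r", encoding="utf-8") as ifh:
        qD = json.load(ifh)
    return qD.get("mechanism", {})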
Example #5
    def fetchBundle(self, localRestoreDirPath, url, remoteDirPath, remoteStashPrefix="A", userName=None, password=None):
        """Restore bundled dependencies from remote storage and unbundle these in the
           current local cache directory.

        Args:
            localRestoreDirPath (str): local restore path
            url (str): remote URL
            remoteDirPath (str): remote directory path on the remote resource
            remoteStashPrefix (str, optional): optional label prepended to the stashed dependency bundle artifact (default='A')
            userName (str, optional): optional access information. Defaults to None.
            password (str, optional): optional access information. Defaults to None.
        """
        try:
            ok = False
            fileU = FileUtil()
            fn = self.__makeBundleFileName(self.__baseBundleFileName, remoteStashPrefix=remoteStashPrefix)
            if not url:
                remotePath = os.path.join(remoteDirPath, fn)
                if fileU.exists(remotePath):
                    ok = fileU.get(remotePath, self.__localStashTarFilePath)
                else:
                    ok = False
                    logger.warning("Missing bundle file %r", remotePath)

            elif url and (url.startswith("http://") or url.startswith("https://")):
                remotePath = url + os.path.join("/", remoteDirPath, fn)
                ok = fileU.get(remotePath, self.__localStashTarFilePath)

            elif url and url.startswith("sftp://"):
                sftpU = SftpUtil()
                ok = sftpU.connect(url[7:], userName, pw=password, port=22)
                if ok:
                    remotePath = os.path.join(remoteDirPath, fn)
                    ok = sftpU.get(remotePath, self.__localStashTarFilePath)
            else:
                logger.error("Unsupported protocol %r", url)
            if ok:
                ok = fileU.unbundleTarfile(self.__localStashTarFilePath, dirPath=localRestoreDirPath)
            return ok
        except Exception as e:
            logger.exception("For %r %r Failing with %s", url, remoteDirPath, str(e))
            ok = False
        return ok
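
fetchBundle() dispatches on the URL scheme: a bare remote directory path, http(s), or sftp. The sketch below shows only that scheme dispatch using urllib.parse; the returned labels are placeholders standing in for the FileUtil.get() and SftpUtil.get() calls above.

import os
from urllib.parse import urlparse


def resolve_fetch_scheme(url, remoteDirPath, fn):
    # Decide how a stash bundle would be fetched, mirroring the branching in fetchBundle().
    if not url:
        return "local", os.path.join(remoteDirPath, fn)
    scheme = urlparse(url).scheme
    if scheme in ("http", "https"):
        return "http", url.rstrip("/") + "/" + remoteDirPath.strip("/") + "/" + fn
    if scheme == "sftp":
        return "sftp", os.path.join(remoteDirPath, fn)
    return "unsupported", None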
Example #6
 def __fetchFromSource(self, urlTarget):
     """Fetch the classification names and domain assignments from the ECOD repo."""
     fU = FileUtil()
     fn = fU.getFileName(urlTarget)
     fp = os.path.join(self.__dirPath, fn)
     if not fU.exists(fp):
         fU.get(urlTarget, fp)
     #
     with open(fp, "r", encoding="utf-8") as ifh:
         line = ifh.readline()
         line = ifh.readline()
         line = ifh.readline()
         ff = line[:-1].split()
         self.__version = ff[-1]
     #
     nmL = self.__mU.doImport(fp, fmt="list", uncomment=True)
     fU.remove(fp)
     #
     return nmL
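
The ECOD release version is recovered from the third header line of the downloaded file, taking the last whitespace-separated token. A short sketch of that header parsing, assuming the file keeps this layout:

def read_ecod_version(filePath):
    # The version string is the last token on the third line of the ECOD header block.
    with open(filePath, "r", encoding="utf-8") as ifh:
        for _ in range(2):
            ifh.readline()
        thirdLine = ifh.readline()
    return thirdLine.rstrip("\n").split()[-1]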
Example #7
    def __processAppendedSections(self,
                                  appendConfigOption,
                                  cachePath,
                                  useCache=True):
        """Fetch and append configuration assets assigned to input configuration option.

        Args:
            appendConfigOption (str): reserved configuration option to hold a list of configuration asset locators
            cachePath (str): path to store cached copies of configuration assets
            useCache (bool, optional): use existing cached configuration assets. Defaults to True.

        Returns:
            bool: True for success or False otherwise
        """
        try:
            ret = True
            appendLocL = []  # ensure this is defined for the error log in the fallback path below
            appendLocL = self.getList(appendConfigOption,
                                      sectionName=self.__defaultSectionName)
            logger.debug("appendLocL is %r", appendLocL)
            if appendLocL:
                cP = os.path.join(cachePath, "config")
                fU = FileUtil(workPath=cP)
                logger.debug("Fetching append sections from %r", appendLocL)
                for appendLoc in appendLocL:
                    fn = fU.getFileName(appendLoc)
                    fp = os.path.join(cP, fn)
                    okF = True
                    if not (useCache and fU.exists(fp)):
                        # get a fresh copy from source
                        okF = fU.get(appendLoc, fp)
                        logger.debug("Fetched %r to %r", appendLoc, fp)
                    ok = self.appendConfig(fp)
                    ret = ret and ok and okF
        except Exception as e:
            logger.exception("Failing for option %r cachePath %r with %s",
                             appendConfigOption, cachePath, str(e))
            ret = False
        #
        if not ret:
            logger.error("Fetching appended sections failing %r", appendLocL)

        return ret
Example #8
    def __reload(self, urlTarget, dirPath, useCache=True):
        """Reload local cache of mapping resources to support validation report reader and translator.

        Args:
            urlTarget (str): URL for the schema mapping file
            dirPath (str): path to the directory containing cache files
            useCache (bool, optional): flag to use cached files. Defaults to True.

        Returns:
            (dict): schema mapping dictionary
        """
        mapD = {}
        #
        mU = MarshalUtil()
        fU = FileUtil()
        fn = fU.getFileName(urlTarget)
        mappingFilePath = os.path.join(dirPath, fn)
        mU.mkdir(dirPath)
        #
        # if not useCache:
        #     for fp in [mappingFilePath]:
        #         try:
        #             os.remove(fp)
        #         except Exception:
        #             pass
        # #
        logger.debug("Loading validation mapping data in %s (useCache %r)", fn,
                     useCache)
        if useCache and fU.exists(mappingFilePath):
            mapD = mU.doImport(mappingFilePath, fmt="json")
        else:
            logger.info("Fetching url %s to resource file %s", urlTarget,
                        mappingFilePath)
            tS = uuid.uuid4().hex
            tP = os.path.join(dirPath, "._" + tS)
            ok = fU.get(urlTarget, tP)
            if ok:
                mapD = mU.doImport(tP, fmt="json")
                os.replace(tP, mappingFilePath)
        return mapD
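
When the mapping file is not cached, it is downloaded under a temporary dotted name and only moved into place with os.replace() after a successful parse, so a failed or partial download never clobbers the existing cache. A standard-library sketch of that atomic-update pattern (the function name is illustrative):

import json
import os
import urllib.request
import uuid


def refresh_json_cache(urlTarget, mappingFilePath):
    # Download to a temporary name, validate by parsing, then atomically replace the cache file.
    dirPath = os.path.dirname(mappingFilePath)
    tP = os.path.join(dirPath, "._" + uuid.uuid4().hex)
    urllib.request.urlretrieve(urlTarget, tP)
    with open(tP, "r", encoding="utf-8") as ifh:
        mapD = json.load(ifh)
    os.replace(tP, mappingFilePath)
    return mapD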
Example #9
    def get(self, remotePath, localPath):
        """Get a file from a remote FTP server.

        Arguments:
            remotePath (str): remote file path
            localPath (str): local file path

        Returns:
            bool: True for success or False otherwise
        """
        try:
            fileU = FileUtil()
            fileU.mkdirForFile(localPath)
            # If provided localPath already exists and is a directory, retrieve the file using the name on the remote server
            # to avoid unintentionally overwriting an entire local directory with a single retrieved file
            if (os.path.exists(localPath) and os.path.isdir(localPath)):
                remoteFileName = FileUtil().getFileName(remotePath)
                localFilePath = os.path.join(localPath, remoteFileName)
            else:
                localFilePath = localPath
            with open(localFilePath, 'wb') as lFP:
                self.__ftpClient.retrbinary('RETR %s' % remotePath, lFP.write)
            ok = fileU.exists(localFilePath)
            if ok:
                return True
            else:
                logger.error("get failing for remotePath %s localFilePath %s",
                             remotePath, localFilePath)
                return False
        except Exception as e:
            if self.__raiseExceptions:
                raise e
            else:
                logger.error(
                    "get failing for remotePath %s localPath %s with %s",
                    remotePath, localPath, str(e))
                return False
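
The get() method above wraps an already-connected ftplib client (self.__ftpClient) and streams the remote file to disk with retrbinary(). A self-contained sketch of the same retrieval using ftplib directly is shown below; the host, credentials, and paths are placeholders.

import os
from ftplib import FTP


def ftp_get(host, remotePath, localPath, user="anonymous", password=""):
    # Connect, retrieve remotePath in binary mode, and write it to localPath.
    os.makedirs(os.path.dirname(localPath) or ".", exist_ok=True)
    with FTP(host) as ftp:
        ftp.login(user=user, passwd=password)
        with open(localPath, "wb") as ofh:
            ftp.retrbinary("RETR %s" % remotePath, ofh.write)
    return os.path.exists(localPath)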
Example #10
 def __reload(self, dirPath, useCache):
     startTime = time.time()
     aD = {}
     allIdD = {}
     fU = FileUtil()
     fU.mkdir(dirPath)
     targetActivityFilePath = self.getTargetActivityDataPath()
     #
     if useCache and fU.exists(targetActivityFilePath):
         logger.info("useCache %r using %r", useCache, targetActivityFilePath)
         qD = self.__mU.doImport(targetActivityFilePath, fmt="json")
         aD = qD["activity"] if "activity" in qD else {}
         idL = qD["all_ids"] if "all_ids" in qD else []
         allIdD = {k: k in aD for k in idL}
     #
     logger.info(
         "Completed reload (%d activities) (%d tried identifiers) at %s (%.4f seconds)",
         len(aD),
         len(allIdD),
         time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
         time.time() - startTime,
     )
     #
     return aD, allIdD
Example #11
    def __reload(self, urlTarget, dirPath, useCache=True):
        """ Reload input GO OBO ontology file and return a nx graph object.
'
        Returns:
            dictionary[goId] = {'name_list': ... , 'id_list': ... 'depth_list': ... }
        """
        goGraph = None
        #
        # mU = MarshalUtil()
        fU = FileUtil()
        fn = fU.getFileName(urlTarget)
        oboFilePath = os.path.join(dirPath, fn)
        fU.mkdir(dirPath)
        #
        if not useCache:
            for fp in [oboFilePath]:
                try:
                    os.remove(fp)
                except Exception:
                    pass
        #
        if useCache and fU.exists(oboFilePath):
            goGraph = obonet.read_obo(oboFilePath)
        else:
            logger.info("Fetching url %s to resource file %s", urlTarget,
                        oboFilePath)
            ok = fU.get(urlTarget, oboFilePath)
            if ok:
                goGraph = obonet.read_obo(oboFilePath)
        if goGraph:
            logger.info("Reading %d nodes and %d edges", len(goGraph),
                        goGraph.number_of_edges())
        else:
            logger.info("Go graph construction failing")
        #
        return goGraph
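
obonet.read_obo() builds a networkx graph keyed by GO identifiers, either from the cached OBO file or from a fresh download. A hedged usage sketch follows; the go-basic.obo URL and the GO identifier used are assumptions about the public GO release, not values taken from the snippet.

import obonet

# Assumed public GO OBO location; substitute the urlTarget actually passed to __reload().
url = "http://purl.obolibrary.org/obo/go/go-basic.obo"
goGraph = obonet.read_obo(url)
print("nodes %d edges %d" % (len(goGraph), goGraph.number_of_edges()))
# Node attributes include a human-readable name (GO:0008150 is the biological_process root).
print(goGraph.nodes["GO:0008150"]["name"])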
Example #12
class IoUtil(object):
    def __init__(self, **kwargs):
        self.__fileU = FileUtil(**kwargs)

    def serialize(self, filePath, myObj, fmt="pickle", **kwargs):
        """Public method to serialize format appropriate objects

        Args:
            filePath (str): local file path
            myObj (object): format appropriate object to be serialized
            fmt (str, optional): one of ['mmcif', 'mmcif-dict', 'json', 'list', 'text-dump', 'fasta', 'csv', 'pickle' (default)]
            **kwargs: additional keyword arguments passed to worker methods

        Returns:
            bool: status of serialization operation; true for success or false otherwise

        """
        ret = False
        fmt = str(fmt).lower()
        ret = self.__fileU.mkdirForFile(filePath)
        if not ret:
            return ret
        if fmt in ["mmcif"]:
            ret = self.__serializeMmCif(filePath, myObj, **kwargs)
        elif fmt in ["json"]:
            ret = self.__serializeJson(filePath, myObj, **kwargs)
        elif fmt in ["pickle"]:
            ret = self.__serializePickle(filePath, myObj, **kwargs)
        elif fmt in ["list"]:
            ret = self.__serializeList(filePath,
                                       myObj,
                                       enforceAscii=True,
                                       **kwargs)
        elif fmt in ["mmcif-dict"]:
            ret = self.__serializeMmCifDict(filePath, myObj, **kwargs)
        elif fmt in ["text-dump"]:
            ret = self.__textDump(filePath, myObj, **kwargs)
        elif fmt in ["fasta"]:
            ret = self.__serializeFasta(filePath, myObj, **kwargs)
        elif fmt in ["csv"]:
            ret = self.__serializeCsv(filePath, myObj, **kwargs)
        else:
            pass

        return ret

    def deserialize(self, filePath, fmt="pickle", **kwargs):
        """Public method to deserialize objects in supported formats.

        Args:
            filePath (str): local file path
            fmt (str, optional): one of ['mmcif', 'json', 'list', ..., 'pickle' (default)]
            **kwargs: additional keyword arguments passed to worker methods

        Returns:
            object: deserialized object data

        """
        fmt = str(fmt).lower()
        if fmt in ["mmcif"]:
            ret = self.__deserializeMmCif(filePath, **kwargs)  # type: ignore
        elif fmt in ["json"]:
            ret = self.__deserializeJson(filePath, **kwargs)  # type: ignore
        elif fmt in ["pickle"]:
            ret = self.__deserializePickle(filePath, **kwargs)  # type: ignore
        elif fmt in ["list"]:
            ret = self.__deserializeList(filePath, enforceAscii=True,
                                         **kwargs)  # type: ignore
        elif fmt in ["mmcif-dict"]:
            ret = self.__deserializeMmCifDict(filePath,
                                              **kwargs)  # type: ignore
        elif fmt in ["fasta"]:
            ret = self.__deserializeFasta(filePath, **kwargs)  # type: ignore
        # elif fmt in ["vrpt-xml-to-cif"]:
        #    ret = self.__deserializeVrptToCif(filePath, **kwargs)  # type: ignore
        elif fmt in ["csv", "tdd"]:
            delimiter = kwargs.get("csvDelimiter",
                                   "," if fmt == "csv" else "\t")
            ret = self.__deserializeCsv(filePath,
                                        delimiter=delimiter,
                                        **kwargs)  # type: ignore
        elif fmt in ["xml"]:
            ret = self.__deserializeXml(filePath, **kwargs)  # type: ignore
        else:
            ret = None  # type: ignore

        return ret

    def __sliceInChunks(self, myList, numChunks):
        mc = min(len(myList), numChunks)
        chunkSize = int(len(myList) / mc)
        if len(myList) % mc:
            chunkSize += 1
        for i in range(0, len(myList), chunkSize):
            yield myList[i:i + chunkSize]

    def serializeInParts(self,
                         filePath,
                         myObj,
                         numParts,
                         fmt="json",
                         **kwargs):
        """Public method to serialize format appropriate (json, pickle) objects in multiple parts

        Args:
            filePath (str): local file path
            myObj (object): format appropriate object to be serialized
            numParts (int): divide the data into numParts segments
            fmt (str, optional): one of ['json', 'pickle']. Defaults to 'json'.
            **kwargs: additional keyword arguments passed to worker methods

        Returns:
            bool: True for success or False otherwise
        """
        if fmt not in ["json", "pickle"]:
            logger.error("Unsupported format for %s", fmt)
            return False
        pth, fn = os.path.split(filePath)
        self.__fileU.mkdirForFile(pth)
        bn, ext = os.path.splitext(fn)
        ret = True
        if isinstance(myObj, list):
            for ii, subList in enumerate(self.__sliceInChunks(myObj,
                                                              numParts)):
                fp = os.path.join(pth, bn + "_part_%d" % (ii + 1) + ext)
                ok = self.serialize(fp, subList, fmt=fmt, **kwargs)
                ret = ret and ok
        elif isinstance(myObj, dict):
            for ii, keyList in enumerate(
                    self.__sliceInChunks(list(myObj.keys()), numParts)):
                fp = os.path.join(pth, bn + "_part_%d" % (ii + 1) + ext)
                ok = self.serialize(fp,
                                    OrderedDict([(k, myObj[k])
                                                 for k in keyList]),
                                    fmt=fmt,
                                    **kwargs)
                ret = ret and ok
        else:
            logger.error("Unsupported data type for serialization in parts")
            ret = False
        #
        return ret

    def deserializeInParts(self, filePath, numParts, fmt="json", **kwargs):
        """Public method to deserialize objects in supported formats from multiple parts

        Args:
            filePath (str): local file path
            numParts (int): reconstruct the data object from numParts segments
            fmt (str, optional): one of ['json', 'pickle']. Defaults to 'json'.
            **kwargs: additional keyword arguments passed to worker methods

        Returns:
            object: deserialized object data
        """
        rObj = None
        if fmt not in ["json", "pickle"]:
            logger.error("Unsupported format for %s", fmt)
            return rObj
        #
        pth, fn = os.path.split(filePath)
        bn, ext = os.path.splitext(fn)
        if not numParts:
            fp = os.path.join(pth, bn + "_part_*" + ext)
            numParts = len(glob.glob(fp))
        #
        for ii in range(numParts):
            fp = os.path.join(pth, bn + "_part_%d" % (ii + 1) + ext)
            tObj = self.deserialize(fp, fmt=fmt, **kwargs)
            if isinstance(tObj, list):
                if not rObj:
                    rObj = []
                rObj.extend(tObj)
            elif isinstance(tObj, dict):
                if not rObj:
                    rObj = OrderedDict()
                rObj.update(tObj)
            else:
                logger.error(
                    "Unsupported data type for deserialization in parts")
        return rObj

    def exists(self, filePath, mode=os.R_OK):
        return self.__fileU.exists(filePath, mode=mode)

    def mkdir(self, dirPath, mode=0o755):
        return self.__fileU.mkdir(dirPath, mode=mode)

    def remove(self, pth):
        return self.__fileU.remove(pth)

    def __deserializeFasta(self, filePath, **kwargs):
        try:
            commentStyle = kwargs.get("commentStyle", "uniprot")
            fau = FastaUtil()
            return fau.readFasta(filePath, commentStyle=commentStyle)
        except Exception as e:
            logger.error("Unable to deserialize %r %r ", filePath, str(e))
        return {}

    def __serializeFasta(self, filePath, myObj, **kwargs):
        try:
            maxLineLength = int(kwargs.get("maxLineLength", 70))
            makeComment = kwargs.get("makeComment", False)
            fau = FastaUtil()
            ok = fau.writeFasta(filePath,
                                myObj,
                                maxLineLength=maxLineLength,
                                makeComment=makeComment)
            return ok
        except Exception as e:
            logger.error("Unable to serialize FASTA file %r  %r", filePath,
                         str(e))
        return False

    def __textDump(self, filePath, myObj, **kwargs):
        try:
            indent = kwargs.get("indent", 1)
            width = kwargs.get("width", 120)
            sOut = pprint.pformat(myObj, indent=indent, width=width)
            with open(filePath, "w") as ofh:
                ofh.write("\n%s\n" % sOut)
            return True
        except Exception as e:
            logger.error("Unable to dump to %r  %r", filePath, str(e))
        return False

    def __serializePickle(self, filePath, myObj, **kwargs):
        try:
            pickleProtocol = kwargs.get("pickleProtocol",
                                        pickle.DEFAULT_PROTOCOL)

            with open(filePath, "wb") as outfile:
                pickle.dump(myObj, outfile, pickleProtocol)
            return True
        except Exception as e:
            logger.error("Unable to serialize %r  %r", filePath, str(e))
        return False

    def __deserializePickle(self, filePath, **kwargs):
        myDefault = kwargs.get("default", {})
        try:
            if sys.version_info[0] > 2:
                encoding = kwargs.get("encoding", "ASCII")
                errors = kwargs.get("errors", "strict")
                with open(filePath, "rb") as outfile:
                    return pickle.load(outfile,
                                       encoding=encoding,
                                       errors=errors)
            else:
                with open(filePath, "rb") as outfile:
                    return pickle.load(outfile)
        except Exception as e:
            logger.warning("Unable to deserialize %r %r", filePath, str(e))
        return myDefault

    def __serializeJson(self, filePath, myObj, **kwargs):
        """Internal method to serialize the input object as JSON.  An encoding
        helper class is included to handle selected python data types (e.g., datetime)
        """
        indent = kwargs.get("indent", 0)
        enforceAscii = kwargs.get("enforceAscii", True)
        try:
            if enforceAscii:
                with open(filePath, "w") as outfile:
                    json.dump(myObj,
                              outfile,
                              indent=indent,
                              cls=JsonTypeEncoder,
                              ensure_ascii=enforceAscii)
            else:
                with io.open(filePath, "w", encoding="utf-8") as outfile:
                    json.dump(myObj,
                              outfile,
                              indent=indent,
                              cls=JsonTypeEncoder,
                              ensure_ascii=enforceAscii)
            return True
        except Exception as e:
            logger.error("Unable to serialize %r  %r", filePath, str(e))
        return False

    def __deserializeJson(self, filePath, **kwargs):
        myDefault = kwargs.get("default", {})
        encoding = kwargs.get("encoding", "utf-8-sig")
        encodingErrors = kwargs.get("encodingErrors", "ignore")
        try:
            if filePath[-3:] == ".gz":
                if sys.version_info[0] > 2:
                    with gzip.open(filePath,
                                   "rt",
                                   encoding=encoding,
                                   errors=encodingErrors) as inpFile:
                        return json.load(inpFile,
                                         object_pairs_hook=OrderedDict)
                else:
                    # Py2 situation non-ascii encodings is problematic
                    # with gzip.open(filePath, "rb") as csvFile:
                    #    oL = self.__csvReader(csvFile, rowFormat, delimiter)
                    tPath = self.__fileU.uncompress(filePath, outputDir=None)
                    with io.open(tPath,
                                 newline="",
                                 encoding=encoding,
                                 errors="ignore") as inpFile:
                        return json.load(inpFile,
                                         object_pairs_hook=OrderedDict)
            else:
                with open(filePath, "r") as inpFile:
                    return json.load(inpFile, object_pairs_hook=OrderedDict)
        except Exception as e:
            logger.warning("Unable to deserialize %r %r", filePath, str(e))
        return myDefault

    def __hasMinSize(self, pth, minSize):
        try:
            return os.path.getsize(pth) >= minSize
        except Exception:
            return False

    def __deserializeMmCif(self, locator, **kwargs):
        """ """
        try:
            containerList = []
            workPath = kwargs.get("workPath", None)
            enforceAscii = kwargs.get("enforceAscii", True)
            raiseExceptions = kwargs.get("raiseExceptions", True)
            useCharRefs = kwargs.get("useCharRefs", True)
            minSize = kwargs.get("minSize", 5)
            #
            if self.__fileU.isLocal(locator):
                if minSize >= 0 and not self.__hasMinSize(locator, minSize):
                    logger.warning("Minimum file size not satisfied for: %r",
                                   locator)
                myIo = IoAdapter(raiseExceptions=raiseExceptions,
                                 useCharRefs=useCharRefs)
                containerList = myIo.readFile(
                    locator, enforceAscii=enforceAscii,
                    outDirPath=workPath)  # type: ignore
            else:
                # myIo = IoAdapterPy(raiseExceptions=raiseExceptions, useCharRefs=useCharRefs)
                # containerList = myIo.readFile(locator, enforceAscii=enforceAscii, outDirPath=workPath)
                containerList = self.__deserializeMmCifRemote(
                    locator, useCharRefs, enforceAscii, workPath)

        except Exception as e:
            logger.error("Failing for %s with %s", locator, str(e))
        return containerList

    @retry((requests.exceptions.RequestException),
           maxAttempts=3,
           delaySeconds=1,
           multiplier=2,
           defaultValue=[],
           logger=logger)
    def __deserializeMmCifRemote(self, locator, useCharRefs, enforceAscii,
                                 workPath):
        containerList = []
        try:
            myIo = IoAdapterPy(raiseExceptions=True, useCharRefs=useCharRefs)
            containerList = myIo.readFile(locator,
                                          enforceAscii=enforceAscii,
                                          outDirPath=workPath)
        except Exception as e:
            raise e
        return containerList

    def __serializeMmCif(self, filePath, containerList, **kwargs):
        """ """
        try:
            ret = False
            workPath = kwargs.get("workPath", None)
            enforceAscii = kwargs.get("enforceAscii", True)
            raiseExceptions = kwargs.get("raiseExceptions", True)
            useCharRefs = kwargs.get("useCharRefs", True)
            #
            myIo = IoAdapter(raiseExceptions=raiseExceptions,
                             useCharRefs=useCharRefs)
            if filePath.endswith(".gz") and workPath:
                rfn = "".join(
                    random.choice(string.ascii_uppercase + string.digits)
                    for _ in range(10))
                tPath = os.path.join(workPath, rfn)
                ret = myIo.writeFile(tPath,
                                     containerList=containerList,
                                     enforceAscii=enforceAscii)
                ret = self.__fileU.compress(tPath,
                                            filePath,
                                            compressType="gzip")
            else:
                ret = myIo.writeFile(filePath,
                                     containerList=containerList,
                                     enforceAscii=enforceAscii)
        except Exception as e:
            logger.error("Failing for %s with %s", filePath, str(e))
        return ret

    def __deserializeMmCifDict(self, filePath, **kwargs):
        """ """
        try:
            containerList = []
            workPath = kwargs.get("workPath", None)
            enforceAscii = kwargs.get("enforceAscii", True)
            raiseExceptions = kwargs.get("raiseExceptions", True)
            useCharRefs = kwargs.get("useCharRefs", True)
            #
            myIo = IoAdapterPy(raiseExceptions=raiseExceptions,
                               useCharRefs=useCharRefs)
            containerList = myIo.readFile(filePath,
                                          enforceAscii=enforceAscii,
                                          outDirPath=workPath)
        except Exception as e:
            logger.error("Failing for %s with %s", filePath, str(e))
        return containerList

    def __serializeMmCifDict(self, filePath, containerList, **kwargs):
        """ """
        try:
            ret = False
            # workPath = kwargs.get('workPath', None)
            enforceAscii = kwargs.get("enforceAscii", True)
            raiseExceptions = kwargs.get("raiseExceptions", True)
            useCharRefs = kwargs.get("useCharRefs", True)
            #
            myIo = IoAdapterPy(raiseExceptions=raiseExceptions,
                               useCharRefs=useCharRefs)
            ret = myIo.writeFile(filePath,
                                 containerList=containerList,
                                 enforceAscii=enforceAscii)
        except Exception as e:
            logger.error("Failing for %s with %s", filePath, str(e))
        return ret

    def __serializeList(self, filePath, aList, enforceAscii=True, **kwargs):
        """ """

        try:
            _ = kwargs
            if enforceAscii:
                encoding = "ascii"
            else:
                encoding = "utf-8"
            #
            if sys.version_info[0] > 2:
                with open(filePath, "w") as ofh:
                    if enforceAscii:
                        for st in aList:
                            ofh.write("%s\n" % st.encode(
                                "ascii", "xmlcharrefreplace").decode("ascii"))
                    else:
                        for st in aList:
                            ofh.write("%s\n" % st)
            else:
                if enforceAscii:
                    with io.open(filePath, "w", encoding=encoding) as ofh:
                        for st in aList:
                            ofh.write("%s\n" % st.encode(
                                "ascii", "xmlcharrefreplace").decode("ascii"))
                else:
                    with open(filePath, "wb") as ofh:
                        for st in aList:
                            ofh.write("%s\n" % st)
            return True
        except Exception as e:
            logger.error("Unable to serialize %r %r", filePath, str(e))
        return False

    def __processList(self, ifh, enforceAscii=True, **kwargs):
        uncomment = kwargs.get("uncomment", True)
        aList = []
        for line in ifh:
            if enforceAscii:
                pth = line[:-1].encode("ascii",
                                       "xmlcharrefreplace").decode("ascii")
            else:
                pth = line[:-1]
            if not pth or (uncomment and pth.startswith("#")):
                continue
            aList.append(pth)
        return aList

    def __deserializeList(self,
                          filePath,
                          enforceAscii=True,
                          encodingErrors="ignore",
                          **kwargs):
        aList = []
        _ = kwargs
        try:
            if filePath[-3:] == ".gz":
                if sys.version_info[0] > 2:
                    with gzip.open(filePath,
                                   "rt",
                                   encoding="utf-8-sig",
                                   errors=encodingErrors) as ifh:
                        aList = self.__processList(ifh,
                                                   enforceAscii=enforceAscii,
                                                   **kwargs)
                else:
                    tPath = self.__fileU.uncompress(filePath, outputDir=None)
                    # for py2 this commented code is problematic for non-ascii data
                    # with gzip.open(filePath, "rb") as ifh:
                    #    aList = self.__processList(ifh, enforceAscii=enforceAscii)
                    with io.open(tPath, encoding="utf-8-sig",
                                 errors="ignore") as ifh:
                        aList = self.__processList(ifh,
                                                   enforceAscii=enforceAscii)
            else:
                with io.open(filePath, encoding="utf-8-sig",
                             errors="ignore") as ifh:
                    aList = self.__processList(ifh,
                                               enforceAscii=enforceAscii,
                                               **kwargs)
        except Exception as e:
            logger.error("Unable to deserialize %r %s", filePath, str(e))
        #
        logger.debug("Reading list length %d", len(aList))
        return aList

    def __csvReader(self, csvFile, rowFormat, delimiter, uncomment=True):
        oL = []

        maxInt = sys.maxsize
        csv.field_size_limit(maxInt)
        if rowFormat == "dict":
            if uncomment:
                reader = csv.DictReader(uncommentFilter(csvFile),
                                        delimiter=delimiter)
            else:
                reader = csv.DictReader(csvFile, delimiter=delimiter)
            for rowD in reader:
                oL.append(rowD)
        elif rowFormat == "list":
            if uncomment:
                reader = csv.reader(uncommentFilter(csvFile),
                                    delimiter=delimiter)
            else:
                reader = csv.reader(csvFile, delimiter=delimiter)
            for rowL in reader:
                oL.append(rowL)
        return oL

    def deserializeCsvIter(self,
                           filePath,
                           delimiter=",",
                           rowFormat="dict",
                           encodingErrors="ignore",
                           uncomment=True,
                           **kwargs):
        """Return an iterator to input CSV format file.

        Args:
            filePath (str): input file path
            delimiter (str, optional): CSV delimiter. Defaults to ",".
            rowFormat (str, optional): format for each processed row (list or dict). Defaults to "dict".
            encodingErrors (str, optional): treatment of encoding errors. Defaults to "ignore".
            uncomment (bool, optional): flag to ignore leading comments. Defaults to True.

        Returns:
            (iterator): iterator for rowwise access to processed CSV data
        """
        encoding = kwargs.get("encoding", "utf-8-sig")
        maxInt = sys.maxsize
        csv.field_size_limit(maxInt)
        try:
            if filePath[-3:] == ".gz":
                with gzip.open(filePath,
                               "rt",
                               encoding=encoding,
                               errors=encodingErrors) as csvFile:
                    startIt = itertools.dropwhile(
                        lambda x: x.startswith("#"),
                        csvFile) if uncomment else csvFile
                    if rowFormat == "dict":
                        reader = csv.DictReader(startIt, delimiter=delimiter)
                    elif rowFormat == "list":
                        reader = csv.reader(startIt, delimiter=delimiter)
                    for row in reader:
                        yield row
            else:
                with io.open(filePath,
                             newline="",
                             encoding=encoding,
                             errors="ignore") as csvFile:
                    startIt = itertools.dropwhile(
                        lambda x: x.startswith("#"),
                        csvFile) if uncomment else csvFile
                    if rowFormat == "dict":
                        reader = csv.DictReader(startIt, delimiter=delimiter)
                    elif rowFormat == "list":
                        reader = csv.reader(startIt, delimiter=delimiter)
                    for row in reader:
                        # if uncomment and row.startswith("#"):
                        #    continue
                        yield row
        except Exception as e:
            logger.error("Unable to deserialize %r %s", filePath, str(e))

    def __deserializeCsv(self,
                         filePath,
                         delimiter=",",
                         rowFormat="dict",
                         encodingErrors="ignore",
                         uncomment=True,
                         **kwargs):
        oL = []
        encoding = kwargs.get("encoding", "utf-8-sig")
        try:
            if filePath[-3:] == ".gz":
                if sys.version_info[0] > 2:
                    with gzip.open(filePath,
                                   "rt",
                                   encoding=encoding,
                                   errors=encodingErrors) as csvFile:
                        oL = self.__csvReader(csvFile,
                                              rowFormat,
                                              delimiter,
                                              uncomment=uncomment)
                else:
                    # Py2 situation non-ascii encodings is problematic
                    # with gzip.open(filePath, "rb") as csvFile:
                    #    oL = self.__csvReader(csvFile, rowFormat, delimiter)
                    tPath = self.__fileU.uncompress(filePath, outputDir=None)
                    with io.open(tPath,
                                 newline="",
                                 encoding=encoding,
                                 errors="ignore") as csvFile:
                        oL = self.__csvReader(csvFile,
                                              rowFormat,
                                              delimiter,
                                              uncomment=uncomment)
            else:
                with io.open(filePath,
                             newline="",
                             encoding=encoding,
                             errors="ignore") as csvFile:
                    oL = self.__csvReader(csvFile,
                                          rowFormat,
                                          delimiter,
                                          uncomment=uncomment)

            return oL
        except Exception as e:
            logger.error("Unable to deserialize %r %s", filePath, str(e))
        #
        logger.debug("Reading list length %d", len(oL))
        return oL

    def __serializeCsv(self, filePath, rowDictList, fieldNames=None, **kwargs):
        """ """
        _ = kwargs
        try:
            wD = {}
            ret = False
            fNames = fieldNames if fieldNames else list(rowDictList[0].keys())
            # with io.open(filePath, 'w', newline='') as csvFile:
            with open(filePath, "w") as csvFile:
                writer = csv.DictWriter(csvFile, fieldnames=fNames)
                writer.writeheader()
                for ii, rowDict in enumerate(rowDictList):
                    try:
                        wD = {k: v for k, v in rowDict.items() if k in fNames}
                        writer.writerow(wD)
                    except Exception as e:
                        logger.error(
                            "Skipping bad CSV record %d wD %r rowDict %r with %s",
                            ii + 1, wD, rowDict, str(e))
                        continue

            ret = True
        except Exception as e:
            logger.error("Failing for %s : %r with %s", filePath, wD, str(e))
        return ret

    def __csvEncoder(self,
                     csvData,
                     encoding="utf-8-sig",
                     encodingErrors="ignore"):
        """Handle encoding issues for gzipped data in Py2. (beware of the BOM chars)

        Args:
            csvData (text lines): uncompressed data from gzip open
            encoding (str, optional): character encoding. Defaults to "utf-8-sig".
            encodingErrors (str, optional): error treatment. Defaults to "ignore".
        """
        for line in csvData:
            yield line.decode("utf-8-sig", errors=encodingErrors).encode(
                encoding, errors=encodingErrors)

    def __deserializeXmlPrev(self, filePath, **kwargs):
        """Read the input XML file path and return an ElementTree data object instance.

        Args:
            filePath (str): input XML file path

        Returns:
            object: instance of an ElementTree tree object
        """
        _ = kwargs
        tree = None
        try:
            logger.debug("Parsing XML path %s", filePath)
            if filePath[-3:] == ".gz":
                with gzip.open(filePath, mode="rb") as ifh:
                    tV = time.time()
                    tree = ET.parse(ifh)
            else:
                with open(filePath, mode="rb") as ifh:
                    tV = time.time()
                    tree = ET.parse(ifh)
            logger.debug("Parsed %s in %.2f seconds", filePath,
                         time.time() - tV)
        except Exception as e:
            logger.error("Unable to deserialize %r %s", filePath, str(e))
        #
        return tree

    def __testGzip(self, filePath):
        ok = True
        with gzip.open(filePath, "r") as fh:
            try:
                fh.read(1)
            except gzip.BadGzipFile:
                ok = False
            except Exception:
                ok = False
        logger.debug("Gzip file check %r", ok)
        return ok

    def __deserializeXml(self, filePath, **kwargs):
        """Read the input XML file path and return an ElementTree data object instance.

        Args:
            filePath (str): input XML file path

        Returns:
            object: instance of an ElementTree tree object
        """
        _ = kwargs
        tree = None
        encoding = kwargs.get("encoding", "utf-8-sig")
        encodingErrors = kwargs.get("encodingErrors", "ignore")
        #
        try:
            logger.debug("Parsing XML path %s", filePath)
            if filePath[-3:] == ".gz" and self.__testGzip(filePath):
                if sys.version_info[0] > 2:
                    with gzip.open(filePath,
                                   "rt",
                                   encoding=encoding,
                                   errors=encodingErrors) as ifh:
                        tV = time.time()
                        tree = ET.parse(ifh)
                else:
                    tPath = self.__fileU.uncompress(filePath, outputDir=None)
                    with io.open(tPath,
                                 encoding=encoding,
                                 errors=encodingErrors) as ifh:
                        tV = time.time()
                        tree = ET.parse(ifh)
            else:
                with io.open(filePath,
                             encoding=encoding,
                             errors=encodingErrors) as ifh:
                    tV = time.time()
                    tree = ET.parse(ifh)
            logger.debug("Parsed %s in %.2f seconds", filePath,
                         time.time() - tV)
        except Exception as e:
            logger.error("Unable to deserialize %r %s", filePath, str(e))
        #
        return tree
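
A brief usage sketch for the IoUtil() class defined above, assuming the class is importable in the current scope; the CACHE paths are placeholders.

from collections import OrderedDict

ioU = IoUtil()

# Round-trip a dictionary through JSON (fmt defaults to "pickle" when omitted).
data = OrderedDict([("A", 1), ("B", 2), ("C", 3)])
ok = ioU.serialize("CACHE/example.json", data, fmt="json", indent=2)
recovered = ioU.deserialize("CACHE/example.json", fmt="json")

# Split the same dictionary across two part files and reassemble it.
ok = ioU.serializeInParts("CACHE/example_parts.json", data, numParts=2, fmt="json")
reassembled = ioU.deserializeInParts("CACHE/example_parts.json", numParts=2, fmt="json")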
Example #13
class DataTypeApiProvider(SingletonClass):
    """ Data type application and instance information provider.
    """
    def __init__(self, cfgOb, cachePath, useCache=True, **kwargs):
        """Data type application and instance information provider.

        Args:
            cfgOb (object):  ConfigInfo() object instance
            cachePath (str): path to hold the cache directory
            useCache (bool, optional): flag to use cached files. Defaults to True.

        """
        self.__cfgOb = cfgOb
        self.__configName = self.__cfgOb.getDefaultSectionName()
        self.__useCache = useCache
        self.__cachePath = cachePath
        # self.__contentInfoConfigName = "content_info_helper_configuration"
        self.__fileU = FileUtil()
        self.__contentDefHelper = self.__cfgOb.getHelper(
            "CONTENT_DEF_HELPER_MODULE",
            sectionName=self.__configName,
            cfgOb=self.__cfgOb)
        self.__dirPath = os.path.join(
            cachePath,
            self.__cfgOb.get("DATA_TYPE_INFO_CACHE_DIR",
                             sectionName=self.__configName))
        self.__kwargs = kwargs
        #
        logger.debug("Leaving constructor")

    def getDataTypeInstanceApi(self, databaseName, **kwargs):
        """Return instance of DataTypeInstanceInfo().

        Args:
            databaseName (str): database name

        Returns:
            (object): Instance of DataTypeInstanceInfo()
        """
        _ = kwargs
        dataTypeInstanceLocatorPath = self.__cfgOb.getPath(
            "INSTANCE_DATA_TYPE_INFO_LOCATOR_PATH",
            sectionName=self.__configName)
        dataTypeInstanceFile = self.__contentDefHelper.getDataTypeInstanceFile(
            databaseName) if self.__contentDefHelper else None
        if dataTypeInstanceLocatorPath and dataTypeInstanceFile:
            loc = os.path.join(dataTypeInstanceLocatorPath,
                               dataTypeInstanceFile)
            filePath = self.__reload(loc,
                                     self.__dirPath,
                                     useCache=self.__useCache)
            dtApi = DataTypeInstanceInfo(filePath)
        else:
            # DataTypeInstanceInfo() provides an internal by-pass mode where no coverage data is available.
            dtApi = DataTypeInstanceInfo(None)
            logger.debug("No data coverage available for database %s",
                         databaseName)
        return dtApi

    def getDataTypeApplicationApi(self, appName, **kwargs):
        """Return instance of DataTypeApplicationInfo.

        Args:
            appName (str): application name (e.g., SQL, ANY)

        Returns:
            (object): Instance of DataTypeApplicationInfo()
        """
        _ = kwargs
        dataTypeApplicationLocator = self.__cfgOb.getPath(
            "APP_DATA_TYPE_INFO_LOCATOR", sectionName=self.__configName)
        filePath = self.__reload(dataTypeApplicationLocator,
                                 self.__dirPath,
                                 useCache=self.__useCache)
        dtApi = DataTypeApplicationInfo(
            filePath, dataTyping=appName,
            workPath=self.__dirPath) if filePath else None
        return dtApi

    def __reload(self, urlTarget, dirPath, useCache=True):
        #
        fn = self.__fileU.getFileName(urlTarget)
        filePath = os.path.join(dirPath, fn)
        logger.debug("Using cache path %s", dirPath)
        self.__fileU.mkdir(dirPath)
        if not useCache:
            try:
                os.remove(filePath)
            except Exception:
                pass
        #
        if useCache and self.__fileU.exists(filePath):
            ok = True
        else:
            logger.debug("Fetch data from source %s", urlTarget)
            ok = self.__fileU.get(urlTarget, os.path.join(dirPath, fn))

        return filePath if ok else None
Example #14
class DictionaryApiProvider(SingletonClass):
    """ Resource provider for dictionary APIs.
    """
    def __init__(self, dirPath, useCache=True):
        """Resource provider for dictionary APIs.

        Args:
            dirPath (str): path to the directory containing cache files
            useCache (bool, optional): flag to use cached files. Defaults to True.

        """
        self.__apiMap = {}
        self.__dirPath = dirPath
        self.__useCache = useCache
        #
        self.__fileU = FileUtil(workPath=self.__dirPath)
        logger.debug("Leaving constructor")

    def __reload(self, dictLocators, dirPath, useCache=True):
        """Reload local cache of dictionary resources and return a dictionary API instance.

        Args:
            dictLocators (list, str): list of locators for dictionary resource files
            dirPath (str): path to the directory containing cache files
            useCache (bool, optional): flag to use cached files. Defaults to True.

        Returns:
            (object): instance of dictionary API
        """
        #
        # verify the existence of the cache directory ...
        self.__fileU.mkdir(dirPath)
        if not useCache:
            for dictLocator in dictLocators:
                try:
                    fn = self.__fileU.getFileName(dictLocator)
                    os.remove(os.path.join(dirPath, fn))
                except Exception:
                    pass
        #
        ret = True
        for dictLocator in dictLocators:
            cacheFilePath = os.path.join(dirPath,
                                         self.__fileU.getFileName(dictLocator))
            if useCache and self.__fileU.exists(cacheFilePath):
                # nothing to do
                continue
            logger.debug("Fetching url %s caching in %s", dictLocator,
                         cacheFilePath)
            ok = self.__fileU.get(dictLocator, cacheFilePath)
            ret = ret and ok
        return ret

    def getApi(self, dictLocators, **kwargs):
        """Return a dictionary API object of the input dictioaries.

        Arguments:
            dictLocators {list str} -- list of dictionary locator paths

        Returns:
            [object] -- returns DictionaryApi() object for input dictionaries
        """
        dictFileNames = [
            self.__fileU.getFileName(dictLocator)
            for dictLocator in dictLocators
        ]
        dictTup = tuple(dictFileNames)
        dApi = self.__apiMap[
            dictTup] if dictTup in self.__apiMap else self.__getApi(
                dictLocators, **kwargs)
        self.__apiMap[dictTup] = dApi
        return dApi

    def __getApi(self, dictLocators, **kwargs):
        """ Return an instance of a dictionary API instance for the input dictionary locator list.
        """
        consolidate = kwargs.get("consolidate", True)
        replaceDefinition = kwargs.get("replaceDefinitions", True)
        verbose = kwargs.get("verbose", True)
        #
        ok = self.__reload(dictLocators,
                           self.__dirPath,
                           useCache=self.__useCache)
        #
        dApi = None
        if ok:
            mU = MarshalUtil()
            containerList = []
            for dictLocator in dictLocators:
                cacheFilePath = os.path.join(
                    self.__dirPath, self.__fileU.getFileName(dictLocator))
                containerList.extend(
                    mU.doImport(cacheFilePath, fmt="mmcif-dict"))
            #
            dApi = DictionaryApi(containerList=containerList,
                                 consolidate=consolidate,
                                 replaceDefinition=replaceDefinition,
                                 verbose=verbose)
        return dApi
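
A hedged usage sketch for the DictionaryApiProvider() class above, assuming the class and its DictionaryApi/MarshalUtil/FileUtil dependencies are importable; the dictionary locator URL and cache directory are placeholders, not values taken from the snippet.

dictLocators = ["https://mmcif.wwpdb.org/dictionaries/ascii/mmcif_pdbx_v5_next.dic"]
dP = DictionaryApiProvider(dirPath="CACHE/dictionaries", useCache=True)
dApi = dP.getApi(dictLocators)
print("DictionaryApi constructed: %r" % (dApi is not None))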
Example #15
class FileUtilTests(unittest.TestCase):
    def setUp(self):
        self.__verbose = True
        self.__pathPdbxDictionaryFile = os.path.join(TOPDIR, "rcsb",
                                                     "mock-data",
                                                     "dictionaries",
                                                     "mmcif_pdbx_v5_next.dic")

        self.__pathTaxonomyFile = os.path.join(TOPDIR, "rcsb", "mock-data",
                                               "NCBI", "names.dmp.gz")
        self.__zipFileUrl = "https://inventory.data.gov/dataset/794cd3d7-4d28-4408-8f7d-84b820dbf7f2/resource/6b78ec0c-4980-4ad8-9cbd-2d6eb9eda8e7/download/myfoodapediadata.zip"
        self.__xzFile = os.path.join(TOPDIR, "rcsb", "mock-data",
                                     "MOCK_MODBASE_MODELS",
                                     "NP_001030614.1_1.pdb.xz")
        #
        self.__ftpFileUrl = "ftp://ftp.wwpdb.org/pub/pdb/data/component-models/complete/chem_comp_model.cif.gz"
        self.__httpsFileUrl = "https://ftp.wwpdb.org/pub/pdb/data/component-models/complete/chem_comp_model.cif.gz"
        #
        self.__workPath = os.path.join(HERE, "test-output")
        self.__inpDirPath = os.path.join(HERE, "test-data")
        self.__fileU = FileUtil()
        self.__startTime = time.time()
        logger.debug("Running tests on version %s", __version__)
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                     endTime - self.__startTime)

    def testTarBundling(self):
        """Test case for tarfile bundling and unbundling"""
        try:
            tP = os.path.join(self.__workPath, "t0.tar.gz")
            dirPath = os.path.join(self.__inpDirPath, "topdir")

            ok = self.__fileU.bundleTarfile(tP, [dirPath],
                                            mode="w:gz",
                                            recursive=True)
            self.assertTrue(ok)

            numBytes = self.__fileU.size(tP)
            self.assertGreaterEqual(numBytes, 250)
            #
            md5 = self.__fileU.hash(tP, hashType="md5")
            self.assertTrue(md5 is not None)
            #
            ok = self.__fileU.unbundleTarfile(tP, dirPath=self.__workPath)
            self.assertTrue(ok)
            #
            tP = os.path.join(self.__workPath, "t1.tar.gz")
            dirPathList = [
                os.path.join(self.__inpDirPath, "topdir", "subdirA"),
                os.path.join(self.__inpDirPath, "topdir", "subdirB")
            ]

            ok = self.__fileU.bundleTarfile(tP,
                                            dirPathList,
                                            mode="w:gz",
                                            recursive=True)
            self.assertTrue(ok)
            #
            ok = self.__fileU.unbundleTarfile(tP, dirPath=self.__workPath)
            self.assertTrue(ok)

            tP = os.path.join(self.__workPath, "t2.tar")
            dirPathList = [
                os.path.join(self.__inpDirPath, "topdir", "subdirA"),
                os.path.join(self.__inpDirPath, "topdir", "subdirB")
            ]

            ok = self.__fileU.bundleTarfile(tP,
                                            dirPathList,
                                            mode="w",
                                            recursive=True)
            self.assertTrue(ok)
            #
            ok = self.__fileU.unbundleTarfile(tP, dirPath=self.__workPath)
            self.assertTrue(ok)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testGetFile(self):
        """Test case for a local files and directories"""
        try:
            remoteLocator = self.__pathPdbxDictionaryFile
            fn = self.__fileU.getFileName(remoteLocator)
            # _, fn = os.path.split(remoteLocator)
            lPath = os.path.join(self.__workPath, fn)
            ok = self.__fileU.get(remoteLocator, lPath)
            self.assertTrue(ok)
            ok = self.__fileU.exists(lPath)
            self.assertTrue(ok)
            ok = self.__fileU.isLocal(lPath)
            self.assertTrue(ok)
            tPath = self.__fileU.getFilePath(lPath)
            self.assertEqual(lPath, tPath)
            ok = self.__fileU.remove(lPath)
            self.assertTrue(ok)
            dPath = os.path.join(self.__workPath, "tdir")
            ok = self.__fileU.mkdir(dPath)
            self.assertTrue(ok)
            ok = self.__fileU.remove(dPath)
            self.assertTrue(ok)
            ok = self.__fileU.remove(";lakdjf")
            self.assertTrue(ok)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testMoveAndCopyFile(self):
        """Test case for copying ("put") and moving ("replace") local files"""
        try:
            remoteLocator = self.__pathPdbxDictionaryFile
            fn = self.__fileU.getFileName(remoteLocator)
            # _, fn = os.path.split(remoteLocator)
            lPath = os.path.join(self.__workPath, fn)
            ok = self.__fileU.get(remoteLocator, lPath)
            self.assertTrue(ok)
            # Test copy file
            dPath2 = os.path.join(self.__workPath, "tdir")
            ok = self.__fileU.mkdir(dPath2)
            self.assertTrue(ok)
            lPath2 = os.path.join(dPath2, fn)
            ok = self.__fileU.put(lPath, lPath2)
            self.assertTrue(ok)
            ok = self.__fileU.exists(lPath)
            self.assertTrue(ok)
            ok = self.__fileU.exists(lPath2)
            self.assertTrue(ok)
            # Remove copied file (to test moving file next)
            ok = self.__fileU.remove(lPath2)
            self.assertTrue(ok)
            ok = self.__fileU.exists(lPath2)
            self.assertFalse(ok)
            # Test move file
            ok = self.__fileU.replace(lPath, lPath2)
            self.assertTrue(ok)
            ok = self.__fileU.exists(lPath)
            self.assertFalse(ok)
            ok = self.__fileU.exists(lPath2)
            self.assertTrue(ok)
            # Now clean up files and dirs
            ok = self.__fileU.remove(lPath)
            self.assertTrue(ok)
            ok = self.__fileU.remove(dPath2)
            self.assertTrue(ok)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testZipUrl(self):
        """Test case for downloading remote zip file and extracting contents."""
        try:
            remoteLocator = self.__zipFileUrl
            # fn = self.__fileU.getFileName(remoteLocator)
            ok = self.__fileU.isLocal(remoteLocator)
            self.assertFalse(ok)
            #
            lPath = os.path.join(self.__workPath,
                                 self.__fileU.getFileName(self.__zipFileUrl))
            ok = self.__fileU.get(remoteLocator, lPath)
            self.assertTrue(ok)
            ok = self.__fileU.exists(lPath)
            self.assertTrue(ok)
            ok = self.__fileU.isLocal(lPath)
            self.assertTrue(ok)
            tPath = self.__fileU.getFilePath(lPath)
            self.assertEqual(lPath, tPath)
            fp = self.__fileU.uncompress(lPath, outputDir=self.__workPath)
            ok = fp.endswith("Food_Display_Table.xlsx")
            self.assertTrue(ok)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testFtpUrl(self):
        """Test case for downloading remote file ftp protocol and extracting contents."""
        try:
            remoteLocator = self.__ftpFileUrl
            # fn = self.__fileU.getFileName(remoteLocator)
            ok = self.__fileU.isLocal(remoteLocator)
            self.assertFalse(ok)
            #
            dirPath = os.path.join(self.__workPath, "chem_comp_models")
            lPath = os.path.join(dirPath,
                                 self.__fileU.getFileName(self.__ftpFileUrl))
            ok = self.__fileU.get(remoteLocator, lPath)
            self.assertTrue(ok)
            ok = self.__fileU.exists(lPath)
            self.assertTrue(ok)
            ok = self.__fileU.isLocal(lPath)
            self.assertTrue(ok)
            tPath = self.__fileU.getFilePath(lPath)
            self.assertEqual(lPath, tPath)
            fp = self.__fileU.uncompress(lPath, outputDir=dirPath)
            ok = fp.endswith("chem_comp_model.cif")
            self.assertTrue(ok)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testRemote(self):
        """Test case remote status"""
        try:
            remoteLocator = self.__httpsFileUrl
            ok = self.__fileU.isLocal(remoteLocator)
            self.assertFalse(ok)
            #
            ok = self.__fileU.exists(remoteLocator)
            self.assertTrue(ok)
            size = self.__fileU.size(remoteLocator)
            self.assertGreaterEqual(size, 1000)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    @unittest.skip("DrugBank example -- skipping")
    def testGetDrugBankUrl(self):
        """Test case for downloading drugbank master xml file"""
        try:
            remoteLocator = "https://www.drugbank.ca/releases/latest/downloads/all-full-database"
            un = "username"
            pw = "password"
            # fn = self.__fileU.getFileName(remoteLocator)
            ok = self.__fileU.isLocal(remoteLocator)
            self.assertFalse(ok)
            #
            lPath = os.path.join(self.__workPath, "db-download.zip")
            ok = self.__fileU.get(remoteLocator,
                                  lPath,
                                  username=un,
                                  password=pw)
            self.assertTrue(ok)
            ok = self.__fileU.exists(lPath)
            self.assertTrue(ok)
            ok = self.__fileU.isLocal(lPath)
            self.assertTrue(ok)
            tPath = self.__fileU.getFilePath(lPath)
            self.assertEqual(lPath, tPath)
            self.__fileU.uncompress(lPath, outputDir=self.__workPath)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testXzFile(self):
        """Test case for extracting contents from xz file"""
        try:
            remoteLocator = self.__xzFile
            fn = self.__fileU.getFileName(remoteLocator)
            lPath = os.path.join(self.__workPath, fn)
            ok = self.__fileU.get(remoteLocator, lPath)
            self.assertTrue(ok)
            ok = self.__fileU.exists(lPath)
            self.assertTrue(ok)
            ok = self.__fileU.isLocal(lPath)
            self.assertTrue(ok)
            tPath = self.__fileU.getFilePath(lPath)
            self.assertEqual(lPath, tPath)
            fp = self.__fileU.uncompress(lPath, outputDir=self.__workPath)
            ok = fp.endswith(".pdb")
            self.assertTrue(ok)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
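
A minimal sketch for running the test cases above with the standard unittest runner; the suite-builder function below is illustrative and not part of the original module.

import unittest

def suiteFileUtil():
    suiteSelect = unittest.TestSuite()
    suiteSelect.addTest(FileUtilTests("testTarBundling"))
    suiteSelect.addTest(FileUtilTests("testGetFile"))
    suiteSelect.addTest(FileUtilTests("testMoveAndCopyFile"))
    return suiteSelect

if __name__ == "__main__":
    unittest.TextTestRunner(verbosity=2).run(suiteFileUtil())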
Example #16
0
class CODModelSearch(object):
    def __init__(self, cachePath, **kwargs):
        self.__cachePath = cachePath
        #
        self.__useCache = kwargs.get("useCache", True)
        self.__ccUrlTarget = kwargs.get("ccUrlTarget", None)
        self.__birdUrlTarget = kwargs.get("birdUrlTarget", None)
        self.__descriptorUrlTarget = kwargs.get(
            "descriptorUrlTarget",
            "http://www.crystallography.net/cod/smi/allcod.smi")
        self.__prefix = kwargs.get("prefix", None)
        self.__numProc = kwargs.get("numProc", 4)
        self.__chunkSize = kwargs.get("chunkSize", 50)
        self.__ccFileNamePrefix = "cc-%s" % self.__prefix if self.__prefix else "cc-full"
        self.__fU = FileUtil()
        # self.__ccmG = ChemCompModelGen(self.__cachePath, self.__prefix)

    def getResultIndex(self):
        mU = MarshalUtil(workPath=self.__cachePath)
        cD = mU.doImport(self.getResultFilePath(), fmt="json")
        return cD

    def getResultDetails(self, codId):
        mU = MarshalUtil(workPath=self.__cachePath)
        dD = mU.doImport(self.__getCodDetailsFilePath(codId), fmt="json")
        return dD

    def storeResultIndex(self, cD):
        mU = MarshalUtil(workPath=self.__cachePath)
        ok = mU.doExport(self.getResultFilePath(), cD, fmt="json", indent=3)
        return ok

    def getResultDirFilePath(self):
        dN = "cod-%s-result-files" % self.__prefix if self.__prefix else "cod-result-files"
        return os.path.join(self.__cachePath, dN)

    def getRawResultFilePath(self):
        dN = "cod-%s-result-files" % self.__prefix if self.__prefix else "cod-search-files"
        return os.path.join(self.__cachePath, dN,
                            "cod-raw-result-file-index.json")

    def getResultFilePath(self):
        dN = "cod-%s-result-files" % self.__prefix if self.__prefix else "cod-search-files"
        return os.path.join(self.__cachePath, dN, "cod-result-file-index.json")

    def getDescriptorPath(self):
        fn = self.__fU.getFileName(self.__descriptorUrlTarget)
        dirPath = self.getResultDirFilePath()
        filePath = os.path.join(dirPath, fn)
        return filePath

    def updateDescriptors(self):
        self.__fetchUrl(self.__descriptorUrlTarget,
                        filePath=self.getDescriptorPath(),
                        useCache=False)

    def __fetchUrl(self, urlTarget, filePath, useCache=False, noRetry=False):
        ok = False
        try:
            if not (useCache and self.__fU.exists(filePath)):
                startTime = time.time()
                ok = self.__fU.get(urlTarget, filePath, noRetry=noRetry)
                endTime = time.time()
                if ok:
                    logger.debug(
                        "Fetched %s for resource file %s (status = %r) (%.4f seconds)",
                        urlTarget, filePath, ok, endTime - startTime)
                else:
                    logger.error(
                        "Failing fetch for %s for resource file %s (status = %r) (%.4f seconds)",
                        urlTarget, filePath, ok, endTime - startTime)
            else:
                ok = True
                logger.debug("Using cached data for %s", urlTarget)
            #
        except Exception as e:
            logger.exception("Failing for %r with %s", urlTarget, str(e))
        return ok

    def search(self, molLimit=None):
        try:
            bsw = BatchChemSearch(
                useCache=self.__useCache,
                ccUrlTarget=self.__ccUrlTarget,
                birdUrlTarget=self.__birdUrlTarget,
                ccFileNamePrefix=self.__ccFileNamePrefix,
                cachePath=self.__cachePath,
                numProc=self.__numProc,
                chunkSize=self.__chunkSize,
            )
            smiPath = self.getDescriptorPath()
            smiL = bsw.fetchDescriptorList(smiPath, swap=True)
            logger.info("Query length (%d)", len(smiL))
            #
            smiL = bsw.splitSmiles(smiL)
            retL = bsw.doQuery(smiL[:molLimit],
                               "SMILES",
                               matchOpts="graph-exact")
            logger.info("Result length (%d)", len(retL))
            #
            for ii, ret in enumerate(retL, 1):
                logger.debug("%5d %8s %4s (%.3f) %s: %s", ii, ret.queryId,
                             ret.ccId, ret.fpScore, ret.queryType, ret.query)
            #
            fp = self.getRawResultFilePath()
            ok = bsw.storeMatchList(fp, retL)
            return len(retL) if ok else 0
        except Exception as e:
            logger.exception("Failing with %s", str(e))

    def __getSearchResults(self):
        """Read search results and convert to a chemical component dictionary."""
        fp = self.getRawResultFilePath()
        mU = MarshalUtil(workPath=self.__cachePath)
        rawL = mU.doImport(fp, fmt="json")
        rD = {}
        for cD in rawL:
            rD.setdefault(cD["ccId"], []).append(cD)
        return rD

    def __getCodEntryUrl(self, codId):
        # Template Examples:
        # https://molecules.crystallography.net/cod/sdf/1/00/00/1000098.sdf
        # https://molecules.crystallography.net/cod/sdf/6/00/05/6000557.sdf
        #
        baseUrl = "https://molecules.crystallography.net/cod/sdf"
        url = os.path.join(baseUrl, codId[0:1], codId[1:3], codId[3:5],
                           codId + ".sdf")
        return url

    def __getCodDetailsUrl(self, codId):
        baseUrl = "http://www.crystallography.net/cod/optimade/structures"
        url = os.path.join(baseUrl, codId)
        return url

    def __getCodDetailsFilePath(self, codId):
        dirPath = self.getResultDirFilePath()
        fp = os.path.join(dirPath, "cod-data", codId[0:1], codId[1:3],
                          codId[3:5], codId + ".json")
        return fp

    def __getCodEntryFilePath(self, codId):
        dirPath = self.getResultDirFilePath()
        fp = os.path.join(dirPath, "cod-data", codId[0:1], codId[1:3],
                          codId[3:5], codId + ".sdf")
        return fp

    def fetchMatchedData(self, useCache=True):
        """Fetch COD matched entries and metadata and update the raw search index with essential COD data attrbutes.

        Args:
            useCache (bool, optional): use any cached COD data. Defaults to True.

        Returns:
            int: search result count

        """
        eCount = 0
        eSkip = 0
        rcD = {}
        cD = self.__getSearchResults()
        #
        for ccId, qDL in cD.items():
            # cifPath = self.__ccmG.getChemCompPath(ccId)
            # if not cifPath:
            #    logger.info("No CIF for %s skipping", ccId)
            #    continue
            parentId = ccId.split("|")[0]
            rqDL = []
            for qD in qDL:
                codId = qD["queryId"]
                codEntryFilePath = self.__getCodEntryFilePath(codId)
                codDetailsFilePath = self.__getCodDetailsFilePath(codId)
                ok1 = self.__fetchUrl(self.__getCodEntryUrl(codId),
                                      self.__getCodEntryFilePath(codId),
                                      useCache=useCache,
                                      noRetry=True)
                ok2 = self.__fetchUrl(self.__getCodDetailsUrl(codId),
                                      self.__getCodDetailsFilePath(codId),
                                      useCache=useCache,
                                      noRetry=True)
                tD = self.getResultDetails(codId)
                dD = tD["data"][
                    "attributes"] if "data" in tD and "attributes" in tD[
                        "data"] else {}
                mD = tD["meta"][
                    "implementation"] if "meta" in tD and "implementation" in tD[
                        "meta"] else {}
                if ok1 & ok2:
                    logger.info("Fetched COD entry and details for %s (%r)",
                                codId, ok1 & ok2)
                    eCount += 1
                    qD["codEntryFilePath"] = codEntryFilePath
                    qD["codDetailsFilePath"] = codDetailsFilePath
                    # qD["cifPath"] = cifPath
                    qD["parentId"] = parentId
                    qD["chemicalName"] = dD[
                        "_cod_commonname"] if "_cod_commonname" in dD else None
                    qD["chemicalName"] = dD[
                        "_cod_chemname"] if "_cod_chemname" in dD else qD[
                            "chemicalName"]
                    qD["rValue"] = dD[
                        "_cod_Robs"] if "_cod_Robs" in dD else None
                    qD["diffrnTemp"] = dD[
                        "_cod_diffrtemp"] if "_cod_diffrtemp" in dD else None
                    qD["radiationSource"] = dD[
                        "_cod_radType"] if "_cod_radType" in dD else None
                    qD["publicationDOI"] = dD[
                        "_cod_doi"] if "_cod_doi" in dD else None
                    qD["version"] = mD["version"] if "version" in mD else None
                    qD["hasDisorder"] = "N"
                    rqDL.append(qD)
                else:
                    logger.info("Skipping entry missing data for %r at %r",
                                codId, self.__getCodEntryUrl(codId))
                    eSkip += 1
            if rqDL:
                rcD[ccId] = rqDL
        #
        ok = self.storeResultIndex(rcD)
        logger.info(
            "Final match result (w/sdf and metadata) (%d/%d) cod hits (%d) skipped (%d)",
            len(rcD), len(cD), eCount, eSkip)
        return eCount if ok else 0

    def fetchMatchedDataMp(self, numProc=6, chunkSize=5, useCache=True):
        rcD = {}
        cD = self.__getSearchResults()
        idList = list(cD.keys())
        # ---
        mpu = MultiProcUtil(verbose=True)
        mpu.setWorkingDir(self.__cachePath)
        mpu.setOptions(optionsD={
            "resultPath": self.__cachePath,
            "cD": cD,
            "useCache": useCache
        })
        mpu.set(workerObj=self, workerMethod="fetchDataWorker")

        ok, failList, resultList, _ = mpu.runMulti(dataList=idList,
                                                   numProc=numProc,
                                                   numResults=1,
                                                   chunkSize=chunkSize)
        logger.info("Run ended with status %r success count %d failures %r",
                    ok, len(resultList[0]), len(failList))
        for rTup in resultList[0]:
            rcD[rTup[0]] = rTup[1]
        # ---
        ok = self.storeResultIndex(rcD)
        logger.info("Final match result (w/sdf and metadata) (%d/%d)",
                    len(rcD), len(cD))
        return True

    def fetchDataWorker(self, dataList, procName, optionsD, workingDir):
        """Worker method to fetch COD data for matched entries

        Args:
            dataList (list): list of chemical component identifiers to process
            procName (str): processName
            optionsD (dict): dictionary of options
            workingDir (str): path to working directory (not used)

        Returns:
            (successList, resultList, []): success and result lists of chemical component identifiers with COD matches
        """
        resultPath = optionsD["resultPath"]
        cD = optionsD["cD"]
        useCache = optionsD["useCache"]
        _ = workingDir
        resultList = []
        successList = []
        startTime = time.time()
        logger.info("starting %s at %s", procName,
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
        #
        eCount = 0
        eSkip = 0
        try:
            stopPath = os.path.join(resultPath, "STOP")
            logger.info("%s starting search data length %d", procName,
                        len(dataList))
            if self.__checkStop(stopPath):
                logger.info("%s stopping", procName)
                return resultList, resultList, []
            #
            # for ccId, qDL in cD.items():
            for ccId in dataList:
                if ccId not in cD:
                    continue
                qDL = cD[ccId]
                #
                parentId = ccId.split("|")[0]
                rqDL = []
                for qD in qDL:
                    codId = qD["queryId"]
                    codEntryFilePath = self.__getCodEntryFilePath(codId)
                    codDetailsFilePath = self.__getCodDetailsFilePath(codId)
                    ok1 = self.__fetchUrl(self.__getCodEntryUrl(codId),
                                          self.__getCodEntryFilePath(codId),
                                          useCache=useCache,
                                          noRetry=True)
                    ok2 = self.__fetchUrl(self.__getCodDetailsUrl(codId),
                                          self.__getCodDetailsFilePath(codId),
                                          useCache=useCache,
                                          noRetry=True)
                    tD = self.getResultDetails(codId)
                    dD = tD["data"][
                        "attributes"] if "data" in tD and "attributes" in tD[
                            "data"] else {}
                    mD = tD["meta"][
                        "implementation"] if "meta" in tD and "implementation" in tD[
                            "meta"] else {}
                    if ok1 & ok2:
                        logger.info(
                            "Fetched COD entry and details for %s (%r)", codId,
                            ok1 & ok2)
                        eCount += 1
                        qD["codEntryFilePath"] = codEntryFilePath
                        qD["codDetailsFilePath"] = codDetailsFilePath
                        # qD["cifPath"] = cifPath
                        qD["parentId"] = parentId
                        qD["chemicalName"] = dD[
                            "_cod_commonname"] if "_cod_commonname" in dD else None
                        qD["chemicalName"] = dD[
                            "_cod_chemname"] if "_cod_chemname" in dD else qD[
                                "chemicalName"]
                        qD["rValue"] = dD[
                            "_cod_Robs"] if "_cod_Robs" in dD else None
                        qD["diffrnTemp"] = dD[
                            "_cod_diffrtemp"] if "_cod_diffrtemp" in dD else None
                        qD["radiationSource"] = dD[
                            "_cod_radType"] if "_cod_radType" in dD else None
                        qD["publicationDOI"] = dD[
                            "_cod_doi"] if "_cod_doi" in dD else None
                        qD["version"] = mD[
                            "version"] if "version" in mD else None
                        qD["hasDisorder"] = "N"
                        rqDL.append(qD)
                    else:
                        logger.info("Skipping entry missing data for %r at %r",
                                    codId, self.__getCodEntryUrl(codId))
                        eSkip += 1
                if rqDL:
                    resultList.append((ccId, rqDL))
                    successList.append(ccId)
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        endTime = time.time()
        logger.info(
            "%s (entries %d skipped %d) (ccId result length %d) completed at %s (%.2f seconds)",
            procName,
            eCount,
            eSkip,
            len(successList),
            time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
            endTime - startTime,
        )
        return successList, resultList, []

    def __checkStop(self, path):
        try:
            if os.access(path, os.F_OK):
                return True
        except Exception:
            pass
        return False
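
An end-to-end usage sketch for CODModelSearch, built only from the constructor arguments and methods shown above; the cache path and prefix values are assumptions.

# Illustrative workflow (cachePath and prefix are assumptions).
cms = CODModelSearch("./CACHE", prefix="abc", numProc=4, chunkSize=50)
cms.updateDescriptors()              # refresh the COD SMILES descriptor file
numHits = cms.search(molLimit=None)  # run the batch chemical search
if numHits:
    numFetched = cms.fetchMatchedData(useCache=True)  # fetch SDF and metadata for matches
    resultIndexD = cms.getResultIndex()               # read the consolidated result index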
Example #17
0
    def __reloadFasta(self, dirPath, **kwargs):
        """Reload DrugBank target FASTA data files.

        Args:
            dirPath (str): path to the DrugBank cache directory
            useCache (bool, optional): flag to use cached files. Defaults to True.
            username (str, optional): DrugBank account username. Defaults to None.
            password (str, optional): DrugBank account password. Defaults to None.

        Returns:
            list: paths of the cached FASTA target files (empty if credentials are missing)

        """
        startTime = time.time()
        logger.info("Starting db reload at %s",
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
        retFilePathList = []
        urlTargetL = [
            "https://go.drugbank.com/releases/latest/downloads/target-all-polypeptide-sequences",
            "https://go.drugbank.com/releases/latest/downloads/enzyme-all-polypeptide-sequences",
            "https://go.drugbank.com/releases/latest/downloads/carrier-all-polypeptide-sequences",
            "https://go.drugbank.com/releases/latest/downloads/transporter-all-polypeptide-sequences",
        ]

        useCache = kwargs.get("useCache", True)
        username = kwargs.get("username", None)
        password = kwargs.get("password", None)
        #
        if not username or not password:
            return retFilePathList
        #
        fU = FileUtil()
        fU.mkdir(dirPath)
        #
        if not useCache:
            #  Clear any cached files
            for urlTarget in urlTargetL:
                baseFileName = fU.getFileName(urlTarget)
                zipFileName = baseFileName + ".fasta.zip"
                retFileName = baseFileName + ".fa"
                for fn in [baseFileName, zipFileName, retFileName]:
                    try:
                        fp = os.path.join(dirPath, fn)
                        os.remove(fp)
                    except Exception:
                        pass
        #
        ok = False
        if useCache:
            ok = True
            for urlTarget in urlTargetL:
                baseFileName = fU.getFileName(urlTarget)
                retFileName = baseFileName + ".fa"
                retFilePath = os.path.join(dirPath, retFileName)
                ok = fU.exists(retFilePath)
                if not ok:
                    break
                retFilePathList.append(retFilePath)
        #
        logger.info("Using cached files %r", ok)
        if not useCache or not ok:
            if not username or not password:
                logger.warning(
                    "Missing credentials for DrugBank file download...")

            for urlTarget in urlTargetL:
                baseFileName = fU.getFileName(urlTarget)
                zipFileName = baseFileName + ".fasta.zip"
                retFileName = baseFileName + ".fa"
                zipFilePath = os.path.join(dirPath, zipFileName)
                retFilePath = os.path.join(dirPath, retFileName)
                basePath = os.path.join(dirPath, baseFileName)
                logger.info("Fetching url %s for FASTA target file %s",
                            urlTarget, baseFileName)
                ok = fU.get(urlTarget,
                            zipFilePath,
                            username=username,
                            password=password)
                endTime = time.time()
                logger.info(
                    "Completed db fetch at %s (%.4f seconds)",
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    endTime - startTime)
                #
                ok = fU.unbundleZipfile(zipFilePath, dirPath=basePath)
                fU.put(os.path.join(basePath, "protein.fasta"), retFilePath)
                endTime = time.time()
                logger.info(
                    "Completed unzip at %s (%.4f seconds)",
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    endTime - startTime)
                retFilePathList.append(retFilePath)
        return retFilePathList
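
A standalone sketch of the credential-gated fetch-and-unzip step used above, assuming DrugBank credentials are supplied through environment variables; the variable names and the import path are assumptions for illustration.

import os

from rcsb.utils.io.FileUtil import FileUtil  # import path assumed

dirPath = "./CACHE/DrugBank-targets"
urlTarget = "https://go.drugbank.com/releases/latest/downloads/target-all-polypeptide-sequences"
username = os.environ.get("DRUGBANK_USERNAME")  # assumed variable name
password = os.environ.get("DRUGBANK_PASSWORD")  # assumed variable name

fU = FileUtil()
fU.mkdir(dirPath)
baseFileName = fU.getFileName(urlTarget)
zipFilePath = os.path.join(dirPath, baseFileName + ".fasta.zip")
basePath = os.path.join(dirPath, baseFileName)
if username and password and fU.get(urlTarget, zipFilePath, username=username, password=password):
    fU.unbundleZipfile(zipFilePath, dirPath=basePath)
    # the bundle contains "protein.fasta", which the method above copies to a .fa file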
Example #18
0
class MarshalUtil(object):
    """Wrapper for serialization and deserialization methods."""
    def __init__(self, **kwargs):
        self.__workPath = kwargs.get("workPath", ".")
        self.__workDirSuffix = kwargs.get("workDirSuffix", "marshall_")
        self.__workDirPrefix = kwargs.get("workDirPrefix", "_tempdir")
        #
        self.__fileU = FileUtil(workPath=self.__workPath)
        self.__ioU = IoUtil()

    def doExport(self,
                 locator,
                 obj,
                 fmt="list",
                 marshalHelper=None,
                 numParts=None,
                 **kwargs):
        """Serialize the input object at locator path in specified format.  The
        input object is optionally preprocessed by the helper method.

        Args:
            locator (str): target path or URI
            obj (object): data to be serialized
            fmt (str, optional): format for serialization (mmcif, tdd, csv, list). Defaults to "list".
            marshalHelper (method, optional): pre-processor method applied to input data object. Defaults to None.
            numParts (int, optional): serialize the data in parts. Defaults to None. (json and pickle formats)
        Returns:
            bool: True for success or False otherwise
        """
        try:
            ret = False
            localFlag = self.__fileU.isLocal(locator)
            if marshalHelper:
                myObj = marshalHelper(obj, **kwargs)
            else:
                myObj = obj
            #
            if localFlag and numParts and fmt in ["json", "pickle"]:
                localFilePath = self.__fileU.getFilePath(locator)
                ret = self.__ioU.serializeInParts(localFilePath,
                                                  myObj,
                                                  numParts,
                                                  fmt=fmt,
                                                  **kwargs)
            elif localFlag:
                localFilePath = self.__fileU.getFilePath(locator)
                ret = self.__ioU.serialize(localFilePath,
                                           myObj,
                                           fmt=fmt,
                                           workPath=self.__workPath,
                                           **kwargs)
            else:
                with tempfile.TemporaryDirectory(
                        suffix=self.__workDirSuffix,
                        prefix=self.__workDirPrefix,
                        dir=self.__workPath) as tmpDirName:
                    # write a local copy then copy to destination -
                    #
                    localFilePath = os.path.join(
                        self.__workPath, tmpDirName,
                        self.__fileU.getFileName(locator))
                    ok1 = self.__ioU.serialize(localFilePath,
                                               myObj,
                                               fmt=fmt,
                                               workPath=self.__workPath,
                                               **kwargs)
                    ok2 = True
                    if ok1:
                        ok2 = self.__fileU.put(localFilePath, locator,
                                               **kwargs)
                ret = ok1 and ok2
        except Exception as e:
            logger.exception("Exporting locator %r failing with %s", locator,
                             str(e))

        return ret

    def doImport(self,
                 locator,
                 fmt="list",
                 marshalHelper=None,
                 numParts=None,
                 **kwargs):
        """Deserialize data at the target locator in specified format. The deserialized
        data is optionally post-processed by the input helper method.

        Args:
            locator (str): path or URI to input data
            fmt (str, optional): format for deserialization (mmcif, tdd, csv, list). Defaults to "list".
            marshalHelper (method, optional): post-processor method applied to deserialized data object. Defaults to None.
            numParts (int, optional): deserialize the data in parts. Defaults to None. (json and pickle formats)
            tarMember (str, optional): name of a member of tar file bundle. Defaults to None. (tar file format)

        Returns:
            Any: format specific return type
        """
        try:
            tarMember = kwargs.get("tarMember", None)
            localFlag = self.__fileU.isLocal(locator) and not tarMember
            #
            if localFlag and numParts and fmt in ["json", "pickle"]:
                filePath = self.__fileU.getFilePath(locator)
                ret = self.__ioU.deserializeInParts(filePath,
                                                    numParts,
                                                    fmt=fmt,
                                                    **kwargs)
            elif localFlag:
                filePath = self.__fileU.getFilePath(locator)
                ret = self.__ioU.deserialize(filePath,
                                             fmt=fmt,
                                             workPath=self.__workPath,
                                             **kwargs)
            else:
                #
                if fmt == "mmcif":
                    ret = self.__ioU.deserialize(locator,
                                                 fmt=fmt,
                                                 workPath=self.__workPath,
                                                 **kwargs)
                else:
                    with tempfile.TemporaryDirectory(
                            suffix=self.__workDirSuffix,
                            prefix=self.__workDirPrefix,
                            dir=self.__workPath) as tmpDirName:
                        #
                        # Fetch first then read a local copy -
                        #
                        if tarMember:
                            localFilePath = os.path.join(
                                self.__workPath, tmpDirName, tarMember)
                        else:
                            localFilePath = os.path.join(
                                self.__workPath, tmpDirName,
                                self.__fileU.getFileName(locator))

                        # ---  Local copy approach ---
                        self.__fileU.get(locator, localFilePath, **kwargs)
                        ret = self.__ioU.deserialize(localFilePath,
                                                     fmt=fmt,
                                                     workPath=self.__workPath,
                                                     **kwargs)

            if marshalHelper:
                ret = marshalHelper(ret, **kwargs)
        except Exception as e:
            logger.exception("Importing locator %r failing with %s", locator,
                             str(e))
            ret = None
        return ret

    def exists(self, filePath, mode=os.R_OK):
        return self.__fileU.exists(filePath, mode=mode)

    def mkdir(self, dirPath, mode=0o755):
        return self.__fileU.mkdir(dirPath, mode=mode)

    def remove(self, pth):
        return self.__fileU.remove(pth)
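
A minimal round-trip sketch for the wrapper above, exporting and re-importing a small JSON object; the file paths are arbitrary examples.

mU = MarshalUtil(workPath="./CACHE")
obj = {"idList": ["1abc", "2xyz"], "count": 2}
ok = mU.doExport("./CACHE/example-index.json", obj, fmt="json", indent=3)
objR = mU.doImport("./CACHE/example-index.json", fmt="json")
assert ok and objR == obj
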
Example #19
0
    def __rebuildCache(self,
                       targetUrl,
                       mapNameL,
                       outDirPath,
                       rawDirPath,
                       fmt="pickle",
                       useCache=True):
        """Fetch the UniProt selected id mapping resource file and extract
        UniProt Acc to  'mapIndex' mapping. Serialize the mapping as required.

        Args:
            targetUrl (str): source URL of the remote index file
            mapNameL (list): list of key mapping names to extract from the index
            outDirPath (str): directory path for the processed mapping file
            rawDirPath (str): directory path for the raw downloaded mapping file
            fmt (str, optional): output format (pickle|json) . Defaults to "pickle".
            useCache (bool, optional): use cached files. Defaults to True.

        Returns:
            (list, dict): list of extracted mapping names, and dictionary od[uniprotId] = mapped value(s)

                idmapping_selected.tab

                1. UniProtKB-AC
                2. UniProtKB-ID
                3. GeneID (EntrezGene)
                4. RefSeq
                5. GI
                6. PDB
                7. GO
                8. UniRef100
                9. UniRef90
                10. UniRef50
                11. UniParc
                12. PIR
                13. NCBI-taxon
                14. MIM
                15. UniGene
                16. PubMed
                17. EMBL
                18. EMBL-CDS
                19. Ensembl
                20. Ensembl_TRS
                21. Ensembl_PRO
                22. Additional PubMed

        """
        startTime = time.time()
        nL = mapNameL
        oD = {}
        try:
            fileU = FileUtil()
            fExt = "pic" if fmt == "pickle" else "json"
            fExt = "tdd" if fmt == "tdd" else fExt
            fN, _ = os.path.splitext(fileU.getFileName(targetUrl))
            mapFileName = fN + "-map." + fExt
            idMapPath = os.path.join(outDirPath, mapFileName)
            mU = MarshalUtil()
            if useCache and mU.exists(idMapPath):
                logger.info("Reading cached serialized file %r", idMapPath)
                if fmt in ["pickle", "json"]:
                    tD = mU.doImport(idMapPath, fmt=fmt)
                    nL = list(set(tD["idNameList"]))
                    oD = tD["uniprotMapD"]
                    logger.info("keys %r", list(oD.keys())[:10])
                    logger.info("nL %r", nL)
                    ok = True
                elif fmt == "tdd":
                    ioU = IoUtil()
                    it = ioU.deserializeCsvIter(idMapPath,
                                                delimiter="\t",
                                                rowFormat="list",
                                                encodingErrors="ignore")
                    tL = next(it, [])
                    nL = tL[1:]
                    if len(nL) == 1:
                        for row in it:
                            oD[row[0]] = row[1]
                    else:
                        for row in it:
                            oD[row[0]] = row[1:]
                    ok = True
            else:
                idPath = os.path.join(rawDirPath, fileU.getFileName(targetUrl))
                if not fileU.exists(idPath):
                    logger.info(
                        "Fetching selected UniProt idmapping data from %r in %r",
                        targetUrl, outDirPath)
                    ok = fileU.get(targetUrl, idPath)
                    if not ok:
                        logger.error("Failed to downlowd %r", targetUrl)
                        return oD
                else:
                    logger.info("Using cached mapping file %r", idPath)
                # ---
                ioU = IoUtil()
                if fmt in ["pickle", "json"]:
                    if len(mapNameL) == 1:
                        for row in ioU.deserializeCsvIter(
                                idPath,
                                delimiter="\t",
                                rowFormat="list",
                                encodingErrors="ignore"):
                            oD[row[0]] = str(
                                row[self.__mapRecordD[mapNameL[0]] - 1])
                    else:
                        for row in ioU.deserializeCsvIter(
                                idPath,
                                delimiter="\t",
                                rowFormat="list",
                                encodingErrors="ignore"):
                            for mapName in mapNameL:
                                oD.setdefault(row[0], []).append(
                                    str(row[self.__mapRecordD[mapName] - 1]))
                    logger.info("Writing serialized mapping file %r",
                                idMapPath)
                    ok = mU.doExport(idMapPath, {
                        "idNameList": mapNameL,
                        "uniprotMapD": oD
                    },
                                     fmt=fmt)
                elif fmt == "tdd":
                    #
                    logger.info("Writing serialized mapping file %r",
                                idMapPath)
                    fU = FileUtil()
                    fU.mkdirForFile(idMapPath)
                    colNameL = []
                    colNameL.append("UniProtId")
                    colNameL.extend(mapNameL)
                    with open(idMapPath, "w", encoding="utf-8") as ofh:
                        ofh.write("%s\n" % "\t".join(colNameL))
                        if len(mapNameL) == 1:
                            idx = self.__mapRecordD[mapNameL[0]] - 1
                            for row in ioU.deserializeCsvIter(
                                    idPath,
                                    delimiter="\t",
                                    rowFormat="list",
                                    encodingErrors="ignore"):
                                ofh.write("%s\t%s\n" % (row[0], row[idx]))
                        else:
                            idxL = [0]
                            idxL.extend([
                                self.__mapRecordD[mapName] - 1
                                for mapName in mapNameL
                            ])
                            for row in ioU.deserializeCsvIter(
                                    idPath,
                                    delimiter="\t",
                                    rowFormat="list",
                                    encodingErrors="ignore"):
                                ofh.write(
                                    "%s\n" %
                                    "\t".join([str(row[idx]) for idx in idxL]))
                            #
                    nL, oD = self.__rebuildCache(targetUrl,
                                                 mapNameL,
                                                 outDirPath,
                                                 rawDirPath,
                                                 fmt=fmt,
                                                 useCache=True)
                    ok = True if nL and oD else False
            logger.info("Completed reload (%r) at %s (%.4f seconds)", ok,
                        time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                        time.time() - startTime)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        return nL, oD
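
The method above resolves each requested mapping name to a 1-based column position through self.__mapRecordD, which is not shown in this excerpt. The dictionary below is an assumed reconstruction from the idmapping_selected.tab column list in the docstring, included only to illustrate the indexing convention.

# Assumed shape of self.__mapRecordD (1-based positions from the docstring column list).
mapRecordD = {
    "UniProtKB-AC": 1,
    "UniProtKB-ID": 2,
    "GeneID (EntrezGene)": 3,
    "RefSeq": 4,
    "PDB": 6,
    "NCBI-taxon": 13,
    "Ensembl": 19,
}
# __rebuildCache() then selects row[mapRecordD[mapName] - 1] for each requested mapName.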
Example #20
0
class SchemaProvider(SingletonClass):
    """ A collection of schema build and caching methods.

        Static cache workflow:

            <authoritative source>  <--   <cache dir>  <-  client API

        Compute workflow:

        <dependent resource files, config file, dictionaries> -> [schema builder] --> <schema def> --> <Json schema>

    """

    def __init__(self, cfgOb, cachePath, useCache=True, rebuildFlag=False, **kwargs):
        """A collection of schema build and caching methods.

        Args:
            cfgOb (object): ConfigInfo() instance
            cachePath (str): path to directory containing schema
            useCache (bool, optional): use cached schema. Defaults to True.
            rebuildFlag (bool, optional): rebuild the schema on-the-fly and cache it. Defaults to False.
        """

        self.__cfgOb = cfgOb
        self.__configName = self.__cfgOb.getDefaultSectionName()
        self.__cachePath = os.path.abspath(cachePath)
        self.__useCache = useCache
        self.__rebuildFlag = rebuildFlag
        self.__useCache = rebuildFlag if rebuildFlag else useCache
        #
        self.__workPath = os.path.join(self.__cachePath, "work")

        self.__fileU = FileUtil(workPath=os.path.join(self.__cachePath, "work"))
        self.__schemaCachePath = os.path.join(self.__cachePath, self.__cfgOb.get("SCHEMA_DEFINITION_CACHE_DIR", sectionName=self.__configName))
        self.__jsonSchemaCachePath = os.path.join(self.__cachePath, self.__cfgOb.get("JSON_SCHEMA_DEFINITION_CACHE_DIR", sectionName=self.__configName))
        self.__fileU.mkdir(self.__schemaCachePath)
        self.__fileU.mkdir(self.__jsonSchemaCachePath)
        self.__kwargs = kwargs

    def getSchemaOptions(self, schemaLevel, extraOpts=None):
        opts = extraOpts + "|" if extraOpts else ""
        if schemaLevel == "full":
            return opts + "mandatoryKeys|mandatoryAttributes|bounds|enums|rcsb"
        elif schemaLevel in ["min", "minimum"]:
            return opts + "mandatoryKeys|enums|rcsb"
        else:
            return opts

    def getSchemaInfo(self, databaseName, dataTyping="ANY"):
        """Convenience method to return essential schema details for the input repository content type.

        Args:
            databaseName (str): schema name  (e.g. pdbx, bird, chem_comp, ...)
            dataTyping (str, optional): Application name for the target schema (e.g. ANY, SQL, ...)

        Returns:
            tuple: SchemaDefAccess() object, target database name, target collection name list, and document index dictionary

        """
        sd = None
        dbName = None
        collectionNameList = []
        docIndexD = {}
        try:
            mU = MarshalUtil(workPath=self.__workPath)
            schemaLocator = self.__getSchemaDefLocator(databaseName, dataTyping=dataTyping)
            if self.__rebuildFlag:
                filePath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaLocator))
                self.makeSchemaDef(databaseName, dataTyping=dataTyping, saveSchema=True)
            else:
                filePath = self.__reload(schemaLocator, self.__schemaCachePath, useCache=self.__useCache)

            if not filePath:
                logger.error("Unable to recover schema %s (%s)", databaseName, dataTyping)
            logger.debug("ContentType %r dataTyping %r schemaLocator %r", databaseName, dataTyping, schemaLocator)
            schemaDef = mU.doImport(filePath, fmt="json")
            if schemaDef:
                logger.debug("Using cached schema definition for %s application %s", databaseName, dataTyping)
                sd = SchemaDefAccess(schemaDef)
                if sd:
                    dbName = sd.getDatabaseName()
                    collectionInfoList = sd.getCollectionInfo()
                    logger.debug("Schema %s database name %s collections %r", databaseName, dbName, collectionInfoList)
                    for cd in collectionInfoList:
                        collectionName = cd["NAME"]
                        collectionNameList.append(collectionName)
                        docIndexD[collectionName] = sd.getDocumentIndices(collectionName)

        except Exception as e:
            logger.exception("Retreiving schema %s for %s failing with %s", databaseName, dataTyping, str(e))

        return sd, dbName, collectionNameList, docIndexD
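
    # A hedged usage sketch for getSchemaInfo() (not part of the class; construction of the
    # configuration object cfgOb is elided and the database name is an example):
    #
    #   sp = SchemaProvider(cfgOb, cachePath="./CACHE", useCache=True)
    #   sd, dbName, collectionNameList, docIndexD = sp.getSchemaInfo("pdbx_core", dataTyping="ANY")
    #   for collectionName in collectionNameList:
    #       logger.info("collection %s indices %r", collectionName, docIndexD[collectionName])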

    def schemaDefCompare(self, databaseName, dataTyping="ANY"):
        """Compare computed schema defintion with current source/cached version.

        Args:
            databaseName (str): schema definition name for comparison
            dataTyping (str, optional): data type conventions for the schema comparison. Defaults to "ANY".

        Returns:
            (str): file path for schema difference or None
        """
        mU = MarshalUtil(workPath=self.__workPath)
        schemaDiffPath = os.path.join(self.__cachePath, "schema_diff")
        mU.mkdir(schemaDiffPath)
        schemaPath = self.__getSchemaDefLocator(databaseName, dataTyping=dataTyping)
        fn = self.__fileU.getFileName(schemaPath)
        sD = self.makeSchemaDef(databaseName, dataTyping=dataTyping)
        v2 = sD["DATABASE_VERSION"]
        # ----
        # tPath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaPath) + "-test")
        # logger.info("Exporting schema def to %s", tPath)
        # mU.doExport(tPath, sD, fmt="json", indent=3)
        # sD = mU.doImport(tPath, fmt="json")
        # ----
        cPath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaPath))
        sDCache = mU.doImport(cPath, fmt="json")
        v1 = sDCache["DATABASE_VERSION"]
        #
        numDiff, difD = self.schemaCompare(sDCache, sD)
        #
        # jD = diff(sDCache, sD, syntax="explicit", marshal=True)
        diffPath = None
        if numDiff:
            bn, _ = os.path.splitext(fn)
            diffPath = os.path.join(schemaDiffPath, bn + "-" + v1 + "-" + v2 + "-diff.json")
            # logger.info("diff for %s %s = \n%s", databaseName, dataTyping, pprint.pformat(difD, indent=3, width=100))
            mU.doExport(diffPath, difD, fmt="json", indent=3)
        #
        return diffPath

    def jsonSchemaCompare(self, databaseName, collectionName, encodingType, level, extraOpts=None):
        """Compare computed JSON schema defintion with current source/cached version.

        Args:
            databaseName (str): schema name
            collectionName (str): collection name
            encodingType (str): schema data type conventions (JSON|BSON)
            level (str): metadata level (min|full)
            extraOpts (str): extra schema construction options

        Returns:
            (str): path to the difference file or None
        """
        mU = MarshalUtil(workPath=self.__workPath)
        schemaDiffPath = os.path.join(self.__cachePath, "schema_diff")
        mU.mkdir(schemaDiffPath)
        schemaLocator = self.__getJsonSchemaLocator(databaseName, collectionName, encodingType, level)
        fn = self.__fileU.getFileName(schemaLocator)
        schemaPath = os.path.join(self.__jsonSchemaCachePath, fn)
        #
        sD = self.makeSchema(databaseName, collectionName, encodingType=encodingType, level=level, saveSchema=False, extraOpts=extraOpts)
        v2 = self.__getSchemaVersion(sD)
        # ----
        # tPath = os.path.join(self.__jsonSchemaCachePath, self.__fileU.getFileName(schemaPath) + "-test")
        # logger.info("Exporting json schema to %s", tPath)
        # mU.doExport(tPath, sD, fmt="json", indent=3)
        # ----
        #
        sDCache = mU.doImport(schemaPath, fmt="json")
        v1 = self.__getSchemaVersion(sDCache)
        if not v1:
            logger.error("no version for %s - %s %s", schemaLocator, databaseName, collectionName)
        #
        numDiff, difD = self.schemaCompare(sDCache, sD)
        # jD = diff(sDCache, sD, marshal=True, syntax="explicit")
        diffPath = None
        if numDiff:
            logger.debug("diff for %s %s %s %s = \n%s", databaseName, collectionName, encodingType, level, pprint.pformat(difD, indent=3, width=100))
            bn, _ = os.path.splitext(fn)
            diffPath = os.path.join(schemaDiffPath, bn + "-" + v1 + "-" + v2 + "-diff.json")
            mU.doExport(diffPath, difD, fmt="json", indent=3)

        return diffPath

    def __getSchemaVersion(self, jsonSchema):
        try:
            comment = jsonSchema["$comment"] if "$comment" in jsonSchema else ""
            ff = comment.split(":")
            version = ff[1].strip()
            return version
        except Exception as e:
            logger.exception("Failing for with %s", str(e))
        return ""

    def __getSchemaDefLocator(self, databaseName, dataTyping="ANY"):
        """Internal method returning schema definition path for the input content type and application.
           Defines schema definition naming convention -

           Args:
            databaseName (str): schema name (e.g. pdbx, bird, chem_comp, ...)
            dataTyping (str, optional): Application name for the target schema (e.g. ANY, SQL, ...)

            Returns:

             str: schema definition file locator

        """
        schemaLocator = None
        try:
            locPath = self.__cfgOb.get("SCHEMA_DEFINITION_LOCATOR_PATH", sectionName=self.__configName)
            fn = "schema_def-%s-%s.json" % (databaseName, dataTyping.upper())
            schemaLocator = os.path.join(locPath, fn)
        except Exception as e:
            logger.exception("Retreiving schema definition path %s for %s failing with %s", databaseName, dataTyping, str(e))
        return schemaLocator

    def __getJsonSchemaLocator(self, databaseName, collectionName, encodingType="BSON", level="full"):
        """Internal method returning JSON schema path for the input collection data type convention and level.
           Defines the JSON/BSON schema naming convention -

           Args:
            databaseName (str): database name in the document store
            collectionName (str): collection name in document store
            encodingType (str, optional): data type convention (BSON|JSON)
            level (str, optional): Completeness of the schema (e.g. min or full)

            Returns:

            str: schema file locator

        """
        schemaLocator = None
        try:
            sdType = None
            sLevel = None
            schemaLocator = None
            if encodingType.upper() in ["JSON", "BSON"]:
                sdType = encodingType.lower()
            if level.lower() in ["min", "minimun"]:
                sLevel = "min"
            elif level.lower() in ["full"]:
                sLevel = level.lower()
            #
            if sdType and sLevel:
                locPath = self.__cfgOb.get("JSON_SCHEMA_DEFINITION_LOCATOR_PATH", sectionName=self.__configName)
                fn = "%s-%s-db-%s-col-%s.json" % (sdType, sLevel, databaseName, collectionName)
                schemaLocator = os.path.join(locPath, fn)
            else:
                logger.error("Unsupported schema options:  %s level %r type %r", collectionName, level, encodingType)
                schemaLocator = None
        except Exception as e:
            logger.debug("Retreiving JSON schema definition for %s type %s failing with %s", collectionName, encodingType, str(e))
        #
        return schemaLocator

    def __reload(self, locator, dirPath, useCache=True):
        #
        fn = self.__fileU.getFileName(locator)
        filePath = os.path.join(dirPath, fn)
        logger.debug("Target cache filePath %s", filePath)
        self.__fileU.mkdir(dirPath)
        if not useCache:
            try:
                os.remove(filePath)
            except Exception:
                pass
        #
        if useCache and self.__fileU.exists(filePath):
            ok = True
        else:
            logger.info("Fetch data from source %s to %s", locator, filePath)
            ok = self.__fileU.get(locator, filePath)

        return filePath if ok else None

    def getJsonSchema(self, databaseName, collectionName, encodingType="BSON", level="full", extraOpts=None):
        """Return JSON schema (w/ BSON types) object for the input collection and level.and

        Args:
            databaseName (str): database name
            collectionName (str): collection name in document store
            encodingType (str, optional): data type convention (BSON|JSON)
            level (str, optional): Completeness of the schema (e.g. min or full)

        Returns:
            dict: Schema object

        """
        sObj = None
        schemaLocator = self.__getJsonSchemaLocator(databaseName, collectionName, encodingType=encodingType, level=level)
        #
        if self.__rebuildFlag:
            # Rebuild the schema, save it into the JSON schema cache, and import it from there
            filePath = os.path.join(self.__jsonSchemaCachePath, self.__fileU.getFileName(schemaLocator))
            self.makeSchema(databaseName, collectionName, encodingType=encodingType, level=level, saveSchema=True, extraOpts=extraOpts)
        else:
            filePath = self.__reload(schemaLocator, self.__jsonSchemaCachePath, useCache=self.__useCache)
        mU = MarshalUtil(workPath=self.__workPath)
        if filePath and mU.exists(filePath):
            sObj = mU.doImport(filePath, fmt="json")
        else:
            logger.debug("Failed to read schema for %s %r", collectionName, level)
        return sObj

    def makeSchema(self, databaseName, collectionName, encodingType="BSON", level="full", saveSchema=False, extraOpts=None):
        """Build the JSON/BSON schema object for the input collection, optionally saving it in the JSON schema cache."""
        cD = None
        try:
            smb = SchemaDefBuild(databaseName, self.__cfgOb, cachePath=self.__cachePath)
            stU = encodingType.upper()
            cD = smb.build(collectionName, dataTyping=stU, encodingType=stU, enforceOpts=self.getSchemaOptions(level, extraOpts=extraOpts))
            if cD and saveSchema:
                schemaLocator = self.__getJsonSchemaLocator(databaseName, collectionName, encodingType=encodingType, level=level)
                localPath = os.path.join(self.__jsonSchemaCachePath, self.__fileU.getFileName(schemaLocator))
                mU = MarshalUtil(workPath=self.__workPath)
                mU.doExport(localPath, cD, fmt="json", indent=3, enforceAscii=False)
        except Exception as e:
            logger.exception("Building schema %s collection %s failing with %s", databaseName, collectionName, str(e))
        return cD

    def makeSchemaDef(self, databaseName, dataTyping="ANY", saveSchema=False):
        """Build the schema definition object for the input database, optionally saving it in the schema definition cache."""
        schemaDef = None
        try:
            smb = SchemaDefBuild(databaseName, self.__cfgOb, cachePath=self.__cachePath)
            schemaDef = smb.build(dataTyping=dataTyping, encodingType="rcsb")
            if schemaDef and saveSchema:
                schemaLocator = self.__getSchemaDefLocator(databaseName, dataTyping=dataTyping)
                localPath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaLocator))
                mU = MarshalUtil(workPath=self.__workPath)
                mU.doExport(localPath, schemaDef, fmt="json", indent=3, enforceAscii=False)
        except Exception as e:
            logger.exception("Building schema %s failing with %s", databaseName, str(e))
        return schemaDef

    def schemaCompare(self, orgD, newD):
        """Compute the difference of nested dictionaries.

        Returns:
            (int, dict): difference count and a dictionary of added, removed, and changed items
        """
        fOrgD = self.__flatten(orgD)
        fNewD = self.__flatten(newD)
        if len(fOrgD) != len(fNewD):
            logger.debug("Schema lengths differ: org %d new %d", len(fOrgD), len(fNewD))
        #
        addedD = {k: fNewD[k] for k in set(fNewD) - set(fOrgD)}
        removedD = {k: fOrgD[k] for k in set(fOrgD) - set(fNewD)}
        changedOrgD = {k: fOrgD[k] for k in set(fOrgD) & set(fNewD) if fOrgD[k] != fNewD[k]}
        changedNewD = {k: fNewD[k] for k in set(fOrgD) & set(fNewD) if fOrgD[k] != fNewD[k]}
        chD = {}
        for ky in changedOrgD:
            kyS = ".".join(ky)
            vOrg = changedOrgD[ky]
            vNew = changedNewD[ky]
            if isinstance(vOrg, (list, tuple)) and isinstance(vNew, (list, tuple)):
                # logger.info(" >> %r vOrg %r vNew %r", ky, vOrg, vNew)
                dV = list(set(vNew) - set(vOrg))
                if dV:
                    chD[kyS] = {"diff": dV}
            else:
                chD[kyS] = {"from": vOrg, "to": vNew}
        #
        nT = len(addedD) + len(removedD) + len(chD)
        diffD = {"added": [".".join(kk) for kk in addedD.keys()], "removed": [".".join(kk) for kk in removedD.keys()], "changed": chD}
        return nT, diffD
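    # Illustrative behavior of schemaCompare() on small hypothetical inputs (not from the source data):
    #   schemaCompare({"a": 1, "b": {"c": 2}}, {"a": 1, "b": {"c": 3}, "d": 4})
    #   returns (2, {"added": ["d"], "removed": [], "changed": {"b.c": {"from": 2, "to": 3}}})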

    def __flatten(self, inpDict, prefix=None):
        """Flatten a nested dictionary into a dictionary keyed by tuples of the nested key path."""
        prefix = prefix[:] if prefix else []
        outDict = {}
        for key, value in inpDict.items():
            if isinstance(value, dict) and value:
                deeper = self.__flatten(value, prefix + [key])
                outDict.update({tuple(key2): val2 for key2, val2 in deeper.items()})
            elif isinstance(value, (list, tuple)) and value:
                for index, sublist in enumerate(value, start=1):
                    if isinstance(sublist, dict) and sublist:
                        deeper = self.__flatten(sublist, prefix + [key] + [str(index)])
                        outDict.update({tuple(key2): val2 for key2, val2 in deeper.items()})
                    else:
                        outDict[tuple(prefix + [key] + [str(index)])] = value
            else:
                outDict[tuple(prefix + [key])] = value
        return outDict
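    # Illustrative behavior of __flatten() on a hypothetical input: {"a": {"b": 1}, "c": [5, 6]} flattens to
    #   {("a", "b"): 1, ("c", "1"): [5, 6], ("c", "2"): [5, 6]} -- list elements are keyed by their 1-based
    #   index while the full list value is stored at each index key.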

    def __flattenX(self, inpDict, prefix=None):
        """Variant of __flatten() that descends into list values only (tuples are treated as terminal values)."""
        prefix = prefix[:] if prefix else []
        # separator = "."
        outDict = {}
        for key, value in inpDict.items():
            if isinstance(value, dict) and value:
                deeper = self.__flatten(value, prefix + [key])
                outDict.update({tuple(key2): val2 for key2, val2 in deeper.items()})
            elif isinstance(value, list) and value:
                for index, sublist in enumerate(value, start=1):
                    if isinstance(sublist, dict) and sublist:
                        deeper = self.__flatten(sublist, prefix + [key] + [str(index)])
                        outDict.update({tuple(key2): val2 for key2, val2 in deeper.items()})
                    else:
                        outDict[tuple(prefix + [key] + [str(index)])] = value
            else:
                outDict[tuple(prefix + [key])] = value
        return outDict

    def __flattenOrg(self, inpDict, separator=".", prefix=""):
        """Flatten a nested dictionary into a dictionary keyed by separator-joined key path strings."""
        outDict = {}
        for key, value in inpDict.items():
            if isinstance(value, dict) and value:
                deeper = self.__flattenOrg(value, separator, prefix + key + separator)
                outDict.update({key2: val2 for key2, val2 in deeper.items()})
            elif isinstance(value, list) and value:
                for index, sublist in enumerate(value, start=1):
                    if isinstance(sublist, dict) and sublist:
                        deeper = self.__flattenOrg(sublist, separator, prefix + key + separator + str(index) + separator)
                        outDict.update({key2: val2 for key2, val2 in deeper.items()})
                    else:
                        outDict[prefix + key + separator + str(index)] = value
            else:
                outDict[prefix + key] = value
        return outDict

    def __dictGen(self, indict, pre=None):
        """Generator that walks a nested dictionary and yields [key path ..., value] lists for terminal values (non-dict inputs are yielded unchanged)."""
        pre = pre[:] if pre else []
        if isinstance(indict, dict):
            for key, value in indict.items():
                if isinstance(value, dict):
                    for dD in self.__dictGen(value, pre + [key]):
                        yield dD
                elif isinstance(value, (list, tuple)):
                    for v in value:
                        for dD in self.__dictGen(v, pre + [key]):
                            yield dD
                else:
                    yield pre + [key, value]
        else:
            yield indict
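
# --- Illustrative sketch (not part of the class above) ---
# The schemaCompare()/__flatten() methods above reduce nested schema documents to flat,
# tuple-keyed dictionaries and then set-difference the keys. A minimal self-contained
# version of the same idea is sketched below; all names are hypothetical and this is
# not the class's own implementation.


def _flatten(d, prefix=()):
    # Recursively flatten a nested dict into {key-path tuple: terminal value}
    out = {}
    for k, v in d.items():
        if isinstance(v, dict) and v:
            out.update(_flatten(v, prefix + (k,)))
        else:
            out[prefix + (k,)] = v
    return out


def _compare(orgD, newD):
    # Set-difference the flattened key paths to classify added/removed/changed items
    fOrg, fNew = _flatten(orgD), _flatten(newD)
    added = sorted(".".join(k) for k in set(fNew) - set(fOrg))
    removed = sorted(".".join(k) for k in set(fOrg) - set(fNew))
    changed = {".".join(k): {"from": fOrg[k], "to": fNew[k]} for k in set(fOrg) & set(fNew) if fOrg[k] != fNew[k]}
    return len(added) + len(removed) + len(changed), {"added": added, "removed": removed, "changed": changed}


if __name__ == "__main__":
    oldD = {"properties": {"id": {"type": "string"}}}
    newD = {"properties": {"id": {"type": "string"}, "title": {"type": "string"}}}
    print(_compare(oldD, newD))
    # -> (1, {'added': ['properties.title.type'], 'removed': [], 'changed': {}})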
Example #21
0
class ProvenanceProvider(SingletonClass):
    """Utilities to access and update provenance details."""
    def __init__(self, cfgOb, cachePath, useCache=True, **kwargs):
        """Utilities to access and update provenance details.

        Args:
            cfgOb (obj): ConfigInfo() instance
            cachePath (str): path to the top-level cache directory
            useCache (bool, optional): use cached provenance data. Defaults to True.
        """

        self.__cfgOb = cfgOb
        self.__configName = self.__cfgOb.getDefaultSectionName()
        self.__cachePath = cachePath
        self.__useCache = useCache
        #
        self.__workPath = os.path.join(self.__cachePath, "work")
        self.__provenanceCachePath = os.path.join(
            self.__cachePath,
            self.__cfgOb.get("PROVENANCE_INFO_CACHE_DIR",
                             sectionName=self.__configName))
        self.__provenanceLocator = self.__cfgOb.getPath(
            "PROVENANCE_INFO_LOCATOR", sectionName=self.__configName)
        #
        self.__fileU = FileUtil(workPath=self.__workPath)
        self.__fileU.mkdir(self.__provenanceCachePath)
        self.__kwargs = kwargs
        #

    def __reload(self, locator, dirPath, useCache=True):
        #
        fn = self.__fileU.getFileName(locator)
        filePath = os.path.join(dirPath, fn)
        logger.debug("Using cache path %s", dirPath)
        self.__fileU.mkdir(dirPath)
        if not useCache:
            try:
                os.remove(filePath)
            except Exception:
                pass
        #
        if useCache and self.__fileU.exists(filePath):
            ok = True
        else:
            logger.debug("Fetch data from source %s", locator)
            ok = self.__fileU.get(locator, filePath)

        return filePath if ok else None

    def fetch(self):
        try:
            provenanceFileCachePath = self.__reload(self.__provenanceLocator,
                                                    self.__provenanceCachePath,
                                                    useCache=self.__useCache)
            mU = MarshalUtil(workPath=self.__workPath)
            return mU.doImport(provenanceFileCachePath, fmt="json")
        except Exception as e:
            logger.exception("Failed retreiving provenance with %s", str(e))
        return {}

    def update(self, provD):
        ok = False
        try:
            provenanceFileCachePath = self.__reload(self.__provenanceLocator,
                                                    self.__provenanceCachePath,
                                                    useCache=self.__useCache)
            mU = MarshalUtil(workPath=self.__workPath)
            tD = mU.doImport(provenanceFileCachePath, fmt="json")
            tD.update(provD)
            ok = mU.doExport(provenanceFileCachePath, tD, fmt="json")
        except Exception as e:
            logger.exception("Failed updating provenance with %s", str(e))
        return ok

    def store(self, provD):
        ok = False
        try:
            provenanceFileCachePath = self.__reload(self.__provenanceLocator,
                                                    self.__provenanceCachePath,
                                                    useCache=self.__useCache)
            mU = MarshalUtil(workPath=self.__workPath)
            ok = mU.doExport(provenanceFileCachePath, provD, fmt="json")
        except Exception as e:
            logger.exception("Failed storing provenance with %s", str(e))
        return ok
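
# --- Illustrative usage sketch (assumptions noted below) ---
# ProvenanceProvider only needs a configuration object that answers getDefaultSectionName(),
# get("PROVENANCE_INFO_CACHE_DIR", ...) and getPath("PROVENANCE_INFO_LOCATOR", ...).
# The stub and the file paths below are hypothetical placeholders; a real deployment would
# pass the project's ConfigInfo()-style configuration object instead.


class _StubConfig:
    def getDefaultSectionName(self):
        return "site_info"

    def get(self, itemName, sectionName=None):
        # Relative directory (under cachePath) holding cached provenance data
        return {"PROVENANCE_INFO_CACHE_DIR": "provenance"}[itemName]

    def getPath(self, itemName, sectionName=None):
        # Source locator for the provenance JSON document (placeholder path)
        return {"PROVENANCE_INFO_LOCATOR": "./reference/provenance.json"}[itemName]


pP = ProvenanceProvider(_StubConfig(), cachePath="./CACHE", useCache=True)
provD = pP.fetch()                                                # import the cached provenance dictionary ({} if unavailable)
ok = pP.update({"example_key": {"note": "hypothetical record"}})  # merge into the cached dictionary and re-export it
ok = pP.store(provD)                                              # overwrite the cached dictionary with provD
# Note: with the placeholder locator above these calls will simply log errors and return
# falsy values until a real provenance source is configured.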