def __reload(self, dirPath, useCache, fmt):
    startTime = time.time()
    fD = {}
    ok = False
    cofactorPath = self.__getCofactorDataPath(fmt=fmt)
    #
    logger.info("useCache %r featurePath %r", useCache, cofactorPath)
    if useCache and self.__mU.exists(cofactorPath):
        fD = self.__mU.doImport(cofactorPath, fmt=fmt)
    else:
        fU = FileUtil()
        fU.mkdir(dirPath)
    # ---
    logger.info("Completed reload (%r) at %s (%.4f seconds)", ok, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime)
    return fD
def __fetchUrl(self, urlTarget, dirPath, useCache=False):
    fU = FileUtil()
    fn = fU.getFileName(urlTarget)
    filePath = os.path.join(dirPath, fn)
    if not (useCache and fU.exists(filePath)):
        startTime = time.time()
        ok2 = fU.get(urlTarget, filePath)
        endTime = time.time()
        if ok2:
            logger.info("Fetched %s for resource file %s (status = %r) (%.4f seconds)", urlTarget, filePath, ok2, endTime - startTime)
        else:
            logger.error("Failing fetch for %s for resource file %s (status = %r) (%.4f seconds)", urlTarget, filePath, ok2, endTime - startTime)
    #
    return filePath
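# A minimal standalone sketch of the cache-or-fetch pattern used by __fetchUrl()
# above, written against the standard library only (urllib instead of FileUtil).
# The function name and the placeholder URL/paths are illustrative, not part of
# the original module.
import logging
import os
import time
from urllib.request import urlretrieve

logger = logging.getLogger(__name__)


def fetchUrlSketch(urlTarget, dirPath, useCache=False):
    filePath = os.path.join(dirPath, os.path.basename(urlTarget))
    if not (useCache and os.path.exists(filePath)):
        os.makedirs(dirPath, exist_ok=True)
        startTime = time.time()
        try:
            urlretrieve(urlTarget, filePath)
            logger.info("Fetched %s (%.4f seconds)", urlTarget, time.time() - startTime)
        except Exception as e:
            logger.error("Failing fetch for %s with %s", urlTarget, str(e))
    return filePath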
def __reload(self, dirPath, useCache):
    startTime = time.time()
    aD = {}
    fU = FileUtil()
    fU.mkdir(dirPath)
    targetMechanismFilePath = self.getTargetMechanismDataPath()
    #
    if useCache and fU.exists(targetMechanismFilePath):
        logger.info("useCache %r using %r", useCache, targetMechanismFilePath)
        qD = self.__mU.doImport(targetMechanismFilePath, fmt="json")
        aD = qD["mechanism"] if "mechanism" in qD else {}
    #
    logger.info("Completed reload of (%d) at %s (%.4f seconds)", len(aD), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime)
    #
    return aD
def __reload(self, dirPath, useCache):
    startTime = time.time()
    fD = {}
    ok = False
    mappingPath = self.__getMappingDataPath()
    #
    logger.info("useCache %r mappingPath %r", useCache, mappingPath)
    if useCache and self.__mU.exists(mappingPath):
        fD = self.__mU.doImport(mappingPath, fmt="json")
        ok = True
    else:
        fU = FileUtil()
        fU.mkdir(dirPath)
    # ---
    logger.info("Completed reload with status (%r) at %s (%.4f seconds)", ok, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime)
    return fD
def put(self, localPath, remotePath):
    """Put a local file on a remote FTP server.

    Arguments:
        localPath (str): local file path
        remotePath (str): remote file path

    Returns:
        bool: True for success or False otherwise
    """
    try:
        # First, make sure the provided localPath represents a file, not a directory
        if not os.path.isfile(localPath):
            logger.error("put failing for localPath %s - path must be to a specific file, not a directory.", localPath)
            return False
        fileU = FileUtil()
        remotePathDir = fileU.getFilePath(remotePath)
        self.mkdir(remotePathDir)
        # If the provided remotePath already exists and is a directory, put the file on the remote server using the local filename
        # to avoid unintentionally overwriting an entire remote directory with a single file
        if os.path.exists(remotePath) and os.path.isdir(remotePath):
            localFileName = FileUtil().getFileName(localPath)
            remoteFilePath = os.path.join(remotePath, localFileName)
        else:
            remoteFilePath = remotePath
        with open(localPath, "rb") as lFP:
            self.__ftpClient.storbinary("STOR %s" % remoteFilePath, lFP)
        if remoteFilePath in self.listdir(remotePathDir):
            return True
        else:
            logger.error("put failing for localPath %s remoteFilePath %s", localPath, remoteFilePath)
            return False
    except Exception as e:
        if self.__raiseExceptions:
            raise e
        else:
            logger.error("put failing for localPath %s remotePath %s with %s", localPath, remotePath, str(e))
            return False
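# Illustration of the ftplib calls that put() wraps above, assuming __ftpClient
# is a standard ftplib.FTP instance. The host, credentials, and paths below are
# placeholders, not values from the original class.
from ftplib import FTP

ftp = FTP("ftp.example.org")
ftp.login("anonymous", "anon@example.org")
with open("local.dat", "rb") as lFP:
    # STOR transfers the local file to the given remote path in binary mode
    ftp.storbinary("STOR /pub/incoming/local.dat", lFP)
ftp.quit()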
def __pharosFixture(self):
    try:
        ok = False
        fU = FileUtil()
        srcPath = os.path.join(self.__dataPath, "Pharos")
        dstPath = os.path.join(self.__cachePath, "Pharos-targets")
        for fn in ["drug_activity", "cmpd_activity", "protein"]:
            inpPath = os.path.join(srcPath, fn + ".tdd.gz")
            outPath = os.path.join(dstPath, fn + ".tdd.gz")
            fU.get(inpPath, outPath)
            fU.uncompress(outPath, outputDir=dstPath)
            fU.remove(outPath)
        fU.put(os.path.join(srcPath, "pharos-readme.txt"), os.path.join(dstPath, "pharos-readme.txt"))
        ok = True
    except Exception as e:
        logger.exception("Failing with %s", str(e))
        ok = False
    return ok
def __reload(self, dirPath, baseVersion, useCache, **kwargs):
    startTime = time.time()
    mU = MarshalUtil(workPath=dirPath)
    chemblDbUrl = kwargs.get("ChEMBLDbUrl", "ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/latest/")
    ok = False
    fU = FileUtil()
    fU.mkdir(dirPath)
    #
    # ChEMBL current version <baseVersion>,...
    # template: chembl_<baseVersion>.fa.gz
    #
    targetFileName = "chembl_" + str(baseVersion) + ".fa.gz"
    mappingFileName = "chembl_uniprot_mapping.txt"
    #
    chemblTargetPath = os.path.join(dirPath, targetFileName)
    chemblMappingPath = os.path.join(dirPath, mappingFileName)
    mappingFilePath = os.path.join(dirPath, "chembl_uniprot_mapping.json")
    #
    mapD = {}
    if useCache and fU.exists(mappingFilePath):
        logger.info("useCache %r using %r and %r and %r", useCache, chemblTargetPath, chemblMappingPath, mappingFilePath)
        mapD = mU.doImport(mappingFilePath, fmt="json")
    else:
        # Get the ChEMBL UniProt mapping file
        url = os.path.join(chemblDbUrl, mappingFileName)
        ok = fU.get(url, chemblMappingPath)
        logger.info("Fetched %r url %s path %s", ok, url, chemblMappingPath)
        logger.info("Reading ChEMBL mapping file path %s", chemblMappingPath)
        rowL = mU.doImport(chemblMappingPath, fmt="tdd", rowFormat="list")
        for row in rowL:
            mapD[row[0]] = (row[1], row[2], row[3])
        ok = mU.doExport(mappingFilePath, mapD, fmt="json")
        logger.info("Processed mapping path %s (%d) %r", mappingFilePath, len(mapD), ok)
        #
        # Get the target FASTA files --
        for vers in range(baseVersion, baseVersion + 10):
            logger.info("Now fetching version %r", vers)
            self.__version = vers
            targetFileName = "chembl_" + str(vers) + ".fa.gz"
            chemblTargetPath = os.path.join(dirPath, "chembl_targets_raw.fa.gz")
            url = os.path.join(chemblDbUrl, targetFileName)
            ok = fU.get(url, chemblTargetPath)
            logger.info("Fetched %r url %s path %s", ok, url, chemblTargetPath)
            if ok:
                break
    #
    logger.info("Completed reload at %s (%.4f seconds)", time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime)
    #
    return mapD
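# The FASTA loop above probes successive ChEMBL release numbers until one fetch
# succeeds. A generic sketch of that probe-until-found idiom; the helper name is
# illustrative and fetchFunc stands in for a boolean-returning fetch such as
# FileUtil.get().
def probeLatestVersion(baseVersion, urlTemplate, fetchFunc, maxTries=10):
    # e.g. urlTemplate = "ftp://.../chembl_%d.fa.gz"
    for vers in range(baseVersion, baseVersion + maxTries):
        if fetchFunc(urlTemplate % vers):
            return vers
    return None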
def __processAppendedSections(self, appendConfigOption, cachePath, useCache=True):
    """Fetch and append configuration assets assigned to the input configuration option.

    Args:
        appendConfigOption (str): reserved configuration option to hold a list of configuration asset locators
        cachePath (str): path to store cached copies of configuration assets
        useCache (bool, optional): use existing cached configuration assets. Defaults to True.

    Returns:
        bool: True for success or False otherwise
    """
    try:
        ret = True
        appendLocL = self.getList(appendConfigOption, sectionName=self.__defaultSectionName)
        logger.debug("appendLocL is %r", appendLocL)
        if appendLocL:
            cP = os.path.join(cachePath, "config")
            fU = FileUtil(workPath=cP)
            logger.debug("Fetching append sections from %r", appendLocL)
            for appendLoc in appendLocL:
                fn = fU.getFileName(appendLoc)
                fp = os.path.join(cP, fn)
                okF = True
                if not (useCache and fU.exists(fp)):
                    # get a fresh copy from source
                    okF = fU.get(appendLoc, fp)
                    logger.debug("Fetched %r to %r", appendLoc, fp)
                ok = self.appendConfig(fp)
                ret = ret and ok and okF
    except Exception as e:
        logger.exception("Failing for option %r cachePath %r with %s", appendConfigOption, cachePath, str(e))
        ret = False
    #
    if not ret:
        logger.error("Fetching appended sections failing %r", appendLocL)
    return ret
def __reload(self, urlTarget, urlFallbackTarget, dirPath, useCache=True):
    invD = {}
    fU = FileUtil()
    fn = fU.getFileName(urlTarget)
    fp = os.path.join(dirPath, fn)
    self.__mU.mkdir(dirPath)
    #
    if useCache and self.__mU.exists(fp):
        invD = self.__mU.doImport(fp, fmt="json")
        logger.debug("Reading cached inventory (%d)", len(invD))
    else:
        logger.info("Fetch inventory from %s", urlTarget)
        ok = fU.get(urlTarget, fp)
        if not ok:
            ok = fU.get(urlFallbackTarget, fp)
        #
        if ok:
            invD = self.__mU.doImport(fp, fmt="json")
    #
    return invD
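# The primary-then-fallback fetch above also appears in the Pfam and InterPro
# cache builders below. A compact sketch of the shared idiom; fetch is any
# callable returning True on success (e.g., FileUtil().get), and the helper
# name is illustrative.
def getWithFallback(fetch, urlTarget, urlFallbackTarget, fp):
    ok = fetch(urlTarget, fp)
    if not ok:
        ok = fetch(urlFallbackTarget, fp)
    return ok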
def setUp(self):
    self.__verbose = True
    self.__pathPdbxDictionaryFile = os.path.join(TOPDIR, "rcsb", "mock-data", "dictionaries", "mmcif_pdbx_v5_next.dic")
    self.__pathTaxonomyFile = os.path.join(TOPDIR, "rcsb", "mock-data", "NCBI", "names.dmp.gz")
    self.__zipFileUrl = "https://inventory.data.gov/dataset/794cd3d7-4d28-4408-8f7d-84b820dbf7f2/resource/6b78ec0c-4980-4ad8-9cbd-2d6eb9eda8e7/download/myfoodapediadata.zip"
    #
    self.__ftpFileUrl = "ftp://ftp.wwpdb.org/pub/pdb/data/component-models/complete/chem_comp_model.cif.gz"
    #
    self.__workPath = os.path.join(HERE, "test-output")
    self.__inpDirPath = os.path.join(HERE, "test-data")
    self.__fileU = FileUtil()
    self.__startTime = time.time()
    logger.debug("Running tests on version %s", __version__)
    logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
def __init__(self, cfgOb, cachePath, useCache=True, **kwargs):
    """Utilities to access and update provenance details.

    Args:
        cfgOb (object): ConfigInfo() instance
        cachePath (str): path to directory containing schema
        useCache (bool, optional): use cached schema. Defaults to True.
    """
    self.__cfgOb = cfgOb
    self.__configName = self.__cfgOb.getDefaultSectionName()
    self.__cachePath = cachePath
    self.__useCache = useCache
    #
    self.__workPath = os.path.join(self.__cachePath, "work")
    self.__provenanceCachePath = os.path.join(self.__cachePath, self.__cfgOb.get("PROVENANCE_INFO_CACHE_DIR", sectionName=self.__configName))
    self.__provenanceLocator = self.__cfgOb.getPath("PROVENANCE_INFO_LOCATOR", sectionName=self.__configName)
    #
    self.__fileU = FileUtil(workPath=self.__workPath)
    self.__fileU.mkdir(self.__provenanceCachePath)
    self.__kwargs = kwargs
def stashDependencies(self, url, dirPath, bundleLabel="A", userName=None, pw=None):
    """Store a copy of the bundled search dependencies remotely.

    Args:
        url (str): URL string for the destination host (e.g. sftp://myserver.net or None for a local file)
        dirPath (str): directory path on the remote resource
        bundleLabel (str, optional): optional label prepended to the stashed dependency bundle artifact (default='A')
        userName (str, optional): optional access information. Defaults to None.
        pw (str, optional): optional access information. Defaults to None.

    Returns:
        bool: True for success or False otherwise
    """
    try:
        ok = False
        fn = self.__makeBundleFileName(self.__dependFileName, bundleLabel=bundleLabel)
        if url and url.startswith("sftp://"):
            sftpU = SftpUtil()
            hostName = url[7:]
            ok = sftpU.connect(hostName, userName, pw=pw, port=22)
            if ok:
                remotePath = os.path.join("/", dirPath, fn)
                ok = sftpU.put(self.__dependTarFilePath, remotePath)
        elif not url:
            fileU = FileUtil()
            remotePath = os.path.join(dirPath, fn)
            ok = fileU.put(self.__dependTarFilePath, remotePath)
        else:
            logger.error("Unsupported stash protocol %r", url)
        return ok
    except Exception as e:
        logger.exception("For %r %r failing with %s", url, dirPath, str(e))
    return False
def __reload(self, urlTarget, dirPath, useCache=True):
    """Reload local cache of mapping resources to support validation report reader and translator.

    Args:
        urlTarget (list, str): URL for schema mapping file
        dirPath (str): path to the directory containing cache files
        useCache (bool, optional): flag to use cached files. Defaults to True.

    Returns:
        (dict): schema mapping dictionary
    """
    mapD = {}
    #
    mU = MarshalUtil()
    fU = FileUtil()
    fn = fU.getFileName(urlTarget)
    mappingFilePath = os.path.join(dirPath, fn)
    mU.mkdir(dirPath)
    #
    # if not useCache:
    #     for fp in [mappingFilePath]:
    #         try:
    #             os.remove(fp)
    #         except Exception:
    #             pass
    #
    logger.debug("Loading validation mapping data in %s (useCache %r)", fn, useCache)
    if useCache and fU.exists(mappingFilePath):
        mapD = mU.doImport(mappingFilePath, fmt="json")
    else:
        logger.info("Fetching url %s to resource file %s", urlTarget, mappingFilePath)
        tS = uuid.uuid4().hex
        tP = os.path.join(dirPath, "._" + tS)
        ok = fU.get(urlTarget, tP)
        if ok:
            mapD = mU.doImport(tP, fmt="json")
            os.replace(tP, mappingFilePath)
    return mapD
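# The temp-file-plus-os.replace() step above is what makes the cache update
# atomic: os.replace() is an atomic rename on POSIX (and within a volume on
# Windows), so readers never observe a partially written mapping file. A minimal
# sketch of the same idiom; the helper name is illustrative.
import os
import uuid


def atomicWrite(dirPath, fileName, data):
    # write to a uniquely named temporary file in the same directory ...
    tP = os.path.join(dirPath, "._" + uuid.uuid4().hex)
    with open(tP, "wb") as ofh:
        ofh.write(data)
    # ... then rename it into place in a single atomic step
    os.replace(tP, os.path.join(dirPath, fileName))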
def __rebuildCache(self, urlTargetPfam, urlTargetPfamFB, dirPath, useCache):
    pfamD = {}
    fmt = "json"
    ext = fmt if fmt == "json" else "pic"
    pfamDataPath = os.path.join(dirPath, "pfam-data.%s" % ext)
    #
    logger.debug("Using cache data path %s", dirPath)
    self.__mU.mkdir(dirPath)
    #
    if useCache and self.__mU.exists(pfamDataPath):
        pfamD = self.__mU.doImport(pfamDataPath, fmt=fmt)
        logger.debug("Pfam data length %d", len(pfamD))
    elif not useCache:
        # ------
        fU = FileUtil()
        logger.info("Fetch data from source %s in %s", urlTargetPfam, dirPath)
        fp = os.path.join(dirPath, fU.getFileName(urlTargetPfam))
        ok = fU.get(urlTargetPfam, fp)
        if not ok:
            fp = os.path.join(dirPath, fU.getFileName(urlTargetPfamFB))
            ok = fU.get(urlTargetPfamFB, fp)
            logger.info("Fetch data fallback fetch status is %r", ok)
        pfamD = self.__getPfamIndex(fp)
        ok = self.__mU.doExport(pfamDataPath, pfamD, fmt=fmt)
        logger.info("Caching %d in %s status %r", len(pfamD), pfamDataPath, ok)
        # ------
    #
    return pfamD
def __reload(self, dirPath, **kwargs):
    startTime = time.time()
    fD = {}
    useCache = kwargs.get("useCache", True)
    ok = False
    cofactorPath = self.__getCofactorDataPath()
    #
    logger.info("useCache %r cofactorPath %r", useCache, cofactorPath)
    if useCache and self.__mU.exists(cofactorPath):
        fD = self.__mU.doImport(cofactorPath, fmt="json")
        ok = True
    else:
        fU = FileUtil()
        fU.mkdir(dirPath)
    # ---
    numCofactors = len(fD["cofactors"]) if fD and "cofactors" in fD else 0
    logger.info(
        "Completed reload of (%d) cofactors with status (%r) at %s (%.4f seconds)",
        numCofactors,
        ok,
        time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
        time.time() - startTime,
    )
    return fD
def __rebuildCache(self, urlTargetInterPro, urlTargetInterProFB, urlTargetInterProParent, urlTargetInterProParentFB, dirPath, useCache):
    fmt = "json"
    ext = fmt if fmt == "json" else "pic"
    interProDataPath = os.path.join(dirPath, "interPro-data.%s" % ext)
    #
    logger.debug("Using cache data path %s", dirPath)
    self.__mU.mkdir(dirPath)
    #
    if useCache and self.__mU.exists(interProDataPath):
        rD = self.__mU.doImport(interProDataPath, fmt=fmt)
        interProD = rD["index"]
        interProParentD = rD["parents"]
        logger.debug("InterPro index length %d parent length %d", len(interProD), len(interProParentD))
    else:
        # ------
        fU = FileUtil()
        logger.info("Fetch data from source %s in %s", urlTargetInterPro, dirPath)
        fp = os.path.join(dirPath, fU.getFileName(urlTargetInterPro))
        ok = fU.get(urlTargetInterPro, fp)
        if not ok:
            fp = os.path.join(dirPath, fU.getFileName(urlTargetInterProFB))
            ok = fU.get(urlTargetInterProFB, fp)
            logger.info("Fetch data fallback fetch status is %r", ok)
        interProD = self.__getInterProIndex(fp)
        logger.info("Caching %d in %s status %r", len(interProD), interProDataPath, ok)
        # ------
        logger.info("Fetch data from source %s in %s", urlTargetInterProParent, dirPath)
        fp = os.path.join(dirPath, fU.getFileName(urlTargetInterProParent))
        ok = fU.get(urlTargetInterProParent, fp)
        if not ok:
            fp = os.path.join(dirPath, fU.getFileName(urlTargetInterProParentFB))
            ok = fU.get(urlTargetInterProParentFB, fp)
            logger.info("Fetch data fallback fetch status is %r", ok)
        interProParentD = self.__getInterProParents(fp)
        #
        ok = self.__mU.doExport(interProDataPath, {"index": interProD, "parents": interProParentD}, fmt=fmt)
    #
    return interProD, interProParentD
def fetchBundle(self, localRestoreDirPath, url, remoteDirPath, remoteStashPrefix="A", userName=None, password=None):
    """Restore bundled dependencies from remote storage and unbundle these in the current local cache directory.

    Args:
        localRestoreDirPath (str): local restore path
        url (str): remote URL
        remoteDirPath (str): remote directory path on the remote resource
        remoteStashPrefix (str, optional): optional label prepended to the stashed dependency bundle artifact (default='A')
        userName (str, optional): optional access information. Defaults to None.
        password (str, optional): optional access information. Defaults to None.

    Returns:
        bool: True for success or False otherwise
    """
    try:
        ok = False
        fileU = FileUtil()
        fn = self.__makeBundleFileName(self.__baseBundleFileName, remoteStashPrefix=remoteStashPrefix)
        if not url:
            remotePath = os.path.join(remoteDirPath, fn)
            ok = fileU.get(remotePath, self.__localStashTarFilePath)
        elif url and (url.startswith("http://") or url.startswith("https://")):
            remotePath = url + os.path.join("/", remoteDirPath, fn)
            ok = fileU.get(remotePath, self.__localStashTarFilePath)
        elif url and url.startswith("sftp://"):
            sftpU = SftpUtil()
            ok = sftpU.connect(url[7:], userName, pw=password, port=22)
            if ok:
                remotePath = os.path.join(remoteDirPath, fn)
                ok = sftpU.get(remotePath, self.__localStashTarFilePath)
        else:
            logger.error("Unsupported protocol %r", url)
        if ok:
            ok = fileU.unbundleTarfile(self.__localStashTarFilePath, dirPath=localRestoreDirPath)
        return ok
    except Exception as e:
        logger.exception("For %r %r failing with %s", url, remoteDirPath, str(e))
        ok = False
    return ok
def __init__(self, cfgOb, cachePath, useCache=True, rebuildFlag=False, **kwargs):
    """A collection of schema build and caching methods.

    Args:
        cfgOb (object): ConfigInfo() instance
        cachePath (str): path to directory containing schema
        useCache (bool, optional): use cached schema. Defaults to True.
        rebuildFlag (bool, optional): on-the-fly rebuild and cache schema
    """
    self.__cfgOb = cfgOb
    self.__configName = self.__cfgOb.getDefaultSectionName()
    self.__cachePath = os.path.abspath(cachePath)
    self.__useCache = useCache
    self.__rebuildFlag = rebuildFlag
    self.__useCache = rebuildFlag if rebuildFlag else useCache
    #
    self.__workPath = os.path.join(self.__cachePath, "work")
    self.__fileU = FileUtil(workPath=self.__workPath)
    self.__schemaCachePath = os.path.join(self.__cachePath, self.__cfgOb.get("SCHEMA_DEFINITION_CACHE_DIR", sectionName=self.__configName))
    self.__jsonSchemaCachePath = os.path.join(self.__cachePath, self.__cfgOb.get("JSON_SCHEMA_DEFINITION_CACHE_DIR", sectionName=self.__configName))
    self.__fileU.mkdir(self.__schemaCachePath)
    self.__fileU.mkdir(self.__jsonSchemaCachePath)
    self.__kwargs = kwargs
def __reload(self, dirPath, useCache):
    startTime = time.time()
    aD = {}
    allIdD = {}
    fU = FileUtil()
    fU.mkdir(dirPath)
    targetActivityFilePath = self.getTargetActivityDataPath()
    #
    if useCache and fU.exists(targetActivityFilePath):
        logger.info("useCache %r using %r", useCache, targetActivityFilePath)
        qD = self.__mU.doImport(targetActivityFilePath, fmt="json")
        aD = qD["activity"] if "activity" in qD else {}
        idL = qD["all_ids"] if "all_ids" in qD else []
        allIdD = {k: k in aD for k in idL}
    #
    logger.info(
        "Completed reload (%d activities) (%d tried identifiers) at %s (%.4f seconds)",
        len(aD),
        len(allIdD),
        time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
        time.time() - startTime,
    )
    #
    return aD, allIdD
def buildDependenices(self, ccUrlTarget, birdUrlTarget, **kwargs):
    """Convenience method to build configuration and static dependencies for the chemical search services.

    Args:
        ccUrlTarget (str): path to the source concatenated chemical component definition file
        birdUrlTarget (str): path to the source concatenated BIRD definition file

    Other options are propagated to the configurations of the wrapped classes in __bootstrapConfig().
    """
    try:
        okT = False
        ok1 = self.setConfig(ccUrlTarget=ccUrlTarget, birdUrlTarget=birdUrlTarget, **kwargs)
        useCache = kwargs.get("useCache", False)
        ok2 = self.updateChemCompIndex(useCache=useCache)
        ok3 = self.updateSearchIndex(useCache=useCache)
        ok4 = self.updateSearchMoleculeProvider(useCache=useCache)
        okBuild = ok1 and ok2 and ok3 and ok4
        if okBuild:
            fileU = FileUtil()
            dirPathList = [os.path.join(self.__cachePath, subDir) for subDir in ["chem_comp", "oe_mol", "config"]]
            okT = fileU.bundleTarfile(self.__dependTarFilePath, dirPathList, mode="w:gz", recursive=True)
        #
        return okT and okBuild
    except Exception as e:
        logger.exception("Failing build with %r and %r with %s", ccUrlTarget, birdUrlTarget, str(e))
    return False
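# buildDependenices() finishes by packing the cache subdirectories into one
# gzipped tar artifact. A standard-library sketch of the round trip that
# FileUtil.bundleTarfile()/unbundleTarfile() presumably perform; the helper
# names and paths are placeholders.
import tarfile


def bundle(tarPath, dirPathList):
    with tarfile.open(tarPath, "w:gz") as tF:
        for dirPath in dirPathList:
            tF.add(dirPath, recursive=True)


def unbundle(tarPath, dirPath):
    with tarfile.open(tarPath, "r:gz") as tF:
        tF.extractall(path=dirPath)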
def __fetchFromSource(self, urlTarget):
    """Fetch the classification names and domain assignments from the ECOD repo."""
    fU = FileUtil()
    fn = fU.getFileName(urlTarget)
    fp = os.path.join(self.__dirPath, fn)
    if not fU.exists(fp):
        fU.get(urlTarget, fp)
    #
    with open(fp, "r", encoding="utf-8") as ifh:
        line = ifh.readline()
        line = ifh.readline()
        line = ifh.readline()
        ff = line[:-1].split()
        self.__version = ff[-1]
    #
    nmL = self.__mU.doImport(fp, fmt="list", uncomment=True)
    fU.remove(fp)
    #
    return nmL
def pushBundle(self, gitRepositoryPath, accessToken, gitHost="github.com", gitBranch="master", remoteStashPrefix="A", maxSizeMB=95):
    """Push bundle to remote stash git repository.

    Args:
        gitRepositoryPath (str): git repository path (e.g., rcsb/py-rcsb_exdb_assets_stash)
        accessToken (str): git repository access token
        gitHost (str, optional): git repository host name. Defaults to github.com.
        gitBranch (str, optional): git branch name. Defaults to master.
        remoteStashPrefix (str, optional): optional label prepended to the stashed dependency bundle artifact (default='A')
        maxSizeMB (int, optional): maximum stash bundle file size that will be committed. Defaults to 95MB.

    Returns:
        bool: True for success or False otherwise
    """
    try:
        ok = False
        gU = GitUtil(token=accessToken, repositoryHost=gitHost)
        fU = FileUtil()
        localRepositoryPath = os.path.join(self.__localBundlePath, "stash_repository")
        fn = self.__makeBundleFileName(self.__baseBundleFileName, remoteStashPrefix=remoteStashPrefix)
        #
        # Update existing local repository, otherwise clone a new copy
        if fU.exists(localRepositoryPath):
            ok = gU.pull(localRepositoryPath, branch=gitBranch)
            logger.debug("After pull status %r", gU.status(localRepositoryPath))
        else:
            ok = gU.clone(gitRepositoryPath, localRepositoryPath, branch=gitBranch)
        #
        # Split all bundles
        mbSize = float(fU.size(self.__localStashTarFilePath)) / 1000000.0
        logger.info("Splitting bundle %r (%.3f MB/Max %d MB)", fn, mbSize, maxSizeMB)
        sj = SplitJoin()
        splitDirPath = os.path.join(localRepositoryPath, "stash", fn[:-7])
        sj.split(self.__localStashTarFilePath, splitDirPath, maxSizeMB=maxSizeMB)
        fU.remove(self.__localStashTarFilePath)
        # else:
        #     fU.put(self.__localStashTarFilePath, os.path.join(localRepositoryPath, "stash", fn))
        ok = gU.addAll(localRepositoryPath, branch=gitBranch)
        ok = gU.commit(localRepositoryPath, branch=gitBranch)
        logger.debug("After commit status %r", gU.status(localRepositoryPath))
        #
        if accessToken:
            ok = gU.push(localRepositoryPath, branch=gitBranch)
            logger.info("After push status %r", gU.status(localRepositoryPath))
        #
        return ok
    except Exception as e:
        logger.exception("For %r %r failing with %s", gitHost, gitRepositoryPath, str(e))
    return False
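# The bundle is split before committing because git hosts reject very large
# files (GitHub's hard limit is 100 MB, hence the 95 MB default above). A hedged
# sketch of a fixed-size split of the kind SplitJoin presumably performs; the
# helper name and part naming are illustrative.
import os


def splitFile(srcPath, dstDirPath, maxSizeMB=95):
    os.makedirs(dstDirPath, exist_ok=True)
    chunkSize = maxSizeMB * 1000000
    with open(srcPath, "rb") as ifh:
        # read fixed-size chunks until EOF and write each as a numbered part
        for ii, chunk in enumerate(iter(lambda: ifh.read(chunkSize), b""), start=1):
            with open(os.path.join(dstDirPath, "part_%d" % ii), "wb") as ofh:
                ofh.write(chunk)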
def setUp(self):
    self.__workPath = os.path.join(HERE, "test-output")
    #
    self.__testLogFileMin = os.path.join(self.__workPath, "logfile-min.json")
    self.__testLogFileDetailed = os.path.join(self.__workPath, "logfile-detailed.json")
    fU = FileUtil()
    fU.remove(self.__testLogFileMin)
    fU.remove(self.__testLogFileDetailed)
    self.__startTime = time.time()
    logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
def testAABuildDependenciesAndStash(self):
    """Test case - build, stash and restore dependencies."""
    try:
        ccsw = ChemCompSearchWrapper()
        ccUrlTarget = os.path.join(self.__dataPath, "components-abbrev.cif") if not self.__testFlagFull else None
        birdUrlTarget = os.path.join(self.__dataPath, "prdcc-abbrev.cif") if not self.__testFlagFull else None
        ok = ccsw.buildDependenices(ccUrlTarget=ccUrlTarget, birdUrlTarget=birdUrlTarget)
        self.assertTrue(ok)
        #
        if self.__testStash:
            url = "sftp://bl-east.rcsb.org"
            userName = ""
            pw = ""
            dirPath = "4-coastal"
            ok = ccsw.stashDependencies(url, dirPath, userName=userName, pw=pw)
            self.assertTrue(ok)
            #
            fileU = FileUtil()
            fileU.remove(self.__cachePath)
            #
            url = "http://bl-east.rcsb.org"
            ok = ccsw.restoreDependencies(url, dirPath)
            #
            fileU.remove(self.__cachePath)
            #
            url = "sftp://bl-east.rcsb.org"
            ok = ccsw.restoreDependencies(url, dirPath, userName=userName, pw=pw)
            self.assertTrue(ok)
    except Exception as e:
        logger.exception("Failing with %s", str(e))
        self.fail()
def __reload(self, urlTarget, dirPath, useCache=True):
    """Reload the input GO OBO ontology file and return an nx graph object.

    Returns:
        dictionary[goId] = {'name_list': ..., 'id_list': ..., 'depth_list': ...}
    """
    goGraph = None
    #
    mU = MarshalUtil()
    fU = FileUtil()
    fn = fU.getFileName(urlTarget)
    oboFilePath = os.path.join(dirPath, fn)
    fU.mkdir(dirPath)
    #
    if not useCache:
        for fp in [oboFilePath]:
            try:
                os.remove(fp)
            except Exception:
                pass
    #
    if useCache and fU.exists(oboFilePath):
        goGraph = obonet.read_obo(oboFilePath)
    else:
        logger.info("Fetching url %s to resource file %s", urlTarget, oboFilePath)
        ok = fU.get(urlTarget, oboFilePath)
        if ok:
            goGraph = obonet.read_obo(oboFilePath)
    if goGraph:
        logger.info("Reading %d nodes and %d edges", len(goGraph), goGraph.number_of_edges())
    else:
        logger.info("Go graph construction failing")
    #
    return goGraph
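# Brief usage sketch for the graph returned above. obonet builds a networkx
# graph whose edges point from child term to parent, so nx.descendants() yields
# a term's ontology ancestors. The GO id below (myelination) is illustrative.
import networkx as nx
import obonet

goGraph = obonet.read_obo("http://purl.obolibrary.org/obo/go/go-basic.obo")
name = goGraph.nodes["GO:0042552"]["name"]         # term name lookup
ancestors = nx.descendants(goGraph, "GO:0042552")  # all parent terms up to the root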
def get(self, remotePath, localPath):
    """Get a file from a remote FTP server.

    Arguments:
        remotePath (str): remote file path
        localPath (str): local file path

    Returns:
        bool: True for success or False otherwise
    """
    try:
        fileU = FileUtil()
        fileU.mkdirForFile(localPath)
        # If the provided localPath already exists and is a directory, retrieve the file using the name on the remote server
        # to avoid unintentionally overwriting an entire local directory with a single retrieved file
        if os.path.exists(localPath) and os.path.isdir(localPath):
            remoteFileName = FileUtil().getFileName(remotePath)
            localFilePath = os.path.join(localPath, remoteFileName)
        else:
            localFilePath = localPath
        with open(localFilePath, "wb") as lFP:
            self.__ftpClient.retrbinary("RETR %s" % remotePath, lFP.write)
        ok = fileU.exists(localFilePath)
        if ok:
            return True
        else:
            logger.error("get failing for remotePath %s localFilePath %s", remotePath, localFilePath)
            return False
    except Exception as e:
        if self.__raiseExceptions:
            raise e
        else:
            logger.error("get failing for remotePath %s localPath %s with %s", remotePath, localPath, str(e))
            return False
class SchemaProvider(SingletonClass):
    """A collection of schema build and caching methods.

    Static cache workflow:

        <authoritative source> <-- <cache dir> <- client API

    Compute workflow:

        <dependent resource files, config file, dictionaries> -> [schema builder] --> <schema def> --> <JSON schema>
    """

    def __init__(self, cfgOb, cachePath, useCache=True, rebuildFlag=False, **kwargs):
        """A collection of schema build and caching methods.

        Args:
            cfgOb (object): ConfigInfo() instance
            cachePath (str): path to directory containing schema
            useCache (bool, optional): use cached schema. Defaults to True.
            rebuildFlag (bool, optional): on-the-fly rebuild and cache schema
        """
        self.__cfgOb = cfgOb
        self.__configName = self.__cfgOb.getDefaultSectionName()
        self.__cachePath = os.path.abspath(cachePath)
        self.__useCache = useCache
        self.__rebuildFlag = rebuildFlag
        self.__useCache = rebuildFlag if rebuildFlag else useCache
        #
        self.__workPath = os.path.join(self.__cachePath, "work")
        self.__fileU = FileUtil(workPath=os.path.join(self.__cachePath, "work"))
        self.__schemaCachePath = os.path.join(self.__cachePath, self.__cfgOb.get("SCHEMA_DEFINITION_CACHE_DIR", sectionName=self.__configName))
        self.__jsonSchemaCachePath = os.path.join(self.__cachePath, self.__cfgOb.get("JSON_SCHEMA_DEFINITION_CACHE_DIR", sectionName=self.__configName))
        self.__fileU.mkdir(self.__schemaCachePath)
        self.__fileU.mkdir(self.__jsonSchemaCachePath)
        self.__kwargs = kwargs

    def getSchemaOptions(self, schemaLevel, extraOpts=None):
        opts = extraOpts + "|" if extraOpts else ""
        if schemaLevel == "full":
            return opts + "mandatoryKeys|mandatoryAttributes|bounds|enums|rcsb"
        elif schemaLevel in ["min", "minimum"]:
            return opts + "mandatoryKeys|enums|rcsb"
        else:
            return opts

    def getSchemaInfo(self, databaseName, dataTyping="ANY"):
        """Convenience method to return essential schema details for the input repository content type.

        Args:
            databaseName (str): schema name (e.g. pdbx, bird, chem_comp, ...)
            dataTyping (str, optional): application name for the target schema (e.g. ANY, SQL, ...)

        Returns:
            tuple: SchemaDefAccess(object), target database name, target collection name list, primary index attribute list
        """
        sd = None
        dbName = None
        collectionNameList = []
        docIndexD = {}
        try:
            mU = MarshalUtil(workPath=self.__workPath)
            schemaLocator = self.__getSchemaDefLocator(databaseName, dataTyping=dataTyping)
            if self.__rebuildFlag:
                filePath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaLocator))
                self.makeSchemaDef(databaseName, dataTyping=dataTyping, saveSchema=True)
            else:
                filePath = self.__reload(schemaLocator, self.__schemaCachePath, useCache=self.__useCache)
            if not filePath:
                logger.error("Unable to recover schema %s (%s)", databaseName, dataTyping)
            logger.debug("ContentType %r dataTyping %r schemaLocator %r", databaseName, dataTyping, schemaLocator)
            schemaDef = mU.doImport(filePath, fmt="json")
            if schemaDef:
                logger.debug("Using cached schema definition for %s application %s", databaseName, dataTyping)
                sd = SchemaDefAccess(schemaDef)
                if sd:
                    dbName = sd.getDatabaseName()
                    collectionInfoList = sd.getCollectionInfo()
                    logger.debug("Schema %s database name %s collections %r", databaseName, dbName, collectionInfoList)
                    for cd in collectionInfoList:
                        collectionName = cd["NAME"]
                        collectionNameList.append(collectionName)
                        docIndexD[collectionName] = sd.getDocumentIndices(collectionName)
        except Exception as e:
            logger.exception("Retrieving schema %s for %s failing with %s", databaseName, dataTyping, str(e))
        return sd, dbName, collectionNameList, docIndexD

    def schemaDefCompare(self, databaseName, dataTyping="ANY"):
        """Compare computed schema definition with current source/cached version.

        Args:
            databaseName (str): schema definition name for comparison
            dataTyping (str, optional): data type conventions for the schema comparison. Defaults to "ANY".

        Returns:
            (str): file path for schema difference or None
        """
        mU = MarshalUtil(workPath=self.__workPath)
        schemaDiffPath = os.path.join(self.__cachePath, "schema_diff")
        mU.mkdir(schemaDiffPath)
        schemaPath = self.__getSchemaDefLocator(databaseName, dataTyping=dataTyping)
        fn = self.__fileU.getFileName(schemaPath)
        sD = self.makeSchemaDef(databaseName, dataTyping=dataTyping)
        v2 = sD["DATABASE_VERSION"]
        # ----
        # tPath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaPath) + "-test")
        # logger.info("Exporting schema def to %s", tPath)
        # mU.doExport(tPath, sD, fmt="json", indent=3)
        # sD = mU.doImport(tPath, fmt="json")
        # ----
        cPath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaPath))
        sDCache = mU.doImport(cPath, fmt="json")
        v1 = sDCache["DATABASE_VERSION"]
        #
        numDiff, difD = self.schemaCompare(sDCache, sD)
        #
        # jD = diff(sDCache, sD, syntax="explicit", marshal=True)
        diffPath = None
        if numDiff:
            bn, _ = os.path.splitext(fn)
            diffPath = os.path.join(schemaDiffPath, bn + "-" + v1 + "-" + v2 + "-diff.json")
            # logger.info("diff for %s %s = \n%s", databaseName, dataTyping, pprint.pformat(difD, indent=3, width=100))
            mU.doExport(diffPath, difD, fmt="json", indent=3)
        #
        return diffPath

    def jsonSchemaCompare(self, databaseName, collectionName, encodingType, level, extraOpts=None):
        """Compare computed JSON schema definition with current source/cached version.

        Args:
            databaseName (str): schema name
            collectionName (str): collection name
            encodingType (str): schema data type conventions (JSON|BSON)
            level (str): metadata level (min|full)
            extraOpts (str): extra schema construction options

        Returns:
            (str): path to the difference file or None
        """
        mU = MarshalUtil(workPath=self.__workPath)
        schemaDiffPath = os.path.join(self.__cachePath, "schema_diff")
        mU.mkdir(schemaDiffPath)
        schemaLocator = self.__getJsonSchemaLocator(databaseName, collectionName, encodingType, level)
        fn = self.__fileU.getFileName(schemaLocator)
        schemaPath = os.path.join(self.__jsonSchemaCachePath, fn)
        #
        sD = self.makeSchema(databaseName, collectionName, encodingType=encodingType, level=level, saveSchema=False, extraOpts=extraOpts)
        v2 = self.__getSchemaVersion(sD)
        # ----
        # tPath = os.path.join(self.__jsonSchemaCachePath, self.__fileU.getFileName(schemaPath) + "-test")
        # logger.info("Exporting json schema to %s", tPath)
        # mU.doExport(tPath, sD, fmt="json", indent=3)
        # ----
        #
        sDCache = mU.doImport(schemaPath, fmt="json")
        v1 = self.__getSchemaVersion(sDCache)
        if not v1:
            logger.error("no version for %s - %s %s", schemaLocator, databaseName, collectionName)
        #
        numDiff, difD = self.schemaCompare(sDCache, sD)
        # jD = diff(sDCache, sD, marshal=True, syntax="explicit")
        diffPath = None
        if numDiff:
            logger.debug("diff for %s %s %s %s = \n%s", databaseName, collectionName, encodingType, level, pprint.pformat(difD, indent=3, width=100))
            bn, _ = os.path.splitext(fn)
            diffPath = os.path.join(schemaDiffPath, bn + "-" + v1 + "-" + v2 + "-diff.json")
            mU.doExport(diffPath, difD, fmt="json", indent=3)
        return diffPath

    def __getSchemaVersion(self, jsonSchema):
        try:
            comment = jsonSchema["$comment"] if "$comment" in jsonSchema else ""
            ff = comment.split(":")
            version = ff[1].strip()
            return version
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ""

    def __getSchemaDefLocator(self, databaseName, dataTyping="ANY"):
        """Internal method returning schema definition path for the input content type and application.
        Defines the schema definition naming convention.

        Args:
            databaseName (str): schema name (e.g. pdbx, bird, chem_comp, ...)
            dataTyping (str, optional): application name for the target schema (e.g. ANY, SQL, ...)

        Returns:
            str: schema definition file locator
        """
        schemaLocator = None
        try:
            locPath = self.__cfgOb.get("SCHEMA_DEFINITION_LOCATOR_PATH", sectionName=self.__configName)
            fn = "schema_def-%s-%s.json" % (databaseName, dataTyping.upper())
            schemaLocator = os.path.join(locPath, fn)
        except Exception as e:
            logger.exception("Retrieving schema definition path %s for %s failing with %s", databaseName, dataTyping, str(e))
        return schemaLocator

    def __getJsonSchemaLocator(self, databaseName, collectionName, encodingType="BSON", level="full"):
        """Internal method returning the JSON schema path for the input collection data type convention and level.
        Defines the JSON/BSON schema naming convention.

        Args:
            databaseName (str): database name in the document store
            collectionName (str): collection name in document store
            encodingType (str, optional): data type convention (BSON|JSON)
            level (str, optional): completeness of the schema (e.g. min or full)

        Returns:
            str: schema file locator
        """
        schemaLocator = None
        try:
            sdType = None
            sLevel = None
            schemaLocator = None
            if encodingType.upper() in ["JSON", "BSON"]:
                sdType = encodingType.lower()
            if level.lower() in ["min", "minimum"]:
                sLevel = "min"
            elif level.lower() in ["full"]:
                sLevel = level.lower()
            #
            if sdType and sLevel:
                locPath = self.__cfgOb.get("JSON_SCHEMA_DEFINITION_LOCATOR_PATH", sectionName=self.__configName)
                fn = "%s-%s-db-%s-col-%s.json" % (sdType, sLevel, databaseName, collectionName)
                schemaLocator = os.path.join(locPath, fn)
            else:
                logger.error("Unsupported schema options: %s level %r type %r", collectionName, level, encodingType)
                schemaLocator = None
        except Exception as e:
            logger.debug("Retrieving JSON schema definition for %s type %s failing with %s", collectionName, encodingType, str(e))
        #
        return schemaLocator

    def __reload(self, locator, dirPath, useCache=True):
        #
        fn = self.__fileU.getFileName(locator)
        filePath = os.path.join(dirPath, fn)
        logger.debug("Target cache filePath %s", filePath)
        self.__fileU.mkdir(dirPath)
        if not useCache:
            try:
                os.remove(filePath)
            except Exception:
                pass
        #
        if useCache and self.__fileU.exists(filePath):
            ok = True
        else:
            logger.info("Fetch data from source %s to %s", locator, filePath)
            ok = self.__fileU.get(locator, filePath)
        return filePath if ok else None

    def getJsonSchema(self, databaseName, collectionName, encodingType="BSON", level="full", extraOpts=None):
        """Return JSON schema (w/ BSON types) object for the input collection and level.

        Args:
            databaseName (str): database name
            collectionName (str): collection name in document store
            encodingType (str, optional): data type convention (BSON|JSON)
            level (str, optional): completeness of the schema (e.g. min or full)

        Returns:
            dict: schema object
        """
        sObj = None
        schemaLocator = self.__getJsonSchemaLocator(databaseName, collectionName, encodingType=encodingType, level=level)
        #
        if self.__rebuildFlag:
            filePath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaLocator))
            self.makeSchema(databaseName, collectionName, encodingType=encodingType, level=level, extraOpts=extraOpts)
        else:
            filePath = self.__reload(schemaLocator, self.__jsonSchemaCachePath, useCache=self.__useCache)
        mU = MarshalUtil(workPath=self.__workPath)
        if filePath and mU.exists(filePath):
            sObj = mU.doImport(filePath, fmt="json")
        else:
            logger.debug("Failed to read schema for %s %r", collectionName, level)
        return sObj

    def makeSchema(self, databaseName, collectionName, encodingType="BSON", level="full", saveSchema=False, extraOpts=None):
        cD = None
        try:
            smb = SchemaDefBuild(databaseName, self.__cfgOb, cachePath=self.__cachePath)
            #
            stU = encodingType.upper()
            cD = smb.build(collectionName, dataTyping=stU, encodingType=stU, enforceOpts=self.getSchemaOptions(level, extraOpts=extraOpts))
            if cD and saveSchema:
                schemaLocator = self.__getJsonSchemaLocator(databaseName, collectionName, encodingType=encodingType, level=level)
                localPath = os.path.join(self.__jsonSchemaCachePath, self.__fileU.getFileName(schemaLocator))
                mU = MarshalUtil(workPath=self.__workPath)
                mU.doExport(localPath, cD, fmt="json", indent=3, enforceAscii=False)
        except Exception as e:
            logger.exception("Building schema %s collection %s failing with %s", databaseName, collectionName, str(e))
        return cD

    def makeSchemaDef(self, databaseName, dataTyping="ANY", saveSchema=False):
        schemaDef = None
        try:
            smb = SchemaDefBuild(databaseName, self.__cfgOb, cachePath=self.__cachePath)
            schemaDef = smb.build(dataTyping=dataTyping, encodingType="rcsb")
            if schemaDef and saveSchema:
                schemaLocator = self.__getSchemaDefLocator(databaseName, dataTyping=dataTyping)
                localPath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaLocator))
                mU = MarshalUtil(workPath=self.__workPath)
                mU.doExport(localPath, schemaDef, fmt="json", indent=3, enforceAscii=False)
        except Exception as e:
            logger.exception("Building schema %s failing with %s", databaseName, str(e))
        return schemaDef

    def schemaCompare(self, orgD, newD):
        """Compute the difference of nested dictionaries."""
        fOrgD = self.__flatten(orgD)
        fNewD = self.__flatten(newD)
        if len(fOrgD) != len(fNewD):
            logger.debug("Schema lengths differ: org %d new %d", len(fOrgD), len(fNewD))
        #
        addedD = {k: fNewD[k] for k in set(fNewD) - set(fOrgD)}
        removedD = {k: fOrgD[k] for k in set(fOrgD) - set(fNewD)}
        changedOrgD = {k: fOrgD[k] for k in set(fOrgD) & set(fNewD) if fOrgD[k] != fNewD[k]}
        changedNewD = {k: fNewD[k] for k in set(fOrgD) & set(fNewD) if fOrgD[k] != fNewD[k]}
        chD = {}
        for ky in changedOrgD:
            kyS = ".".join(ky)
            vOrg = changedOrgD[ky]
            vNew = changedNewD[ky]
            if isinstance(vOrg, (list, tuple)) and isinstance(vNew, (list, tuple)):
                # logger.info(" >> %r vOrg %r vNew %r", ky, vOrg, vNew)
                dV = list(set(vNew) - set(vOrg))
                if dV:
                    chD[kyS] = {"diff": dV}
            else:
                chD[kyS] = {"from": vOrg, "to": vNew}
        #
        nT = len(addedD) + len(removedD) + len(chD)
        diffD = {"added": [".".join(kk) for kk in addedD.keys()], "removed": [".".join(kk) for kk in removedD.keys()], "changed": chD}
        return nT, diffD

    def __flatten(self, inpDict, prefix=None):
        prefix = prefix[:] if prefix else []
        outDict = {}
        for key, value in inpDict.items():
            if isinstance(value, dict) and value:
                deeper = self.__flatten(value, prefix + [key])
                outDict.update({tuple(key2): val2 for key2, val2 in deeper.items()})
            elif isinstance(value, (list, tuple)) and value:
                for index, sublist in enumerate(value, start=1):
                    if isinstance(sublist, dict) and sublist:
                        deeper = self.__flatten(sublist, prefix + [key] + [str(index)])
                        outDict.update({tuple(key2): val2 for key2, val2 in deeper.items()})
                    else:
                        outDict[tuple(prefix + [key] + [str(index)])] = value
            else:
                outDict[tuple(prefix + [key])] = value
        return outDict

    def __flattenX(self, inpDict, prefix=None):
        prefix = prefix[:] if prefix else []
        # separator = "."
        outDict = {}
        for key, value in inpDict.items():
            if isinstance(value, dict) and value:
                deeper = self.__flatten(value, prefix + [key])
                outDict.update({tuple(key2): val2 for key2, val2 in deeper.items()})
            elif isinstance(value, list) and value:
                for index, sublist in enumerate(value, start=1):
                    if isinstance(sublist, dict) and sublist:
                        deeper = self.__flatten(sublist, prefix + [key] + [str(index)])
                        outDict.update({tuple(key2): val2 for key2, val2 in deeper.items()})
                    else:
                        outDict[tuple(prefix + [key] + [str(index)])] = value
            else:
                outDict[tuple(prefix + [key])] = value
        return outDict

    def __flattenOrg(self, inpDict, separator=".", prefix=""):
        outDict = {}
        for key, value in inpDict.items():
            if isinstance(value, dict) and value:
                deeper = self.__flattenOrg(value, separator, prefix + key + separator)
                outDict.update({key2: val2 for key2, val2 in deeper.items()})
            elif isinstance(value, list) and value:
                for index, sublist in enumerate(value, start=1):
                    if isinstance(sublist, dict) and sublist:
                        deeper = self.__flattenOrg(sublist, separator, prefix + key + separator + str(index) + separator)
                        outDict.update({key2: val2 for key2, val2 in deeper.items()})
                    else:
                        outDict[prefix + key + separator + str(index)] = value
            else:
                outDict[prefix + key] = value
        return outDict

    def __dictGen(self, indict, pre=None):
        pre = pre[:] if pre else []
        if isinstance(indict, dict):
            for key, value in indict.items():
                if isinstance(value, dict):
                    for dD in self.__dictGen(value, pre + [key]):
                        yield dD
                elif isinstance(value, (list, tuple)):
                    for v in value:
                        for dD in self.__dictGen(v, pre + [key]):
                            yield dD
                else:
                    yield pre + [key, value]
        else:
            yield indict
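# Worked example of the flatten-and-compare idea behind schemaCompare(): nested
# keys become path tuples, so set algebra over the flattened keys exposes added,
# removed, and changed entries. The simplified flattenSketch() below handles
# only nested dicts, not the list handling of __flatten(), and its name is
# illustrative.
def flattenSketch(dObj, prefix=()):
    out = {}
    for k, v in dObj.items():
        if isinstance(v, dict) and v:
            out.update(flattenSketch(v, prefix + (k,)))
        else:
            out[prefix + (k,)] = v
    return out


fOrg = flattenSketch({"a": {"x": 1, "y": 2}, "b": 3})
fNew = flattenSketch({"a": {"x": 1, "y": 5}, "c": 4})
added = set(fNew) - set(fOrg)                                        # {('c',)}
removed = set(fOrg) - set(fNew)                                      # {('b',)}
changed = {k for k in set(fOrg) & set(fNew) if fOrg[k] != fNew[k]}   # {('a', 'y')}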
class IoUtil(object): def __init__(self, **kwargs): self.__fileU = FileUtil(**kwargs) def serialize(self, filePath, myObj, fmt="pickle", **kwargs): """Public method to serialize format appropriate objects Args: filePath (str): local file path' myObj (object): format appropriate object to be serialized format (str, optional): one of ['mmcif', mmcif-dict', json', 'list', 'text-dump', pickle' (default)] **kwargs: additional keyword arguments passed to worker methods - Returns: bool: status of serialization operation; true for success or false otherwise """ ret = False fmt = str(fmt).lower() ret = self.__fileU.mkdirForFile(filePath) if not ret: return ret if fmt in ["mmcif"]: ret = self.__serializeMmCif(filePath, myObj, **kwargs) elif fmt in ["json"]: ret = self.__serializeJson(filePath, myObj, **kwargs) elif fmt in ["pickle"]: ret = self.__serializePickle(filePath, myObj, **kwargs) elif fmt in ["list"]: ret = self.__serializeList(filePath, myObj, enforceAscii=True, **kwargs) elif fmt in ["mmcif-dict"]: ret = self.__serializeMmCifDict(filePath, myObj, **kwargs) elif fmt in ["text-dump"]: ret = self.__textDump(filePath, myObj, **kwargs) elif fmt in ["fasta"]: ret = self.__serializeFasta(filePath, myObj, **kwargs) elif fmt in ["csv"]: ret = self.__serializeCsv(filePath, myObj, **kwargs) else: pass return ret def deserialize(self, filePath, fmt="pickle", **kwargs): """Public method to deserialize objects in supported formats. Args: filePath (str): local file path format (str, optional): one of ['mmcif', 'json', 'list', ..., 'pickle' (default)] **kwargs: additional keyword arguments passed to worker methods - Returns: object: deserialized object data """ fmt = str(fmt).lower() if fmt in ["mmcif"]: ret = self.__deserializeMmCif(filePath, **kwargs) # type: ignore elif fmt in ["json"]: ret = self.__deserializeJson(filePath, **kwargs) # type: ignore elif fmt in ["pickle"]: ret = self.__deserializePickle(filePath, **kwargs) # type: ignore elif fmt in ["list"]: ret = self.__deserializeList(filePath, enforceAscii=True, **kwargs) # type: ignore elif fmt in ["mmcif-dict"]: ret = self.__deserializeMmCifDict(filePath, **kwargs) # type: ignore elif fmt in ["fasta"]: ret = self.__deserializeFasta(filePath, **kwargs) # type: ignore # elif fmt in ["vrpt-xml-to-cif"]: # ret = self.__deserializeVrptToCif(filePath, **kwargs) # type: ignore elif fmt in ["csv", "tdd"]: delimiter = kwargs.get("csvDelimiter", "," if fmt == "csv" else "\t") ret = self.__deserializeCsv(filePath, delimiter=delimiter, **kwargs) # type: ignore elif fmt in ["xml"]: ret = self.__deserializeXml(filePath, **kwargs) # type: ignore else: ret = None # type: ignore return ret def __sliceInChunks(self, myList, numChunks): mc = min(len(myList), numChunks) chunkSize = int(len(myList) / mc) if len(myList) % mc: chunkSize += 1 for i in range(0, len(myList), chunkSize): yield myList[i:i + chunkSize] def serializeInParts(self, filePath, myObj, numParts, fmt="json", **kwargs): """Public method to serialize format appropriate (json, pickle) objects in multiple parts Args: filePath (str): local file path myObj (object): format appropriate object to be serialized numParts (int): divide the data into numParts segments format (str, optional): one of ['json' or 'pickle']. 
Defaults to json **kwargs: additional keyword arguments passed to worker methods - Returns: bool: True for success or False otherwise """ if fmt not in ["json", "pickle"]: logger.error("Unsupported format for %s", fmt) return False pth, fn = os.path.split(filePath) self.__fileU.mkdirForFile(pth) bn, ext = os.path.splitext(fn) ret = True if isinstance(myObj, list): for ii, subList in enumerate(self.__sliceInChunks(myObj, numParts)): fp = os.path.join(pth, bn + "_part_%d" % (ii + 1) + ext) ok = self.serialize(fp, subList, fmt=fmt, **kwargs) ret = ret and ok elif isinstance(myObj, dict): for ii, keyList in enumerate( self.__sliceInChunks(list(myObj.keys()), numParts)): fp = os.path.join(pth, bn + "_part_%d" % (ii + 1) + ext) ok = self.serialize(fp, OrderedDict([(k, myObj[k]) for k in keyList]), fmt=fmt, **kwargs) ret = ret and ok else: logger.error("Unsupported data type for serialization in parts") ret = False # return ret def deserializeInParts(self, filePath, numParts, fmt="json", **kwargs): """Public method to deserialize objects in supported formats from multiple parts Args: filePath (str): local file path numParts (int): reconstruct the data object from numParts segments format (str, optional): one of ['json' or 'pickle']. Defaults to json **kwargs: additional keyword arguments passed to worker methods - Returns: object: deserialized object data """ rObj = None if fmt not in ["json", "pickle"]: logger.error("Unsupported format for %s", fmt) return rObj # pth, fn = os.path.split(filePath) bn, ext = os.path.splitext(fn) if not numParts: fp = os.path.join(pth, bn + "_part_*" + ext) numParts = len(glob.glob(fp)) # for ii in range(numParts): fp = os.path.join(pth, bn + "_part_%d" % (ii + 1) + ext) tObj = self.deserialize(fp, fmt=fmt, **kwargs) if isinstance(tObj, list): if not rObj: rObj = [] rObj.extend(tObj) elif isinstance(tObj, dict): if not rObj: rObj = OrderedDict() rObj.update(tObj) else: logger.error( "Unsupported data type for deserialization in parts") return rObj def exists(self, filePath, mode=os.R_OK): return self.__fileU.exists(filePath, mode=mode) def mkdir(self, dirPath, mode=0o755): return self.__fileU.mkdir(dirPath, mode=mode) def remove(self, pth): return self.__fileU.remove(pth) def __deserializeFasta(self, filePath, **kwargs): try: commentStyle = kwargs.get("commentStyle", "uniprot") fau = FastaUtil() return fau.readFasta(filePath, commentStyle=commentStyle) except Exception as e: logger.error("Unable to deserialize %r %r ", filePath, str(e)) return {} def __serializeFasta(self, filePath, myObj, **kwargs): try: maxLineLength = int(kwargs.get("maxLineLength", 70)) makeComment = kwargs.get("makeComment", False) fau = FastaUtil() ok = fau.writeFasta(filePath, myObj, maxLineLength=maxLineLength, makeComment=makeComment) return ok except Exception as e: logger.error("Unable to serialize FASTA file %r %r", filePath, str(e)) return False def __textDump(self, filePath, myObj, **kwargs): try: indent = kwargs.get("indent", 1) width = kwargs.get("width", 120) sOut = pprint.pformat(myObj, indent=indent, width=width) with open(filePath, "w") as ofh: ofh.write("\n%s\n" % sOut) return True except Exception as e: logger.error("Unable to dump to %r %r", filePath, str(e)) return False def __serializePickle(self, filePath, myObj, **kwargs): try: pickleProtocol = kwargs.get("pickleProtocol", pickle.DEFAULT_PROTOCOL) with open(filePath, "wb") as outfile: pickle.dump(myObj, outfile, pickleProtocol) return True except Exception as e: logger.error("Unable to serialize %r %r", filePath, str(e)) 
return False def __deserializePickle(self, filePath, **kwargs): myDefault = kwargs.get("default", {}) try: if sys.version_info[0] > 2: encoding = kwargs.get("encoding", "ASCII") errors = kwargs.get("errors", "strict") with open(filePath, "rb") as outfile: return pickle.load(outfile, encoding=encoding, errors=errors) else: with open(filePath, "rb") as outfile: return pickle.load(outfile) except Exception as e: logger.warning("Unable to deserialize %r %r", filePath, str(e)) return myDefault def __serializeJson(self, filePath, myObj, **kwargs): """Internal method to serialize the input object as JSON. An encoding helper class is included to handle selected python data types (e.g., datetime) """ indent = kwargs.get("indent", 0) enforceAscii = kwargs.get("enforceAscii", True) try: if enforceAscii: with open(filePath, "w") as outfile: json.dump(myObj, outfile, indent=indent, cls=JsonTypeEncoder, ensure_ascii=enforceAscii) else: with io.open(filePath, "w", encoding="utf-8") as outfile: json.dump(myObj, outfile, indent=indent, cls=JsonTypeEncoder, ensure_ascii=enforceAscii) return True except Exception as e: logger.error("Unable to serialize %r %r", filePath, str(e)) return False def __deserializeJson(self, filePath, **kwargs): myDefault = kwargs.get("default", {}) encoding = kwargs.get("encoding", "utf-8-sig") encodingErrors = kwargs.get("encodingErrors", "ignore") try: if filePath[-3:] == ".gz": if sys.version_info[0] > 2: with gzip.open(filePath, "rt", encoding=encoding, errors=encodingErrors) as inpFile: return json.load(inpFile, object_pairs_hook=OrderedDict) else: # Py2 situation non-ascii encodings is problematic # with gzip.open(filePath, "rb") as csvFile: # oL = self.__csvReader(csvFile, rowFormat, delimiter) tPath = self.__fileU.uncompress(filePath, outputDir=None) with io.open(tPath, newline="", encoding=encoding, errors="ignore") as inpFile: return json.load(inpFile, object_pairs_hook=OrderedDict) else: with open(filePath, "r") as inpFile: return json.load(inpFile, object_pairs_hook=OrderedDict) except Exception as e: logger.warning("Unable to deserialize %r %r", filePath, str(e)) return myDefault def __hasMinSize(self, pth, minSize): try: return os.path.getsize(pth) >= minSize except Exception: return False def __deserializeMmCif(self, locator, **kwargs): """ """ try: containerList = [] workPath = kwargs.get("workPath", None) enforceAscii = kwargs.get("enforceAscii", True) raiseExceptions = kwargs.get("raiseExceptions", True) useCharRefs = kwargs.get("useCharRefs", True) minSize = kwargs.get("minSize", 5) # if self.__fileU.isLocal(locator): if minSize >= 0 and not self.__hasMinSize(locator, minSize): logger.warning("Minimum file size not satisfied for: %r", locator) myIo = IoAdapter(raiseExceptions=raiseExceptions, useCharRefs=useCharRefs) containerList = myIo.readFile( locator, enforceAscii=enforceAscii, outDirPath=workPath) # type: ignore else: # myIo = IoAdapterPy(raiseExceptions=raiseExceptions, useCharRefs=useCharRefs) # containerList = myIo.readFile(locator, enforceAscii=enforceAscii, outDirPath=workPath) containerList = self.__deserializeMmCifRemote( locator, useCharRefs, enforceAscii, workPath) except Exception as e: logger.error("Failing for %s with %s", locator, str(e)) return containerList @retry((requests.exceptions.RequestException), maxAttempts=3, delaySeconds=1, multiplier=2, defaultValue=[], logger=logger) def __deserializeMmCifRemote(self, locator, useCharRefs, enforceAscii, workPath): containerList = [] try: myIo = IoAdapterPy(raiseExceptions=True, 
useCharRefs=useCharRefs) containerList = myIo.readFile(locator, enforceAscii=enforceAscii, outDirPath=workPath) except Exception as e: raise e return containerList def __serializeMmCif(self, filePath, containerList, **kwargs): """ """ try: ret = False workPath = kwargs.get("workPath", None) enforceAscii = kwargs.get("enforceAscii", True) raiseExceptions = kwargs.get("raiseExceptions", True) useCharRefs = kwargs.get("useCharRefs", True) # myIo = IoAdapter(raiseExceptions=raiseExceptions, useCharRefs=useCharRefs) if filePath.endswith(".gz") and workPath: rfn = "".join( random.choice(string.ascii_uppercase + string.digits) for _ in range(10)) tPath = os.path.join(workPath, rfn) ret = myIo.writeFile(tPath, containerList=containerList, enforceAscii=enforceAscii) ret = self.__fileU.compress(tPath, filePath, compressType="gzip") else: ret = myIo.writeFile(filePath, containerList=containerList, enforceAscii=enforceAscii) except Exception as e: logger.error("Failing for %s with %s", filePath, str(e)) return ret def __deserializeMmCifDict(self, filePath, **kwargs): """ """ try: containerList = [] workPath = kwargs.get("workPath", None) enforceAscii = kwargs.get("enforceAscii", True) raiseExceptions = kwargs.get("raiseExceptions", True) useCharRefs = kwargs.get("useCharRefs", True) # myIo = IoAdapterPy(raiseExceptions=raiseExceptions, useCharRefs=useCharRefs) containerList = myIo.readFile(filePath, enforceAscii=enforceAscii, outDirPath=workPath) except Exception as e: logger.error("Failing for %s with %s", filePath, str(e)) return containerList def __serializeMmCifDict(self, filePath, containerList, **kwargs): """ """ try: ret = False # workPath = kwargs.get('workPath', None) enforceAscii = kwargs.get("enforceAscii", True) raiseExceptions = kwargs.get("raiseExceptions", True) useCharRefs = kwargs.get("useCharRefs", True) # myIo = IoAdapterPy(raiseExceptions=raiseExceptions, useCharRefs=useCharRefs) ret = myIo.writeFile(filePath, containerList=containerList, enforceAscii=enforceAscii) except Exception as e: logger.error("Failing for %s with %s", filePath, str(e)) return ret def __serializeList(self, filePath, aList, enforceAscii=True, **kwargs): """ """ try: _ = kwargs if enforceAscii: encoding = "ascii" else: encoding = "utf-8" # if sys.version_info[0] > 2: with open(filePath, "w") as ofh: if enforceAscii: for st in aList: ofh.write("%s\n" % st.encode( "ascii", "xmlcharrefreplace").decode("ascii")) else: for st in aList: ofh.write("%s\n" % st) else: if enforceAscii: with io.open(filePath, "w", encoding=encoding) as ofh: for st in aList: ofh.write("%s\n" % st.encode( "ascii", "xmlcharrefreplace").decode("ascii")) else: with open(filePath, "wb") as ofh: for st in aList: ofh.write("%s\n" % st) return True except Exception as e: logger.error("Unable to serialize %r %r", filePath, str(e)) return False def __processList(self, ifh, enforceAscii=True, **kwargs): uncomment = kwargs.get("uncomment", True) aList = [] for line in ifh: if enforceAscii: pth = line[:-1].encode("ascii", "xmlcharrefreplace").decode("ascii") else: pth = line[:-1] if not pth or (uncomment and pth.startswith("#")): continue aList.append(pth) return aList def __deserializeList(self, filePath, enforceAscii=True, encodingErrors="ignore", **kwargs): aList = [] _ = kwargs try: if filePath[-3:] == ".gz": if sys.version_info[0] > 2: with gzip.open(filePath, "rt", encoding="utf-8-sig", errors=encodingErrors) as ifh: aList = self.__processList(ifh, enforceAscii=enforceAscii, **kwargs) else: tPath = self.__fileU.uncompress(filePath, 
outputDir=None) # for py2 this commented code is problematic for non-ascii data # with gzip.open(filePath, "rb") as ifh: # aList = self.__processList(ifh, enforceAscii=enforceAscii) with io.open(tPath, encoding="utf-8-sig", errors="ignore") as ifh: aList = self.__processList(ifh, enforceAscii=enforceAscii) else: with io.open(filePath, encoding="utf-8-sig", errors="ignore") as ifh: aList = self.__processList(ifh, enforceAscii=enforceAscii, **kwargs) except Exception as e: logger.error("Unable to deserialize %r %s", filePath, str(e)) # logger.debug("Reading list length %d", len(aList)) return aList def __csvReader(self, csvFile, rowFormat, delimiter, uncomment=True): oL = [] maxInt = sys.maxsize csv.field_size_limit(maxInt) if rowFormat == "dict": if uncomment: reader = csv.DictReader(uncommentFilter(csvFile), delimiter=delimiter) else: reader = csv.DictReader(csvFile, delimiter=delimiter) for rowD in reader: oL.append(rowD) elif rowFormat == "list": if uncomment: reader = csv.reader(uncommentFilter(csvFile), delimiter=delimiter) else: reader = csv.reader(csvFile, delimiter=delimiter) for rowL in reader: oL.append(rowL) return oL def deserializeCsvIter(self, filePath, delimiter=",", rowFormat="dict", encodingErrors="ignore", uncomment=True, **kwargs): """Return an iterator to input CSV format file. Args: filePath (str): input file path delimiter (str, optional): CSV delimiter. Defaults to ",". rowFormat (str, optional): format for each process row (list or dict). Defaults to "dict". encodingErrors (str, optional): treatment of encoding errors. Defaults to "ignore". uncomment (bool, optional): flag to ignore leading comments. Defaults to True. Returns: (iterator): iterator for rowwise access to processed CSV data """ encoding = kwargs.get("encoding", "utf-8-sig") maxInt = sys.maxsize csv.field_size_limit(maxInt) try: if filePath[-3:] == ".gz": with gzip.open(filePath, "rt", encoding=encoding, errors=encodingErrors) as csvFile: startIt = itertools.dropwhile( lambda x: x.startswith("#"), csvFile) if uncomment else csvFile if rowFormat == "dict": reader = csv.DictReader(startIt, delimiter=delimiter) elif rowFormat == "list": reader = csv.reader(startIt, delimiter=delimiter) for row in reader: yield row else: with io.open(filePath, newline="", encoding=encoding, errors="ignore") as csvFile: startIt = itertools.dropwhile( lambda x: x.startswith("#"), csvFile) if uncomment else csvFile if rowFormat == "dict": reader = csv.DictReader(startIt, delimiter=delimiter) elif rowFormat == "list": reader = csv.reader(startIt, delimiter=delimiter) for row in reader: # if uncomment and row.startswith("#"): # continue yield row except Exception as e: logger.error("Unable to deserialize %r %s", filePath, str(e)) def __deserializeCsv(self, filePath, delimiter=",", rowFormat="dict", encodingErrors="ignore", uncomment=True, **kwargs): oL = [] encoding = kwargs.get("encoding", "utf-8-sig") try: if filePath[-3:] == ".gz": if sys.version_info[0] > 2: with gzip.open(filePath, "rt", encoding=encoding, errors=encodingErrors) as csvFile: oL = self.__csvReader(csvFile, rowFormat, delimiter, uncomment=uncomment) else: # Py2 situation non-ascii encodings is problematic # with gzip.open(filePath, "rb") as csvFile: # oL = self.__csvReader(csvFile, rowFormat, delimiter) tPath = self.__fileU.uncompress(filePath, outputDir=None) with io.open(tPath, newline="", encoding=encoding, errors="ignore") as csvFile: oL = self.__csvReader(csvFile, rowFormat, delimiter, uncomment=uncomment) else: with io.open(filePath, newline="", 
def __deserializeCsv(self, filePath, delimiter=",", rowFormat="dict", encodingErrors="ignore", uncomment=True, **kwargs):
    oL = []
    encoding = kwargs.get("encoding", "utf-8-sig")
    try:
        if filePath[-3:] == ".gz":
            if sys.version_info[0] > 2:
                with gzip.open(filePath, "rt", encoding=encoding, errors=encodingErrors) as csvFile:
                    oL = self.__csvReader(csvFile, rowFormat, delimiter, uncomment=uncomment)
            else:
                # The py2 situation with non-ASCII encodings is problematic;
                # decompress to a temporary path first.
                # with gzip.open(filePath, "rb") as csvFile:
                #     oL = self.__csvReader(csvFile, rowFormat, delimiter)
                tPath = self.__fileU.uncompress(filePath, outputDir=None)
                with io.open(tPath, newline="", encoding=encoding, errors=encodingErrors) as csvFile:
                    oL = self.__csvReader(csvFile, rowFormat, delimiter, uncomment=uncomment)
        else:
            with io.open(filePath, newline="", encoding=encoding, errors=encodingErrors) as csvFile:
                oL = self.__csvReader(csvFile, rowFormat, delimiter, uncomment=uncomment)
        return oL
    except Exception as e:
        logger.error("Unable to deserialize %r %s", filePath, str(e))
    #
    # logger.debug("Reading list length %d", len(oL))
    return oL

def __serializeCsv(self, filePath, rowDictList, fieldNames=None, **kwargs):
    """Serialize the input list of row dictionaries as a CSV file."""
    _ = kwargs
    try:
        wD = {}
        ret = False
        # Default the column set to the keys of the first row.
        fNames = fieldNames if fieldNames else list(rowDictList[0].keys())
        # with io.open(filePath, 'w', newline='') as csvFile:
        with open(filePath, "w") as csvFile:
            writer = csv.DictWriter(csvFile, fieldnames=fNames)
            writer.writeheader()
            for ii, rowDict in enumerate(rowDictList):
                try:
                    wD = {k: v for k, v in rowDict.items() if k in fNames}
                    writer.writerow(wD)
                except Exception as e:
                    logger.error("Skipping bad CSV record %d wD %r rowDict %r with %s", ii + 1, wD, rowDict, str(e))
                    continue
        ret = True
    except Exception as e:
        logger.error("Failing for %s : %r with %s", filePath, wD, str(e))
    return ret

def __csvEncoder(self, csvData, encoding="utf-8-sig", encodingErrors="ignore"):
    """Handle encoding issues for gzipped data in py2 (beware of the BOM chars).

    Args:
        csvData (text lines): uncompressed data from gzip open
        encoding (str, optional): character encoding. Defaults to "utf-8-sig".
        encodingErrors (str, optional): error treatment. Defaults to "ignore".
    """
    for line in csvData:
        yield line.decode("utf-8-sig", errors=encodingErrors).encode(encoding, errors=encodingErrors)

def __deserializeXmlPrev(self, filePath, **kwargs):
    """Read the input XML file path and return an ElementTree data object instance.

    Args:
        filePath (str): input XML file path

    Returns:
        object: instance of an ElementTree tree object
    """
    _ = kwargs
    tree = None
    try:
        logger.debug("Parsing XML path %s", filePath)
        if filePath[-3:] == ".gz":
            with gzip.open(filePath, mode="rb") as ifh:
                tV = time.time()
                tree = ET.parse(ifh)
        else:
            with open(filePath, mode="rb") as ifh:
                tV = time.time()
                tree = ET.parse(ifh)
        logger.debug("Parsed %s in %.2f seconds", filePath, time.time() - tV)
    except Exception as e:
        logger.error("Unable to deserialize %r %s", filePath, str(e))
    #
    return tree

def __testGzip(self, filePath):
    """Check that the input file is a readable gzip file (by reading a single byte)."""
    ok = True
    with gzip.open(filePath, "r") as fh:
        try:
            fh.read(1)
        except gzip.BadGzipFile:
            # gzip.BadGzipFile requires Python >= 3.8
            ok = False
        except Exception:
            ok = False
    logger.debug("Gzip file check %r", ok)
    return ok

def __deserializeXml(self, filePath, **kwargs):
    """Read the input XML file path and return an ElementTree data object instance.

    Args:
        filePath (str): input XML file path

    Returns:
        object: instance of an ElementTree tree object
    """
    tree = None
    encoding = kwargs.get("encoding", "utf-8-sig")
    encodingErrors = kwargs.get("encodingErrors", "ignore")
    #
    try:
        logger.debug("Parsing XML path %s", filePath)
        if filePath[-3:] == ".gz" and self.__testGzip(filePath):
            if sys.version_info[0] > 2:
                with gzip.open(filePath, "rt", encoding=encoding, errors=encodingErrors) as ifh:
                    tV = time.time()
                    tree = ET.parse(ifh)
            else:
                tPath = self.__fileU.uncompress(filePath, outputDir=None)
                with io.open(tPath, encoding=encoding, errors=encodingErrors) as ifh:
                    tV = time.time()
                    tree = ET.parse(ifh)
        else:
            with io.open(filePath, encoding=encoding, errors=encodingErrors) as ifh:
                tV = time.time()
                tree = ET.parse(ifh)
        logger.debug("Parsed %s in %.2f seconds", filePath, time.time() - tV)
    except Exception as e:
        logger.error("Unable to deserialize %r %s", filePath, str(e))
    #
    return tree
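# Usage sketch (illustrative; the file path is hypothetical): the ElementTree
# object returned by __deserializeXml() is traversed with the standard
# xml.etree API.
#
#     >>> tree = ET.parse("./example-entry.xml")  # the call both XML readers wrap
#     >>> root = tree.getroot()
#     >>> childTags = [child.tag for child in root]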
def __reload(self, dirPath, useCache=False, imgtDumpUrl=None, testList=None, maxCount=None):
    """Reload IMGT 3D structure data, either from the local cache or from a fresh fetch of the IMGT flat-file dump."""
    imgtD = {}
    startTime = time.time()
    fU = FileUtil()
    fU.mkdir(dirPath)
    #
    imgtDataPath = os.path.join(self.__dirPath, "imgt-data.json")
    #
    logger.info("useCache %r imgtDataPath %r", useCache, imgtDataPath)
    if useCache and self.__mU.exists(imgtDataPath):
        imgtD = self.__mU.doImport(imgtDataPath, fmt="json")
        self.__version = imgtD["version"]
    else:
        imgtDumpUrl = imgtDumpUrl if imgtDumpUrl else "http://www.imgt.org/download/3Dstructure-DB/IMGT3DFlatFiles.tgz"
        imgtReadmeUrl = "http://www.imgt.org/download/3Dstructure-DB/RELEASE"
        imgtDumpFileName = fU.getFileName(imgtDumpUrl)
        imgtDumpPath = os.path.join(dirPath, imgtDumpFileName)
        imgtReleasePath = os.path.join(dirPath, "IMGT-release.txt")
        _, fn = os.path.split(imgtDumpUrl)
        # Strip the ".tgz" suffix to get the unbundled flat-file directory path.
        imgtFlatFilePath = os.path.join(self.__dirPath, fn[:-4])
        #
        logger.info("Fetching url %s path %s", imgtDumpUrl, imgtDumpPath)
        ok1 = fU.get(imgtDumpUrl, imgtDumpPath)
        ok2 = fU.get(imgtReadmeUrl, imgtReleasePath)
        fU.unbundleTarfile(imgtDumpPath, dirPath=dirPath)
        logger.info("Completed fetch (%r) at %s (%.4f seconds)", ok1 and ok2, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime)
        # ---
        # The first line of the RELEASE file carries the IMGT version string.
        readmeLines = self.__mU.doImport(imgtReleasePath, fmt="list")
        self.__version = readmeLines[0].strip() if readmeLines else None
        logger.info("IMGT version %r", self.__version)
        # ---
        chainD, rawD = self.__imgtFlatFileProcessor(imgtFlatFilePath, maxCount=maxCount, testList=testList)
        # ---
        tS = datetime.datetime.now().isoformat()
        # vS = datetime.datetime.now().strftime("%Y-%m-%d")
        if testList:
            imgtD = {"version": self.__version, "date": tS, "chains": chainD, "raw": rawD}
        else:
            imgtD = {"version": self.__version, "date": tS, "chains": chainD}
        ok = self.__mU.doExport(imgtDataPath, imgtD, fmt="json", indent=3)
        logger.info("Completed flatfile prep (%r) at %s (%.4f seconds)", ok, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime)
    return imgtD
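# Shape sketch (illustrative) of the document exported to imgt-data.json by
# __reload() above; the "..." elisions stand in for content produced by
# __imgtFlatFileProcessor().
#
#     {
#         "version": "...",   # first line of the IMGT RELEASE file
#         "date": "...",      # ISO-8601 timestamp from datetime.datetime.now().isoformat()
#         "chains": {...},    # per-chain annotations
#         "raw": {...}        # present only when testList is provided
#     }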