def get(self, remotePath, localPath): try: fileU = FileUtil() fileU.mkdirForFile(localPath) self.__sftpClient.get(remotePath, localPath) return True except Exception as e: if self.__raiseExceptions: raise e else: logger.error("get failing for remotePath %s localPath %s with %s", remotePath, localPath, str(e)) return False
def get(self, remotePath, localPath): """Get a file from a remote FTP server. Arguments: remotePath (str): remote file path localPath (str): local file path Returns: bool: True for success or false otherwise """ try: fileU = FileUtil() fileU.mkdirForFile(localPath) # If provided localPath already exists and is a directory, retrieve the file using the name on the remote server # to avoid unintentionally overwriting an entire local directory with a single retrieved file if (os.path.exists(localPath) and os.path.isdir(localPath)): remoteFileName = FileUtil().getFileName(remotePath) localFilePath = os.path.join(localPath, remoteFileName) else: localFilePath = localPath with open(localFilePath, 'wb') as lFP: self.__ftpClient.retrbinary('RETR %s' % remotePath, lFP.write) ok = fileU.exists(localFilePath) if ok: return True else: logger.error("get failing for remotePath %s localFilePath %s", remotePath, localFilePath) return False except Exception as e: if self.__raiseExceptions: raise e else: logger.error( "get failing for remotePath %s localPath %s with %s", remotePath, localPath, str(e)) return False
class IoUtil(object): def __init__(self, **kwargs): self.__fileU = FileUtil(**kwargs) def serialize(self, filePath, myObj, fmt="pickle", **kwargs): """Public method to serialize format appropriate objects Args: filePath (str): local file path' myObj (object): format appropriate object to be serialized format (str, optional): one of ['mmcif', mmcif-dict', json', 'list', 'text-dump', pickle' (default)] **kwargs: additional keyword arguments passed to worker methods - Returns: bool: status of serialization operation; true for success or false otherwise """ ret = False fmt = str(fmt).lower() ret = self.__fileU.mkdirForFile(filePath) if not ret: return ret if fmt in ["mmcif"]: ret = self.__serializeMmCif(filePath, myObj, **kwargs) elif fmt in ["json"]: ret = self.__serializeJson(filePath, myObj, **kwargs) elif fmt in ["pickle"]: ret = self.__serializePickle(filePath, myObj, **kwargs) elif fmt in ["list"]: ret = self.__serializeList(filePath, myObj, enforceAscii=True, **kwargs) elif fmt in ["mmcif-dict"]: ret = self.__serializeMmCifDict(filePath, myObj, **kwargs) elif fmt in ["text-dump"]: ret = self.__textDump(filePath, myObj, **kwargs) elif fmt in ["fasta"]: ret = self.__serializeFasta(filePath, myObj, **kwargs) elif fmt in ["csv"]: ret = self.__serializeCsv(filePath, myObj, **kwargs) else: pass return ret def deserialize(self, filePath, fmt="pickle", **kwargs): """Public method to deserialize objects in supported formats. Args: filePath (str): local file path format (str, optional): one of ['mmcif', 'json', 'list', ..., 'pickle' (default)] **kwargs: additional keyword arguments passed to worker methods - Returns: object: deserialized object data """ fmt = str(fmt).lower() if fmt in ["mmcif"]: ret = self.__deserializeMmCif(filePath, **kwargs) # type: ignore elif fmt in ["json"]: ret = self.__deserializeJson(filePath, **kwargs) # type: ignore elif fmt in ["pickle"]: ret = self.__deserializePickle(filePath, **kwargs) # type: ignore elif fmt in ["list"]: ret = self.__deserializeList(filePath, enforceAscii=True, **kwargs) # type: ignore elif fmt in ["mmcif-dict"]: ret = self.__deserializeMmCifDict(filePath, **kwargs) # type: ignore elif fmt in ["fasta"]: ret = self.__deserializeFasta(filePath, **kwargs) # type: ignore # elif fmt in ["vrpt-xml-to-cif"]: # ret = self.__deserializeVrptToCif(filePath, **kwargs) # type: ignore elif fmt in ["csv", "tdd"]: delimiter = kwargs.get("csvDelimiter", "," if fmt == "csv" else "\t") ret = self.__deserializeCsv(filePath, delimiter=delimiter, **kwargs) # type: ignore elif fmt in ["xml"]: ret = self.__deserializeXml(filePath, **kwargs) # type: ignore else: ret = None # type: ignore return ret def __sliceInChunks(self, myList, numChunks): mc = min(len(myList), numChunks) chunkSize = int(len(myList) / mc) if len(myList) % mc: chunkSize += 1 for i in range(0, len(myList), chunkSize): yield myList[i:i + chunkSize] def serializeInParts(self, filePath, myObj, numParts, fmt="json", **kwargs): """Public method to serialize format appropriate (json, pickle) objects in multiple parts Args: filePath (str): local file path myObj (object): format appropriate object to be serialized numParts (int): divide the data into numParts segments format (str, optional): one of ['json' or 'pickle']. Defaults to json **kwargs: additional keyword arguments passed to worker methods - Returns: bool: True for success or False otherwise """ if fmt not in ["json", "pickle"]: logger.error("Unsupported format for %s", fmt) return False pth, fn = os.path.split(filePath) self.__fileU.mkdirForFile(pth) bn, ext = os.path.splitext(fn) ret = True if isinstance(myObj, list): for ii, subList in enumerate(self.__sliceInChunks(myObj, numParts)): fp = os.path.join(pth, bn + "_part_%d" % (ii + 1) + ext) ok = self.serialize(fp, subList, fmt=fmt, **kwargs) ret = ret and ok elif isinstance(myObj, dict): for ii, keyList in enumerate( self.__sliceInChunks(list(myObj.keys()), numParts)): fp = os.path.join(pth, bn + "_part_%d" % (ii + 1) + ext) ok = self.serialize(fp, OrderedDict([(k, myObj[k]) for k in keyList]), fmt=fmt, **kwargs) ret = ret and ok else: logger.error("Unsupported data type for serialization in parts") ret = False # return ret def deserializeInParts(self, filePath, numParts, fmt="json", **kwargs): """Public method to deserialize objects in supported formats from multiple parts Args: filePath (str): local file path numParts (int): reconstruct the data object from numParts segments format (str, optional): one of ['json' or 'pickle']. Defaults to json **kwargs: additional keyword arguments passed to worker methods - Returns: object: deserialized object data """ rObj = None if fmt not in ["json", "pickle"]: logger.error("Unsupported format for %s", fmt) return rObj # pth, fn = os.path.split(filePath) bn, ext = os.path.splitext(fn) if not numParts: fp = os.path.join(pth, bn + "_part_*" + ext) numParts = len(glob.glob(fp)) # for ii in range(numParts): fp = os.path.join(pth, bn + "_part_%d" % (ii + 1) + ext) tObj = self.deserialize(fp, fmt=fmt, **kwargs) if isinstance(tObj, list): if not rObj: rObj = [] rObj.extend(tObj) elif isinstance(tObj, dict): if not rObj: rObj = OrderedDict() rObj.update(tObj) else: logger.error( "Unsupported data type for deserialization in parts") return rObj def exists(self, filePath, mode=os.R_OK): return self.__fileU.exists(filePath, mode=mode) def mkdir(self, dirPath, mode=0o755): return self.__fileU.mkdir(dirPath, mode=mode) def remove(self, pth): return self.__fileU.remove(pth) def __deserializeFasta(self, filePath, **kwargs): try: commentStyle = kwargs.get("commentStyle", "uniprot") fau = FastaUtil() return fau.readFasta(filePath, commentStyle=commentStyle) except Exception as e: logger.error("Unable to deserialize %r %r ", filePath, str(e)) return {} def __serializeFasta(self, filePath, myObj, **kwargs): try: maxLineLength = int(kwargs.get("maxLineLength", 70)) makeComment = kwargs.get("makeComment", False) fau = FastaUtil() ok = fau.writeFasta(filePath, myObj, maxLineLength=maxLineLength, makeComment=makeComment) return ok except Exception as e: logger.error("Unable to serialize FASTA file %r %r", filePath, str(e)) return False def __textDump(self, filePath, myObj, **kwargs): try: indent = kwargs.get("indent", 1) width = kwargs.get("width", 120) sOut = pprint.pformat(myObj, indent=indent, width=width) with open(filePath, "w") as ofh: ofh.write("\n%s\n" % sOut) return True except Exception as e: logger.error("Unable to dump to %r %r", filePath, str(e)) return False def __serializePickle(self, filePath, myObj, **kwargs): try: pickleProtocol = kwargs.get("pickleProtocol", pickle.DEFAULT_PROTOCOL) with open(filePath, "wb") as outfile: pickle.dump(myObj, outfile, pickleProtocol) return True except Exception as e: logger.error("Unable to serialize %r %r", filePath, str(e)) return False def __deserializePickle(self, filePath, **kwargs): myDefault = kwargs.get("default", {}) try: if sys.version_info[0] > 2: encoding = kwargs.get("encoding", "ASCII") errors = kwargs.get("errors", "strict") with open(filePath, "rb") as outfile: return pickle.load(outfile, encoding=encoding, errors=errors) else: with open(filePath, "rb") as outfile: return pickle.load(outfile) except Exception as e: logger.warning("Unable to deserialize %r %r", filePath, str(e)) return myDefault def __serializeJson(self, filePath, myObj, **kwargs): """Internal method to serialize the input object as JSON. An encoding helper class is included to handle selected python data types (e.g., datetime) """ indent = kwargs.get("indent", 0) enforceAscii = kwargs.get("enforceAscii", True) try: if enforceAscii: with open(filePath, "w") as outfile: json.dump(myObj, outfile, indent=indent, cls=JsonTypeEncoder, ensure_ascii=enforceAscii) else: with io.open(filePath, "w", encoding="utf-8") as outfile: json.dump(myObj, outfile, indent=indent, cls=JsonTypeEncoder, ensure_ascii=enforceAscii) return True except Exception as e: logger.error("Unable to serialize %r %r", filePath, str(e)) return False def __deserializeJson(self, filePath, **kwargs): myDefault = kwargs.get("default", {}) encoding = kwargs.get("encoding", "utf-8-sig") encodingErrors = kwargs.get("encodingErrors", "ignore") try: if filePath[-3:] == ".gz": if sys.version_info[0] > 2: with gzip.open(filePath, "rt", encoding=encoding, errors=encodingErrors) as inpFile: return json.load(inpFile, object_pairs_hook=OrderedDict) else: # Py2 situation non-ascii encodings is problematic # with gzip.open(filePath, "rb") as csvFile: # oL = self.__csvReader(csvFile, rowFormat, delimiter) tPath = self.__fileU.uncompress(filePath, outputDir=None) with io.open(tPath, newline="", encoding=encoding, errors="ignore") as inpFile: return json.load(inpFile, object_pairs_hook=OrderedDict) else: with open(filePath, "r") as inpFile: return json.load(inpFile, object_pairs_hook=OrderedDict) except Exception as e: logger.warning("Unable to deserialize %r %r", filePath, str(e)) return myDefault def __hasMinSize(self, pth, minSize): try: return os.path.getsize(pth) >= minSize except Exception: return False def __deserializeMmCif(self, locator, **kwargs): """ """ try: containerList = [] workPath = kwargs.get("workPath", None) enforceAscii = kwargs.get("enforceAscii", True) raiseExceptions = kwargs.get("raiseExceptions", True) useCharRefs = kwargs.get("useCharRefs", True) minSize = kwargs.get("minSize", 5) # if self.__fileU.isLocal(locator): if minSize >= 0 and not self.__hasMinSize(locator, minSize): logger.warning("Minimum file size not satisfied for: %r", locator) myIo = IoAdapter(raiseExceptions=raiseExceptions, useCharRefs=useCharRefs) containerList = myIo.readFile( locator, enforceAscii=enforceAscii, outDirPath=workPath) # type: ignore else: # myIo = IoAdapterPy(raiseExceptions=raiseExceptions, useCharRefs=useCharRefs) # containerList = myIo.readFile(locator, enforceAscii=enforceAscii, outDirPath=workPath) containerList = self.__deserializeMmCifRemote( locator, useCharRefs, enforceAscii, workPath) except Exception as e: logger.error("Failing for %s with %s", locator, str(e)) return containerList @retry((requests.exceptions.RequestException), maxAttempts=3, delaySeconds=1, multiplier=2, defaultValue=[], logger=logger) def __deserializeMmCifRemote(self, locator, useCharRefs, enforceAscii, workPath): containerList = [] try: myIo = IoAdapterPy(raiseExceptions=True, useCharRefs=useCharRefs) containerList = myIo.readFile(locator, enforceAscii=enforceAscii, outDirPath=workPath) except Exception as e: raise e return containerList def __serializeMmCif(self, filePath, containerList, **kwargs): """ """ try: ret = False workPath = kwargs.get("workPath", None) enforceAscii = kwargs.get("enforceAscii", True) raiseExceptions = kwargs.get("raiseExceptions", True) useCharRefs = kwargs.get("useCharRefs", True) # myIo = IoAdapter(raiseExceptions=raiseExceptions, useCharRefs=useCharRefs) if filePath.endswith(".gz") and workPath: rfn = "".join( random.choice(string.ascii_uppercase + string.digits) for _ in range(10)) tPath = os.path.join(workPath, rfn) ret = myIo.writeFile(tPath, containerList=containerList, enforceAscii=enforceAscii) ret = self.__fileU.compress(tPath, filePath, compressType="gzip") else: ret = myIo.writeFile(filePath, containerList=containerList, enforceAscii=enforceAscii) except Exception as e: logger.error("Failing for %s with %s", filePath, str(e)) return ret def __deserializeMmCifDict(self, filePath, **kwargs): """ """ try: containerList = [] workPath = kwargs.get("workPath", None) enforceAscii = kwargs.get("enforceAscii", True) raiseExceptions = kwargs.get("raiseExceptions", True) useCharRefs = kwargs.get("useCharRefs", True) # myIo = IoAdapterPy(raiseExceptions=raiseExceptions, useCharRefs=useCharRefs) containerList = myIo.readFile(filePath, enforceAscii=enforceAscii, outDirPath=workPath) except Exception as e: logger.error("Failing for %s with %s", filePath, str(e)) return containerList def __serializeMmCifDict(self, filePath, containerList, **kwargs): """ """ try: ret = False # workPath = kwargs.get('workPath', None) enforceAscii = kwargs.get("enforceAscii", True) raiseExceptions = kwargs.get("raiseExceptions", True) useCharRefs = kwargs.get("useCharRefs", True) # myIo = IoAdapterPy(raiseExceptions=raiseExceptions, useCharRefs=useCharRefs) ret = myIo.writeFile(filePath, containerList=containerList, enforceAscii=enforceAscii) except Exception as e: logger.error("Failing for %s with %s", filePath, str(e)) return ret def __serializeList(self, filePath, aList, enforceAscii=True, **kwargs): """ """ try: _ = kwargs if enforceAscii: encoding = "ascii" else: encoding = "utf-8" # if sys.version_info[0] > 2: with open(filePath, "w") as ofh: if enforceAscii: for st in aList: ofh.write("%s\n" % st.encode( "ascii", "xmlcharrefreplace").decode("ascii")) else: for st in aList: ofh.write("%s\n" % st) else: if enforceAscii: with io.open(filePath, "w", encoding=encoding) as ofh: for st in aList: ofh.write("%s\n" % st.encode( "ascii", "xmlcharrefreplace").decode("ascii")) else: with open(filePath, "wb") as ofh: for st in aList: ofh.write("%s\n" % st) return True except Exception as e: logger.error("Unable to serialize %r %r", filePath, str(e)) return False def __processList(self, ifh, enforceAscii=True, **kwargs): uncomment = kwargs.get("uncomment", True) aList = [] for line in ifh: if enforceAscii: pth = line[:-1].encode("ascii", "xmlcharrefreplace").decode("ascii") else: pth = line[:-1] if not pth or (uncomment and pth.startswith("#")): continue aList.append(pth) return aList def __deserializeList(self, filePath, enforceAscii=True, encodingErrors="ignore", **kwargs): aList = [] _ = kwargs try: if filePath[-3:] == ".gz": if sys.version_info[0] > 2: with gzip.open(filePath, "rt", encoding="utf-8-sig", errors=encodingErrors) as ifh: aList = self.__processList(ifh, enforceAscii=enforceAscii, **kwargs) else: tPath = self.__fileU.uncompress(filePath, outputDir=None) # for py2 this commented code is problematic for non-ascii data # with gzip.open(filePath, "rb") as ifh: # aList = self.__processList(ifh, enforceAscii=enforceAscii) with io.open(tPath, encoding="utf-8-sig", errors="ignore") as ifh: aList = self.__processList(ifh, enforceAscii=enforceAscii) else: with io.open(filePath, encoding="utf-8-sig", errors="ignore") as ifh: aList = self.__processList(ifh, enforceAscii=enforceAscii, **kwargs) except Exception as e: logger.error("Unable to deserialize %r %s", filePath, str(e)) # logger.debug("Reading list length %d", len(aList)) return aList def __csvReader(self, csvFile, rowFormat, delimiter, uncomment=True): oL = [] maxInt = sys.maxsize csv.field_size_limit(maxInt) if rowFormat == "dict": if uncomment: reader = csv.DictReader(uncommentFilter(csvFile), delimiter=delimiter) else: reader = csv.DictReader(csvFile, delimiter=delimiter) for rowD in reader: oL.append(rowD) elif rowFormat == "list": if uncomment: reader = csv.reader(uncommentFilter(csvFile), delimiter=delimiter) else: reader = csv.reader(csvFile, delimiter=delimiter) for rowL in reader: oL.append(rowL) return oL def deserializeCsvIter(self, filePath, delimiter=",", rowFormat="dict", encodingErrors="ignore", uncomment=True, **kwargs): """Return an iterator to input CSV format file. Args: filePath (str): input file path delimiter (str, optional): CSV delimiter. Defaults to ",". rowFormat (str, optional): format for each process row (list or dict). Defaults to "dict". encodingErrors (str, optional): treatment of encoding errors. Defaults to "ignore". uncomment (bool, optional): flag to ignore leading comments. Defaults to True. Returns: (iterator): iterator for rowwise access to processed CSV data """ encoding = kwargs.get("encoding", "utf-8-sig") maxInt = sys.maxsize csv.field_size_limit(maxInt) try: if filePath[-3:] == ".gz": with gzip.open(filePath, "rt", encoding=encoding, errors=encodingErrors) as csvFile: startIt = itertools.dropwhile( lambda x: x.startswith("#"), csvFile) if uncomment else csvFile if rowFormat == "dict": reader = csv.DictReader(startIt, delimiter=delimiter) elif rowFormat == "list": reader = csv.reader(startIt, delimiter=delimiter) for row in reader: yield row else: with io.open(filePath, newline="", encoding=encoding, errors="ignore") as csvFile: startIt = itertools.dropwhile( lambda x: x.startswith("#"), csvFile) if uncomment else csvFile if rowFormat == "dict": reader = csv.DictReader(startIt, delimiter=delimiter) elif rowFormat == "list": reader = csv.reader(startIt, delimiter=delimiter) for row in reader: # if uncomment and row.startswith("#"): # continue yield row except Exception as e: logger.error("Unable to deserialize %r %s", filePath, str(e)) def __deserializeCsv(self, filePath, delimiter=",", rowFormat="dict", encodingErrors="ignore", uncomment=True, **kwargs): oL = [] encoding = kwargs.get("encoding", "utf-8-sig") try: if filePath[-3:] == ".gz": if sys.version_info[0] > 2: with gzip.open(filePath, "rt", encoding=encoding, errors=encodingErrors) as csvFile: oL = self.__csvReader(csvFile, rowFormat, delimiter, uncomment=uncomment) else: # Py2 situation non-ascii encodings is problematic # with gzip.open(filePath, "rb") as csvFile: # oL = self.__csvReader(csvFile, rowFormat, delimiter) tPath = self.__fileU.uncompress(filePath, outputDir=None) with io.open(tPath, newline="", encoding=encoding, errors="ignore") as csvFile: oL = self.__csvReader(csvFile, rowFormat, delimiter, uncomment=uncomment) else: with io.open(filePath, newline="", encoding=encoding, errors="ignore") as csvFile: oL = self.__csvReader(csvFile, rowFormat, delimiter, uncomment=uncomment) return oL except Exception as e: logger.error("Unable to deserialize %r %s", filePath, str(e)) # logger.debug("Reading list length %d", len(oL)) return oL def __serializeCsv(self, filePath, rowDictList, fieldNames=None, **kwargs): """ """ _ = kwargs try: wD = {} ret = False fNames = fieldNames if fieldNames else list(rowDictList[0].keys()) # with io.open(filePath, 'w', newline='') as csvFile: with open(filePath, "w") as csvFile: writer = csv.DictWriter(csvFile, fieldnames=fNames) writer.writeheader() for ii, rowDict in enumerate(rowDictList): try: wD = {k: v for k, v in rowDict.items() if k in fNames} writer.writerow(wD) except Exception as e: logger.error( "Skipping bad CSV record %d wD %r rowDict %r with %s", ii + 1, wD, rowDict, str(e)) continue ret = True except Exception as e: logger.error("Failing for %s : %r with %s", filePath, wD, str(e)) return ret def __csvEncoder(self, csvData, encoding="utf-8-sig", encodingErrors="ignore"): """Handle encoding issues for gzipped data in Py2. (beware of the BOM chars) Args: csvData (text lines): uncompressed data from gzip open encoding (str, optional): character encoding. Defaults to "utf-8-sig". encodingErrors (str, optional): error treatment. Defaults to "ignore". """ for line in csvData: yield line.decode("utf-8-sig", errors=encodingErrors).encode( encoding, errors=encodingErrors) def __deserializeXmlPrev(self, filePath, **kwargs): """Read the input XML file path and return an ElementTree data object instance. Args: filePath (sting): input XML file path Returns: object: instance of an ElementTree tree object """ _ = kwargs tree = None try: logger.debug("Parsing XML path %s", filePath) if filePath[-3:] == ".gz": with gzip.open(filePath, mode="rb") as ifh: tV = time.time() tree = ET.parse(ifh) else: with open(filePath, mode="rb") as ifh: tV = time.time() tree = ET.parse(ifh) logger.debug("Parsed %s in %.2f seconds", filePath, time.time() - tV) except Exception as e: logger.error("Unable to deserialize %r %s", filePath, str(e)) # return tree def __testGzip(self, filePath): ok = True with gzip.open(filePath, "r") as fh: try: fh.read(1) except gzip.BadGzipFile: ok = False except Exception: ok = False logger.debug("Gzip file check %r", ok) return ok def __deserializeXml(self, filePath, **kwargs): """Read the input XML file path and return an ElementTree data object instance. Args: filePath (sting): input XML file path Returns: object: instance of an ElementTree tree object """ _ = kwargs tree = None encoding = kwargs.get("encoding", "utf-8-sig") encodingErrors = kwargs.get("encodingErrors", "ignore") # try: logger.debug("Parsing XML path %s", filePath) if filePath[-3:] == ".gz" and self.__testGzip(filePath): if sys.version_info[0] > 2: with gzip.open(filePath, "rt", encoding=encoding, errors=encodingErrors) as ifh: tV = time.time() tree = ET.parse(ifh) else: tPath = self.__fileU.uncompress(filePath, outputDir=None) with io.open(tPath, encoding=encoding, errors=encodingErrors) as ifh: tV = time.time() tree = ET.parse(ifh) else: with io.open(filePath, encoding=encoding, errors=encodingErrors) as ifh: tV = time.time() tree = ET.parse(ifh) logger.debug("Parsed %s in %.2f seconds", filePath, time.time() - tV) except Exception as e: logger.error("Unable to deserialize %r %s", filePath, str(e)) # return tree
def __rebuildCache(self, targetUrl, mapNameL, outDirPath, rawDirPath, fmt="pickle", useCache=True): """Fetch the UniProt selected id mapping resource file and extract UniProt Acc to 'mapIndex' mapping. Serialize the mapping as required. Args: targetUrl (str): source URL of the remote index file mapNameL (list): list of key mapping names to extract from the index outDirPath (str): directory path for raw and processed mapping files fmt (str, optional): output format (pickle|json) . Defaults to "pickle". useCache (bool, optional): use cached files. Defaults to True. Returns: dict: od[uniprotId] = mapped value idmapping_selected.tab 1. UniProtKB-AC 2. UniProtKB-ID 3. GeneID (EntrezGene) 4. RefSeq 5. GI 6. PDB 7. GO 8. UniRef100 9. UniRef90 10. UniRef50 11. UniParc 12. PIR 13. NCBI-taxon 14. MIM 15. UniGene 16. PubMed 17. EMBL 18. EMBL-CDS 19. Ensembl 20. Ensembl_TRS 21. Ensembl_PRO 22. Additional PubMed """ startTime = time.time() nL = mapNameL oD = {} try: fileU = FileUtil() fExt = "pic" if fmt == "pickle" else "json" fExt = "tdd" if fmt == "tdd" else fExt fN, _ = os.path.splitext(fileU.getFileName(targetUrl)) mapFileName = fN + "-map." + fExt idMapPath = os.path.join(outDirPath, mapFileName) mU = MarshalUtil() if useCache and mU.exists(idMapPath): logger.info("Reading cached serialized file %r", idMapPath) if fmt in ["pickle", "json"]: tD = mU.doImport(idMapPath, fmt=fmt) nL = list(set(tD["idNameList"])) oD = tD["uniprotMapD"] logger.info("keys %r", list(oD.keys())[:10]) logger.info("nL %r", nL) ok = True elif fmt == "tdd": ioU = IoUtil() it = ioU.deserializeCsvIter(idMapPath, delimiter="\t", rowFormat="list", encodingErrors="ignore") tL = next(it, []) nL = tL[1:] if len(nL) == 1: for row in it: oD[row[0]] = row[1] else: for row in it: oD[row[0]] = row[1:] ok = True else: idPath = os.path.join(rawDirPath, fileU.getFileName(targetUrl)) if not fileU.exists(idPath): logger.info( "Fetching selected UniProt idmapping data from %r in %r", targetUrl, outDirPath) ok = fileU.get(targetUrl, idPath) if not ok: logger.error("Failed to downlowd %r", targetUrl) return oD else: logger.info("Using cached mapping file %r", idPath) # --- ioU = IoUtil() if fmt in ["pickle", "json"]: if len(mapNameL) == 1: for row in ioU.deserializeCsvIter( idPath, delimiter="\t", rowFormat="list", encodingErrors="ignore"): oD[row[0]] = str( row[self.__mapRecordD[mapNameL[0]] - 1]) else: for row in ioU.deserializeCsvIter( idPath, delimiter="\t", rowFormat="list", encodingErrors="ignore"): for mapName in mapNameL: oD.setdefault(row[0], []).append( str(row[self.__mapRecordD[mapName] - 1])) logger.info("Writing serialized mapping file %r", idMapPath) ok = mU.doExport(idMapPath, { "idNameList": mapNameL, "uniprotMapD": oD }, fmt=fmt) elif fmt == "tdd": # logger.info("Writing serialized mapping file %r", idMapPath) fU = FileUtil() fU.mkdirForFile(idMapPath) colNameL = [] colNameL.append("UniProtId") colNameL.extend(mapNameL) with open(idMapPath, "w", encoding="utf-8") as ofh: ofh.write("%s\n" % "\t".join(colNameL)) if len(mapNameL) == 1: idx = self.__mapRecordD[mapNameL[0]] - 1 for row in ioU.deserializeCsvIter( idPath, delimiter="\t", rowFormat="list", encodingErrors="ignore"): ofh.write("%s\t%s\n" % (row[0], row[idx])) else: idxL = [0] idxL.extend([ self.__mapRecordD[mapName] - 1 for mapName in mapNameL ]) for row in ioU.deserializeCsvIter( idPath, delimiter="\t", rowFormat="list", encodingErrors="ignore"): ofh.write( "%s\n" % "\t".join([str(row[idx]) for idx in idxL])) # nL, oD = self.__rebuildCache(targetUrl, mapNameL, outDirPath, rawDirPath, fmt=fmt, useCache=True) ok = True if nL and oD else False logger.info("Completed reload (%r) at %s (%.4f seconds)", ok, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime) except Exception as e: logger.exception("Failing with %s", str(e)) # return nL, oD