def toAscii(fileData, mimeType=None,
            maxBinFileSize=pubConf.maxBinFileSize,
            maxTxtFileSize=pubConf.maxTxtFileSize,
            minTxtFileSize=pubConf.minTxtFileSize):
    """ Convert fileData["content"] to text using the converter configured for its file type.

    The converter is looked up in pubConf.getConverters() by the file extension
    that getFileExt() returns for (fileData, locFname, mimeType). The converter
    entry is either a special keyword or a command line:
    - "COPY": the content is kept as it is
    - "XMLTEXT" / "NXMLTEXT": tags are stripped with pubXml.stripXmlTags
    - anything else: handed to runConverter() together with the temp directory

    For PDFs, the converter registered as "pdf2" is tried if the first one
    returned nothing, less than minTxtFileSize, or 10 or more bad characters.

    Arguments:
    fileData -- dict with the key "content" and either "locFname" and
                "externalId" or "url", "desc", "fileId" and "articleId"
                (the latter are only used to describe the file in log messages)
    mimeType -- passed on to getFileExt()
    maxBinFileSize -- input longer than this is skipped
    maxTxtFileSize -- converted output longer than this is rejected
    minTxtFileSize -- converted output shorter than this is rejected

    Returns the result of dictToUnicode(fileData) if successful, otherwise None.
    Despite the name, the original documentation states that only unicode
    strings are returned. Note that fileData["content"] may already have been
    replaced in the dict passed in, even if None is returned because of the
    final size checks.
    """
    converters = pubConf.getConverters()
    tempDir = pubConf.getTempDir()

    fileContent = fileData["content"]
    fileSize = len(fileContent)

    # build a short description of the file for the log messages
    if "locFname" in fileData:
        locFname = fileData["locFname"]
        fileDebugDesc = fileData["externalId"] + ":" + locFname
    else:
        locFname = None
        fileDebugDesc = ",".join([fileData["url"], fileData["desc"],
                                  fileData["fileId"], fileData["articleId"]])

    if fileSize > maxBinFileSize:
        logging.warning("binary file size before conversion %d > %d, skipping file %s",
                        fileSize, maxBinFileSize, fileDebugDesc)
        return None

    fileExt = getFileExt(fileData, locFname, mimeType)

    if fileExt not in converters:
        logging.debug("Could not convert file %s, no converter for extension %s",
                      fileDebugDesc, fileExt)
        return None

    cmdLine = converters[fileExt]

    if cmdLine == "COPY":
        # fileData["content"] already contains ASCII text
        pass

    elif cmdLine in ("XMLTEXT", "NXMLTEXT"):
        logging.debug("stripping XML tags")
        if cmdLine == "NXMLTEXT":
            asciiData = pubXml.stripXmlTags(fileContent, isNxmlFormat=True)
        else:
            asciiData = pubXml.stripXmlTags(fileContent)

        if asciiData is None:
            # use fileDebugDesc and not fileData["url"]: files described by
            # locFname/externalId are not guaranteed to have a "url" key
            logging.debug("Could not convert xml to ascii, file %s", fileDebugDesc)
            return None
        fileData["content"] = asciiData

    else:
        asciiData = runConverter(cmdLine, fileContent, fileExt, tempDir)

        # try to detect corrupted pdf2text output and run second converter
        if fileExt == "pdf" and (asciiData is None
                                 or len(asciiData) < minTxtFileSize
                                 or countBadChars(asciiData) >= 10):
            logging.debug("No data or too many non printable characters in PDF, "
                          "trying alternative program")
            cmdLine = converters["pdf2"]
            asciiData = runConverter(cmdLine, fileContent, fileExt, tempDir)

        if asciiData is None:
            logging.info("conversion failed for %s", fileDebugDesc)
            return None
        fileData["content"] = removeBadChars(asciiData)

    fileData = dictToUnicode(fileData)

    # the size limits apply to the converted text, whatever the converter was
    txtSize = len(fileData["content"])
    if txtSize > maxTxtFileSize:
        logging.info("ascii file size after conversion too big, ignoring file %s",
                     fileDebugDesc)
        return None

    if txtSize < minTxtFileSize:
        logging.debug("ascii file size only %d bytes < %d, ignoring %s",
                      txtSize, minTxtFileSize, fileDebugDesc)
        return None

    # NOTE: a check that rejected output with fewer than 10 distinct
    # characters used to be here and is disabled
    return fileData
def toAscii(
    fileData,
    mimeType=None,
    maxBinFileSize=pubConf.maxBinFileSize,
    maxTxtFileSize=pubConf.maxTxtFileSize,
    minTxtFileSize=pubConf.minTxtFileSize,
):
    """
    Convert fileData["content"] to text using the converter configured for its file type.

    The file extension is taken from pubConf.MIMEMAP if a mime type is known
    (the mimeType argument, otherwise fileData["mimeType"]). If that gives no
    extension, the extension of fileData["url"] is used. The extension selects
    an entry from pubConf.getConverters(), which is either a keyword or a
    command line:
    - "COPY": the content is kept as it is
    - "XMLTEXT" / "NXMLTEXT": tags are stripped with pubXml.stripXmlTags
    - anything else: handed to runConverter() together with the temp directory

    For PDFs, the converter registered as "pdf2" is tried if the first one
    returned nothing or 10 or more bad characters.

    Arguments:
    fileData -- dict with the keys "content" and "url", optionally "mimeType";
                "desc", "fileId" and "articleId" are read only to describe a
                file that is too big
    mimeType -- overrides fileData["mimeType"] if specified
    maxBinFileSize -- input longer than this is skipped
    maxTxtFileSize -- converted output longer than this is rejected
    minTxtFileSize -- converted output shorter than this is rejected

    Returns the result of dictToUnicode(fileData) if successful, otherwise None.
    Despite the name, the original documentation states that only unicode
    strings are returned. Note that fileData["content"] may already have been
    replaced in the dict passed in, even if None is returned because of the
    final size checks.
    """
    converters = pubConf.getConverters()
    tempDir = pubConf.getTempDir()

    fileContent = fileData["content"]
    fileSize = len(fileContent)
    if fileSize > maxBinFileSize:
        # comma-separate the identifiers, otherwise they run into each other
        fileDesc = ",".join([fileData["url"], fileData["desc"],
                             fileData["fileId"], fileData["articleId"]])
        logging.warning("binary file size before conversion %d > %d, skipping file %s",
                        fileSize, maxBinFileSize, fileDesc)
        return None

    url = fileData["url"]

    # the mime type has priority over the file extension of the url
    if mimeType is None:
        mimeType = fileData.get("mimeType")

    fileExt = None
    if mimeType:
        fileExt = pubConf.MIMEMAP.get(mimeType, None)
        logging.debug("File extension determined as %s", fileExt)

    if fileExt is None:
        fileExt = os.path.splitext(url)[1].lower().strip(".")

    if fileExt not in converters:
        logging.debug("Could not convert file %s, no converter for extension %s",
                      url, fileExt)
        return None

    cmdLine = converters[fileExt]

    if cmdLine == "COPY":
        # nothing to do, the content is used as it is
        pass

    elif cmdLine in ("XMLTEXT", "NXMLTEXT"):
        logging.debug("stripping XML tags")
        if cmdLine == "NXMLTEXT":
            asciiData = pubXml.stripXmlTags(fileContent, isNxmlFormat=True)
        else:
            asciiData = pubXml.stripXmlTags(fileContent)

        if asciiData is None:
            logging.warning("Could not convert xml to ascii")
            return None
        fileData["content"] = asciiData

    else:
        asciiData = runConverter(cmdLine, fileContent, fileExt, tempDir)

        # try to detect corrupted PDF output and run second converter
        if fileExt == "pdf" and (asciiData is None or countBadChars(asciiData) >= 10):
            logging.debug("No data or too many non printable characters in PDF, "
                          "trying alternative program")
            cmdLine = converters["pdf2"]
            asciiData = runConverter(cmdLine, fileContent, fileExt, tempDir)

        if asciiData is None:
            return None
        fileData["content"] = removeBadChars(asciiData)

    fileData = dictToUnicode(fileData)

    # the size limits apply to the converted text, whatever the converter was
    txtSize = len(fileData["content"])
    if txtSize > maxTxtFileSize:
        logging.warning("ascii file size after conversion too big, ignoring file")
        return None

    if txtSize < minTxtFileSize:
        logging.warning("ascii file size after conversion too small, ignoring file")
        return None

    # NOTE: a check that rejected output with fewer than 10 distinct
    # characters used to be here and is disabled
    return fileData
def toAscii(fileData, mimeType=None,
            maxBinFileSize=pubConf.maxBinFileSize,
            maxTxtFileSize=pubConf.maxTxtFileSize,
            minTxtFileSize=pubConf.minTxtFileSize):
    """ Convert fileData["content"] to text using the converter configured for its file type.

    The converter is looked up in pubConf.getConverters() by the file extension
    that getFileExt() returns for (fileData, locFname, mimeType). The converter
    entry is either a special keyword or a command line:
    - "COPY": the content is kept as it is
    - "XMLTEXT" / "NXMLTEXT": tags are stripped with pubXml.stripXmlTags
    - anything else: handed to runConverter() together with the temp directory

    For PDFs, the converter registered as "pdf2" is tried if the first one
    returned nothing, less than minTxtFileSize, or 10 or more bad characters.

    Arguments:
    fileData -- dict with the key "content" and either "locFname" and
                "externalId" or "url", "desc", "fileId" and "articleId"
                (the latter are only used to describe the file in log messages)
    mimeType -- passed on to getFileExt()
    maxBinFileSize -- input longer than this is skipped
    maxTxtFileSize -- converted output longer than this is rejected
    minTxtFileSize -- converted output shorter than this is rejected

    Returns the result of dictToUnicode(fileData) if successful, otherwise None.
    Despite the name, the original documentation states that only unicode
    strings are returned. Note that fileData["content"] may already have been
    replaced in the dict passed in, even if None is returned because of the
    final size checks.
    """
    converters = pubConf.getConverters()
    tempDir = pubConf.getTempDir()

    fileContent = fileData["content"]
    fileSize = len(fileContent)

    # build a short description of the file for the log messages
    if "locFname" in fileData:
        locFname = fileData["locFname"]
        fileDebugDesc = fileData["externalId"] + ":" + locFname
    else:
        locFname = None
        fileDebugDesc = ",".join([
            fileData["url"], fileData["desc"],
            fileData["fileId"], fileData["articleId"]
        ])

    if fileSize > maxBinFileSize:
        logging.warning("binary file size before conversion %d > %d, skipping file %s",
                        fileSize, maxBinFileSize, fileDebugDesc)
        return None

    fileExt = getFileExt(fileData, locFname, mimeType)

    if fileExt not in converters:
        logging.debug("Could not convert file %s, no converter for extension %s",
                      fileDebugDesc, fileExt)
        return None

    cmdLine = converters[fileExt]

    if cmdLine == "COPY":
        # fileData["content"] already contains ASCII text
        pass

    elif cmdLine in ("XMLTEXT", "NXMLTEXT"):
        logging.debug("stripping XML tags")
        if cmdLine == "NXMLTEXT":
            asciiData = pubXml.stripXmlTags(fileContent, isNxmlFormat=True)
        else:
            asciiData = pubXml.stripXmlTags(fileContent)

        if asciiData is None:
            # use fileDebugDesc and not fileData["url"]: files described by
            # locFname/externalId are not guaranteed to have a "url" key
            logging.debug("Could not convert xml to ascii, file %s", fileDebugDesc)
            return None
        fileData["content"] = asciiData

    else:
        asciiData = runConverter(cmdLine, fileContent, fileExt, tempDir)

        # try to detect corrupted pdf2text output and run second converter
        needsFallback = (
            asciiData is None
            or len(asciiData) < minTxtFileSize
            or countBadChars(asciiData) >= 10
        )
        if fileExt == "pdf" and needsFallback:
            logging.debug(
                "No data or too many non printable characters in PDF, trying alternative program"
            )
            cmdLine = converters["pdf2"]
            asciiData = runConverter(cmdLine, fileContent, fileExt, tempDir)

        if asciiData is None:
            logging.info("conversion failed for %s", fileDebugDesc)
            return None
        fileData["content"] = removeBadChars(asciiData)

    fileData = dictToUnicode(fileData)

    # the size limits apply to the converted text, whatever the converter was
    txtSize = len(fileData["content"])
    if txtSize > maxTxtFileSize:
        logging.info("ascii file size after conversion too big, ignoring file %s",
                     fileDebugDesc)
        return None

    if txtSize < minTxtFileSize:
        logging.debug("ascii file size only %d bytes < %d, ignoring %s",
                      txtSize, minTxtFileSize, fileDebugDesc)
        return None

    # NOTE: a check that rejected output with fewer than 10 distinct
    # characters used to be here and is disabled
    return fileData