Exemplo n.º 1
0
def toAscii(fileData, mimeType=None, \
        maxBinFileSize=pubConf.maxBinFileSize, maxTxtFileSize=pubConf.maxTxtFileSize, \
        minTxtFileSize=pubConf.minTxtFileSize):
    """ Convert fileData["content"] to text and store the result back
    into fileData["content"].

    The converter is looked up in pubConf.getConverters() by the file
    extension returned by getFileExt(fileData, locFname, mimeType).
    A converter entry is one of:
      "COPY"               - content is used unchanged
      "XMLTEXT"/"NXMLTEXT" - tags are stripped with pubXml.stripXmlTags
      anything else        - treated as a command line for runConverter,
                             which is also given the configured temp dir

    Arguments:
      fileData       - dict with at least "content"; either "locFname" and
                       "externalId", or "url", "desc", "fileId" and
                       "articleId" (used to describe the file in log messages)
      mimeType       - passed on to getFileExt
      maxBinFileSize - inputs longer than this are skipped
      maxTxtFileSize - converted text longer than this is rejected
      minTxtFileSize - converted text shorter than this is rejected; also
                       triggers the alternative PDF converter

    returns fileData (after dictToUnicode) if successful, otherwise None
    returns only unicode strings (despite the name)
    """
    converters = pubConf.getConverters()
    tempDir = pubConf.getTempDir()

    fileContent = fileData["content"]
    fileSize = len(fileContent)

    # short description of the file, only used in log messages
    if "locFname" in fileData:
        locFname = fileData["locFname"]
        fileDebugDesc = fileData["externalId"] + ":" + locFname
    else:
        locFname = None
        fileDebugDesc = ",".join([fileData["url"], fileData["desc"],
            fileData["fileId"], fileData["articleId"]])

    if fileSize > maxBinFileSize:
        logging.warning("binary file size before conversion %d > %d, skipping file %s",
            fileSize, maxBinFileSize, fileDebugDesc)
        return None

    fileExt = getFileExt(fileData, locFname, mimeType)

    if fileExt not in converters:
        logging.debug("Could not convert file %s, no converter for extension %s",
            fileDebugDesc, fileExt)
        return None
    cmdLine = converters[fileExt]

    if cmdLine == "COPY":
        # fileData["content"] already contains ASCII text
        pass

    elif cmdLine in ("XMLTEXT", "NXMLTEXT"):
        logging.debug("stripping XML tags")
        if cmdLine == "NXMLTEXT":
            asciiData = pubXml.stripXmlTags(fileContent, isNxmlFormat=True)
        else:
            asciiData = pubXml.stripXmlTags(fileContent)

        if asciiData is None:
            # records described by locFname are never required to carry a
            # "url" above, so do not let the log message raise a KeyError
            logging.debug("Could not convert xml to ascii, file %s",
                fileData.get("url", fileDebugDesc))
            return None
        fileData["content"] = asciiData

    else:
        asciiData = runConverter(cmdLine, fileContent, fileExt, tempDir)

        # try to detect corrupted pdf2text output and run second converter
        if fileExt == "pdf" and \
            (asciiData is None or len(asciiData) < minTxtFileSize or
             countBadChars(asciiData) >= 10):
            logging.debug("No data or too many non printable characters in PDF, trying alternative program")
            cmdLine = converters["pdf2"]
            asciiData = runConverter(cmdLine, fileContent, fileExt, tempDir)

        if asciiData is None:
            logging.info("conversion failed for %s", fileDebugDesc)
            return None
        fileData["content"] = removeBadChars(asciiData)

    fileData = dictToUnicode(fileData)

    # size limits are checked on the converted text, after dictToUnicode
    textSize = len(fileData["content"])
    if textSize > maxTxtFileSize:
        logging.info("ascii file size after conversion too big, ignoring file %s", fileDebugDesc)
        return None

    if textSize < minTxtFileSize:
        logging.debug("ascii file size only %d bytes < %d, ignoring %s",
            textSize, minTxtFileSize, fileDebugDesc)
        return None

    # NOTE: a former check that rejected output with fewer than 10 distinct
    # characters is disabled.

    return fileData
Exemplo n.º 2
0
def toAscii(
    fileData,
    mimeType=None,
    maxBinFileSize=pubConf.maxBinFileSize,
    maxTxtFileSize=pubConf.maxTxtFileSize,
    minTxtFileSize=pubConf.minTxtFileSize,
):
    """ Convert fileData["content"] to text and store the result back
    into fileData["content"].

    The file extension is taken from pubConf.MIMEMAP if a mime type is
    known (the mimeType argument, else fileData["mimeType"]); otherwise
    it is derived from the extension of fileData["url"]. The extension
    selects an entry from pubConf.getConverters(), which is one of:
      "COPY"               - content is used unchanged
      "XMLTEXT"/"NXMLTEXT" - tags are stripped with pubXml.stripXmlTags
      anything else        - treated as a command line for runConverter,
                             which is also given the configured temp dir

    Arguments:
      fileData       - dict with at least "content" and "url"; "desc",
                       "fileId" and "articleId" are read for the log
                       message when the input is too big
      mimeType       - overrides fileData["mimeType"] if not None
      maxBinFileSize - inputs longer than this are skipped
      maxTxtFileSize - converted text longer than this is rejected
      minTxtFileSize - converted text shorter than this is rejected

    returns fileData (after dictToUnicode) if successful, otherwise None
    returns only unicode strings (despite the name)
    """
    converters = pubConf.getConverters()
    tempDir = pubConf.getTempDir()

    fileContent = fileData["content"]
    fileSize = len(fileContent)
    if fileSize > maxBinFileSize:
        # comma-separate the fields so the log line stays readable
        fileDebugDesc = ",".join([fileData["url"], fileData["desc"],
            fileData["fileId"], fileData["articleId"]])
        logging.warning(
            "binary file size before conversion %d > %d, skipping file %s",
            fileSize, maxBinFileSize, fileDebugDesc)
        return None

    url = fileData["url"]

    # an explicit mimeType argument wins over the one stored in fileData
    if mimeType is None:
        mimeType = fileData.get("mimeType")

    fileExt = None
    if mimeType:
        fileExt = pubConf.MIMEMAP.get(mimeType, None)
        logging.debug("File extension determined as %s", fileExt)
    if fileExt is None:
        # fall back to the extension of the url, lowercased, without dots
        fileExt = os.path.splitext(url)[1].lower().strip(".")

    if fileExt not in converters:
        logging.debug("Could not convert file %s, no converter for extension %s", url, fileExt)
        return None

    cmdLine = converters[fileExt]

    if cmdLine == "COPY":
        # content is used as it is
        pass

    elif cmdLine in ("XMLTEXT", "NXMLTEXT"):
        logging.debug("stripping XML tags")
        if cmdLine == "NXMLTEXT":
            asciiData = pubXml.stripXmlTags(fileContent, isNxmlFormat=True)
        else:
            asciiData = pubXml.stripXmlTags(fileContent)

        if asciiData is None:
            logging.warning("Could not convert xml to ascii")
            return None
        fileData["content"] = asciiData

    else:
        asciiData = runConverter(cmdLine, fileContent, fileExt, tempDir)

        # try to detect corrupted PDF output and run the second converter
        if fileExt == "pdf" and (asciiData is None or countBadChars(asciiData) >= 10):
            logging.debug("No data or too many non printable characters in PDF, trying alternative program")
            cmdLine = converters["pdf2"]
            asciiData = runConverter(cmdLine, fileContent, fileExt, tempDir)

        if asciiData is None:
            # do not fail silently, say which file could not be converted
            logging.info("conversion failed for %s", url)
            return None
        fileData["content"] = removeBadChars(asciiData)

    fileData = dictToUnicode(fileData)

    # size limits are checked on the converted text, after dictToUnicode
    textSize = len(fileData["content"])
    if textSize > maxTxtFileSize:
        logging.warning("ascii file size after conversion too big, ignoring file")
        return None

    if textSize < minTxtFileSize:
        logging.warning("ascii file size after conversion too small, ignoring file")
        return None

    # NOTE: a former check that rejected output with fewer than 10 distinct
    # characters is disabled.

    return fileData
Exemplo n.º 3
0
def toAscii(fileData, mimeType=None, \
        maxBinFileSize=pubConf.maxBinFileSize, maxTxtFileSize=pubConf.maxTxtFileSize, \
        minTxtFileSize=pubConf.minTxtFileSize):
    """ Convert fileData["content"] to text and store the result back
    into fileData["content"].

    The converter is looked up in pubConf.getConverters() by the file
    extension returned by getFileExt(fileData, locFname, mimeType).
    A converter entry is one of:
      "COPY"               - content is used unchanged
      "XMLTEXT"/"NXMLTEXT" - tags are stripped with pubXml.stripXmlTags
      anything else        - treated as a command line for runConverter,
                             which is also given the configured temp dir

    Arguments:
      fileData       - dict with at least "content"; either "locFname" and
                       "externalId", or "url", "desc", "fileId" and
                       "articleId" (used to describe the file in log messages)
      mimeType       - passed on to getFileExt
      maxBinFileSize - inputs longer than this are skipped
      maxTxtFileSize - converted text longer than this is rejected
      minTxtFileSize - converted text shorter than this is rejected; also
                       triggers the alternative PDF converter

    returns fileData (after dictToUnicode) if successful, otherwise None
    returns only unicode strings (despite the name)
    """
    converters = pubConf.getConverters()
    tempDir = pubConf.getTempDir()

    fileContent = fileData["content"]
    fileSize = len(fileContent)

    # short description of the file, only used in log messages
    if "locFname" in fileData:
        locFname = fileData["locFname"]
        fileDebugDesc = fileData["externalId"] + ":" + locFname
    else:
        locFname = None
        fileDebugDesc = ",".join([
            fileData["url"], fileData["desc"], fileData["fileId"],
            fileData["articleId"]
        ])

    if fileSize > maxBinFileSize:
        logging.warning(
            "binary file size before conversion %d > %d, skipping file %s",
            fileSize, maxBinFileSize, fileDebugDesc)
        return None

    fileExt = getFileExt(fileData, locFname, mimeType)

    if fileExt not in converters:
        logging.debug(
            "Could not convert file %s, no converter for extension %s",
            fileDebugDesc, fileExt)
        return None
    cmdLine = converters[fileExt]

    if cmdLine == "COPY":
        # fileData["content"] already contains ASCII text
        pass

    elif cmdLine in ("XMLTEXT", "NXMLTEXT"):
        logging.debug("stripping XML tags")
        if cmdLine == "NXMLTEXT":
            asciiData = pubXml.stripXmlTags(fileContent, isNxmlFormat=True)
        else:
            asciiData = pubXml.stripXmlTags(fileContent)

        if asciiData is None:
            # records described by locFname are never required to carry a
            # "url" above, so do not let the log message raise a KeyError
            logging.debug("Could not convert xml to ascii, file %s",
                          fileData.get("url", fileDebugDesc))
            return None
        fileData["content"] = asciiData

    else:
        asciiData = runConverter(cmdLine, fileContent, fileExt, tempDir)

        # try to detect corrupted pdf2text output and run second converter
        if fileExt == "pdf" and \
            (asciiData is None or len(asciiData) < minTxtFileSize or
             countBadChars(asciiData) >= 10):
            logging.debug(
                "No data or too many non printable characters in PDF, trying alternative program"
            )
            cmdLine = converters["pdf2"]
            asciiData = runConverter(cmdLine, fileContent, fileExt, tempDir)

        if asciiData is None:
            logging.info("conversion failed for %s", fileDebugDesc)
            return None
        fileData["content"] = removeBadChars(asciiData)

    fileData = dictToUnicode(fileData)

    # size limits are checked on the converted text, after dictToUnicode
    textSize = len(fileData["content"])
    if textSize > maxTxtFileSize:
        logging.info(
            "ascii file size after conversion too big, ignoring file %s",
            fileDebugDesc)
        return None

    if textSize < minTxtFileSize:
        logging.debug("ascii file size only %d bytes < %d, ignoring %s",
            textSize, minTxtFileSize, fileDebugDesc)
        return None

    # NOTE: a former check that rejected output with fewer than 10 distinct
    # characters is disabled.

    return fileData