Пример #1
0
def treeToAscii_Elsevier(tree):
    """Try to convert an Elsevier XML tree to plain text.

    If the tree has a document-properties/raw-text element with text, that
    text is returned with mime type "text/plain". The text is re-encoded as
    latin1 and decoded as utf8 (presumably to repair UTF-8 that was read as
    latin1 -- confirm against the data); if either step fails,
    pubGeneric.forceToUnicode(rawText) is used instead.

    Otherwise the element returned by findMainArticleTag(tree) is converted
    with pubXml.treeToAsciiText and returned with mime type "text/xml".

    Returns a tuple (text, mimeType), or (None, None) when
    findMainArticleTag finds no article element.
    """
    logging.debug("Converting elsevier tree to ascii text")

    # Presence of an element is checked by identity with None, never by
    # equality or truth value.
    dp = tree.find("document-properties")
    rawTextEl = dp.find("raw-text") if dp is not None else None
    rawText = rawTextEl.text if rawTextEl is not None else None

    if rawText is not None:
        try:
            asciiText = rawText.encode('latin1').decode('utf8')
        except (UnicodeEncodeError, UnicodeDecodeError):
            # text is not latin1-encodable or not valid utf8 afterwards
            asciiText = pubGeneric.forceToUnicode(rawText)
        return asciiText, "text/plain"

    # no usable raw text: fall back to converting the main article element
    articleEl, _articleType = findMainArticleTag(tree)
    if articleEl is None:
        return None, None

    asciiText = pubXml.treeToAsciiText(articleEl, addNewlineTags=elsNewlineTags)
    return asciiText, "text/xml"
Пример #2
0
def treeToAscii_Elsevier(tree):
    """Try to convert an Elsevier XML tree to plain text.

    If the tree has a document-properties/raw-text element with text, that
    text is returned with mime type "text/plain". The text is re-encoded as
    latin1 and decoded as utf8 (presumably to repair UTF-8 that was read as
    latin1 -- confirm against the data); if either step fails,
    pubGeneric.forceToUnicode(rawText) is used instead.

    Otherwise the element returned by findMainArticleTag(tree) is converted
    with pubXml.treeToAsciiText and returned with mime type "text/xml".

    Returns a tuple (text, mimeType), or (None, None) when
    findMainArticleTag finds no article element.
    """
    logging.debug("Converting elsevier tree to ascii text")

    # Presence of an element is checked by identity with None, never by
    # equality or truth value.
    dp = tree.find("document-properties")
    rawTextEl = dp.find("raw-text") if dp is not None else None
    rawText = rawTextEl.text if rawTextEl is not None else None

    if rawText is not None:
        try:
            asciiText = rawText.encode('latin1').decode('utf8')
        except (UnicodeEncodeError, UnicodeDecodeError):
            # text is not latin1-encodable or not valid utf8 afterwards
            asciiText = pubGeneric.forceToUnicode(rawText)
        return asciiText, "text/plain"

    # no usable raw text: fall back to converting the main article element
    articleEl, _articleType = findMainArticleTag(tree)
    if articleEl is None:
        return None, None

    asciiText = pubXml.treeToAsciiText(articleEl,
                                       addNewlineTags=elsNewlineTags)
    return asciiText, "text/xml"
Пример #3
0
def convertHtmlToDicts(url, content):
    r"""Given a url and its HTML content, create article and file dictionaries.

    content has to include normal newlines, no \a or #N# replacers.

    The article dictionary is created with pubStore.createEmptyArticleDict
    and filled by parseMetaData from the head/meta tags. The title falls back
    to the head/title text and the abstract to the first 1500 characters of
    the extracted text when they are otherwise empty. The file dictionary
    holds the text extracted from the cleaned HTML tree.

    Returns (artDict, fileDict), or (None, None) when content contains no
    "<html" or lxml raises XMLSyntaxError while parsing it.
    """
    # lxml does not like unicode if the document has an explicit encoding
    if " encoding=" not in content:
        content = pubGeneric.forceToUnicode(content)
    logging.debug("Converting to text: %s ", repr(url))

    # cheap rejection first, before any dictionary is built
    if "<html" not in content:
        return None, None

    artDict = pubStore.createEmptyArticleDict(source="bing", fulltextUrl=url)

    logging.debug("Parsing html with lxml, html size %d", len(content))
    try:
        tree = lxml.html.document_fromstring(content)
    except lxml.etree.XMLSyntaxError:
        return None, None
    logging.debug("end parse html")

    titleEl = tree.find("head/title")
    if titleEl is not None:
        # an empty <title/> has text None; use "" like the no-title case
        title = titleEl.text or ""
    else:
        logging.debug("No title found?")
        title = ""

    metaTags = tree.findall("head/meta")
    artDict = parseMetaData(metaTags, artDict)

    logging.debug("Cleaning html tree")
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.meta = True
    cleaner.embedded = True
    cleaner.page_structure = True
    cleanTree = cleaner.clean_html(tree)

    logging.debug("Cleaning done, now converting to ASCII")
    newlineTags = ["p", "br"]
    asciiText = pubXml.treeToAsciiText(cleanTree, newlineTags)
    logging.debug("ASCII conversion done")
    logging.debug("title: %s", title)

    # meta tags take precedence; the <title> element is only a fallback
    if "title" not in artDict or artDict["title"] == "":
        artDict["title"] = title

    # no abstract in the meta data: use the start of the text instead
    if artDict["abstract"] == "":
        artDict["abstract"] = unidecode.unidecode(asciiText[0:1500]).strip()

    logging.debug("abstract: %s", artDict["abstract"])
    fileDict = pubStore.createEmptyFileDict(url=url, content=asciiText, mimeType="text/html")
    logging.debug("meta data extract success: %s", artDict)
    return artDict, fileDict
Пример #4
0
def convertHtmlToDicts(url, content):
    r"""Given a url and its HTML content, create article and file dictionaries.

    content has to include normal newlines, no \a or #N# replacers.

    The article dictionary is created with pubStore.createEmptyArticleDict
    and filled by parseMetaData from the head/meta tags. The title falls back
    to the head/title text and the abstract to the first 1500 characters of
    the extracted text when they are otherwise empty. The file dictionary
    holds the text extracted from the cleaned HTML tree.

    Returns (artDict, fileDict), or (None, None) when content contains no
    "<html" or lxml raises XMLSyntaxError while parsing it.
    """
    # lxml does not like unicode if the document has an explicit encoding
    if " encoding=" not in content:
        content = pubGeneric.forceToUnicode(content)
    logging.debug("Converting to text: %s ", repr(url))

    # cheap rejection first, before any dictionary is built
    if "<html" not in content:
        return None, None

    artDict = pubStore.createEmptyArticleDict(source="bing", fulltextUrl=url)

    logging.debug("Parsing html with lxml, html size %d", len(content))
    try:
        tree = lxml.html.document_fromstring(content)
    except lxml.etree.XMLSyntaxError:
        return None, None
    logging.debug("end parse html")

    titleEl = tree.find("head/title")
    if titleEl is not None:
        # an empty <title/> has text None; use "" like the no-title case
        title = titleEl.text or ""
    else:
        logging.debug("No title found?")
        title = ""

    metaTags = tree.findall("head/meta")
    artDict = parseMetaData(metaTags, artDict)

    logging.debug("Cleaning html tree")
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.meta = True
    cleaner.embedded = True
    cleaner.page_structure = True
    cleanTree = cleaner.clean_html(tree)

    logging.debug("Cleaning done, now converting to ASCII")
    newlineTags = ["p", "br"]
    asciiText = pubXml.treeToAsciiText(cleanTree, newlineTags)
    logging.debug("ASCII conversion done")
    logging.debug("title: %s", title)

    # meta tags take precedence; the <title> element is only a fallback
    if "title" not in artDict or artDict["title"] == "":
        artDict["title"] = title

    # no abstract in the meta data: use the start of the text instead
    if artDict["abstract"] == "":
        artDict["abstract"] = unidecode.unidecode(asciiText[0:1500]).strip()

    logging.debug("abstract: %s", artDict["abstract"])
    fileDict = pubStore.createEmptyFileDict(url=url, content=asciiText, mimeType="text/html")
    logging.debug("meta data extract success: %s", artDict)
    return artDict, fileDict
Пример #5
0
    def _readFilesForArticle(self, articleId, fileDataList):
        """Collect the file rows of one article from self.fileRows.

        Rows are taken from self.fileRows one by one; each row's content is
        passed through pubGeneric.forceToUnicode. Rows whose articleId equals
        articleId are appended to fileDataList (the list is modified in
        place). Reading stops at the first row of a different article.

        Returns (fileDataList, nextRow) where nextRow is that first row of a
        different article, or None if self.fileRows was exhausted.
        """
        for row in self.fileRows:
            logging.log(5, "Read file data %s for article %s" %
                        (str(row.fileId), row.articleId))
            row = row._replace(content=pubGeneric.forceToUnicode(row.content))

            if articleId == row.articleId:
                logging.log(5, "adding file data")
                fileDataList.append(row)
                continue

            # Row belongs to another article: check that everything collected
            # so far shares one id prefix and that it matches articleId.
            idPrefixes = list(set(
                str(entry.fileId)[:pubConf.ARTICLEDIGITS]
                for entry in fileDataList))
            logging.log(5, "article change. yielding: articleId %s, %d files with ids %s" %
                        (articleId, len(fileDataList), idPrefixes))
            assert len(idPrefixes) == 1
            assert idPrefixes[0] == str(articleId)
            return fileDataList, row

        return fileDataList, None