示例#1
0
def convertHtmlToDicts(url, content):
    """ given a url and content, create file and article dictionaries 
    content has to include normal newlines, no \a or #N# replacers

    returns None, None on error
    
    """
    # lxml does not like unicode if the document has an explicit encoding
    if " encoding=" not in content:
        content = pubGeneric.forceToUnicode(content)
    logging.debug("Converting to text: %s " % (repr(url)))
    artDict = pubStore.createEmptyArticleDict(source="bing", fulltextUrl=url)

    if not "<html" in content:
        return None, None

    try:
        logging.debug("Parsing html with lxml, html size %d" % len(content))
        tree = lxml.html.document_fromstring(content)
        logging.debug("end parse html")
    except lxml.etree.XMLSyntaxError:
        return None, None

    titleEl = tree.find("head/title")
    if titleEl!=None:
        title = titleEl.text
    else:
        logging.debug("No title found?")
        title = ""
        
    metaTags = tree.findall("head/meta")
    artDict = parseMetaData(metaTags, artDict)
    logging.debug("Cleaning html tree")
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.meta = True
    cleaner.embedded = True
    cleaner.page_structure=True 
    #cleaner.remove_tags = ["a", "li", "td"]
    cleanTree = cleaner.clean_html(tree)
    logging.debug("Cleaning done, now converting to ASCII")
    #text = cleanTree.text_content()
    newlineTags = ["p", "br"]
    asciiText = pubXml.treeToAsciiText(cleanTree, newlineTags)
    logging.debug("ASCII conversion done")
    logging.debug("title: %s" % title)

    if "title" not in artDict or artDict["title"]=="":
        artDict["title"] = title

    if artDict["abstract"]=="":
        abstract = unidecode.unidecode(asciiText[0:1500]).strip()
        artDict["abstract"] = abstract

    logging.debug("abstract: %s" % artDict["abstract"])
    fileDict = pubStore.createEmptyFileDict(url=url, content=asciiText, mimeType="text/html")
    logging.debug("meta data extract success: %s" % artDict)
    return artDict, fileDict
示例#2
0
def convertHtmlToDicts(url, content):
    """ given a url and content, create file and article dictionaries
    content has to include normal newlines, no \a or #N# replacers

    returns None, None on error

    """
    # lxml does not like unicode if the document has an explicit encoding
    if " encoding=" not in content:
        content = pubGeneric.forceToUnicode(content)
    logging.debug("Converting to text: %s " % (repr(url)))
    artDict = pubStore.createEmptyArticleDict(source="bing", fulltextUrl=url)

    if not "<html" in content:
        return None, None

    try:
        logging.debug("Parsing html with lxml, html size %d" % len(content))
        tree = lxml.html.document_fromstring(content)
        logging.debug("end parse html")
    except lxml.etree.XMLSyntaxError:
        return None, None

    titleEl = tree.find("head/title")
    if titleEl!=None:
        title = titleEl.text
    else:
        logging.debug("No title found?")
        title = ""

    metaTags = tree.findall("head/meta")
    artDict = parseMetaData(metaTags, artDict)
    logging.debug("Cleaning html tree")
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.meta = True
    cleaner.embedded = True
    cleaner.page_structure=True
    #cleaner.remove_tags = ["a", "li", "td"]
    cleanTree = cleaner.clean_html(tree)
    logging.debug("Cleaning done, now converting to ASCII")
    #text = cleanTree.text_content()
    newlineTags = ["p", "br"]
    asciiText = pubXml.treeToAsciiText(cleanTree, newlineTags)
    logging.debug("ASCII conversion done")
    logging.debug("title: %s" % title)

    if "title" not in artDict or artDict["title"]=="":
        artDict["title"] = title

    if artDict["abstract"]=="":
        abstract = unidecode.unidecode(asciiText[0:1500]).strip()
        artDict["abstract"] = abstract

    logging.debug("abstract: %s" % artDict["abstract"])
    fileDict = pubStore.createEmptyFileDict(url=url, content=asciiText, mimeType="text/html")
    logging.debug("meta data extract success: %s" % artDict)
    return artDict, fileDict
示例#3
0
def createFileData(articleData, mimeType, asciiString):
    fileData = pubStore.createEmptyFileDict()
    fileData["desc"] = ""
    fileData["url"] = articleData["fulltextUrl"]
    fileData["content"] = asciiString
    fileData["mimeType"] = mimeType
    fileData["fileType"] = "main"
    return fileData
示例#4
0
def createFileData(articleData, mimeType, asciiString):
    fileData = pubStore.createEmptyFileDict()
    fileData["desc"] = ""
    fileData["url"] = articleData["fulltextUrl"]
    fileData["content"] = asciiString
    fileData["mimeType"] = mimeType
    fileData["fileType"] = "main"
    return fileData
示例#5
0
def createFileData(articleData, mimeType, asciiString):
    fileData = pubStore.createEmptyFileDict()
    fileData["desc"] = ""
    fileData["url"] = articleData["fulltextUrl"].replace("/article/", "/content/pdf/") + ".pdf"
    fileData["content"] = asciiString
    fileData["mimeType"] = mimeType
    fileData["fileType"] = "main"
    return fileData
示例#6
0
def createFileData(articleData, mimeType, asciiString):
    fileData = pubStore.createEmptyFileDict()
    fileData["desc"] = ""
    fileData["url"] = articleData["fulltextUrl"].replace("/article/", "/content/pdf/")+".pdf"
    fileData["content"] = asciiString
    fileData["mimeType"] = mimeType
    fileData["fileType"] = "main"
    return fileData
示例#7
0
def minimalHtmlToDicts(url, content):
    " a minimalistic article dict filler, does not try to parse the html "
    logging.debug("Falling back to minimal html to text")
    fileDict = pubStore.createEmptyFileDict(url=url, content=content, mimeType="text/html")
    fileDict = pubGeneric.toAsciiEscape(fileDict, mimeType="text/html")
    if fileDict==None or not "content" in fileDict:
        return None, None
    text = fileDict["content"]
    title = unidecode.unidecode(content[:100])
    abstract = unidecode.unidecode(content[100:1000])
    artDict = pubStore.createEmptyArticleDict(source="bing", fulltextUrl=url, \
        title=title, abstract=abstract, externalId=url)
    #if fileDict==None: #continue
    return artDict, fileDict
示例#8
0
def minimalHtmlToDicts(url, content):
    " a minimalistic article dict filler, does not try to parse the html "
    logging.debug("Falling back to minimal html to text")
    fileDict = pubStore.createEmptyFileDict(url=url, content=content, mimeType="text/html")
    fileDict = pubGeneric.toAsciiEscape(fileDict, mimeType="text/html")
    if fileDict==None or not "content" in fileDict:
        return None, None
    text = fileDict["content"]
    title = unidecode.unidecode(content[:100])
    abstract = unidecode.unidecode(content[100:1000])
    artDict = pubStore.createEmptyArticleDict(source="bing", fulltextUrl=url, \
        title=title, abstract=abstract, externalId=url)
    #if fileDict==None: #continue
    return artDict, fileDict