def convertHtmlToDicts(url, content):
    r"""Convert an HTML page into an article dict and a file dict.

    The page is parsed with lxml, metadata is pulled from the <head> meta
    tags via parseMetaData(), the tree is cleaned and then flattened to
    ASCII text with pubXml.treeToAsciiText().

    url     -- address of the page; stored as the fulltext url of the article
               and as the url of the file
    content -- the html source. It has to include normal newlines, no \a or
               #N# replacers.

    Returns a tuple (artDict, fileDict).
    Returns (None, None) if content does not contain "<html" or if lxml
    cannot parse it.
    """
    # lxml does not like unicode if the document has an explicit encoding
    if " encoding=" not in content:
        content = pubGeneric.forceToUnicode(content)
    logging.debug("Converting to text: %s " % (repr(url)))
    artDict = pubStore.createEmptyArticleDict(source="bing", fulltextUrl=url)

    if "<html" not in content:
        return None, None

    try:
        logging.debug("Parsing html with lxml, html size %d" % len(content))
        tree = lxml.html.document_fromstring(content)
        logging.debug("end parse html")
    except (lxml.etree.XMLSyntaxError, lxml.etree.ParserError):
        # ParserError is what document_fromstring raises for a document that
        # parses to nothing; treat it like any other unparseable input
        return None, None

    titleEl = tree.find("head/title")
    if titleEl is not None:
        # .text is None for an empty <title> element, normalize to ""
        title = titleEl.text or ""
    else:
        logging.debug("No title found?")
        title = ""

    metaTags = tree.findall("head/meta")
    artDict = parseMetaData(metaTags, artDict)

    logging.debug("Cleaning html tree")
    # NOTE(review): Cleaner is presumably lxml.html.clean.Cleaner - confirm
    # against the imports of this file
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.meta = True
    cleaner.embedded = True
    cleaner.page_structure = True
    cleanTree = cleaner.clean_html(tree)

    logging.debug("Cleaning done, now converting to ASCII")
    newlineTags = ["p", "br"]
    asciiText = pubXml.treeToAsciiText(cleanTree, newlineTags)
    logging.debug("ASCII conversion done")
    logging.debug("title: %s" % title)

    # the <title> element is only a fallback for a title from the meta tags
    if "title" not in artDict or artDict["title"] == "":
        artDict["title"] = title

    # no abstract from the meta tags: use the start of the page text instead
    if artDict["abstract"] == "":
        abstract = unidecode.unidecode(asciiText[0:1500]).strip()
        artDict["abstract"] = abstract

    logging.debug("abstract: %s" % artDict["abstract"])
    fileDict = pubStore.createEmptyFileDict(url=url, content=asciiText,
                                            mimeType="text/html")
    logging.debug("meta data extract success: %s" % artDict)
    return artDict, fileDict
def createFileData(articleData, mimeType, asciiString):
    """Create the file dict for the main fulltext file of an article.

    articleData -- article dict, only its "fulltextUrl" entry is read and
                   copied unchanged into the "url" field
    mimeType    -- stored in the "mimeType" field
    asciiString -- stored in the "content" field

    The "desc" field is set to the empty string and "fileType" to "main".
    Returns the filled file dict.
    """
    record = pubStore.createEmptyFileDict()
    # a tuple of pairs keeps the fields in a fixed assignment order
    fieldValues = (
        ("desc", ""),
        ("url", articleData["fulltextUrl"]),
        ("content", asciiString),
        ("mimeType", mimeType),
        ("fileType", "main"),
    )
    for fieldName, fieldValue in fieldValues:
        record[fieldName] = fieldValue
    return record
def createFileData(articleData, mimeType, asciiString):
    """Create the file dict for the main fulltext file of an article.

    The "url" field is derived from articleData["fulltextUrl"]: every
    "/article/" in it is replaced with "/content/pdf/" and ".pdf" is
    appended.

    articleData -- article dict, only its "fulltextUrl" entry is read
    mimeType    -- stored in the "mimeType" field
    asciiString -- stored in the "content" field

    The "desc" field is set to the empty string and "fileType" to "main".
    Returns the filled file dict.
    """
    record = pubStore.createEmptyFileDict()
    articleUrl = articleData["fulltextUrl"]
    pdfUrl = articleUrl.replace("/article/", "/content/pdf/") + ".pdf"
    # a tuple of pairs keeps the fields in a fixed assignment order
    fieldValues = (
        ("desc", ""),
        ("url", pdfUrl),
        ("content", asciiString),
        ("mimeType", mimeType),
        ("fileType", "main"),
    )
    for fieldName, fieldValue in fieldValues:
        record[fieldName] = fieldValue
    return record
def createFileData(articleData, mimeType, asciiString):
    """Build the "main" file dict of an article, pointing at its PDF.

    articleData -- article dict; its "fulltextUrl" entry is turned into the
                   file url by replacing "/article/" with "/content/pdf/"
                   and appending ".pdf"
    mimeType    -- value for the "mimeType" field
    asciiString -- value for the "content" field

    "desc" is left empty and "fileType" is always "main".
    Returns the file dict.
    """
    fileRow = pubStore.createEmptyFileDict()

    pdfPath = articleData["fulltextUrl"].replace("/article/", "/content/pdf/")

    fileRow["desc"] = ""
    fileRow["url"] = "".join((pdfPath, ".pdf"))
    fileRow["content"] = asciiString
    fileRow["mimeType"] = mimeType
    fileRow["fileType"] = "main"
    return fileRow
def minimalHtmlToDicts(url, content):
    """A minimalistic article dict filler, does not try to parse the html.

    The content is wrapped into a file dict and passed through
    pubGeneric.toAsciiEscape(). The article title is the first 100
    characters of content and the abstract the characters 100-1000, both
    passed through unidecode.

    url     -- stored as file url, as fulltext url and as external id of the
               article
    content -- the html source

    Returns a tuple (artDict, fileDict).
    Returns (None, None) if toAsciiEscape() returns None or a dict without
    a "content" entry.
    """
    logging.debug("Falling back to minimal html to text")
    fileDict = pubStore.createEmptyFileDict(url=url, content=content,
                                            mimeType="text/html")
    fileDict = pubGeneric.toAsciiEscape(fileDict, mimeType="text/html")
    if fileDict is None or "content" not in fileDict:
        return None, None

    # NOTE(review): title and abstract are cut from the raw input, not from
    # the converted fileDict["content"] - confirm that this is intended
    title = unidecode.unidecode(content[:100])
    abstract = unidecode.unidecode(content[100:1000])
    artDict = pubStore.createEmptyArticleDict(source="bing", fulltextUrl=url,
                                              title=title, abstract=abstract,
                                              externalId=url)
    return artDict, fileDict