def treeToAscii_Elsevier(tree):
    """Try to convert an Elsevier XML tree to plain text.

    The text of the "document-properties/raw-text" element is preferred when
    it is present and non-empty. Otherwise the main article element (as
    located by findMainArticleTag) is converted with pubXml.treeToAsciiText.

    Args:
        tree: parsed XML element; only find() is called on it here.

    Returns:
        (text, "text/plain") when raw text was found,
        (text, "text/xml") when the article element was converted,
        (None, None) when findMainArticleTag returns no article element.
    """
    logging.debug("Converting elsevier tree to ascii text")

    docProps = tree.find("document-properties")
    rawTextEl = docProps.find("raw-text") if docProps is not None else None
    rawText = rawTextEl.text if rawTextEl is not None else None

    if rawText is not None:
        try:
            # Round-trip latin1 -> utf8; presumably this repairs UTF-8 text
            # that was mis-decoded as Latin-1 upstream (TODO confirm).
            asciiText = rawText.encode('latin1').decode('utf8')
        except (UnicodeEncodeError, UnicodeDecodeError):
            # The round-trip is not possible for this text: fall back to the
            # project's generic conversion helper.
            asciiText = pubGeneric.forceToUnicode(rawText)
        return asciiText, "text/plain"

    articleEl, _articleType = findMainArticleTag(tree)
    if articleEl is None:
        return None, None

    asciiText = pubXml.treeToAsciiText(articleEl, addNewlineTags=elsNewlineTags)
    return asciiText, "text/xml"
def treeToAscii_Elsevier(tree):
    """Try to convert an Elsevier XML tree to plain text.

    If the tree has a "document-properties" child with a "raw-text" element
    whose text is not None, that text is returned. Otherwise the main article
    element reported by findMainArticleTag is rendered through
    pubXml.treeToAsciiText.

    Args:
        tree: parsed XML element; only find() is called on it here.

    Returns:
        A (text, mimeType) pair: mimeType is "text/plain" for raw text and
        "text/xml" for a converted article element. (None, None) is returned
        when findMainArticleTag yields no article element.
    """
    logging.debug("Converting elsevier tree to ascii text")

    rawText = None
    propsEl = tree.find("document-properties")
    if propsEl is not None:
        rawEl = propsEl.find("raw-text")
        if rawEl is not None:
            rawText = rawEl.text

    if rawText is not None:
        try:
            # Encode as latin1 and decode as utf8; presumably undoes a wrong
            # Latin-1 decoding of UTF-8 bytes (TODO confirm with data owner).
            converted = rawText.encode('latin1').decode('utf8')
        except (UnicodeEncodeError, UnicodeDecodeError):
            # Either step of the round-trip failed: use the generic helper.
            converted = pubGeneric.forceToUnicode(rawText)
        return converted, "text/plain"

    articleEl, _articleType = findMainArticleTag(tree)
    if articleEl is None:
        return None, None

    return pubXml.treeToAsciiText(articleEl, addNewlineTags=elsNewlineTags), "text/xml"
def convertHtmlToDicts(url, content):
    r"""Given a url and its HTML content, create article and file dictionaries.

    content has to include normal newlines, no \a or #N# replacers.

    Args:
        url: address of the page; passed as fulltextUrl of the article
            dictionary and as url of the file dictionary.
        content: the HTML markup of the page.

    Returns:
        (artDict, fileDict) on success. (None, None) if content does not
        contain "<html" or lxml raises XMLSyntaxError while parsing.
    """
    # lxml does not like unicode if the document has an explicit encoding
    if " encoding=" not in content:
        content = pubGeneric.forceToUnicode(content)
    logging.debug("Converting to text: %s " % (repr(url)))

    # Reject non-HTML input before building any dictionaries.
    if "<html" not in content:
        return None, None

    try:
        logging.debug("Parsing html with lxml, html size %d" % len(content))
        tree = lxml.html.document_fromstring(content)
        logging.debug("end parse html")
    except lxml.etree.XMLSyntaxError:
        return None, None

    titleEl = tree.find("head/title")
    if titleEl is not None:
        # .text is None for an empty <title/>; keep the title a string.
        title = titleEl.text or ""
    else:
        logging.debug("No title found?")
        title = ""

    artDict = pubStore.createEmptyArticleDict(source="bing", fulltextUrl=url)
    artDict = parseMetaData(tree.findall("head/meta"), artDict)

    logging.debug("Cleaning html tree")
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.meta = True
    cleaner.embedded = True
    cleaner.page_structure = True
    cleanTree = cleaner.clean_html(tree)
    logging.debug("Cleaning done, now converting to ASCII")

    newlineTags = ["p", "br"]
    asciiText = pubXml.treeToAsciiText(cleanTree, newlineTags)
    logging.debug("ASCII conversion done")
    logging.debug("title: %s" % title)

    # The <title> element is only a fallback for a title from the meta tags.
    if "title" not in artDict or artDict["title"] == "":
        artDict["title"] = title

    # Without an abstract from the meta tags, use the start of the page text.
    if artDict["abstract"] == "":
        artDict["abstract"] = unidecode.unidecode(asciiText[0:1500]).strip()

    logging.debug("abstract: %s" % artDict["abstract"])
    fileDict = pubStore.createEmptyFileDict(url=url, content=asciiText, mimeType="text/html")
    logging.debug("meta data extract success: %s" % artDict)
    return artDict, fileDict
def _readFilesForArticle(self, articleId, fileDataList):
    """Read file rows until the articleId changes, adding them to fileDataList.

    Rows are taken from self.fileRows. The content of every row is passed
    through pubGeneric.forceToUnicode before the row is used.

    Args:
        articleId: id of the article whose files are being collected.
        fileDataList: list that matching rows are appended to (modified in
            place); it may already contain rows of this article.

    Returns:
        (fileDataList, nextFileData) where nextFileData is the first row that
        belongs to a different article, with its content already converted,
        or None if self.fileRows ran out of rows.

    Raises:
        AssertionError: on an article change, if the rows in fileDataList do
            not all share the single id prefix str(articleId).
    """
    for fileData in self.fileRows:
        # Lazy logging arguments: this runs once per file row, so skip the
        # string formatting unless level 5 is actually enabled.
        logging.log(5, "Read file data %s for article %s",
                    str(fileData.fileId), fileData.articleId)
        text = pubGeneric.forceToUnicode(fileData.content)
        fileData = fileData._replace(content=text)

        if articleId == fileData.articleId:
            logging.log(5, "adding file data")
            fileDataList.append(fileData)
            continue

        # Article change: sanity-check the collected rows, then hand the
        # already-read row of the next article back to the caller.
        fileIds = list({str(x.fileId)[:pubConf.ARTICLEDIGITS] for x in fileDataList})
        logging.log(5, "article change. yielding: articleId %s, %d files with ids %s",
                    articleId, len(fileDataList), fileIds)
        assert len(fileIds) == 1, "expected exactly one article id prefix among the file ids"
        assert fileIds[0] == str(articleId), "file id prefix does not match the article id"
        return fileDataList, fileData

    return fileDataList, None