def strip_wrapping(html):
    """
    Removes the wrapping that might have resulted when using get_html_tree().
    """
    if html.startswith('<div>') and html.endswith('</div>'):
        html = html[5:-6]
    return html.strip()
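# Minimal usage sketch for strip_wrapping() above; the sample markup is
# illustrative, not taken from real data.
assert strip_wrapping('<div> <p>hello</p> </div>') == '<p>hello</p>'  # wrapper removed, whitespace trimmed
assert strip_wrapping('<p>hello</p>') == '<p>hello</p>'               # no wrapper: returned unchanged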
def block_html(self, html):
    if self.texoid and html.startswith('<latex'):
        attr = html[6:html.index('>')]
        latex = html[html.index('>') + 1:html.rindex('<')]
        latex = self.parser.unescape(latex)
        result = self.texoid.get_result(latex)
        if not result:
            return '<pre>%s</pre>' % mistune.escape(latex, smart_amp=False)
        elif 'error' not in result:
            img = ('''<img src="%(svg)s" onerror="this.src='%(png)s';this.onerror=null"'''
                   ' width="%(width)s" height="%(height)s"%(tail)s>') % {
                'svg': result['svg'],
                'png': result['png'],
                'width': result['meta']['width'],
                'height': result['meta']['height'],
                'tail': ' /' if self.options.get('use_xhtml') else '',
            }
            style = ['max-width: 100%',
                     'height: %s' % result['meta']['height'],
                     'max-height: %s' % result['meta']['height'],
                     'width: %s' % result['meta']['height']]
            if 'inline' in attr:
                tag = 'span'
            else:
                tag = 'div'
                style += ['text-align: center']
            return '<%s style="%s">%s</%s>' % (tag, ';'.join(style), img, tag)
        else:
            return '<pre>%s</pre>' % mistune.escape(result['error'], smart_amp=False)
    return super(AwesomeRenderer, self).block_html(html)
def from_string(cls, html):
    html = cls.get_unicode_html(html)
    # Enclosed in a `try` to prevent bringing the entire library
    # down due to one article (out of potentially many in a `Source`)
    # noinspection PyBroadException,PyUnusedLocal
    # lxml does not play well with <? ?> encoding tags
    if html.startswith('<?'):
        html = re.sub(r'^<\?.*?\?>', '', html, flags=re.DOTALL)
    cls.doc = lxml.html.fromstring(html)
    return cls.doc
def test_home_page_returns_correct_html(self):
    # we need Karyn in the DB in order to log her in.
    # load_model_objects returns a `dot_notation` dict which we can
    # use all of the model objects from, seen in the print stmnt below.
    self.client.login(username='******', password='******')
    response = self.client.get('/')
    html = response.content.decode('utf8').rstrip()
    self.assertTrue(html.startswith('<!DOCTYPE html>'))
    self.assertIn('<title>factotum</title>', html)
    self.assertTrue(html.endswith('</html>'))
def fromstring(cls, html):
    html = cls.get_unicode_html(html)
    # Enclosed in a `try` to prevent bringing the entire library
    # down due to one article (out of potentially many in a `Source`)
    try:
        # lxml does not play well with <? ?> encoding tags
        if html.startswith('<?'):
            html = re.sub(r'^\<\?.*?\?\>', '', html, flags=re.DOTALL)
        cls.doc = lxml.html.fromstring(html)
        return cls.doc
    except Exception as e:
        print(e)
def fromstring(cls, html):
    html = cls.get_unicode_html(html)
    # Enclosed in a `try` to prevent bringing the entire library
    # down due to one article (out of potentially many in a `Source`)
    try:
        # lxml does not play well with <? ?> encoding tags
        if html.startswith('<?'):
            html = re.sub(r'^\<\?.*?\?\>', '', html, flags=re.DOTALL)
        cls.doc = lxml.html.fromstring(html)
        return cls.doc
    except Exception:
        traceback.print_exc()
        return
def sanitize_fragment(html):
    '''
    #html5lib reorders arguments, so not usable
    import html5lib
    return html5lib.parseFragment(html).toxml().decode('utf-8')
    '''
    if not html:
        return u''
    import lxml.html
    body = lxml.html.document_fromstring(html).find('body')
    html = lxml.html.tostring(body, encoding='utf-8')[6:-7].decode('utf-8')
    if html.startswith('<p>') and html.endswith('</p>'):
        html = html[3:-4]
    return html
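# Minimal usage sketch for sanitize_fragment() above; the markup is
# illustrative. lxml.html.document_fromstring() wraps the fragment in a full
# document, the byte slicing removes the <body> wrapper again, and a single
# outer <p> is dropped if one remains.
assert sanitize_fragment(u'<p>hello <b>world</b></p>') == u'hello <b>world</b>'
assert sanitize_fragment(u'') == u''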
def fromstring(cls, html):
    html = utils.get_unicode(html, is_html=True)
    # Enclosed in a `try` to prevent bringing the entire library
    # down due to one article (out of potentially many in a `Source`)
    try:
        # Remove encoding tag because lxml won't accept it for
        # unicode objects (Issue #78)
        if html.startswith('<?'):
            html = re.sub(r'^\<\?.*?\?\>', '', html, flags=re.DOTALL)
        cls.doc = lxml.html.fromstring(html)
        return cls.doc
    except Exception:
        traceback.print_exc()
        return None
def fromstring(cls, html):
    html = cls.get_unicode_html(html)
    # Enclosed in a `try` to prevent bringing the entire library
    # down due to one article (out of potentially many in a `Source`)
    try:
        # lxml does not play well with <? ?> encoding tags
        if html.startswith('<?'):
            html = re.sub(r'^\<\?.*?\?\>', '', html, flags=re.DOTALL)
        cls.doc = lxml.html.fromstring(html)
        # print('this is the doc {}'.format(cls.doc))
        return cls.doc
    except Exception:
        log.warn('fromstring() returned an invalid string: %s...', html[:20])
        return
def fromstring(cls, html):
    html = cls.get_unicode_html(html)
    # Enclosed in a `try` to prevent bringing the entire library
    # down due to one article (out of potentially many in a `Source`)
    try:
        # lxml does not play well with <? ?> encoding tags
        if html.startswith('<?'):
            html = re.sub(r'^\<\?.*?\?\>', '', html, flags=re.DOTALL)
        cls.doc = lxml.html.fromstring(html)
        return cls.doc
    except Exception:
        # Bad html ? Let's try to fix it.
        try:
            cls.doc = lxml.html.soupparser.fromstring(html)
            return cls.doc
        except Exception as exc:
            raise ValueError('Could not parse HTML.') from exc
def pre_parse(self):
    """ Pre-parse a html ebook.

    Does a full parse because a lightweight parse would be
    almost as much work.
    """
    # cache
    if self.xhtml is not None:
        return

    debug("HTMLParser.pre_parse () ...")

    html = self.unicode_content()

    if html.startswith('<?xml'):
        # Try a naive parse. This might fail because of errors in
        # the html or because we have no dtd loaded. We do not
        # load dtds because that makes us dependent on network and
        # the w3c site being up. Having all users of ebookmaker
        # install local dtds is unrealistic.
        try:
            self.xhtml = self.__parse(html)
        except etree.ParseError:
            pass

    if self.xhtml is None:
        # previous parse failed, try tidy
        info("Running html thru tidy.")
        html = self.tidy(html)
        self.xhtml = self.__parse(html)  # let exception bubble up

    self._fix_anchors()  # needs relative paths

    self.xhtml.make_links_absolute(base_url=self.attribs.url)

    self._to_xhtml11()

    self._make_coverpage_link()

    debug("Done parsing %s" % self.attribs.url)
def fromstring(cls, html):
    # next line shouldn't be necessary because
    # we will always sanitize_html before passing in
    # which will always result in unicode
    # html = cls.get_unicode_html(html)

    # pylint: disable=no-member
    if html.startswith('<?'):
        html = re.sub(r'^\<\?.*?\?\>', '', html, flags=re.DOTALL)

    # lxml parser must have utf8. We have unicode, though not
    # necessarily utf8 - so if there's an issue with 'switching
    # encoding' then we force utf8 encoding and try again
    try:
        cls.doc = lxml.html.fromstring(html)  # pylint: disable=no-member
    except lxml.etree.XMLSyntaxError as error:
        if 'switching encoding' not in str(error):
            raise error
        html = codecs.encode(html, 'utf-8')
        cls.doc = lxml.html.fromstring(html)
    return cls.doc
def default_html_handler(get_handle, filename, view_name, request):

    html_template = ViewPageTemplateFile('html_view.pt')

    # exist-db base url
    base_url = '{}/@@view/{}'.format(
        request.context.absolute_url(1),
        '/'.join(request.subpath[:-1]))

    # get HTML
    html = get_handle.open(get_handle.leaf_filename, 'rb').read()
    html = unicode(html, 'utf8')
    if html.startswith(u'<?xml'):
        pos = html.find('?>')
        html = html[pos+3:]
    root = lxml.html.fromstring(html)

    # rewrite relative image urls
    for img in root.xpath('//img'):
        src = img.attrib['src']
        if not src.startswith('http'):
            img.attrib['src'] = '{}/{}'.format(base_url, src)

    # rewrite relative link hrefs (stylesheets etc.)
    for link in root.xpath('//link'):
        src = link.attrib['href']
        if not src.startswith('http'):
            link.attrib['href'] = '{}/{}'.format(base_url, src)

    html = lxml.html.tostring(root, encoding=unicode)

    return html_template.pt_render(dict(
        template='html_view',
        request=request,
        context=request.context,
        options=dict(
            base_url=base_url,
            html=html)))
def pre_parse (self):
    """ Pre-parse a html ebook.

    Does a full parse because a lightweight parse would be
    almost as much work.
    """
    # cache
    if self.xhtml is not None:
        return

    debug ("HTMLParser.pre_parse () ...")

    html = self.unicode_content ()

    if html.startswith ('<?xml'):
        # Try a naive parse. This might fail because of errors in
        # the html or because we have no dtd loaded. We do not
        # load dtds because that makes us dependent on network and
        # the w3c site being up. Having all users of epubmaker
        # install local dtds is unrealistic.
        try:
            self.xhtml = self.__parse (html)
        except etree.ParseError:
            pass

    if self.xhtml is None:
        # previous parse failed, try tidy
        info ("Running html thru tidy.")
        html = self.tidy (html)
        self.xhtml = self.__parse (html)  # let exception bubble up

    self._fix_anchors ()  # needs relative paths

    self.xhtml.make_links_absolute (base_url = self.url)

    self.find_coverpage ()

    self._to_xhtml11 ()

    debug ("Done parsing %s" % self.url)
def convertAllData(outputCsv, outputMedia, rootDir, origMediaPrefix, dbName='we_import'):
    # connect to db
    #connection = psycopg2.connect("dbname=we_production")
    connection = psycopg2.connect('dbname=' + dbName)
    cursor = connection.cursor(cursor_factory=psycopg2.extras.DictCursor)
    origDocPrefix = origMediaPrefix + '/docs/0000/'
    origImagePrefix = origMediaPrefix + '/images/0000/'

    # grab table data
    crossRef = {}
    allEntries = grabData(cursor, 'entries', crossRef)
    allPages = grabData(cursor, 'pages', crossRef)
    allImages = grabData(cursor, 'images', crossRef)
    allDocuments = grabData(cursor, 'documents', crossRef)
    allSpecies = grabData(cursor, 'species', crossRef)
    allSections = grabData(cursor, 'sections', crossRef)
    allEvents = grabData(cursor, 'events', crossRef)
    allFaqs = grabData(cursor, 'faqs', crossRef)
    allVersions = grabData(cursor, 'versions', crossRef)
    allAlerts = grabData(cursor, 'alerts', crossRef)
    allLocations = grabData(cursor, 'locations', crossRef)
    rawEntriesSections = grabData(cursor, 'entries_sections', crossRef)
    rawAlertsSections = grabData(cursor, 'alerts_sections', crossRef)

    # clean up database connection
    cursor.close()
    connection.close()

    # create entry-section lookup table
    entryToSection = {}
    for raw in rawEntriesSections:
        if raw['entry_id'] not in entryToSection:
            entryToSection[raw['entry_id']] = [raw['section_id']]
        else:
            entryToSection[raw['entry_id']].append(raw['section_id'])

    # create alert-section lookup table
    alertToSection = {}
    for raw in rawAlertsSections:
        if raw['alert_id'] not in alertToSection:
            alertToSection[raw['alert_id']] = [raw['section_id']]
        else:
            alertToSection[raw['alert_id']].append(raw['section_id'])

    # create media dirs
    try:
        shutil.rmtree(outputMedia)
        os.makedirs(outputMedia)
        os.makedirs(outputMedia + '/images')
        os.makedirs(outputMedia + '/documents')
    except:
        pass

    # find and copy latest version of each media file
    versionLookup = {}
    docFileMap = {}
    for version in crossRef['versions'].values():
        docId = version.document_id
        versions = [(v.id, v.updated_at) for v in crossRef['versions'].values() if v.document_id == docId]
        ids, dates = zip(*versions)
        latestId = ids[dates.index(max(dates))]
        fileNameOrig = crossRef['versions'][latestId].filename
        fileName = convertFileName(fileNameOrig)
        fileNameBase = fileName
        postNum = 1
        while fileName in versionLookup:
            postNum += 1
            name, ext = os.path.splitext(fileNameBase)
            fileName = name + '-' + str(postNum) + ext
        version.out_filename = fileName
        fileNameOrig = '%s%04d/%s' % (origDocPrefix, latestId, fileNameOrig)
        fileNameNew = outputMedia + '/documents/' + fileName
        docFileMap[fileName] = fileNameOrig
        # TODO shutil.copy(fileNameOrig, fileNameNew)
        #print 'copied',fileNameOrig,'to',fileNameNew

    # index image files
    imageFileMap = {}
    for f in glob.glob(origImagePrefix + '/*/*'):
        fileName = convertFileName(os.path.basename(f))
        destFile = outputMedia + '/images/' + fileName
        imageFileMap[fileName] = f

    # convert titles and set tags in all tables
    titleLookup = {}
    for name, table in crossRef.iteritems():
        for curId, item in table.iteritems():
            # convert title
            if hasattr(item, 'title') and item.title is not None:
                title = item.title
            elif hasattr(item, 'name') and item.name is not None:
                title = item.name
            elif hasattr(item, 'filename') and item.filename is not None:
                title = item.filename
            elif hasattr(item, 'common_name') and item.common_name is not None:
                title = item.common_name
            titleNew = convertTitle(title)
            titleNewBase = titleNew
            postNum = 1
            while titleNew in titleLookup:
                postNum += 1
                titleNew = titleNewBase + '-' + str(postNum)
            item.out_title = title
            item.out_title_link = titleNew
            titleLookup[titleNew] = True

            # convert date
            if hasattr(item, 'updated_at'):
                item.out_date = item.updated_at

            # set tags
            if name == 'entries':
                if curId in entryToSection:
                    item.sections = entryToSection[curId]
                else:
                    item.sections = []
            elif name == 'alerts':
                if curId in alertToSection:
                    item.sections = alertToSection[curId]
                else:
                    item.sections = []
            elif name == 'sections':
                item.sections = [curId]

    # translate links in html
    mediaFiles = []
    contentTypes = ['entries', 'pages', 'sections', 'locations', 'species', 'events', 'faqs', 'alerts']
    for curType in contentTypes:
        for curId, entry in crossRef[curType].iteritems():
            # get correct html field
            if hasattr(entry, 'body_html'):
                htmlOrig = entry.body_html
            elif hasattr(entry, 'description_html'):
                htmlOrig = entry.description_html
            if not htmlOrig:
                entry.out_content = ''
                continue

            # iterate over and translate each link
            tree = lxml.html.fromstring(htmlOrig.decode('utf-8'))
            links = tree.iterlinks()
            for link in links:
                linkBefore = link[0].get(link[1])
                replaceLink(link, crossRef, rootDir, mediaFiles)
                linkAfter = link[0].get(link[1])
                print 'TRANSLATED', linkBefore, 'TO', linkAfter

            # form new html string
            html = lxml.html.tostring(tree)
            if html.endswith('</div>'):
                html = html[0:-6]
            if html.startswith('<div>'):
                html = html[5:]
            entry.out_content = html

            if '\x2019' in htmlOrig and 'path on the seawall' in htmlOrig:
                print '**********'
                print htmlOrig
                print '++++++++++'
                print html
                #sys.exit(-1)

    # find and copy images
    for media in mediaFiles:
        if media.isImage:
            if media.fileName in imageFileMap:
                destFile = outputMedia + '/images/' + media.fileName
                shutil.copy(imageFileMap[media.fileName], destFile)
                print 'copied image', imageFileMap[media.fileName], media.fileName
            else:
                print 'IMGFILE BAD', media.fileName
        else:
            if media.fileName in docFileMap:
                destFile = outputMedia + '/documents/' + media.fileName
                shutil.copy(docFileMap[media.fileName], destFile)
                print 'copied doc', docFileMap[media.fileName], media.fileName
            else:
                print 'DOCFILE BAD', media.fileName

    # collect all items
    allItems = []
    for ref in [crossRef[contentType] for contentType in contentTypes]:
        allItems += ref.values()

    # add remaining fields
    curId = 1
    for item in allItems:
        item.out_id = 10000 + curId
        curId += 1
        item.out_tags = []
        if hasattr(item, 'sections'):
            item.out_tags = [crossRef['sections'][tag].title for tag in item.sections]
            print 'TAGS', item.out_tags
        item.out_thumb = ''  # TODO: thumb

    # output csv
    f = open(outputCsv, 'w')
    f.write('post_id,post_name,post_type,post_date,post_title,post_content,post_status,post_category,post_tags,post_thumbnail,news_summary\n')
    for item in allItems:
        f.write(createCsvRow(item))
    f.close()

    print 'ALL DONE, wrote', len(allItems), 'records'