Example No. 1
def strip_wrapping(html):
    """
    Removes the wrapping that might have resulted when using get_html_tree().
    """
    if html.startswith('<div>') and html.endswith('</div>'):
        html = html[5:-6]
    return html.strip()
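A minimal usage sketch (the `<div>` wrapping produced by get_html_tree() is assumed from the docstring; the sample markup is invented):

wrapped = '<div><p>Hello, <b>world</b></p></div>'
assert strip_wrapping(wrapped) == '<p>Hello, <b>world</b></p>'
# Unwrapped input is only stripped of surrounding whitespace:
assert strip_wrapping('  <p>plain</p>  ') == '<p>plain</p>'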
Example No. 2
 def block_html(self, html):
     if self.texoid and html.startswith('<latex'):
         attr = html[6:html.index('>')]
         latex = html[html.index('>') + 1:html.rindex('<')]
         latex = self.parser.unescape(latex)
         result = self.texoid.get_result(latex)
         if not result:
             return '<pre>%s</pre>' % mistune.escape(latex, smart_amp=False)
         elif 'error' not in result:
             img = ('''<img src="%(svg)s" onerror="this.src='%(png)s';this.onerror=null"'''
                    'width="%(width)s" height="%(height)s"%(tail)s>') % {
                       'svg': result['svg'], 'png': result['png'],
                       'width': result['meta']['width'], 'height': result['meta']['height'],
                       'tail': ' /' if self.options.get('use_xhtml') else ''
                   }
             style = ['max-width: 100%',
                      'height: %s' % result['meta']['height'],
                      'max-height: %s' % result['meta']['height'],
                      'width: %s' % result['meta']['height']]
             if 'inline' in attr:
                 tag = 'span'
             else:
                 tag = 'div'
                 style += ['text-align: center']
             return '<%s style="%s">%s</%s>' % (tag, ';'.join(style), img, tag)
         else:
             return '<pre>%s</pre>' % mistune.escape(result['error'], smart_amp=False)
     return super(AwesomeRenderer, self).block_html(html)
Example No. 3
 def from_string(cls, html):
     html = cls.get_unicode_html(html)
     # lxml does not play well with <? ?> encoding tags
     if html.startswith('<?'):
         html = re.sub(r'^<\?.*?\?>', '', html, flags=re.DOTALL)
     cls.doc = lxml.html.fromstring(html)
     return cls.doc
Example No. 4
 def test_home_page_returns_correct_html(self):
     # we need Karyn in the DB in order to log her in.
     # load_model_objects returns a `dot_notation` dict that gives access
     # to all of the model objects it created.
     self.client.login(username='******', password='******')
     response = self.client.get('/')
     html = response.content.decode('utf8').rstrip()
     self.assertTrue(html.startswith('<!DOCTYPE html>'))
     self.assertIn('<title>factotum</title>', html)
     self.assertTrue(html.endswith('</html>'))
Example No. 5
	def fromstring(cls, html):
		html = cls.get_unicode_html(html)
		# Enclosed in a `try` to prevent bringing the entire library
		# down due to one article (out of potentially many in a `Source`)
		try:
			# lxml does not play well with <? ?> encoding tags
			if html.startswith('<?'):
				html = re.sub(r'^\<\?.*?\?\>', '', html, flags=re.DOTALL)
			cls.doc = lxml.html.fromstring(html)
			return cls.doc
		except Exception as e:
			print(e)
Example No. 6
 def fromstring(cls, html):
     html = cls.get_unicode_html(html)
     # Enclosed in a `try` to prevent bringing the entire library
     # down due to one article (out of potentially many in a `Source`)
     try:
         # lxml does not play well with <? ?> encoding tags
         if html.startswith('<?'):
             html = re.sub(r'^\<\?.*?\?\>', '', html, flags=re.DOTALL)
         cls.doc = lxml.html.fromstring(html)
         return cls.doc
     except Exception:
         traceback.print_exc()
         return
Example No. 7
def sanitize_fragment(html):
    '''
    #html5lib reorders arguments, so not usable
    import html5lib
    return html5lib.parseFragment(html).toxml().decode('utf-8')
    '''
    if not html:
        return u''
    import lxml.html
    body = lxml.html.document_fromstring(html).find('body')
    html = lxml.html.tostring(body, encoding='utf-8')[6:-7].decode('utf-8')
    if html.startswith('<p>') and html.endswith('</p>'):
        html = html[3:-4]
    return html
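A quick usage sketch (sample fragments invented): lxml's document_fromstring() wraps a bare fragment in <html><body>...</body></html>, and the [6:-7] slice removes the <body> tags again.

print(sanitize_fragment(u'<p>Tom &amp; Jerry</p>'))  # Tom &amp; Jerry
print(sanitize_fragment(u'<em>kept</em> as-is'))     # <em>kept</em> as-is (no <p> wrapper to strip)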
Example No. 8
 def fromstring(cls, html):
     html = utils.get_unicode(html, is_html=True)
     # Enclosed in a `try` to prevent bringing the entire library
     # down due to one article (out of potentially many in a `Source`)
     try:
         # Remove encoding tag because lxml won't accept it for
         # unicode objects (Issue #78)
         if html.startswith('<?'):
             html = re.sub(r'^\<\?.*?\?\>', '', html, flags=re.DOTALL)
         cls.doc = lxml.html.fromstring(html)
         return cls.doc
     except Exception:
         traceback.print_exc()
         return None
Example No. 9
 def fromstring(cls, html):
     html = cls.get_unicode_html(html)
     # Enclosed in a `try` to prevent bringing the entire library
     # down due to one article (out of potentially many in a `Source`)
     try:
         # lxml does not play well with <? ?> encoding tags
         if html.startswith('<?'):
             html = re.sub(r'^\<\?.*?\?\>', '', html, flags=re.DOTALL)
         cls.doc = lxml.html.fromstring(html)
         # print('this is the doc {}'.format(cls.doc))
         return cls.doc
     except Exception:
         log.warn('fromstring() returned an invalid string: %s...',
                  html[:20])
         return
Example No. 10
 def fromstring(cls, html):
     html = cls.get_unicode_html(html)
     # Enclosed in a `try` to prevent bringing the entire library
     # down due to one article (out of potentially many in a `Source`)
     try:
         # lxml does not play well with <? ?> encoding tags
         if html.startswith('<?'):
             html = re.sub(r'^\<\?.*?\?\>', '', html, flags=re.DOTALL)
         cls.doc = lxml.html.fromstring(html)
         return cls.doc
     except Exception:
         # Bad html ? Let's try to fix it.
         try:
             cls.doc = lxml.html.soupparser.fromstring(html)
             return cls.doc
         except Exception as exc:
             raise ValueError('Could not parse HTML.') from exc
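A self-contained sketch of the step all the fromstring() variants above share: lxml.html.fromstring() rejects unicode input that still carries an encoding declaration, so the <?xml ...?> prolog is stripped first (the sample document is invented).

import re
import lxml.html

html = u'<?xml version="1.0" encoding="utf-8"?>\n<html><body><p>hi</p></body></html>'

# Passing html unchanged raises ValueError: "Unicode strings with encoding
# declaration are not supported" -- hence the re.sub() in the snippets above.
html = re.sub(r'^<\?.*?\?>', '', html, flags=re.DOTALL)
doc = lxml.html.fromstring(html)
print(doc.findtext('.//p'))  # hi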
Example No. 11
    def pre_parse(self):
        """
        Pre-parse a html ebook.

        Does a full parse because a lightweight parse would be almost
        as much work.

        """

        # cache
        if self.xhtml is not None:
            return

        debug("HTMLParser.pre_parse () ...")

        html = self.unicode_content()

        if html.startswith('<?xml'):
            # Try a naive parse. This might fail because of errors in
            # the html or because we have no dtd loaded.  We do not
            # load dtds because that makes us dependent on network and
            # the w3c site being up.  Having all users of ebookmaker
            # install local dtds is unrealistic.
            try:
                self.xhtml = self.__parse(html)
            except etree.ParseError:
                pass

        if self.xhtml is None:
            # previous parse failed, try tidy
            info("Running html thru tidy.")
            html = self.tidy(html)
            self.xhtml = self.__parse(html)  # let exception bubble up

        self._fix_anchors()  # needs relative paths

        self.xhtml.make_links_absolute(base_url=self.attribs.url)

        self._to_xhtml11()

        self._make_coverpage_link()

        debug("Done parsing %s" % self.attribs.url)
Example No. 12
    def fromstring(cls, html):
        # next line shouldn't be necessary because
        # we will always sanitize_html before passing in
        # which will always result in unicode
        # html = cls.get_unicode_html(html)
        # pylint: disable=no-member
        if html.startswith('<?'):
            html = re.sub(r'^\<\?.*?\?\>', '', html, flags=re.DOTALL)

        # lxml parser must have utf8.  We have unicode, though not
        # necessarily utf8 - so if there's an issue with 'switching
        # encoding' then we force utf8 encoding and try again
        try:
            cls.doc = lxml.html.fromstring(html)
        # pylint: disable=no-member
        except lxml.etree.XMLSyntaxError as error:
            if 'switching encoding' not in str(error):
                raise error
            html = codecs.encode(html, 'utf-8')
            cls.doc = lxml.html.fromstring(html)
        return cls.doc
Example No. 13
def default_html_handler(get_handle, filename, view_name, request):

    html_template = ViewPageTemplateFile('html_view.pt')

    # exist-db base url
    base_url = '{}/@@view/{}'.format(
        request.context.absolute_url(1),
        '/'.join(request.subpath[:-1]))
    # get HTML
    html = get_handle.open(get_handle.leaf_filename, 'rb').read()
    html = unicode(html, 'utf8')
    if html.startswith(u'<?xml'):
        pos = html.find('?>')
        html = html[pos+3:]
    root = lxml.html.fromstring(html)

    # rewrite relative image urls
    for img in root.xpath('//img'):
        src = img.attrib['src']
        if not src.startswith('http'):
            img.attrib['src'] = '{}/{}'.format(base_url, src)

    # rewrite relative stylesheet/link hrefs
    for link in root.xpath('//link'):
        src = link.attrib['href']
        if not src.startswith('http'):
            link.attrib['href'] = '{}/{}'.format(base_url, src)

    html = lxml.html.tostring(root, encoding=unicode)

    return html_template.pt_render(dict(
        template='html_view',
        request=request,
        context=request.context,
        options=dict(
            base_url=base_url,
            html=html)))
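For comparison, a sketch of the same relative-URL rewriting using lxml's built-in make_links_absolute(); the sample markup and base URL are invented, and note the urljoin semantics: the trailing slash on the base matters, unlike the manual '{}/{}'.format(base_url, src) above.

import lxml.html

base_url = 'http://example.com/@@view/folder'  # assumed absolute base
html = u'<html><body><img src="pic.png"><link href="style.css"></body></html>'

root = lxml.html.fromstring(html)
root.make_links_absolute(base_url + '/')  # rewrites src/href on img, link, a, script, ...
print(lxml.html.tostring(root, encoding='unicode'))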
Example No. 14
    def pre_parse (self):
        """ Pre-parse a html ebook. Does a full parse because a
        lightweight parse would be almost as much work. """

        # cache
        if self.xhtml is not None:
            return

        debug ("HTMLParser.pre_parse () ...")

        html = self.unicode_content ()

        if html.startswith ('<?xml'):
            # Try a naive parse. This might fail because of errors in
            # the html or because we have no dtd loaded.  We do not
            # load dtds because that makes us dependent on network and
            # the w3c site being up.  Having all users of epubmaker
            # install local dtds is unrealistic.
            try:
                self.xhtml = self.__parse (html)
            except etree.ParseError:
                pass

        if self.xhtml is None:
            # previous parse failed, try tidy
            info ("Running html thru tidy.")
            html = self.tidy (html)
            self.xhtml = self.__parse (html)     # let exception bubble up

        self._fix_anchors () # needs relative paths
        self.xhtml.make_links_absolute (base_url = self.url)
        self.find_coverpage ()

        self._to_xhtml11 ()

        debug ("Done parsing %s" % self.url)
Example No. 15
def convertAllData(outputCsv, outputMedia, rootDir, origMediaPrefix, dbName='we_import'):

	# connect to db
	#connection = psycopg2.connect("dbname=we_production")
	connection = psycopg2.connect('dbname='+dbName)
	cursor = connection.cursor(cursor_factory=psycopg2.extras.DictCursor)

	origDocPrefix = origMediaPrefix + '/docs/0000/'
	origImagePrefix = origMediaPrefix + '/images/0000/'

	# grab table data
	crossRef = {}
	allEntries = grabData(cursor, 'entries', crossRef)
	allPages = grabData(cursor, 'pages', crossRef)
	allImages = grabData(cursor, 'images', crossRef)
	allDocuments = grabData(cursor, 'documents', crossRef)
	allSpecies = grabData(cursor, 'species', crossRef)
	allSections = grabData(cursor, 'sections', crossRef)
	allEvents = grabData(cursor, 'events', crossRef)
	allFaqs = grabData(cursor,'faqs', crossRef)
	allVersions = grabData(cursor,'versions', crossRef)
	allAlerts = grabData(cursor,'alerts', crossRef)
	allLocations = grabData(cursor,'locations', crossRef)
	rawEntriesSections = grabData(cursor, 'entries_sections', crossRef)
	rawAlertsSections = grabData(cursor,'alerts_sections', crossRef)

	# clean up database connection
	cursor.close()
	connection.close()

	# create entry-section lookup table
	entryToSection = {}
	for raw in rawEntriesSections:
		if raw['entry_id'] not in entryToSection:
			entryToSection[raw['entry_id']] = [raw['section_id']]
		else:
			entryToSection[raw['entry_id']].append(raw['section_id'])

	# create alert-section lookup table
	alertToSection = {}
	for raw in rawAlertsSections:
		if raw['alert_id'] not in alertToSection:
			alertToSection[raw['alert_id']] = [raw['section_id']]
		else:
			alertToSection[raw['alert_id']].append(raw['section_id'])

	# create media dirs (clear out any previous output first)
	if os.path.exists(outputMedia):
		shutil.rmtree(outputMedia)
	os.makedirs(outputMedia)
	os.makedirs(outputMedia + '/images')
	os.makedirs(outputMedia + '/documents')

	# find and copy latest version of each media file
	versionLookup = {}
	docFileMap = {}
	for version in crossRef['versions'].values():
		docId = version.document_id
		versions = [(v.id,v.updated_at) for v in crossRef['versions'].values() if v.document_id==docId]
		ids,dates = zip(*versions)
		latestId = ids[dates.index(max(dates))]
		fileNameOrig = crossRef['versions'][latestId].filename
		fileName = convertFileName(fileNameOrig)

		fileNameBase = fileName
		postNum = 1
		while fileName in versionLookup:
			postNum += 1
			name,ext = os.path.splitext(fileNameBase)
			fileName = name + '-' + str(postNum) + ext
		version.out_filename = fileName
		fileNameOrig = '%s%04d/%s' % (origDocPrefix, latestId, fileNameOrig)
		fileNameNew = outputMedia + '/documents/' + fileName
		docFileMap[fileName] = fileNameOrig
		# TODO shutil.copy(fileNameOrig, fileNameNew)
		#print 'copied',fileNameOrig,'to',fileNameNew

	# index image files
	imageFileMap = {}
	for f in glob.glob(origImagePrefix + '/*/*'):
		fileName = convertFileName(os.path.basename(f))
		destFile = outputMedia + '/images/' + fileName
		imageFileMap[fileName] = f

	# convert titles and set tags in all tables
	titleLookup = {}
	for name,table in crossRef.iteritems():
		for curId,item in table.iteritems():
			# convert title
			if hasattr(item,'title') and item.title is not None:
				title = item.title
			elif hasattr(item,'name') and item.name is not None:
				title = item.name
			elif hasattr(item,'filename') and item.filename is not None:
				title = item.filename
			elif hasattr(item,'common_name') and item.common_name is not None:
				title = item.common_name
			titleNew = convertTitle(title)
			titleNewBase = titleNew
			postNum = 1
			while titleNew in titleLookup:
				postNum += 1
				titleNew = titleNewBase + '-' + str(postNum)
			item.out_title = title
			item.out_title_link = titleNew
			titleLookup[titleNew] = True
			
			# convert date
			if hasattr(item,'updated_at'):
				item.out_date = item.updated_at

			# set tags
			if name=='entries':
				if curId in entryToSection:
					item.sections = entryToSection[curId]
				else:
					item.sections = []
			elif name=='alerts':
				if curId in alertToSection:
					item.sections = alertToSection[curId]
				else:
					item.sections = []
			elif name=='sections':
				item.sections = [curId]
		
	# translate links in html
	mediaFiles = []
	contentTypes = ['entries','pages','sections','locations','species','events','faqs','alerts']
	for curType in contentTypes:
		for curId,entry in crossRef[curType].iteritems():
			
			# get correct html field
			if hasattr(entry,'body_html'):
				htmlOrig = entry.body_html
			elif hasattr(entry,'description_html'):
				htmlOrig = entry.description_html
			else:
				# neither field present: don't reuse htmlOrig from the previous entry
				htmlOrig = None
			if not htmlOrig:
				entry.out_content = ''
				continue

			# iterate over and translate each link
			tree = lxml.html.fromstring(htmlOrig.decode('utf-8'))
			links = tree.iterlinks()
			for link in links:
				linkBefore = link[0].get(link[1])
				replaceLink(link, crossRef, rootDir, mediaFiles)
				linkAfter = link[0].get(link[1])
				print 'TRANSLATED',linkBefore,'TO',linkAfter
				
			# form new html string
			html = lxml.html.tostring(tree)
			if html.endswith('</div>'):
				html = html[0:-6]
			if html.startswith('<div>'):
				html = html[5:]
			entry.out_content = html
			# debug spot-check: u'\u2019' (curly apostrophe) is '\xe2\x80\x99' in utf-8 bytes
			if '\xe2\x80\x99' in htmlOrig and 'path on the seawall' in htmlOrig:
				print '**********'
				print htmlOrig
				print '++++++++++'
				print html
				#sys.exit(-1)

	# find and copy images
	for media in mediaFiles:
		if media.isImage:
			if media.fileName in imageFileMap:
				destFile = outputMedia + '/images/' + media.fileName
				shutil.copy(imageFileMap[media.fileName], destFile)
				print 'copied image', imageFileMap[media.fileName], media.fileName
			else:
				print 'IMGFILE BAD', media.fileName
		else:
			if media.fileName in docFileMap:
				destFile = outputMedia + '/documents/' + media.fileName
				shutil.copy(docFileMap[media.fileName], destFile)
				print 'copied doc', docFileMap[media.fileName], media.fileName
			else:
				print 'DOCFILE BAD', media.fileName

			
	# collect all items
	allItems = []
	for ref in [crossRef[contentType] for contentType in contentTypes]:
		allItems += ref.values()

	# add remaining fields
	curId = 1
	for item in allItems:
		item.out_id = 10000 + curId
		curId += 1
		item.out_tags = []
		if hasattr(item,'sections'):
			item.out_tags = [crossRef['sections'][tag].title for tag in item.sections]
			print 'TAGS',item.out_tags
		item.out_thumb = '' # TODO: thumb

	# output csv
	f = open(outputCsv,'w')
	f.write('post_id,post_name,post_type,post_date,post_title,post_content,post_status,post_category,post_tags,post_thumbnail,news_summary\n')
	for item in allItems:
		f.write(createCsvRow(item))
	f.close()
	print 'ALL DONE, wrote', len(allItems), 'records'
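Because createCsvRow() is not shown, the CSV output above relies on it escaping commas and quotes itself; below is a sketch of the same step using the csv module (the column mapping is a guess, not the original one, and Python 2's csv module would still need utf-8 encoded byte strings for non-ASCII fields).

import csv

with open(outputCsv, 'wb') as csvFile:  # Python 2: csv expects a binary-mode file
    writer = csv.writer(csvFile)
    writer.writerow(['post_id', 'post_name', 'post_type', 'post_date', 'post_title',
                     'post_content', 'post_status', 'post_category', 'post_tags',
                     'post_thumbnail', 'news_summary'])
    for item in allItems:
        # guessed mapping from the out_* attributes set above
        writer.writerow([item.out_id, item.out_title_link, 'post', item.out_date,
                         item.out_title, item.out_content, 'publish', '',
                         ','.join(item.out_tags), item.out_thumb, ''])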