def test_xhtml_escape(self):
    """Check that xhtml_escape and xhtml_unescape are inverses on sample pairs.

    Each entry is ``(unescaped, escaped)``. Both directions are compared
    after normalising through ``utf8`` so that native, unicode and byte
    string inputs can share the same assertions.
    """
    # The right-hand side of every pair must hold the entity-encoded text;
    # writing the raw characters there would make the escape assertion
    # compare a value against itself instead of against its escaped form.
    tests = [
        ("<foo>", "&lt;foo&gt;"),
        (u"<foo>", u"&lt;foo&gt;"),
        (b("<foo>"), b("&lt;foo&gt;")),
        ("<>&\"", "&lt;&gt;&amp;&quot;"),
        ("&", "&amp;"),
    ]
    for unescaped, escaped in tests:
        self.assertEqual(utf8(xhtml_escape(unescaped)), utf8(escaped))
        self.assertEqual(utf8(unescaped), utf8(xhtml_unescape(escaped)))
def get_part_of_page(url, xpath="//div[contains(@class,'entrybody')]", charset='utf-8'):
    """Return the first node of the page at ``url`` that matches ``xpath``.

    The page is loaded with ``get_page(url, charset)`` and parsed using
    lxml's ``HTMLParser``. The first XPath match is serialised with
    ``etree.tostring`` (pretty-printed), stripped, and passed through
    ``xhtml_unescape``.

    Returns ``None`` when ``get_page`` yields a falsy value, or when the
    XPath expression matches nothing (an error is logged in that case).
    """
    page_contents = get_page(url, charset)
    if not page_contents:
        return None

    document = etree.parse(StringIO(page_contents), etree.HTMLParser())
    matches = etree.XPath(xpath)(document)
    if len(matches) == 0:
        logging.error(
            'xpath expression "%s" returned nothing on "%s" - modify it',
            xpath, url)
        return None

    markup = etree.tostring(matches[0], pretty_print=True).strip()
    return xhtml_unescape(markup)
def get_part_of_page(url, xpath="//div[contains(@class,'entrybody')]", charset='utf-8'):
    """Extract one fragment of a fetched page, selected by an XPath expression.

    Fetches the page through ``get_page(url, charset)``, parses it with
    lxml's ``HTMLParser`` and evaluates ``xpath`` against the parsed tree.
    The first match is serialised via ``etree.tostring`` with
    ``pretty_print=True``, stripped, and run through ``xhtml_unescape``
    before being returned.

    ``None`` is returned if ``get_page`` gives back a falsy value, and also
    if the expression yields no matches; the latter is logged as an error.
    """
    page_contents = get_page(url, charset)
    if not page_contents:
        return None

    parsed = etree.parse(StringIO(page_contents), etree.HTMLParser())
    selector = etree.XPath(xpath)
    found = selector(parsed)

    if len(found) == 0:
        logging.error(
            'xpath expression "%s" returned nothing on "%s" - modify it',
            xpath, url)
        return None

    serialized = etree.tostring(found[0], pretty_print=True)
    return xhtml_unescape(serialized.strip())
c.comment_author as 'author', c.comment_author_email as 'email', c.comment_author_url as 'author_url', c.comment_date as 'date', c.comment_content as 'content', c.user_id > 0 as 'is_user', CASE c.comment_type WHEN 'pingback' THEN 'pingback' ELSE 'comment' END as 'type', p.post_name, p.post_date, p.guid as 'old_path' FROM {0}comments c JOIN {0}posts p ON (c.comment_post_ID=p.ID) WHERE c.comment_approved='1' AND p.post_type='post' AND p.post_status='publish' ORDER BY p.ID ASC, c.comment_date ASC""".format(wp_prefix)) # """ comments_by_id = dict() threads = OrderedDict() for row in cur: comment = dict(list(zip([c[0] for c in cur.description], row))) comment['content'] = xhtml_unescape(comment['content']).replace( '\r', '') comment['postfile_path'] = path.join( outdir, str(comment['post_date'].year), "%02d" % comment['post_date'].month, comment['post_name'] + '.comments') # author if comment['type'] == 'pingback': comment['title'] = xhtml_unescape(comment['author']) comment['source'] = comment['author_url'] del comment['author_url'] del comment['author'] del comment['email'] # pingback verification if check_pingbacks: logging.debug(
c.comment_author as 'author', c.comment_author_email as 'email', c.comment_author_url as 'author_url', c.comment_date as 'date', c.comment_content as 'content', c.user_id > 0 as 'is_user', CASE c.comment_type WHEN 'pingback' THEN 'pingback' ELSE 'comment' END as 'type', p.post_name, p.post_date, p.guid as 'old_path' FROM {0}comments c JOIN {0}posts p ON (c.comment_post_ID=p.ID) WHERE c.comment_approved='1' AND p.post_type='post' AND p.post_status='publish' ORDER BY p.ID ASC, c.comment_date ASC""".format(wp_prefix)) # """ comments_by_id = dict() threads = OrderedDict() for row in cur: comment = dict(zip([c[0] for c in cur.description], row)) comment['content'] = xhtml_unescape(comment['content']).replace('\r', '') comment['postfile_path'] = path.join(outdir, str(comment['post_date'].year), "%02d" % comment['post_date'].month, comment['post_name']+'.comments') # author if comment['type'] == 'pingback': comment['title'] = xhtml_unescape(comment['author']) comment['source'] = comment['author_url'] del comment['author_url'] del comment['author'] del comment['email'] # pingback verification if check_pingbacks: logging.debug('about to load page "%s", which has been the source of a pingback', comment['source']) contents = get_page(comment['source']) old_path = '/'.join(['/', comment['old_path'].split('/', 3)[2], str(comment['post_date'].year), "%02d" % comment['post_date'].month, comment['post_name']]) if contents and ('href="http:'+old_path in contents or 'href="https:'+old_path in contents):