Example #1
def grab_entry(url):
    doc = scraper.get(url).html()
    art = doc.find('.//div[@class="article"]')
    meta = {
        'html': html.tostring(art),
        'info_url': url
    }
    meta['person'] = art.findtext('./h3/span')
    
    for item in art.findall('.//li'):
        if 'download' in item.get('class', ''):
            doc_url = item.find('.//a').get('href')
            meta['source_url'] = urljoin(url, doc_url)
            continue

        label = item.findtext('./label')
        if label is not None:
            label = label.strip().lower().replace(' ', '_')

        content = item.find('./span')
        if content is None:
            continue

        content = html.tostring(content).split('>', 1)[-1].rsplit('<', 1)[0]
        if 'gifts' in item.get('class', ''):
            items = map(clean, content.split('<br>'))
            meta[label] = filter(lambda s: len(s), items)
        else:
            meta[label] = clean(content)
    if 'pdf' in meta.get('source_url', ''):
        print meta['source_url']
        collection.ingest(meta.get('source_url'), **meta)
Example #2
File: app.py Project: newsapps/layercake
    def prepare_updated_content(self,slug,rendered_content):
        """Given a slug, get its current content. Either insert the HTML string 'rendered_content' as the first child
        of the current content for the slug, or, if an element with the same tag and id already exists, replace that
        element. Return the title and current content as a tuple of strings."""
        content_item = self.p2p_get_content_item(slug)
        current_content = content_item['body']
        title = content_item['title']
        parsed = fromstring(current_content)
        container = parsed.find('.//div[@id="layercake-items"]')
        if container is None:
            container = parsed.makeelement('div',{'id': 'layercake-items'})
            made_container = True
        else:
            made_container = False
        new_parsed = fromstring(rendered_content)
        try:
            existing = container.find("%s[@id='%s']" % (new_parsed.tag,new_parsed.attrib['id']))
        except KeyError:
            existing = None

        if existing is not None:
            existing.addnext(new_parsed)
            container.remove(existing)
        else:
            container.insert(0,new_parsed)

        # TODO: consider timestamping the CSS URL
        if made_container:
            new_content = tostring(container)
            return title,current_content + new_content

        return title,tostring(parsed)
Example #3
    def test_detokenize_single(self):
        src_tree = self._load()
        orig_src_tree = deepcopy(src_tree)

        tokenizer = HtmlTokenizer()
        html_tokens, tags = tokenizer.tokenize_single(src_tree)
        new_tree = tokenizer.cleanup_tree(src_tree)
        self.assertIn(b'__START_ORG__', tostring(src_tree))
        self.assertNotIn(b'__START_ORG__', tostring(new_tree))

        self.assertHtmlTreeEqual(
            new_tree,
            html_document_fromstring(UNANNOTATED_HTML)
        )

        html_tokens, _ = tokenizer.tokenize_single(new_tree)
        detokenized_tree = tokenizer.detokenize_single(html_tokens, tags)
        self.assertIn(b'__START_ORG__', tostring(detokenized_tree))

        self.assertHtmlTreeEqual(
            detokenized_tree,
            html_document_fromstring(ANNOTATED_HTML)
        )
        self.assertHtmlTreeEqual(detokenized_tree, orig_src_tree)
        self.assertHtmlTreeEqual(detokenized_tree, src_tree)
Example #4
    def get_items_from_page_num(self, num):
        url = self.WISHLIST_PAGE_TEMPLATE.format(
            wishlist_id=self.wishlist_id,
            page_number=num,
        )
        _LOG.debug("Fetch from: %s", url)

        wishlist_page = requests.get(url)
        wishlist_page_html = wishlist_page.text
        _PLAIN_ERROR_LOGGER.debug(wishlist_page_html)

        tree = html.fromstring(wishlist_page_html)
        all_h5_nodes = tree.xpath("//div[@class='a-row a-size-small']/h5")

        items = []
        for h5_node in all_h5_nodes:
            try:
                item = self._get_item_from_idea_h5_node(h5_node)
                if not item:
                    item = self._get_item_from_amazon_item_h5_node(h5_node)

                if item:
                    items.append(item)
                else:
                    _LOG.warn("Fail to retrieve an item for snippet")
                    _PLAIN_ERROR_LOGGER.warn("===== Start of snippet =====")
                    _PLAIN_ERROR_LOGGER.warn(html.tostring(h5_node))
                    _PLAIN_ERROR_LOGGER.warn("===== End of snippet =====")
            except ValueError as ex:
                _LOG.exception("Fail to retrieve an item: %s", ex)
                _PLAIN_ERROR_LOGGER.warn("===== Start of snippet =====")
                _PLAIN_ERROR_LOGGER.warn(html.tostring(h5_node))
                _PLAIN_ERROR_LOGGER.warn("===== End of snippet =====")

        return items
Example #5
File: base.py Project: dehao/news-diff
  def _decorate_article(self, article):
    """在 parse_response 後執行,後處理其輸出"""

    # html post-process
    from lxml.html import tostring, fromstring
    from bs4 import BeautifulSoup
    from lib.util.net import normalize_url
    from lib.util.text import pack_string

    # article['content'] may be a list of lxml DOMs
    if type(article['content']) is list:
      article['content'] = \
        fromstring('\n'.join([tostring(x, encoding=unicode) for x in article['content']]))

    # remove unwanted tags
    self.css_sel_drop_tree(article['content'], ['script'])

    # prettify html with BeautifulSoup
    html_bs4 = BeautifulSoup(tostring(article['content'], encoding=unicode)).body.next

    article['text'] = pack_string(html_bs4.text)
    article['html'] = pack_string(unicode(html_bs4))
    article["ctlr_classname"] = str(self.__class__)

    article['url'] = normalize_url(article['url'])
    article['url_read'] = normalize_url(article['url_read'])
    article['url_canonical'] = normalize_url(article['url_canonical'])

    self.move_out_of_meta(article, 'title')

    return article
Example #6
def get_services(category, trs):
    for tr in trs:
        print tostring(tr)
        tds = tr.cssselect("td")
    
        if len(tds) == 0:
            continue
        ahref = tds[0].cssselect("a")[0]
        link = ahref.attrib["href"]
        if category == 'Community Services':
            id = link.split('_')[0]
        elif category == 'Family Services':
            id = link.replace('result_detail.asp?externalId=', '')
        title = ahref.text_content()
        sub_category = tds[1].text_content()
        telephone_number = tds[2].text_content()
    
        print title, sub_category, telephone_number, link
    
        data = {
            'id' : id,
            'link' : base_url + link,
            'title' : title,
            'category' : category,
            'sub_category' : sub_category,
            'telephone_number' : telephone_number,
        }
        scraperwiki.sqlite.save(unique_keys=['id'], data=data)
Example #7
 def _read_version_history_html(self, forum_link):
     br = browser()
     br.set_handle_gzip(True)
     try:
         raw = br.open_novisit(forum_link).read()
         if not raw:
             return None
     except:
         traceback.print_exc()
         return None
     raw = raw.decode('utf-8', errors='replace')
     root = html.fromstring(raw)
     spoiler_nodes = root.xpath('//div[@class="smallfont" and strong="Spoiler"]')
     for spoiler_node in spoiler_nodes:
         try:
             if spoiler_node.getprevious() is None:
                 # This is a spoiler node that has been indented using [INDENT]
                 # Need to go up to parent div, then previous node to get header
                 heading_node = spoiler_node.getparent().getprevious()
             else:
                 # This is a spoiler node after a BR tag from the heading
                 heading_node = spoiler_node.getprevious().getprevious()
             if heading_node is None:
                 continue
             if heading_node.text_content().lower().find('version history') != -1:
                 div_node = spoiler_node.xpath('div')[0]
                 text = html.tostring(div_node, method='html', encoding='unicode')
                 return re.sub(r'<div\s.*?>', '<div>', text)
         except:
             if DEBUG:
                 prints('======= MobileRead Parse Error =======')
                 traceback.print_exc()
                 prints(html.tostring(spoiler_node))
     return None
Example #8
def get_article(url, mode=None):

    returnee = {}
    now = time.localtime()

    if not mode:
        agent = "Mozilla/5.0 (iPhone; U; CPU iPhone OS 3_0 like Mac OS X; en-us) AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0 Mobile/1A542a Safari/419.3"
        structure = requests.get(url, headers={"User-Agent": agent}, timeout=5.0)

    else:
        structure = mode

    charset = structure.encoding


    tree = html.fromstring(structure.text)
    body = tree.cssselect("div#ct")[0]

    title = body.cssselect("div.end_tt h2")[0]
    title.remove(title.cssselect("a")[0])

    returnee["title"] = st.refine_text(html.tostring(title), encoding=charset)

    returnee["name"] = st.refine_text(html.tostring(body.cssselect("div.end_tt p span a")[0]), encoding=charset)

    date = datetime.datetime.now()
    try:
        date = DATE.parse(st.refine_text(html.tostring(body.cssselect("div.end_tt p span.s_tm")[0]), encoding=charset))
    except Exception, e:
        pass
Example #9
  def get_branch_info(self):
    row={}
    row['date_scraped']=DATE
    trs=self.x.xpath('id("Centralcolum3_dtgGroup")/descendant::tr[td/*[self::span or self::strong]]')[:-1] #Skip the junk last row
    for tr in trs:
      tds=tr.xpath('td')
      if len(tds)==1:
        td=tds[0]
        if 2==td.xpath('count(span/b/text())'):
          row['loc1'],row['loc2']=[PostbankBrowser.compact(text) for text in td.xpath('span/b/text()')]
        else:
          log(tostring(td))

      elif len(tds)==2:
        cells=tr.xpath('td/*[self::span or self::strong]')
        key=cells[0].text
        value=cells[1].text

        # Rebinding a loop variable would not update key/value, so
        # normalize each one explicitly.
        key = "" if key is None else PostbankBrowser.compact(key)
        value = "" if value is None else PostbankBrowser.compact(value)

        row[key]=value
      else:
        raise self.TableRowError(tostring(tr))
    return row
Example #10
    def test_innerhtml(self):
        from mobilize.components import XPath
        html_str = '''<table><tr><td>Hello</td></tr></table>'''
        # test for innerhtml=False
        component_f = XPath('//td', idname='foo', innerhtml=False)
        component_f.extract(html.fromstring(html_str))
        extracted = component_f.process()
        extracted_str = html.tostring(extracted)
        expected = '<div class="mwu-elem" id="foo"><td>Hello</td></div>'
        e = normxml(expected)
        a = normxml(extracted_str)
        self.assertSequenceEqual(e, a)
        
        # test for innerhtml=True
        component_t = XPath('//td', idname='foo', innerhtml=True)
        component_t.extract(html.fromstring(html_str))
        extracted = component_t.process()
        extracted_str = html.tostring(extracted)
        expected = '<div class="mwu-elem" id="foo">Hello</div>'
        self.assertSequenceEqual(normxml(expected), normxml(extracted_str))
        
        # test for ineffectiveness of innerhtml=True with multiple matching elements
        component_t = XPath('//td', idname='foo', innerhtml=True)
        component_t.extract(html.fromstring('''
<table><tr>
<td>Hello</td>
<td>Goodbye</td>
</tr></table>
'''))
        extracted = component_t.process()
        extracted_str = html.tostring(extracted)
        expected = '<div class="mwu-elem" id="foo"><td>Hello</td><td>Goodbye</td></div>'
        self.assertSequenceEqual(normxml(expected), normxml(extracted_str))
Example #11
  def business_premises(self, session):
    s = session
    html = fromstring(s.get(self.business_premises_url()).content)
    premises = html.xpath('//table[@width="740" and @cellpadding="5"]')

    if len(html.xpath('//font[text()="No Business \n                      Premises Found."]')) == 1:
        data = []

    else:
        assert 1 == len(premises), tostring(html)
        trs = premises[0].cssselect('tr')
        datalists = [[td.text_content() for td in tr.cssselect('td')] for tr in trs]
        header = [key.replace(' ', '') for key in datalists.pop(0)]
        data = [dict(zip(header, row)) for row in datalists]
        for row in data:
            row.update({
                'date_scraped': DATE,
                'businessPremisesURL': self.business_premises_url()
            })

    registrant_data = {}
    for bodybold in html.cssselect('span.bodybold'):
        text = bodybold.xpath('following-sibling::span[@class="Text"][position()=1]/text()')
        assert len(text) == 1, tostring(html)
        registrant_data['bp_' + bodybold.text.replace(' ', '').replace(':', '')] = text[0].strip()

    return data, registrant_data
Example #12
def parse_book_file(href, book):
    block = {}
    book_tree = lxml.html.parse(join(books_dir, href), parser)
    if not 'page_count' in book:
        td = book_tree.xpath(
                "//td[descendant::*[contains(text(), '{}')]]".format(
                    book['title'])
                )
        if len(td):
            td = td[0]
            page_info = td.xpath("descendant::*[contains(text(), 'страниц')]")
            if len(page_info):
                book['page_count'] = patterns[0][1].search(
                        tostring(page_info[0], encoding='unicode')).groups()[0]

    block['annotation'] = book_tree.xpath(
            r"//table[descendant::*[contains(text(), 'Аннотация')]]")
    block['contents'] = book_tree.xpath(
            r"//table[descendant::*[contains(text(), 'Содержание')]]")
    for key in block:
        if len(block[key]):
            mark = block[key][-1]
            book[key] = ""
            for element in mark.itersiblings():
                if element.tag == "table":
                    break
                drop_a(element)
                remove_attr(element)
                book[key] += tostring(element, encoding='unicode')
            book[key] = tidy_fragment(clean(book[key]))[0]
    return book
Example #13
def WestBradfordGrill(clas):
  resp = l2.urlopen(clas.url)
  data = resp.read()
  #tappath = '//div[@id="sidebar-left-1"]/div[@id="TextList1"]//li'
  #doc = lxml.html.fromstring(data)
  a = lxml.html.fromstring(data)
  print tostring(a)
Example #14
def download_user_review(url):
  try:
    f = get_url(url)

    page = html.parse(f)
    root = page.getroot()

    if len(root.cssselect("div.error404")) > 0:
      #print url + " 404'ed"
      return {}

    meta = html.tostring(root.cssselect("#player_review div.body div.user_reviews")[0])
    #@TODO parse meta
    if len(root.cssselect("#player_score_details div.body dl.review_details")) > 0:
      score_details = html.tostring(root.cssselect("#player_score_details div.body dl.review_details")[0])
    else:
      score_details = "No Details"
    body = html.tostring(root.cssselect("#player_review_body")[0])

    ret = {}
    ret['meta'] = meta
    ret['score_details'] = score_details
    ret['body'] = body
    #@TODO parse body
    ret['url'] = url
    return ret

    #ipdb.set_trace()
  except:
    traceback.print_exc()
    gmail.send("exception!", "*****@*****.**")
    ipdb.set_trace()
Example #15
def buildStatesFromTokenizerElement(state_name, html_snippet, sm):
    print __b(state_name), "in process"
    if state_name == "tokenizing-character-references":
        return

    dom = lhtml.fromstring(html_snippet)
    switch = dom.cssselect("dl")
    if not switch:
        raise Exception("%s does not have <dl> (switch)" % state_name)
    if len(switch) > 1:
        print __em("%s have too many <dl> (switch)" % state_name)
    switch = switch[0]

    transitions = []

    for elmt in switch:
        if elmt.tag not in ("dt", "dd"):
            continue
        if elmt.tag == "dt":
            dt_text = elmt.text
            if not dt_text:
                dt_text = lhtml.tostring(elmt)
            transitions.append(dt_text)
        elif elmt.tag == "dd":
            # We consume the transitions to jump into the
            # specified state
            buildSwitchTransitions(state_name, sm, switch=transitions, to=lhtml.tostring(elmt))
            transitions = []
Example #16
def get_meal_info2():
    url = "https://zerocater.com/menu/uVGcXhj/"
    tree = html.fromstring(urllib.urlopen(url).read())
    meals = tree.xpath('//div[@class="inherit-height meal-item"]')
    i = 0
    for meal in meals:
        today = meal.xpath('.//span[@class="meal-is-today label"]/text()')
        if len(today) > 0:
            break

        date = meal.xpath('.//h4[@class="overview-time"]/text()')
        date_string = get_string(date)
        date_string = ' '.join(date_string.split())
        day = int(date_string[-2:].strip())
        print day
        print date_string
        today = datetime.utcnow() - timedelta(hours=7)
        print today.day
        if today.day <= day:
            break

        i = i + 1

    meal_today = meals[i]
    vendor = meal_today.xpath('.//div[@class="overview-wrapper"]')[0]
    menu = meal_today.xpath('.//ul[@class="list-group swiper-no-swiping"]')[0]
    data = {}
    data['overview'] = html.tostring(vendor)
    data['menu'] = html.tostring(menu)
    print data
    return data
Example #17
def create_html():
    """Creates the html for the page."""

    table = create_html_table(DISCOVERY_LIST)

    root = LH.tostring(table) #convert the generated HTML to a string

    comment = ET.Comment(TITLE)
    comment = ET.tostring(comment)

    script = ELEMENT.Script(SCRIPT, type="text/x-mathjax-config")
    script = LH.tostring(script)

    script2 = ELEMENT.Script(src=SCRIPT_URL, type="text/javascript")
    script2 = LH.tostring(script2)

    script3 = ELEMENT.Script(src=SORT_SCRIPT, type="text/javascript")
    script3 = LH.tostring(script3)


    email = ELEMENT.A(EMAIL, href=MAILTO)
    paragraph = ELEMENT.P(PARAGRAPH, email, ".")
    date_time = 'Updated ' + time.strftime('%Y-%m-%d %H:%M:%S')
    date_time = ELEMENT.I(date_time)
    paragraph.append(date_time)
    paragraph = LH.tostring(paragraph)
    paragraph = re.sub(r' \.', '.', paragraph)

    root = comment + script + script2 + script3 + paragraph + root

    soup = BS(root)         #make BeautifulSoup
    out = soup.prettify()   #prettify the html

    return out
Example #18
def html_chenger(li):

    if type(li) in [str, unicode, int, float]:
        return li

    if type(li) is html.HtmlElement:
        li = html.tostring(li)
        return li

    if type(li) is list and len(li) == 1:
        li = li[0]
        if type(li) is html.HtmlElement:
            li = html.tostring(li)
        return li

    if li == list():
        return ''

    if type(li) is list:
        for i, el in enumerate(li):
            if type(el) in [str, unicode, int, float]:
                continue
            elif type(el) is html.HtmlElement:
                li[i] = html.tostring(el)
        return '; '.join(li)
    return None
Example #19
def _get_image(doc):

    image_html = ''

    list = [l for l in extract_patterns.split('|')]

    """for i in list:
        p = i.split(':')
        pattern = Pattern(p[0], p[1])
        patterns.append(pattern)"""

    patterns = [Pattern(i.split(':')[0], i.split(':')[1]) for i in list]

    for p in patterns:
        try:
            if p.pattern_type == 'id':
                d = doc.get_element_by_id(p.pattern_value)
                image_html = html.tostring(d).strip()
                if len(image_html) > 0 and _image_count(image_html):
                    break
            elif p.pattern_type == 'class':
                d = doc.find_class(p.pattern_value)

                if d:
                    image_html = html.tostring(d[0])
                    if len(image_html) > 0 and _image_count(image_html) > 0:
                        break
        except Exception, ex:
            continue
Example #20
    def _parse(self, body):
        q = {}
        doc = HTML.fromstring(body)
        qbox = doc.xpath("//*[@id=\"question-box\"]")[0]
        qtitle = qbox.xpath(".//h1[@id='question-title']//span")[1]
        qbody = qbox.xpath(".//*[@id=\"question-content\"]")[0]
        #get question
        q["title"] = qtitle.text_content()
        self.title = q["title"]
        self.rkeys.append(self.title)
        q["body"] = qbody.text_content()

        anwsers = [None, None]  # 0: best answer, 1: recommended answer, 2+: other answers
        '''get best answer'''
        bae = doc.xpath("//*[@id='best-answer-panel']")
        if bae:
            ba = bae[0].xpath(".//*[@class='content']")[0]
            ba = HTML.tostring(ba,encoding="utf-8")
            anwsers[0] = ba

        '''get recommended answer'''
        rae = doc.xpath("//*[@id='recommend-answer-panel']")
        if rae:
            ra = rae[0].xpath(".//*[@class='content']")[0]
            ra = HTML.tostring(ra,encoding="utf-8")
            anwsers[1] = ra

        '''get other answers'''
        oae = doc.xpath("//*[@id='reply-panel']")
        if oae:
            aes = oae[0].xpath(".//*[@class='content']")
            for aei in aes:
                anwsers.append(HTML.tostring(aei,encoding="utf-8"))
        q["anwsers"] = anwsers
        return q
Example #21
def LoadFP():
	check = getURL(ROOT_URL, False)

	if check[1] != {None:None}:
		# Needed Authentication ctTV-Main Page
		ctTV_Main = HTML.ElementFromURL(ROOT_URL, headers=check[1], cacheTime=0, encoding="Latin-1", errors="ignore")
	else:
		ctTV_Main = HTML.ElementFromURL(ROOT_URL, cacheTime=0, encoding="Latin-1", errors="ignore")

	# Read a string version of the page
	ctTV_MainString = cleanHTML(urllib2.urlopen(check[0]).read())

	# Get some MAIN Meta-Data of c't TV:
	mainTitle = ctTV_Main.xpath("/html/body/div[@id='navi_top']/div[1]/ul[1]/li[2]/a")[0]
	mainTitle = tostring(mainTitle).split('">')[1][:-4].replace('<span>','').replace('</span>','').encode('Latin-1').decode('utf-8')
	mainSubtitle = ctTV_Main.xpath("/html/body/div[@id='navi_top']/div[1]/ul[3]/li[4]/a")[0].text.encode('Latin-1').decode('utf-8')

	# Define current video
	currentVideoTitle1 = ctTV_Main.xpath("//*[@id='hauptbereich']/div[@id='video']/h1/text()")[0].encode('Latin-1').decode('utf-8')
	currentVideoTitle2 = ctTV_Main.xpath("//*[@id='hauptbereich']/div[@id='video']/h1")[0]
	currentVideoTitle2 = tostring(currentVideoTitle2).split('|')[1].split('<')[0].encode('Latin-1').decode('utf-8')
	currentVideoTitle = currentVideoTitle1 + '|' + currentVideoTitle2
	currentVideoURL = ROOT_URL

	themes = getThemes(ctTV_Main)
	topics = getTopics(ctTV_Main)
	archive = getArchive(ctTV_MainString)

	return (mainTitle, mainSubtitle, currentVideoTitle, currentVideoURL, themes, topics, archive)
Example #22
    def replace_terms(html):
        html = force_text(html)
        remove_body = False
        remove_p = False
        etree = parse(StringIO(html))
        root_node = etree.getroot()
        if not _looks_like_full_html_unicode(html):
            root_node = root_node.getchildren()[0]
            remove_body = True
            if root_node.getchildren()[0].tag == 'p' and html[:3] != '<p>':
                remove_p = True

        variants_dict = Term.objects.variants_dict()
        replace_dict = Term.objects.replace_dict()
        replace_regexp = Term.objects.replace_regexp()
        replace_regexp__sub = replace_regexp.sub
        translate = get_translate_function(replace_dict, variants_dict)

        for node in get_interesting_contents(root_node, replace_regexp):
            new_content = replace_regexp__sub(
                translate, tostring(node, encoding='unicode'))
            new_node = parse(StringIO(new_content)).getroot().getchildren()[0]
            if node.tag != 'body':
                new_node = new_node.getchildren()[0]
            node.getparent().replace(node, new_node)

        if remove_body:
            if remove_p:
                root_node = root_node.getchildren()[0]
            out = root_node.text or ''
            out += ''.join([tostring(node, encoding='unicode')
                            for node in root_node.getchildren()])
            return out
        return tostring(etree, encoding='unicode')
Example #23
File: scrapyelp.py Project: hseran/IAR
def myparser(reviewObj, element):
	populateReviewerInfo(reviewObj, element)

	# date
	tempList = element.cssselect('.review-meta .date')
	date = ''
	if (len(tempList) > 0):
		date = html.tostring(tempList[0], method='text', encoding=unicode).strip()

	reviewObj.setReviewDate(date)

	
	# comment
	tempList = element.cssselect('.externalReview .review_comment')
	comment = ''
	if (len(tempList) > 0):
		tempElement = html.fragment_fromstring(html.tostring(tempList[0]).replace('<br>', ' ').replace('<br/>', ' ').replace('<BR>', ' ').replace('<BR/>', ' '))
		comment = html.tostring(tempElement, method='text', encoding=unicode).strip()
		
	reviewObj.setReviewText(comment)

	#rating
	tempList = element.cssselect('.externalReview .review-meta .rating meta')
	rating = ''
	if (len(tempList) > 0):
		rating = tempList[0].get('content')
    
	reviewObj.setReviewRating(rating)
Example #24
def copy_chapters_across_with_fixes(chapter_info, fixed_toc):
    comments_html = open('disqus_comments.html').read()
    buy_book_div = html.fromstring(open('buy_the_book_banner.html').read())
    analytics_div = html.fromstring(open('analytics.html').read())
    load_toc_script = open('load_toc.js').read()

    for chapter in CHAPTERS:
        old_contents = open(chapter).read()
        new_contents = fix_xrefs(old_contents, chapter, chapter_info)
        new_contents = fix_title(new_contents, chapter, chapter_info)
        parsed = html.fromstring(new_contents)
        body = parsed.cssselect('body')[0]
        if parsed.cssselect('#header'):
            head = parsed.cssselect('head')[0]
            head.append(html.fragment_fromstring('<script>' + load_toc_script + '</script>'))
            body.set('class', 'article toc2 toc-left')
        body.insert(0, buy_book_div)
        body.append(html.fromstring(
            comments_html.replace('CHAPTER_NAME', chapter.split('.')[0])
        ))
        body.append(analytics_div)
        fixed_contents = html.tostring(parsed)

        with open(DEST / chapter, 'w') as f:
            f.write(fixed_contents.decode('utf8'))
        with open(DEST / 'toc.html', 'w') as f:
            f.write(html.tostring(fixed_toc).decode('utf8'))
Example #25
    def results(self, query, pages_max=1):
        for page in range(1, pages_max + 1):
            if page > 1:
                if not self._next(page):
                    break
            else:
                self.browser.submit_form(self.url, fields={'q': query})

            for li in self.browser.cssselect('li.g', []):
                log = html.tostring(li, pretty_print=True)[:1000]

                links = li.cssselect('a')
                if not links:
                    logger.error('failed to get links from %s', log)
                    continue
                url = links[0].get('href')
                if not url or not urlparse(url).scheme:
                    continue
                title = clean(self.get_link_text(html.tostring(links[0])))
                if not title:
                    continue
                yield {
                    'title': title,
                    'url': url,
                    'page': page,
                    }
Example #26
def expect_redirect(step, from_url, to_url):
    """Go to a url and expect a 302, check the DOM returned == expected DOM.

    Bit weird this one, and might not be useful. The :py:data:`to_url` you are
    expecting to eventually reach (via the :py:data:`from_url`) is first hit
    in the normal fashion and the DOM is saved to a string. The
    :py:data:`from_url` is then hit, checked for a 302 and the eventual DOM is
    compared to the stored one.

    If Selenium is used, the :py:data:`from_url` is hit, and waited for as per
    :py:func:`access_url`.

    """
    step.given('I access the url "%s"' % to_url)
    expected_dom_str = html.tostring(world.dom)
    response = world.browser.get(from_url)
    code = response.status_code
    assert code == 302, \
        "Failed to get a 302 for %s, got %s" % (from_url, code)

    response = world.browser.get(from_url, follow=True)
    world.dom = html.fromstring(response.content)
    world.templates = [t.name for t in response.template]

    assert html.tostring(world.dom) == expected_dom_str, \
        "Expected DOM doesn't match redirected DOM"

    if world.using_selenium:
        world.sel.open(from_url)
        world.sel.wait_for_page_to_load(world.timeout)
Example #27
File: app.py Project: maralla/dict
def word_def(word):
    word = word.lower()
    phrase = '-'.join(word.split())
    words = [word, phrase, '%s_1' % word, '%s_1' % phrase]
    cursor = db.words.find({'word': {'$in': words}})
    if cursor.count():
        w = cursor.next()
        return jsonify(word=word, content=w['content'], related=w['related'])
    try:
        word_define = urlopen('%s/dictionary/%s' % (app.config['URL'], word))
    except:
        abort(404)
    doc = etree.HTML(word_define.read())
    if '/spellcheck/?q' in word_define.url:
        content = polish(tostring(doc.xpath(
            "/html/body/div[@id='ox-container']"
            "/div[@id='ox-wrapper']/div[@id='main_column']")[0])).strip()
        related = '#'
    else:
        contentElem = doc.xpath(
            "/html/body/div[@id='ox-container']/div[@id='ox-wrapper']"
            "/div[@id='main_column']/div[@id='main-container']"
            "/div[@id='entryContent']")[0]
        content = polish(tostring(contentElem)).strip()
        related = polish(tostring(doc.xpath(
            "/html/body/div[@id='ox-container']/div[@id='leftcolumn']"
            "/div[@id='relatedentries']")[0])).strip()
        thread = DocParseThread(contentElem, content, related)
        thread.start()
        g.thread = thread
    return jsonify(word=word, content=content, related=related)
Example #28
def get_forms(request):
    
    # take passed url, use regular expression to capture domain
    def get_store_name(url):
        store_name = re.search(r'http://www.(\w*).', url).group(1)
        return store_name
            
    if request.is_ajax():
        product_url = request.POST['product_url']
        store_name = get_store_name(product_url)
        form_list = []
                                
        # setup the browser object
        b = mechanize.Browser()
        b.set_handle_robots(False)
        b.set_proxies({'http': 'api.crawlera.com'})
        b.add_proxy_password("jquintal","we8GeegieR")
        
        # fetch page and open in lxml
        b_response = b.open(product_url)
        html = b_response.read()
        tree = lh.fromstring(html)
        
        # fetch forms    
        if(store_name == "target"):
            forms = tree.cssselect('.order-item')
            for form in forms:
                form_list.append(lh.tostring(form))

        elif(store_name == "radioshack"):
            # GET RADIO SHACK FORMS
            # NOTE: RadioShack appears to have no forms other than quantity, meaning: they list all product variants as separate entries EX: Beats Pill Blue, Beats Pill Red, etc.
            pass
        
        elif(store_name == "amazon"):
            # GET AMAZON FORMS
            pass

        elif(store_name == "toysrus"):
            forms = tree.cssselect('#buyInterior')
            for div in forms:
                form_list.append(lh.tostring(div))
                
        elif(store_name == "tigerdirect"):
            forms = tree.cssselect('.prodAction')
            for form in forms:
                form_list.append(lh.tostring(form))
                
        elif(store_name == "overstock"):
            forms = tree.cssselect('#addCartWrap_addCartMain')
            for form in forms:
                form_list.append(lh.tostring(form))
                
        elif(store_name == "newegg"):
            # NOT YET WORKING
            forms = tree.cssselect('.grpQty')
            for form in forms:
                form_list.append(lh.tostring(form))
                                                  
        return render_to_response('cart/store_forms.html', {'forms': form_list})
Example #29
def copy_chapters_across_fixing_xrefs(chapter_info, fixed_toc):
    comments_div = html.fromstring(open('disqus_comments.html').read())
    buy_book_div = html.fromstring(open('buy_the_book_banner.html').read())
    analytics_div = html.fromstring(open('analytics.html').read())
    load_toc_script = open('load_toc.js').read()

    for chapter in CHAPTERS:
        new_contents = fix_xrefs(chapter, chapter_info)
        parsed = html.fromstring(new_contents)
        body = parsed.cssselect('body')[0]
        if parsed.cssselect('#header'):
            head = parsed.cssselect('head')[0]
            head.append(html.fragment_fromstring('<script>' + load_toc_script + '</script>'))
            body.set('class', 'article toc2 toc-left')
        body.insert(0, buy_book_div)
        body.append(comments_div)
        body.append(analytics_div)
        fixed_contents = html.tostring(parsed)

        target = os.path.join('/home/harry/workspace/www.obeythetestinggoat.com/content/book', chapter)
        with open(target, 'w') as f:
            f.write(fixed_contents.decode('utf8'))
        toc = '/home/harry/workspace/www.obeythetestinggoat.com/content/book/toc.html'
        with open(toc, 'w') as f:
            f.write(html.tostring(fixed_toc).decode('utf8'))
Example #30
def download_user_review(url):
  try:
    if url.find("http://www.gamespot.com") == -1:
      url = "http://www.gamespot.com" + url
    f = urllib.urlopen(url)
  except:
    traceback.print_exc()
    ipdb.set_trace()

  try:
    page = html.parse(f)
    root = page.getroot()

    meta = html.tostring(root.cssselect("#player_review div.body div.user_reviews")[0])
    if len(root.cssselect("#player_score_details div.body dl.review_details")) > 0:
      score_details = html.tostring(root.cssselect("#player_score_details div.body dl.review_details")[0])
    else:
      score_details = "No Details"
    body = html.tostring(root.cssselect("#player_review_body")[0])

    ret = {}
    ret['meta'] = meta
    ret['score_details'] = score_details
    ret['body'] = body
    return ret

    #ipdb.set_trace()
  except:
    traceback.print_exc()
    ipdb.set_trace()
Example #31
File: _base.py Project: simudream/spyne
 def any_html_to_string(self, cls, value):
     return html.tostring(value)
Example #32
with open('./rebuild.sh') as f:
    lines = f.readlines()[2:]

t = [x.strip('# \n') for x in lines[-12::2]]
dic = [(x.split('|')[0].decode('utf-8'), x.split('|')[1]) for x in t]
dic = dic[::-1]

with open('./html/index.html') as f:
    root = fromstring(f.read())

ul = root.xpath('//ul[@id="toc"]')[0]
lis = ul.xpath('./li')

# remove the old <li> elements
for li in lis:
    li.getparent().remove(li)
# generate new <li> elements
for name, date in dic:
    li = etree.Element('li')
    a = etree.Element('a', href='./' + name + '.html')
    a.text = name
    span = etree.Element('span')
    span.set("class", 'time')
    span.text = date
    a.append(span)
    li.append(a)
    ul.append(li)

with open('./html/index.html', 'wb') as f:
    f.write(tostring(root, encoding='unicode').encode('utf-8'))
Example #33
def extract_from_html(msg_body):
    """
    Extract not quoted message from provided html message body
    using tags and plain text algorithm.

    Cut out the 'blockquote', 'gmail_quote' tags.
    Cut Microsoft quotations.

    Then use plain text algorithm to cut out splitter or
    leftover quotation.
    This works by adding checkpoint text to all html tags,
    then converting html to text,
    then extracting quotations from text,
    then checking deleted checkpoints,
    then deleting necessary tags.
    """

    if msg_body.strip() == '':
        return msg_body

    # Fix bad HTML caused by weird encoding issues
    if msg_body.count('=3D') > 2:
        # it's unlikely this was intentional
        msg_body = msg_body.replace('=3D', '=')
        # also get rid of trailing equals; in doing so, strip newlines
        # as there may have been spurious ones inserted in the middle of tags
        msg_body = msg_body.replace('=\n', '')

    try:
        html_tree = html.document_fromstring(
            msg_body,
            parser=html.HTMLParser(encoding="utf-8")
        )
    except etree.ParserError:
        # Malformed HTML, don't try to strip.
        return msg_body

    cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or
                      html_quotations.cut_blockquote(html_tree) or
                      html_quotations.cut_microsoft_quote(html_tree) or
                      html_quotations.cut_by_id(html_tree) or
                      html_quotations.cut_from_block(html_tree)
                      )

    html_tree_copy = deepcopy(html_tree)

    number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0)
    quotation_checkpoints = [False for i in xrange(number_of_checkpoints)]
    msg_with_checkpoints = html.tostring(html_tree)

    h = html2text.HTML2Text()
    h.body_width = 0  # generate plain text without wrap

    # html2text adds unnecessary star symbols. Remove them.
    # Mask star symbols
    msg_with_checkpoints = msg_with_checkpoints.replace('*', '3423oorkg432')
    plain_text = h.handle(msg_with_checkpoints)
    # Remove created star symbols
    plain_text = plain_text.replace('*', '')
    # Unmask saved star symbols
    plain_text = plain_text.replace('3423oorkg432', '*')

    delimiter = get_delimiter(plain_text)

    plain_text = preprocess(plain_text, delimiter, content_type='text/html')
    lines = plain_text.splitlines()

    # Don't process too long messages
    if len(lines) > MAX_LINES_COUNT:
        return msg_body

    # Collect checkpoints on each line
    line_checkpoints = [
        [int(i[4:-4])  # Only checkpoint number
         for i in re.findall(html_quotations.CHECKPOINT_PATTERN, line)]
        for line in lines]

    # Remove checkpoints
    lines = [re.sub(html_quotations.CHECKPOINT_PATTERN, '', line)
             for line in lines]

    # Use plain text quotation extracting algorithm
    markers = mark_message_lines(lines)
    return_flags = []
    process_marked_lines(lines, markers, return_flags)
    lines_were_deleted, first_deleted, last_deleted = return_flags

    if lines_were_deleted:
        #collect checkpoints from deleted lines
        for i in xrange(first_deleted, last_deleted):
            for checkpoint in line_checkpoints[i]:
                quotation_checkpoints[checkpoint] = True
    else:
        if cut_quotations:
            return html.tostring(html_tree_copy)
        else:
            return msg_body

    # Remove tags with quotation checkpoints
    html_quotations.delete_quotation_tags(
        html_tree_copy, 0, quotation_checkpoints
    )

    return html.tostring(html_tree_copy)
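The checkpoint technique from the docstring can be shown in miniature. The sketch below is illustrative only: the marker attribute and the hard-coded quotation decision stand in for the real text-based analysis.

# Toy version of the checkpoint idea, assuming only lxml.
from lxml import html

tree = html.fromstring('<div><p>reply</p><blockquote>quoted text</blockquote></div>')
for n, el in enumerate(tree.iter()):
    el.set('data-checkpoint', str(n))  # hypothetical marker attribute

# The plain-text pass would flag the quoted line; here the decision is
# hard-coded and the flagged element is deleted from the tree.
for el in tree.xpath('//blockquote'):
    el.drop_tree()

print(html.tostring(tree, encoding='unicode'))
# <div data-checkpoint="0"><p data-checkpoint="1">reply</p></div>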
Example #34
        new_page = requests.get(notebook_page)
        current_status = new_page.status_code

        if current_status != 404:
            # Get image and text if any
            new_tree = html.fromstring(new_page.content)

            if new_tree.xpath('count(//textarea)') != 0:
                page_text = new_tree.xpath('//textarea/text()')

                with open(new_path + 'data/' + page_number + ".txt", "w") as d:
                    d.write(page_text[0].encode("utf-8"))

                d.close()

            if new_tree.xpath('count(//table/tr/th[2])') != 0:
                page_image = html.tostring(
                    new_tree.xpath('//table/tr/th[2]')[0])
                # slice out the .jpg path between src=" and the extension
                src_start = page_image.find("src=") + 5
                jpg_end = page_image.find("jpg", page_image.find("src=")) + 3
                image_text = page_image[src_start:jpg_end]

                r = requests.get(root_page + notebook_links[count] +
                                 image_text)

                i = Image.open(StringIO(r.content))
                i.save(new_path + 'images/' + image_text)

    count += 1
Example #35
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

__author__ = 'ipetrash'


from lxml import html
root = html.fromstring('<p>Hello<br>world!</p><br>')

print(html.tostring(root))                      # b'<div><p>Hello<br>world!</p><br></div>'
print(html.tostring(root, encoding='unicode'))  # <div><p>Hello<br>world!</p><br></div>
print(html.tostring(root, pretty_print=True))   # b'<div>\n<p>Hello<br>world!</p>\n<br>\n</div>\n'
print(html.tostring(root, encoding='unicode', pretty_print=True))
# <div>
# <p>Hello<br>world!</p>
# <br>
# </div>
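One more serialization option worth knowing, using the same root as above: method='text' drops the markup entirely.

print(html.tostring(root, method='text', encoding='unicode'))  # Helloworld!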
Example #36
def inner_html(node):
    """
    get original html from a node
    """
    return (node.text or '') + ''.join(
        [html.tostring(child) for child in node.iterchildren()])
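This snippet predates Python 3: html.tostring returns bytes by default, so the ''.join above fails there. A minimal Python 3-safe sketch of the same idea, passing encoding='unicode' (the function name is ours):

from lxml import html

def inner_html_py3(node):
    # encoding='unicode' makes tostring return str; each child's tail text is included
    return (node.text or '') + ''.join(
        html.tostring(child, encoding='unicode') for child in node.iterchildren())

node = html.fromstring('<div>Hi <b>there</b>!</div>')
print(inner_html_py3(node))  # Hi <b>there</b>!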
Example #37
def get_html_from_element(element):
    return tostring(element)
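As in the previous example, tostring returns bytes unless told otherwise; a str-returning variant (the helper name is ours, not the original project's) would be:

def get_html_from_element_text(element):
    # encoding='unicode' yields str instead of bytes
    return tostring(element, encoding='unicode')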
Example #38
def semanticize(doc_path='test.html'):
    """
    P: unbroken set of lines (.t divs) of the same look make one <p>
    H1-3: Top 3 kinds of font size are turned to h1, h2 and h3.
    TABLE: use x and y position to indicate <td>, TODO: colspan support
    """
    print(doc_path)
    dom, dimensions = prepare(doc_path)
    get_dimension = lambda el, dim_type: dimensions[dim_type].get(
        classN(dim_type, el)) or 0

    # recover text from embedded fonts with bad CMAPS if > 50% of characters are unicode PUA
    recover = pua_content(dom.text_content()) > 0.5
    if recover:
        print('Recovery needed, not now.')
        return
        recover_text(dom, os.path.dirname(doc_path))

    # remove paging headers
    if REMOVE_HEADERS:
        dom = remove_headers(dom)

    # remove javascript holders
    for div in dom.cssselect('.j'):
        remove(div)

    if TABLES:
        table_data = grid_data(dom, get_dimension)
        dom = reconstruct_tables(dom, table_data)

    h_levels = heading_levels(dom, dimensions)

    # line by line analysis and conversion
    p_look = p_height = p_space = p_tag = box = 0

    for l in dom.cssselect('.t'):
        # Gather information about this line to see if it's part of a block.
        # 1. detect change of look - different css classes from previous line
        look = ' '.join([
            c for c in l.attrib['class'].split()
            if c[0] != 'y' and c[0:2] != 'fc'
        ])  # ignore y pos and font color
        new_look = p_look != look
        # 2. detect change of margin height - larger difference in bottom position from previous line
        height = get_dimension(l, 'h')
        line_height = p_height - height
        margin = line_height > MAX_LINE_HEIGHT
        # 3. space above - preceding empty line
        space = not l.text_content().strip()

        # Based on collected info: does this line belong to previous line?
        append = new_look == p_space == margin == False

        txt = l.text_content()

        tag = 'p'

        # LI
        indent = 'x0' not in look  # there is some indentation
        if [1 for b in BULLETS if txt.startswith(b)]:
            tag = 'li'
            append = 0
        elif indent and p_tag == 'li':
            tag = 'li'
            append = 1
        # H1, H2...
        size = classN('fs', l)
        if size in h_levels.keys():
            append = 0
            tag = 'h%s' % h_levels[size]

        # merge multiline-elements
        if txt.strip():
            if append:
                if BR: box.append(Element('br'))
                box.append(l)
            else:
                box = l
                l.tag = tag
        else:
            remove(l)

        if DEBUG:
            mark = ('<%s>' % tag).ljust(5)
            if append: mark = 5 * ' '
            print(' Aa %d    ⇪ %d    ⇕ % 3d    %s    %s    %s' %\
                (new_look, p_space, line_height, l.attrib['class'].ljust(40), mark, txt))

        # save current values for comparison in the next loop iteration
        p_space, p_height, p_look, p_tag = space, height, look, tag

    wrap_set(dom, 'li', 'ul')

    if STRIP_CSS:
        for e in dom.cssselect("style"):
            remove(e)
        for attr in 'style id class data-page-no data-data'.split():
            for e in dom.cssselect("*"):
                try:
                    del e.attrib[attr]
                except KeyError:
                    pass

    # save file
    html = tostring(dom, encoding=ENCODING, pretty_print=True).decode(ENCODING)
    s = '<!DOCTYPE html>' + html
    for a, b in REPLACE_AFTER:
        s = re.sub(a, b, s)
    for rm in REMOVE_AFTER:
        s = re.sub(rm, '', s)
    for b in BULLETS:
        s = s.replace(b, '')
    if recover:
        for rm in REMOVE_BEFORE:
            s = re.sub(rm, '', s)

    # New file is .htm, not .html
    save_path = doc_path[:-1]
    f = open(save_path, 'w', encoding=ENCODING)
    f.write(s)
    f.close()
Example #39
            pre = content.makeelement('pre', {'class': 'converted-comment'})
            pre.text = c.text
            c.getparent().replace(c, pre)
        else:
            logger.warn('Removing commment')
            c.getparent().remove(c)

    # Convert style="text-align: right" to class
    for tag in content.xpath("//*[starts-with(@style, 'text-align: right')]"):
        logger.debug('Converting "text-align: right" to class')
        del tag.attrib['style']
        tag.attrib['class'] = 'text-right'

    # Convert style="text-align: center" to class
    for tag in content.xpath("//*[starts-with(@style, 'text-align: center')]"):
        logger.debug('Converting "text-align: center" to class')
        del tag.attrib['style']
        tag.attrib['class'] = 'text-center'

    # Check for missed style attributes
    for tag in content.xpath("//*[@style]"):
        logger.warn('Found remaining style attribute')
        sys.exit('Giving up')

    chapter = Chapter()
    chapter.html = HTML(html.tostring(content), encoding='utf-8')
    chapter.title = page['title']
    book.sections.append(chapter)

book.make(book.title + '.epub')
Example #40
def examine_meta(tree):
    '''Search meta tags for relevant information'''
    metadata = dict.fromkeys(METADATA_LIST)
    # bootstrap from potential OpenGraph tags
    title, author, url, description, site_name = extract_opengraph(tree)
    # test if all return values have been assigned
    if all((title, author, url, description,
            site_name)):  # if they are all defined
        metadata['title'], metadata['author'], metadata['url'], metadata[
            'description'], metadata[
                'sitename'] = title, author, url, description, site_name
        return metadata
    tags = []
    # skim through meta tags
    for elem in tree.iterfind('.//head/meta[@content]'):
        # content
        if not elem.get('content'):
            continue
        content_attr = elem.get('content')
        # image info
        # ...
        # property
        if 'property' in elem.attrib:
            # no opengraph a second time
            if elem.get('property').startswith('og:'):
                continue
            if elem.get('property') == 'article:tag':
                tags.append(content_attr)
            elif elem.get('property') in ('author', 'article:author'):
                if author is None:
                    author = content_attr
        # name attribute
        elif 'name' in elem.attrib:
            name_attr = elem.get('name').lower()
            # author
            if name_attr in ('author', 'byl', 'dc.creator', 'dcterms.creator',
                             'sailthru.author'):  # twitter:creator
                if author is None:
                    author = content_attr
            # title
            elif name_attr in ('title', 'dc.title', 'dcterms.title',
                               'fb_title', 'sailthru.title', 'twitter:title'):
                if title is None:
                    title = content_attr
            # description
            elif name_attr in ('description', 'dc.description',
                               'dcterms.description', 'dc:description',
                               'sailthru.description', 'twitter:description'):
                if description is None:
                    description = content_attr
            # site name
            elif name_attr in ('publisher', 'dc.publisher',
                               'dcterms.publisher', 'twitter:site',
                               'application-name'
                               ) or 'twitter:app:name' in elem.get('name'):
                if site_name is None:
                    site_name = content_attr
            # url
            elif name_attr == 'twitter:url':
                if url is None and validate_url(content_attr)[0] is True:
                    url = content_attr
            # keywords
            elif name_attr == 'keywords':  # 'page-topic'
                tags.append(content_attr)
        elif 'itemprop' in elem.attrib:
            if elem.get('itemprop') == 'author':
                if author is None:
                    author = content_attr
            elif elem.get('itemprop') == 'description':
                if description is None:
                    description = content_attr
            elif elem.get('itemprop') == 'headline':
                if title is None:
                    title = content_attr
            # to verify:
            #elif elem.get('itemprop') == 'name':
            #    if title is None:
            #        title = elem.get('content')
        # other types
        else:
            if 'charset' not in elem.attrib and 'http-equiv' not in elem.attrib and 'property' not in elem.attrib:
                LOGGER.debug(
                    html.tostring(elem, pretty_print=False,
                                  encoding='unicode').strip())
    metadata['title'], metadata['author'], metadata['url'], metadata[
        'description'], metadata['sitename'], metadata[
            'tags'] = title, author, url, description, site_name, tags
    return metadata
Example #41
import scraperwiki
from mechanize import Browser
from BeautifulSoup import BeautifulSoup
from lxml import html
import time

mech = Browser()
url = "http://www.bnm.gov.my/index.php?ch=12&pg=852"
page = mech.open(url)
html1 = page.read()
tree = html.fromstring(html1)
table, = tree.xpath('//*[.="Tenure"]/ancestor::table[1]')
soup1 = BeautifulSoup(html.tostring(table))
table = soup1.find("table")
now = time.time()
for row in table.findAll('tr')[1:]:
    col = row.findAll('td')
    data = {
        'time': now,
        'Tenure': col[0].string,
        'Buying': col[1].string,
        'Selling': col[2].string
    }
    scraperwiki.sqlite.save(unique_keys=['time'], data=data)
    now = now + 1
Example #42
def sanitize_html(html):
    html = html5parser.fragment_fromstring(html, create_parent="div")
    html = cleaner.clean_html(tostring(html)).decode()
    return html
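The snippet assumes module-level imports and a configured cleaner. A minimal sketch of that setup, with assumed Cleaner options (the original project's configuration is not shown; in recent lxml versions the cleaner lives in the separate lxml_html_clean package):

from lxml.etree import tostring
from lxml.html import html5parser
from lxml.html.clean import Cleaner

cleaner = Cleaner(scripts=True, javascript=True)  # assumed options

print(sanitize_html('<p onmouseover="evil()">hi</p><script>bad()</script>'))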
Example #43
def show_tip_filter(qa_html, qa, dummy_fields, dummy_model, dummy_data,
                    dummy_col):
    """
    Filter the answers to add the kanji diagram pop-ups.
    """
    if not question_tips and qa != 'a':
        return qa_html
    global do_show
    global current_script
    do_show = False
    current_script = show_tips_script
    try:
        doc = html.fromstring(qa_html)
    except:
        return qa_html
    elements = []
    for ts in tip_selectors:
        elements += doc.cssselect(ts)
    elements = uniqify_list(elements)
    for el in elements:
        skip_elements = []
        for skip_sel in skip_selectors:
            skip_elements += el.cssselect(skip_sel)
        skip_elements = uniqify_list(skip_elements)
        for sub_el in el.iter():
            if sub_el in skip_elements:
                continue
            if sub_el.text is not None:
                bad_chars = media_characters(sub_el.text)
                new_index = 0
                new_element = None
                tip_text = u''
                sub_e_t = sub_el.text
                for i, g in enumerate(sub_e_t):
                    if i in bad_chars:
                        tip_text += g
                        continue
                    ge = maybe_make_tip(g)
                    if ge is not None:
                        do_show = True
                        if new_element is None:
                            sub_el.text = tip_text
                        else:
                            # new_element is the old new element...
                            new_element.tail = tip_text
                        sub_el.insert(new_index, ge)
                        new_index += 1
                        new_element = ge
                        tip_text = u''
                    else:
                        tip_text += g
                if new_element is not None:
                    new_element.tail = tip_text
            if sub_el is not el and sub_el.tail is not None:
                # We have to skip the tail of the element that
                # triggered the selector. That is *not* in the
                # selector.
                bad_chars = media_characters(sub_el.tail)
                parent = sub_el.getparent()
                new_index = parent.index(sub_el) + 1
                new_element = None
                tip_tail = u''
                sub_e_t = sub_el.tail
                for i, g in enumerate(sub_e_t):
                    if i in bad_chars:
                        tip_tail += g
                        continue
                    ge = maybe_make_tip(g)
                    if ge is not None:
                        do_show = True
                        if new_element is None:
                            sub_el.tail = tip_tail
                        else:
                            new_element.tail = tip_tail
                        # We have to insert this into the parent, not
                        # into this sub_el.
                        parent.insert(new_index, ge)
                        new_index += 1
                        new_element = ge
                        tip_tail = u''
                    else:
                        tip_tail += g
                if new_element is not None:
                    new_element.tail = tip_tail
    if do_show:
        head = doc[1]
        jqui_style = html.Element('link')
        jqui_style.set('type', 'text/css')
        jqui_style.set('rel', 'stylesheet')
        jqui_style.set('href', jqui_style_path)
        jqui_style.tail = '\n'
        head.append(jqui_style)
        jqui_theme_style = html.Element('link')
        jqui_theme_style.set('type', 'text/css')
        jqui_theme_style.set('rel', 'stylesheet')
        jqui_theme_style.set('href', jqui_theme_style_path)
        jqui_theme_style.tail = '\n'
        head.append(jqui_theme_style)
        tt_style = html.Element('link')
        tt_style.set('type', 'text/css')
        tt_style.set('rel', 'stylesheet')
        tt_style.set('href', tips_style_path)
        tt_style.tail = '\n'
        head.append(tt_style)
    return unicode(urllib.unquote(html.tostring(doc, encoding='utf-8')),
                   'utf-8')
Example #44
    def getSoup(self, link):
        start = requests.get(link)
        tree = html.fromstring(start.text)
        soup = BeautifulSoup(html.tostring(tree))

        return soup
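The lxml round-trip above mainly lets lxml normalize malformed HTML before BeautifulSoup parses it; when that normalization is not needed, a direct parse is simpler (same imports assumed, helper name ours):

    def getSoup_direct(self, link):
        # hand the raw response body straight to BeautifulSoup
        return BeautifulSoup(requests.get(link).text)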
Example #45
def post2rss(post, digest=False, pic=None, extra_types=()):
    """
  :param post (dict): 帖子数据
  :param digest (bool): 输出摘要
  :param pic (str): pic=cf 或 pic=google:指定图片代理提供方
  :param extra_types (tuple): 除回答和文章之外的其他帖子类型
  :return: PyRSS2Gen.RSSItem: post RSS item
  """
    if post['type'] == 'answer':
        title = '[回答] %s' % post['question']['title']
        url = 'https://www.zhihu.com/question/%s/answer/%s' % (
            post['question']['id'], post['id'])
        t_c = post['created_time']
        author = post['author']['name']

    elif post['type'] == 'article':
        title = '[文章] %s' % post['title']
        url = 'https://zhuanlan.zhihu.com/p/%s' % post['id']
        t_c = post['created']
        author = post['author']['name']

    elif post['type'] == 'pin':
        title = '[想法] %s' % post['excerpt_title']
        url = 'https://www.zhihu.com/pin/%s' % post['id']
        t_c = post['created']
        author = post['author']['name']

    elif 'question' in extra_types and post['type'] == 'question':
        title = '[问题] %s' % post['title']
        url = 'https://www.zhihu.com/question/%s' % (post['id'])
        t_c = post['created']
        author = None

    elif post['type'] == 'ANSWER_VOTE_UP':
        title = '[赞同了回答] %s by %s' % (post['question']['title'],
                                      post['author']['name'])
        url = 'https://www.zhihu.com/question/%s/answer/%s' % (
            post['question']['id'], post['id'])
        t_c = post['vote_up_time']
        author = post['author']['name']

    elif post['type'] == 'MEMBER_VOTEUP_ARTICLE':
        title = '[赞同了文章] %s by %s' % (post['title'], post['author']['name'])
        url = 'https://zhuanlan.zhihu.com/p/%s' % post['id']
        t_c = post['vote_up_time']
        author = post['author']['name']

    elif post['type'] == 'QUESTION_ANSWER':
        title = '%s 的回答' % post['author']['name']
        url = 'https://www.zhihu.com/question/%s/answer/%s' % (
            post['question']['id'], post['id'])
        t_c = post['created_time']
        author = post['author']['name']

    elif post['type'] == 'MEMBER_COLLECT_ANSWER':
        title = '[收藏了回答] %s by %s' % (post['question']['title'],
                                      post['author']['name'])
        url = 'https://www.zhihu.com/question/%s/answer/%s' % (
            post['question']['id'], post['id'])
        t_c = post['created_time']
        author = post['author']['name']

    elif post['type'] == 'MEMBER_COLLECT_ARTICLE':
        title = '[收藏了文章] %s by %s' % (post['title'], post['author']['name'])
        url = 'https://zhuanlan.zhihu.com/p/%s' % post['id']
        t_c = post['created']
        author = post['author']['name']

    elif post['type'] in ['roundtable', 'live', 'column']:
        return

    else:
        logger.warning('unknown type: %s', post['type'])
        return

    if post['type'] == 'pin':
        content = pin_content(post)
    else:
        content = post_content(post, digest)

    if post['type'] == 'ANSWER_VOTE_UP':
        content += "<p>回答发布于 %s </p>" % (datetime.datetime.utcfromtimestamp(
            post['created_time']).strftime('%Y-%m-%d %H:%M:%S'))
        content += "<p>回答编辑于 %s </p>" % (datetime.datetime.utcfromtimestamp(
            post['updated_time']).strftime('%Y-%m-%d %H:%M:%S'))
    elif post['type'] == 'MEMBER_VOTEUP_ARTICLE':
        content += "<p>文章发布于 %s </p>" % (datetime.datetime.utcfromtimestamp(
            post['created']).strftime('%Y-%m-%d %H:%M:%S'))
        content += "<p>文章编辑于 %s </p>" % (datetime.datetime.utcfromtimestamp(
            post['updated']).strftime('%Y-%m-%d %H:%M:%S'))

    # Wrap attribute-bearing <code> blocks in <pre> so their formatting survives.
    content = content.replace('<code ', '<pre><code ')
    content = content.replace('</code>', '</code></pre>')

    # Post only contains images but no text
    if not content:
        content = '<img src="%s">' % post.get('thumbnail')

    doc = fromstring(content)
    tidy_content(doc)
    if pic:
        base.proxify_pic(doc, re_zhihu_img, pic)
    content = tostring(doc, encoding=str)

    pub_date = datetime.datetime.utcfromtimestamp(t_c)

    item = PyRSS2Gen.RSSItem(
        title=title.replace('\x08', ''),
        link=url,
        guid=url,
        description=content.replace('\x08', ''),
        pubDate=pub_date,
        author=author,
    )
    return item
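
A minimal sketch of assembling the returned items into a feed, assuming a list of post dicts named posts (RSS2 and write_xml are PyRSS2Gen's actual API):

import datetime

import PyRSS2Gen

items = [it for it in (post2rss(p) for p in posts) if it is not None]
rss = PyRSS2Gen.RSS2(
    title='Zhihu posts',  # hypothetical feed metadata
    link='https://www.zhihu.com',
    description='items built by post2rss',
    lastBuildDate=datetime.datetime.utcnow(),
    items=items,
)
with open('feed.xml', 'wb') as f:
    rss.write_xml(f, encoding='utf-8')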
示例#46
0
def linkedin_companies_parser(url):
    for attempt in range(3):  # retry a few times on captcha/login redirects
        try:
            headers = {
                'User-Agent':
                'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'
            }
            print "Fetching :", url
            response = requests.get(url, headers=headers, verify=False)
            print response.content
            print response.status_code
            print response.headers
            print response.history
            formatted_response = response.content.replace('<!--', '').replace(
                '-->', '')
            doc = html.fromstring(formatted_response)
            print html.tostring(doc)
            '''
            datafrom_xpath = doc.xpath('//code[@id="stream-promo-top-bar-embed-id-content"]//text()')
            content_about = doc.xpath('//code[@id="stream-about-section-embed-id-content"]')
            print('weird')
            if not content_about:
                content_about = doc.xpath('//code[@id="stream-footer-embed-id-content"]')
            if content_about:
                pass
                # json_text = content_about[0].html_content().replace('<code id="stream-footer-embed-id-content"><!--','').replace('<code id="stream-about-section-embed-id-content"><!--','').replace('--></code>','')
            
            if datafrom_xpath:
                try:
                    json_formatted_data = json.loads(datafrom_xpath[0])
                    company_name = json_formatted_data['companyName'] if 'companyName' in json_formatted_data.keys() else None
                    size = json_formatted_data['size'] if 'size' in json_formatted_data.keys() else None
                    industry = json_formatted_data['industry'] if 'industry' in json_formatted_data.keys() else None
                    description = json_formatted_data['description'] if 'description' in json_formatted_data.keys() else None
                    follower_count = json_formatted_data['followerCount'] if 'followerCount' in json_formatted_data.keys() else None
                    year_founded = json_formatted_data['yearFounded'] if 'yearFounded' in json_formatted_data.keys() else None
                    website = json_formatted_data['website'] if 'website' in json_formatted_data.keys() else None
                    type = json_formatted_data['companyType'] if 'companyType' in json_formatted_data.keys() else None
                    specialities = json_formatted_data['specialties'] if 'specialties' in json_formatted_data.keys() else None

                    if "headquarters" in json_formatted_data.keys():
                        city = json_formatted_data["headquarters"]['city'] if 'city' in json_formatted_data["headquarters"].keys() else None
                        country = json_formatted_data["headquarters"]['country'] if 'country' in json_formatted_data['headquarters'].keys() else None
                        state = json_formatted_data["headquarters"]['state'] if 'state' in json_formatted_data['headquarters'].keys() else None
                        street1 = json_formatted_data["headquarters"]['street1'] if 'street1' in json_formatted_data['headquarters'].keys() else None
                        street2 = json_formatted_data["headquarters"]['street2'] if 'street2' in json_formatted_data['headquarters'].keys() else None
                        zip = json_formatted_data["headquarters"]['zip'] if 'zip' in json_formatted_data['headquarters'].keys() else None
                        street = street1 + ', ' + street2
                    else:
                        city = None
                        country = None
                        state = None
                        street1 = None
                        street2 = None
                        street = None
                        zip = None

                    data = {
                                'company_name': company_name,
                                'size': size,
                                'industry': industry,
                                'description': description,
                                'follower_count': follower_count,
                                'founded': year_founded,
                                'website': website,
                                'type': type,
                                'specialities': specialities,
                                'city': city,
                                'country': country,
                                'state': state,
                                'street': street,
                                'zip': zip,
                                'url': url
                            }
                    return data
                except:
                    print "cant parse page", url
            '''
            # Retry in case of captcha or login page redirection
            if len(response.content) < 2000 or "trk=login_reg_redirect" in url:
                if response.status_code == 404:
                    print("linkedin page not found")
                else:
                    raise ValueError(
                        'redirecting to login page or captcha found')
            break  # fetched successfully; no retry needed
        except Exception as e:
            print str(e)
            print "retrying :", url
示例#47
0
def lxmlTable2Pandas3(*args, **kwargs):
    kwargs['Data'] = pd.read_html(lh.tostring(kwargs['Data']))[0]
    print(kwargs['Data'])
    return kwargs
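
pd.read_html wants a string or file-like object (recent pandas deprecates bare HTML strings in favor of StringIO), so serializing with encoding='unicode' avoids handing it raw bytes; a self-contained sketch:

from io import StringIO

import lxml.html as lh
import pandas as pd

table_el = lh.fromstring('<table><tr><th>a</th></tr><tr><td>1</td></tr></table>')
df = pd.read_html(StringIO(lh.tostring(table_el, encoding='unicode')))[0]
print(df)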
示例#48
0
 def get_body(self):
     body_ele = self.tree.xpath("//div[contains(@id,'qnaContainer-')]")
     if not body_ele:  # xpath() returns a list, which is empty (never None) on no match
         return None
     body_ele = body_ele[0]
     return html.tostring(body_ele, pretty_print=True).decode()
示例#49
0
def innerHTML(el):
    if el is None:
        return ''
    return (el.text or '') + ''.join(
        html.tostring(x, encoding="unicode") for x in el)
示例#50
0
def outerHTML(el):
    if el is None:
        return ''
    return html.tostring(el, with_tail=False, encoding="unicode")
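
The two helpers differ only in whether the element's own tag is serialized; a quick sketch:

frag = html.fromstring('<div id="x">Hi <b>there</b>!</div>')
print(innerHTML(frag))  # Hi <b>there</b>!
print(outerHTML(frag))  # <div id="x">Hi <b>there</b>!</div>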
示例#51
0
 def any_html_to_unicode(self, cls, value, **_):
     return html.tostring(value, encoding='unicode')
示例#52
0
    def get_articles(self):
        headers = {
            'Pragma': 'no-cache',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent':
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Referer': 'http://www.nlpjob.com/',
            'Connection': 'keep-alive',
            'Cache-Control': 'no-cache',
        }
        # for page in range(self.page, self.max_page+1):
        for page in range(self.page, 30):  # TODO: check whether a next page exists
            print('\ngo to page:', page)
            with open('.progress.json', 'w+') as f:
                json.dump({'page': page}, f)
            params = {'p': page}
            try:
                a = time.time()
                resp = requests.get(self.base_url,
                                    headers=headers,
                                    params=params)
                b = time.time()
                print(b - a)
            except Exception as e:
                print(e)
                sys.exit(1)
            else:
                if resp.status_code != 200:
                    print('code != 200')
                    sys.exit(1)
                if '全部职位' not in resp.text:
                    print('not in the right page')
                    print('current page:', resp.url)
                    sys.exit(1)

                tree = html.fromstring(resp.text)
                articles = tree.xpath('//div[contains(@class, "row")]')
                print('count:', len(articles))
                for article in articles:
                    # Re-parse the fragment so the absolute '//' XPaths below
                    # match only within this article rather than the whole page.
                    article = html.fromstring(html.tostring(article))

                    publish_time = article.xpath('//span[@class="time-posted"]'
                                                 )[0].text_content().strip()
                    if '2017-06-27' in publish_time:
                        sys.exit(1)

                    href = article.xpath('//span[@class="row-info"]/a/@href')

                    title = article.xpath(
                        '//span[@class="row-info"]/a/text()')

                    if href and title:
                        href = href[0]
                        title = title[0].strip()
                    else:
                        break

                    id = href.split('/')[4]

                    article_json = {
                        'id': id,
                        'href': href,
                        'title': title,
                        'publishTime': publish_time,
                        'status': 'not_done'
                    }
                    # pprint(article_json)

                    if not self.col.find_one({'id': id}):
                        self.col.insert_one(article_json)

                # break ### for debug

        return True
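
The fromstring(tostring(...)) round-trip above is one way to scope the per-article queries; relative XPaths anchored at '.' reach the same result without re-serializing, as a sketch:

for article in tree.xpath('//div[contains(@class, "row")]'):
    # './/' searches below this element, not the document root
    publish_time = article.xpath(
        './/span[@class="time-posted"]')[0].text_content().strip()
    href = article.xpath('.//span[@class="row-info"]/a/@href')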
示例#53
0
def reader(link, start, last, folder):
    #link = "https://www.wuxiaworld.com/novel/against-the-gods/atg-chapter-0"
    found = ""
    f = requests.get(link)
    page = html.fromstring(f.text)
    p = html.tostring(page).decode('utf-8')

    p = p.replace("&#8220;", '"')
    p = p.replace("&#8230;", "...")
    p = p.replace("&#8221;", '"')
    p = p.replace("&#8217;", "'")
    p = p.replace("&#8211;", "-")

    lines = p.splitlines()

    for i in range(len(lines)):
        if lines[i] == '<div class="fr-view">':
            found = lines[i + 1]
            break

    for i in range(len(lines)):
        if '/images/arrow-right.png' in lines[i]:
            nextchap = lines[i - 1]
            nextchap = re.search('"(.*)" class', nextchap).group(1)
            nextchap = "https://www.wuxiaworld.com" + nextchap
            break

    found = found.replace("</p><p>", "\n\n")
    found = found.replace("<p>", "")
    found = found.replace("</p>", "")
    found = found.replace("<strong>", "")
    found = found.replace("</strong>", "")
    name = found.splitlines()[0]

    if (("Chapter" not in name) or ("Previous" in name) or (len(name) > 45)):
        name = "Chapter " + str(start)

    name = name.translate(str.maketrans('', '', string.punctuation))

    found = '\n'.join(found.splitlines()[1:])
    found = '\n'.join(found.splitlines()[:-3])

    file = open(folder + "/" + name + ".html", "w+")
    file.write(
        r'<style>p { font-family: Palatino, "Palatino Linotype", "Palatino LT STD", "Book Antiqua", Georgia, serif; font-size: 20px; font-style: normal; font-variant: normal; font-weight: 400; line-height: 25px; }</style>'
    )
    file.write("<h2><strong><center>" + name + "</center></strong></h2>" +
               "<br>")
    file.write("<p>" + found.replace("\n\n", "</p><p>") + "</p>")
    file.close()

    options = {
        'page-size': 'Executive',
        'margin-top': '0.75in',
        'margin-right': '0.75in',
        'margin-bottom': '0.75in',
        'margin-left': '0.75in',
    }
    path_wkthmltopdf = r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe'
    config = pdfkit.configuration(wkhtmltopdf=path_wkthmltopdf)
    pdfkit.from_file(folder + r'/' + name + ".html",
                     folder + r'/' + name + ".pdf",
                     options=options,
                     configuration=config)
    os.remove(folder + "/" + name + ".html")
    if (start == last):
        return False
    else:
        print(nextchap)
        return reader(nextchap, start + 1, last, folder)
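
Because reader calls itself once per chapter, very long ranges can exhaust Python's default recursion limit (roughly 1000 frames); a hypothetical invocation for a short run:

# fetch chapters 1-5 into ./atg (the folder must already exist)
reader("https://www.wuxiaworld.com/novel/against-the-gods/atg-chapter-1",
       1, 5, "atg")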
示例#54
0
 def any_html_to_bytes(self, cls, value, **_):
     return html.tostring(value)
示例#55
0
    def process_html(self, archive, context, text, target, options):
        # soup = fragment_fromstring(b'<article>' + text.encode('utf-8', 'replace') + b'</article>', create_parent=True)
        soup = fragment_fromstring(text, create_parent=True)
        escape = html.escape
        console = context[".console"]

        def write_error(insert_ref, el, msg, exc=None):
            log.error("insert '%s' failed; %s", insert_ref, msg)

            if context[".debug"]:
                if exc is not None:
                    c = Console(text=True, width=120)
                    c.obj(context, exc)
                    _html = '<pre class="moya-insert-error"><code>{}</code></pre>'.format(
                        escape(c.get_text()))
                else:
                    _html = '<pre class="moya-insert-error"><code>{}</code></pre>'.format(
                        escape(msg))
                new_el = fromstring(_html)
                el.getparent().replace(el, new_el)
            else:
                el.getparent().remove(el)

            if exc is not None:
                console.obj(context, exc)

        for el in self._selector(soup):

            try:
                insert_ref = el.attrib["insert"]
            except KeyError:  # attrib lookup raises KeyError, not IndexError
                write_error("<missing>", el,
                            "no 'insert' attribute in <moya> markup tag")
                continue

            app = None
            attribs = dict(el.attrib.items())
            app_name = attribs.pop("app", None) or context.get(
                ".app.name", None)
            if app_name is None:
                write_error(insert_ref, el,
                            "'app' attribute is required on <moya> tag")
                continue

            # Get data params
            params = {k.rsplit("-", 1)[-1]: v for k, v in attribs.items()}

            params.update(options)

            app = app or context.get(".app", None)

            if "#" in insert_ref:
                try:
                    _app, insert_el = archive.get_element(insert_ref, app=app)
                except ElementNotFoundError as e:
                    write_error(
                        insert_ref,
                        el,
                        "markup insert element '{}' was not found".format(
                            insert_ref),
                        exc=e,
                    )
                    continue
            else:
                from .tags.markup import MarkupInsert

                try:
                    insert_el = MarkupInsert.registry[insert_ref]
                except KeyError as e:
                    write_error(
                        insert_ref,
                        el,
                        "markup insert element '{}' was not found".format(
                            insert_ref),
                        exc=e,
                    )
                    continue
                _app = app

            if not getattr(insert_el, "_moya_markup_insert", False):
                msg = "{} is not safe for markup insertion".format(
                    escape(str(insert_el)))
                write_error(insert_ref, el, msg)
                continue

            insert_callable = archive.get_callable_from_element(insert_el,
                                                                app=_app)

            try:
                replace_markup = insert_callable(context, **params)
            except LogicError as e:
                write_error(
                    insert_ref,
                    el,
                    "markup insert failed due to logic error, see logs",
                    exc=e,
                )
                continue
            except Exception as e:
                write_error(insert_ref,
                            el,
                            "markup insert failed, see logs",
                            exc=e)
                continue

            new_el = fromstring(replace_markup)
            new_el.tail = el.tail
            el.getparent().replace(el, new_el)

        return HTML("".join(tostring(e).decode("utf-8") for e in soup))
示例#56
0
 def _upc(self):
     upc_list = re.search('upc : (\[[^\]]*\])',
                          html.tostring(self.tree_html)).group(1)
     upc_list = ast.literal_eval(upc_list)
     return upc_list[0]
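
Under Python 3, html.tostring returns bytes by default, so the str pattern above would raise a TypeError; serializing with encoding='unicode' keeps the regex usable, a self-contained sketch with made-up markup:

import ast
import re

from lxml import html

tree = html.fromstring('<div>upc : ["012345678905"]</div>')
markup = html.tostring(tree, encoding='unicode')  # str, not bytes
print(ast.literal_eval(re.search(r'upc : (\[[^\]]*\])', markup).group(1))[0])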
示例#57
0
def main():
    document = html.document_fromstring(sys.stdin.read())
    for script in list(document.iter('script')):
        script.getparent().remove(script)
    print(html.tostring(document, encoding='unicode'))
示例#58
0
def send(self, varBody=None):
    msg = "send"
    if varBody:
        msg = "%s('%s')" % (
            msg,
            str(varBody),
        )

    log.ThugLogging.add_behavior_warn("[Microsoft XMLHTTP ActiveX] %s" %
                                      (msg, ))
    log.ThugLogging.add_behavior_warn(
        "[Microsoft XMLHTTP ActiveX] Fetching from URL %s (method: %s)" % (
            self.bstrUrl,
            self.bstrMethod,
        ))
    log.ThugLogging.log_exploit_event(self._window.url,
                                      "Microsoft XMLHTTP ActiveX",
                                      "Send",
                                      forward=False,
                                      data={
                                          "method": self.bstrMethod,
                                          "url": str(self.bstrUrl)
                                      })

    response = None

    self.dispatchEvent("loadstart")

    try:
        response = self._window._navigator.fetch(
            self.bstrUrl,
            method=self.bstrMethod,
            headers=self.requestHeaders,
            body=varBody,
            redirect_type="Microsoft XMLHTTP")
    except Exception:
        log.ThugLogging.add_behavior_warn(
            '[Microsoft XMLHTTP ActiveX] Fetch failed')
        self.dispatchEvent("timeout")
        self.dispatchEvent("error")

    if response is None:
        return 0

    self.status = response.status_code
    self.responseHeaders = response.headers
    self.responseBody = response.content
    self.responseText = response.text
    self.readyState = 4

    if getattr(log, 'XMLHTTP', None) is None:
        log.XMLHTTP = dict()

    log.XMLHTTP['status'] = self.status
    log.XMLHTTP['responseHeaders'] = self.responseHeaders
    log.XMLHTTP['responseBody'] = self.responseBody
    log.XMLHTTP['responseText'] = self.responseText
    log.XMLHTTP['readyState'] = self.readyState

    last_bstrUrl = log.XMLHTTP.get('last_bstrUrl', None)
    last_bstrMethod = log.XMLHTTP.get('last_bstrMethod', None)

    if last_bstrUrl in (self.bstrUrl, ) and last_bstrMethod in (
            self.bstrMethod, ):  # pragma: no cover
        return 0

    log.XMLHTTP['last_bstrUrl'] = str(self.bstrUrl)
    log.XMLHTTP['last_bstrMethod'] = str(self.bstrMethod)

    if self.mimeType:
        contenttype = self.mimeType
    else:
        contenttype = self.responseHeaders.get('content-type', None)

    if contenttype is None:  # pragma: no cover
        return 0

    self.dispatchEvent("load")
    self.dispatchEvent("readystatechange")

    if 'javascript' in contenttype:
        html = tostring(E.HTML(E.HEAD(), E.BODY(E.SCRIPT(response.text))))

        doc = DOM.W3C.w3c.parseString(html)
        window = DOM.Window.Window(self.bstrUrl,
                                   doc,
                                   personality=log.ThugOpts.useragent)

        dft = DOM.DFT.DFT(window)
        dft.run()
        return 0

    if 'text/html' in contenttype:
        tags = ('<html', '<body', '<head', '<script')

        if not any(tag in response.text.lower() for tag in tags):
            html = tostring(E.HTML(E.HEAD(), E.BODY(E.SCRIPT(
                response.text))))  # pragma: no cover
        else:
            html = response.text

        doc = DOM.W3C.w3c.parseString(html)
        window = DOM.Window.Window(self.bstrUrl,
                                   doc,
                                   personality=log.ThugOpts.useragent)
        dft = DOM.DFT.DFT(window)
        dft.run()
        return 0

    handler = log.MIMEHandler.get_handler(contenttype)
    if handler:
        handler(self.bstrUrl, self.responseBody)

    return 0
示例#59
0
        # read stdout
        filename = fp.readline().strip().split()[1].strip("'")
        perc = float(fp.readline().split(':')[1].split('%')[0])
        gcov = fp.readline().strip().split()[1].strip("'")
        # move generated gcov to coverage folder
        new_dir = join(target_dir, dirname(source))
        try:
            makedirs(new_dir)
        except OSError:
            pass
        rename(join(obspy_dir, gcov), join(new_dir, gcov))
        cov.append((filename, join(new_dir, gcov), perc))

# GENERATE HTML
page = fromstring("<html><table></table></html>")
table = page.xpath('.//table')[0]
for name, gcov, perc in cov:
    td1, td2 = Element('td'), Element('td')
    gcov = gcov.replace(target_dir, './')
    a = Element('a', attrib={'href': gcov})
    a.text = name
    td1.append(a)
    td2.text = "%6.2f%%" % perc
    tr = Element('tr')
    tr.extend([td1, td2])
    table.append(tr)
with open(join(target_dir, 'index.html'), 'wb') as fp:
    fp.write(tostring(page))

cleanup('*.o')
示例#60
0
        else:
            old_docs = old.docs[name]
            items = []
            for key in old_docs.docs:
                old_id, old_title, old_xml = old_docs.docs[key]
                if key not in new_docs.docs:
                    items.append(builder.I(builder.LI(old_title)))
                else:
                    diffs = diff_xml(old_xml, new_docs.docs[key][2], verbose)
                    if diffs is not None:
                        title = builder.B(old_title)
                        items.append(builder.LI(title, diffs))
            if not items:
                body.append(builder.P(CHECK, OK))
            else:
                body.append(builder.UL(*items))


parser = ArgumentParser()
parser.add_argument("--old", required=True)
parser.add_argument("--new", default=db.connect(user="******").cursor())
parser.add_argument("--verbose", action="store_true")
opts = parser.parse_args()
old = Data(opts.old)
new = Data(opts.new, old)
body = builder.BODY(builder.H1(TITLE))
compare_tables(body, old, new)
compare_docs(body, old, new, opts.verbose)
report = builder.HTML(HEAD, body)
print(html.tostring(report, pretty_print=True).decode("ascii"))