def get_raw_entities(self):
    entities = []
    items = self.root.xpath(
        './/h2[@class="accordion-title" and contains(., "At a Glance")]'
        '/following-sibling::div//p')
    for item in items:
        # item.text is the run of star characters preceding the <strong>
        # name; three stars marks the entity as starred.
        num_stars = len(item.text.strip())
        starred = num_stars == 3
        name = item.xpath('.//strong')[0].text.strip()
        # Replace the <strong> element with a marker so everything after it
        # can be pulled out as the description.
        temp_html = re.sub('<strong>.*</strong>', 'SPLIT_POINT', etree.tostring(item))
        temp_node = html_parsing.parse_tree_from_string(temp_html.encode('utf-8'))
        desc = html_parsing.tostring(temp_node).split('SPLIT_POINT')[1].strip()
        entities.append(data.Entity(name=name, starred=starred, description=desc))
    return entities
def build_scrapers(url, client_page_source=None, force_fetch_page=False,
        allow_expansion=True, for_guide=False):
    page_source_tree = (html_parsing.parse_tree_from_string(client_page_source)
        if client_page_source else None)
    if not page_source_tree and (url_requires_server_page_source(url) or force_fetch_page):
        page_source_tree = html_parsing.parse_tree(url)
    scraped_pages = []
    for scraper_class in ALL_SCRAPERS:
        handleable_urls = scraper_class.handleable_urls(url, page_source_tree, allow_expansion)
        if handleable_urls:
            # Fetch every handleable url in parallel, retrying each request
            # up to 3 times before giving up on it.
            reqs = [html_parsing.make_request(u) for u in handleable_urls]
            resps = utils.parallelize(utils.retryable(urllib2.urlopen, 3),
                [(req,) for req in reqs])
            for handleable_url, resp in zip(handleable_urls, resps):
                if not resp:
                    print "Failed to fetch url: %s" % handleable_url
                    continue
                tree = etree.parse(resp, html_parsing.htmlparser())
                scraper = scraper_class(handleable_url, tree, for_guide)
                scraped_pages.append(scraper)
            # Only the first scraper class that can handle the url is used.
            break
    return scraped_pages
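# A minimal sketch of the interface build_scrapers() above expects from each
# entry in ALL_SCRAPERS: a handleable_urls() classmethod and a constructor
# taking (url, tree, for_guide). This class is illustrative only; it is
# inferred from the call sites above and is not one of the real scrapers.
class ExampleScraper(object):
    @classmethod
    def handleable_urls(cls, url, page_source_tree, allow_expansion):
        # Return the urls this scraper can handle (possibly expanded from
        # `url`), or an empty list so the next scraper class is tried.
        return [url] if 'example.com' in url else []

    def __init__(self, url, tree, for_guide):
        self.url = url
        self.root = tree.getroot()
        self.for_guide = for_guide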
def extract_urls_from_page_source(url, page_source):
    urls = []
    tree = html_parsing.parse_tree_from_string(page_source)
    # Collect urls both from anchor tags and from bare urls in the page text.
    urls.extend(extract_all_links_from_anchors(url, tree))
    urls.extend(extract_all_links_from_text(html_parsing.tostring(tree.getroot())))
    return urls
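# Usage sketch (illustrative assumptions only: the url, the local html file,
# and the presence of get_raw_entities() on every scraper are not guaranteed
# by this module):
if __name__ == '__main__':
    sample_url = 'http://www.example.com/some-destination-guide'
    # Pull candidate urls out of a saved page, then scrape each of them.
    page_source = open('saved_page.html').read()
    for found_url in extract_urls_from_page_source(sample_url, page_source):
        for scraper in build_scrapers(found_url, force_fetch_page=True):
            for entity in scraper.get_raw_entities():
                print '%s (starred: %s)' % (entity.name, entity.starred)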