def get_raw_entities(self):
    entities = []
    items = self.root.xpath(
        './/h2[@class="accordion-title" and contains(., "At a Glance")]'
        '/following-sibling::div//p')
    for item in items:
        # item.text is the run of star characters preceding the <strong>
        # name; three stars marks the entity as starred.
        num_stars = len(item.text.strip())
        starred = num_stars == 3
        name = item.xpath('.//strong')[0].text.strip()
        # Replace the <strong> element with a marker so everything after it
        # can be pulled out as the description.
        temp_html = re.sub('<strong>.*</strong>', 'SPLIT_POINT', etree.tostring(item))
        temp_node = html_parsing.parse_tree_from_string(temp_html.encode('utf-8'))
        desc = html_parsing.tostring(temp_node).split('SPLIT_POINT')[1].strip()
        entities.append(data.Entity(name=name, starred=starred, description=desc))
    return entities
def build_scrapers(url, client_page_source=None, force_fetch_page=False,
        allow_expansion=True, for_guide=False):
    page_source_tree = (html_parsing.parse_tree_from_string(client_page_source)
        if client_page_source else None)
    if not page_source_tree and (url_requires_server_page_source(url) or force_fetch_page):
        page_source_tree = html_parsing.parse_tree(url)
    scraped_pages = []
    for scraper_class in ALL_SCRAPERS:
        handleable_urls = scraper_class.handleable_urls(url, page_source_tree, allow_expansion)
        if handleable_urls:
            # Fetch every handleable url in parallel, retrying each request
            # up to 3 times before giving up on it.
            reqs = [html_parsing.make_request(u) for u in handleable_urls]
            resps = utils.parallelize(utils.retryable(urllib2.urlopen, 3),
                [(req,) for req in reqs])
            for handleable_url, resp in zip(handleable_urls, resps):
                if not resp:
                    print "Failed to fetch url: %s" % handleable_url
                    continue
                tree = etree.parse(resp, html_parsing.htmlparser())
                scraper = scraper_class(handleable_url, tree, for_guide)
                scraped_pages.append(scraper)
            # Only the first scraper class that can handle the url is used.
            break
    return scraped_pages
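# A minimal sketch of the interface build_scrapers() above expects from each
# entry in ALL_SCRAPERS: a handleable_urls() classmethod and a constructor
# taking (url, tree, for_guide). This class is illustrative only; it is
# inferred from the call sites above and is not one of the real scrapers.
class ExampleScraper(object):
    @classmethod
    def handleable_urls(cls, url, page_source_tree, allow_expansion):
        # Return the urls this scraper can handle (possibly expanded from
        # `url`), or an empty list so the next scraper class is tried.
        return [url] if 'example.com' in url else []

    def __init__(self, url, tree, for_guide):
        self.url = url
        self.root = tree.getroot()
        self.for_guide = for_guide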
def extract_urls_from_page_source(url, page_source):
    urls = []
    tree = html_parsing.parse_tree_from_string(page_source)
    # Collect urls both from anchor tags and from bare urls in the page text.
    urls.extend(extract_all_links_from_anchors(url, tree))
    urls.extend(extract_all_links_from_text(html_parsing.tostring(tree.getroot())))
    return urls
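# Usage sketch (illustrative assumptions only: the url, the local html file,
# and the presence of get_raw_entities() on every scraper are not guaranteed
# by this module):
if __name__ == '__main__':
    sample_url = 'http://www.example.com/some-destination-guide'
    # Pull candidate urls out of a saved page, then scrape each of them.
    page_source = open('saved_page.html').read()
    for found_url in extract_urls_from_page_source(sample_url, page_source):
        for scraper in build_scrapers(found_url, force_fetch_page=True):
            for entity in scraper.get_raw_entities():
                print '%s (starred: %s)' % (entity.name, entity.starred)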