def make_article_parser(url, parser_type_name=None):
    if parser_type_name:
        # An explicit parser was requested: match it by class name.
        for parser_class in ALL_PARSERS:
            if parser_class.__name__ == parser_type_name:
                return parser_class(url, html_parsing.parse_tree(url))
    else:
        # Let each parser class decide whether it can handle the url.
        for parser_class in ALL_PARSERS:
            if parser_class.can_parse(url):
                return parser_class(url, html_parsing.parse_tree(url))
        return default_article_parser.DefaultArticleParser(url, html_parsing.parse_tree(url))
    # An explicit parser was requested but no class by that name exists.
    return None
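
# A minimal usage sketch for make_article_parser, assuming a hypothetical
# article url and parser class name (real names come from ALL_PARSERS):
#
#   parser = make_article_parser('http://example.com/some-article')
#   parser = make_article_parser('http://example.com/some-article',
#                                parser_type_name='ExampleArticleParser')
#
# The first form auto-detects via each class's can_parse(); the second
# returns None if no class in ALL_PARSERS has the given name.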

def get_photos(self):
    urls = []
    photo_page_url = 'http://www.yelp.com/biz_photos/' + self.get_site_specific_entity_id()
    photos_root = html_parsing.parse_tree(photo_page_url).getroot()
    for thumb_img in photos_root.findall('body//div[@id="photo-thumbnails"]//a/img'):
        src = thumb_img.get('src')
        if src:
            # Rewrite the thumbnail url to point at the large-size image.
            urls.append(src.replace('ms.jpg', 'l.jpg'))
    return urls
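
# The 'ms.jpg' -> 'l.jpg' substitution swaps Yelp's medium-square thumbnail
# suffix for the large-size one. A hypothetical example (CDN host assumed):
#
#   http://media.example.com/bphoto/abc123/ms.jpg
#     -> http://media.example.com/bphoto/abc123/l.jpg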

def get_photos(self):
    urls = []
    # The page's primary photo, if present.
    try:
        url = self.root.find('body//img[@class="photo_image"]').get('src')
        if url:
            urls.append(url)
    except AttributeError:
        # find() matched no element.
        pass
    # Additional photos are lazily loaded from JSON embedded in an inline script.
    for script in self.root.findall('body//script'):
        if script.text and 'lazyImgs' in script.text:
            lines = script.text.split('\n')
            for line in lines:
                for elem_id in ('HERO_PHOTO', 'THUMB_PHOTO'):
                    if elem_id in line:
                        line = line.strip().strip(',')
                        some_json = json.loads(line)
                        urls.append(some_json['data'])
            break
    # For hotels, cross-reference hotels.com and pull in its photos as well.
    if self.get_category() == values.Category.LODGING:
        hotelsdotcom_url = crossreference.find_hotelsdotcom_url(self.get_entity_name())
        if hotelsdotcom_url:
            tree = html_parsing.parse_tree(hotelsdotcom_url)
            hotelsdotcom_scraper = hotels_dot_com.HotelsDotComScraper(hotelsdotcom_url, tree)
            additional_urls = hotelsdotcom_scraper.get_photos()
            urls.extend(additional_urls)
    return urls
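
# For reference, each matching line in the lazyImgs script is assumed to be
# a JSON object literal with a trailing comma, e.g. (hypothetical):
#
#   {"id": "HERO_PHOTO", "data": "http://example.com/photos/hero.jpg"},
#
# Stripping whitespace and the trailing comma leaves valid JSON whose 'data'
# field is the photo url.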

def scrape_entity_page(self, url):
    entity_root = html_parsing.parse_tree(url).getroot()
    name = html_parsing.tostring(entity_root.xpath('.//div[@class="title-desc-inner"]//h1')[0])
    content_p_elems = entity_root.xpath(".//div[@class='content']//div[not(@class='image-caption')]/p")
    description = '\n\n'.join(html_parsing.tostring(p) for p in content_p_elems)
    photo_urls = entity_root.xpath(".//div[@class='content']//img/@data-src")
    return data.Entity(name=name, description=description, photo_urls=photo_urls)

def get_description(self):
    desc_nodes = self.root.xpath('.//div[@id="listing_main"]//div[@class="listing_description"]')
    if not desc_nodes:
        return None
    desc_node = desc_nodes[0]
    details_link = desc_node.xpath('.//a/@href')
    if details_link:
        # The full description lives on a separate details page.
        url = self.absolute_url(details_link[0])
        details_page_tree = html_parsing.parse_tree(url)
        details_node = details_page_tree.getroot().xpath('.//div[@class="articleBody"]')[0]
        if details_node.xpath('.//p'):
            return html_parsing.join_element_text_using_xpaths(details_node, ['.//p'], '\n\n')
        else:
            return html_parsing.tostring(details_node)
    elif desc_node.xpath('.//span[@class="onShow"]'):
        # An expandable description revealed on click.
        return ''.join(desc_node.xpath('.//span[@class="onShow"]/text()')).strip()
    else:
        # A short inline description.
        return ''.join(desc_node.xpath('text()')).strip()
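
# Hypothetical markup sketches for the three branches above (the real site's
# HTML may differ):
#
#   <div class="listing_description">
#     Teaser text <a href="/poi/12345">Read more</a>   <!-- details-page case -->
#   </div>
#   <div class="listing_description">
#     <span class="onShow">Full text revealed on click.</span>
#   </div>
#   <div class="listing_description">Short inline text.</div>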

def build_scrapers(url, client_page_source=None, force_fetch_page=False,
        allow_expansion=True, for_guide=False):
    page_source_tree = html_parsing.parse_tree_from_string(client_page_source) if client_page_source else None
    if not page_source_tree and (url_requires_server_page_source(url) or force_fetch_page):
        page_source_tree = html_parsing.parse_tree(url)
    scraped_pages = []
    for scraper_class in ALL_SCRAPERS:
        handleable_urls = scraper_class.handleable_urls(url, page_source_tree, allow_expansion)
        if handleable_urls:
            # Fetch every handleable url in parallel, retrying each up to 3 times.
            reqs = [html_parsing.make_request(u) for u in handleable_urls]
            resps = utils.parallelize(utils.retryable(urllib2.urlopen, 3), [(req,) for req in reqs])
            # Use a distinct loop variable so the 'url' argument isn't shadowed.
            for handleable_url, resp in zip(handleable_urls, resps):
                if not resp:
                    print "Failed to fetch url: %s" % handleable_url
                    continue
                tree = etree.parse(resp, html_parsing.htmlparser())
                scraper = scraper_class(handleable_url, tree, for_guide)
                scraped_pages.append(scraper)
            break
    return scraped_pages
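
# Usage sketch (url hypothetical). A caller that already has the rendered page
# source, e.g. from a browser extension, avoids a server-side fetch unless the
# scraper requires one:
#
#   scrapers = build_scrapers('http://www.yelp.com/biz/some-restaurant',
#                             client_page_source=page_html)
#
#   # Force a server-side fetch before scraping:
#   scrapers = build_scrapers('http://www.yelp.com/biz/some-restaurant',
#                             force_fetch_page=True)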

def get_photo_page(self):
    # Parse the photo page once and cache the tree on the instance.
    if not hasattr(self, '_photo_page'):
        self._photo_page = html_parsing.parse_tree(self.get_photo_page_url())
    return self._photo_page
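
# The hasattr() check memoizes the parsed tree on the instance, so repeated
# calls fetch and parse the photo page only once. Sketch with a hypothetical
# scraper instance:
#
#   page = scraper.get_photo_page()   # fetches and parses
#   page = scraper.get_photo_page()   # returns the cached tree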