def make_article_parser(url, parser_type_name=None):
    if parser_type_name:
        # An explicit parser was requested: match it by class name.
        for parser_class in ALL_PARSERS:
            if parser_class.__name__ == parser_type_name:
                return parser_class(url, html_parsing.parse_tree(url))
        # No class of that name is registered.
        return None
    # Otherwise use the first parser that claims this URL, falling back
    # to the generic article parser.
    for parser_class in ALL_PARSERS:
        if parser_class.can_parse(url):
            return parser_class(url, html_parsing.parse_tree(url))
    return default_article_parser.DefaultArticleParser(url, html_parsing.parse_tree(url))
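A usage sketch for the factory above; the article URL and the 'NYTimesParser' class name are hypothetical, invented for illustration:

# Default dispatch: first parser whose can_parse() accepts the URL,
# else the DefaultArticleParser fallback; never None on this path.
parser = make_article_parser('http://www.example.com/some-article')

# Explicit dispatch by class name; returns None if no class matches.
parser = make_article_parser('http://www.example.com/some-article',
                             parser_type_name='NYTimesParser')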
Example #2
def get_photos(self):
    urls = []
    photo_page_url = 'http://www.yelp.com/biz_photos/' + self.get_site_specific_entity_id()
    photos_root = html_parsing.parse_tree(photo_page_url).getroot()
    for thumb_img in photos_root.findall('body//div[@id="photo-thumbnails"]//a/img'):
        src = thumb_img.get('src')
        if src:
            # Rewrite the medium-square thumbnail URL to the large version.
            urls.append(src.replace('ms.jpg', 'l.jpg'))
    return urls
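The replace() call swaps Yelp's medium-square thumbnail suffix for the large-photo suffix; a quick illustration (the URL is invented):

>>> 'http://s3-media.fl.yelpcdn.com/bphoto/abc123/ms.jpg'.replace('ms.jpg', 'l.jpg')
'http://s3-media.fl.yelpcdn.com/bphoto/abc123/l.jpg'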
Example #3
def get_photos(self):
    urls = []
    # The main photo, if present.
    photo_img = self.root.find('body//img[@class="photo_image"]')
    if photo_img is not None:
        url = photo_img.get('src')
        if url:
            urls.append(url)
    # Additional photos are embedded as JSON lines inside a 'lazyImgs' script.
    for script in self.root.findall('body//script'):
        if script.text and 'lazyImgs' in script.text:
            for line in script.text.split('\n'):
                for elem_id in ('HERO_PHOTO', 'THUMB_PHOTO'):
                    if elem_id in line:
                        # Each entry is a JSON object with a trailing JS comma.
                        line = line.strip().strip(',')
                        urls.append(json.loads(line)['data'])
            break
    # For lodging, pull additional photos from the matching hotels.com page.
    if self.get_category() == values.Category.LODGING:
        hotelsdotcom_url = crossreference.find_hotelsdotcom_url(self.get_entity_name())
        if hotelsdotcom_url:
            tree = html_parsing.parse_tree(hotelsdotcom_url)
            scraper = hotels_dot_com.HotelsDotComScraper(hotelsdotcom_url, tree)
            urls.extend(scraper.get_photos())
    return urls
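The script-scanning loop depends on each lazy-image entry occupying one line of JSON with a 'data' field; a standalone illustration against invented script text:

import json

script_text = '''var lazyImgs = [
{"id": "HERO_PHOTO", "data": "http://example.com/hero.jpg"},
{"id": "THUMB_PHOTO", "data": "http://example.com/thumb.jpg"},
];'''

urls = []
for line in script_text.split('\n'):
    for elem_id in ('HERO_PHOTO', 'THUMB_PHOTO'):
        if elem_id in line:
            # Strip the trailing comma left over from the JS array syntax.
            urls.append(json.loads(line.strip().strip(','))['data'])
# urls == ['http://example.com/hero.jpg', 'http://example.com/thumb.jpg']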
Example #4
def scrape_entity_page(self, url):
    entity_root = html_parsing.parse_tree(url).getroot()
    name = html_parsing.tostring(entity_root.xpath('.//div[@class="title-desc-inner"]//h1')[0])
    # Body paragraphs, excluding image captions.
    content_p_elems = entity_root.xpath(".//div[@class='content']//div[not(@class='image-caption')]/p")
    description = '\n\n'.join(html_parsing.tostring(p) for p in content_p_elems)
    # Lazily-loaded images keep their real URL in data-src.
    photo_urls = entity_root.xpath(".//div[@class='content']//img/@data-src")
    return data.Entity(name=name, description=description, photo_urls=photo_urls)
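Note that an XPath ending in /@data-src yields the attribute values directly as strings, so photo_urls needs no further extraction; a tiny check with invented markup:

from lxml import etree

root = etree.HTML('<div class="content"><img data-src="http://example.com/a.jpg"/></div>')
print(root.xpath('.//div[@class="content"]//img/@data-src'))
# ['http://example.com/a.jpg']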
Example #5
def get_description(self):
    desc_nodes = self.root.xpath('.//div[@id="listing_main"]//div[@class="listing_description"]')
    if not desc_nodes:
        return None
    desc_node = desc_nodes[0]
    details_link = desc_node.xpath('.//a/@href')
    if details_link:
        # A "read more" link: follow it and pull the full article body.
        url = self.absolute_url(details_link[0])
        details_page_tree = html_parsing.parse_tree(url)
        details_node = details_page_tree.getroot().xpath('.//div[@class="articleBody"]')[0]
        if details_node.xpath('.//p'):
            return html_parsing.join_element_text_using_xpaths(details_node, ['.//p'], '\n\n')
        return html_parsing.tostring(details_node)
    elif desc_node.xpath('.//span[@class="onShow"]'):
        # Inline description hidden behind an expand toggle.
        return ''.join(desc_node.xpath('.//span[@class="onShow"]/text()')).strip()
    # Plain inline description.
    return ''.join(desc_node.xpath('text()')).strip()
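join_element_text_using_xpaths is project-specific; judging only from the call site above, it appears to join the text of every element matched by the given XPaths with the separator. A rough stand-in under that assumption:

def join_element_text_using_xpaths(node, xpaths, separator):
    # Assumed behavior, inferred from the call site; not the project's code.
    chunks = []
    for xpath in xpaths:
        for elem in node.xpath(xpath):
            text = ''.join(elem.itertext()).strip()
            if text:
                chunks.append(text)
    return separator.join(chunks)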
Example #6
def build_scrapers(url, client_page_source=None, force_fetch_page=False, allow_expansion=True, for_guide=False):
    page_source_tree = html_parsing.parse_tree_from_string(client_page_source) if client_page_source else None
    if not page_source_tree and (url_requires_server_page_source(url) or force_fetch_page):
        page_source_tree = html_parsing.parse_tree(url)

    scraped_pages = []
    for scraper_class in ALL_SCRAPERS:
        handleable_urls = scraper_class.handleable_urls(url, page_source_tree, allow_expansion)
        if handleable_urls:
            # Fetch all handleable URLs in parallel, retrying failures.
            reqs = [html_parsing.make_request(u) for u in handleable_urls]
            resps = utils.parallelize(utils.retryable(urllib2.urlopen, 3), [(req,) for req in reqs])
            for handleable_url, resp in zip(handleable_urls, resps):
                if not resp:
                    print "Failed to fetch url: %s" % handleable_url
                    continue
                tree = etree.parse(resp, html_parsing.htmlparser())
                scraped_pages.append(scraper_class(handleable_url, tree, for_guide))
            # Only the first scraper class that claims the URL is used.
            break
    return scraped_pages
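A hedged usage sketch: the URL is hypothetical, and get_photos() is assumed from the scraper examples above:

scrapers = build_scrapers('http://www.yelp.com/biz/some-restaurant',
                          force_fetch_page=True)
for scraper in scrapers:
    print scraper.get_photos()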
Example #7
def get_photo_page(self):
    # Fetch and cache the photo page tree on first access.
    if not hasattr(self, '_photo_page'):
        self._photo_page = html_parsing.parse_tree(self.get_photo_page_url())
    return self._photo_page
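The lazy attribute above is a common memoization idiom; it can be factored into a reusable decorator. A minimal sketch, with a decorator name of my own choosing:

def memoized_property(method):
    # Cache the first result of a zero-argument method on the instance.
    attr_name = '_' + method.__name__
    @property
    def getter(self):
        if not hasattr(self, attr_name):
            setattr(self, attr_name, method(self))
        return getattr(self, attr_name)
    return getter

class PhotoPageMixin(object):
    @memoized_property
    def photo_page(self):
        return html_parsing.parse_tree(self.get_photo_page_url())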