示例#1
0
 def _get_hours_info(self, hours_node):
     """Build a mapping of lower-cased day label to hours text.

     Every ``tr`` found under ``hours_node`` contributes one entry: the key
     is the lower-cased text of the row's direct ``td`` whose text contains
     "day", and the value is the text of the row's ``strong`` descendant.

     NOTE(review): a row without a matching ``td`` presumably produces an
     empty key, so such rows would overwrite one another -- confirm against
     ``HtmlUtils.get_element_value``.
     """
     schedule = {}
     for row in HtmlUtils.get_elements(hours_node, './/tr'):
         day_label = HtmlUtils.get_element_value(
             row, './td[contains(text(),"day")]/text()').lower()
         opening_hours = HtmlUtils.get_element_value(row, './/strong/text()')
         schedule[day_label] = opening_hours
     return schedule
示例#2
0
 def _get_social_media_info(self, html_doc):
     """Collect the social-network link values found on the brand page.

     For each of facebook, twitter, instagram and google-plus, queries the
     ``href`` of the ``a`` element that is the parent of an ``i`` icon whose
     class contains the network name, inside the ``list-unstyled`` list of
     the ``brand-page`` div.

     Returns:
         A dict keyed by network name, filled in the order listed above,
         holding whatever ``HtmlUtils.get_element_value`` yields for each.
     """
     # One template for all networks; only the icon class name varies.
     link_xpath = (
         '//div[@class="brand-page"]'
         '//ul[contains(@class,"list-unstyled")]'
         '//a/i[contains(@class,"{}")]/parent::a/@href'
     )
     links = {}
     for network in ('facebook', 'twitter', 'instagram', 'google-plus'):
         links[network] = HtmlUtils.get_element_value(
             html_doc, link_xpath.format(network))
     return links
示例#3
0
 def _get_menu_info(self, html_doc):
     """Map each menu category title to its parsed items.

     Walks every ``div`` with class ``panel panel-default`` in ``html_doc``,
     reads the category title from the panel's ``h2.panel-title`` link and,
     when that title is not the empty string, stores the result of
     ``_get_category_from_menu_info`` for the panel under that title.
     """
     menu = {}
     panels = HtmlUtils.get_elements(
         html_doc, '//div[@class="panel panel-default"]')
     for panel in panels:
         title = HtmlUtils.get_element_value(
             panel, './/h2[@class="panel-title"]/a/text()')
         if title == '':
             # Untitled panels are not recorded.
             continue
         menu[title] = self._get_category_from_menu_info(panel)
     return menu
示例#4
0
 def _get_menu_item_prices_info(self, item_node):
     """List the unit/price pairs attached to one menu item.

     Looks at the ``li`` entries of the item's hidden price block whose
     class contains ``rounded gram-price`` and does not contain
     ``price-empty``. Each entry with a non-empty unit text becomes a
     single-key dict ``{unit: price}``.

     Returns:
         A list of those dicts, in the order the entries were returned.
     """
     entries = []
     for price_node in HtmlUtils.get_elements(
             item_node,
             './/div[contains(@class,"menu-item-prices hidden")]//li[not(contains(@class,"price-empty")) and contains(@class, "rounded gram-price")]'):
         unit = HtmlUtils.get_element_value(
             price_node, './/div[@class="unit"]/text()')
         if unit == '':
             # No unit label: the price is not looked up at all.
             continue
         price = HtmlUtils.get_element_value(
             price_node, './/div[@class="price rounded"]/text()')
         entries.append({unit: price})
     return entries
示例#5
0
 def _get_category_from_menu_info(self, category_node):
     """Describe every named item inside one menu category.

     For each ``div.menu-item`` under ``category_node`` whose
     ``h3.menu-item-name`` text is not the empty string, emits a dict with
     the item's ``name`` and its ``prices`` (from
     ``_get_menu_item_prices_info``).

     Returns:
         A list of those dicts, in the order the items were returned.
     """
     described = []
     for node in HtmlUtils.get_elements(
             category_node, './/div[@class="menu-item"]'):
         name = HtmlUtils.get_element_value(
             node, './/h3[@class="menu-item-name"]/text()')
         if name == '':
             continue
         described.append({
             'name': name,
             'prices': self._get_menu_item_prices_info(node),
         })
     return described
 def _get_city_urls(self, page_html, host):
     """Return the URLs of the cities accepted by the dispensary filter.

     Parses ``page_html``, takes the links of the first ``dropdown-menu``
     list inside ``div#maincolumn`` and keeps those whose link text passes
     ``self._dispensary_filter.match_city``.

     Returns:
         A list of ``host`` concatenated with each kept link's ``href``.
     """
     document = html.fromstring(page_html)
     anchors = HtmlUtils.get_elements(
         document,
         '(//div[@id="maincolumn"]//ul[@class="dropdown-menu"])[1]/li/a')
     return [
         host + HtmlUtils.get_element_value(anchor, './@href')
         for anchor in anchors
         if self._dispensary_filter.match_city(
             HtmlUtils.get_element_value(anchor, './text()'))
     ]
 def get_about_info(self, url):
     """Fetch ``url`` and return the text of its ``store-about`` div.

     Returns:
         The value ``HtmlUtils.get_element_value`` yields for the
         ``store-about`` div text, or ``''`` when the HTTP response is not
         flagged as successful.
     """
     response = self._http_client.get(url)
     if not response.success:
         return ''
     document = html.fromstring(response.content)
     return HtmlUtils.get_element_value(
         document, "//div[@class='store-about']/text()")
示例#8
0
 def get_list_page(self, url):
     """Download ``url``, retrying until it succeeds or 100 attempts are used.

     Each attempt calls ``HtmlUtils.download_html`` with ``self.headers``.

     Returns:
         The ``(st, content)`` pair from the last attempt; ``st`` is falsy
         when every attempt failed.
     """
     # NOTE(review): ``content`` is kept global as in the original so that any
     # module-level reader keeps working -- confirm whether it can be a local.
     global content
     retry = 0
     st = 0  # falsy: fail, truthy: success
     while not st and (retry < 100):
         st, content = HtmlUtils.download_html(url, headers=self.headers)
         # Count the attempt; without this the retry bound never applies and
         # a persistently failing download loops forever.
         retry += 1
     return st, content
 def get_rank_status(self, item_node):
     """Classify the rank trend from the rank image's ``src``.

     Returns:
         ``'down'`` when the ``src`` contains ``rank-down``, ``'up'`` when it
         contains ``rank-up`` (checked in that order), otherwise ``'unkown'``.
     """
     image_src = HtmlUtils.get_element_value(
         item_node, './/div[@class="bs-product-rank-image"]/img/@src')
     for marker, status in (('rank-down', 'down'), ('rank-up', 'up')):
         if marker in image_src:
             return status
     # NOTE(review): 'unkown' is misspelled but is a runtime value that
     # consumers may match on, so it is kept as is.
     return 'unkown'
 def produce(self, state_name):
     """Return the category links found on the page for ``state_name``.

     The page URL is ``self._url`` formatted with the lower-cased state name.

     Returns:
         The ``href`` values matched under the ``w-dyn-items`` div, or an
         empty list when the HTTP response is not flagged as successful.
     """
     page_url = self._url.format(state_name.lower())
     response = self._http_client.get(page_url)
     if not response.success:
         return []
     document = html.fromstring(response.content)
     return HtmlUtils.get_elements(
         document,
         './/div[@class="w-dyn-items"]//div[@class="w-embed"]/a/@href')
 def produce(self, state_name):
     """Yield ``(store_url, host)`` pairs for every store listed in a state.

     Fetches the state page, resolves its city URLs through
     ``_get_city_urls`` and downloads each city page. Every ``href`` found
     in an ``h3``/``h4`` link under a ``*-listing`` div is yielded, prefixed
     with ``host``. Unsuccessful responses (state or city) are skipped.
     """
     state_resp, host = self._get_state_response(state_name)
     if not state_resp.success:
         return
     for city_url in self._get_city_urls(state_resp.content, host):
         city_resp = self._http_client.get(city_url)
         if not city_resp.success:
             continue
         city_doc = html.fromstring(city_resp.content)
         store_paths = HtmlUtils.get_elements(
             city_doc,
             '//div[contains(@class,"-listing")]//*[self::h3 or self::h4]/a/@href'
         )
         for store_path in store_paths:
             yield host + store_path, host
示例#12
0
    def parse(self):
        """Fetch the news listing at ``self.url`` and collect its articles.

        Each node matched by the container's news XPath supplies a title and
        an article URL. The article page is downloaded and its image URL,
        date, HTML text and plain text are extracted. Nodes without a URL, or
        whose page download comes back falsy, are skipped.

        Returns:
            ``site_result.to_dict()`` for the collected articles, or ``None``
            when the getter returns a falsy value for the listing page.
        """
        site_result = NewsSite(self.url)

        listing_html = self.html_getter.get(self.url)
        if not listing_html:
            return None

        listing_doc = html.fromstring(listing_html)
        entries = HtmlUtils.get_elements(
            listing_doc, self.xpaths_container.get_news_xpath())

        for entry in entries:
            title = self._get_title(entry)
            article_url = self._get_url(entry)
            if not article_url:
                continue

            article_html = self.html_getter.get(article_url)
            if not article_html:
                continue

            article_doc = html.fromstring(article_html)
            image_url = self._get_image_url(article_doc)
            published = self._get_date(article_doc)

            # Image and date are read first; the configured elements are
            # removed before either text variant is extracted.
            self.remove_elements(
                article_doc,
                self.xpaths_container.get_elements_to_remove_xpaths())

            body_html = self._get_text_html(article_doc)
            body_plain = self._get_text_plain(article_doc)

            site_result.add_article(
                title, article_url, image_url, published,
                body_html, body_plain)

        return site_result.to_dict()
    def get_brand_image(self, item_node):
        """Return the brand image ``src`` when it contains ``http``.

        Returns:
            The ``src`` of the image under ``div.bs-product-brand-image`` if
            the substring ``http`` occurs in it, otherwise ``''``.
        """
        src = HtmlUtils.get_element_value(
            item_node, './/div[@class="bs-product-brand-image"]//img/@src')
        if 'http' in src:
            return src
        return ''
 def get_rank(self, item_node):
     """Return the text of the ``h2`` in the node's direct ``bs-product-rank`` div."""
     rank_xpath = './div[@class="bs-product-rank"]/h2/text()'
     return HtmlUtils.get_element_value(item_node, rank_xpath)
 def get_category_name(self, html_doc):
     """Return the ``best-seller-cat-title`` heading text of the product list."""
     title_xpath = (
         '//div[@class="flex-horiz-product-list"]'
         '//h2[@class="best-seller-cat-title"]/text()'
     )
     return HtmlUtils.get_element_value(html_doc, title_xpath)
 def get_items(self, html_doc):
     """Apply ``self.parse_item`` to every product listing in the document.

     Returns:
         The result of ``map`` over the ``product-listing`` divs found under
         the ``w-dyn-list`` div (a lazy iterator on Python 3).
     """
     listing_xpath = '//div[@class="w-dyn-list"]//div[@class="product-listing"]'
     return map(
         self.parse_item, HtmlUtils.get_elements(html_doc, listing_xpath))
示例#17
0
 def _strip_quantity(self, quantity):
     if quantity is str:
         return quantity.strip()
     return HtmlUtils.get_element_value(quantity, './text()')
 def get_image_chart(self, item_node):
     """Return the ``src`` of the image inside the node's ``bs-product-chart`` div."""
     chart_xpath = './/div[@class="bs-product-chart"]//img/@src'
     return HtmlUtils.get_element_value(item_node, chart_xpath)
示例#19
0
 def _get_image_url(self, deal_node):
     """Return the deal image URL, prefixed with the leafbuyer.com origin.

     NOTE(review): the XPath starts with ``//``, which selects from the
     document root rather than from ``deal_node``; most sibling methods use
     ``.//``. Confirm this is intended.
     """
     image_path = HtmlUtils.get_element_value(
         deal_node, '//div[contains(@class, "img-block")]/a/img/@src')
     return 'https://www.leafbuyer.com' + image_path
示例#20
0
 def _get_hours(self, html_doc):
     """Return the hours mapping parsed from the first striped table.

     Returns:
         ``_get_hours_info`` applied to the first ``table.table.table-striped``
         match, or an empty dict when there is no such table.
     """
     tables = HtmlUtils.get_elements(
         html_doc, '//table[@class="table table-striped"]')
     if len(tables) == 0:
         return {}
     return self._get_hours_info(tables[0])
示例#21
0
 def _get_dispensary_address(self, deal_node):
     """Return the stripped text of the ``txt`` span inside the ``text-box`` div."""
     raw_address = HtmlUtils.get_element_value(
         deal_node, './/div[@class="text-box"]//span[@class="txt"]/text()')
     return raw_address.strip()
示例#22
0
 def _get_dispensary_phone_number(self, deal_node):
     """Return the text of the ``tel-link`` span inside the ``text-box`` div."""
     phone_xpath = './/div[@class="text-box"]//span[@class="tel-link"]/text()'
     return HtmlUtils.get_element_value(deal_node, phone_xpath)
示例#23
0
 def _get_dispensary_minimum_age(self, deal_node):
     """Return ``'21'`` when a retail icon is present, else ``'unkown'``.

     Presence means ``HtmlUtils.get_element_value`` yields something other
     than ``''`` for the ``icon-retail`` span of the ``detail-list``.

     NOTE(review): the XPath starts with ``//`` (document root, not
     ``deal_node``) and selects an element rather than text -- confirm both
     are intended. ``'unkown'`` is misspelled but kept as a runtime value.
     """
     retail_icon = HtmlUtils.get_element_value(
         deal_node,
         '//ul[@class="detail-list"]/li/span[contains(@class, "icon-retail")]'
     )
     if retail_icon != '':
         return '21'
     return 'unkown'
示例#24
0
 def is_top_deal(self, deal_node):
     """Return True when the ``deal-box`` div text lookup is not ``''``."""
     badge_text = HtmlUtils.get_element_value(
         deal_node, './/div[@class="deal-box"]/text()')
     return badge_text != ''
 def get_brand_name(self, item_node):
     """Return the text of the inner ``div`` of the ``bs-product-name`` div."""
     name_xpath = './/div[@class="bs-product-name"]/div/text()'
     return HtmlUtils.get_element_value(item_node, name_xpath)
示例#26
0
 def _get_dispensary_name(self, deal_node):
     """Return the text of the ``profile-link`` div under ``deal_node``."""
     name_xpath = './/div[@class="profile-link"]/text()'
     return HtmlUtils.get_element_value(deal_node, name_xpath)
 def get_product_price(self, item_node):
     """Return the product price text.

     Concatenates the text of the second ``h4`` of the ``bs-product-price``
     div followed by the text of the first ``h4``.
     """
     h4_text_xpath = './/div[@class="bs-product-price"]/h4[%d]/text()'
     # Queried in the same order as before: second heading, then first.
     amount, unit = [
         HtmlUtils.get_element_value(item_node, h4_text_xpath % position)
         for position in (2, 1)
     ]
     return amount + unit
示例#28
0
 def _get_dispensary_url(self, deal_node):
     """Return the dispensary link, prefixed with the leafbuyer.com origin.

     The path is the ``href`` of the link inside the ``strong`` element of
     the ``loc-name-addr`` div.
     """
     profile_path = HtmlUtils.get_element_value(
         deal_node, './/div[@class="loc-name-addr"]/strong/a/@href')
     return 'https://www.leafbuyer.com' + profile_path
示例#29
0
 def _has_menu(self, html_doc):
     """Return True when a ``div`` whose id contains ``cpg-menu`` exists."""
     menu_nodes = HtmlUtils.get_elements(
         html_doc, '//div[contains(@id,"cpg-menu")]')
     return len(menu_nodes) > 0
示例#30
0
 def _get_deal_name(self, deal_node):
     """Return the text of the ``h1`` inside the ``text-wrap`` div."""
     title_xpath = './/div[@class="text-wrap"]/h1/text()'
     return HtmlUtils.get_element_value(deal_node, title_xpath)