def generate_category_urls(self):
    """Yield the paginated URL of every category listed in the site nav.

    Fetches the home page, extracts every category link under the ``#nav``
    menu, then visits each category page and reads its total page count
    from the ``div.cur`` pager text (formatted like ``"1/12"``).  Yields
    ``<category_url>/<page>.html`` for each page number.

    Falls back to a single page when the pager is missing or malformed
    instead of aborting the crawl.
    """
    html = retry_get_html(self.domain)
    parser = etree.HTML(html)
    category_hrefs = parser.xpath('//*[@id="nav"]/div/div/ul//a/@href')
    # Resolve possibly-relative hrefs against the site domain.
    category_urls = [urljoin(self.domain, href) for href in category_hrefs]
    for url in category_urls:
        parser = etree.HTML(retry_get_html(url))
        try:
            # Pager text looks like "current/total"; take the total.
            page_str = parser.xpath('//div[@class="cur"]/text()')[0]
            num = int(page_str.split('/')[1])
        except (AttributeError, IndexError, ValueError):
            # AttributeError: page failed to parse (parser is None);
            # IndexError/ValueError: pager absent or not "x/y"-shaped.
            # Best effort: assume a single page.
            num = 1
        for page_num in range(1, num + 1):
            yield url + '/%d.html' % page_num
def generate_category_urls(self):
    """Yield every paginated category URL discovered from the site nav.

    The home page's ``#nav`` menu supplies the category links; each
    category page's ``div.cur`` pager text ("current/total") supplies the
    page count, defaulting to 1 when it cannot be read.  Yields URLs of
    the form ``<category_url>/<page>.html``.
    """
    category_xpath_str = '//*[@id="nav"]/div/div/ul//a/@href'
    nav_parser = etree.HTML(retry_get_html(self.domain))
    # Resolve each nav href against the domain, then page through it.
    resolved = [urljoin(self.domain, href)
                for href in nav_parser.xpath(category_xpath_str)]
    for url in resolved:
        page_parser = etree.HTML(retry_get_html(url))
        try:
            pager_text = page_parser.xpath('//div[@class="cur"]/text()')[0]
            total_pages = int(pager_text.split('/')[1])
        except Exception:
            # Best effort: no readable pager means a single page.
            total_pages = 1
        for page_num in range(1, total_pages + 1):
            yield url + '/%d.html' % page_num
def generate_item_urls(self):
    """Walk every category listing page and yield each product's absolute URL.

    Iterates the paginated category URLs from ``generate_category_urls``,
    pulls the product links out of the ``#prod_list`` grid on each page,
    and resolves them against the site domain.
    """
    href_xpath = """//div[@id="prod_list"]//a[@class="pic_box"]/@href"""
    for category_url in self.generate_category_urls():
        listing = etree.HTML(retry_get_html(category_url))
        for href in listing.xpath(href_xpath):
            yield urljoin(self.domain, href)
def html(self):
    """Fetch and return the raw HTML of this object's ``url``."""
    page_source = retry_get_html(self.url)
    return page_source