def parse(self, response):
    """Entry point: walk the two-level top navigation menu and schedule one
    product-list Request per (category-0, category-1) pair."""
    metadata = response.meta['userdata']
    sel = Selector(response)
    # Top-level menu items that carry both a labelled link and a submenu.
    nav_nodes = sel.xpath(
        '//div[@id="menu"]/ul/li[child::a[@href][text()]][child::div[@class="submenuMask"]]'
    )
    for node in nav_nodes:
        try:
            tag_text = node.xpath('./a[@href][text()]/text()').extract()[0]
            tag_text = self.reformat(tag_text)
            tag_name = tag_text.lower()
        except (TypeError, IndexError):
            # Malformed nav entry: skip it rather than abort the page.
            continue
        if tag_text and tag_name:
            # Deep copy so sibling categories don't share tag state.
            m = copy.deepcopy(metadata)
            m['tags_mapping']['category-0'] = [{
                'name': tag_name,
                'title': tag_text
            }]
            gender = common.guess_gender(tag_name)
            if gender:
                m['gender'] = [gender]
            sub_nodes = node.xpath(
                './div[@class="submenuMask"]/ul/li/a[@href][text()]')
            for sub_node in sub_nodes:
                try:
                    tag_text = sub_node.xpath('./text()').extract()[0]
                    tag_text = self.reformat(tag_text)
                    tag_name = tag_text.lower()
                except (TypeError, IndexError):
                    continue
                if tag_text and tag_name:
                    mc = copy.deepcopy(m)
                    mc['tags_mapping']['category-1'] = [{
                        'name': tag_name,
                        'title': tag_text
                    }]
                    gender = common.guess_gender(tag_name)
                    if gender:
                        mc['gender'] = [gender]
                    try:
                        href = sub_node.xpath('./@href').extract()[0]
                        href = self.process_href(href, response.url)
                    except (TypeError, IndexError):
                        continue
                    yield Request(url=href,
                                  callback=self.parse_product_list,
                                  errback=self.onerr,
                                  meta={'userdata': mc})
def parse_gender(self, response):
    """Handle the gender-switch widget: schedule a Request for each
    non-selected gender, record the selected gender on the current
    metadata, then continue parsing this page as a level-1 category."""
    metadata = response.meta['userdata']
    sel = Selector(response)
    node_list = sel.xpath('//div[contains(@class,"switchGender")]')
    if node_list:
        # Links for the genders that are NOT currently selected.
        for node in node_list[0].xpath(
                './ul/li/a[@href and @class="notSelGender"]'):
            try:
                tmp = self.reformat(
                    node.xpath('text()').extract()[0]).lower()
            except (TypeError, IndexError):
                continue
            m = copy.deepcopy(metadata)
            gender = cm.guess_gender(tmp)
            if gender:
                m['gender'] = [gender]
            yield Request(url=self.process_href(
                node.xpath('@href').extract()[0], response.url),
                callback=self.parse_cat1,
                errback=self.onerr,
                meta={'userdata': m})
        # The currently selected gender applies to this page's metadata.
        try:
            tmp = self.reformat(node_list[0].xpath(
                './ul/li/span[@class="selGender"]/text()').extract()
                [0]).lower()
            gender = cm.guess_gender(tmp)
            if gender:
                metadata['gender'] = [gender]
        except (TypeError, IndexError):
            pass
    # Always continue parsing the current page as a category page.
    for val in self.parse_cat1(response):
        yield val
def parse(self, response):
    """Walk the global nav (two levels) and schedule one product-list
    Request per second-level entry."""
    metadata = response.meta['userdata']
    sel = Selector(response)
    nav_nodes = sel.xpath('//div[contains(@class, "global-nav")]/ul/li')
    for node in nav_nodes:
        try:
            # The top-level label may be split over several text nodes.
            tag_text = ' '.join(node.xpath('./a//text()').extract())
            tag_text = self.reformat(tag_text)
            tag_name = tag_text.lower()
        except (TypeError, IndexError):
            continue
        if tag_text and tag_name:
            m = copy.deepcopy(metadata)
            m['tags_mapping']['category-0'] = [{
                'name': tag_name,
                'title': tag_text,
            }]
            gender = common.guess_gender(tag_name)
            if gender:
                m['gender'] = [gender]
            # This excludes the links inside the last nav entry,
            # which contains no products.
            sub_nodes = node.xpath(
                './div/div/ul/li[child::a[text()][@href]]')
            for sub_node in sub_nodes:
                try:
                    tag_text = sub_node.xpath('./a/text()').extract()[0]
                    tag_text = self.reformat(tag_text)
                    tag_name = tag_text.lower()
                except (TypeError, IndexError):
                    continue
                if tag_text and tag_name:
                    mc = copy.deepcopy(m)
                    mc['tags_mapping']['category-1'] = [{
                        'name': tag_name,
                        'title': tag_text,
                    }]
                    gender = common.guess_gender(tag_name)
                    if gender:
                        mc['gender'] = [gender]
                    try:
                        href = sub_node.xpath('./a/@href').extract()[0]
                        href = self.process_href(href, response.url)
                    except (TypeError, IndexError):
                        continue
                    yield Request(url=href,
                                  callback=self.parse_product_list,
                                  errback=self.onerr,
                                  meta={'userdata': mc})
def parse(self, response):
    """Walk the pre-footer navigation (headings + their link lists) and
    schedule one filter-parse Request per second-level entry."""
    metadata = response.meta['userdata']
    sel = Selector(response)
    nav_nodes = sel.xpath('//div[@id="pre-footer"]/ul[@class="nav"]/li[child::h4[text()]]')
    for node in nav_nodes:
        try:
            # Heading text may be split over several nodes; join them.
            tag_text = ''.join(
                self.reformat(val) for val in node.xpath('./h4//text()').extract()
            )
            tag_text = self.reformat(tag_text)
            tag_name = tag_text.lower()
        except(TypeError, IndexError):
            continue
        if tag_text and tag_name:
            m = copy.deepcopy(metadata)
            m['tags_mapping']['category-0'] = [
                {'name': tag_name, 'title': tag_text, },
            ]
            gender = common.guess_gender(tag_name)
            if gender:
                m['gender'] = [gender]
            sub_nodes = node.xpath('./ul/li[child::a[@href][text()]]')
            for sub_node in sub_nodes:
                try:
                    tag_text = sub_node.xpath('./a/text()').extract()[0]
                    tag_text = self.reformat(tag_text)
                    tag_name = tag_text.lower()
                except(TypeError, IndexError):
                    continue
                if tag_text and tag_name:
                    mc = copy.deepcopy(m)
                    mc['tags_mapping']['category-1'] = [
                        {'name': tag_name, 'title': tag_text, },
                    ]
                    gender = common.guess_gender(tag_name)
                    if gender:
                        mc['gender'] = [gender]
                    try:
                        href = sub_node.xpath('./a/@href').extract()[0]
                        href = self.process_href(href, response.url)
                    except(TypeError, IndexError):
                        continue
                    yield Request(url=href,
                                  callback=self.parse_filter,
                                  errback=self.onerr,
                                  meta={'userdata': mc})
def parse(self, response):
    """Two-level main navigation: level 0 comes from the mainNavi links,
    level 1 from each item's nav_category_list entries."""
    metadata = response.meta['userdata']
    sel = Selector(response)
    top_items = sel.xpath(
        '//ul[@class="mainNavi"]//li[contains(@class,"mainNavi_item")]')
    for top in top_items:
        labels = top.xpath(
            './/a[@href and contains(@class,"mainNavi_link")]/span/text()'
        ).extract()
        try:
            top_title = self.reformat(labels[0])
            top_name = top_title.lower()
        except (IndexError, TypeError):
            continue
        level0 = copy.deepcopy(metadata)
        level0['tags_mapping']['category-0'] = [{'title': top_title,
                                                 'name': top_name}]
        gender = cm.guess_gender(top_name)
        if gender:
            level0['gender'] = [gender]
        for sub in top.xpath('.//ul[@class="nav_category_list"]/li/a[@href]'):
            target = self.process_href(sub.xpath('@href').extract()[0],
                                       response.url)
            sub_labels = sub.xpath('./span/text()').extract()
            try:
                sub_title = self.reformat(sub_labels[0])
                sub_name = sub_title.lower()
            except (IndexError, TypeError):
                continue
            level1 = copy.deepcopy(level0)
            level1['tags_mapping']['category-1'] = [{'title': sub_title,
                                                     'name': sub_name}]
            yield Request(url=target, callback=self.parse_list,
                          errback=self.onerr, meta={'userdata': level1})
def parse(self, response):
    """Each anchor carrying a data-cat attribute becomes a category-0 tag
    and is scheduled for category parsing."""
    metadata = response.meta['userdata']
    sel = Selector(response)
    for anchor in sel.xpath('//ul/li/a[@href and @data-cat]'):
        try:
            title = self.reformat(anchor.xpath('@data-cat').extract()[0])
            name = title.lower()
            target = self.process_href(anchor.xpath('@href').extract()[0],
                                       response.url)
        except (IndexError, TypeError):
            continue
        tagged = copy.deepcopy(metadata)
        tagged['tags_mapping']['category-0'] = [{'title': title,
                                                 'name': name}]
        gender = cm.guess_gender(name)
        if gender:
            tagged['gender'] = [gender]
        yield Request(callback=self.parse_cat, errback=self.onerr,
                      meta={'userdata': tagged}, url=target)
def parse(self, response): metadata = response.meta['userdata'] #处理常规部分 link_extractor = SgmlLinkExtractor(restrict_xpaths=('//div[@class="linksList"]//a')) links = link_extractor.extract_links(response) for link in links: m = copy.deepcopy(metadata) url = link.url cat_title = link.text cat_name = cat_title.lower() m['tags_mapping']['category-0'] = [{'title': cat_title, 'name': cat_name}] gender = cm.guess_gender(cat_name) if gender: m['gender'] = [gender] yield Request(url=url, callback=self.parse_cat, errback=self.onerr, meta={'userdata': m}) #处理区域特别部分 region = metadata['region'] if region == 'jp': extra_urls = [ 'http://www.paulsmith.co.jp/shop/gifts/products', 'http://www.paulsmith.co.jp/shop/reserve/products', 'http://www.paulsmith.co.jp/shop/sales/products', 'http://www.paulsmith.co.jp/shop/paulsmithcollection/products' ] for url in extra_urls: m = copy.deepcopy(metadata) yield Request(url=url, callback=self.parse_cat, errback=self.onerr, meta={'userdata': m}) else: extra_urls = [ 'http://www.paulsmith.co.uk/%s-en/shop/valentines-day-gifts/valentines-day-gifts-for-her' % region, 'http://www.paulsmith.co.uk/%s-en/shop/valentines-day-gifts/valentines-day-gifts-for-him' % region, ] for url in extra_urls: m = copy.deepcopy(metadata) yield Request(url=url, callback=self.parse_cat, errback=self.onerr, meta={'userdata': m})
def parse_cat(self, response):
    """Visit every main-nav entry except the first; each becomes a
    category-0 tag whose product list page is then scheduled."""
    metadata = response.meta['userdata']
    sel = Selector(response)
    for link_node in sel.xpath(
            '//ul[@id="main-nav"]/li[position()>1]/a[@href][text()]'):
        try:
            title = self.reformat(link_node.xpath('./text()').extract()[0])
            name = title.lower()
        except (TypeError, IndexError):
            continue
        if not (title and name):
            continue
        tagged = copy.deepcopy(metadata)
        tagged['tags_mapping']['category-0'] = [
            {'name': name, 'title': title},
        ]
        gender = common.guess_gender(name)
        if gender:
            tagged['gender'] = [gender]
        try:
            target = self.process_href(
                link_node.xpath('./@href').extract()[0], response.url)
        except (TypeError, IndexError):
            continue
        yield Request(url=target, callback=self.parse_product_list,
                      errback=self.onerr, meta={'userdata': tagged})
def parse_procut_list(self, response):
    """Parse a product listing page and schedule one Request per product.

    NOTE: the method name is misspelled ("procut") but is kept unchanged
    because sibling callbacks invoke it by this exact name.

    Fixed: the name/href ``extract()[0]`` calls are now guarded with
    try/except like the sibling callbacks, so one malformed product node
    no longer aborts the whole listing page with IndexError.
    """
    metadata = response.meta['userdata']
    sel = Selector(response)
    product_nodes = sel.xpath('//div[@class="category-view"]/div/a')
    for node in product_nodes:
        m = copy.deepcopy(metadata)
        name_node = node.xpath('.//h3[text()]')
        if name_node:
            try:
                name = self.reformat(
                    name_node.xpath('./text()').extract()[0])
            except (TypeError, IndexError):
                name = None
            if name:
                m['name'] = name
                gender = common.guess_gender(name, extra={
                    'male': [],
                    'female': ['lady']
                })
                if gender:
                    m['gender'] = [gender]
        try:
            href = node.xpath('./@href').extract()[0]
            href = self.process_href(href, response.url)
        except (TypeError, IndexError):
            # No usable link: skip this product only.
            continue
        yield Request(url=href,
                      callback=self.parse_product,
                      errback=self.onerr,
                      meta={'userdata': m},
                      dont_filter=True)
def parse(self, response):
    """Schedule one level-1 parse per global-nav entry; the link title
    becomes the category-0 tag."""
    metadata = response.meta['userdata']
    sel = Selector(response)
    nav_links = sel.xpath(
        '//ul[@id="global-nav" or @id="rl-globalnav"]/li/a[@title and @href]')
    for link in nav_links:
        try:
            title = self.reformat(link.xpath('@title').extract()[0])
            name = title.lower()
        except (IndexError, TypeError):
            continue
        tagged = copy.deepcopy(metadata)
        tagged['tags_mapping']['category-0'] = [{'name': name,
                                                 'title': title}]
        gender = cm.guess_gender(name)
        if gender:
            tagged['gender'] = [gender]
        target = self.process_href(link.xpath('@href').extract()[0],
                                   response.url)
        yield Request(url=target, callback=self.parse_1,
                      errback=self.onerr, meta={'userdata': tagged},
                      dont_filter=True)
def parse(self, response):
    """Proceed only on two-letter-region burberry.com hosts; each level-1
    sidebar link becomes a category-1 tag.

    Fixed: the regex match result was stored in ``m`` and then shadowed
    by the metadata deep copy inside the loop — renamed to
    ``region_match`` for clarity. Attribute extraction is also guarded so
    a malformed sidebar entry skips instead of raising IndexError.
    """
    metadata = response.meta['userdata']
    region_match = re.search(r'([a-zA-Z]{2})\.burberry\.com', response.url)
    if region_match:
        hxs = Selector(response)
        for item in hxs.xpath(
                "//div[@id='shared_sidebar']//div[@id='nav']//ul[@class='l-1-set']//li[@class='l-1-link "
                "l-1-link-open']//li/a[@href and @title]"):
            try:
                href = item.xpath('@href').extract()[0]
                # TODO What is cat?
                cat = self.reformat(re.sub(r'/', '', href)).lower()
                title = self.reformat(item.xpath('@title').extract()[0])
            except (TypeError, IndexError):
                continue
            m = copy.deepcopy(metadata)
            m['tags_mapping']['category-1'] = [{
                'name': cat,
                'title': title
            }]
            gender = cm.guess_gender(cat)
            if gender:
                m['gender'] = [gender]
            yield Request(url=self.process_href(href, response.url),
                          meta={'userdata': m},
                          dont_filter=True,
                          callback=self.parse_category_1,
                          errback=self.onerr)
def parse(self, response):
    """Follow each linked entry in the sidebar's first column; each link
    becomes a category-0 tag."""
    metadata = response.meta['userdata']
    sel = Selector(response)
    # Sidebar first-column nodes that carry a link.
    for link in sel.xpath('//div[@id="sidebar"]/ul/ul/li/a[@href]'):
        tagged = copy.deepcopy(metadata)
        try:
            title = self.reformat(link.xpath('./text()').extract()[0])
            name = title.lower()
        except (TypeError, IndexError):
            continue
        if not (title and name):
            continue
        tagged['tags_mapping']['category-0'] = [
            {'name': name, 'title': title, },
        ]
        gender = common.guess_gender(name)
        if gender:
            tagged['gender'] = [gender]
        try:
            target = self.process_href(link.xpath('./@href').extract()[0],
                                       response.url)
        except (TypeError, IndexError):
            continue
        yield Request(url=target, callback=self.parse_filter1,
                      errback=self.onerr, meta={'userdata': tagged})
def parse(self, response):
    """Walk the header menu (two levels). Level-1 gender guesses are
    merged with any gender inherited from level 0 rather than replacing it."""
    metadata = response.meta['userdata']
    sel = Selector(response)
    for node1 in sel.xpath('//li/a[contains(@class,"header_menu")]'):
        try:
            tmp = node1.xpath('./*/text()').extract()
            cat_title = self.reformat(tmp[0])
            cat_name = cat_title.lower()
        except (IndexError, TypeError):
            continue
        m1 = copy.deepcopy(metadata)
        m1['tags_mapping']['category-0'] = [{
            'title': cat_title,
            'name': cat_name
        }]
        gender = cm.guess_gender(cat_name)
        if gender:
            m1['gender'] = [gender]
        # Submenu column headers are siblings of the anchor, hence '../'.
        for node2 in node1.xpath(
                '../div[contains(@class,"submenu")]/ul/li[contains(@class,"title_column")]/a[@href]'
        ):
            try:
                tmp = node2.xpath('./*/text()').extract()
                cat_title = self.reformat(tmp[0])
                cat_name = cat_title.lower()
            except (IndexError, TypeError):
                continue
            m2 = copy.deepcopy(m1)
            m2['tags_mapping']['category-1'] = [{
                'title': cat_title,
                'name': cat_name
            }]
            gender = cm.guess_gender(cat_name)
            if gender:
                # Merge into the inherited gender set instead of overwriting.
                if 'gender' in m2 and m2['gender']:
                    tmp = set(m2['gender'])
                    tmp.add(gender)
                    m2['gender'] = list(tmp)
                else:
                    m2['gender'] = [gender]
            yield Request(url=self.process_href(
                node2.xpath('@href').extract()[0], response.url),
                callback=self.parse_grid,
                errback=self.onerr,
                meta={'userdata': m2})
def parse_collection(self, response):
    """Parse a collections overview: each button links to a collection
    (tagged category-1). Afterwards, either follow a view-all link or
    fall back to the watch-finder parser."""
    metadata = response.meta['userdata']
    sel = Selector(response)
    collection_nodes = sel.xpath(
        '//table[@id="top-watches-list"]//tr[@class="top-list-buttons"]/td/a'
    )
    for node in collection_nodes:
        m = copy.deepcopy(metadata)
        try:
            tag_text = node.xpath('./@title').extract()[0]
            tag_text = self.reformat(tag_text)
            tag_name = tag_text.lower()
        except (TypeError, IndexError):
            continue
        if tag_text and tag_name:
            m['tags_mapping']['category-1'] = [
                {
                    'name': tag_name,
                    'title': tag_text,
                },
            ]
            gender = common.guess_gender(tag_name)
            if gender:
                m['gender'] = [gender]
            try:
                href = node.xpath('./@href').extract()[0]
                href = self.process_href(href, response.url)
            except (TypeError, IndexError):
                continue
            yield Request(url=href,
                          callback=self.parse_product_list_collection,
                          errback=self.onerr,
                          meta={'userdata': m})
    # This may be either a men's/women's collection page or the
    # watch-finder page.
    view_all_node = sel.xpath('//div[@id="l-gender-teaser"]//a[@href]')
    if view_all_node:
        try:
            href = view_all_node.xpath('./@href').extract()[0]
            href = self.process_href(href, response.url)
            yield Request(url=href,
                          callback=self.parse_product_list_collection,
                          errback=self.onerr,
                          meta={'userdata': metadata})
        except (TypeError, IndexError):
            pass
    else:
        for val in self.parse_product_list_watchesfinder(response):
            yield val
def parse_details(self, response):
    """Parse a product detail page into a ProductItem. Returns None
    (skips the product) when no model number can be extracted."""
    metadata = response.meta['userdata']
    sel = Selector(response)
    model = self.fetch_model(response)
    if model:
        metadata['model'] = model
    else:
        # Without a model number the product cannot be keyed; skip it.
        return
    ret = self.fetch_price(response)
    if 'price' in ret:
        metadata['price'] = ret['price']
    if 'price_discount' in ret:
        metadata['price_discount'] = ret['price_discount']
    name = self.fetch_name(response)
    if name:
        metadata['name'] = name
    colors = self.fetch_color(response)
    if colors:
        metadata['color'] = colors
    description = self.fetch_description(response)
    if description:
        metadata['description'] = description
    image_urls = []
    for image_node in sel.xpath(
            '//article[@class="product"]/figure[@class="slider"]/img[@data-zoom-url]'
    ):
        tmp = image_node.xpath('./@data-zoom-url').extract()
        if tmp:
            # When the zoom URL is the shared placeholder image, fall back
            # to @src, accepting it only if it looks like an image file.
            if tmp[0] == '/static_assets/images/products/placeholders/standard.jpg':
                tmp = image_node.xpath('./@src').extract()
                if tmp and re.search(
                        r'\.(jpg|png|jpeg)', tmp[0], flags=re.IGNORECASE):
                    image_urls.append(
                        self.process_href(tmp[0], response.url))
            else:
                image_urls.append(self.process_href(tmp[0], response.url))
    # Gender is guessed from the top-level category name.
    gender = cm.guess_gender(
        metadata['tags_mapping']['category-0'][0]['name'])
    if gender:
        metadata['gender'] = [gender]
    metadata['url'] = response.url
    item = ProductItem()
    item['image_urls'] = image_urls
    item['url'] = metadata['url']
    item['model'] = metadata['model']
    item['metadata'] = metadata
    return item
def parse_base(self, response, xpath_dict, sel=None, metadata=None, cat_level=0, is_leaf=None):
    """Generic depth-first category-tree walker.

    @param is_leaf: function deciding whether the current node is a leaf
    @param response:
    @param xpath_dict: shape: {'cat_level_0': [xpath, xpath_extra],
        'cat_level_1': xpath, ... 'cat_level_extra': xpath}
    @param sel:
    @param metadata:
    @param cat_level:
    """
    if not metadata:
        metadata = response.meta['userdata']
    if not sel:
        sel = Selector(response)
    if not is_leaf:
        is_leaf = lambda x: False
    if cat_level == 0:
        # Level 0 may supply a second, optional xpath.
        xpath, xpath_extra = xpath_dict['cat_level_0']
        node_list = sel.xpath(xpath)
        if xpath_extra:
            node_list.extend(sel.xpath(xpath_extra))
    else:
        # Use the level-specific xpath if present, otherwise the catch-all.
        cat_key = str.format('cat_level_{0}', cat_level)
        if cat_key in xpath_dict:
            xpath = xpath_dict[cat_key]
        else:
            xpath = xpath_dict['cat_level_extra']
        node_list = sel.xpath(xpath)
    if node_list and not is_leaf(sel):
        # Depth-first recursion: descend into the next category level.
        for node in node_list:
            try:
                tag_title = self.reformat(node.xpath('text()').extract()[0])
                tag_name = tag_title.lower()
            except (TypeError, IndexError):
                continue
            m1 = copy.deepcopy(metadata)
            if cat_level == 0:
                gender = cm.guess_gender(tag_name)
                if gender:
                    m1['gender'] = [gender]
            m1['tags_mapping'][str.format('category-{0}', cat_level)] = [{'name': tag_name, 'title': tag_title}]
            for val in self.parse_base(response, xpath_dict, node, m1, cat_level + 1, is_leaf=is_leaf):
                yield val
    else:
        # Reached a leaf node: hand off to the region-specific callback.
        tmp = sel.xpath('@href').extract()
        if tmp:
            yield Request(url=self.process_href(tmp[0], response.url),
                          callback=self.spider_data['callbacks'][metadata['region']][1],
                          errback=self.onerr,
                          meta={'userdata': metadata})
def parse(self, response, metadata=None, current_node=None, level=0):
    """Recursive sidebar walker. A node that reads as a gender keeps the
    same category level (it sets gender instead of a tag); otherwise the
    node becomes a category-{level} tag and the level is incremented."""
    if not metadata:
        metadata = response.meta['userdata']
    sel = Selector(response)
    if current_node:
        # Siblings of the node we descended from.
        node_list = current_node.xpath('../ul/li/a[@href]')
    else:
        node_list = sel.xpath(
            '//*[@id="sidebarMenu"]/ul/li[contains(@class,"selected")]/a[@href]'
        )
    if node_list:
        for node1 in node_list:
            try:
                tag_text = self.reformat(
                    node1.xpath('text()').extract()[0])
                tag_name = tag_text.lower()
            except (IndexError, TypeError):
                continue
            m1 = copy.deepcopy(metadata)
            gender = cm.guess_gender(tag_text)
            if gender:
                # Gender nodes don't consume a category level.
                m1['gender'] = [gender]
                new_level = level
            else:
                m1['tags_mapping'][str.format('category-{0}', level)] = [{
                    'name': tag_name,
                    'title': tag_text
                }]
                new_level = level + 1
            for val in self.parse(response, m1, node1, new_level):
                yield val
    else:
        prod_list = sel.xpath('//*[@id="elementsContainer"]')
        if prod_list:
            # Reached a product listing page.
            for val in self.parse_list(response, metadata):
                yield val
        else:
            # No children and no products here: follow the node's link.
            try:
                url = self.process_href(
                    current_node.xpath('@href').extract()[0], response.url)
                yield Request(url=url,
                              callback=self.parse_list,
                              errback=self.onerr,
                              meta={'userdata': metadata})
            except (IndexError, TypeError):
                pass
def parse_sub_nav(self, response):
    """Handle second-level categories (some categories, e.g. men, have them).

    Fixed: the tag-text and href ``extract()[0]`` calls were unguarded —
    unlike every sibling callback — so one malformed nav node aborted the
    whole page with IndexError. They are now wrapped in try/except, and
    href extraction is attempted even though the xpath does not assert
    ``@href`` on the anchor.
    """
    metadata = response.meta['userdata']
    sel = Selector(response)
    sub_nav_nodes = sel.xpath(
        '//div[@id="main"]/div/div[contains(@class, "navigation")]/ul/li/ul/li/ul/li/a[text()]'
    )
    for sub_node in sub_nav_nodes:
        m = copy.deepcopy(metadata)
        try:
            tag_text = self.reformat(
                sub_node.xpath('./text()').extract()[0])
            tag_name = tag_text.lower()
        except (TypeError, IndexError):
            continue
        if tag_text and tag_name:
            m['tags_mapping']['category-1'] = [
                {
                    'name': tag_name,
                    'title': tag_text,
                },
            ]
            gender = common.guess_gender(tag_name, extra={
                'male': [],
                'female': ['lady']
            })
            if gender:
                m['gender'] = [gender]
            try:
                href = sub_node.xpath('./@href').extract()[0]
                href = self.process_href(href, response.url)
            except (TypeError, IndexError):
                continue
            yield Request(url=href,
                          callback=self.parse_third_nav,
                          errback=self.onerr,
                          meta={'userdata': m})
    # The current page may itself already list products.
    for val in self.parse_procut_list(response):
        yield val
def parse_productList(self, response):
    '''
    Parse the product list.
    '''
    metadata = response.meta['userdata']
    sel = Selector(response)
    product_list_nodes = sel.xpath('//div[@class="models-list"]//li')
    for node in product_list_nodes:
        # The try wraps the whole entry so a malformed node skips only
        # that product.
        try:
            m = copy.deepcopy(metadata)
            model_node = node.xpath('.//h5')
            if model_node:
                # NOTE(review): extract() on the <div> yields its outer
                # HTML, not bare text — presumably reformat() tolerates
                # the markup; verify against reformat()'s implementation.
                model = model_node.xpath('.//div').extract()[0]
                model = self.reformat(model)
                if model:
                    m['model'] = model
                else:
                    continue
            else:
                continue
            name_node = node.xpath('.//h5')
            if name_node:
                nameText = name_node.xpath('./text()').extract()[0]
                nameText = self.reformat(nameText)
                if nameText:
                    m['name'] = nameText
            # NOTE(review): assumes metadata already carries a 'name' key
            # when this page provides none; a missing key would raise
            # KeyError, which is not caught below — confirm upstream.
            if m['name']:
                gender = common.guess_gender(m['name'])
                if gender:
                    m['gender'] = [gender]
            href = node.xpath('.//a/@href').extract()[0]
            href = self.process_href(href, response.url)
            yield Request(url=href,
                          callback=self.parse_product,
                          errback=self.onerr,
                          meta={'userdata': m})
        except (TypeError, IndexError):
            continue
def parse(self, response):
    """Walk the two-level main menu.

    Fixed: the level-1 metadata was built from a fresh copy of the page
    metadata (``copy.deepcopy(metadata)``), which silently dropped the
    category-0 tag and guessed gender already stored on ``m1``. It now
    copies ``m1`` so level-1 entries inherit their parent's tags, matching
    every other two-level walker in this file.
    """
    sel = Selector(response)
    metadata = response.meta['userdata']
    for node1 in sel.xpath(
            '//nav[@id="mainMenu"]/ul[contains(@class, "menuHeader") and '
            'contains(@class, "firstLevel")]/li'):
        tag_text = None
        # The label is carried either in a data attribute or in link text.
        if 'data-main-menu' in node1._root.attrib:
            tag_text = self.reformat(
                unicodify(node1._root.attrib['data-main-menu']))
        else:
            tmp = node1.xpath('./a[@href]')
            if tmp:
                tag_text = self.reformat(unicodify(tmp[0]._root.text))
        if not tag_text:
            continue
        m1 = copy.deepcopy(metadata)
        m1['tags_mapping']['category-0'] = [{
            'name': tag_text.lower(),
            'title': tag_text
        }]
        gender = cm.guess_gender(tag_text.lower())
        if gender:
            m1['gender'] = [gender]
        for node2 in node1.xpath(
                './ul[contains(@class,"secondLevel")]/li/a[@href]'):
            tag_text = self.reformat(unicodify(node2._root.text))
            if not tag_text:
                continue
            # Inherit category-0/gender from m1 (was: deepcopy(metadata)).
            m2 = copy.deepcopy(m1)
            m2['tags_mapping']['category-1'] = [{
                'name': tag_text.lower(),
                'title': tag_text
            }]
            m2['category'] = [tag_text.lower()]
            yield Request(url=self.process_href(node2._root.attrib['href'],
                                                response.url),
                          callback=self.parse_cat1,
                          errback=self.onerr,
                          meta={'userdata': m2})
def parse_left_nav_collection(self, response):
    """Parse the collections shown to the right of the current category in
    the left column; they are treated as children (category-1) of the
    current category.

    Fixed: the href ``extract()[0]`` was outside the try/except, so an
    anchor without @href raised IndexError and aborted the page. It is
    now guarded like the title extraction.
    """
    metadata = response.meta['userdata']
    sel = Selector(response)
    sub_nodes = sel.xpath(
        '//div[@id="col_colizq"]/div[@id="col_list"]/ul/li[child::a[text()]]'
    )
    for sub_node in sub_nodes:
        m = copy.deepcopy(metadata)
        try:
            tag_text = self.reformat(
                sub_node.xpath('./a/text()').extract()[0])
            tag_name = tag_text.lower()
        except (TypeError, IndexError):
            continue
        if tag_text and tag_name:
            m['tags_mapping']['category-1'] = [
                {
                    'name': tag_name,
                    'title': tag_text,
                },
            ]
            gender = common.guess_gender(tag_name, {
                'male': [],
                'female': [u'少女']
            })
            if gender:
                m['gender'] = [gender]
            try:
                href = sub_node.xpath('./a/@href').extract()[0]
                href = self.process_href(href, response.url)
            except (TypeError, IndexError):
                continue
            yield Request(url=href,
                          callback=self.parse_product_list,
                          errback=self.onerr,
                          meta={'userdata': m})
def parse_cat2_us(self, response):
    """US site: each category tile (its image title) becomes a category-2
    tag; the current page is also parsed as a product list."""
    metadata = response.meta['userdata']
    sel = Selector(response)
    tiles = sel.xpath(
        '//div[@id="content"]/div[@id="categories"]/div[contains(@class, "category")]/a[@href][child::img[@title]]'
    )
    for tile in tiles:
        try:
            title = self.reformat(tile.xpath('./img/@title').extract()[0])
            name = title.lower()
        except (TypeError, IndexError):
            continue
        if not (title and name):
            continue
        tagged = copy.deepcopy(metadata)
        tagged['tags_mapping']['category-2'] = [
            {'name': name, 'title': title, },
        ]
        gender = cm.guess_gender(name)
        if gender:
            tagged['gender'] = [gender]
        try:
            raw = tile.xpath('./@href').extract()[0]
            target = self.process_href_for_us(
                self.process_href(raw, response.url))
        except (TypeError, IndexError):
            continue
        yield Request(url=target, callback=self.parse_product_list_us,
                      errback=self.onerr, meta={'userdata': tagged})
    for val in self.parse_product_list_us(response):
        yield val
def parse_filter2(self, response):
    """Some categories expose a second-level filter, e.g.
    http://usa.agnesb.com/en/shopping_online/tous-produits/accessories/women-1
    Each filter entry becomes a category-2 tag; the page itself is then
    parsed as a product list."""
    metadata = response.meta['userdata']
    sel = Selector(response)
    for link in sel.xpath('//div[@id="sidebar"]/ul/ul/ul/ul/li/a[@href]'):
        tagged = copy.deepcopy(metadata)
        try:
            title = self.reformat(link.xpath('./text()').extract()[0])
            name = title.lower()
        except (TypeError, IndexError):
            continue
        if not (title and name):
            continue
        tagged['tags_mapping']['category-2'] = [
            {'name': name, 'title': title, },
        ]
        gender = common.guess_gender(name)
        if gender:
            tagged['gender'] = [gender]
        try:
            target = self.process_href(link.xpath('./@href').extract()[0],
                                       response.url)
        except (TypeError, IndexError):
            continue
        yield Request(url=target, callback=self.parse_product_list,
                      errback=self.onerr, meta={'userdata': tagged})
    for val in self.parse_product_list(response):
        yield val
def parse_left_filter(self, response):
    """Parse the third-level expansion of the left filter nav (e.g. the
    CN site: sale > womenswear), falling back to the US site's layout.

    Fixed: gender was stored as a bare string (``m['gender'] = gender``)
    while every other callback in this file stores a one-element list;
    normalized to ``[gender]``. Also guards the href extraction like the
    title extraction so a link-less entry skips instead of raising.
    """
    metadata = response.meta['userdata']
    sel = Selector(response)
    # Some categories expand to a third level.
    nav_nodes = sel.xpath('//nav[@id="navMenu"]//ul//ul//ul//li//a[@href]')
    if not nav_nodes:
        # Layout used by the US site.
        nav_nodes = sel.xpath(
            '//div[@class="left-navigation"]//ul/li/ul/li/a[@href]')
    for node in nav_nodes:
        try:
            tag_text = self.reformat(node.xpath('./text()').extract()[0])
            tag_name = tag_text.lower()
        except (TypeError, IndexError):
            continue
        if tag_text and tag_name:
            m = copy.deepcopy(metadata)
            m['tags_mapping']['category-2'] = [
                {
                    'name': tag_name,
                    'title': tag_text
                },
            ]
            gender = common.guess_gender(tag_name)
            if gender:
                # Was: m['gender'] = gender (bare string).
                m['gender'] = [gender]
            try:
                href = node.xpath('./@href').extract()[0]
                href = self.process_href(href, response.url)
            except (TypeError, IndexError):
                continue
            yield Request(url=href,
                          callback=self.parse_product_list,
                          errback=self.onerr,
                          meta={'userdata': m})
    for val in self.parse_product_list(response):
        yield val
def parse_cat(self, response):
    """Every link in the inner nav becomes a category-0 tag and is
    scheduled for type parsing."""
    metadata = response.meta['userdata']
    extractor = SgmlLinkExtractor(
        restrict_xpaths=('//div[@class="inner-nav-content"]//a'))
    for link in extractor.extract_links(response):
        title = link.text
        name = title.lower()
        tagged = copy.deepcopy(metadata)
        tagged['tags_mapping']['category-0'] = [{
            'title': title,
            'name': name
        }]
        gender = cm.guess_gender(name)
        if gender:
            tagged['gender'] = [gender]
        yield Request(url=link.url, callback=self.parse_type,
                      errback=self.onerr, meta={'userdata': tagged})
def parse(self, response):
    """Walk the main menu; each entry's joined text becomes a category-0
    tag.

    Fixed: the href ``extract()[0]`` was unguarded, so a menu item
    without a link raised IndexError and aborted the whole page. It is
    now wrapped in try/except like the sibling callbacks.
    """
    metadata = response.meta['userdata']
    sel = Selector(response)
    nav_nodes = sel.xpath('//div[@id="main_menu_menu"]/ul/li')
    for node in nav_nodes:
        m = copy.deepcopy(metadata)
        try:
            # The label may be split over several text nodes; join them.
            tag_text = ''.join(
                self.reformat(val)
                for val in node.xpath('.//text()').extract())
            tag_text = self.reformat(tag_text)
            tag_name = tag_text.lower()
        except (TypeError, IndexError):
            continue
        if tag_text and tag_name:
            m['tags_mapping']['category-0'] = [
                {
                    'name': tag_name,
                    'title': tag_text,
                },
            ]
            gender = common.guess_gender(tag_name, {
                'male': [],
                'female': [u'少女']
            })
            if gender:
                m['gender'] = [gender]
            try:
                href = node.xpath('./a[@href]/@href').extract()[0]
                href = self.process_href(href, response.url)
            except (TypeError, IndexError):
                continue
            yield Request(url=href,
                          callback=self.parse_left_nav,
                          errback=self.onerr,
                          meta={'userdata': m})
def parse_collection(self, response):
    """Each collection row's heading becomes a category-2 tag; every
    thumbnail inside the row is scheduled as a product page."""
    metadata = response.meta['userdata']
    sel = Selector(response)
    for row in sel.xpath('//div[@id="main"]/ul//div[@class="row"]'):
        try:
            title = self.reformat(row.xpath('.//h2/text()').extract()[0])
            name = title.lower()
        except(TypeError, IndexError):
            continue
        if not (title and name):
            continue
        tagged = copy.deepcopy(metadata)
        tagged['tags_mapping']['category-2'] = [
            {'name': name, 'title': title, },
        ]
        gender = common.guess_gender(name)
        if gender:
            tagged['gender'] = [gender]
        thumbs = row.xpath('.//div[@class="thumbnail"][child::a[@href]]')
        for thumb in thumbs:
            per_product = copy.deepcopy(tagged)
            try:
                target = self.process_href(
                    thumb.xpath('./a[@href]/@href').extract()[0],
                    response.url)
            except(TypeError, IndexError):
                continue
            yield Request(url=target, callback=self.parse_product,
                          errback=self.onerr,
                          meta={'userdata': per_product},
                          dont_filter=True)
def parse(self, response):
    """Top-level menu: the category name is taken from the last CSS class
    token on each level0 <li>; the title is joined from the link's
    descendant texts."""
    metadata = response.meta['userdata']
    sel = Selector(response)
    for node in sel.xpath(
            '//div[contains(@class,"main-menu")]//li[contains(@class,"level0")]'
    ):
        # Last whitespace-separated class token names the category.
        node_class = node._root.attrib['class']
        mt = re.search(r'\b(\w+)\s*$', node_class)
        if not mt:
            continue
        tag_type = 'category-0'
        tag_name = unicodify(mt.group(1)).lower()
        temp = node.xpath('./a[@href]')
        if not temp:
            continue
        href = temp[0]._root.attrib['href']
        # Human-readable title: join all non-empty descendant texts.
        tag_text = u', '.join([
            cm.html2plain(unicodify(val.text))
            for val in temp[0]._root.iterdescendants()
            if val.text and val.text.strip()
        ])
        m = copy.deepcopy(metadata)
        m['tags_mapping'][tag_type] = [{
            'name': tag_name,
            'title': tag_text
        }]
        gender = cm.guess_gender(tag_name)
        if gender:
            m['gender'] = [gender]
        if not href or not href.strip():
            continue
        else:
            yield Request(url=href,
                          meta={'userdata': m},
                          callback=self.parse_category_0)
def parse(self, response):
    """Watches navigation: each navigation element's <h3> text becomes a
    category-0 tag.

    Fixed: the href ``extract()[0]`` was unguarded, so an element without
    a link raised IndexError and aborted the whole page. It is now
    wrapped in try/except like the sibling callbacks.
    """
    metadata = response.meta['userdata']
    sel = Selector(response)
    nav_nodes = sel.xpath(
        '//div[@class="l-watches-navigation"]/div/div[@class="navigation-element"]'
    )
    for node in nav_nodes:
        m = copy.deepcopy(metadata)
        try:
            # The heading may span several text nodes; join them.
            tag_text = ''.join(
                self.reformat(val)
                for val in node.xpath('./h3//text()').extract())
            tag_text = self.reformat(tag_text)
            tag_name = tag_text.lower()
        except (TypeError, IndexError):
            continue
        if tag_text and tag_name:
            m['tags_mapping']['category-0'] = [
                {
                    'name': tag_name,
                    'title': tag_text,
                },
            ]
            gender = common.guess_gender(tag_name)
            if gender:
                m['gender'] = [gender]
            try:
                href = node.xpath('.//a[@href]/@href').extract()[0]
                href = self.process_href(href, response.url)
            except (TypeError, IndexError):
                continue
            yield Request(url=href,
                          callback=self.parse_collection,
                          errback=self.onerr,
                          meta={'userdata': m})
def parse(self, response):
    """Tag everything linked under #main with the page's nav heading as
    category-0 and schedule detail parsing.

    Fixed: removed a redundant double ``''.join(''.join(...))`` — joining
    the extracted text list once already yields the full string.
    """
    sel = Selector(response)
    # The page heading names the category for every link on the page.
    cat_title = ''.join(
        sel.xpath('//div[@id="wrapperOuter"]/nav/h2//text()').extract())
    cat_name = cat_title.lower()
    link_extractor = SgmlLinkExtractor(
        restrict_xpaths=('//section[@id="main"]'))
    links = link_extractor.extract_links(response)
    metadata = response.meta['userdata']
    for link in links:
        m = copy.deepcopy(metadata)
        m['tags_mapping']['category-0'] = [{
            'title': cat_title,
            'name': cat_name
        }]
        gender = cm.guess_gender(cat_name)
        if gender:
            m['gender'] = [gender]
        url = link.url
        yield Request(url=url,
                      callback=self.parse_details,
                      errback=self.onerr,
                      meta={'userdata': m})