def getData(self, leibie_url, leibie):
    parser = etree.HTMLParser(encoding="utf-8")
    product_page = 1
    main_url = leibie_url % product_page
    text = urllib2.urlopen(main_url).read()
    tree = etree.HTML(text, parser=parser)
    nodes = tree.xpath(LEIBIE_XPATH)
    pages_url = [main_url]
    page_nodes = tree.xpath(PAGE_XPATH)
    if page_nodes:  # xpath() returns a list, never None
        pages = len(page_nodes) + 1
        print "total pages %d" % pages
        for page in range(1, pages):
            new_url = leibie_url % (page + 1)
            pages_url.append(new_url)
    for page_url in pages_url:
        item_text = urllib2.urlopen(page_url).read()
        item_tree = etree.HTML(item_text, parser=parser)
        item_nodes = item_tree.xpath(ITEM_XPATH)
        if not item_nodes:
            continue
        for item_node in item_nodes:
            item_table = item_node.find("table")
            if item_table is None:
                continue
            time = datetime.datetime.now().strftime("%Y-%m-%d")
            url = PARENT_URL + item_table.find("tr/td/a").attrib["href"]
            title = item_table.find("tr[3]/td/b").text
            price = "".join(item_table.find("tr[4]/td").text.split(" ")).lstrip("\r\n")
            image_text = urllib2.urlopen(url).read()
            tree_image = etree.HTML(image_text, parser=parser)
            img_node = tree_image.xpath(BIG_XPATH)[0]
            image_url = img_node.attrib["src"]
            self.logger.info("%s(%s) - %s @ %s" % (title, price, url, image_url))
            collector.object_found.send(
                self, time=time, title=title, url=url,
                image_url=image_url, price=price, leibie=leibie
            )
            from shopping.signals import item_found
            item_found.send(
                self, name=title, url=url, brand=self.__class__.__name__,
                image_url=image_url, image_url2=None, price=price, category=leibie,
            )
def getData(self, category, old_leibie):
    parser = etree.XMLParser(encoding='utf-8')
    self.logger.info('Category: %s:' % category)
    url = LIST_URL % (category)
    text = urllib2.urlopen(url).read()
    time = datetime.datetime.now().strftime('%Y-%m-%d')
    tree = etree.XML(text, parser=parser)
    nodes = tree.xpath(XPATH)
    for node in nodes:
        title = node.find('Title').text
        if u'内衣' in title or u'内裤' in title or u'袜子' in title:
            continue
        if u'裙' in title:
            leibie = u'裙'
        elif u'裤' in title:
            leibie = u'裤'
        elif u'鞋' in title:
            leibie = u'鞋'
        elif u'包' in title:
            leibie = u'配饰'
        elif u'5239145' in title or u'装' in title or u'衣' in title or u'衫' in title \
                or u'夹' in title or u'恤' in title:
            leibie = u'上装'
        else:
            continue
        price = node.find('Price').text
        image_url = u'http://me-city.com/' + node.find('FullImage').text
        self.logger.info('%s:%s(%s) - %s @ %s' % (leibie, title, price, image_url, image_url))
        collector.object_found.send(
            self, time=time, title=title, url=image_url,
            image_url=image_url, price=price, leibie=leibie
        )
        from shopping.signals import item_found
        item_found.send(
            self, name=title, url=image_url, brand=self.__class__.__name__,
            image_url=image_url, image_url2=None, price=price, category=leibie,
        )
def getData(self, category, subcate, pages, leibie):
    parser = etree.HTMLParser(encoding='utf-8')
    self.logger.info('Category: %s-%s:' % (category, subcate))
    for page in range(1, pages):
        self.logger.info('Page: %d:' % page)
        url = LIST_URL % (page, subcate, category)
        text = urllib2.urlopen(url).read()
        tree = etree.HTML(text, parser=parser)
        time = datetime.datetime.now().strftime('%Y-%m-%d')
        nodes = tree.xpath(XPATH)
        for node in nodes:
            sub_node = node.find('dt[@class="skuname"]/a')
            ourl = urlparse.urljoin(url, sub_node.attrib['href'])
            title = sub_node.text
            sub_node = node.find('dt[@class="price"]/span[@id="listPrice"]')
            price = u'¥' + sub_node.text
            sub_node = node.find('dt[@class="img"]/a/img')
            #print etree.tostring(sub_node, method='html', encoding='utf-8')
            image_url = sub_node.attrib['lazy_src']
            detail_text = urllib2.urlopen(ourl).read()
            detail_tree = etree.HTML(detail_text, parser=parser)
            image_node = detail_tree.xpath(BIG_XPATH)[0]
            image_url = image_node.attrib['src']
            self.logger.info('%s(%s) - %s @ %s' % (title, price, ourl, image_url))
            collector.object_found.send(
                self, time=time, title=title, url=ourl,
                image_url=image_url, price=price, leibie=leibie
            )
            from shopping.signals import item_found
            item_found.send(
                self, name=title, url=ourl, brand=self.__class__.__name__,
                image_url=image_url, image_url2=None, price=price, category=leibie,
            )
def getData(self, category, pages, detail_leibie, leibie):
    parser = etree.HTMLParser(encoding='utf-8')
    for subcate in range(1, pages):
        time = datetime.datetime.now().strftime('%Y-%m-%d')
        self.logger.info('Category: %s' % leibie)
        self.logger.info('Category: %s-%s:' % (detail_leibie, subcate))
        url = LIST_URL % (category, category)
        text = urllib2.urlopen(url).read()
        tree = etree.HTML(text, parser=parser)
        nodes = tree.xpath(XPATH)
        for num in range(2, 7):
            for i in range(1, 4):
                sub_node = nodes[num].find('div[' + str(i) + ']')
                name_node = sub_node.find('ul/li[1]/a')
                title = name_node.attrib['title']
                ourl = name_node.attrib['href']
                # image_node = name_node.find('img')
                # image_url = image_node.attrib['src']
                text = urllib2.urlopen(ourl).read()
                tree = etree.HTML(text, parser=parser)
                imgnodes = tree.xpath(BIGXPATH)
                image_url = imgnodes[0].attrib['href']
                price_node = sub_node.find('ul/li[3]')
                price = price_node.text
                if price.find(u'£') < 0:
                    price_node = sub_node.find('ul/li[4]')
                    price = price_node.text
                self.logger.info('%s(%s) - %s @ %s' % (title, price, ourl, image_url))
                collector.object_found.send(
                    self, time=time, title=title, url=ourl,
                    image_url=image_url, price=price, leibie=leibie
                )
                from shopping.signals import item_found
                item_found.send(
                    self, name=title, url=ourl, brand=self.__class__.__name__,
                    image_url=image_url, image_url2=None, price=price, category=leibie,
                )
def fetch(self):
    self.logger.info('MANGO started.')
    parser = etree.HTMLParser(encoding='utf-8')
    for page in range(1, 4):
        self.logger.info('Page: %d:' % page)
        url = LIST_URL + str(page)
        print url
        text = urllib2.urlopen(url).read()
        tree = etree.HTML(text, parser=parser)
        time = datetime.datetime.now().strftime('%Y-%m-%d')
        nodes = tree.xpath(XPATH)
        for node in nodes:
            sub_node = node.find('tr[1]/td/div/a/img')
            #print etree.tostring(node, method='html', encoding='utf-8')
            image_url = sub_node.attrib['src']
            sub_node = node.find('tr[1]/td/div/img')
            image_url_backup = sub_node.attrib['src']
            sub_node = node.find('tr[2]/td/div/table/tr[2]/td/a')
            title = sub_node.find('span').text
            url = urlparse.urljoin("http://shop.mango.com/", sub_node.attrib['href'])
            sub_node = node.find('tr[2]/td/div/table/tr[3]/td/span')
            price = sub_node.text
            self.logger.info('%s(%s) - %s @ %s' % (title, price, url, image_url))
            collector.object_found.send(
                self, time=time, title=title, url=url, image_url=image_url,
                image_url2=image_url_backup, price=price, leibie=u'女装'
            )
            from shopping.signals import item_found
            item_found.send(
                self, name=title, url=url, brand=self.__class__.__name__,
                image_url=image_url, image_url2=image_url_backup,
                price=price, category=u'女装',
            )
def fetch(self):
    self.logger.info('Goelia started.')
    parser = etree.HTMLParser(encoding='utf-8')
    for page in range(1, 8):
        self.logger.info('Page: %d:' % page)
        url = LIST_URL1 % page + LIST_URL2
        text = urllib2.urlopen(url).read()
        tree = etree.HTML(text, parser=parser)
        time = datetime.datetime.now().strftime('%Y-%m-%d')
        nodes = tree.xpath(XPATH)
        for node in nodes:
            sub_node = node.find('div[@class="goodpic"]/a/img')
            #print etree.tostring(node, method='html', encoding='utf-8')
            image_url = sub_node.attrib['lazyload']
            sub_node = node.find('div[@class="goods-main"]/div[1]/h6/a')
            title = sub_node.text
            ourl = urlparse.urljoin(url, sub_node.attrib['href'])
            sub_node = node.find('div[@class="goods-main"]/div[2]/ul/li[1]/em[@class="sell-price"]')
            price = sub_node.text.strip()
            self.logger.info('%s(%s) - %s @ %s' % (title, price, ourl, image_url))
            collector.object_found.send(
                self, time=time, title=title, url=ourl,
                image_url=image_url, price=price, leibie=u'女装'
            )
            from shopping.signals import item_found
            item_found.send(
                self, name=title, url=ourl, brand=self.__class__.__name__,
                image_url=image_url, image_url2=None, price=price, category=u'女装',
            )
def fetch(self):
    self.logger.info('Oasis started.')
    parser = etree.HTMLParser(encoding='utf-8')
    for page in range(1, 7):
        self.logger.info('Page: %d:' % page)
        url = LIST_URL % page
        text = urllib2.urlopen(url).read()
        tree = etree.HTML(text, parser=parser)
        time = datetime.datetime.now().strftime('%Y-%m-%d')
        nodes = tree.xpath(XPATH)
        for node in nodes:
            sub_node = node.find('dd/a/img')
            #print etree.tostring(node, method='html', encoding='utf-8')
            image_url = sub_node.attrib['src']
            sub_node = node.find('dt/a')
            title = sub_node.text
            ourl = urlparse.urljoin(url, sub_node.attrib['href'])
            price = self.getPrice(ourl)
            self.logger.info('%s(%s) - %s @ %s' % (title, price, ourl, image_url))
            collector.object_found.send(
                self, time=time, title=title, url=ourl,
                image_url=image_url, price=price, leibie=u"女装"
            )
            from shopping.signals import item_found
            item_found.send(
                self, name=title, url=ourl, brand=self.__class__.__name__,
                image_url=image_url, image_url2=None, price=price, category=u"女装",
            )
def getData(self):
    parser = etree.HTMLParser(encoding='utf-8')
    text = urllib2.urlopen(MAIN_URL).read()
    tree = etree.HTML(text, parser=parser)
    nodes = tree.xpath(LEIBIE_XPATH)
    for node in nodes:
        sub_node = node.find('a')
        if sub_node is None:
            continue
        leibie_detail = sub_node.text
        if leibie_detail is None:
            leibie_detail = sub_node.find('span').text
        count = leibie_detail[leibie_detail.index('(') + 1:leibie_detail.index(')')]
        if count == '0':
            continue
        leibie_url = sub_node.attrib['href']
        leibie = leibie_detail
        if u'卫衣' in leibie_detail or u'T恤' in leibie_detail or u'衬衫' in leibie_detail or \
                u'装' in leibie_detail:
            leibie = u'上装'
        if u'裤' in leibie_detail or u'50' in leibie_detail:
            leibie = u'裤'
        if u'鞋' in leibie_detail:
            leibie = u'鞋'
        if u'配件' in leibie_detail or u'包' in leibie_detail or u'皮带' in leibie_detail \
                or u'眼镜' in leibie_detail or u'手表' in leibie_detail:
            leibie = u'配饰'
        self.logger.info('Leibie: %s' % leibie)
        print leibie_url
        text_leibie = urllib2.urlopen(leibie_url).read()
        tree_leibie = etree.HTML(text_leibie, parser=parser)
        time = datetime.datetime.now().strftime('%Y-%m-%d')
        page_nodes = tree_leibie.xpath(PAGE_XPATH)[:-1]
        pages = len(page_nodes) + 1
        print 'total pages %s' % pages
        url_list = [leibie_url]
        for page in page_nodes:
            url_list.append(page.attrib['href'])
        for page_url in url_list:
            text_item = urllib2.urlopen(page_url).read()
            tree_item = etree.HTML(text_item, parser=parser)
            nodes_item = tree_item.xpath(ITEM_XPATH)
            for sub_node_item in nodes_item:
                item_node = sub_node_item.find('a')
                if item_node is None:
                    continue
                url = item_node.attrib['href']
                title = item_node.attrib['title']
                price_node = sub_node_item.find('div/font[2]')
                price = 'RMB' + price_node.text
                new_price_node = sub_node_item.find('div/font[1]')
                new_price = 'RMB' + new_price_node.text
                image_url_backup = ''
                text_detail = urllib2.urlopen(url).read()
                tree_detail = etree.HTML(text_detail, parser=parser)
                img_node = tree_detail.xpath(BIG_XPATH)[0]
                image_url = img_node.attrib['src']
                self.logger.info('%s(%s--%s) - %s @ %s' % (title, price, new_price, url, image_url))
                collector.object_found.send(
                    self, time=time, title=title, url=url, image_url=image_url,
                    image_url2=image_url_backup, price=price, leibie=leibie
                )
                from shopping.signals import item_found, item_update
                item_found.send(
                    self, name=title, url=url, brand=self.__class__.__name__,
                    image_url=image_url, image_url2=image_url_backup,
                    price=price, category=leibie,
                )
                item_update.send(
                    self, url=url, new_price=new_price
                )
def getData(self, main_url):
    parser = etree.HTMLParser(encoding='utf-8')
    text = urllib2.urlopen(main_url).read()
    tree = etree.HTML(text, parser=parser)
    nodes = tree.xpath(LEIBIE_XPATH)
    for node in nodes:
        sub_node = node.find('a')
        if sub_node is None:
            continue
        leibie_detail = sub_node.text
        leibie_url = PARENT_URL + sub_node.attrib['href']
        leibie = leibie_detail
        if u'全部' in leibie_detail or u'袜子' in leibie_detail or u'泳装' in leibie_detail:
            continue
        if u'夹克' in leibie_detail or u'牛仔' in leibie_detail or u'衫' in leibie_detail or \
                u'T恤' in leibie_detail:
            leibie = u'上装'
        if u'裤' in leibie_detail:
            leibie = u'裤'
        if u'鞋' in leibie_detail:
            leibie = u'鞋'
        if u'带' in leibie_detail or u'包' in leibie_detail or u'帽' in leibie_detail \
                or u'链' in leibie_detail or u'手' in leibie_detail or u'围巾' in leibie_detail:
            leibie = u'配饰'
        self.logger.info('Leibie: %s' % leibie)
        print leibie_url
        time = datetime.datetime.now().strftime('%Y-%m-%d')
        text_leibie = urllib2.urlopen(leibie_url).read()
        tree_leibie = etree.HTML(text_leibie, parser=parser)
        item_nodes = tree_leibie.xpath(ITEM_XPATH)
        for node in item_nodes:
            item_node = node.find('a')
            if item_node is None:
                continue
            title = node.find('div[2]/div/a').text
            url = 'http://www.converse.com.cn' + node.find('div[2]/div/a').attrib['href']
            price = ''.join(node.find('div[2]/div[2]').text.split(' '))
            image_url_backup = ''
            text_detail = urllib2.urlopen(url).read()
            tree_detail = etree.HTML(text_detail, parser=parser)
            img_node = tree_detail.xpath(BIG_XPATH)[0]
            image_url = img_node.attrib['href']
            self.logger.info('%s(%s) - %s @ %s' % (title, price, url, image_url))
            collector.object_found.send(
                self, time=time, title=title, url=url, image_url=image_url,
                image_url2=image_url_backup, price=price, leibie=leibie
            )
            from shopping.signals import item_found
            item_found.send(
                self, name=title, url=url, brand=self.__class__.__name__,
                image_url=image_url, image_url2=image_url_backup,
                price=price, category=leibie,
            )
def getData(self, category, subcate, leibie):
    parser = etree.HTMLParser(encoding='utf-8')
    self.logger.info('Category: %s-%s:' % (category, subcate))
    url = LIST_URL % (subcate, category, 1)
    text = urllib2.urlopen(url).read()
    tree = etree.HTML(text, parser=parser)
    leibie_nodes = tree.xpath(LEIBIE_XPATH)
    for leibie_node in leibie_nodes:
        leibie_node_a = leibie_node.find('a')
        leibie_name = leibie_node_a.text
        if u'上衣' in leibie_name or u'夹克' in leibie_name or \
                u'球衣' in leibie_name or u'运动衫' in leibie_name or u'卫衣' in leibie_name or \
                u'POLO衫' in leibie_name or u'有袖' in leibie_name or u'夹克' in leibie_name or \
                u'马甲' in leibie_name or u'风衣' in leibie_name or u'大衣' in leibie_name or \
                u'棉服' in leibie_name or u'女款' in leibie_name or u'男款' in leibie_name:
            leibie = u'上装'
        elif u'裤子' in leibie_name or u'短裤' in leibie_name or u'连身裤' in leibie_name \
                or u'连身裤' in leibie_name or u'针织裤' in leibie_name or u'西裤' in leibie_name:
            leibie = u'裤'
        elif u'套服' in leibie_name:
            leibie = u'套装'
        elif u'裙装' in leibie_name or u'短裙' in leibie_name:
            leibie = u'裙'
        elif u'鞋' in leibie_name or u'拖' in leibie_name:
            leibie = u'鞋'
        elif u'饰品' in leibie_name or u'手套' in leibie_name or u'背包' in leibie_name or \
                u'帽子' in leibie_name or u'头饰' in leibie_name or u'附件' in leibie_name:
            leibie = u'配饰'
        else:
            continue
        leibie_url = PARENT_URL + leibie_node_a.attrib['href'] + '?p=1'
        time = datetime.datetime.now().strftime('%Y-%m-%d')
        items_text = urllib2.urlopen(leibie_url).read()
        print "leibie url %s" % leibie_url
        items_tree = etree.HTML(items_text, parser=parser)
        pages_node = items_tree.xpath(PAGE_XPATH)
        if len(pages_node) != 0:
            pages_node = pages_node[-2]
            pages = int(pages_node.text, 10)
        else:
            pages = 1
        for page in range(1, pages + 1):
            if page == 1:
                items_nodes = items_tree.xpath(ITEM_XPATH)
            else:
                query_string = QUERY_STRING % page
                items_url = PARENT_URL + leibie_node_a.attrib['href'] + query_string
                items_text = urllib2.urlopen(items_url).read()
                items_tree = etree.HTML(items_text, parser=parser)
                items_nodes = items_tree.xpath(ITEM_XPATH)
            for node in items_nodes:
                sub_node = node.find('div[1]/a/img')
                image_url = sub_node.attrib['src']
                sub_node = node.find('div[3]/a')
                #print etree.tostring(node, method='html', encoding='utf-8')
                ourl = urlparse.urljoin(url, sub_node.attrib['href'])
                title = sub_node.text
                detail_text = urllib2.urlopen(ourl).read()
                detail_tree = etree.HTML(detail_text, parser=parser)
                image_node = detail_tree.xpath(BIG_XPATH)[0]
                image_url = image_node.attrib['src'][2:]
                price = '0'
                self.logger.info('%s(%s) - %s @ %s' % (title, price, ourl, image_url))
                collector.object_found.send(
                    self, time=time, title=title, url=ourl,
                    image_url=image_url, price=price, leibie=leibie
                )
                item_found.send(
                    self, name=title, url=ourl, brand=self.__class__.__name__,
                    image_url=image_url, image_url2=None, price=price, category=leibie,
                )
def getData(self, category, kuanshi):
    if u'WOMEN' in kuanshi:
        start = 7
    else:
        start = 5
    parser = etree.HTMLParser(encoding='utf-8')
    time = datetime.datetime.now().strftime('%Y-%m-%d')
    self.logger.info('Category: %s:' % kuanshi)
    M_URL = LIST_URL % (category, category)
    text = urllib2.urlopen(M_URL).read()
    tree = etree.HTML(text, parser=parser)
    nodes = tree.xpath(XPATH)
    for num in range(start, 20):
        if num >= 7 and num <= 11:
            leibie = u'上装'
        elif num >= 12 and num <= 15:
            leibie = u'裤'
        elif num >= 16 and num <= 17:
            leibie = u'裙'
        elif num == 18:
            leibie = u'鞋'
        elif num == 19:
            leibie = u'配饰'
        node = nodes[num].find('a')
        detail_leibie = node.text
        self.logger.info('leibie: %s' % leibie + '-' + detail_leibie)
        cate_url = node.attrib['href']
        cate_url = 'http://www.abercrombie.com' + cate_url
        text = urllib2.urlopen(cate_url).read()
        tree = etree.HTML(text, parser=parser)
        cat_nodes = tree.xpath(CAT_XPATH)
        for cat_node in cat_nodes:
            i = 1
            while cat_node.find('div/ul/li[' + str(i) + ']') is not None:
                clo_nodes = cat_node.find('div/ul/li[' + str(i) + ']')
                # print clo_nodes
                i = i + 1
                for clo_node in clo_nodes:
                    name_node = clo_node.find('span[@class="name"]/h3/a')
                    if name_node is None:
                        name_node = clo_node.find('span[@class="name"]/h2/a')
                    #print etree.tostring(clo_node.find('span[@class="name"]'), method='html', encoding='utf-8')
                    #print name_node.text
                    title = name_node.text
                    url_node = clo_node.find('div[@class="image-wrap"]/a')
                    url = 'http://www.abercrombie.com/webapp/wcs/stores/servlet/' + url_node.attrib['href']
                    text = urllib2.urlopen(url).read()
                    tree = etree.HTML(text, parser=parser)
                    imgnodes = tree.xpath(BIGXPATH)
                    if imgnodes:  # xpath() returns a list; an empty result means no big image
                        image_url = 'http:' + imgnodes[0].attrib['src']
                    # image_node = url_node.find('img')
                    # image_url = 'http:'+image_node.attrib['src']
                    price_node = clo_node.find('div[@class="price"]/span')
                    price = price_node.text
                    self.logger.info('%s(%s) - %s @ %s' % (title, price, url, image_url))
                    from shopping.signals import item_found
                    item_found.send(
                        self, name=title, url=url, brand=self.__class__.__name__,
                        image_url=image_url, image_url2=None, price=price, category=leibie,
                    )
def getData(self, category, pages, leibie):
    temp_leibie = leibie
    parser = etree.HTMLParser(encoding='utf-8')
    for subcate in range(1, pages):
        time = datetime.datetime.now().strftime('%Y-%m-%d')
        self.logger.info('Category: %s-%s:' % (leibie, subcate))
        urlleft = LEFT_URL % (category)
        urlright = RIGHT_URL % (subcate)
        url = urlleft + u'%20' + urlright
        text = urllib2.urlopen(url).read()
        tree = etree.HTML(text, parser=parser)
        nodes = tree.xpath(XPATH)
        for node in nodes:
            #print etree.tostring(node, method='html', encoding='utf-8')
            s_node = node.find('div/a')
            title = urlparse.urljoin(url, s_node.attrib['title'])[24:]
            if temp_leibie == u'打折单品':
                cate_node = node.find('span')
                leibie_detail = cate_node.text
                print leibie_detail
                # self.logger.info('Category: %s' % (leibie))
                if u'衫' in leibie_detail or u'T' in leibie_detail or u'外套' in leibie_detail or u'背心' in leibie_detail or u'毛衣' in leibie_detail or u'上衣' in leibie_detail:
                    leibie = u'上装'
                if u'裤' in leibie_detail:
                    leibie = u'裤'
                if u'裙' in leibie_detail:
                    leibie = u'裙'
                if u'配饰' in leibie_detail and u'鞋' in title:
                    leibie = u'鞋'
                self.logger.info('Category: %s' % (leibie))
            ourl = urlparse.urljoin(url, s_node.attrib['href'])
            image_url2_whole = urlparse.urljoin(url, s_node.attrib['onmouseover'])
            image_url2 = image_url2_whole[image_url2_whole.find('src=') + 5:]
            image_url2 = image_url2.replace('\'', '')
            sub_node = s_node.find('img')
            image_url = urlparse.urljoin(url, sub_node.attrib['datasrc'])
            p_node = node.find('p')
            d_node = p_node.find('del')
            new_price = ''
            if d_node is not None:
                price = d_node.text
                new_info = etree.tostring(p_node, method='html', encoding='utf-8')
                new_price = new_info[new_info.find('</del>') + len('</del>'):new_info.find('</p>')]
            else:
                price = p_node.text
            self.logger.info('%s(%s,now:%s) - %s @ %s' % (title, price, new_price.decode('utf-8'), ourl, image_url))
            from shopping.signals import item_found, item_update
            item_found.send(
                self, name=title, url=ourl, brand=self.__class__.__name__,
                image_url=image_url, image_url2=image_url2, price=price, category=leibie,
            )
            if new_price != '':
                item_update.send(
                    self, name=title, url=ourl, brand=self.__class__.__name__,
                    image_url=image_url, image_url2=None, price=price,
                    new_price=new_price, category=leibie,
                )
def getData(self, category, pages, mainleibie):
    parser = etree.HTMLParser(encoding='utf-8')
    self.logger.info('Category: %s:' % category)
    text = urllib2.urlopen(LIST_URL % (category)).read()
    tree = etree.HTML(text, parser=parser)
    nodes = tree.xpath(MAIN_XPATH)
    for node in nodes:
        sub_node = node.find('a')
        leibie_detail = sub_node.text
        leibie_url = sub_node.attrib['href']
        leibie = leibie_detail
        if u'内衣' in leibie_detail or u'睡衣' in leibie_detail or u'袜' in leibie_detail or u'孕妇' in leibie_detail:
            continue
        if u'衣' in leibie_detail or u'装' in leibie_detail or u'衫' in leibie_detail:
            leibie = u'上装'
        if u'裤' in leibie_detail:
            leibie = u'裤'
        if u'裙' in leibie_detail:
            leibie = u'裙'
        if u'配饰' in leibie_detail:
            leibie = u'配饰'
        if u'鞋' in leibie_detail:
            leibie = u'鞋'
        self.logger.info('leibie: %s:' % mainleibie + '-' + leibie)
        print leibie_url
        text = urllib2.urlopen(leibie_url).read()
        # print LIST_URL %(category,page)
        tree = etree.HTML(text, parser=parser)
        time = datetime.datetime.now().strftime('%Y-%m-%d')
        nodes = tree.xpath(XPATH)
        for i in range(1, len(nodes)):
            sub_node = nodes[i].find('a')
            title = sub_node.attrib['title']
            url = sub_node.attrib['href']
            price = sub_node.find('span/span/span').text
            price = u'¥' + price[len('RMB '):]
            text = urllib2.urlopen(url).read()
            tree = etree.HTML(text, parser=parser)
            img_nodes = tree.xpath(BIGXPATH)
            image_url = 'http:' + img_nodes[0].attrib['src']
            image_url = 'http:' + urllib.quote(image_url[5:], safe='/')
            # sub_node = nodes[i].find('div[1]')
            # img = sub_node.find('img[2]')
            # image_url = ''
            # if img is not None:
            #     image_url = 'http:'+sub_node.find('img[2]').attrib['src']
            #     image_url = 'http:'+urllib.quote(image_url[5:],safe='/')  # image_url.replace(' ','%20')
            # backup = sub_node.find('img[1]')
            image_url_backup = ''
            # if backup is not None:
            #     image_url_backup = 'http:'+sub_node.find('img[1]').attrib['src']
            self.logger.info('%s(%s) - %s @ %s' % (title, price, url, image_url))
            collector.object_found.send(
                self, time=time, title=title, url=url, image_url=image_url,
                image_url2=image_url_backup, price=price, leibie=leibie
            )
            from shopping.signals import item_found
            item_found.send(
                self, name=title, url=url, brand=self.__class__.__name__,
                image_url=image_url, image_url2=image_url_backup,
                price=price, category=leibie,
            )
def getData(self, category, kuanshi):
    parser = etree.HTMLParser(encoding='utf-8')
    url = LIST_URL % (category)
    text = urllib2.urlopen(url).read()
    tree = etree.HTML(text, parser=parser)
    time = datetime.datetime.now().strftime('%Y-%m-%d')
    nodes = tree.xpath(XPATH)
    for node in nodes:
        leibie = node.text.strip()
        if leibie == u'皮带':
            leibie = u'配饰'
        elif leibie == u'围巾/帽子':
            leibie = u'配饰'
        elif leibie == u'手袋/钱夹':
            leibie = u'配饰'
        elif leibie == u'牛仔裤':
            leibie = u'裤'
        elif leibie == u'热裤/七分裤':
            leibie = u'裤'
        elif leibie == u'时尚及休闲裤':
            leibie = u'裤'
        elif leibie == u'连衣裙':
            leibie = u'裙'
        elif leibie == u'半身裙':
            leibie = u'裙'
        elif leibie == u'浪漫韵动':
            leibie = 'none'
        elif leibie == u'意桃粉丽人':
            leibie = 'none'
        elif leibie == u'白色的纯纯夏日':
            leibie = 'none'
        else:
            leibie = u'上装'
        print leibie
        if leibie == 'none':
            continue
        # The page's onclick handler looks like:
        # function getSkus(url, _this) {
        #     (the partial refresh was changed to a page redirect because an F5 refresh is needed)
        #     /products/2---Women@[email protected]
        #     window.location.href = url;
        link_url = node.attrib['onclick']  # e.g. getSkus('/products/2-6-22-------.htm',this)
        link_url = link_url[link_url.find("/"):link_url.find("',")]
        link_url = 'http://www.esprit.cn' + link_url
        print link_url
        text = urllib2.urlopen(link_url).read()
        tree = etree.HTML(text, parser=parser)
        nodes = tree.xpath(XPATHSUB)
        #nodesTitle = tree.xpath(TITLEXPATH)
        #nodesPrice = tree.xpath()
        #index = 0
        for node in nodes:
            #image_url = node.find('div[@class="sku_pic"]')
            #image_url = node.find('a[@class="category_skudetails_href"]')
            #if node is None:
            #    continue
            #nodeTitle = nodesTitle[index]
            image_url = 'http://www.esprit.cn' + node.attrib['href']
            #title = nodeTitle.text
            print image_url
            text = urllib2.urlopen(image_url).read()
            tree = etree.HTML(text, parser=parser)
            node = tree.xpath(TITLEXPATH)[0]
            title = node.text
            node = tree.xpath(PRICXPATH1)[0]
            price = node.text
            print price
            node = tree.xpath(PRICXPATH2)
            if len(node) != 0:
                oldPrice = price
                forsale = True
                price = node[0].text
                print price
            else:
                forsale = False
            bigimage_url = tree.xpath(BIGXPATH)[0]
            bigimage_url = bigimage_url.attrib['href']
            print bigimage_url
            self.logger.info('%s(%s) - %s @ %s' % (title, price, image_url, bigimage_url))
            if forsale:
                price1 = oldPrice
                price2 = price
            else:
                price1 = price
                price2 = price
            print "%s , %s , %s , %s , %s , %s" % (title, image_url, bigimage_url, price1, price2, leibie)
            from shopping.signals import item_found, item_update
            item_found.send(
                self, name=title, url=image_url, brand=self.__class__.__name__,
                image_url=bigimage_url, image_url2=None, price=price1, category=leibie
            )
            item_update.send(
                self, url=image_url, new_price=price2
            )
def getData(self, category, kuanshi):
    parser = etree.HTMLParser(encoding='utf-8')
    time = datetime.datetime.now().strftime('%Y-%m-%d')
    self.logger.info('Category: %s:' % kuanshi)
    M_URL = LIST_URL % (category, category)
    print M_URL
    text = urllib2.urlopen(M_URL).read()
    tree = etree.HTML(text, parser=parser)
    nodes = tree.xpath(XPATH)
    #print nodes
    for node in nodes:
        cat_url = node.attrib['href']
        #print cat_url
        leibie_node = node.find('span')
        leibie_node_text = leibie_node.text.strip()
        print leibie_node_text
        print leibie_node_text == "T's + Polos"
        if leibie_node_text == 'Tops':
            leibie_node_text = u'上装'
        # elif leibie_node_text == "T's + Polos":
        #     leibie_node_text = u'套装'
        # elif leibie_node_text == 'Tanks + Camis':
        #     leibie_node_text = u'套装'
        # elif leibie_node_text == "Graphic T's":
        #     leibie_node_text = u'上装'
        # elif leibie_node_text == 'Shirts':
        #     leibie_node_text = u'上装'
        # elif leibie_node_text == 'Sweaters + Cardis':
        #     leibie_node_text = u'套装'
        # elif leibie_node_text == 'Sweatshirts':
        #     leibie_node_text = u'上装'
        # elif leibie_node_text == 'Outerwear':
        #     leibie_node_text = u'上装'
        elif leibie_node_text == 'Bottoms':
            leibie_node_text = u'下装'
        elif leibie_node_text == 'Accessories':
            leibie_node_text = u'配饰'
        elif leibie_node_text == 'College':
            leibie_node_text = u'上装'
        elif leibie_node_text == 'Footwear':
            leibie_node_text = u'鞋'
        elif leibie_node_text == 'New Arrivals':
            leibie_node_text = 'none'
        elif leibie_node_text == 'Web Exclusives':
            leibie_node_text = 'none'
        elif leibie_node_text == "$10 T's + Tanks":
            leibie_node_text = 'none'
        elif leibie_node_text == 'Looks To Live In':
            leibie_node_text = 'none'
        elif leibie_node_text == 'Jean Guide':
            leibie_node_text = 'none'
        elif leibie_node_text == 'Fragrance':
            leibie_node_text = 'none'
        elif leibie_node_text == 'Clearance':
            leibie_node_text = 'none'
        elif leibie_node_text == 'Back To Basics':
            return
        #print 'fafd %s' % leibie_node_text
        if leibie_node_text == 'none':
            continue
        if leibie_node_text != u'下装':
            print "ups"
            leibie = kuanshi + '-' + leibie_node_text
            print leibie
            text = urllib2.urlopen(cat_url).read()
            tree = etree.HTML(text, parser=parser)
            nodes = tree.xpath(CAT_XPATH)
            for node in nodes:
                url_node = node.find('a')
                url = 'http://www.ae.com' + url_node.attrib['href']
                image_node = url_node.find('span/img')
                title = image_node.attrib['alt']
                #print title
                image_url = 'http:' + image_node.attrib['src']
                #print image_url
                price_node = url_node.find('span[4]')
                price = price_node.text
                if price.find(u'Was:') == 0:
                    price_node = url_node.find('span[5]')
                    price = price_node.text[5:]
                print price
                text = urllib2.urlopen(url).read()
                tree = etree.HTML(text, parser=parser)
                node = tree.xpath(BIG_XPATH)[0]
                image_node = node.find('img')
                image_url = 'http:' + image_node.attrib['src']
                print image_url
                node = tree.xpath(PRICE_XPATH)
                flag = False
                if node:
                    #print node
                    flag = True
                    oldPrice = node[0].text
                    print oldPrice
                self.logger.info('%s(%s) - %s @ %s' % (title, price, url, image_url))
                if flag:
                    price1 = oldPrice
                    price2 = price
                else:
                    price1 = price
                    price2 = price
                #print "%s %s %s %s %s %s" % (title,url,image_url,price1,price2,leibie)
                from shopping.signals import item_found, item_update
                item_found.send(
                    self, name=title, url=url, brand=self.__class__.__name__,
                    image_url=image_url, image_url2=None, price=price1, category=leibie
                )
                item_update.send(
                    self, url=url, new_price=price2
                )
        else:
            print "bottoms"
            nodes = tree.xpath(XPATHSUB)
            for node in nodes:
                cat_url = node.attrib['href']
                leibie_node = node.find('span')
                leibie_node_text = leibie_node.text.strip()
                if leibie_node_text == 'Jeans':
                    leibie_node_text = u'裤'
                elif leibie_node_text == 'Pants + Crops':
                    leibie_node_text = u'裤'
                elif leibie_node_text == 'Shorts':
                    leibie_node_text = u'裤'
                elif leibie_node_text == 'Dresses':
                    leibie_node_text = u'裙'
                else:
                    leibie_node_text = 'none'
                if leibie_node_text == 'none':
                    continue
                leibie = kuanshi + '-' + leibie_node_text
                print leibie
                text = urllib2.urlopen(cat_url).read()
                tree = etree.HTML(text, parser=parser)
                nodes = tree.xpath(CAT_XPATH)
                for node in nodes:
                    url_node = node.find('a')
                    url = 'http://www.ae.com' + url_node.attrib['href']
                    image_node = url_node.find('span/img')
                    title = image_node.attrib['alt']
                    #print title
                    image_url = 'http:' + image_node.attrib['src']
                    #print image_url
                    price_node = url_node.find('span[4]')
                    price = price_node.text
                    if price.find(u'Was:') == 0:
                        price_node = url_node.find('span[5]')
                        price = price_node.text[5:]
                    print price
                    #http://www.ae.com/web/browse/product.jsp?productId=2371_9560_199&catId=cat90030
                    text = urllib2.urlopen(url).read()
                    tree = etree.HTML(text, parser=parser)
                    node = tree.xpath(BIG_XPATH)[0]
                    image_node = node.find('img')
                    image_url = 'http:' + image_node.attrib['src']
                    node = tree.xpath(PRICE_XPATH)
                    flag = False
                    if node:
                        # print node
                        flag = True
                        oldPrice = node[0].text
                        print oldPrice
                    print image_url
                    if flag:
                        price1 = oldPrice
                        price2 = price
                    else:
                        price1 = price
                        price2 = price
                    item_found.send(
                        self, name=title, url=url, brand=self.__class__.__name__,
                        image_url=image_url, image_url2=None, price=price1, category=leibie
                    )
                    item_update.send(
                        self, url=url, new_price=price2
                    )
def getData(self, category, leibie_detail):
    parser = etree.HTMLParser(encoding='utf-8')
    # for page in range(1,pages):
    #     self.logger.info('Page: %d:' % page)
    leibie = leibie_detail
    if u'衣' in leibie_detail or u'衫' in leibie_detail or u'外套' in leibie_detail:
        leibie = u'上装'
    if u'裤' in leibie_detail:
        leibie = u'裤'
    if u'裙' in leibie_detail:
        leibie = u'裙'
    if u'配饰' in leibie_detail or u'包袋' in leibie_detail:
        leibie = u'配饰'
    if u'鞋' in leibie_detail:
        leibie = u'鞋'
    self.logger.info('leibie: %s:' % leibie)
    self.logger.info('leibie_detail: %s:' % leibie_detail)
    url = LIST_URL % category
    text = urllib2.urlopen(url).read()
    tree = etree.HTML(text, parser=parser)
    time = datetime.datetime.now().strftime('%Y-%m-%d')
    nodes = tree.xpath(XPATH)
    for node in nodes:
        sub_node = node.find('h5/a')
        #print etree.tostring(node, method='html', encoding='utf-8')
        url = sub_node.attrib['href']
        title = sub_node.text.strip()
        if leibie_detail == u'打折单品':
            if u'衫' in title or u'T' in title or u'外套' in title or u'背心' in title or u'毛衣' in title or u'上衣' in title or u'吊带' in title:
                leibie = u'上装'
            if u'裤' in title:
                leibie = u'裤'
            if u'裙' in title:
                leibie = u'裙'
            if u'鞋' in title:
                leibie = u'鞋'
            if u'帽' in title or u'围巾' in title or u'皮带' in title or u'腰带' in title:
                leibie = u'配饰'
            self.logger.info('Category: %s' % (leibie))
        sub_node = node.find('div[3]/p/span/span')
        price = sub_node.text
        price = price[0:price.index('.')]
        new_price = ''
        sub_node = node.find('div[3]/p[2]/span')
        if sub_node is not None:
            new_price = price
            price = sub_node.text.strip()
            price = price[0:price.index('.')]
        text = urllib2.urlopen(url).read()
        tree = etree.HTML(text, parser=parser)
        imgnodes = tree.xpath(BIGXPATH)
        image_url = imgnodes[0].attrib['href']
        # sub_node = node.find('div[1]/p/a/img')
        # image_url = sub_node.attrib['src']
        self.logger.info('%s(%s,discount:%s) - %s @ %s - %s' % (title, price, new_price, url, image_url, leibie))
        from shopping.signals import item_found, item_update
        item_found.send(
            self, name=title, url=url, brand=self.__class__.__name__,
            image_url=image_url, image_url2=None, price=price, category=leibie,
        )
        if new_price != '':
            item_update.send(
                self, name=title, url=url, brand=self.__class__.__name__,
                image_url=image_url, image_url2=None, price=price,
                new_price=new_price, category=leibie,
            )
def fetch(self):
    self.logger.info('Kappa started.')
    kuanshi = 'MENs'
    parser = etree.HTMLParser(encoding='utf-8')
    for page in range(1, 2):
        URL = LIST_ADDR % (page)
        text = urllib2.urlopen(URL).read()
        tree = etree.HTML(text, parser=parser)
        time = datetime.datetime.now().strftime('%Y-%m-%d')
        nodes = tree.xpath(LEIBIEXPATH)
        for node in nodes:
            if node.text:
                leibie = node.text
                if leibie == u'运动':
                    leibie = 'none'
                elif leibie == u'运动时尚':
                    leibie = 'none'
                elif leibie == u'时尚':
                    leibie = 'none'
                if leibie != 'none':
                    print leibie
                    title = leibie
                    if leibie.find(u'裤') != -1:
                        leibie = u'上装'
                    else:
                        leibie = u'下装'
                    print leibie
                    leibie_url = 'http://www.kappa.com.cn/product/' + node.attrib['href']
                    #print leibie_url
                    text = urllib2.urlopen(leibie_url).read()
                    tree = etree.HTML(text, parser=parser)
                    nodes = tree.xpath(XPATH)
                    for node in nodes:
                        image_url = 'http://www.kappa.com.cn/product/' + node.attrib['href']
                        print image_url
                        text = urllib2.urlopen(image_url).read()
                        tree = etree.HTML(text, parser=parser)
                        node = tree.xpath(PRICE_XPATH)[0]
                        #print node
                        # compare the cell's text, not the Element object itself
                        kuanshi = node.find('tr[3]/td[2]').text
                        if kuanshi == u'男':
                            leibie = 'MENs' + leibie
                        else:
                            leibie = 'WOMENs' + leibie
                        price_node = node.find('tr[7]')
                        #print price_node
                        price_node = price_node.find('td[2]')
                        #print price_node
                        price = price_node.text
                        print price
                        bigimage_node = tree.xpath(IMAGE_XPATH)[0]
                        bigimage_url = bigimage_node.text.strip()
                        #print bigimage_url
                        #loadBigPic('/upload/product/K2104MM595-990_4_1.png')
                        start = bigimage_url.find("/")
                        end = bigimage_url.find("')")
                        #print start
                        #print end
                        bigimage_url = bigimage_url[start:end]
                        bigimage_url = 'http://www.kappa.com.cn' + bigimage_url
                        print bigimage_url
                        self.logger.info('%s(%s) - %s @ %s' % (title, price, image_url, bigimage_url))
                        price1 = price
                        price2 = price
                        print "%s , %s , %s , %s , %s , %s" % (title, image_url, bigimage_url, price1, price2, leibie)
                        from shopping.signals import item_found, item_update
                        item_found.send(
                            self, name=title, url=image_url, brand=self.__class__.__name__,
                            image_url=bigimage_url, image_url2=None, price=price1, category=leibie
                        )
                        item_update.send(
                            self, url=image_url, new_price=price2
                        )
def getData(self, url, mainleibie):
    parser = etree.HTMLParser(encoding='utf-8')
    time = datetime.datetime.now().strftime('%Y-%m-%d')
    #year = datetime.datetime.now().year
    self.logger.info('mainleibie: %s:' % mainleibie)
    text = urllib2.urlopen(url).read()
    tree = etree.HTML(text, parser=parser)
    nodes_url = tree.xpath(XPATH_URL3)
    #print nodes_url
    for node_url in nodes_url:
        url = node_url.attrib['href']
        # print url
        category = node_url.text.strip()
        if u'衣' in category or u'T' in category or u'衫' in category or u'套' in category:
            leibie = u'上装'
        elif u'裤' in category:
            leibie = u'裤'
        elif u'裙' in category:
            leibie = u'裙'
        elif u'配饰' in category or u'包' in category:
            leibie = u'配饰'
        elif u'鞋' in category:
            leibie = u'鞋'
        else:
            leibie = category
        self.logger.info('leibie: %s:' % leibie)
        self.logger.info('category: %s:' % category)
        text = urllib2.urlopen(url).read()
        tree = etree.HTML(text, parser=parser)
        nodes = tree.xpath(XPATH)
        productText = nodes[0].text
        productText = productText[productText.index("categoryData: ") + len("categoryData: "):productText.rindex('},') + 1]
        # print productText
        data = json.loads(productText)
        urlPrefix = data["urlPrefix"]
        imgPrefix = data["imgPrefix"]
        items = data["items"]
        for item in items:
            title = item["name"]
            price = item["numPrice"]
            if price > 0:
                price = u'¥' + str(price)
            url = urlPrefix + item["link"]["full"]
            # print url
            text = urllib2.urlopen(url).read()
            tree = etree.HTML(text, parser=parser)
            nodes = tree.xpath(BIG_XPATH)
            # print nodes
            image_url = nodes[0].attrib['src']
            # return
            # image_url = imgPrefix + item["image"]["standard"]
            self.logger.info('%s (%s) - %s @ %s' % (title, price, url, image_url))
            collector.object_found.send(
                self, time=time, title=title, url=url,
                image_url=image_url, price=price, leibie=leibie
            )
            from shopping.signals import item_found
            item_found.send(
                self, name=title, url=url, brand=self.__class__.__name__,
                image_url=image_url, image_url2=None, price=price, category=leibie,
            )
def fetch(self):
    parser = etree.HTMLParser(encoding='utf-8')
    time = datetime.datetime.now().strftime('%Y-%m-%d')
    # self.logger.info('Category: %s:' % category)
    text = urllib2.urlopen(ALL_URL).read()
    tree = etree.HTML(text, parser=parser)
    nodes = tree.xpath(MAIN_XPATH)
    for node in nodes:
        sub_node = node.find('a')
        if sub_node is None:
            continue
        detail_leibie = sub_node.text
        leibie_url = sub_node.attrib['href']
        Category = ''
        if leibie_url.find('female') > 0:
            Category = u'女装'
        elif leibie_url.find('male') > 0:
            Category = u'男装'
            continue
        leibie = detail_leibie
        if u'裤' in detail_leibie:
            leibie = u'裤'
        elif u'裙' in detail_leibie:
            leibie = u'裙'
        elif u'配饰' in detail_leibie:
            leibie = u'配饰'
        else:
            leibie = u'上装'
        self.logger.info('Category: %s' % Category + '-' + leibie)
        leibie_url = 'http://www.c-and-a.com.cn/cn/fashion/product/' + leibie_url
        print leibie_url
        text = urllib2.urlopen(leibie_url).read()
        tree = etree.HTML(text, parser=parser)
        snodes = tree.xpath(XPATH)
        #print snodes
        for snode in snodes:
            # print etree.tostring(snode, method='html', encoding='utf-8')
            # sub_node = snode.find('a/img')
            # image_url = urlparse.urljoin(leibie_url,sub_node.attrib['src'])
            sub_node = snode.find('div[1]/div[2]/a')
            #print sub_node
            title = sub_node.text
            ourl = urlparse.urljoin(leibie_url, sub_node.attrib['href'])
            print ourl
            text = urllib2.urlopen(ourl).read()
            tree = etree.HTML(text, parser=parser)
            imgnodes = tree.xpath(BIGXPATH)
            image_url = 'http://www.c-and-a.com.cn/' + imgnodes[0].attrib['src'][len('../../..'):]
            sub_node = snode.find('div[1]/div[3]')
            price = sub_node.text.strip()
            price = u'¥' + price[0:price.index(' RMB')]
            self.logger.info('%s(%s) - %s @ %s' % (title, price, ourl, image_url))
            collector.object_found.send(
                self, time=time, title=title, url=ourl,
                image_url=image_url, price=price, leibie=leibie
            )
            from shopping.signals import item_found
            item_found.send(
                self, name=title, url=ourl, brand=self.__class__.__name__,
                image_url=image_url, image_url2=None, price=price, category=leibie,
            )
    text = urllib2.urlopen(ACC_URL).read()
    tree = etree.HTML(text, parser=parser)
    a_nodes = tree.xpath(XPATH)
    leibie = u'配饰'
    self.logger.info('Category: %s:' % leibie)
    for snode in a_nodes:
        # #print etree.tostring(node, method='html', encoding='utf-8')
        # sub_node = snode.find('a/img')
        # image_url = urlparse.urljoin(ACC_URL,sub_node.attrib['src'])
        sub_node = snode.find('div[1]/div[2]/a')
        title = sub_node.text
        ourl = urlparse.urljoin(ACC_URL, sub_node.attrib['href'])
        text = urllib2.urlopen(ourl).read()
        tree = etree.HTML(text, parser=parser)
        imgnodes = tree.xpath(BIGXPATH)
        image_url = 'http://www.c-and-a.com.cn/' + imgnodes[0].attrib['src'][len('../../..'):]
        sub_node = snode.find('div[1]/div[3]')
        price = sub_node.text.strip()
        price = u'¥' + price[0:price.index(' RMB')]
        self.logger.info('%s(%s) - %s @ %s' % (title, price, ourl, image_url))
        collector.object_found.send(
            self, time=time, title=title, url=ourl,
            image_url=image_url, price=price, leibie=leibie
        )
        from shopping.signals import item_found
        item_found.send(
            self, name=title, url=ourl, brand=self.__class__.__name__,
            image_url=image_url, image_url2=None, price=price, category=leibie,
        )
def getData(self, category, leibie):
    temp_leibie = leibie
    parser = etree.HTMLParser(encoding='utf-8')
    URL = ALL_URL % (category)
    text = urllib2.urlopen(URL).read()
    tree = etree.HTML(text, parser=parser)
    nodes = tree.xpath(ALL_XPATH)
    time = datetime.datetime.now().strftime('%Y-%m-%d')
    print len(nodes)
    for node in nodes:
        leibie_url = node.find('a').attrib['href']
        leibie_detail = node.find('a').text
        print leibie_url
        if temp_leibie == u'打折单品':
            if u'上装' in leibie_detail or u'T' in leibie_detail or u'衫' in leibie_detail or u'毛衣' in leibie_detail:
                leibie = u'上装'
            if u'裤' in leibie_detail:
                leibie = u'裤'
            if u'裙' in leibie_detail:
                leibie = u'裙'
            if u'配' in leibie_detail:
                leibie = u'配饰'
        self.logger.info('Category: %s' % leibie + '-' + leibie_detail)
        text = urllib2.urlopen(leibie_url).read()
        tree = etree.HTML(text, parser=parser)
        cat_nodes = tree.xpath(XPATH)
        # for page in range(0,21):
        #     self.logger.info('Page: %d:' % (page+1))
        #
        #     url = LIST_URL % (page * 28)
        #     text = urllib2.urlopen(url).read()
        #     tree = etree.HTML(text, parser=parser)
        #
        #     time = datetime.datetime.now().strftime('%Y-%m-%d')
        #     nodes = tree.xpath(XPATH)
        for cat_node in cat_nodes:
            #print etree.tostring(node, method='html', encoding='utf-8')
            sub_node = cat_node.find('a')
            ourl = sub_node.attrib['href']
            text = urllib2.urlopen(ourl).read()
            tree = etree.HTML(text, parser=parser)
            imgnodes = tree.xpath(BIGXPATH)
            image_url = imgnodes[0].attrib['jqimg']
            title = sub_node.find('span').text.strip()
            if u'配' in leibie and u'鞋' in title:
                leibie = u'鞋类'
                self.logger.info('Category: %s' % leibie + '-' + leibie_detail)
            sub_node = cat_node.find('p')
            priceinfo = etree.tostring(sub_node, method='html', encoding='utf-8')
            print '-------------------------------------------------------------------'
            price = priceinfo[priceinfo.find('¥'):priceinfo.find('.00') + len('.00')]
            ori_node = sub_node.find('del')
            now_node = sub_node.find('span')
            new_price = ''
            if ori_node is not None and now_node is not None:
                price = ori_node.text.strip()
                new_price = now_node.text.strip()
            # print etree.tostring(sub_node, method='html', encoding='utf-8')
            # new_price = ''
            #
            # if sub_node is None:
            #     bold_node = cat_node.find('*/span[@class="listPrice bold"]')
            #     print bold_node
            #     now_node = cat_node.find('div[2]/div[1]/span[@class="offer_price"]')
            #     price = bold_node.text.strip()
            #     price = price[0:price.index('.')]
            #     new_price = now_node.text.strip()
            #     new_price = price[0:price.index('.')]
            # else:
            #     price = sub_node.text.strip()
            #     price = price[0:price.index('.')]
            #
            # self.logger.info('%s(%s,now:%s) - %s @ %s' % (title, price.decode('utf-8'), new_price.decode('utf-8'), ourl, image_url))
            from shopping.signals import item_found, item_update
            item_found.send(
                self, name=title, url=ourl, brand=self.__class__.__name__,
                image_url=image_url, image_url2=None, price=price, category=leibie,
            )
            if new_price != '':
                item_update.send(
                    self, name=title, url=ourl, brand=self.__class__.__name__,
                    image_url=image_url, image_url2=None, price=price,
                    new_price=new_price, category=leibie,
                )
def getData(self):
    parser = etree.HTMLParser(encoding='utf-8')
    text = urllib2.urlopen(MAIN_URL).read()
    tree = etree.HTML(text, parser=parser)
    nodes = tree.xpath(LEIBIE_XPATH)
    for node in nodes:
        sub_node = node.find('td/a')
        if sub_node is None:
            continue
        leibie_detail = sub_node.text
        if not leibie_detail:
            leibie_detail = sub_node.find('span').text
        leibie_url = PARENT_URL + sub_node.attrib['href']
        leibie = leibie_detail
        if u'配件' in leibie_detail or u'服装' in leibie_detail or u'裤子' in leibie_detail or u'冲浪裤' in leibie_detail:
            continue
        if u'短袖' in leibie_detail or u'背心' in leibie_detail or u'衫' in leibie_detail:
            leibie = u'上装'
        if u'裤' in leibie_detail:
            leibie = u'裤'
        if u'鞋' in leibie_detail:
            leibie = u'鞋'
        if u'帽' in leibie_detail or u'包' in leibie_detail:
            leibie = u'配饰'
        self.logger.info('Leibie: %s' % leibie)
        print leibie_url
        text_leibie = urllib2.urlopen(leibie_url).read()
        tree_leibie = etree.HTML(text_leibie, parser=parser)
        time = datetime.datetime.now().strftime('%Y-%m-%d')
        page_nodes = tree_leibie.xpath(PAGE_XPATH)
        pages = len(page_nodes) + 1
        print 'total pages %s' % pages
        for page in range(0, pages):
            print "page %s" % page
            data = "start=%d" % (page * 8)
            text_item = urllib2.urlopen(leibie_url, data).read()
            tree_item = etree.HTML(text_item, parser=parser)
            nodes_item1 = tree_item.xpath(ITEM_XPATH1)
            nodes_item2 = tree_item.xpath(ITEM_XPATH2)
            nodes_item = [nodes_item1, nodes_item2]
            for sub_node_item_no in nodes_item:
                for sub_node_item_td in sub_node_item_no:
                    sub_node_item = sub_node_item_td.find('table')
                    if sub_node_item is None:
                        continue
                    ourltext = sub_node_item.find('tr[1]/td/a').attrib['onclick']
                    url = 'http://www.quiksilver.cn/cn/tw/' + ourltext[ourltext.index("..") + 3:ourltext.rindex("'")]
                    title = sub_node_item.find('tr[2]/td/div').text
                    price_node = sub_node_item.find('tr[3]/td/div')
                    pricetext = etree.tostring(price_node, method='html', encoding='utf-8')
                    price = pricetext[pricetext.index("<br>") + len("<br>"):pricetext.index("</div>")]
                    image_url_backup = ''
                    text_detail = urllib2.urlopen(url).read()
                    tree_detail = etree.HTML(text_detail, parser=parser)
                    clo_nodes = tree_detail.xpath(BIG_XPATH)
                    image_url = urlparse.urljoin(url, clo_nodes[0].attrib['src'])
                    self.logger.info('%s(%s) - %s @ %s' % (title, price, url, image_url))
                    collector.object_found.send(
                        self, time=time, title=title, url=url, image_url=image_url,
                        image_url2=image_url_backup, price=price, leibie=leibie
                    )
                    from shopping.signals import item_found
                    item_found.send(
                        self, name=title, url=url, brand=self.__class__.__name__,
                        image_url=image_url, image_url2=image_url_backup,
                        price=price, category=leibie,
                    )
def getData(self): parser = etree.HTMLParser(encoding="utf-8") text = urllib2.urlopen(MAIN_URL).read() tree = etree.HTML(text, parser=parser) nodes = tree.xpath(LEIBIE_XPATH) for node in nodes: sub_node = node.find("td/a") if sub_node is None: continue leibie_detail = sub_node.text if leibie_detail is None: leibie_detail = sub_node.find("span").text leibie_url = PARENT_URL + sub_node.attrib["href"] leibie = leibie_detail if ( u"配件" in leibie_detail or u"服装" in leibie_detail or u"裤子" in leibie_detail or u"冲浪裤" in leibie_detail or u"比基尼" in leibie_detail ): continue if u"短袖" in leibie_detail or u"背心" in leibie_detail or u"毛衣" in leibie_detail: leibie = u"上装" if u"短裙" in leibie_detail: leibie = u"裙" if u"裤" in leibie_detail: leibie = u"裤" if u"鞋" in leibie_detail: leibie = u"鞋" if u"帽" in leibie_detail or u"包" in leibie_detail or u"腰带" in leibie_detail: leibie = u"配饰" self.logger.info("Leibie: %s" % leibie) print leibie_url text_leibie = urllib2.urlopen(leibie_url).read() tree_leibie = etree.HTML(text_leibie, parser=parser) time = datetime.datetime.now().strftime("%Y-%m-%d") page_nodes = tree_leibie.xpath(PAGE_XPATH) pages = len(page_nodes) + 1 print "total pages %s" % pages for page in range(0, pages): print "page %s" % page data = "start=%d" % (page * 8) text_item = urllib2.urlopen(leibie_url, data).read() tree_item = etree.HTML(text_item, parser=parser) nodes_item1 = tree_item.xpath(ITEM_XPATH1) nodes_item2 = tree_item.xpath(ITEM_XPATH2) nodes_item = [nodes_item1, nodes_item2] for sub_node_item_no in nodes_item: for sub_node_item_td in sub_node_item_no: sub_node_item = sub_node_item_td.find("table") if sub_node_item is None: continue ourltext = sub_node_item.find("tr[1]/td/a").attrib["onclick"] url = PARENT_URL + ourltext[ourltext.index("..") + 3 : ourltext.rindex("'")] title = sub_node_item.find("tr[2]/td/div").text price_node = sub_node_item.find("tr[3]/td/div") pricetext = etree.tostring(price_node, method="html", encoding="utf-8") price = pricetext[pricetext.index("<br>") + len("<br>") : pricetext.index("</div>")] image_url_backup = "" text_detail = urllib2.urlopen(url).read() tree_detail = etree.HTML(text_detail, parser=parser) clo_nodes = tree_detail.xpath(BIG_XPATH) image_url = urlparse.urljoin(url, clo_nodes[0].attrib["src"]) self.logger.info("%s(%s) - %s @ %s" % (title, price, url, image_url)) collector.object_found.send( self, time=time, title=title, url=url, image_url=image_url, image_url2=image_url_backup, price=price, leibie=leibie, ) from shopping.signals import item_found item_found.send( self, name=title, url=url, brand=self.__class__.__name__, image_url=image_url, image_url2=image_url_backup, price=price, category=leibie, )
def getData(self, URL, gender):
    parser = etree.HTMLParser(encoding='utf-8')
    self.logger.info('Gender: %s:' % gender)
    text = urllib2.urlopen(URL).read()
    tree = etree.HTML(text, parser=parser)
    time = datetime.datetime.now().strftime('%Y-%m-%d')
    nodes = tree.xpath(LIST_XPATH)
    if gender == u'女士商品':
        start = 2
        end = 8
    else:
        start = 4
        end = 14
    for i in range(start, end):
        leibie_url = 'http://china.coach.com' + nodes[i].find('a').attrib['href']
        leibie_detail = nodes[i].find('a/strong').text
        leibie = ''
        if u'服饰' in leibie_detail:
            leibie = u'服饰'
        elif u'鞋' in leibie_detail:
            leibie = u'鞋'
        else:
            leibie = u'配饰'
        self.logger.info('Category: %s' % leibie + '-' + leibie_detail)
        text = urllib2.urlopen(leibie_url).read()
        tree = etree.HTML(text, parser=parser)
        cat_nodes = tree.xpath(XPATH)
        for cat_node in cat_nodes:
            #sub_node = node.find('a[1]')
            #ourl = urlparse.urljoin(url,sub_node.attrib['href'])
            sub_node = cat_node.find('a[1]/img')
            # print etree.tostring(cat_node, method='html', encoding='utf-8')
            print '------------------------------------------------------------------------------------------'
            image_url = sub_node.attrib['src']
            title = sub_node.attrib['alt']
            if leibie_detail == u'服饰':
                if u'风衣' in title:
                    leibie = u'上装'
                else:
                    leibie = u'配饰'
                self.logger.info('Category: %s' % leibie_detail + '-' + leibie)
            productinfo = sub_node.attrib['onmouseover']
            productID = productinfo[productinfo.index("('") + 2:productinfo.index("',")]
            price = u'¥' + self.getPrice(productID)
            #self.logger.info('%s(%s) - %s @ %s - %s' % (title, price, ourl, image_url))
            self.logger.info('%s(%s) - %s-%s' % (title, price, image_url, leibie_detail))
            from shopping.signals import item_found
            item_found.send(
                self, name=title, url=image_url, brand=self.__class__.__name__,
                image_url=image_url, image_url2=None, price=price, category=leibie,
            )
def getData(self, target_url, leibie_spc=u'默认'):
    parser = etree.HTMLParser(encoding='utf-8')
    text = urllib2.urlopen(target_url).read()
    tree = etree.HTML(text, parser=parser)
    leibie_nodes = tree.xpath(LEIBIE_XPATH)
    print len(leibie_nodes)
    for leibie_node in leibie_nodes:
        if leibie_node.find('a') is None:
            continue
        leibie_name = leibie_node.find('a').text
        leibie_url = PARENT_URL + leibie_node.find('a').attrib['href']
        if u'T恤' in leibie_name or u'衬衫' in leibie_name or \
                u'针织衫' in leibie_name or u'背心' in leibie_name or u'卫衣' in leibie_name or \
                u'POLO衫' in leibie_name or u'西服' in leibie_name or u'夹克' in leibie_name or \
                u'马甲' in leibie_name or u'风衣' in leibie_name or u'大衣' in leibie_name or \
                u'棉服' in leibie_name or u'女款' in leibie_name or u'男款' in leibie_name:
            leibie = u'上装'
        elif u'牛仔裤' in leibie_name or u'休闲裤' in leibie_name or u'连身裤' in leibie_name \
                or u'连身裤' in leibie_name or u'针织裤' in leibie_name or u'西裤' in leibie_name:
            leibie = u'裤'
        elif u'半裙' in leibie_name or u'连衣裙' in leibie_name:
            leibie = u'裙'
        elif u'女鞋' in leibie_name or u'男鞋' in leibie_name:
            leibie = u'鞋'
        elif u'饰品' in leibie_spc:
            leibie = u'配饰'
        else:
            continue
        leibie_text = urllib2.urlopen(leibie_url).read()
        leibie_tree = etree.HTML(leibie_text, parser=parser)
        page_node = leibie_tree.xpath(PAGE_XPATH)[0]
        pages = int(page_node.text[1:-1])
        for page in range(1, pages + 1):
            items_url = leibie_url + (QUERY_STRING % page)
            item_text = urllib2.urlopen(items_url).read()
            item_tree = etree.HTML(item_text, parser=parser)
            item_nodes = item_tree.xpath(ITEMS_XPATH)
            for item_node in item_nodes:
                item_url = item_node.find('div[1]/a').attrib['href']
                url = item_url
                title = item_node.find('div[1]/a').attrib['title']
                time = datetime.datetime.now().strftime('%Y-%m-%d')
                detail_text = urllib2.urlopen(item_url).read()
                detail_tree = etree.HTML(detail_text, parser=parser)
                detail_node = detail_tree.xpath(BIG_XPATH)[0]
                image_url = detail_node.attrib['src']
                cuxiao_node = detail_tree.xpath(CUXIAO_XPATH)[0]
                new_price = cuxiao_node.text
                new_price = 'RMB' + new_price
                myprice_node = detail_tree.xpath(MYPRICE_XPATH)[0]
                price = 'RMB' + myprice_node.text
                image_url_backup = ''
                self.logger.info('%s(%s--%s) - %s @ %s' % (title, price, new_price, url, image_url))
                collector.object_found.send(
                    self, time=time, title=title, url=url, image_url=image_url,
                    image_url2=image_url_backup, price=price, leibie=leibie
                )
                from shopping.signals import item_found, item_update
                item_found.send(
                    self, name=title, url=url, brand=self.__class__.__name__,
                    image_url=image_url, image_url2=image_url_backup,
                    price=price, category=leibie,
                )
                item_update.send(
                    self, url=url, new_price=new_price
                )
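# Note: every collector above repeats the same urlopen-then-parse sequence with no
# error handling, so a single failed request aborts the whole run. Below is a minimal
# sketch of a shared module-level helper that could replace that pattern; the name
# fetch_tree() and the retry/timeout values are assumptions, not part of the original
# collectors.
import logging
import time as time_module
import urllib2
from lxml import etree


def fetch_tree(url, data=None, retries=3, delay=2, encoding='utf-8'):
    """Fetch a URL and return an lxml HTML tree, retrying on transient errors."""
    parser = etree.HTMLParser(encoding=encoding)
    for attempt in range(1, retries + 1):
        try:
            text = urllib2.urlopen(url, data, 30).read()  # 30-second timeout
            return etree.HTML(text, parser=parser)
        except (urllib2.URLError, IOError) as exc:
            logging.warning('fetch %s failed (attempt %d/%d): %s', url, attempt, retries, exc)
            time_module.sleep(delay)
    return None  # caller must handle a missing tree

# Hypothetical usage inside a collector:
#     tree = fetch_tree(leibie_url)
#     if tree is None:
#         continue
#     nodes = tree.xpath(ITEM_XPATH)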