Пример #1
0
    def getData(self):
        parser = etree.HTMLParser(encoding='utf-8')

        text = urllib2.urlopen(MAIN_URL).read()
        tree = etree.HTML(text, parser=parser)
        nodes = tree.xpath(LEIBIE_XPATH)


        for node in nodes:
            sub_node = node.find('a')
            if sub_node is None:
                continue

            leibie_detail = sub_node.text
            if leibie_detail is None:
                leibie_detail = sub_node.find('span').text

            count = leibie_detail[leibie_detail.index('(') + 1:leibie_detail.index(')')]
            if count == '0':
                continue

            leibie_url = sub_node.attrib['href']
            leibie = leibie_detail

            if u'卫衣' in leibie_detail or u' T恤' in leibie_detail or u'衬衫' in leibie_detail or\
               u'装' in leibie_detail:
                leibie = u'上装'
            if u'裤' in leibie_detail or u'50' in leibie_detail:
                leibie = u'裤'
            if u'鞋' in leibie_detail:
                leibie = u'鞋'
            if u'配件' in leibie_detail or u'包' in leibie_detail or u'皮带' in leibie_detail \
                or u'眼镜' in leibie_detail or u'手表' in leibie_detail:
                leibie = u'配饰'

            self.logger.info('Leibie: %s' % leibie)
            print leibie_url
            text_leibie = urllib2.urlopen(leibie_url).read()
            tree_leibie = etree.HTML(text_leibie, parser=parser)
            time = datetime.datetime.now().strftime('%Y-%m-%d')

            page_nodes = tree_leibie.xpath(PAGE_XPATH)[:-1]
            pages = len(page_nodes) + 1
            print 'total pages %s' % pages

            url_list = [leibie_url]
            for page in page_nodes:
                url_list.append(page.attrib['href'])

            for page_url in url_list:
                text_item = urllib2.urlopen(page_url).read()
                tree_item = etree.HTML(text_item, parser=parser)
                nodes_item = tree_item.xpath(ITEM_XPATH)

                for sub_node_item in nodes_item:
                    item_node = sub_node_item.find('a')
                    if item_node is None:
                        continue

                    url = item_node.attrib['href']
                    title = item_node.attrib['title']
                    price_node = sub_node_item.find('div/font[2]')
                    price = 'RMB' + price_node.text

                    new_price_node = sub_node_item.find('div/font[1]')
                    new_price = 'RMB' + new_price_node.text

                    image_url_backup = ''

                    text_detail = urllib2.urlopen(url).read()
                    tree_detail = etree.HTML(text_detail, parser=parser)
                    img_node = tree_detail.xpath(BIG_XPATH)[0]
                    image_url = img_node.attrib['src']

                    self.logger.info('%s(%s--%s) - %s @ %s' % (title, price, new_price, url, image_url))

                    collector.object_found.send(
                        self,
                        time = time, title = title, url = url,
                        image_url = image_url,
                        image_url2 = image_url_backup,
                        price = price,
                        leibie = leibie
                    )
                    from shopping.signals import item_found, item_update
                    item_found.send(
                        self,
                        name = title,
                        url = url,
                        brand = self.__class__.__name__,
                        image_url = image_url,
                        image_url2 = image_url_backup,
                        price = price,
                        category = leibie,

                    )

                    item_update.send(
                        self,
                        url=url,
                        new_price=new_price
                    )
Пример #2
0
    def fetch(self):
        self.logger.info('Kappa started.')
        kuanshi ='MENs'
        parser = etree .HTMLParser(encoding='utf-8')
        for page in range(1,2):
            URL = LIST_ADDR % (page)
            text = urllib2.urlopen(URL).read()
            tree = etree.HTML(text, parser=parser)
            time = datetime.datetime.now().strftime('%Y-%m-%d')
            nodes = tree.xpath(LEIBIEXPATH)
            for node in nodes:
                if node.text:
                   leibie = node.text
                   if leibie ==u'运动':
                       leibie = 'none'
                   elif leibie == u'运动时尚':
                       leibie = 'none'
                   elif leibie == u'时尚':
                       leibie = 'none'
                   if leibie != 'none':
                        print leibie
                        title =leibie
                        if leibie.find(u'裤')!=-1:
                            leibie=u'上装'
                        else:
                            leibie=u'下装'
                        print leibie
                        leibie_url='http://www.kappa.com.cn/product/'+ node.attrib['href']
                        #print leibie_url
                        text = urllib2.urlopen(leibie_url).read()
                        tree = etree.HTML(text,parser=parser)
                        nodes = tree.xpath(XPATH)
                        for node in nodes:
                            image_url = 'http://www.kappa.com.cn/product/'+node.attrib['href']
                            print image_url
                            text = urllib2.urlopen(image_url).read()
                            tree = etree.HTML(text,parser=parser)
                            node = tree.xpath(PRICE_XPATH)[0]
                            #print node


                            kuanshi=node.find('tr[3]').find('td[2]')
                            if kuanshi==u'男':
                                leibie='MENs'+leibie
                            else:
                                leibie='WOMENs'+leibie
                            price_node = node.find('tr[7]')
                            #print price_node
                            price_node = price_node.find('td[2]')
                            #print price_node
                            price = price_node.text
                            print price
                            bigimage_node = tree.xpath(IMAGE_XPATH)[0]
                            bigimage_url =bigimage_node.text.strip()
                            #print bigimage_url
                            #loadBigPic('/upload/product/K2104MM595-990_4_1.png')
                            start = bigimage_url.find("/")
                            end = bigimage_url.find("')")
                            #print start
                            # print end
                            bigimage_url=bigimage_url[start:end]
                            bigimage_url='http://www.kappa.com.cn'+bigimage_url
                            print bigimage_url
                            self.logger.info('%s(%s) - %s @ %s' % (title, price, image_url, bigimage_url))

                            price1 = price
                            price2 = price
                            print "%s  , %s  , %s  ,  %s   ,  %s   ,  %s" % (title,image_url,bigimage_url,price1,price2,leibie)
                            from shopping.signals import item_found, item_update
                            item_found.send(
                            self,
                            name = title,
                            url = image_url,
                            brand = self.__class__.__name__,
                            image_url = bigimage_url,
                            image_url2 = None,
                            price = price1,
                            category = leibie
                            )

                            item_update.send(
                            self,
                            url=image_url,
                            new_price=price2
                            )
Пример #3
0
    def getData(self,category,pages,leibie):
        temp_leibie = leibie
        parser = etree .HTMLParser(encoding='utf-8')
        for subcate in range(1,pages):
            time = datetime.datetime.now().strftime('%Y-%m-%d')
            self.logger.info('Category: %s-%s:' % (leibie, subcate))
            urlleft = LEFT_URL % (category)
            urlright = RIGHT_URL % (subcate)
            url = urlleft + u'%20' + urlright
            text = urllib2.urlopen(url).read()
            tree = etree.HTML(text,parser=parser)
            nodes = tree.xpath(XPATH)
            for node in nodes:
                #print etree.tostring(node, method='html', encoding='utf-8')
                s_node = node.find('div/a')
                title = urlparse.urljoin(url,s_node.attrib['title'])[24:]
                if temp_leibie == u'打折单品':
                    cate_node = node.find('span')
                    leibie_detail = cate_node.text
                    print leibie_detail
#                    self.logger.info('Category: %s' % (leibie))
                    if u'衫' in leibie_detail or u'T' in leibie_detail or u'外套' in leibie_detail or u'背心' in leibie_detail or u'毛衣' in leibie_detail or u'上衣' in leibie_detail:
                        leibie = u'上装'
                    if u'裤' in leibie_detail:
                        leibie = u'裤'
                    if u'裙' in leibie_detail:
                        leibie = u'裙'
                    if u'配饰' in leibie_detail and u'鞋' in title:
                        leibie = u'鞋'
                    self.logger.info('Category: %s' % (leibie))

                ourl = urlparse.urljoin(url,s_node.attrib['href'])
                image_url2_whole = urlparse.urljoin(url,s_node.attrib['onmouseover'])
                image_url2 = image_url2_whole[image_url2_whole.find('src=')+5 :]
                image_url2 = image_url2.replace('\'','')
                sub_node = s_node.find('img')
                image_url = urlparse.urljoin(url,sub_node.attrib['datasrc'])
                p_node = node.find('p')
                d_node = p_node.find('del')
                new_price = ''
                if d_node is not None:
                    price = d_node.text
                    new_info = etree.tostring(p_node, method='html', encoding='utf-8')
                    new_price = new_info [new_info.find('</del>')+len('</del>'):new_info.find('</p>')]
                else:
                    price = p_node.text

                self.logger.info('%s(%s,now:%s) - %s @ %s' % (title, price,new_price.decode('utf-8'), ourl, image_url))
                from shopping.signals import item_found,item_update
                item_found.send(
                    self,
                    name = title,
                    url = ourl,
                    brand = self.__class__.__name__,
                    image_url = image_url,
                    image_url2 = image_url2,
                    price = price,
                    category = leibie,
                )
                if new_price != '':
                    item_update.send(
                       self,
                       name = title,
                       url = ourl,
                       brand = self.__class__.__name__,
                       image_url = image_url,
                       image_url2 = None,
                       price = price,
                       new_price = new_price,
                       category = leibie,
                    )
Пример #4
0
    def getData(self,category,leibie):
        temp_leibie = leibie
        parser = etree .HTMLParser(encoding='utf-8')
        URL=ALL_URL % (category)
        text = urllib2.urlopen(URL).read()
        tree = etree.HTML(text, parser=parser)
        nodes = tree.xpath(ALL_XPATH)
        time = datetime.datetime.now().strftime('%Y-%m-%d')
        print len(nodes)
        for node in nodes:
            leibie_url = node.find('a').attrib['href']
            leibie_detail = node.find('a').text
            print leibie_url
            if temp_leibie == u'打折单品':
                if u'上装' in leibie_detail or u'T' in leibie_detail or u'衫' in leibie_detail or u'毛衣' in leibie_detail:
                    leibie = u'上装'
                if u'裤' in leibie_detail:
                    leibie = u'裤'
                if u'裙' in leibie_detail:
                    leibie = u'裙'
                if u'配' in leibie_detail:
                    leibie = u'配饰'
            self.logger.info('Category: %s' % leibie+'-'+leibie_detail)

            text = urllib2.urlopen(leibie_url).read()
            tree = etree.HTML(text, parser=parser)
            cat_nodes = tree.xpath(XPATH)
#        for page in range(0,21):
#            self.logger.info('Page: %d:' % (page+1))
#          #  url = LIST_URL % (page * 28)
#            text = urllib2.urlopen(url).read()
#            tree = etree.HTML(text, parser=parser)
#
#            time = datetime.datetime.now().strftime('%Y-%m-%d')
#            nodes = tree.xpath(XPATH)




            for cat_node in cat_nodes:
                #print etree.tostring(node, method='html', encoding='utf-8')
                sub_node = cat_node.find('a')
                ourl = sub_node.attrib['href']
                text = urllib2.urlopen(ourl).read()
                tree = etree.HTML(text, parser=parser)
                imgnodes = tree.xpath(BIGXPATH)
                image_url = imgnodes[0].attrib['jqimg']
                title = sub_node.find('span').text.strip()

                if u'配' in leibie and u'鞋' in title:
                    leibie = u'鞋类'
                    self.logger.info('Category: %s' % leibie+'-'+leibie_detail)


                sub_node = cat_node.find('p')
                priceinfo = etree.tostring(sub_node, method='html', encoding='utf-8')
                print '-------------------------------------------------------------------'
                price = priceinfo [priceinfo.find('¥'):priceinfo.find('.00')+len('.00')]
                ori_node = sub_node.find('del')
                now_node = sub_node.find('span')
                new_price = ''
                if ori_node is not None and now_node is not None:
                    price = ori_node.text.strip()
                    new_price = now_node.text.strip()





             #   print etree.tostring(sub_node, method='html', encoding='utf-8')
#                new_price = ''
#
#                if sub_node is None:
#                    bold_node = cat_node.find('*/span[@class="listPrice bold"]')
#                    print bold_node
#                    now_node = cat_node.find('div[2]/div[1]/span[@class="offer_price"]')
#                    price = bold_node.text.strip()
#                    price = price[0:price.index('.')]
#                    new_price = now_node.text.strip()
#                    new_price = price[0:price.index('.')]
#                else:
#                    price = sub_node.text.strip()
#                    price = price[0:price.index('.')]
#
#
                self.logger.info('%s(%s,now:%s) - %s @ %s' % (title, price.decode('utf-8'),new_price.decode('utf-8'), ourl, image_url))
                from shopping.signals import item_found,item_update
                item_found.send(
                    self,
                    name = title,
                    url = ourl,
                    brand = self.__class__.__name__,
                    image_url = image_url,
                    image_url2 = None,
                    price = price,
                    category = leibie,
                )
                if new_price != '':
                    item_update.send(
                       self,
                       name = title,
                       url = ourl,
                       brand = self.__class__.__name__,
                       image_url = image_url,
                       image_url2 = None,
                       price = price,
                       new_price = new_price,
                       category = leibie,
                    )
Пример #5
0
    def getData(self, category, kuanshi):
        parser = etree .HTMLParser(encoding='utf-8')
        time = datetime.datetime.now().strftime('%Y-%m-%d')
        self.logger.info('Category: %s:' % kuanshi)
        M_URL = LIST_URL % (category,category)
        print M_URL
        text = urllib2.urlopen(M_URL).read()
        tree = etree.HTML(text,parser=parser)
        nodes = tree.xpath(XPATH)
        #print nodes
        for node in nodes:

            cat_url = node.attrib['href']
            #print cat_url
            leibie_node = node.find('span')
            leibie_node_text=leibie_node.text.strip()
            print leibie_node_text
            print leibie_node_text=="T's + Polos"
            if leibie_node_text=='Tops':
                leibie_node_text=u'上装'
#            elif leibie_node_text=="T's + Polos":
#                leibie_node_text=u'套装'
#            elif leibie_node_text=='Tanks + Camis':
#                leibie_node_text=u'套装'
#            elif leibie_node_text=="Graphic T's":
#                leibie_node_text=u'上装'
#            elif leibie_node_text=='Shirts':
#                leibie_node_text=u'上装'
#            elif leibie_node_text=='Sweaters + Cardis':
#                leibie_node_text=u'套装'
#            elif leibie_node_text=='Sweatshirts':
#                leibie_node_text=u'上装'
#            elif leibie_node_text=='Outerwear':
#                leibie_node_text=u'上装'
            elif leibie_node_text=='Bottoms':
                leibie_node_text=u'下装'
            elif leibie_node_text=='Accessories':
                leibie_node_text=u'配饰'
            elif leibie_node_text=='College':
                leibie_node_text=u'上装'
            elif leibie_node_text=='Footwear':
                leibie_node_text=u'鞋'
            elif leibie_node_text=='New Arrivals':
                leibie_node_text='none'
            elif leibie_node_text=='Web Exclusives':
                leibie_node_text='none'
            elif leibie_node_text=="$10 T's + Tanks":
                leibie_node_text='none'
            elif leibie_node_text=='Looks To Live In':
                leibie_node_text='none'
            elif leibie_node_text=='Jean Guide':
                leibie_node_text='none'
            elif leibie_node_text=='Fragrance':
                leibie_node_text='none'
            elif leibie_node_text=='Clearance':
                leibie_node_text='none'
            elif leibie_node_text=='Back To Basics':
                return
            #print 'fafd %s' % leibie_node_text
            if leibie_node_text == 'none':
                continue

            if leibie_node_text!=u'下装':
                print "ups"
                leibie = kuanshi + '-' + leibie_node_text

                print leibie
                text = urllib2.urlopen(cat_url).read()
                tree = etree.HTML(text,parser=parser)
                nodes = tree.xpath(CAT_XPATH)

                for node in nodes:
                    url_node = node.find('a')
                    url = 'http://www.ae.com'+url_node.attrib['href']

                    image_node = url_node.find('span/img')
                    title = image_node.attrib['alt']
                    #print title
                    image_url = 'http:'+image_node.attrib['src']
                    #print image_url
                    price_node = url_node.find('span[4]')
                    price = price_node.text

                    if price.find(u'Was:')==0:
                        price_node = url_node.find('span[5]')
                        price = price_node.text[5:]
                        print price
                        text = urllib2.urlopen(url).read()
                        tree = etree.HTML(text,parser=parser)
                        node = tree.xpath(BIG_XPATH)[0]

                        image_node=node.find('img')
                        image_url='http:'+image_node.attrib['src']
                        print image_url
                        node = tree.xpath(PRICE_XPATH)
                        flag = False
                        if node:
                            #print node
                            flag = True
                            oldPrice = node[0].text
                            print oldPrice
                        self.logger.info('%s(%s) - %s @ %s' % (title, price, url, image_url))


                        if flag:
                           price1 = oldPrice
                           price2 = price
                        else:
                            price1 = price
                            price2 = price

                        #print "%s   %s   %s    %s     %s     %s" % (title,url,image_url,price1,price2,leibie)
                        from shopping.signals import item_found, item_update
                        item_found.send(
                        self,
                        name = title,
                        url = url,
                        brand = self.__class__.__name__,
                        image_url = image_url,
                        image_url2 = None,
                        price = price1,
                        category = leibie
                        )

                        item_update.send(
                        self,
                        url=url,
                        new_price=price2
                        )
            else:
                print "bottoms"
                nodes = tree.xpath(XPATHSUB)
                for node in nodes:
                    cat_url = node.attrib['href']
                    leibie_node = node.find('span')
                    leibie_node_text=leibie_node.text.strip()
                    if leibie_node_text=='Jeans':
                         leibie_node_text=u'裤'
                    elif leibie_node_text=='Pants + Crops':
                         leibie_node_text=u'裤'
                    elif leibie_node_text=='Shorts':
                         leibie_node_text=u'裤'
                    elif leibie_node_text=='Dresses':
                         leibie_node_text=u'裙'
                    else:
                        leibie_node_text= 'none'

                    if leibie_node_text == 'none':
                        continue
                    leibie = kuanshi + '-' + leibie_node_text

                    print leibie
                    text = urllib2.urlopen(cat_url).read()
                    tree = etree.HTML(text,parser=parser)
                    nodes = tree.xpath(CAT_XPATH)

                    for node in nodes:
                        url_node = node.find('a')
                        url = 'http://www.ae.com'+url_node.attrib['href']

                        image_node = url_node.find('span/img')
                        title = image_node.attrib['alt']
                        #print title
                        image_url = 'http:'+image_node.attrib['src']
                        #print image_url
                        price_node = url_node.find('span[4]')
                        price = price_node.text

                        if price.find(u'Was:')==0:
                            price_node = url_node.find('span[5]')
                            price = price_node.text[5:]
                            print price
                        #http://www.ae.com/web/browse/product.jsp?productId=2371_9560_199&catId=cat90030
                        text = urllib2.urlopen(url).read()
                        tree = etree.HTML(text,parser=parser)
                        node = tree.xpath(BIG_XPATH)[0]

                        image_node=node.find('img')
                        image_url='http:'+image_node.attrib['src']
                        node = tree.xpath(PRICE_XPATH)
                        flag = False
                        if node:
                           # print node
                            flag = True
                            oldPrice = node[0].text
                            print oldPrice
                        print image_url
                        if flag:
                            price1 = oldPrice
                            price2 = price
                        else:
                            price1 = price
                            price2 = price
                        item_found.send(
                        self,
                        name = title,
                        url = url,
                        brand = self.__class__.__name__,
                        image_url = image_url,
                        image_url2 = None,
                        price = price1,
                        category = leibie
                        )

                        item_update.send(
                        self,
                        url=url,
                        new_price=price2
                        )
Пример #6
0
    def getData(self, target_url, leibie_spc=u'默认'):
        parser = etree .HTMLParser(encoding='utf-8')
        text = urllib2.urlopen(target_url).read()
        tree = etree.HTML(text, parser=parser)
        leibie_nodes = tree.xpath(LEIBIE_XPATH)

        print len(leibie_nodes)
        for leibie_node in leibie_nodes:

            if leibie_node.find('a') is None:
                continue
            leibie_name = leibie_node.find('a').text
            leibie_url = PARENT_URL + leibie_node.find('a').attrib['href']

            if u'T恤' in leibie_name or u'衬衫' in leibie_name or \
                u'针织衫' in leibie_name or u'背心' in leibie_name or u'卫衣' in leibie_name or \
                u'POLO衫' in leibie_name or u'西服' in leibie_name or u'夹克' in leibie_name or \
                u'马甲' in leibie_name or u'风衣' in leibie_name or u'大衣' in leibie_name or \
               u'棉服' in leibie_name or u'女款' in leibie_name or u'男款' in leibie_name:
                leibie = u'上装'

            elif u'牛仔裤' in leibie_name or u'休闲裤' in leibie_name or u'连身裤' in leibie_name\
                or u'连身裤' in leibie_name or u'针织裤' in leibie_name or u'西裤' in leibie_name:
                leibie = u'裤'

            elif u'半裙' in leibie_name  or u'连衣裙' in leibie_name:
                leibie = u'裙'
            elif u'女鞋' in leibie_name  or u'男鞋' in leibie_name:
                leibie = u'鞋'
            elif u'饰品' in leibie_spc:
                leibie = u'配饰'
            else:
                continue

            leibie_text = urllib2.urlopen(leibie_url).read()
            leibie_tree = etree.HTML(leibie_text, parser=parser)
            page_node = leibie_tree.xpath(PAGE_XPATH)[0]
            pages = string.atoi(page_node.text[1:-1])


            for page in range(1,pages+1):
                items_url = leibie_url + (QUERY_STRING % page)


                item_text = urllib2.urlopen(items_url).read()
                item_tree = etree.HTML(item_text, parser=parser)
                item_nodes = item_tree.xpath(ITEMS_XPATH)


                for item_node in item_nodes:
                    item_url = item_node.find('div[1]/a').attrib['href']
                    url = item_url
                    title = item_node.find('div[1]/a').attrib['title']
                    time = datetime.datetime.now().strftime('%Y-%m-%d')

                    detail_text = urllib2.urlopen(item_url).read()
                    detail_tree = etree.HTML(detail_text, parser=parser)
                    detail_node = detail_tree.xpath(BIG_XPATH)[0]
                    image_url = detail_node.attrib['src']

                    cuxiao_node = detail_tree.xpath(CUXIAO_XPATH)[0]
                    new_price = cuxiao_node.text
                    new_price = 'RMB' + new_price

                    myprice_node = detail_tree.xpath(MYPRICE_XPATH)[0]
                    price = 'RMB' + myprice_node.text

                    image_url_backup = ''

                    self.logger.info('%s(%s--%s) - %s @ %s' % (title, price, new_price, url, image_url))

                    collector.object_found.send(
                        self,
                        time = time, title = title, url = url,
                        image_url = image_url,
                        image_url2 = image_url_backup,
                        price = price,
                        leibie = leibie
                    )
                    from shopping.signals import item_found, item_update
                    item_found.send(
                        self,
                        name = title,
                        url = url,
                        brand = self.__class__.__name__,
                        image_url = image_url,
                        image_url2 = image_url_backup,
                        price = price,
                        category = leibie,

                    )

                    item_update.send(
                        self,
                        url=url,
                        new_price=new_price
                    )
Пример #7
0
    def getData(self, category,kuanshi):
        parser = etree .HTMLParser(encoding='utf-8')
        url = LIST_URL % (category)
        text = urllib2.urlopen(url).read()
        tree = etree.HTML(text, parser=parser)
        time = datetime.datetime.now().strftime('%Y-%m-%d')
        nodes = tree.xpath(XPATH)
        for node in nodes:
            leibie = node.text.strip()

            if leibie==u'皮带':
                leibie=u'配饰'
            elif leibie==u'围巾/帽子':
                leibie=u'配饰'
            elif leibie==u'手袋/钱夹':
                leibie=u'配饰'
            elif leibie==u'牛仔裤':
                leibie=u'裤'
            elif leibie==u'热裤/七分裤':
                leibie=u'裤'
            elif leibie==u'时尚及休闲裤':
                leibie=u'裤'
            elif leibie==u'连衣裙':
                leibie=u'裙'
            elif leibie==u'半身裙':
                leibie=u'裙'
            elif leibie==u'浪漫韵动':
                leibie='none'
            elif leibie==u'意桃粉丽人':
                leibie='none'
            elif leibie==u'白色的纯纯夏日':
                leibie='none'
            else:
                leibie=u'上装'
            print leibie
            if leibie=='none':
                continue
#            #unction getSkus(url,_this){
#            基于需要f5刷新的需要,将局部刷新修改为页面跳转
            #/products/2---Women@[email protected]
#        window.location.href=url;
            link_url=node.attrib['onclick']
            #getSkus('/products/2-6-22-------.htm',this)
            link_url = link_url[link_url.find("/"):link_url.find("',")]
            link_url = 'http://www.esprit.cn'+link_url
            print link_url
            text = urllib2.urlopen(link_url).read()
            tree = etree.HTML(text,parser=parser)
            nodes = tree.xpath(XPATHSUB)
            #nodesTitle = tree.xpath(TITLEXPATH)
            #nodesPrice = tree.xpath()
            #index = 0
            for node in nodes:
                #image_url=node.find('div[@class="sku_pic"]')
                #image_url=node.find('a[@class="category_skudetails_href"]')
                #if node is None:
                #   continue
               # nodeTitle=nodesTitle[index]
                image_url = 'http://www.esprit.cn'+node.attrib['href']
               # title=nodeTitle.text
                print image_url
                text = urllib2.urlopen(image_url).read()
                tree = etree.HTML(text, parser=parser)
                node = tree.xpath(TITLEXPATH)[0]
                title= node.text
                node = tree.xpath(PRICXPATH1)[0]
                price=node.text
                print price
                node = tree.xpath(PRICXPATH2)
                if len(node) !=0:
                    oldPrice=price
                    forsale = True
                    price =node[0].text
                    print price
                else:
                    forsale =False
                bigimage_url= tree.xpath(BIGXPATH)[0]
                bigimage_url = bigimage_url.attrib['href']
                print bigimage_url

                self.logger.info('%s(%s) - %s @ %s' % (title, price, image_url,bigimage_url))

                if forsale:
                    price1 = oldPrice
                    price2 = price
                else:
                    price1 = price
                    price2 = price
                print "%s  , %s  , %s  ,  %s   ,  %s   ,  %s" % (title,image_url,bigimage_url,price1,price2,leibie)
                from shopping.signals import item_found, item_update
                item_found.send(
                    self,
                    name = title,
                    url = image_url,
                    brand = self.__class__.__name__,
                    image_url = bigimage_url,
                    image_url2 = None,
                    price = price1,
                    category = leibie
                )

                item_update.send(
                    self,
                    url=image_url,
                    new_price=price2
                )
Пример #8
0
    def getData(self, category, leibie_detail):
        """Scrape the listing page of one category and publish its items.

        The listing URL is built as ``LIST_URL % category``. For every node
        matched by ``XPATH`` the title, link and price(s) are read from the
        listing, the item's own page is fetched to locate the large image
        (``BIGXPATH``), and the result is sent through ``item_found``.
        ``item_update`` is sent as well when a second price was present.

        Args:
            category: value substituted into ``LIST_URL``.
            leibie_detail: category label (unicode); keywords found in it
                select the coarse category reported for the items.

        Returns:
            None. Results are delivered through the two signals.
        """
        # Function-scope import as in the original, hoisted so it is not
        # re-executed for every item.
        from shopping.signals import item_found, item_update

        def pick_category(text, rules, default):
            """Return the category of the last rule with a keyword in text."""
            chosen = default
            for keywords, name in rules:
                if any(keyword in text for keyword in keywords):
                    chosen = name
            return chosen

        # Rules are evaluated in order and a later match overrides an earlier
        # one, so the order of the entries is significant.
        label_rules = (
            ((u'衣', u'衫', u'外套'), u'上装'),
            ((u'裤',), u'裤'),
            ((u'裙',), u'裙'),
            ((u'配饰', u'包袋'), u'配饰'),
            ((u'鞋',), u'鞋'),
        )
        title_rules = (
            ((u'衫', u'T', u'外套', u'背心', u'毛衣', u'上衣', u'吊带'), u'上装'),
            ((u'裤',), u'裤'),
            ((u'裙',), u'裙'),
            ((u'鞋',), u'鞋'),
            ((u'帽', u'围巾', u'皮带', u'腰带'), u'配饰'),
        )

        parser = etree.HTMLParser(encoding='utf-8')
        leibie = pick_category(leibie_detail, label_rules, leibie_detail)

        self.logger.info('leibie: %s:' % leibie)
        self.logger.info('leibie_detail: %s:' % leibie_detail)

        list_text = urllib2.urlopen(LIST_URL % category).read()
        list_tree = etree.HTML(list_text, parser=parser)

        for node in list_tree.xpath(XPATH):
            link_node = node.find('h5/a')
            if link_node is None:
                # No title link in this node: nothing to scrape from it.
                continue
            url = link_node.attrib['href']
            title = link_node.text.strip()

            # Start from the label-derived category for EVERY item. Writing
            # into the shared variable made an item with no matching keyword
            # inherit the category of the previous item.
            item_leibie = leibie
            if leibie_detail == u'打折单品':
                # This label says nothing about the garment type, so
                # classify each item from its own title instead.
                item_leibie = pick_category(title, title_rules, leibie)
                self.logger.info('Category: %s' % (item_leibie))

            # Keep only the part before the decimal point; partition() also
            # copes with a price that has no decimal point at all.
            price = node.find('div[3]/p/span/span').text
            price = price.partition('.')[0]

            # When a second price paragraph exists, the first price is kept
            # as new_price and the second becomes price.
            # NOTE(review): presumably the second one is the pre-discount
            # price -- confirm against the page markup.
            new_price = ''
            second_price_node = node.find('div[3]/p[2]/span')
            if second_price_node is not None:
                new_price = price
                price = second_price_node.text.strip().partition('.')[0]

            # The large image is only available on the item's own page.
            detail_text = urllib2.urlopen(url).read()
            detail_tree = etree.HTML(detail_text, parser=parser)
            image_url = detail_tree.xpath(BIGXPATH)[0].attrib['href']

            self.logger.info('%s(%s,discount:%s) - %s @ %s - %s' % (title, price, new_price, url, image_url, item_leibie))
            item_found.send(
                self,
                name = title,
                url = url,
                brand = self.__class__.__name__,
                image_url = image_url,
                image_url2 = None,
                price = price,
                category = item_leibie,
            )
            if new_price != '':
                item_update.send(
                    self,
                    name = title,
                    url = url,
                    brand = self.__class__.__name__,
                    image_url = image_url,
                    image_url2 = None,
                    price = price,
                    new_price = new_price,
                    category = item_leibie,
                )