示例#1
0
    def getData(self, leibie_url, leibie):
        """Crawl every listing page of one category and publish each product.

        The first listing page is downloaded once and reused both for reading
        the pager and for extracting its items.

        Args:
            leibie_url: URL template whose single ``%`` placeholder receives
                the 1-based page number.
            leibie: category label forwarded unchanged to both signals.

        Raises:
            IndexError: if a product detail page has no match for BIG_XPATH.
        """
        from shopping.signals import item_found

        parser = etree.HTMLParser(encoding="utf-8")
        first_text = urllib2.urlopen(leibie_url % 1).read()
        first_tree = etree.HTML(first_text, parser=parser)

        # Each pager node is treated as one additional page after the first.
        # xpath() gives an empty list (not None) when nothing matches, so no
        # None check is needed.
        pages = len(first_tree.xpath(PAGE_XPATH)) + 1
        print("total pages %d" % pages)

        for page in range(1, pages + 1):
            if page == 1:
                # Already downloaded above; do not fetch it a second time.
                item_tree = first_tree
            else:
                item_text = urllib2.urlopen(leibie_url % page).read()
                item_tree = etree.HTML(item_text, parser=parser)

            for item_node in item_tree.xpath(ITEM_XPATH):
                item_table = item_node.find("table")
                if item_table is None:
                    continue
                time = datetime.datetime.now().strftime("%Y-%m-%d")
                url = PARENT_URL + item_table.find("tr/td/a").attrib["href"]
                title = item_table.find("tr[3]/td/b").text
                # Drop every space, then any leading CR/LF left in the cell.
                price = "".join(item_table.find("tr[4]/td").text.split(" ")).lstrip("\r\n")

                # The image URL is taken from the product's detail page.
                image_text = urllib2.urlopen(url).read()
                tree_image = etree.HTML(image_text, parser=parser)
                img_node = tree_image.xpath(BIG_XPATH)[0]
                image_url = img_node.attrib["src"]

                self.logger.info("%s(%s) - %s @ %s" % (title, price, url, image_url))
                collector.object_found.send(
                    self, time=time, title=title, url=url, image_url=image_url, price=price, leibie=leibie
                )
                item_found.send(
                    self,
                    name=title,
                    url=url,
                    brand=self.__class__.__name__,
                    image_url=image_url,
                    image_url2=None,
                    price=price,
                    category=leibie,
                )
示例#2
0
    def getData(self, category, old_leibie):
        """Read the XML product feed of one category and publish each garment.

        Every product title is mapped to a coarse category by keyword.
        Titles mentioning underwear, briefs or socks are skipped, as are
        titles that match no keyword at all.

        Args:
            category: value substituted into LIST_URL to build the feed URL.
            old_leibie: not used by this method; kept so existing callers
                keep working.
        """
        from shopping.signals import item_found

        # Titles containing any of these are never published.  The second
        # entry used to be u'n内裤' (stray "n"), which could never match and
        # let briefs fall through to the trousers branch below.
        skip_words = (u'内衣', u'内裤', u'袜子')
        # Ordered keyword groups; the first group with a hit decides the label.
        keyword_groups = (
            ((u'裙',), u'裙'),
            ((u'裤',), u'裤'),
            ((u'鞋',), u'鞋'),
            ((u'包',), u'配饰'),
            ((u'5239145', u'装', u'衣', u'衫', u'夹', u'恤'), u'上装'),
        )

        parser = etree.XMLParser(encoding='utf-8')
        self.logger.info('Category: %s:' % category)
        url = LIST_URL % (category)
        text = urllib2.urlopen(url).read()
        time = datetime.datetime.now().strftime('%Y-%m-%d')
        tree = etree.XML(text, parser=parser)

        for node in tree.xpath(XPATH):
            title = node.find('Title').text

            if any(word in title for word in skip_words):
                continue

            for words, label in keyword_groups:
                if any(word in title for word in words):
                    leibie = label
                    break
            else:
                # No keyword matched: the product is not published.
                continue

            price = node.find('Price').text
            image_url = u'http://me-city.com/' + node.find('FullImage').text

            # The image URL doubles as the product URL in both signals.
            self.logger.info('%s:%s(%s) - %s @ %s' % (leibie, title, price, image_url, image_url))
            collector.object_found.send(
                self,
                time=time, title=title, url=image_url,
                image_url=image_url,
                price=price,
                leibie=leibie
            )
            item_found.send(
                self,
                name=title,
                url=image_url,
                brand=self.__class__.__name__,
                image_url=image_url,
                image_url2=None,
                price=price,
                category=leibie,
            )
示例#3
0
    def getData(self, category, subcate, pages, leibie):
        """Walk listing pages 1 .. pages-1 of a sub-category and publish each product.

        Args:
            category: value substituted into LIST_URL (third placeholder).
            subcate: value substituted into LIST_URL (second placeholder).
            pages: exclusive upper bound of the page numbers visited.
            leibie: category label forwarded unchanged to both signals.
        """
        html_parser = etree.HTMLParser(encoding='utf-8')
        self.logger.info('Category: %s-%s:' % (category, subcate))

        for page_no in range(1, pages):
            self.logger.info('Page: %d:' % page_no)
            list_url = LIST_URL % (page_no, subcate, category)
            list_html = urllib2.urlopen(list_url).read()
            list_tree = etree.HTML(list_html, parser=html_parser)

            today = datetime.datetime.now().strftime('%Y-%m-%d')
            for product in list_tree.xpath(XPATH):
                name_link = product.find('dt[@class="skuname"]/a')
                ourl = urlparse.urljoin(list_url, name_link.attrib['href'])
                title = name_link.text

                price_span = product.find('dt[@class="price"]/span[@id="listPrice"]')
                price = u'¥' + price_span.text

                thumb = product.find('dt[@class="img"]/a/img')
                # The listing thumbnail is still looked up, but the value is
                # replaced by the detail-page image right below.
                image_url = thumb.attrib['lazy_src']

                detail_html = urllib2.urlopen(ourl).read()
                detail_tree = etree.HTML(detail_html, parser=html_parser)
                image_url = detail_tree.xpath(BIG_XPATH)[0].attrib['src']

                message = '%s(%s) - %s @ %s' % (title, price, ourl, image_url)
                self.logger.info(message)
                collector.object_found.send(
                    self,
                    time=today,
                    title=title,
                    url=ourl,
                    image_url=image_url,
                    price=price,
                    leibie=leibie,
                )
                from shopping.signals import item_found
                item_found.send(
                    self,
                    name=title,
                    url=ourl,
                    brand=self.__class__.__name__,
                    image_url=image_url,
                    image_url2=None,
                    price=price,
                    category=leibie,
                )
示例#4
0
    def getData(self, category, pages, detail_leibie, leibie):
        """Publish the products found in a fixed grid of the category page.

        For each value in 1 .. pages-1 the same category URL is downloaded
        and the XPATH matches at indexes 2-6 are scanned, three ``div``
        cells per match.

        Args:
            category: value substituted twice into LIST_URL.
            pages: exclusive upper bound of the outer loop counter.
            detail_leibie: label used only in the progress log line.
            leibie: category label forwarded unchanged to both signals.
        """
        html_parser = etree.HTMLParser(encoding='utf-8')
        for subcate in range(1, pages):
            today = datetime.datetime.now().strftime('%Y-%m-%d')
            self.logger.info('Category: %s' % leibie)
            self.logger.info('Category: %s-%s:' % (detail_leibie, subcate))

            list_url = LIST_URL % (category, category)
            list_html = urllib2.urlopen(list_url).read()
            rows = etree.HTML(list_html, parser=html_parser).xpath(XPATH)

            for row_index in range(2, 7):
                row = rows[row_index]
                for cell_index in range(1, 4):
                    cell = row.find('div[%d]' % cell_index)
                    link = cell.find('ul/li[1]/a')
                    title = link.attrib['title']
                    ourl = link.attrib['href']

                    # The image comes from the product's own page.
                    detail_html = urllib2.urlopen(ourl).read()
                    detail_tree = etree.HTML(detail_html, parser=html_parser)
                    image_url = detail_tree.xpath(BIGXPATH)[0].attrib['href']

                    # Use the third list entry as the price unless it lacks
                    # a pound sign, in which case the fourth is used.
                    price = cell.find('ul/li[3]').text
                    if price.find(u'£') == -1:
                        price = cell.find('ul/li[4]').text

                    message = '%s(%s) - %s @ %s' % (title, price, ourl, image_url)
                    self.logger.info(message)
                    collector.object_found.send(
                        self,
                        time=today,
                        title=title,
                        url=ourl,
                        image_url=image_url,
                        price=price,
                        leibie=leibie,
                    )

                    from shopping.signals import item_found
                    item_found.send(
                        self,
                        name=title,
                        url=ourl,
                        brand=self.__class__.__name__,
                        image_url=image_url,
                        image_url2=None,
                        price=price,
                        category=leibie,
                    )
示例#5
0
    def fetch(self):
        """Scrape listing pages 1 to 3 and publish every product found.

        Each product is sent with a primary and a backup image URL and the
        fixed category label u'女装'.
        """
        self.logger.info('MANGO started.')

        html_parser = etree.HTMLParser(encoding='utf-8')
        for page_no in range(1, 4):
            self.logger.info('Page: %d:' % page_no)
            list_url = LIST_URL + str(page_no)
            print(list_url)
            list_html = urllib2.urlopen(list_url).read()
            list_tree = etree.HTML(list_html, parser=html_parser)

            today = datetime.datetime.now().strftime('%Y-%m-%d')
            for product in list_tree.xpath(XPATH):
                image_url = product.find('tr[1]/td/div/a/img').attrib['src']
                image_url_backup = product.find('tr[1]/td/div/img').attrib['src']

                link = product.find('tr[2]/td/div/table/tr[2]/td/a')
                title = link.find('span').text
                item_url = urlparse.urljoin("http://shop.mango.com/", link.attrib['href'])

                price = product.find('tr[2]/td/div/table/tr[3]/td/span').text

                message = '%s(%s) - %s @ %s' % (title, price, item_url, image_url)
                self.logger.info(message)
                collector.object_found.send(
                    self,
                    time=today,
                    title=title,
                    url=item_url,
                    image_url=image_url,
                    image_url2=image_url_backup,
                    price=price,
                    leibie=u'女装',
                )
                from shopping.signals import item_found
                item_found.send(
                    self,
                    name=title,
                    url=item_url,
                    brand=self.__class__.__name__,
                    image_url=image_url,
                    image_url2=image_url_backup,
                    price=price,
                    category=u'女装',
                )
示例#6
0
    def fetch(self):
        """Scrape listing pages 1 to 7 and publish every product found.

        All products are sent with the fixed category label u'女装'.
        """
        self.logger.info('Goelia started.')
        html_parser = etree.HTMLParser(encoding='utf-8')

        for page_no in range(1, 8):
            self.logger.info('Page: %d:' % page_no)
            list_url = LIST_URL1 % page_no + LIST_URL2
            list_html = urllib2.urlopen(list_url).read()
            list_tree = etree.HTML(list_html, parser=html_parser)

            today = datetime.datetime.now().strftime('%Y-%m-%d')
            for product in list_tree.xpath(XPATH):
                # The image URL is read from the "lazyload" attribute.
                picture = product.find('div[@class="goodpic"]/a/img')
                image_url = picture.attrib['lazyload']

                name_link = product.find('div[@class="goods-main"]/div[1]/h6/a')
                title = name_link.text
                ourl = urlparse.urljoin(list_url, name_link.attrib['href'])

                price_tag = product.find(
                    'div[@class="goods-main"]/div[2]/ul/li[1]/em[@class="sell-price"]'
                )
                price = price_tag.text.strip()

                message = '%s(%s) - %s @ %s' % (title, price, ourl, image_url)
                self.logger.info(message)
                collector.object_found.send(
                    self,
                    time=today,
                    title=title,
                    url=ourl,
                    image_url=image_url,
                    price=price,
                    leibie=u'女装',
                )
                from shopping.signals import item_found
                item_found.send(
                    self,
                    name=title,
                    url=ourl,
                    brand=self.__class__.__name__,
                    image_url=image_url,
                    image_url2=None,
                    price=price,
                    category=u'女装',
                )
示例#7
0
    def fetch(self):
        """Scrape listing pages 1 to 6 and publish every product found.

        The price of each product is obtained through ``self.getPrice`` with
        the product URL; the category label is always u"女装".
        """
        self.logger.info('Oasis started.')
        html_parser = etree.HTMLParser(encoding='utf-8')

        for page_no in range(1, 7):
            self.logger.info('Page: %d:' % page_no)
            list_url = LIST_URL % page_no
            list_html = urllib2.urlopen(list_url).read()
            list_tree = etree.HTML(list_html, parser=html_parser)

            today = datetime.datetime.now().strftime('%Y-%m-%d')
            for product in list_tree.xpath(XPATH):
                image_url = product.find('dd/a/img').attrib['src']

                name_link = product.find('dt/a')
                title = name_link.text
                ourl = urlparse.urljoin(list_url, name_link.attrib['href'])

                price = self.getPrice(ourl)

                message = '%s(%s) - %s @ %s' % (title, price, ourl, image_url)
                self.logger.info(message)
                collector.object_found.send(
                    self,
                    time=today,
                    title=title,
                    url=ourl,
                    image_url=image_url,
                    price=price,
                    leibie=u"女装",
                )
                from shopping.signals import item_found
                item_found.send(
                    self,
                    name=title,
                    url=ourl,
                    brand=self.__class__.__name__,
                    image_url=image_url,
                    image_url2=None,
                    price=price,
                    category=u"女装",
                )
示例#8
0
    def getData(self):
        """Crawl every non-empty category linked from MAIN_URL and publish its products.

        For each category the first page is downloaded once and reused both
        for reading the pager and for extracting its items.  Every product is
        announced through ``collector.object_found`` and ``item_found`` with
        the price from ``div/font[2]``, followed by an ``item_update`` that
        carries the price from ``div/font[1]``.

        Raises:
            ValueError: if a category label has no "(count)" part.
            IndexError: if a product detail page has no match for BIG_XPATH.
        """
        from shopping.signals import item_found, item_update

        # Applied in order; a later matching group overrides an earlier one.
        keyword_groups = (
            ((u'卫衣', u' T恤', u'衬衫', u'装'), u'上装'),
            ((u'裤', u'50'), u'裤'),
            ((u'鞋',), u'鞋'),
            ((u'配件', u'包', u'皮带', u'眼镜', u'手表'), u'配饰'),
        )

        parser = etree.HTMLParser(encoding='utf-8')
        text = urllib2.urlopen(MAIN_URL).read()
        tree = etree.HTML(text, parser=parser)

        for node in tree.xpath(LEIBIE_XPATH):
            sub_node = node.find('a')
            if sub_node is None:
                continue

            # The label is either the link text or the text of a nested span.
            leibie_detail = sub_node.text
            if leibie_detail is None:
                leibie_detail = sub_node.find('span').text

            # The label carries the item count in parentheses; skip empty
            # categories.
            count = leibie_detail[leibie_detail.index('(') + 1:leibie_detail.index(')')]
            if count == '0':
                continue

            leibie_url = sub_node.attrib['href']
            # Fall back to the raw label when no keyword group matches.
            leibie = leibie_detail
            for words, label in keyword_groups:
                if any(word in leibie_detail for word in words):
                    leibie = label

            self.logger.info('Leibie: %s' % leibie)
            print(leibie_url)
            text_leibie = urllib2.urlopen(leibie_url).read()
            tree_leibie = etree.HTML(text_leibie, parser=parser)
            time = datetime.datetime.now().strftime('%Y-%m-%d')

            # The last pager node is not followed.
            page_nodes = tree_leibie.xpath(PAGE_XPATH)[:-1]
            pages = len(page_nodes) + 1
            print('total pages %s' % pages)

            url_list = [leibie_url]
            for page in page_nodes:
                url_list.append(page.attrib['href'])

            for page_index, page_url in enumerate(url_list):
                if page_index == 0:
                    # Already downloaded above; do not fetch it a second time.
                    tree_item = tree_leibie
                else:
                    text_item = urllib2.urlopen(page_url).read()
                    tree_item = etree.HTML(text_item, parser=parser)

                for sub_node_item in tree_item.xpath(ITEM_XPATH):
                    item_node = sub_node_item.find('a')
                    if item_node is None:
                        continue

                    url = item_node.attrib['href']
                    title = item_node.attrib['title']
                    price_node = sub_node_item.find('div/font[2]')
                    price = 'RMB' + price_node.text

                    new_price_node = sub_node_item.find('div/font[1]')
                    new_price = 'RMB' + new_price_node.text

                    image_url_backup = ''

                    # The image URL is taken from the product's detail page.
                    text_detail = urllib2.urlopen(url).read()
                    tree_detail = etree.HTML(text_detail, parser=parser)
                    img_node = tree_detail.xpath(BIG_XPATH)[0]
                    image_url = img_node.attrib['src']

                    self.logger.info('%s(%s--%s) - %s @ %s' % (title, price, new_price, url, image_url))

                    collector.object_found.send(
                        self,
                        time=time, title=title, url=url,
                        image_url=image_url,
                        image_url2=image_url_backup,
                        price=price,
                        leibie=leibie
                    )
                    item_found.send(
                        self,
                        name=title,
                        url=url,
                        brand=self.__class__.__name__,
                        image_url=image_url,
                        image_url2=image_url_backup,
                        price=price,
                        category=leibie,
                    )
                    item_update.send(
                        self,
                        url=url,
                        new_price=new_price
                    )
示例#9
0
    def getData(self, main_url):
        """Crawl the categories linked from ``main_url`` and publish their products.

        Categories whose label mentions u'全部', u'袜子' or u'泳装' are
        skipped.  The remaining labels are mapped to a coarse category by
        keyword, falling back to the raw label when nothing matches.

        Args:
            main_url: URL of the page that lists the categories.
        """
        # Applied in order; a later matching group overrides an earlier one.
        keyword_groups = (
            ((u'夹克', u'牛仔', u'衫', u'T恤'), u'上装'),
            ((u'裤',), u'裤'),
            ((u'鞋',), u'鞋'),
            ((u'带', u'包', u'帽', u'链', u'手', u'围巾'), u'配饰'),
        )
        skipped_words = (u'全部', u'袜子', u'泳装')

        html_parser = etree.HTMLParser(encoding='utf-8')
        main_html = urllib2.urlopen(main_url).read()
        main_tree = etree.HTML(main_html, parser=html_parser)

        for category_node in main_tree.xpath(LEIBIE_XPATH):
            category_link = category_node.find('a')
            if category_link is None:
                continue

            leibie_detail = category_link.text
            leibie_url = PARENT_URL + category_link.attrib['href']

            if any(word in leibie_detail for word in skipped_words):
                continue

            leibie = leibie_detail
            for words, label in keyword_groups:
                if any(word in leibie_detail for word in words):
                    leibie = label

            self.logger.info('Leibie: %s' % leibie)
            print(leibie_url)
            today = datetime.datetime.now().strftime('%Y-%m-%d')

            category_html = urllib2.urlopen(leibie_url).read()
            category_tree = etree.HTML(category_html, parser=html_parser)

            for product in category_tree.xpath(ITEM_XPATH):
                # Entries without a direct <a> child are not products.
                if product.find('a') is None:
                    continue

                name_link = product.find('div[2]/div/a')
                title = name_link.text
                url = 'http://www.converse.com.cn' + product.find('div[2]/div/a').attrib['href']
                # Remove every space from the price text.
                price = product.find('div[2]/div[2]').text.replace(' ', '')
                image_url_backup = ''

                detail_html = urllib2.urlopen(url).read()
                detail_tree = etree.HTML(detail_html, parser=html_parser)
                image_url = detail_tree.xpath(BIG_XPATH)[0].attrib['href']

                message = '%s(%s) - %s @ %s' % (title, price, url, image_url)
                self.logger.info(message)

                collector.object_found.send(
                    self,
                    time=today,
                    title=title,
                    url=url,
                    image_url=image_url,
                    image_url2=image_url_backup,
                    price=price,
                    leibie=leibie,
                )
                from shopping.signals import item_found
                item_found.send(
                    self,
                    name=title,
                    url=url,
                    brand=self.__class__.__name__,
                    image_url=image_url,
                    image_url2=image_url_backup,
                    price=price,
                    category=leibie,
                )
示例#10
0
    def getData(self, category, subcate, leibie):
        """Crawl the sub-categories of one listing and publish their products.

        Each sub-category name is mapped to a coarse category by keyword;
        names that match nothing are skipped.  Every product is published
        with the fixed price '0'.

        Args:
            category: value substituted into LIST_URL (second placeholder).
            subcate: value substituted into LIST_URL (first placeholder).
            leibie: its incoming value is never read; the label is always
                derived from the sub-category name.
        """
        # Ordered keyword groups; the first group with a hit decides the label.
        keyword_groups = (
            ((u'上衣', u'夹克', u'球衣', u'运动衫', u'卫衣', u'POLO衫', u'有袖',
              u'马甲', u'风衣', u'大衣', u'棉服', u'女款', u'男款'), u'上装'),
            ((u'裤子', u'短裤', u'连身裤', u'针织裤', u'西裤'), u'裤'),
            ((u'套服',), u'套装'),
            ((u'裙装', u'短裙'), u'裙'),
            ((u'鞋', u'拖'), u'鞋'),
            ((u'饰品', u'手套', u'背包', u'帽子', u'头饰', u'附件'), u'配饰'),
        )

        html_parser = etree.HTMLParser(encoding='utf-8')
        self.logger.info('Category: %s-%s:' % (category, subcate))

        url = LIST_URL % (subcate, category, 1)
        menu_html = urllib2.urlopen(url).read()
        menu_tree = etree.HTML(menu_html, parser=html_parser)

        for leibie_node in menu_tree.xpath(LEIBIE_XPATH):
            category_link = leibie_node.find('a')
            leibie_name = category_link.text

            for words, label in keyword_groups:
                if any(word in leibie_name for word in words):
                    leibie = label
                    break
            else:
                continue

            leibie_url = PARENT_URL + category_link.attrib['href'] + '?p=1'
            today = datetime.datetime.now().strftime('%Y-%m-%d')

            items_html = urllib2.urlopen(leibie_url).read()
            print("leibie url %s" % leibie_url)
            items_tree = etree.HTML(items_html, parser=html_parser)

            # The page count is read from the second-to-last pager node;
            # without a pager there is a single page.
            pager = items_tree.xpath(PAGE_XPATH)
            if pager:
                pages = int(pager[-2].text, 10)
            else:
                pages = 1

            for page in range(1, pages + 1):
                if page != 1:
                    # Page 1 is the tree fetched above; later pages are
                    # downloaded on demand.
                    suffix = QUERY_STRING % page
                    items_url = PARENT_URL + category_link.attrib['href'] + suffix
                    items_html = urllib2.urlopen(items_url).read()
                    items_tree = etree.HTML(items_html, parser=html_parser)
                products = items_tree.xpath(ITEM_XPATH)

                for product in products:
                    # The listing thumbnail is still looked up, but the value
                    # is replaced by the detail-page image below.
                    image_url = product.find('div[1]/a/img').attrib['src']

                    name_link = product.find('div[3]/a')
                    ourl = urlparse.urljoin(url, name_link.attrib['href'])
                    title = name_link.text

                    detail_html = urllib2.urlopen(ourl).read()
                    detail_tree = etree.HTML(detail_html, parser=html_parser)
                    big_image = detail_tree.xpath(BIG_XPATH)[0]
                    # The first two characters of the src value are dropped.
                    image_url = big_image.attrib['src'][2:]

                    price = '0'

                    message = '%s(%s) - %s @ %s' % (title, price, ourl, image_url)
                    self.logger.info(message)
                    collector.object_found.send(
                        self,
                        time=today,
                        title=title,
                        url=ourl,
                        image_url=image_url,
                        price=price,
                        leibie=leibie,
                    )
                    item_found.send(
                        self,
                        name=title,
                        url=ourl,
                        brand=self.__class__.__name__,
                        image_url=image_url,
                        image_url2=None,
                        price=price,
                        category=leibie,
                    )
示例#11
0
    def getData(self, category, kuanshi):
        """Crawl the category menu entries and publish each product via ``item_found``.

        Menu entries are read from index 7 (when ``kuanshi`` contains
        u'WOMEN') or index 5 (otherwise) up to index 19.  Indexes 7-19 map to
        fixed coarse categories; any other index falls back to the entry's
        own link text, so the method no longer fails with NameError when it
        starts at index 5.

        Products whose detail page has no match for BIGXPATH are logged and
        skipped instead of raising IndexError.

        Args:
            category: value substituted twice into LIST_URL.
            kuanshi: label that selects the start index and is logged.
        """
        from shopping.signals import item_found

        start = 7 if u'WOMEN' in kuanshi else 5
        parser = etree.HTMLParser(encoding='utf-8')
        self.logger.info('Category: %s:' % kuanshi)

        menu_url = LIST_URL % (category, category)
        menu_text = urllib2.urlopen(menu_url).read()
        menu_tree = etree.HTML(menu_text, parser=parser)
        nodes = menu_tree.xpath(XPATH)

        for num in range(start, 20):
            node = nodes[num].find('a')
            detail_leibie = node.text

            if 7 <= num <= 11:
                leibie = u'上装'
            elif 12 <= num <= 15:
                leibie = u'裤'
            elif 16 <= num <= 17:
                leibie = u'裙'
            elif num == 18:
                leibie = u'鞋'
            elif num == 19:
                leibie = u'配饰'
            else:
                # Indexes below 7 have no fixed mapping; use the menu text.
                leibie = detail_leibie
            self.logger.info('leibie: %s' % leibie + '-' + detail_leibie)

            cate_url = 'http://www.abercrombie.com' + node.attrib['href']
            cate_text = urllib2.urlopen(cate_url).read()
            cate_tree = etree.HTML(cate_text, parser=parser)

            for cat_node in cate_tree.xpath(CAT_XPATH):
                # Visit li[1], li[2], ... until the first missing index.
                index = 1
                while True:
                    clo_nodes = cat_node.find('div/ul/li[%d]' % index)
                    if clo_nodes is None:
                        break
                    index += 1

                    for clo_node in clo_nodes:
                        # The product name link sits under an h3 or an h2.
                        name_node = clo_node.find('span[@class="name"]/h3/a')
                        if name_node is None:
                            name_node = clo_node.find('span[@class="name"]/h2/a')

                        title = name_node.text
                        url_node = clo_node.find('div[@class="image-wrap"]/a')
                        url = 'http://www.abercrombie.com/webapp/wcs/stores/servlet/' + url_node.attrib['href']

                        detail_text = urllib2.urlopen(url).read()
                        detail_tree = etree.HTML(detail_text, parser=parser)
                        imgnodes = detail_tree.xpath(BIGXPATH)
                        if not imgnodes:
                            # xpath() yields an empty list when nothing
                            # matches, so test emptiness, not None.
                            self.logger.info('%s - %s has no image, skipped' % (title, url))
                            continue
                        image_url = 'http:' + imgnodes[0].attrib['src']

                        price_node = clo_node.find('div[@class="price"]/span')
                        price = price_node.text

                        self.logger.info('%s(%s) - %s @ %s' % (title, price, url, image_url))

                        item_found.send(
                            self,
                            name=title,
                            url=url,
                            brand=self.__class__.__name__,
                            image_url=image_url,
                            image_url2=None,
                            price=price,
                            category=leibie,
                        )
示例#12
0
    def getData(self, category, pages, leibie):
        """Crawl listing pages 1 .. pages-1 and publish each product via ``item_found``.

        When the requested label is u'打折单品' the label is re-derived per
        product from the text of its ``span``; a product matching no keyword
        keeps whatever label was last assigned.  Products whose price
        paragraph contains a ``del`` element also trigger ``item_update``
        with the text that follows it as the new price.

        Args:
            category: value substituted into LEFT_URL.
            pages: exclusive upper bound of the page counter substituted
                into RIGHT_URL.
            leibie: initial category label.
        """
        requested_leibie = leibie
        html_parser = etree.HTMLParser(encoding='utf-8')
        top_words = (u'衫', u'T', u'外套', u'背心', u'毛衣', u'上衣')

        for subcate in range(1, pages):
            self.logger.info('Category: %s-%s:' % (leibie, subcate))
            left_part = LEFT_URL % (category)
            right_part = RIGHT_URL % (subcate)
            url = left_part + u'%20' + right_part
            list_html = urllib2.urlopen(url).read()
            list_tree = etree.HTML(list_html, parser=html_parser)

            for product in list_tree.xpath(XPATH):
                link = product.find('div/a')
                # The title attribute is joined onto the page URL and the
                # first 24 characters of the result are dropped.
                title = urlparse.urljoin(url, link.attrib['title'])[24:]

                if requested_leibie == u'打折单品':
                    leibie_detail = product.find('span').text
                    print(leibie_detail)
                    # Checked in order; a later match overrides an earlier one.
                    if any(word in leibie_detail for word in top_words):
                        leibie = u'上装'
                    if u'裤' in leibie_detail:
                        leibie = u'裤'
                    if u'裙' in leibie_detail:
                        leibie = u'裙'
                    if u'配饰' in leibie_detail and u'鞋' in title:
                        leibie = u'鞋'
                    self.logger.info('Category: %s' % (leibie))

                ourl = urlparse.urljoin(url, link.attrib['href'])

                # The second image URL is cut out of the onmouseover handler:
                # everything after "src=" plus one character, quotes removed.
                hover = urlparse.urljoin(url, link.attrib['onmouseover'])
                image_url2 = hover[hover.find('src=') + 5:].replace('\'', '')

                image_url = urlparse.urljoin(url, link.find('img').attrib['datasrc'])

                price_para = product.find('p')
                struck = price_para.find('del')
                if struck is None:
                    price = price_para.text
                    new_price = ''
                else:
                    # Original price is inside <del>; the new price is the
                    # serialized markup between </del> and </p>.
                    price = struck.text
                    markup = etree.tostring(price_para, method='html', encoding='utf-8')
                    new_price = markup[markup.find('</del>') + len('</del>'):markup.find('</p>')]

                message = '%s(%s,now:%s) - %s @ %s' % (
                    title, price, new_price.decode('utf-8'), ourl, image_url)
                self.logger.info(message)

                from shopping.signals import item_found, item_update
                item_found.send(
                    self,
                    name=title,
                    url=ourl,
                    brand=self.__class__.__name__,
                    image_url=image_url,
                    image_url2=image_url2,
                    price=price,
                    category=leibie,
                )
                if new_price != '':
                    item_update.send(
                        self,
                        name=title,
                        url=ourl,
                        brand=self.__class__.__name__,
                        image_url=image_url,
                        image_url2=None,
                        price=price,
                        new_price=new_price,
                        category=leibie,
                    )
示例#13
0
    def getData(self, category, pages, mainleibie):
        """Scrape one top-level category and emit every product found in it.

        Fetches ``LIST_URL % category``, follows each sub-category link matched
        by ``MAIN_XPATH`` and, for every product on the sub-category page,
        opens the product page to read the large image.  Each product is sent
        through ``collector.object_found`` and ``shopping.signals.item_found``.

        category   -- value substituted into ``LIST_URL``.
        pages      -- accepted but not used by this method.
        mainleibie -- label of the top-level category; only used for logging.

        Network errors from urllib2 propagate, as do AttributeError, KeyError
        and IndexError when a page lacks one of the expected nodes.
        """
        from shopping.signals import item_found

        parser = etree.HTMLParser(encoding='utf-8')
        self.logger.info('Category: %s:' % category)

        main_text = urllib2.urlopen(LIST_URL % (category)).read()
        main_tree = etree.HTML(main_text, parser=parser)
        for menu_node in main_tree.xpath(MAIN_XPATH):
            link = menu_node.find('a')
            leibie_detail = link.text
            leibie_url = link.attrib['href']

            # Underwear, sleepwear, socks and maternity wear are not collected.
            if any(word in leibie_detail
                   for word in (u'内衣', u'睡衣', u'袜', u'孕妇')):
                continue

            # The checks are independent, so a later match overrides an
            # earlier one; labels matching nothing keep their own text.
            leibie = leibie_detail
            if any(word in leibie_detail for word in (u'衣', u'装', u'衫')):
                leibie = u'上装'
            if u'裤' in leibie_detail:
                leibie = u'裤'
            if u'裙' in leibie_detail:
                leibie = u'裙'
            if u'配饰' in leibie_detail:
                leibie = u'配饰'
            if u'鞋' in leibie_detail:
                leibie = u'鞋'

            # Parenthesised so the whole "main-sub" label is formatted;
            # '%' binds tighter than '+'.
            self.logger.info('leibie: %s:' % (mainleibie + '-' + leibie))
            print(leibie_url)

            listing_text = urllib2.urlopen(leibie_url).read()
            listing_tree = etree.HTML(listing_text, parser=parser)
            time = datetime.datetime.now().strftime('%Y-%m-%d')
            product_nodes = listing_tree.xpath(XPATH)

            # The first node matched by XPATH is never processed
            # (presumably a non-product entry -- TODO confirm).
            for product_node in product_nodes[1:]:
                product_link = product_node.find('a')
                title = product_link.attrib['title']
                url = product_link.attrib['href']

                # Drop the leading 'RMB ' and show the price with a yen sign.
                price = product_link.find('span/span/span').text
                price = u'¥' + price[len('RMB '):]

                detail_text = urllib2.urlopen(url).read()
                detail_tree = etree.HTML(detail_text, parser=parser)
                img_nodes = detail_tree.xpath(BIGXPATH)
                # 'http:' is prepended to the percent-encoded src ('/' is
                # kept), so src is presumably scheme-relative -- TODO confirm.
                image_url = 'http:' + urllib.quote(
                    img_nodes[0].attrib['src'], safe='/')
                # No secondary image is collected any more.
                image_url_backup = ''

                self.logger.info('%s(%s) - %s @ %s' % (title, price, url, image_url))
                collector.object_found.send(
                    self,
                    time = time, title = title, url = url,
                    image_url = image_url,
                    image_url2 = image_url_backup,
                    price = price,
                    leibie = leibie
                )
                item_found.send(
                    self,
                    name = title,
                    url = url,
                    brand = self.__class__.__name__,
                    image_url = image_url,
                    image_url2 = image_url_backup,
                    price = price,
                    category = leibie,
                )
示例#14
0
    def getData(self, category, kuanshi):
        """Walk the category menu of one listing page and emit each product.

        Fetches ``LIST_URL % category``; every menu node matched by ``XPATH``
        is mapped to a stored category, its listing page is fetched, and each
        product link matched by ``XPATHSUB`` is opened to read title, price(s)
        and large image.  Every product is sent through ``item_found`` (with
        the original price) followed by ``item_update`` (with the current
        price).

        category -- value substituted into ``LIST_URL``.
        kuanshi  -- accepted but not used by this method.
        """
        site_root = 'http://www.esprit.cn'
        # Menu label -> stored category.  'none' marks menu entries that are
        # skipped; any label not listed here is stored as a top.
        label_to_leibie = {
            u'皮带': u'配饰',
            u'围巾/帽子': u'配饰',
            u'手袋/钱夹': u'配饰',
            u'牛仔裤': u'裤',
            u'热裤/七分裤': u'裤',
            u'时尚及休闲裤': u'裤',
            u'连衣裙': u'裙',
            u'半身裙': u'裙',
            u'浪漫韵动': 'none',
            u'意桃粉丽人': 'none',
            u'白色的纯纯夏日': 'none',
        }

        parser = etree.HTMLParser(encoding='utf-8')
        url = LIST_URL % (category)
        menu_tree = etree.HTML(urllib2.urlopen(url).read(), parser=parser)
        # Computed as in the original; nothing below reads it.
        time = datetime.datetime.now().strftime('%Y-%m-%d')

        for menu_node in menu_tree.xpath(XPATH):
            leibie = label_to_leibie.get(menu_node.text.strip(), u'上装')
            print(leibie)
            if leibie == 'none':
                continue

            # The target lives in an onclick handler shaped like
            #   getSkus('/products/2-6-22-------.htm',this)
            # so keep everything from the first '/' up to the closing "',".
            onclick = menu_node.attrib['onclick']
            link_url = site_root + onclick[onclick.find("/"):onclick.find("',")]
            print(link_url)

            listing_tree = etree.HTML(
                urllib2.urlopen(link_url).read(), parser=parser)
            for product_link in listing_tree.xpath(XPATHSUB):
                # Despite its name this is the product page URL.
                image_url = site_root + product_link.attrib['href']
                print(image_url)

                detail_tree = etree.HTML(
                    urllib2.urlopen(image_url).read(), parser=parser)
                title = detail_tree.xpath(TITLEXPATH)[0].text
                price = detail_tree.xpath(PRICXPATH1)[0].text
                print(price)

                # When a second price node exists the first figure is kept
                # as the old price and the second becomes the current one.
                sale_nodes = detail_tree.xpath(PRICXPATH2)
                forsale = len(sale_nodes) != 0
                if forsale:
                    oldPrice = price
                    price = sale_nodes[0].text
                    print(price)

                bigimage_url = detail_tree.xpath(BIGXPATH)[0].attrib['href']
                print(bigimage_url)

                self.logger.info('%s(%s) - %s @ %s' % (title, price, image_url,bigimage_url))

                price1 = oldPrice if forsale else price
                price2 = price
                print("%s  , %s  , %s  ,  %s   ,  %s   ,  %s" % (title,image_url,bigimage_url,price1,price2,leibie))

                from shopping.signals import item_found, item_update
                item_found.send(
                    self,
                    name = title,
                    url = image_url,
                    brand = self.__class__.__name__,
                    image_url = bigimage_url,
                    image_url2 = None,
                    price = price1,
                    category = leibie
                )
                item_update.send(
                    self,
                    url=image_url,
                    new_price=price2
                )
示例#15
0
    def getData(self, category, kuanshi):
        """Scrape the menu of one ae.com landing page and emit its products.

        Fetches ``LIST_URL % (category, category)`` and walks the menu links
        matched by ``XPATH``.  Ordinary entries are scraped directly; the
        'Bottoms' entry is expanded through the sub-menu links matched by
        ``XPATHSUB`` on the landing page.  Scanning stops at the
        'Back To Basics' entry.

        category -- value substituted (twice) into ``LIST_URL``.
        kuanshi  -- style/gender label, used as prefix of the stored category
                    (``kuanshi + '-' + label``).

        Network errors from urllib2 propagate, as do AttributeError, KeyError
        and IndexError when a page lacks one of the expected nodes.
        """
        # Top-level menu label -> stored label.  'none' marks entries that are
        # skipped; labels not listed here are stored under their own text.
        top_level = {
            'Tops': u'上装',
            'Bottoms': u'下装',
            'Accessories': u'配饰',
            'College': u'上装',
            'Footwear': u'鞋',
            'New Arrivals': 'none',
            'Web Exclusives': 'none',
            "$10 T's + Tanks": 'none',
            'Looks To Live In': 'none',
            'Jean Guide': 'none',
            'Fragrance': 'none',
            'Clearance': 'none',
        }
        # Sub-menu labels collected under 'Bottoms'; everything else is skipped.
        bottoms = {
            'Jeans': u'裤',
            'Pants + Crops': u'裤',
            'Shorts': u'裤',
            'Dresses': u'裙',
        }

        parser = etree.HTMLParser(encoding='utf-8')
        self.logger.info('Category: %s:' % kuanshi)
        M_URL = LIST_URL % (category, category)
        print(M_URL)
        # Kept under its own name: the sub-menu lookup below must query the
        # landing page, not whichever page happened to be parsed last.
        main_tree = etree.HTML(urllib2.urlopen(M_URL).read(), parser=parser)

        for menu_node in main_tree.xpath(XPATH):
            cat_url = menu_node.attrib['href']
            label = menu_node.find('span').text.strip()
            print(label)
            print(label == "T's + Polos")

            if label == 'Back To Basics':
                return
            label = top_level.get(label, label)
            if label == 'none':
                continue

            if label != u'下装':
                print("ups")
                leibie = kuanshi + '-' + label
                print(leibie)
                self._emit_category_products(cat_url, leibie, parser)
                continue

            print("bottoms")
            for sub_node in main_tree.xpath(XPATHSUB):
                sub_url = sub_node.attrib['href']
                sub_label = bottoms.get(sub_node.find('span').text.strip())
                if sub_label is None:
                    continue
                leibie = kuanshi + '-' + sub_label
                print(leibie)
                self._emit_category_products(sub_url, leibie, parser)

    def _emit_category_products(self, cat_url, leibie, parser):
        """Emit item_found/item_update for every product listed at cat_url.

        Full-price and discounted products are both emitted.  ``item_found``
        carries the original price and ``item_update`` the current one; they
        are equal when the product page has no ``PRICE_XPATH`` node.
        """
        from shopping.signals import item_found, item_update

        listing_tree = etree.HTML(urllib2.urlopen(cat_url).read(), parser=parser)
        for product_node in listing_tree.xpath(CAT_XPATH):
            url_node = product_node.find('a')
            url = 'http://www.ae.com' + url_node.attrib['href']
            title = url_node.find('span/img').attrib['alt']

            price = url_node.find('span[4]').text
            if price.find(u'Was:') == 0:
                # Discounted listing: the current price is in the next span,
                # after a 5-character prefix.
                price = url_node.find('span[5]').text[5:]
                print(price)

            # e.g. http://www.ae.com/web/browse/product.jsp?productId=2371_9560_199&catId=cat90030
            detail_tree = etree.HTML(urllib2.urlopen(url).read(), parser=parser)
            big_node = detail_tree.xpath(BIG_XPATH)[0]
            image_url = 'http:' + big_node.find('img').attrib['src']
            print(image_url)

            old_price_nodes = detail_tree.xpath(PRICE_XPATH)
            if old_price_nodes:
                price1 = old_price_nodes[0].text
                print(price1)
            else:
                price1 = price
            price2 = price

            self.logger.info('%s(%s) - %s @ %s' % (title, price, url, image_url))
            item_found.send(
                self,
                name = title,
                url = url,
                brand = self.__class__.__name__,
                image_url = image_url,
                image_url2 = None,
                price = price1,
                category = leibie
            )
            item_update.send(
                self,
                url=url,
                new_price=price2
            )
示例#16
0
    def getData(self, category, leibie_detail):
        """Scrape one listing page and emit every product on it.

        Fetches ``LIST_URL % category``, reads title and price(s) of each
        product node matched by ``XPATH`` and opens the product page for the
        large image.  Every product is sent through ``item_found``; products
        with a second price paragraph are also sent through ``item_update``.

        category      -- value substituted into ``LIST_URL``.
        leibie_detail -- label of the listing.  It is mapped to a stored
                         category by keyword; for the discount listing the
                         category is derived per product from its title.

        Network errors from urllib2 propagate, as do AttributeError, KeyError,
        IndexError and ValueError when a node or the '.' in a price is missing.
        """
        from shopping.signals import item_found,item_update

        def classify(text, rules, fallback):
            # Every rule is tested and the last matching one wins, mirroring
            # a sequence of independent ``if`` statements.
            chosen = fallback
            for keywords, label in rules:
                if any(word in text for word in keywords):
                    chosen = label
            return chosen

        listing_rules = (
            ((u'衣', u'衫', u'外套'), u'上装'),
            ((u'裤',), u'裤'),
            ((u'裙',), u'裙'),
            ((u'配饰', u'包袋'), u'配饰'),
            ((u'鞋',), u'鞋'),
        )
        title_rules = (
            ((u'衫', u'T', u'外套', u'背心', u'毛衣', u'上衣', u'吊带'), u'上装'),
            ((u'裤',), u'裤'),
            ((u'裙',), u'裙'),
            ((u'鞋',), u'鞋'),
            ((u'帽', u'围巾', u'皮带', u'腰带'), u'配饰'),
        )

        parser = etree.HTMLParser(encoding='utf-8')
        leibie = classify(leibie_detail, listing_rules, leibie_detail)

        self.logger.info('leibie: %s:' % leibie)
        self.logger.info('leibie_detail: %s:' % leibie_detail)

        url = LIST_URL % category
        listing_tree = etree.HTML(urllib2.urlopen(url).read(), parser=parser)

        for node in listing_tree.xpath(XPATH):
            link = node.find('h5/a')
            url = link.attrib['href']
            title = link.text.strip()

            # Start from the listing-level category for every product so that
            # a title matching no keyword does not inherit the category of
            # the previous product.
            item_leibie = leibie
            if leibie_detail == u'打折单品':
                item_leibie = classify(title, title_rules, leibie)
                self.logger.info('Category: %s' % (item_leibie))

            # Prices are cut at the decimal point.
            price = node.find('div[3]/p/span/span').text
            price = price[0:price.index('.')]

            # When a second price paragraph exists, the first figure is
            # reported as new_price and the second one as price.
            new_price = ''
            second_price_node = node.find('div[3]/p[2]/span')
            if second_price_node is not None:
                new_price = price
                price = second_price_node.text.strip()
                price = price[0:price.index('.')]

            detail_tree = etree.HTML(urllib2.urlopen(url).read(), parser=parser)
            imgnodes = detail_tree.xpath(BIGXPATH)
            image_url = imgnodes[0].attrib['href']

            self.logger.info('%s(%s,discount:%s) - %s @ %s - %s' % (title, price, new_price, url, image_url, item_leibie))
            item_found.send(
                self,
                name = title,
                url = url,
                brand = self.__class__.__name__,
                image_url = image_url,
                image_url2 = None,
                price = price,
                category = item_leibie,
            )
            if new_price != '':
                item_update.send(
                    self,
                    name = title,
                    url = url,
                    brand = self.__class__.__name__,
                    image_url = image_url,
                    image_url2 = None,
                    price = price,
                    new_price = new_price,
                    category = item_leibie,
                )
示例#17
0
    def fetch(self):
        """Scrape the Kappa product listing and emit every product found.

        For each category link matched by ``LEIBIEXPATH`` on the listing page
        the category page is fetched, and every product link matched by
        ``XPATH`` is opened to read gender, price and large image.  Each
        product is sent through ``item_found`` and ``item_update`` (both with
        the same price).  The category link text is used as the product name.

        Stored category is ``'MENs'`` or ``'WOMENs'`` followed by the garment
        class: labels containing the trousers character map to bottoms, all
        other labels to tops.

        Network errors from urllib2 propagate, as do AttributeError, KeyError
        and IndexError when a page lacks one of the expected nodes.
        """
        from shopping.signals import item_found, item_update

        self.logger.info('Kappa started.')
        site = 'http://www.kappa.com.cn'
        # Menu entries that are not garment categories.
        skipped = (u'运动', u'运动时尚', u'时尚', 'none')
        parser = etree.HTMLParser(encoding='utf-8')

        for page in range(1,2):
            URL = LIST_ADDR % (page)
            list_tree = etree.HTML(urllib2.urlopen(URL).read(), parser=parser)
            for cat_node in list_tree.xpath(LEIBIEXPATH):
                title = cat_node.text
                if not title or title in skipped:
                    continue
                print(title)

                # Trousers are bottoms; everything else is a top.
                base_leibie = u'下装' if u'裤' in title else u'上装'
                print(base_leibie)

                leibie_url = site + '/product/' + cat_node.attrib['href']
                cat_tree = etree.HTML(
                    urllib2.urlopen(leibie_url).read(), parser=parser)
                for product_node in cat_tree.xpath(XPATH):
                    # Despite its name this is the product page URL.
                    image_url = site + '/product/' + product_node.attrib['href']
                    print(image_url)
                    detail_tree = etree.HTML(
                        urllib2.urlopen(image_url).read(), parser=parser)
                    info_table = detail_tree.xpath(PRICE_XPATH)[0]

                    # Compare the cell *text*; the prefix is applied to the
                    # category-level label so it cannot pile up across items.
                    gender = info_table.find('tr[3]').find('td[2]').text
                    if gender is not None and gender.strip() == u'男':
                        leibie = 'MENs' + base_leibie
                    else:
                        leibie = 'WOMENs' + base_leibie

                    price = info_table.find('tr[7]').find('td[2]').text
                    print(price)

                    # The node text is a script call such as
                    #   loadBigPic('/upload/product/K2104MM595-990_4_1.png')
                    # keep the path between the first '/' and the closing "')".
                    script = detail_tree.xpath(IMAGE_XPATH)[0].text.strip()
                    bigimage_url = site + script[script.find("/"):script.find("')")]
                    print(bigimage_url)
                    self.logger.info('%s(%s) - %s @ %s' % (title, price, image_url, bigimage_url))

                    price1 = price
                    price2 = price
                    print("%s  , %s  , %s  ,  %s   ,  %s   ,  %s" % (title,image_url,bigimage_url,price1,price2,leibie))
                    item_found.send(
                        self,
                        name = title,
                        url = image_url,
                        brand = self.__class__.__name__,
                        image_url = bigimage_url,
                        image_url2 = None,
                        price = price1,
                        category = leibie
                    )
                    item_update.send(
                        self,
                        url=image_url,
                        new_price=price2
                    )
示例#18
0
    def getData(self, url, mainleibie):
        """Scrape the sub-categories linked from ``url`` and emit their products.

        Every link matched by ``XPATH_URL3`` is fetched.  The product list of
        a sub-category is read from the JSON object that follows
        ``"categoryData: "`` in the text of the first node matched by
        ``XPATH``; each product page is then opened for the large image.
        Products are sent through ``collector.object_found`` and
        ``shopping.signals.item_found``.

        url        -- address of the page holding the sub-category links.
        mainleibie -- label of the parent category; only used for logging.
        """
        # Tested in order; the first rule with a keyword in the label wins.
        rules = (
            ((u'衣', u'T', u'衫', u'套'), u'上装'),
            ((u'裤',), u'裤'),
            ((u'裙',), u'裙'),
            ((u'配饰', u'包'), u'配饰'),
            ((u'鞋',), u'鞋'),
        )
        marker = "categoryData: "

        parser = etree.HTMLParser(encoding='utf-8')
        time = datetime.datetime.now().strftime('%Y-%m-%d')
        self.logger.info('mainleibie: %s:' % mainleibie)
        index_tree = etree.HTML(urllib2.urlopen(url).read(), parser=parser)

        for link_node in index_tree.xpath(XPATH_URL3):
            category_url = link_node.attrib['href']
            category = link_node.text.strip()

            # Labels matching no rule are stored under their own text.
            leibie = category
            for keywords, label in rules:
                if any(word in category for word in keywords):
                    leibie = label
                    break
            self.logger.info('leibie: %s:' % leibie)
            self.logger.info('category: %s:' % category)

            listing_tree = etree.HTML(
                urllib2.urlopen(category_url).read(), parser=parser)
            productText = listing_tree.xpath(XPATH)[0].text

            # Cut out the JSON object: from just after the marker up to and
            # including the '}' of the last '},' in the text.
            start = productText.index(marker) + len(marker)
            end = productText.rindex('},') + 1
            data = json.loads(productText[start:end])

            urlPrefix = data["urlPrefix"]
            # Read as in the original (raises KeyError when absent) but not
            # used: images come from the product page instead.
            imgPrefix = data["imgPrefix"]

            for item in data["items"]:
                title = item["name"]
                price = item["numPrice"]
                if price > 0:
                    price = u'¥' + str(price)
                item_url = urlPrefix + item["link"]["full"]

                detail_tree = etree.HTML(
                    urllib2.urlopen(item_url).read(), parser=parser)
                image_url = detail_tree.xpath(BIG_XPATH)[0].attrib['src']

                self.logger.info('%s (%s) - %s @ %s' % (title, price, item_url, image_url))
                collector.object_found.send(
                    self,
                    time = time, title = title, url = item_url,
                    image_url = image_url,
                    price = price,
                    leibie = leibie
                )
                from shopping.signals import item_found
                item_found.send(
                    self,
                    name = title,
                    url = item_url,
                    brand = self.__class__.__name__,
                    image_url = image_url,
                    image_url2 = None,
                    price = price,
                    category = leibie,
                )
示例#19
0
    def fetch(self):
        """Scrape the C&A catalogue and emit every product found.

        Walks the category links matched by ``MAIN_XPATH`` on ``ALL_URL``
        (links whose address marks them as menswear are skipped), then the
        accessories listing at ``ACC_URL``.  Products are sent through
        ``collector.object_found`` and ``shopping.signals.item_found``.

        Network errors from urllib2 propagate, as do AttributeError, KeyError,
        IndexError and ValueError when a page lacks an expected node or the
        price text lacks ' RMB'.
        """
        parser = etree.HTMLParser(encoding='utf-8')
        time = datetime.datetime.now().strftime('%Y-%m-%d')

        main_tree = etree.HTML(urllib2.urlopen(ALL_URL).read(), parser=parser)
        for node in main_tree.xpath(MAIN_XPATH):
            sub_node = node.find('a')
            if sub_node is None:
                continue
            detail_leibie = sub_node.text
            leibie_url = sub_node.attrib['href']

            # 'female' must be tested first because it contains 'male'.
            # Membership (not find() > 0) so a relative link that *starts*
            # with 'female' is not mistaken for menswear.
            Category = ''
            if 'female' in leibie_url:
                Category = u'女装'
            elif 'male' in leibie_url:
                # Menswear is not collected.
                continue

            if u'裤' in detail_leibie:
                leibie = u'裤'
            elif u'裙' in detail_leibie:
                leibie = u'裙'
            elif u'配饰' in detail_leibie:
                leibie = u'配饰'
            else:
                leibie = u'上装'

            self.logger.info('Category: %s' % (Category + '-' + leibie))
            leibie_url = 'http://www.c-and-a.com.cn/cn/fashion/product/'+leibie_url
            print(leibie_url)

            listing_tree = etree.HTML(
                urllib2.urlopen(leibie_url).read(), parser=parser)
            self._emit_products(
                listing_tree.xpath(XPATH), leibie_url, leibie, time, parser)

        acc_tree = etree.HTML(urllib2.urlopen(ACC_URL).read(), parser=parser)
        leibie = u'配饰'
        self.logger.info('Category: %s:' % leibie)
        self._emit_products(acc_tree.xpath(XPATH), ACC_URL, leibie, time, parser)

    def _emit_products(self, product_nodes, base_url, leibie, time, parser):
        """Emit object_found/item_found for each product node of one listing.

        product_nodes -- nodes matched by ``XPATH`` on the listing page.
        base_url      -- listing address; product links are resolved against it.
        leibie        -- category stored with every product.
        time          -- date string passed on to ``collector.object_found``.
        parser        -- HTML parser used for the product pages.
        """
        from shopping.signals import item_found

        for snode in product_nodes:
            link = snode.find('div[1]/div[2]/a')
            title = link.text
            ourl = urlparse.urljoin(base_url, link.attrib['href'])
            print(ourl)

            detail_tree = etree.HTML(urllib2.urlopen(ourl).read(), parser=parser)
            imgnodes = detail_tree.xpath(BIGXPATH)
            # Drop the leading '../../..' of the src and hang the rest off
            # the site root.
            image_url = 'http://www.c-and-a.com.cn/'+imgnodes[0].attrib['src'][len('../../..'):]

            # Keep the figure before ' RMB' and show it with a yen sign.
            price = snode.find('div[1]/div[3]').text.strip()
            price = u'¥' + price[0:price.index(' RMB')]

            self.logger.info('%s(%s) - %s @ %s' % (title, price, ourl, image_url))
            collector.object_found.send(
                self,
                time = time, title = title, url = ourl,
                image_url = image_url,
                price = price,
                leibie = leibie
            )
            item_found.send(
                self,
                name = title,
                url = ourl,
                brand = self.__class__.__name__,
                image_url = image_url,
                image_url2 = None,
                price = price,
                category = leibie,
            )
示例#20
0
    def getData(self, category, leibie):
        """Crawl one top-level category and publish every product found in it.

        The listing page ``ALL_URL % category`` is fetched and every node
        matched by ``ALL_XPATH`` is treated as a sub-category link.  Each
        product on a sub-category page (``XPATH``) has its detail page
        downloaded to read the large image (``BIGXPATH``, ``jqimg`` attribute).
        ``item_found`` is sent for every product; ``item_update`` is sent as
        well when the listing shows both a ``<del>`` (original) and a
        ``<span>`` (current) price.

        Args:
            category: value substituted into ``ALL_URL``.
            leibie: category label attached to the products.  When it equals
                u'打折单品' a label is derived from each sub-category name
                instead, falling back to ``leibie`` when no keyword matches.

        Notes:
            * The label is computed afresh for every sub-category and every
              product, so a label picked for one of them never carries over
              to the next.
            * Products whose detail page yields no image node are logged and
              skipped rather than aborting the whole crawl.
        """
        # Function-scope import kept from the original code (presumably to
        # avoid an import cycle with the signals module -- TODO confirm).
        from shopping.signals import item_found, item_update

        def as_unicode(value):
            # Element text may already be ``unicode`` while the slice cut out
            # of the serialized markup is a UTF-8 byte string.  Calling
            # ``.decode`` on ``unicode`` triggers an implicit ASCII encode in
            # Python 2, which fails on the currency sign.
            if isinstance(value, unicode):
                return value
            return value.decode('utf-8')

        parser = etree.HTMLParser(encoding='utf-8')
        listing = urllib2.urlopen(ALL_URL % (category)).read()
        nodes = etree.HTML(listing, parser=parser).xpath(ALL_XPATH)
        print(len(nodes))

        for node in nodes:
            link = node.find('a')
            leibie_url = link.attrib['href']
            leibie_detail = link.text
            print(leibie_url)

            # Always start from the caller's label; later tests override
            # earlier ones, exactly as in the original chain of ``if``s.
            node_leibie = leibie
            if leibie == u'打折单品':
                if (u'上装' in leibie_detail or u'T' in leibie_detail
                        or u'衫' in leibie_detail or u'毛衣' in leibie_detail):
                    node_leibie = u'上装'
                if u'裤' in leibie_detail:
                    node_leibie = u'裤'
                if u'裙' in leibie_detail:
                    node_leibie = u'裙'
                if u'配' in leibie_detail:
                    node_leibie = u'配饰'
            self.logger.info('Category: %s' % node_leibie + '-' + leibie_detail)

            page = urllib2.urlopen(leibie_url).read()
            cat_nodes = etree.HTML(page, parser=parser).xpath(XPATH)

            for cat_node in cat_nodes:
                link_node = cat_node.find('a')
                ourl = link_node.attrib['href']
                detail = urllib2.urlopen(ourl).read()
                imgnodes = etree.HTML(detail, parser=parser).xpath(BIGXPATH)
                if not imgnodes:
                    self.logger.info('No image found, skipped: %s' % ourl)
                    continue
                image_url = imgnodes[0].attrib['jqimg']
                title = link_node.find('span').text.strip()

                # Shoes listed under an accessories label get their own label,
                # decided per product so other accessories keep theirs.
                item_leibie = node_leibie
                if u'配' in node_leibie and u'鞋' in title:
                    item_leibie = u'鞋类'
                    self.logger.info('Category: %s' % item_leibie + '-' + leibie_detail)

                price_node = cat_node.find('p')
                priceinfo = etree.tostring(price_node, method='html', encoding='utf-8')
                print('-------------------------------------------------------------------')
                # Default price: the span of the serialized markup running
                # from the currency sign through the first '.00'.
                price = priceinfo[priceinfo.find('¥'):priceinfo.find('.00') + len('.00')]
                ori_node = price_node.find('del')
                now_node = price_node.find('span')
                new_price = ''
                if ori_node is not None and now_node is not None:
                    # Discounted product: <del> holds the original price and
                    # <span> the current one.
                    price = ori_node.text.strip()
                    new_price = now_node.text.strip()

                self.logger.info('%s(%s,now:%s) - %s @ %s' % (
                    title, as_unicode(price), as_unicode(new_price), ourl, image_url))
                item_found.send(
                    self,
                    name=title,
                    url=ourl,
                    brand=self.__class__.__name__,
                    image_url=image_url,
                    image_url2=None,
                    price=price,
                    category=item_leibie,
                )
                if new_price != '':
                    item_update.send(
                        self,
                        name=title,
                        url=ourl,
                        brand=self.__class__.__name__,
                        image_url=image_url,
                        image_url2=None,
                        price=price,
                        new_price=new_price,
                        category=item_leibie,
                    )
示例#21
0
    def getData(self):
        """Walk every category linked from ``MAIN_URL`` and publish its products.

        For each node matched by ``LEIBIE_XPATH`` that holds a ``td/a`` link,
        the link text (or the text of its ``span`` when the link text is
        empty) is used as the category name.  Categories containing one of
        the skipped keywords are ignored; the rest are mapped to a label by
        keyword, the last matching rule winning.  Every listing page is
        requested with a ``start=<offset>`` payload in steps of 8, and each
        product found through ``ITEM_XPATH1`` / ``ITEM_XPATH2`` is sent via
        ``collector.object_found`` and ``item_found``.
        """
        skipped_words = (u'配件', u'服装', u'裤子', u'冲浪裤')
        # Evaluated in order; a later match overrides an earlier one.
        label_rules = (
            ((u'短袖', u'背心', u'衫'), u'上装'),
            ((u'裤',), u'裤'),
            ((u'鞋',), u'鞋'),
            ((u'帽', u'包'), u'配饰'),
        )

        html_parser = etree.HTMLParser(encoding='utf-8')
        main_html = urllib2.urlopen(MAIN_URL).read()
        category_rows = etree.HTML(main_html, parser=html_parser).xpath(LEIBIE_XPATH)

        for row in category_rows:
            anchor = row.find('td/a')
            if anchor is None:
                continue

            # Fall back to the nested <span> when the link itself has no text.
            leibie_detail = anchor.text or anchor.find('span').text
            leibie_url = PARENT_URL + anchor.attrib['href']

            if any(word in leibie_detail for word in skipped_words):
                continue

            leibie = leibie_detail
            for words, label in label_rules:
                if any(word in leibie_detail for word in words):
                    leibie = label

            self.logger.info('Leibie: %s' % leibie)
            print(leibie_url)
            category_html = urllib2.urlopen(leibie_url).read()
            category_tree = etree.HTML(category_html, parser=html_parser)
            today = datetime.datetime.now().strftime('%Y-%m-%d')

            # One page more than the number of pager nodes.
            pages = 1 + len(category_tree.xpath(PAGE_XPATH))
            print('total pages %s' % pages)

            for page in range(pages):
                print("page %s" % page)
                payload = "start=%d" % (page * 8)

                listing_html = urllib2.urlopen(leibie_url, payload).read()
                listing_tree = etree.HTML(listing_html, parser=html_parser)
                first_cells = listing_tree.xpath(ITEM_XPATH1)
                second_cells = listing_tree.xpath(ITEM_XPATH2)

                for cells in (first_cells, second_cells):
                    for cell in cells:
                        table = cell.find('table')
                        if table is None:
                            continue

                        # The product link lives inside the onclick handler:
                        # take what follows "../" up to the last quote.
                        onclick = table.find('tr[1]/td/a').attrib['onclick']
                        begin = onclick.index("..") + 3
                        finish = onclick.rindex("'")
                        url = 'http://www.quiksilver.cn/cn/tw/' + onclick[begin:finish]

                        title = table.find('tr[2]/td/div').text
                        markup = etree.tostring(
                            table.find('tr[3]/td/div'), method='html', encoding='utf-8')
                        # Price is the text between <br> and the closing </div>.
                        price_from = markup.index("<br>") + len("<br>")
                        price_to = markup.index("</div>")
                        price = markup[price_from:price_to]

                        image_url_backup = ''

                        detail_html = urllib2.urlopen(url).read()
                        detail_tree = etree.HTML(detail_html, parser=html_parser)
                        big_images = detail_tree.xpath(BIG_XPATH)
                        image_url = urlparse.urljoin(url, big_images[0].attrib['src'])

                        self.logger.info('%s(%s) - %s @ %s' % (title, price, url, image_url))

                        collector.object_found.send(
                            self,
                            time=today,
                            title=title,
                            url=url,
                            image_url=image_url,
                            image_url2=image_url_backup,
                            price=price,
                            leibie=leibie,
                        )
                        from shopping.signals import item_found
                        item_found.send(
                            self,
                            name=title,
                            url=url,
                            brand=self.__class__.__name__,
                            image_url=image_url,
                            image_url2=image_url_backup,
                            price=price,
                            category=leibie,
                        )
示例#22
0
    def getData(self):
        """Scrape all categories reachable from ``MAIN_URL`` and emit their items.

        A category is taken from each ``LEIBIE_XPATH`` node that contains a
        ``td/a`` link; its name is the link text, or the text of the nested
        ``span`` when the link text is ``None``.  Names containing an excluded
        keyword are skipped, the others are translated to a label by keyword
        (the last matching rule wins).  Listing pages are requested with a
        ``start=<offset>`` payload in steps of 8; each product cell found via
        ``ITEM_XPATH1`` / ``ITEM_XPATH2`` is reported through
        ``collector.object_found`` and ``item_found``.
        """
        excluded = (u"配件", u"服装", u"裤子", u"冲浪裤", u"比基尼")
        # Ordered rules: every matching rule assigns, so the last match wins.
        rules = (
            ((u"短袖", u"背心", u"毛衣"), u"上装"),
            ((u"短裙",), u"裙"),
            ((u"裤",), u"裤"),
            ((u"鞋",), u"鞋"),
            ((u"帽", u"包", u"腰带"), u"配饰"),
        )

        html_parser = etree.HTMLParser(encoding="utf-8")
        front_page = urllib2.urlopen(MAIN_URL).read()
        front_tree = etree.HTML(front_page, parser=html_parser)

        for entry in front_tree.xpath(LEIBIE_XPATH):
            link = entry.find("td/a")
            if link is None:
                continue

            leibie_detail = link.text
            if leibie_detail is None:
                # The name sits in a nested <span> for some entries.
                leibie_detail = link.find("span").text

            leibie_url = PARENT_URL + link.attrib["href"]

            if any(keyword in leibie_detail for keyword in excluded):
                continue

            leibie = leibie_detail
            for keywords, label in rules:
                if any(keyword in leibie_detail for keyword in keywords):
                    leibie = label

            self.logger.info("Leibie: %s" % leibie)
            print(leibie_url)
            overview = urllib2.urlopen(leibie_url).read()
            overview_tree = etree.HTML(overview, parser=html_parser)
            stamp = datetime.datetime.now().strftime("%Y-%m-%d")

            pager = overview_tree.xpath(PAGE_XPATH)
            pages = len(pager) + 1
            print("total pages %s" % pages)

            for page in range(pages):
                print("page %s" % page)
                post_data = "start=%d" % (page * 8)

                page_html = urllib2.urlopen(leibie_url, post_data).read()
                page_tree = etree.HTML(page_html, parser=html_parser)
                left_cells = page_tree.xpath(ITEM_XPATH1)
                right_cells = page_tree.xpath(ITEM_XPATH2)

                for group in (left_cells, right_cells):
                    for cell in group:
                        product = cell.find("table")
                        if product is None:
                            continue

                        # Relative product path is embedded in the onclick
                        # handler, after "../" and before the last quote.
                        handler = product.find("tr[1]/td/a").attrib["onclick"]
                        head = handler.index("..") + 3
                        tail = handler.rindex("'")
                        url = PARENT_URL + handler[head:tail]

                        title = product.find("tr[2]/td/div").text
                        price_html = etree.tostring(
                            product.find("tr[3]/td/div"), method="html", encoding="utf-8"
                        )
                        # Keep only what lies between <br> and </div>.
                        lo = price_html.index("<br>") + len("<br>")
                        hi = price_html.index("</div>")
                        price = price_html[lo:hi]

                        image_url_backup = ""

                        detail_page = urllib2.urlopen(url).read()
                        detail_tree = etree.HTML(detail_page, parser=html_parser)
                        pictures = detail_tree.xpath(BIG_XPATH)
                        image_url = urlparse.urljoin(url, pictures[0].attrib["src"])

                        self.logger.info("%s(%s) - %s @ %s" % (title, price, url, image_url))

                        collector.object_found.send(
                            self,
                            time=stamp,
                            title=title,
                            url=url,
                            image_url=image_url,
                            image_url2=image_url_backup,
                            price=price,
                            leibie=leibie,
                        )
                        from shopping.signals import item_found

                        item_found.send(
                            self,
                            name=title,
                            url=url,
                            brand=self.__class__.__name__,
                            image_url=image_url,
                            image_url2=image_url_backup,
                            price=price,
                            category=leibie,
                        )
示例#23
0
    def getData(self, URL, gender):
        """Scrape the category menu at ``URL`` and emit one signal per product.

        A fixed window of the ``LIST_XPATH`` nodes is visited: indexes 2-7
        when ``gender`` is u'女士商品', indexes 4-13 otherwise.  Each category
        page is fetched and every ``XPATH`` node on it yields a product whose
        image, title and product id are read from its first ``a/img``
        element; the price comes from ``self.getPrice``.  The image URL is
        also sent as the product URL.
        """
        html_parser = etree.HTMLParser(encoding='utf-8')
        self.logger.info('Gender: %s:' % gender)
        menu_html = urllib2.urlopen(URL).read()
        menu_tree = etree.HTML(menu_html, parser=html_parser)
        # Not used below; kept so the call sequence matches the original.
        today = datetime.datetime.now().strftime('%Y-%m-%d')
        menu_nodes = menu_tree.xpath(LIST_XPATH)

        first, last = (2, 8) if gender == u'女士商品' else (4, 14)

        for position in range(first, last):
            entry = menu_nodes[position]
            leibie_url = 'http://china.coach.com' + entry.find('a').attrib['href']
            leibie_detail = entry.find('a/strong').text

            if u'服饰' in leibie_detail:
                leibie = u'服饰'
            elif u'鞋' in leibie_detail:
                leibie = u'鞋'
            else:
                leibie = u'配饰'
            self.logger.info('Category: %s-%s' % (leibie, leibie_detail))

            category_html = urllib2.urlopen(leibie_url).read()
            category_tree = etree.HTML(category_html, parser=html_parser)

            for product in category_tree.xpath(XPATH):
                img = product.find('a[1]/img')
                print('------------------------------------------------------------------------------------------')
                image_url = img.attrib['src']
                title = img.attrib['alt']

                if leibie_detail == u'服饰':
                    # Within the clothing menu only trench coats count as tops.
                    leibie = u'上装' if u'风衣' in title else u'配饰'
                    self.logger.info('Category: %s-%s' % (leibie_detail, leibie))

                # The product id is the first quoted argument of the
                # onmouseover handler.
                hover = img.attrib['onmouseover']
                id_from = hover.index("('") + 2
                id_to = hover.index("',")
                productID = hover[id_from:id_to]
                price = u'¥' + self.getPrice(productID)

                self.logger.info('%s(%s) - %s-%s' % (title, price, image_url, leibie_detail))
                from shopping.signals import item_found
                item_found.send(
                    self,
                    name=title,
                    url=image_url,
                    brand=self.__class__.__name__,
                    image_url=image_url,
                    image_url2=None,
                    price=price,
                    category=leibie,
                )
示例#24
0
    def getData(self, target_url, leibie_spc=u'默认'):
        """Crawl the category links on ``target_url`` and publish each product.

        Every node matched by ``LEIBIE_XPATH`` that contains an ``<a>`` is
        mapped to a category label by keyword.  A category whose name matches
        no keyword is skipped unless ``leibie_spc`` contains u'饰品', in which
        case it is labelled u'配饰'.  Each kept category is walked page by
        page; every product's detail page is fetched for its image and its two
        prices, and the result is sent through ``collector.object_found``,
        ``item_found`` and ``item_update``.

        :param target_url: URL of the page that holds the category links.
        :param leibie_spc: caller-supplied hint; only tested for u'饰品'.
        """
        parser = etree .HTMLParser(encoding='utf-8')
        text = urllib2.urlopen(target_url).read()
        tree = etree.HTML(text, parser=parser)
        leibie_nodes = tree.xpath(LEIBIE_XPATH)

        print len(leibie_nodes)
        for leibie_node in leibie_nodes:

            # Nodes without a link carry no category to follow.
            if leibie_node.find('a') is None:
                continue
            leibie_name = leibie_node.find('a').text
            leibie_url = PARENT_URL + leibie_node.find('a').attrib['href']

            # Keyword -> label mapping; the first matching branch wins.
            if u'T恤' in leibie_name or u'衬衫' in leibie_name or \
                u'针织衫' in leibie_name or u'背心' in leibie_name or u'卫衣' in leibie_name or \
                u'POLO衫' in leibie_name or u'西服' in leibie_name or u'夹克' in leibie_name or \
                u'马甲' in leibie_name or u'风衣' in leibie_name or u'大衣' in leibie_name or \
               u'棉服' in leibie_name or u'女款' in leibie_name or u'男款' in leibie_name:
                leibie = u'上装'

            # NOTE(review): u'连身裤' is tested twice below; the second test is redundant.
            elif u'牛仔裤' in leibie_name or u'休闲裤' in leibie_name or u'连身裤' in leibie_name\
                or u'连身裤' in leibie_name or u'针织裤' in leibie_name or u'西裤' in leibie_name:
                leibie = u'裤'

            elif u'半裙' in leibie_name  or u'连衣裙' in leibie_name:
                leibie = u'裙'
            elif u'女鞋' in leibie_name  or u'男鞋' in leibie_name:
                leibie = u'鞋'
            elif u'饰品' in leibie_spc:
                # Depends on the caller's hint, not on the category name.
                leibie = u'配饰'
            else:
                continue

            leibie_text = urllib2.urlopen(leibie_url).read()
            leibie_tree = etree.HTML(leibie_text, parser=parser)
            # NOTE(review): [0] raises IndexError when PAGE_XPATH matches nothing.
            page_node = leibie_tree.xpath(PAGE_XPATH)[0]
            # Page count is the node text without its first and last character
            # (presumably surrounding brackets -- TODO confirm against the site).
            pages = string.atoi(page_node.text[1:-1])


            # Pages are numbered from 1 and selected through QUERY_STRING.
            for page in range(1,pages+1):
                items_url = leibie_url + (QUERY_STRING % page)


                item_text = urllib2.urlopen(items_url).read()
                item_tree = etree.HTML(item_text, parser=parser)
                item_nodes = item_tree.xpath(ITEMS_XPATH)


                for item_node in item_nodes:
                    # The href is used as-is, both to fetch the detail page and
                    # as the published product URL.
                    item_url = item_node.find('div[1]/a').attrib['href']
                    url = item_url
                    title = item_node.find('div[1]/a').attrib['title']
                    time = datetime.datetime.now().strftime('%Y-%m-%d')

                    detail_text = urllib2.urlopen(item_url).read()
                    detail_tree = etree.HTML(detail_text, parser=parser)
                    detail_node = detail_tree.xpath(BIG_XPATH)[0]
                    image_url = detail_node.attrib['src']

                    # NOTE(review): the concatenation raises TypeError when the
                    # matched node has no text.
                    cuxiao_node = detail_tree.xpath(CUXIAO_XPATH)[0]
                    new_price = cuxiao_node.text
                    new_price = 'RMB' + new_price

                    myprice_node = detail_tree.xpath(MYPRICE_XPATH)[0]
                    price = 'RMB' + myprice_node.text

                    image_url_backup = ''

                    self.logger.info('%s(%s--%s) - %s @ %s' % (title, price, new_price, url, image_url))

                    collector.object_found.send(
                        self,
                        time = time, title = title, url = url,
                        image_url = image_url,
                        image_url2 = image_url_backup,
                        price = price,
                        leibie = leibie
                    )
                    from shopping.signals import item_found, item_update
                    item_found.send(
                        self,
                        name = title,
                        url = url,
                        brand = self.__class__.__name__,
                        image_url = image_url,
                        image_url2 = image_url_backup,
                        price = price,
                        category = leibie,

                    )

                    # Sent for every product, carrying the CUXIAO_XPATH price.
                    item_update.send(
                        self,
                        url=url,
                        new_price=new_price
                    )