def handle(self, datas):
    """Extract one comment record from a single review <div class='item'> node."""
    data = datas.xpath("div[@class='item']")
    address = self.mackining('address', data)
    name = self.mackining('name', data)
    url = self.mackining('url', data)
    score = self.mackining('score', data)
    # The raw score text embeds a single 0-5 digit; fall back to '' if absent.
    SCORES = re.search(r'\s*([0-5])\s*', score)
    score = int(SCORES.group(1)) if SCORES else ''
    title = self.mackining('title', data)
    comment = self.mackining('comment', data)
    commentid = self.mackining('commentid', data)
    buytime = self.mackining('buytime', data)
    useful = int(self.mackining('useful', data))
    reply = int(self.mackining('reply', data))
    buytime = ProcessData.str_datetime(buytime)
    commenttime = self.mackining('commenttime', data)
    commenttime = ProcessData.str_datetime(commenttime)
    return {
        'address': address,
        'name': name,
        'url': url,
        'score': score,
        'title': title,
        'comment': comment,
        'commentid': commentid,
        'buytime': buytime,
        'commenttime': commenttime,
        'province': address,
        'city': '',
        'useful': useful,
        'reply': reply
    }
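# A minimal standalone sketch of the score extraction used in handle() above.
# It assumes the 'score' field returned by mackining() is free text containing
# a single 0-5 digit (the exact text format, and mackining() itself, are
# assumptions, not confirmed here).
import re

def parse_score(text):
    # Return the 0-5 rating embedded in `text`, or '' when no digit is found,
    # mirroring the fallback behaviour of handle().
    match = re.search(r'\s*([0-5])\s*', text)
    return int(match.group(1)) if match else ''

# parse_score(u' 4 ')  -> 4
# parse_score(u'')     -> ''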
def crawl(self):
    fid = self.key
    category_data = extract_category(self)
    count = 3    # provisional page count; recomputed from wareCount on the first page
    pages = 1    # start from page 1
    while pages <= count:
        url = self.get_url(fid, pages)
        try:
            jsons = ProcessData.get_json_data(url)
            if pages == 1:
                # Each list page holds at most 100 items; use float division so
                # math.ceil rounds up correctly under Python 2.
                count = math.ceil(int(jsons['wareCount']) / 100.0)
            lists = jsons['wareInfo']
        except Exception as e:
            self.logger.error(url)
            self.logger.error(e)
            print 'error ', url
            return
        if lists == []:
            return {}
        for i in range(len(lists)):
            wareId = lists[i]['wareId']
            try:
                # 'good' looks like '95%'; convert it to a 0-1 overall product score.
                f = lambda x: int(x[:-1]) / 100.00
                ecsumscores = float(f(lists[i]['good']))
            except:
                ecsumscores = 0
            crawl_data = {
                'source_id': wareId,
                'source': self.data.get('source'),
                'summary': {},
                'title': lists[i]['wname'],
                'adword': lists[i]['adword'],
                'price': float(lists[i]['jdPrice']),
                'original_price': float(lists[i]['martPrice']),
                'score': ecsumscores
            }
            crawl_data.update(category_data)
            data = {
                'priorcategory': self.data['priorcategory'],
                'presentcategory': self.data['priorcategory']
            }
            model = EcBasicModel(crawl_data)
            export(model)
            data["uuid"] = model["id"]
            Scheduler.schedule(DetailCrawler.type, key=wareId, data=data)
            Scheduler.schedule(CommentCrawler.type, key=wareId, data=data)
        pages += 1
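# A minimal sketch of the page-count arithmetic used in crawl() above: the list
# API reports the total item count in 'wareCount' and the code assumes at most
# 100 items per page.
import math

def page_count(ware_count, page_size=100):
    # Round up so a partially filled last page is still crawled.
    return int(math.ceil(float(ware_count) / page_size))

# page_count(230) -> 3
# page_count(100) -> 1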
def crawl(self):
    wareId = str(self.key)
    url = "http://item.yhd.com/item/%s" % wareId
    html_stream = ProcessData.get_web_data(url)
    tree = etree.HTML(html_stream.text)
    self.crawler_data(tree)
def get_data(self, CatID, pages):
    url = 'http://www.ows.newegg.com.cn/cat/%s' % (str(CatID))
    list_urls = {
        'page': str(pages),
        'pagesize': 20,
        'sort': 10
    }
    return ProcessData.get_json_data(url, parameter=list_urls)
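# A rough sketch of the request get_data() above delegates to ProcessData.
# It assumes ProcessData.get_json_data(url, parameter=...) issues a plain HTTP
# GET with `parameter` as the query string and decodes the JSON body; that
# behaviour, and the sample CatID below, are assumptions for illustration only.
import requests

def fetch_newegg_page(cat_id, page, page_size=20, sort=10):
    url = 'http://www.ows.newegg.com.cn/cat/%s' % cat_id
    params = {'page': str(page), 'pagesize': page_size, 'sort': sort}
    resp = requests.get(url, params=params, timeout=10)
    return resp.json()

# fetch_newegg_page('735', 1)  # hypothetical category id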
def crawl(self):
    key = str(self.key)
    count = 2    # number of list pages to request
    for pages in xrange(1, count):    # crawl pages 1 .. count-1
        url = self.get_url(key, pages)
        html_stream = ProcessData.get_web_data(url)
        tree = etree.HTML(html_stream.text)
        self.crawler_data(tree)
def crawl(self):
    wareId = self.key
    ecid = self.data['uuid']
    category_data = extract_category(self)
    pages = 1
    count = True
    while count:
        number = 0    # counts comments on this page that were already saved (dedup)
        url = self.get_url(wareId, pages)
        html_stream = ProcessData.get_web_data(url)
        try:
            tree = etree.HTML(html_stream.text)
        except:
            print 'error: ', url
            break
        xpath = "//div[@id='comments-list']/div[@class='mc']"
        dom = tree.xpath(xpath)
        if dom == []:
            count = False
            continue
        for item in dom:
            datas = self.handle(item)
            comment_data = {
                'ecid': ecid,                        # commodity table foreign key
                'source_id': wareId,
                'source': self.data.get('source'),
                'comment_id': datas['commentid'],    # review id
                'score': datas['score'],             # commodity score
                'pubtime': datas['commenttime'],
                'buytime': datas['buytime'],
                'user_id': datas['url'],
                'useful': datas['useful'],
                'reply': datas['reply'],
                'content': datas['comment'],
                'province': datas['province']
            }
            comment_data.update(category_data)
            model = EcCommentModel(comment_data)
            is_saved = export(model)
            if not is_saved:
                number += 1
            if number > 10:
                # Too many already-stored comments on this page: assume the
                # remaining pages were crawled before and stop paging.
                count = False
                break
        pages += 1
def crawl(self): url = "http://interface.m.yhd.com/ \ mcategory/servlet/CentralMobileFacadeJsonServlet/ \ getNavCategoryWithKeywordByRootCategoryId? \ rootCategoryId=0&categoryNavId=0&provinceId=1" try: jsons = ProcessData.get_json_data(url.replace(' ','')) data = jsons['data'] except Exception,e: self.logger.error(url) self.logger.error(e) print 'error ',url
def crawl(self):
    wareId = self.key
    ids = self.data.get('uuid')
    category_data = extract_category(self)
    url = 'http://m.360buy.com/product/guige/%s.html' % (str(wareId))
    html_stream = ProcessData.get_web_data(url)
    tree = etree.HTML(html_stream.text)
    xpath = "//table[@class='Ptable']/tr/td/text()"
    dom = tree.xpath(xpath)
    # The spec table alternates label cells and value cells; pair them up.
    specifications = {}
    temporary = ''
    i = 0
    for item in dom:
        item = item.strip()
        if item == '':
            continue
        if i % 2 == 0:
            temporary = extract_title(item)
        else:
            specifications[temporary] = extract_text(item)
        i += 1
    data = {
        'ecnorms': specifications
    }
    introduce = IntroduceCrawler.crawl(wareId, ids)
    ecbrands = introduce[u'品牌'] if introduce.get(u'品牌') else ''
    ecnames = introduce[u'商品名称'] if introduce.get(u'商品名称') else ''
    crawl_data = {
        'id': ids,
        'source': self.data.get('source'),
        'source_id': wareId,
        'summary': specifications,
        'introduce': introduce,
        'name': ecnames,
        'brand': ecbrands
    }
    crawl_data.update(category_data)
    model = EcDetailModel(crawl_data)
    export(model)
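# A minimal standalone sketch of the label/value pairing done in crawl() above:
# the Ptable cells arrive as a flat list of text nodes in which labels and
# values alternate. The sample cell texts below are made up for illustration.
def pair_spec_cells(cells):
    # Drop empty/whitespace-only nodes, then zip even-indexed labels with
    # odd-indexed values.
    cleaned = [c.strip() for c in cells if c.strip()]
    return dict(zip(cleaned[0::2], cleaned[1::2]))

# pair_spec_cells([u'品牌', u'XX', u'净含量', u'500g'])
#   -> {u'品牌': u'XX', u'净含量': u'500g'}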
def crawl(self):
    url = 'http://www.ows.newegg.com.cn/category.egg'
    try:
        jsons = ProcessData.get_json_data(url)
    except:
        print 'error ', url
        return
    # Walk the three-level category tree and hand each leaf category to handle().
    for item1 in jsons:
        CatName1 = item1['CatName']
        for item2 in item1['SubCategories']:
            CatName2 = item2['CatName']
            for item3 in item2['SubCategories']:
                priorcategory = [CatName1, CatName2, item3['CatName']]
                self.handle(item3['CatID'], priorcategory)
def crawl(self): start_urls = "http://gw.m.360buy.com/client.action?functionId=catelogy&body=" sencond_urls = { 'catelogyId': '0', 'isDescription': 'true', 'isIcon': 'true', 'level': '0' } url = start_urls + quote(str(sencond_urls)) try: jsons = ProcessData.get_json_data(url) lists = jsons['catelogyList'] except Exception,e: self.logger.error(url) self.logger.error(e) print 'error ',url return
def crawl(self):
    fid = self.key
    categorys = self.data['priorcategory']
    start_urls = "http://gw.m.360buy.com/client.action?functionId=catelogy&body="
    third_urls = {
        'catelogyId': str(fid),
        'isDescription': 'false',
        'isIcon': 'false',
        'level': '2'
    }
    url = start_urls + quote(str(third_urls))
    try:
        jsons = ProcessData.get_json_data(url)
        lists = jsons['catelogyList']
    except Exception as e:
        self.logger.error(url)
        self.logger.error(e)
        print 'error ', url
        return
def crawl(self):
    fid = self.key
    categorys = self.data['priorcategory']
    start_urls = "http://gw.m.360buy.com/client.action?functionId=catelogy&body="
    second_urls = {
        'catelogyId': str(fid),
        'isDescription': 'true',
        'isIcon': 'true',
        'level': '1'
    }
    url = start_urls + quote(str(second_urls))
    try:
        jsons = ProcessData.get_json_data(url)
        lists = jsons['catelogyList']
    except:
        print 'error ', url
        return
    if lists == []:
        return {}
    for i in range(len(lists)):
        cid = lists[i]['cid']
        priorcategory = []
        priorcategory.extend(categorys)
        priorcategory.append(extract_title(lists[i]['name']))
        data = {
            'priorcategory': priorcategory
        }
        Scheduler.schedule(ThirdCrawler.type, key=cid, data=data)
def crawl(wareId, ids):
    import sys
    reload(sys)
    sys.setdefaultencoding("utf-8")
    url = 'http://item.jd.com/%s.html' % (str(wareId))
    html_stream = ProcessData.get_web_data(url)
    if html_stream == {}:
        return {}
    html_stream.encoding = 'gb2312'
    tree = etree.HTML(html_stream.text)
    xpath = "//div[@id='product-detail-1']/ul[@class='detail-list']/li//text()"
    dom = tree.xpath(xpath)
    # The detail list yields text nodes shaped like 'label:value'; sometimes the
    # value arrives as a separate node right after a bare 'label:' node.
    introduce = {}
    temporary = ''
    for item in dom:
        item = item.strip()
        if item == '':
            continue
        elif item.find(':') > 0:
            item = item.split(':', 1)
            if item[1] == '':
                # Bare label: remember it and wait for the value node.
                temporary = extract_title(item[0])
            else:
                introduce[extract_title(item[0])] = extract_text(item[1])
        else:
            if temporary != '':
                introduce[temporary] = extract_text(item)
                temporary = ''
            else:
                continue
    return introduce
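# A minimal standalone sketch of the 'label:value' parsing in crawl() above,
# using plain strings instead of lxml text nodes. The sample items are made up,
# and extract_title / extract_text are replaced by simple strips because their
# real behaviour is an assumption here.
def parse_detail_items(items):
    parsed = {}
    pending_label = ''
    for text in items:
        text = text.strip()
        if not text:
            continue
        if text.find(':') > 0:
            label, _, value = text.partition(':')
            if value:
                parsed[label.strip()] = value.strip()
            else:
                pending_label = label.strip()   # value should follow in the next item
        elif pending_label:
            parsed[pending_label] = text
            pending_label = ''
    return parsed

# parse_detail_items([u'品牌:XX', u'商品名称:', u'XX洗衣液'])
#   -> {u'品牌': u'XX', u'商品名称': u'XX洗衣液'}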
def get_response(self, key):
    url = self.get_url(key)
    response = ProcessData.get_web_data(url)
    return response
def get_data(self, CatID, pages):
    url = 'http://www.ows.newegg.com.cn/cat/%s' % (str(CatID))
    list_urls = {'page': str(pages), 'pagesize': 20, 'sort': 10}
    return ProcessData.get_json_data(url, parameter=list_urls)