예제 #1
0
    def parse(self, response):
        """Emit the raw listing payload and, from page 1, request every other page.

        Presumably a Scrapy callback (it yields items and ``Request`` objects).

        :param response: response whose ``meta`` carries a ``'page'`` key and
            whose body is JSON.
        :returns: generator yielding one ``DianPIngAllStoreJson`` item when the
            decoded payload is truthy, followed (page 1 only) by one
            ``Request`` per page from 2 up to the reported ``pageCount``.
        :raises ValueError: if the body is not valid JSON (from ``json.loads``).
        :raises KeyError: if ``response.meta`` has no ``'page'`` entry.
        """
        page = response.meta['page']
        response_json = json.loads(response.body)
        meta = response.meta
        meta['retry_times'] = 0
        if response_json:
            item = DianPIngAllStoreJson()
            item['response_content'] = response.body
            # Set on the shared meta so the follow-up requests carry it too.
            meta['dt'] = dt
            # Snapshot the meta: the shared dict is mutated below
            # (meta['page'] = i) and the item must keep describing this page.
            item['meta'] = dict(meta)
            yield item

        # Only page 1 fans out, and only a JSON object can hold 'pageCount'
        # (a payload such as ``null`` used to crash on ``.get`` here).
        if page != 1 or not isinstance(response_json, dict):
            return

        # Find the number of the last page; nothing to schedule when absent.
        page_count = response_json.get('pageCount')
        if not page_count:
            return

        # When the last page is page 1 the range is empty, so no explicit
        # "is this already the last page" comparison is needed.
        for i in xrange(2, int(page_count) + 1):
            meta['page'] = i
            next_page_link = response.url + '&page=%s' % i
            yield Request(next_page_link, callback=self.parse, meta=meta, dont_filter=True, headers=header,
                          errback=self.parse_failure)
예제 #2
0
    def parse(self, response):
        """Emit the raw search payload and, from page 1, request every other page.

        Presumably a Scrapy callback (it yields items and ``Request`` objects).

        :param response: response whose ``meta`` carries a ``'page'`` key and
            whose body is JSON.
        :returns: generator yielding one ``DianPIngAllStoreJson`` item when the
            payload has a non-empty ``shopRecordBeanList``, followed (page 1
            only) by one ``Request`` per page from 2 up to ``pageCount``.
        :raises ValueError: if the body is not valid JSON (from ``json.loads``).
        :raises KeyError: if ``response.meta`` has no ``'page'`` entry.
        """
        page = response.meta['page']
        response_json = json.loads(response.body)
        meta = response.meta
        meta['retry_times'] = 0

        # Everything below reads keys from a JSON object; any other payload
        # (e.g. ``null``) used to crash on ``.get`` and now yields nothing.
        if not isinstance(response_json, dict):
            return

        if response_json.get('shopRecordBeanList'):
            item = DianPIngAllStoreJson()
            item['response_content'] = response.body
            # Set on the shared meta so the follow-up requests carry it too.
            meta['search_kw_cate'] = self.search_cate
            # Snapshot the meta: the shared dict is mutated below
            # (meta['page'] = i) and the item must keep describing this page.
            item['meta'] = dict(meta)
            yield item

        if page != 1:
            return

        # Find the number of the last page; nothing to schedule when absent.
        page_count = response_json.get('pageCount')
        if not page_count:
            return

        # When the last page is page 1 the range is empty, so no explicit
        # "is this already the last page" comparison is needed.
        for i in xrange(2, int(page_count) + 1):
            meta['page'] = i
            next_page_link = response.url + '&page=%s' % i
            print(next_page_link)
            yield Request(next_page_link,
                          callback=self.parse,
                          meta=meta,
                          dont_filter=True,
                          headers=header1)
 def get_data(self, shop_url, meta, header_phone):
     """Fetch ``shop_url`` through a random proxy until it answers 200, then parse it.

     The call blocks: it retries with a freshly chosen proxy after every
     failed attempt and hands the successful response to
     ``self.parse_result`` wrapped in a ``DianPIngAllStoreJson`` item.

     :param shop_url: URL to download.
     :param meta: value stored unchanged under the item's ``'meta'`` key.
     :param header_phone: headers passed to ``requests.get``.
     :returns: None, once one successful response has been parsed.
     """
     while True:
         # Load the proxy pool when empty and reload it when the last
         # refresh timestamp is more than 5 seconds old.
         if not self.proxys:
             self.proxys = self.redis_conn1()
         if self.proxys and int(time.time()) - self.dt_proxy > 5:
             self.dt_proxy = int(time.time())
             self.proxys = self.redis_conn1()
         if not self.proxys:
             # random.choice would fail on an empty pool; wait for proxies.
             time.sleep(0.1)
             continue

         proxies = {"http": "%s" % random.choice(self.proxys)}
         try:
             data = requests.get(shop_url,
                                 headers=header_phone,
                                 proxies=proxies,
                                 timeout=5)
         except requests.RequestException:
             # Network/proxy failure: retry at once with another proxy.
             # Only request errors are retried, so KeyboardInterrupt and
             # programming errors are no longer swallowed.
             continue

         if data.status_code == 200:
             item = DianPIngAllStoreJson()
             item['meta'] = meta
             item['shop_response'] = data.content
             self.parse_result(item)
             # Stop after the first success; previously the loop had no exit
             # and kept downloading the same URL forever.
             return
         time.sleep(0.1)
 def parse_shop(self, response):
     """Wrap the response in a single ``DianPIngAllStoreJson`` item.

     :param response: response whose ``meta`` and ``body`` are copied into
         the item's ``'meta'`` and ``'shop_response'`` fields.
     :returns: generator yielding exactly one item.
     """
     shop_item = DianPIngAllStoreJson()
     shop_item['meta'] = response.meta
     shop_item['shop_response'] = response.body
     yield shop_item
예제 #5
0
 def parse_shop(self, response):
     """Yield the shop payload, or record the blocked User-Agent on a 403.

     :param response: response to inspect; on status 403 the request
         headers are printed and the ``User-Agent`` header is passed to
         ``self.write_file`` instead of producing an item.
     :returns: generator yielding one ``DianPIngAllStoreJson`` item for
         any status other than 403, nothing otherwise.
     """
     if response.status == 403:
         sent_headers = response.request.headers
         print(sent_headers)
         self.write_file(response.request.headers.get('User-Agent'))
         return

     shop_item = DianPIngAllStoreJson()
     shop_item['meta'] = response.meta
     shop_item['shop_response'] = response.body
     yield shop_item
예제 #6
0
    def parse_shop(self, response):
        """Yield the shop payload unless the server answered 403.

        :param response: response to inspect; on status 403 only the
            request headers are printed.
        :returns: generator yielding one ``DianPIngAllStoreJson`` item for
            any status other than 403, nothing otherwise.
        """
        blocked = response.status == 403
        if not blocked:
            result = DianPIngAllStoreJson()
            result['meta'] = response.meta
            result['shop_response'] = response.body
            yield result
        else:
            print(response.request.headers)
 def parse_shop(self, response):
     """Yield the shop payload, re-requesting the URL when the body is empty.

     :param response: response whose ``meta`` and ``body`` are copied into
         the item; an empty body triggers a retry of ``response.url``.
     :returns: generator yielding one ``DianPIngAllStoreJson`` item for a
         non-empty body, otherwise one retry ``Request`` (at most three
         retries per URL), otherwise nothing.
     """
     if response.body:
         item = DianPIngAllStoreJson()
         item['meta'] = response.meta
         item['shop_response'] = response.body
         yield item
         return

     # Count retries in a copy of the meta so an endpoint that always
     # answers with an empty body cannot be re-requested forever.
     max_empty_retries = 3
     retry_meta = dict(response.meta)
     attempts = retry_meta.get('empty_body_retries', 0)
     if attempts >= max_empty_retries:
         return
     retry_meta['empty_body_retries'] = attempts + 1

     # dont_filter=True: the URL has already been requested, so (assuming
     # Scrapy's default duplicate filter) the retry would otherwise be
     # dropped silently and never downloaded.
     yield Request(response.url,
                   headers=header1,
                   meta=retry_meta,
                   callback=self.parse_shop,
                   dont_filter=True)
예제 #8
0
    def parse_detail(self, response):
        """Yield the detail payload, or log the URL when the body is empty.

        The body is always printed. An empty body appends one JSON line
        (``url`` and ``shop_id``) to the file ``no_result_item`` instead of
        producing an item.

        :param response: response whose ``meta`` may carry a ``'shop_id'``.
        :returns: generator yielding one ``DianPIngAllStoreJson`` item for a
            non-empty body, nothing otherwise.
        """
        item = DianPIngAllStoreJson()
        body = response.body
        print(body)
        meta = response.meta

        if not body:
            # Keep a trace of shops that came back without any content.
            record = {'url': response.url, 'shop_id': meta.get('shop_id')}
            line = json.dumps(record) + '\n'
            with open('no_result_item', 'a') as f:
                f.write(line)
            return

        item['meta'] = meta
        item['shop_response'] = body
        yield item
예제 #9
0
 def parse_shop(self, response):
     """Package the response meta and body into one ``DianPIngAllStoreJson`` item.

     :param response: response supplying the ``'meta'`` and
         ``'shop_response'`` field values.
     :returns: generator yielding exactly one item.
     """
     record = DianPIngAllStoreJson()
     fields = (('meta', response.meta), ('shop_response', response.body))
     for key, value in fields:
         record[key] = value
     yield record
예제 #10
0
    def parse(self, response):
        """Download every shop listed in the search payload, then request the next page.

        For each entry under ``data.moduleInfoList[0].moduleData.data.listData.list``
        the shop's ajax URL is fetched with ``requests`` through a random
        proxy, retrying until a 200 answer arrives, and emitted as a
        ``DianPIngAllStoreJson`` item. When ``recordCount`` exceeds
        ``nextStartIndex`` the same POST is re-issued with the new start index.

        NOTE(review): ``requests.get`` and ``time.sleep`` block the caller;
        if this runs as a Scrapy callback, the whole crawler presumably
        stalls during each download.

        :param response: response with a JSON body; ``meta`` must contain
            ``'post_data'`` (a JSON string) when a next page exists.
        :returns: generator yielding one item per shop and at most one
            follow-up ``Request``.
        :raises ValueError: if the body is not valid JSON (from ``json.loads``).
        """
        response_json = json.loads(response.body)
        meta = response.meta

        # Walk data -> moduleInfoList[0] -> moduleData -> data -> listData
        # -> list, stopping at the first missing or empty level. A payload
        # that is not a JSON object (e.g. ``null``) is treated as empty.
        payload = response_json.get('data') if isinstance(response_json, dict) else None
        modules = payload.get('moduleInfoList') if payload else None
        module_data = modules[0].get('moduleData') if modules else None
        inner_data = module_data.get('data') if module_data else None
        list_data = inner_data.get('listData') if inner_data else None
        shops = list_data.get('list') if list_data else None

        if shops:
            for shop in shops:
                shop_id = shop.get('id')
                print(shop_id)
                shop_url = 'http://www.dianping.com/ajax/json/shopfood/wizard/BasicHideInfoAjaxFP?shopId=' + str(
                    shop_id)
                # Kept on the shared meta so the next-page request carries
                # them as well.
                meta['shop_info'] = shop
                meta['dt'] = dt
                header_phone['Cookie'] = 'cy=%s; cityid=%s' % (
                    meta.get('city_id'),
                    meta.get('city_id'))

                while True:
                    # Load the proxy pool when empty and reload it when the
                    # last refresh timestamp is more than 5 seconds old.
                    if not self.proxys:
                        self.proxys = self.redis_conn1()
                    if self.proxys and int(time.time()) - self.dt_proxy > 5:
                        self.dt_proxy = int(time.time())
                        self.proxys = self.redis_conn1()
                    if not self.proxys:
                        # random.choice would fail on an empty pool.
                        time.sleep(0.1)
                        continue

                    proxies = {"http": "%s" % random.choice(self.proxys)}
                    try:
                        shop_response = requests.get(shop_url,
                                                     headers=header_phone,
                                                     proxies=proxies,
                                                     timeout=5)
                    except requests.RequestException:
                        # Network/proxy failure: retry with another proxy.
                        continue

                    if shop_response.status_code != 200:
                        time.sleep(0.1)
                        continue

                    item = DianPIngAllStoreJson()
                    # Snapshot the meta: the shared dict is overwritten for
                    # every following shop, so each item needs its own copy.
                    item['meta'] = dict(meta)
                    item['shop_response'] = shop_response.content
                    # The yield sits outside any try block so closing the
                    # generator here is not intercepted and retried.
                    yield item
                    break

            record_count = list_data.get('recordCount')
            print(record_count)
            next_start_index = list_data.get('nextStartIndex')
            print(next_start_index)
            # Missing counters mean there is nothing to paginate;
            # int(None) used to raise TypeError here.
            has_counters = record_count is not None and next_start_index is not None
            if has_counters and int(record_count) > int(next_start_index):
                post_data = json.loads(meta['post_data'])
                post_data['moduleInfoList'][0]['query'][
                    'search']['start'] = next_start_index
                print(post_data)
                meta['retry_times'] = 0
                post_body = json.dumps(post_data)
                meta['post_data'] = post_body
                yield Request(response.url,
                              method='POST',
                              headers=header2,
                              body=post_body,
                              callback=self.parse,
                              meta=meta,
                              dont_filter=True)

        print(response.meta)