def parse(self, response):
    """Yield the raw listing response as an item and, from page 1, request the other pages.

    The response body is parsed as JSON. If the parsed value is truthy, one
    ``DianPIngAllStoreJson`` item carrying the raw body and a snapshot of the
    request meta is yielded. When the current page is 1, one follow-up request
    is yielded for every page from 2 up to the ``pageCount`` reported in the
    JSON.

    ``dt`` and ``header`` are names defined outside this method.

    :param response: the downloaded response; ``response.meta['page']`` must be set.
    :raises KeyError: if ``response.meta`` has no ``'page'`` entry.
    :raises ValueError: if the body is not valid JSON.
    """
    page = response.meta['page']
    response_json = json.loads(response.body)
    meta = response.meta
    meta['retry_times'] = 0
    if not response_json:
        return

    item = DianPIngAllStoreJson()
    item['response_content'] = response.body
    meta['dt'] = dt
    # Give the item its own copy of the meta. The original stored the shared
    # dict, so anything written to it after the yield would also change the
    # already-yielded item.
    item['meta'] = dict(meta)
    yield item

    if page != 1:
        return

    # Find the number of the last page; only page 1 fans out to the others.
    page_count = response_json.get('pageCount')
    if not page_count:
        return

    # When page_count is 1 the range is empty, so no explicit
    # "is this already the last page" check is needed.
    for i in xrange(2, int(page_count) + 1):
        next_page_link = response.url + '&page=%s' % i
        # A fresh dict per request keeps the shared meta (and the item's
        # snapshot) at the current page.
        yield Request(next_page_link,
                      callback=self.parse,
                      meta=dict(meta, page=i),
                      dont_filter=True,
                      headers=header,
                      errback=self.parse_failure)
def parse(self, response):
    """Yield the raw listing response as an item and, from page 1, request the other pages.

    The response body is parsed as JSON. If it contains a non-empty
    ``shopRecordBeanList``, one ``DianPIngAllStoreJson`` item carrying the raw
    body and a snapshot of the request meta (with ``search_kw_cate`` set from
    ``self.search_cate``) is yielded. When the current page is 1, one
    follow-up request is yielded for every page from 2 up to the
    ``pageCount`` reported in the JSON.

    ``header1`` is a name defined outside this method.

    :param response: the downloaded response; ``response.meta['page']`` must be set.
    :raises KeyError: if ``response.meta`` has no ``'page'`` entry.
    :raises ValueError: if the body is not valid JSON.
    """
    page = response.meta['page']
    response_json = json.loads(response.body)
    meta = response.meta
    meta['retry_times'] = 0
    if not response_json:
        return

    if response_json.get('shopRecordBeanList'):
        item = DianPIngAllStoreJson()
        item['response_content'] = response.body
        meta['search_kw_cate'] = self.search_cate
        # Give the item its own copy of the meta. The original stored the
        # shared dict, so anything written to it after the yield would also
        # change the already-yielded item.
        item['meta'] = dict(meta)
        yield item

    if page != 1:
        return

    # Find the number of the last page; only page 1 fans out to the others.
    page_count = response_json.get('pageCount')
    if not page_count:
        return

    # When page_count is 1 the range is empty, so no explicit
    # "is this already the last page" check is needed.
    for i in xrange(2, int(page_count) + 1):
        next_page_link = response.url + '&page=%s' % i
        print(next_page_link)
        # A fresh dict per request keeps the shared meta (and the item's
        # snapshot) at the current page.
        yield Request(next_page_link,
                      callback=self.parse,
                      meta=dict(meta, page=i),
                      dont_filter=True,
                      headers=header1)
def get_data(self, shop_url, meta, header_phone):
    """Fetch ``shop_url`` through a random proxy and pass the result to ``self.parse_result``.

    The request is retried, each time with a randomly chosen proxy from
    ``self.proxys``, until a response with status 200 has been handed to
    ``self.parse_result``. The proxy list is reloaded via
    ``self.redis_conn1()`` when it is empty and whenever more than 5 seconds
    have passed since ``self.dt_proxy``.

    :param shop_url: URL to download.
    :param meta: stored unchanged under the item's ``'meta'`` key.
    :param header_phone: headers sent with the request.
    :returns: ``None`` once one successful response has been processed.
    """
    while True:
        if not self.proxys:
            self.proxys = self.redis_conn1()
        if not self.proxys:
            # Still no proxy: wait briefly instead of spinning on the
            # proxy source at full speed.
            time.sleep(0.1)
            continue

        if int(time.time()) - self.dt_proxy > 5:
            self.dt_proxy = int(time.time())
            self.proxys = self.redis_conn1()

        try:
            proxies = {"http": "%s" % random.choice(self.proxys)}
            data = requests.get(shop_url,
                                headers=header_phone,
                                proxies=proxies,
                                timeout=5)
            if data.status_code == 200:
                item = DianPIngAllStoreJson()
                item['meta'] = meta
                item['shop_response'] = data.content
                self.parse_result(item)
                # Stop after the first processed response. Without this the
                # loop kept downloading the same URL forever.
                return
            time.sleep(0.1)
        except Exception:
            # Best-effort retry on any failure (bad proxy, timeout, parse
            # error). Catching Exception rather than everything lets
            # KeyboardInterrupt and SystemExit stop the loop.
            pass
def parse_shop(self, response):
    """Yield one ``DianPIngAllStoreJson`` item holding the response's meta and raw body."""
    shop_item = DianPIngAllStoreJson()
    fields = (
        ('meta', response.meta),
        ('shop_response', response.body),
    )
    for key, value in fields:
        shop_item[key] = value
    yield shop_item
def parse_shop(self, response):
    """Yield the shop response as an item, unless the status is 403.

    On a 403 the request headers are printed and the request's
    ``User-Agent`` is passed to ``self.write_file``; nothing is yielded.
    """
    if response.status != 403:
        shop_item = DianPIngAllStoreJson()
        shop_item['meta'] = response.meta
        shop_item['shop_response'] = response.body
        yield shop_item
        return

    print(response.request.headers)
    user_agent = response.request.headers.get('User-Agent')
    self.write_file(user_agent)
def parse_shop(self, response):
    """Yield the shop response as an item; on a 403 only print the request headers."""
    blocked = response.status == 403
    if blocked:
        print(response.request.headers)
        return

    shop_item = DianPIngAllStoreJson()
    shop_item['meta'] = response.meta
    shop_item['shop_response'] = response.body
    yield shop_item
def parse_shop(self, response):
    """Yield the shop response as an item, re-requesting the URL when the body is empty.

    A non-empty body is wrapped in a ``DianPIngAllStoreJson`` item together
    with the response meta. An empty body triggers a new request for the same
    URL, at most three times; the attempt count travels in the request meta
    under ``'empty_body_retries'``.

    ``header1`` is a name defined outside this method.
    """
    if response.body:
        item = DianPIngAllStoreJson()
        item['meta'] = response.meta
        item['shop_response'] = response.body
        yield item
        return

    max_empty_retries = 3
    attempts = response.meta.get('empty_body_retries', 0)
    if attempts >= max_empty_retries:
        # Give up rather than re-request a page that keeps coming back empty.
        return

    retry_meta = dict(response.meta, empty_body_retries=attempts + 1)
    # dont_filter=True is needed because this URL has already been seen;
    # without it the duplicate filter drops the retry. The counter above is
    # what keeps the retry from looping forever.
    yield Request(response.url,
                  headers=header1,
                  meta=retry_meta,
                  callback=self.parse_shop,
                  dont_filter=True)
def parse_detail(self, response):
    """Yield the detail response as an item, or log the miss when the body is empty.

    The body is always printed. A non-empty body is yielded inside a
    ``DianPIngAllStoreJson`` item together with the response meta. For an
    empty body, a JSON line with the URL and the meta's ``shop_id`` is
    appended to the file ``no_result_item`` and nothing is yielded.
    """
    detail_item = DianPIngAllStoreJson()
    body = response.body
    print(body)
    request_meta = response.meta

    if not body:
        missing = {
            'url': response.url,
            'shop_id': request_meta.get('shop_id'),
        }
        line = json.dumps(missing) + '\n'
        with open('no_result_item', 'a') as f:
            f.write(line)
        return

    detail_item['meta'] = request_meta
    detail_item['shop_response'] = body
    yield detail_item
def parse_shop(self, response):
    """Package the response meta and raw body into a ``DianPIngAllStoreJson`` item and yield it."""
    request_meta = response.meta
    raw_body = response.body

    result = DianPIngAllStoreJson()
    result['meta'] = request_meta
    result['shop_response'] = raw_body
    yield result
def parse(self, response):
    """Yield one item per shop in a search-result page, then request the next page.

    The JSON body is walked down
    ``data -> moduleInfoList[0] -> moduleData -> data -> listData``; if any
    level is missing or empty, nothing is yielded. For each entry of
    ``listData['list']`` the shop's ``BasicHideInfoAjaxFP`` URL is downloaded
    through a proxy, and the body is yielded in a ``DianPIngAllStoreJson``
    item together with a snapshot of the meta. If ``recordCount`` is greater
    than ``nextStartIndex``, the same URL is POSTed again with the search
    start index advanced.

    ``dt``, ``header_phone`` and ``header2`` are names defined outside this
    method; ``header_phone['Cookie']`` is overwritten here.

    :param response: the downloaded response; ``response.meta['post_data']``
        must hold the JSON-encoded POST body when a further page exists.
    :raises ValueError: if the body is not valid JSON.
    """
    response_json = json.loads(response.body)
    meta = response.meta

    # Walk down the nested payload, stopping at the first missing level.
    payload = response_json.get('data')
    if not payload:
        return
    module_info_list = payload.get('moduleInfoList')
    if not module_info_list:
        return
    module_data = module_info_list[0].get('moduleData')
    if not module_data:
        return
    module_payload = module_data.get('data')
    if not module_payload:
        return
    list_data = module_payload.get('listData')
    if not list_data:
        return

    shops = list_data.get('list')
    if shops:
        # The cookie depends only on the city id, which does not change
        # inside the loop, so it is set once.
        city_id = meta.get('city_id')
        header_phone['Cookie'] = 'cy=%s; cityid=%s' % (city_id, city_id)
        for shop in shops:
            shop_id = shop.get('id')
            print(shop_id)
            shop_url = ('http://www.dianping.com/ajax/json/shopfood/wizard/'
                        'BasicHideInfoAjaxFP?shopId=' + str(shop_id))
            meta['shop_info'] = shop
            meta['dt'] = dt
            shop_response = self._fetch_shop_hidden_info(shop_url, header_phone)

            item = DianPIngAllStoreJson()
            # Each item gets its own copy of the meta. The original stored
            # the shared dict, so every item ended up with the shop_info of
            # the last shop processed.
            item['meta'] = dict(meta)
            item['shop_response'] = shop_response
            # The yield sits outside any try/except so that closing the
            # generator is never swallowed by the retry handling.
            yield item

    record_count = list_data.get('recordCount')
    print(record_count)
    next_start_index = list_data.get('nextStartIndex')
    print(next_start_index)
    if int(record_count) > int(next_start_index):
        post_data = json.loads(meta['post_data'])
        post_data['moduleInfoList'][0]['query']['search']['start'] = next_start_index
        print(post_data)
        meta['retry_times'] = 0
        encoded_post_data = json.dumps(post_data)
        meta['post_data'] = encoded_post_data
        yield Request(response.url,
                      method='POST',
                      headers=header2,
                      body=encoded_post_data,
                      callback=self.parse,
                      meta=meta,
                      dont_filter=True)
        print(response.meta)


def _fetch_shop_hidden_info(self, shop_url, headers):
    """Download ``shop_url`` through a random proxy, retrying until the status is 200.

    The proxy list is reloaded via ``self.redis_conn1()`` when it is empty
    and whenever more than 5 seconds have passed since ``self.dt_proxy``.

    :returns: the body (``.content``) of the first response with status 200.
    """
    while True:
        if not self.proxys:
            self.proxys = self.redis_conn1()
        if not self.proxys:
            # Still no proxy: wait briefly instead of spinning on the
            # proxy source at full speed.
            time.sleep(0.1)
            continue

        if int(time.time()) - self.dt_proxy > 5:
            self.dt_proxy = int(time.time())
            self.proxys = self.redis_conn1()

        try:
            proxies = {"http": "%s" % random.choice(self.proxys)}
            reply = requests.get(shop_url,
                                 headers=headers,
                                 proxies=proxies,
                                 timeout=5)
        except Exception:
            # Best-effort retry on any failure (bad proxy, timeout).
            # Catching Exception rather than everything lets
            # KeyboardInterrupt and SystemExit stop the loop.
            continue

        if reply.status_code == 200:
            return reply.content
        time.sleep(0.1)