def periodic_parse_list(self, response):
    """Parse one page of the 591 list API and schedule follow-ups.

    Yields a non-GPS detail request for every house still considered
    fresh and, when the whole page is fresh, a request for the next
    list page.  Pagination stops as soon as one non-VIP house is older
    than ``self.epoch_ago``.

    :param response: Scrapy response whose ``meta['rental']`` carries a
        ListRequestMeta-like object (``id``, ``name``, ``page``).
    """
    data = json.loads(response.text)
    meta = response.meta['rental']
    # VIP ("topData") houses are prepended to the regular listing.
    houses = data['data']['topData'] + data['data']['data']
    has_outdated = False
    for house in houses:
        # VIP rows lack an 'id' field in the list payload.
        house['is_vip'] = 'id' not in house
        # updatetime == creation time in 591...
        if not house['is_vip'] and house['updatetime'] < self.epoch_ago:
            has_outdated = True
        else:
            house_item = self.gen_shared_attrs(house, meta)
            # send non-gps request first as it may be closed soon
            yield self.gen_detail_request(
                util.DetailRequestMeta(house_item['vendor_house_id'], False))
            if meta.name in self.count_per_city:
                self.count_per_city[meta.name] += 1
    if data['data']['data'] and not has_outdated:
        # only goto next page when there's response and not outdated
        yield self.gen_list_request(
            util.ListRequestMeta(meta.id, meta.name, meta.page + 1))
    else:
        # .get() avoids a KeyError for a city that was never counted —
        # the increment above is guarded by the same membership test,
        # so the key is not guaranteed to exist here.
        logging.info(
            f'[{meta.name}] total {self.count_per_city.get(meta.name, 0)} '
            'house to crawl!'
        )
def periodic_parse_list(self, response):
    """Parse one page of the 591 list API (regular listings only) and
    schedule detail/pagination follow-ups.

    AD ("topData") houses are deliberately ignored:
    per discussion in #8, we don't need AD list at all
    ref: https://github.com/rentea-tw/rentea-crawler/issues/8#issuecomment-558021819

    :param response: Scrapy response whose ``meta['rental']`` carries a
        ListRequestMeta-like object (``id``, ``name``, ``page``).
    """
    data = json.loads(response.text)
    meta = response.meta['rental']
    houses = data['data']['data']
    has_outdated = False
    for house in houses:
        # updatetime == creation time in 591...
        if house['updatetime'] < self.epoch_ago:
            has_outdated = True
        else:
            house_item = self.gen_shared_attrs(house, meta)
            # send non-gps request first as it may be closed soon
            request = self.gen_detail_request(util.DetailRequestMeta(
                house_item['vendor_house_id'], False
            ))
            yield request
            if meta.name in self.count_per_city:
                self.count_per_city[meta.name] += 1
    if houses and not has_outdated:
        # only goto next page when there's response and not outdated
        request = self.gen_list_request(util.ListRequestMeta(
            meta.id, meta.name, meta.page + 1
        ))
        yield request
    else:
        # .get() keeps the final log from raising KeyError for an
        # uncounted city — the increment above is membership-guarded,
        # so the key may be absent here.
        logging.info(
            f'[{meta.name}] total {self.count_per_city.get(meta.name, 0)} '
            'house to crawl!'
        )
def parse_main_response(self, response):
    """Re-yield the parent parser's house items, turning every item that
    still has a live deal status into a GPS detail request that carries
    the partially-built item along in its meta.

    :param response: Scrapy response forwarded to the parent parser.
    """
    for item in super().parse_main_response(response):
        # Skip original logic about GPS request generation
        if not isinstance(item, GenericHouseItem):
            continue
        if item['deal_status'] == DealStatusType.NOT_FOUND:
            yield item
            continue
        # Got an item that contains GPS!  The explicit callback comes
        # first in the literal, so any colliding key produced by
        # gen_detail_request_args takes precedence — same as before.
        detail_meta = util.DetailRequestMeta(item['vendor_house_id'], True)
        gps_kwargs = {
            'callback': self.parse_detail,
            **self.gen_detail_request_args(detail_meta),
        }
        gps_kwargs['meta']['main_item'] = item
        yield Request(**gps_kwargs)
def count_and_parse_list(self, response):
    """Parse a list page; on page 0, also fan out one request per page.

    Emits a RawHouseItem + GenericHouseItem + non-GPS detail request for
    every house not already recorded for this job (deduplicated via
    ``HouseStats.get_or_create``).

    :param response: Scrapy response whose ``meta['rental']`` carries a
        ListRequestMeta-like object (``id``, ``name``, ``page``).
    """
    meta = response.meta['rental']
    data = json.loads(response.text)
    if meta.page == 0:
        count = clean_number(data['records'])
        logging.info(f'[{meta.name}] total {count} house to crawl!')
        # #items return per request may differ from API endpoint
        self.N_PAGE = len(data['data']['data'])
        # generate all list request as now we know number of result.
        # Guard against N_PAGE == 0: if the API reports records but
        # returns an empty page, the unguarded loop would never end.
        cur_page = 1
        while self.N_PAGE and cur_page * self.N_PAGE < count:
            yield self.gen_list_request(
                util.ListRequestMeta(meta.id, meta.name, cur_page))
            cur_page += 1
    houses = data['data']['data']
    if not self.novip:
        houses = data['data']['topData'] + houses
    for house in houses:
        # copy from twrh — VIP rows lack an 'id' field in the list payload
        house['is_vip'] = 'id' not in house
        house_item = self.gen_shared_attrs(house, meta)
        # skip houses already seen during this job
        _, created = HouseStats.get_or_create(
            job_id=self.job.id, house_id=house_item['vendor_house_id'])
        if not created:
            continue
        yield RawHouseItem(house_id=house_item['vendor_house_id'],
                           vendor=self.vendor, is_list=True,
                           raw=json.dumps(house, ensure_ascii=False))
        yield GenericHouseItem(**house_item)
        yield self.gen_detail_request(
            util.DetailRequestMeta(house_item['vendor_house_id'], False))
def parse_seed(self, seed):
    """Deserialize a persisted seed back into a DetailRequestMeta.

    assumes ``seed`` is an iterable whose elements match
    DetailRequestMeta's fields in order — TODO confirm against the
    code that stores seeds.
    """
    return util.DetailRequestMeta(*seed)