def crawl_listing(self, url, ctx='', **kwargs):
    """Crawl one sale listing and record its products on the matching Event.

    The listing response is expected to look like
    ``parse_sale_<event_id>(<json>);``. The JSON part must contain an
    ``'asins'`` list with one entry per product. Every entry is handed to
    ``self._parse_product`` and the returned values are stored, in order,
    as ``event.product_ids``.

    If any product entry lacks the ``'cAsin'`` key the listing is treated
    as incomplete and is downloaded one more time before parsing.

    :param url: listing url; everything up to and including the last ``/``
        is passed to ``_parse_product`` as the url prefix.
    :param ctx: forwarded to ``_parse_product`` and used as the ``sender``
        of the ``common_saved`` signal.
    :param kwargs: accepted for call compatibility, not used here.
    :raises AttributeError: if the response text does not match the
        ``parse_sale_...(...);`` pattern (``search`` returns ``None``).
    """
    sale_pattern = re.compile(r'parse_sale_(\w+)\((.*)\);$')

    def fetch_sale():
        # Download the listing and split it into (event id, decoded JSON).
        response = req.get(url)
        sale_id, payload = sale_pattern.search(response.text).groups()
        return sale_id, json.loads(payload)

    prefix_url = url.rsplit('/', 1)[0] + '/'
    event_id, data = fetch_sale()

    event = Event.objects(event_id=event_id).first()
    if not event:
        event = Event(event_id=event_id)

    # Ensure we download the complete data: a single retry is enough.
    # The previous loop re-fetched once per incomplete product while still
    # walking the stale list, issuing redundant requests.
    if any('cAsin' not in product_data for product_data in data['asins']):
        event_id, data = fetch_sale()

    product_ids = [
        self._parse_product(event_id, event.asin_detail_page,
                            event.casin_soldout_info, prefix_url,
                            product_data, ctx)
        for product_data in data['asins']
    ]

    # An event flagged urgent becomes "ready" after this crawl; the flag
    # is cleared so that later crawls report ready=False.
    ready = bool(event.urgent)
    if ready:
        event.urgent = False

    event.product_ids = product_ids
    event.update_time = datetime.utcnow()
    event.save()
    common_saved.send(sender=ctx, obj_type='Event', key=event_id,
                      is_new=False, is_updated=False, ready=ready)
def _parse_event(self, event_data, ctx):
    """Create or refresh the stored Event described by ``event_data``.

    Descriptive fields (combined url, title, description, images, brand
    link, listing url) are filled in only when the event is first created.
    The schedule, departments, sold-out flag and the two product lookup
    tables are refreshed on every call.

    The lookup tables are kept on the event so that ``crawl_listing`` can
    use and update them later:

    * ``asin_detail_page``   <- ``event_data['asins']``,
      product detail pages as ``{asin: {url: url_str}}``
    * ``casin_soldout_info`` <- ``event_data['cAsins']``,
      sold-out info as ``{casin: {soldout: 1/0, }}``

    :param event_data: dict describing one sale; must provide ``'id'``,
        ``'saleProps'``, ``'start'``, ``'end'`` and ``'departments'``.
    :param ctx: used as the ``sender`` of the ``common_saved`` signal.
    """
    sale_id = event_data['id']
    sale_props = event_data['saleProps']

    # `updated` is never switched on in this method; it is sent as False.
    created, updated = False, False
    event = Event.objects(event_id=sale_id).first()
    if not event:
        created = True
        event = Event(event_id=sale_id)
        event.urgent = True
        event.combine_url = 'http://www.myhabit.com/homepage#page=b&sale={0}'.format(sale_id)

        primary = sale_props['primary']
        event.sale_title = primary['title']
        if 'desc' in primary:
            description_html = lxml.html.fromstring(primary['desc'])
            event.sale_description = description_html.text_content()

        # The hero image goes first, followed by the desc/sale images.
        image_urls = [
            sale_props['prefix'] + path
            for kind, path in primary['imgs'].items()
            if kind == 'hero'
        ]
        image_urls.extend(
            sale_props['prefix'] + path
            for kind, path in primary['imgs'].items()
            if kind in ['desc', 'sale']
        )
        event.image_urls = image_urls

        if 'brandUrl' in primary:
            event.brand_link = primary['brandUrl']
        event.listing_url = event_data['prefix'] + event_data['url']

    # updating fields
    schedule = (
        ('events_begin', time2utc(event_data['start'])),
        ('events_end', time2utc(event_data['end'])),
    )
    for field_name, new_time in schedule:
        if getattr(event, field_name) != new_time:
            # Record when this schedule field was last changed.
            event.update_history.update({field_name: datetime.utcnow()})
            setattr(event, field_name, new_time)

    for department in event_data['departments']:
        if department not in event.dept:
            event.dept.append(department)

    event.soldout = bool('soldOut' in event_data and event_data['soldOut'] == 1)
    event.update_time = datetime.utcnow()

    # event_data['dataType'] == 'upcoming' don't have products
    if 'asins' in event_data:
        event.asin_detail_page = event_data['asins']
    if 'cAsins' in event_data:
        event.casin_soldout_info = event_data['cAsins']

    event.save()
    common_saved.send(sender=ctx, obj_type='Event', key=event.event_id,
                      url=event.combine_url, is_new=created,
                      is_updated=updated)