Example #1
0
    def crawl_listing(self, url, ctx='', **kwargs):
        """Fetch a sale listing page, parse its JSONP payload and update the Event.

        The endpoint answers with JSONP of the form ``parse_sale_<event_id>({...});``.
        Each product in the payload is handed to ``_parse_product`` and the
        resulting ids are stored on the Event.  A ``common_saved`` signal is
        emitted at the end; ``ready`` is True exactly once, when the event was
        flagged urgent.

        :param url: listing url returning the JSONP-wrapped sale data
        :param ctx: crawler context string forwarded as the signal sender
        """
        jsonp_pattern = re.compile(r'parse_sale_(\w+)\((.*)\);$')

        def fetch():
            # Download the listing and unwrap event id + JSON body from the JSONP.
            resp = req.get(url)
            eid, payload = jsonp_pattern.search(resp.text).groups()
            return eid, json.loads(payload)

        prefix_url = url.rsplit('/', 1)[0] + '/'
        event_id, data = fetch()
        event = Event.objects(event_id=event_id).first()
        if not event:
            event = Event(event_id=event_id)

        # Some responses come back incomplete (products missing 'cAsin').
        # Re-download the whole payload once in that case, instead of the
        # original behavior of refetching once per incomplete product while
        # still iterating the stale list.
        if any('cAsin' not in product_data for product_data in data['asins']):
            event_id, data = fetch()

        product_ids = [
            self._parse_product(event_id, event.asin_detail_page,
                                event.casin_soldout_info, prefix_url,
                                product_data, ctx)
            for product_data in data['asins']
        ]

        # 'urgent' is a one-shot flag: report ready=True once, then clear it.
        ready = bool(event.urgent)
        if ready:
            event.urgent = False
        event.product_ids = product_ids
        event.update_time = datetime.utcnow()
        event.save()
        common_saved.send(sender=ctx, obj_type='Event', key=event_id,
                          is_new=False, is_updated=False, ready=ready)
Example #2
0
    def _parse_event(self, event_data, ctx):
        """.. :py:method::
            Create or refresh an Event document from one sale entry.

            Stores product detail pages as {asin: {url: url_str}} and soldout
            info as {casin: {soldout: 1/0}}, both of which crawl_listing can
            update later.  Emits a ``common_saved`` signal when done.

        :param event_data: raw sale dict (keys: id, saleProps, prefix, url,
            start, end, departments, optionally soldOut/asins/cAsins)
        :param ctx: crawler context string forwarded as the signal sender
        """
        event_id = event_data['id']
        info = event_data['saleProps']

        is_new, is_updated = False, False
        event = Event.objects(event_id=event_id).first()
        if not event:
            # First time we see this sale: create it and fill the fields that
            # are only available (or only needed) at creation time.
            is_new = True
            event = Event(event_id=event_id)
            event.urgent = True
            event.combine_url = 'http://www.myhabit.com/homepage#page=b&sale={0}'.format(event_id)
            primary = info['primary']
            event.sale_title = primary['title']
            if 'desc' in primary:
                # Description arrives as HTML; keep only the visible text.
                event.sale_description = lxml.html.fromstring(primary['desc']).text_content()
            prefix = info['prefix']
            imgs = primary['imgs']
            # 'hero' image first, then any description/sale images.
            event.image_urls = [prefix + val for key, val in imgs.items() if key == 'hero']
            event.image_urls.extend(prefix + val for key, val in imgs.items()
                                    if key in ('desc', 'sale'))
            if 'brandUrl' in primary:
                event.brand_link = primary['brandUrl']

        event.listing_url = event_data['prefix'] + event_data['url']
        # Record when the schedule changes so rescheduled sales can be detected.
        events_begin = time2utc(event_data['start'])
        events_end = time2utc(event_data['end'])
        if event.events_begin != events_begin:
            event.update_history.update({'events_begin': datetime.utcnow()})
            event.events_begin = events_begin
        if event.events_end != events_end:
            event.update_history.update({'events_end': datetime.utcnow()})
            event.events_end = events_end

        # Merge departments without duplicating entries already stored.
        for dept in event_data['departments']:
            if dept not in event.dept:
                event.dept.append(dept)
        event.soldout = event_data.get('soldOut') == 1
        event.update_time = datetime.utcnow()

        # event_data['dataType'] == 'upcoming' entries carry no products yet.
        if 'asins' in event_data:
            event.asin_detail_page = event_data['asins']
        if 'cAsins' in event_data:
            event.casin_soldout_info = event_data['cAsins']
        event.save()
        common_saved.send(sender=ctx, obj_type='Event', key=event.event_id,
                          url=event.combine_url, is_new=is_new, is_updated=is_updated)