示例#1
0
文件: parsers.py 项目: linVdcd/cola
    def parse(self, url=None):
        if self.bundle.exists is False:
            return

        url = url or self.url
        try:
            br = self.opener.browse_open(url)
        except Exception as e:
            print(e)
            print('休息10分钟!')
            time.sleep(60 * 10)
        try:
            jsn = json.loads(br.response().read())
        except ValueError:
            print('休息10分钟!')
            time.sleep(60 * 10)
            raise FetchBannedError('fetch banned by weibo server')


#         self.logger.debug('load %s finish' % url)

        try:
            soup = beautiful_soup(jsn['data']['html'])
            current_page = jsn['data']['page']['pagenum']
            n_pages = jsn['data']['page']['totalpage']
        except KeyError:
            print('休息10分钟!')
            time.sleep(60 * 10)
            raise FetchBannedError('fetch banned by weibo server')

        if not self.check(url, br):
            return

        decodes = urldecode(url)
        mid = decodes.get('id', decodes.get('mid'))

        mblog = self.bundle.current_mblog
        if mblog is None or mblog.mid != mid:
            try:
                mblog = getattr(MicroBlog,
                                'objects').get(Q(mid=mid) & Q(uid=self.uid))
            except DoesNotExist:
                mblog = MicroBlog(mid=mid, uid=self.uid)
                mblog.save()

        def set_instance(instance, dl):
            instance.avatar = dl.find('dt').find('img')['src']
            date = dl.find('dd').find(attrs={'class': 'S_txt2'}).text
            date = date.strip().strip('(').strip(')')
            instance.created = self.parse_datetime(date)
            for div in dl.find_all('div'):
                div.extract()
            for span in dl.find_all('span'):
                span.extract()
            instance.content = dl.text.strip()

        counter_type = None
        #print(u'微博:'+mblog.content+u'的评论')
        if url.startswith('http://weibo.com/aj/comment'):
            counter_type = 'comment'
            dls = soup.find_all('dl', mid=True)
            for dl in dls:
                uid = dl.find('a', usercard=True)['usercard'].split("id=",
                                                                    1)[1]
                comment = Comment(uid=uid)
                set_instance(comment, dl)
                #print(u'微博评论:'+comment.content)

                mblog.comments.append(comment)
        elif url.startswith('http://weibo.com/aj/mblog/info'):
            counter_type = 'forward'
            dls = soup.find_all('dl', mid=True)
            for dl in dls:
                forward_again_a = dl.find(
                    'a',
                    attrs={
                        'action-type': re.compile("^(feed_list|fl)_forward$")
                    })
                uid = urldecode('?%s' % forward_again_a['action-data'])['uid']
                forward = Forward(uid=uid, mid=dl['mid'])
                set_instance(forward, dl)

                mblog.forwards.append(forward)
        elif url.startswith('http://weibo.com/aj/like'):
            counter_type = 'like'
            lis = soup.find_all('li', uid=True)
            for li in lis:
                like = Like(uid=li['uid'])
                like.avatar = li.find('img')['src']

                mblog.likes.append(like)

        mblog.save()
        #       self.logger.debug('parse %s finish' % url)

        # counter add one for the processed forward or comment or like list url
        if counter_type is not None:
            self.counter.inc('processed_%s_list_page' % counter_type, 1)

        if current_page >= n_pages:
            return

        params = urldecode(url)
        new_params = urldecode('?page=%s' % (current_page + 1))
        params.update(new_params)
        params['__rnd'] = int(time.time() * 1000)
        next_page = '%s?%s' % (url.split('?')[0], urllib.urlencode(params))
        yield next_page
示例#2
0
    def parse(self, url=None):
        if self.bundle.exists == False:
            return [], []

        url = url or self.url
        br = None
        jsn = None
        try:
            br = self.opener.browse_open(url)
            self.logger.debug('load %s finish' % url)
            jsn = json.loads(br.response().read())
        except (ValueError, URLError) as e:
            return self._error(url, e)

        soup = beautiful_soup(jsn['data']['html'])
        current_page = jsn['data']['page']['pagenum']
        n_pages = jsn['data']['page']['totalpage']

        if not self.check(url, br):
            return [], []

        decodes = urldecode(url)
        mid = decodes.get('id', decodes.get('mid'))

        mblog = self.bundle.current_mblog
        if mblog is None or mblog.mid != mid:
            try:
                mblog = getattr(MicroBlog,
                                'objects').get(Q(mid=mid) & Q(uid=self.uid))
            except DoesNotExist:
                mblog = MicroBlog(mid=mid, uid=self.uid)
                mblog.save()

        def set_instance(instance, dl):
            instance.avatar = dl.find('dt').find('img')['src']
            date = dl.find('dd').find(attrs={'class': 'S_txt2'}).text
            date = date.strip().strip('(').strip(')')
            instance.created = self.parse_datetime(date)
            for div in dl.find_all('div'):
                div.extract()
            for span in dl.find_all('span'):
                span.extract()
            instance.content = dl.text.strip()

        if url.startswith('http://weibo.com/aj/comment'):
            dls = soup.find_all('dl', mid=True)
            for dl in dls:
                comment = Comment(uid=self.uid)
                set_instance(comment, dl)

                mblog.comments.append(comment)
        elif url.startswith('http://weibo.com/aj/mblog/info'):
            dls = soup.find_all('dl', mid=True)
            for dl in dls:
                forward = Forward(uid=self.uid, mid=dl['mid'])
                set_instance(forward, dl)

                mblog.forwards.append(forward)
        elif url.startswith('http://weibo.com/aj/like'):
            lis = soup.find_all('li', uid=True)
            for li in lis:
                like = Like(uid=li['uid'])
                like.avatar = li.find('img')['src']

                mblog.likes.append(like)

        try:
            mblog.save()
            self.logger.debug('parse %s finish' % url)
        except ValidationError, e:
            return self._error(url, e)