def parse(self, response):
        """Parse one timeline page, update the uid's priority score, maybe page on.

        Reads ``page`` and ``uid`` from ``response.meta``, decodes the JSON
        body and converts every entry of ``resp['statuses']`` with
        ``resp2item_v2``.  It then counts how many statuses are new, using
        ``self.bloom`` when it is set and otherwise a lookup in
        ``self.db.master_timeline_weibo``.

        On page 1 the count is fed back into the hash
        ``self.uids_priority_set`` held by ``self.r``: the uid's score is
        incremented (while below 10) when something new was seen, and
        decremented (while above 0) when nothing was.

        When more than ``AT_LEAST_UPDATE_COUNT`` statuses are new, a request
        for the next page is appended to the result.

        Returns:
            A list with the converted items, optionally followed by the
            request for the next page.

        Raises:
            ShouldNotEmptyError: if ``statuses`` is missing or empty.
        """
        page = response.meta['page']
        uid = response.meta['uid']

        resp = json.loads(response.body)
        results = []

        if not resp.get('statuses'):
            raise ShouldNotEmptyError()

        for status in resp['statuses']:
            items = resp2item_v2(status)
            results.extend(items)

        # Bloom filter or mongo: count the statuses not seen before.  Enough
        # new ones trigger a fetch of the next page, and on page 1 the count
        # is also fed back into the uid's priority score.
        update_count = 0
        if self.bloom:
            for status in resp['statuses']:
                if 'mid' in status and not self.bloom.check(status['mid']):
                    update_count += 1
                    # Remember this status in the filter.
                    self.bloom.add(status['mid'], int(time.time() * 1000))
        else:
            for status in resp['statuses']:
                if 'id' in status and self.db.master_timeline_weibo.find({
                        '_id':
                        status['id']
                }).limit(1).count() == 0:
                    update_count += 1

        if page == 1:
            # The stored score is coerced to int before comparing: hash
            # values typically come back from the store as strings (or None
            # when the field does not exist yet), and comparing those with an
            # int is meaningless on Python 2 and a TypeError on Python 3.
            score = int(self.r.hget(self.uids_priority_set, uid) or 0)
            if update_count > 0 and score < 10:
                self.r.hincrby(self.uids_priority_set, uid, 1)
            elif update_count == 0 and score > 0:
                self.r.hincrby(self.uids_priority_set, uid, -1)

            log.msg(format='Score [uid:%(uid)s] update to %(score)s',
                    level=log.INFO,
                    uid=uid,
                    score=self.r.hget(self.uids_priority_set, uid))

        if update_count > AT_LEAST_UPDATE_COUNT:
            page += 1
            request = Request(BASE_URL.format(uid=uid, page=page),
                              headers=None)
            request.meta['page'] = page
            request.meta['uid'] = uid

            results.append(request)
            log.msg(
                format=
                'One more page [uid:%(uid)s] page:%(page)s update_count:%(update_count)s',
                level=log.INFO,
                uid=uid,
                page=page,
                update_count=update_count)

        return results
示例#2
0
    def more_reposts(self, response):
        """Collect one page of reposts and link them to the source weibo.

        Every entry of ``resp['reposts']`` is converted with
        ``resp2item_v2``.  For each non-empty conversion, the ``id`` of its
        first item is appended to ``source_weibo['reposts']`` and all of its
        items are added to the result.  The source weibo taken from
        ``response.meta`` is appended last.

        Raises:
            ShouldNotEmptyError: if ``resp['reposts']`` is an empty list.
        """
        origin = response.meta['source_weibo']
        payload = json.loads(response.body)

        reposts = payload['reposts']
        if reposts == []:
            raise ShouldNotEmptyError()

        collected = []
        for entry in reposts:
            converted = resp2item_v2(entry)
            if converted != []:
                # The first converted item is the repost weibo itself.
                repost_weibo = converted[0]
                origin['reposts'].append(repost_weibo['id'])
                collected.extend(converted)

        collected.append(origin)
        return collected
示例#3
0
    def source_user(self, response):
        """Convert the user response and queue the first followers request.

        The decoded body is passed to ``resp2item_v2``.  The converted items
        are returned, followed by a request for ``FOLLOWERS_URL`` at cursor 0
        whose callback is ``self.more_followers``.  The request's meta
        carries ``uid``, ``cursor`` and the first converted item as
        ``source_user``.

        Raises:
            ShouldNotEmptyError: if fewer than two items were converted.
        """
        uid = response.meta['uid']
        converted = resp2item_v2(json.loads(response.body))
        if len(converted) < 2:
            raise ShouldNotEmptyError()

        output = []
        output.extend(converted)

        first_item = converted[0]
        followers_url = FOLLOWERS_URL.format(uid=uid, cursor=0)
        follow_req = Request(followers_url, headers=None,
                             callback=self.more_followers)
        follow_req.meta['uid'] = uid
        follow_req.meta['cursor'] = 0
        follow_req.meta['source_user'] = first_item

        output.append(follow_req)
        return output
    def parse(self, response):
        """Convert a page of statuses; in 'allpages' mode also request the next.

        The decoded body is iterated directly and each entry is converted
        with ``resp2item_v1``.  When ``self.mode == 'allpages'`` a request
        for ``page + 1`` is appended, carrying ``page`` and ``uid`` in its
        meta.

        Raises:
            ShouldNotEmptyError: if the decoded body is an empty list.
        """
        page = response.meta['page']
        uid = response.meta['uid']

        statuses = json.loads(response.body)
        if statuses == []:
            raise ShouldNotEmptyError()

        output = [item
                  for status in statuses
                  for item in resp2item_v1(status)]

        if self.mode != 'allpages':
            return output

        next_page = page + 1
        follow_up = Request(BASE_URL.format(uid=uid, page=next_page))
        follow_up.meta['page'] = next_page
        follow_up.meta['uid'] = uid
        output.append(follow_up)
        return output
示例#5
0
    def soucre_weibo(self, response):
        """Convert the source weibo and queue one request per page of reposts.

        The decoded body is passed to ``resp2item_v2`` and the first
        converted item is treated as the source weibo.  Its
        ``reposts_count`` is divided into pages of 200, and for every page
        (numbered from 1) a request with callback ``self.more_reposts`` is
        appended.  Each request's meta carries ``page``, ``wid`` and
        ``source_weibo``.

        Raises:
            ShouldNotEmptyError: if fewer than two items were converted.
        """
        converted = resp2item_v2(json.loads(response.body))
        if len(converted) < 2:
            raise ShouldNotEmptyError()

        output = list(converted)

        origin = converted[0]
        total_reposts = origin['reposts_count']
        origin_id = origin['id']
        last_page = int(math.ceil(total_reposts / 200.0))

        page_no = 1
        while page_no <= last_page:
            page_req = Request(BASE_URL.format(id=origin_id, page=page_no),
                               headers=None,
                               callback=self.more_reposts)
            page_req.meta['page'] = page_no
            page_req.meta['wid'] = origin_id
            page_req.meta['source_weibo'] = origin
            output.append(page_req)
            page_no += 1

        return output
示例#6
0
    def parse(self, response):
        """Convert one page of statuses and always request the following page.

        Reads ``page`` and ``uid`` from ``response.meta`` and converts each
        entry of ``resp['statuses']`` with ``resp2item_v2``.  A request for
        ``page + 1`` is then appended, built from ``BASE_URL`` with
        ``self.since_id`` and ``self.max_id``.  Because a follow-up request
        is always queued, paging only stops once a response comes back
        without statuses and this method raises.

        Returns:
            A list with the converted items followed by the next-page request.

        Raises:
            ShouldNotEmptyError: if ``statuses`` is missing, ``None`` or
                empty.
        """
        page = response.meta['page']
        uid = response.meta['uid']

        resp = json.loads(response.body)
        results = []

        # Treat a missing or null 'statuses' like an empty one, so such a
        # response raises ShouldNotEmptyError instead of a bare
        # KeyError/TypeError from the loop below.
        statuses = resp.get('statuses')
        if not statuses:
            raise ShouldNotEmptyError()

        for status in statuses:
            items = resp2item_v2(status)
            results.extend(items)

        page += 1
        request = Request(BASE_URL.format(uid=uid, page=page,
                          since_id=self.since_id, max_id=self.max_id), headers=None)
        request.meta['page'] = page
        request.meta['uid'] = uid

        results.append(request)

        return results