class Reduce_coll(object):
    """Flag and prune duplicated columns (cids) in the ``TX_Video`` database.

    A vid is treated as duplicated when it is listed under more than one cid
    in the ``cid_vid`` collection.  Among the cids sharing a vid, the one
    with the longest ``vids`` list is kept; the others are marked with
    ``dup_flag = 1`` in ``video_list`` and removed from ``cid_vid``.
    """

    # Database that holds every collection used by this class.
    DATABASE_NAME = 'TX_Video'

    def __init__(self):
        """Open the MongoDB helper and bind the two collections used here."""
        self.client = MongoDBHelper()
        self.video_list_coll = self.client.get_collection(
            collection_name='video_list', database_name=self.DATABASE_NAME)
        self.cid_vid_coll = self.client.get_collection(
            collection_name='cid_vid', database_name=self.DATABASE_NAME)

    def reduce_decument(self):
        """Detect duplicated cids, flag them in ``video_list`` and drop them
        from ``cid_vid``.

        The detected cids are also stored on ``self.target_cids`` (a set).
        A notification with the number of detected cids is sent on a
        best-effort basis; a failure to notify never aborts the clean-up.
        """
        # Imported locally so this method does not rely on a module-level
        # import of ``logging``.
        import logging

        cursor = self.cid_vid_coll.find(
            {'vids': {'$exists': 1}},
            {'_id': 0, 'cid': 1, 'vids': 1})
        try:
            cid_vid_list = list(cursor)
        finally:
            # Release the server-side cursor even if iteration fails.
            cursor.close()

        self.target_cids = self._get_dup_cid(cid_vid_list)

        try:
            BONotifier().msg(
                'TX_Video reduce_decument: 所有栏目list=={}'.format(
                    len(self.target_cids)), '@kang')
        except Exception:
            # Notification is optional: record the failure and carry on.
            logging.getLogger(__name__).warning(
                'reduce_decument: notification failed', exc_info=True)

        if not self.target_cids:
            # Nothing to flag or delete; skip the two write round trips.
            return

        target_cids = list(self.target_cids)
        # NOTE(review): assumes the helper returns pymongo >= 3 collections,
        # as the update_one/insert_one calls elsewhere in this file suggest.
        self.video_list_coll.update_many(
            {'cid': {'$in': target_cids}},
            {'$set': {'dup_flag': 1}},
            upsert=False)
        self.cid_vid_coll.delete_many({'cid': {'$in': target_cids}})

    def _get_dup_cid(self, cid_vid_list):
        """Return the set of cids considered duplicates.

        Args:
            cid_vid_list: iterable of dicts, each with a ``'cid'`` key and a
                ``'vids'`` key holding a list of hashable vids.

        Returns:
            set: for every vid listed under more than one cid, all of those
            cids except the one with the longest ``vids`` list.  When lengths
            tie, the cid appearing last in ``cid_vid_list`` is the one kept.
        """
        owners_by_vid = {}
        for vids_dict in cid_vid_list:
            cid = vids_dict['cid']
            vids_list = vids_dict['vids']
            vid_total = len(vids_list)
            # Count each vid once per cid: a vid repeated inside a single
            # list must not make that cid a duplicate of itself.
            for vid in set(vids_list):
                owners_by_vid.setdefault(vid, []).append((cid, vid_total))

        dup_cid_set = set()
        for owners in owners_by_vid.values():
            if len(owners) < 2:
                continue
            # Stable ascending sort by list length; the last entry survives.
            owners.sort(key=lambda owner: owner[1])
            dup_cid_set.update(cid for cid, _ in owners[:-1])
        return dup_cid_set
Пример #2
0
class TencentVideoPipeline(object):
    """Item pipeline that stores crawled Tencent Video items in MongoDB.

    ``process_item`` stamps every item's ``info`` dict with the current time
    and routes it, by item class, to the ``process_*`` method that writes it
    to the matching collection of the ``TX_Video`` database.
    NOTE(review): the ``process_item`` / ``open_spider`` / ``close_spider``
    hooks look like the Scrapy item-pipeline interface - confirm.
    """

    # Database that holds every collection used by this pipeline.
    DATABASE_NAME = 'TX_Video'

    def __init__(self):
        """Open the MongoDB helper and bind one attribute per collection."""
        self.client = MongoDBHelper()

        def collection(name):
            # All collections live in the same database.
            return self.client.get_collection(
                collection_name=name, database_name=self.DATABASE_NAME)

        self.video_list_coll = collection('video_list')
        self.video_info_coll = collection('video_info')
        self.history_video_list_coll = collection('history_video_list')
        self.cid_vid_coll = collection('cid_vid')
        self.play_info_coll = collection('play_info')
        self.play_info_demo_coll = collection('play_info_demo')
        self.comment_info_coll = collection('comment_info')
        self.user_info_coll = collection('user_info')

    def process_item(self, item, spider):
        """Timestamp ``item['info']`` and persist it according to item type.

        Args:
            item: mapping-like item exposing an ``'info'`` dict.
            spider: the spider that produced the item (unused here).

        Returns:
            The same ``item``; its ``info`` dict is modified in place.
            Items of an unrecognised class are stamped but not stored.
        """
        info = item['info']
        # NOTE(review): ``ts`` is a naive UTC datetime while ``ts_string`` is
        # the *local* calendar date, so the two can disagree around midnight.
        info['ts'] = datetime.datetime.utcnow()
        info['ts_string'] = str(datetime.date.today())

        if isinstance(item, VideoListItem):
            self.process_video_list(info)
        elif isinstance(item, VidItem):
            self.process_cid_vid(info)
        elif isinstance(item, PlayInfoItem):
            self.process_play_info(info)
        elif isinstance(item, CommentInfoItem):
            self.process_comment_info(info)
        elif isinstance(item, VideoInfoItem):
            self.process_video_info(info)
        elif isinstance(item, UserInfoItem):
            self.process_user_info(info)
        elif isinstance(item, PlayInfoDemoItem):
            self.process_play_info_demo(info)

        return item

    def process_video_list(self, info):
        """Append ``info`` to the history collection and upsert it, keyed by
        ``cid``, into ``video_list``.

        The history record keeps the original ``type_name``; the upserted
        record has the micro-film type folded into the movie type.
        """
        self.history_video_list_coll.insert_one(info)
        if info['type_name'] == '微电影':
            # Since 2018-10-09 every micro-film ('微电影') is filed under
            # movies ('电影').
            info['type_name'] = '电影'
        # pymongo's insert_one() adds an ``_id`` to the dict it is given.
        # ``_id`` is immutable on an existing document, so it must not be
        # part of the ``$set`` payload applied to ``video_list``.
        payload = {key: value for key, value in info.items() if key != '_id'}
        self.video_list_coll.update_one({'cid': info['cid']},
                                        {'$set': payload},
                                        upsert=True)

    def process_video_info(self, info):
        """Upsert ``info`` into ``video_info``, keyed by ``unique_id``."""
        self.video_info_coll.update_one({'unique_id': info['unique_id']},
                                        {'$set': info},
                                        upsert=True)

    @staticmethod
    def _normalize_play_counts(info):
        """Reset both play counters to 0 when ``positive_play_count`` is -1
        but ``play_count`` is not.

        Indirect handling of pages that report ``positive_play_count: null``.
        ``info`` is modified in place and also returned.
        """
        if info['positive_play_count'] == -1 and info['play_count'] != -1:
            info['positive_play_count'] = 0
            info['play_count'] = 0
        return info

    def process_play_info(self, info):
        """Normalise the play counters and insert ``info`` into ``play_info``."""
        self.play_info_coll.insert_one(self._normalize_play_counts(info))

    def process_play_info_demo(self, info):
        """Normalise the play counters and insert ``info`` into
        ``play_info_demo``."""
        self.play_info_demo_coll.insert_one(self._normalize_play_counts(info))

    def process_user_info(self, info):
        """Insert ``info`` into ``user_info``."""
        self.user_info_coll.insert_one(info)

    def process_comment_info(self, info):
        """Insert ``info`` into ``comment_info``."""
        self.comment_info_coll.insert_one(info)

    def process_cid_vid(self, info):
        """Upsert the cid -> vids record described by ``info`` into ``cid_vid``.

        ``'cid'`` is removed from ``info`` and used as the lookup key.
        ``info['flag']`` selects the behaviour:

        * ``1`` - ``info['vids']`` is replaced by its union with the vids
          already stored for this cid (if a stored record exists), then
          upserted.
        * ``2`` - ``info`` is upserted as is.
        * anything else - nothing is written.
        """
        cid = info.pop('cid')
        flag = info['flag']
        if flag not in (1, 2):
            return
        if flag == 1:
            vids = info['vids']
            stored = self.cid_vid_coll.find_one({'cid': cid},
                                                {'_id': 0, 'vids': 1})
            if stored:
                # A stored ``vids`` of None is treated as "no vids" rather
                # than raising on set(None).
                info['vids'] = list(set(vids) | set(stored.get('vids') or ()))
        self.cid_vid_coll.update_one({'cid': cid}, {'$set': info},
                                     upsert=True)

    def open_spider(self, spider):
        """Hook called when a spider opens; currently a no-op."""
        # Disabled: used to send the BONotifier message
        # 'Tencent_Video {spider.name} opened' to '@kang', ignoring failures.
        pass

    def close_spider(self, spider):
        """Hook called when a spider closes; currently a no-op."""
        # Disabled: used to send the BONotifier message
        # 'Tencent_Video {spider.name} closed' to '@kang' and then, depending
        # on spider.name, report how many documents of one collection carry
        # today's ``ts_string`` (all notifier failures were ignored):
        #   CommentInfoSpider -> comment_info_coll, 'CommentInfo_daily'
        #   PlayInfoSpider    -> play_info_coll,    'PlayCount_daily'
        #   VideoListSpider   -> video_list_coll,   'VideoList_daily'
        #   VideoInfoSpider   -> video_info_coll,   'VideoInfo_daily'
        #   UserInfoSpider    -> user_info_coll,    'UserInfo_weekly'
        pass