Exemplo n.º 1
0
class GetUsers():
    """Collect author ``sec_uid`` values from category pages and store them.

    Raw pages come from ``GetRawData.get_users``; every id found is handed to
    ``RedisClient.add_users``.
    """

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()

    def get_users(self, category_id, page):
        """Fetch one page of a category and save the author ids it contains.

        A failure while fetching is logged and swallowed so that a single bad
        page does not take down the other greenlets of the batch.

        Args:
            category_id: Category identifier forwarded to the raw-data client.
            page: Page number forwarded to the raw-data client.

        Returns:
            None. Results are written through ``save_to_redis``.
        """
        try:
            raw_data = self.get_raw_data.get_users(category_id, page)
        except Exception as e:
            # ``e.args`` is a tuple, so concatenating it to a str raised
            # TypeError inside the handler; stringify every part instead.
            logger.error('get_users错误-' + str(e) + '-category_id-' +
                         str(category_id) + '-page-' + str(page))
            return None
        if raw_data:
            self.save_to_redis(self.parse_users(raw_data))
        return None

    def parse_users(self, raw_data):
        """Extract the ``sec_uid`` of every author listed in ``raw_data``.

        Args:
            raw_data: Mapping with an ``aweme_list`` entry whose items carry
                an ``author`` mapping.

        Returns:
            List of non-empty ``sec_uid`` values in page order. Entries with a
            missing author or id are skipped; a missing or empty
            ``aweme_list`` yields an empty list.
        """
        sec_user_id_list = []
        for aweme in raw_data.get('aweme_list') or []:
            author = aweme.get('author') or {}
            sec_user_id = author.get('sec_uid')
            if sec_user_id:
                sec_user_id_list.append(sec_user_id)
        return sec_user_id_list

    def save_to_redis(self, sec_user_id_list):
        """Pass each id in ``sec_user_id_list`` to ``redis_client.add_users``."""
        for sec_user_id in sec_user_id_list:
            self.redis_client.add_users(sec_user_id)

    def run(self):
        """Crawl pages 0-99 of categories -1 through 14, one category at a time.

        The 100 pages of a category are fetched concurrently as gevent
        greenlets; the next category starts once all of them have finished.
        """
        for cate in range(-1, 15):
            logger.info('get_users当前爬取cate-' + str(cate))
            tasks = [
                gevent.spawn(self.get_users, str(cate), str(page))
                for page in range(0, 100)
            ]
            gevent.joinall(tasks)
Exemplo n.º 2
0
class GetRooms():
    """Collect room ids and owner ``sec_uid`` values from the channel feed.

    A room is kept when its owner has at least ``min_follower_count``
    followers and the room's item list contains at least one promotion.
    """

    # Owners below this follower count are skipped without an item-list call.
    min_follower_count = 10000

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        # Accumulators filled by parse_channel and emptied again by run().
        self.sec_user_id_list = []
        self.room_id_list = []

    def get_channel(self):
        """Fetch the channel feed, parse it and store the qualifying entries.

        Fetch and parse failures are logged and end the call early; nothing
        is written to redis in that case.

        Returns:
            None in every case.
        """
        try:
            channel_raw_data = self.get_raw_data.get_channel()
        except Exception as e:
            # str(e) instead of e.args[0]: args may be empty or hold a
            # non-string, which made the handler itself raise.
            logger.error('get_channel出错-' + str(e))
            return None

        try:
            self.parse_channel(channel_raw_data)
        except Exception as e:
            logger.error('parse_channel出错-' + str(e))
            return None

        for room_id in self.room_id_list:
            self.redis_client.add_rooms(room_id)
        for sec_user_id in self.sec_user_id_list:
            # NOTE(review): the meaning of the second argument (1) is defined
            # by RedisClient.add_users, which is not visible here.
            self.redis_client.add_users(sec_user_id, 1)
        return None

    def parse_channel(self, channel_raw_data):
        """Append qualifying rooms and owners to the instance accumulators.

        Args:
            channel_raw_data: Mapping whose ``data`` entry is a list of items,
                each wrapping a room mapping under its own ``data`` key.

        Returns:
            None. Results go to ``room_id_list`` and ``sec_user_id_list``.

        Raises:
            AttributeError, TypeError: If the payload does not have the
                expected nesting; ``get_channel`` catches and logs these.
        """
        for each in channel_raw_data.get('data'):
            room = each.get('data')
            room_id = room.get('id_str')
            owner = room.get('owner')
            sec_user_id = owner.get('sec_uid')
            follower = owner.get('follow_info').get('follower_count')

            if follower < self.min_follower_count:
                continue

            try:
                item_list = self.get_raw_data.get_item_list(
                    sec_user_id, room_id)
            except Exception as e:
                logger.error('get_item_list出错-' + str(e))
                # Stops processing the remaining rooms; rooms collected so
                # far stay in the accumulators and are still saved by
                # get_channel.
                return None

            # Truthiness check also covers a missing/None promotions field.
            if item_list.get('promotions'):
                self.room_id_list.append(room_id)
                self.sec_user_id_list.append(sec_user_id)
        return None

    def run(self):
        """Run one ``get_channel`` pass in a greenlet, log counts, then reset."""
        tasks = [gevent.spawn(self.get_channel)]
        gevent.joinall(tasks)
        logger.info('本批次共获得room_id和sec_user_id-' +
                    str(len(self.sec_user_id_list)) + '-' +
                    str(len(self.room_id_list)))
        self.sec_user_id_list.clear()
        self.room_id_list.clear()
Exemplo n.º 3
0
class CheckQualificationByPromotion():
    """Promote candidate users to the user set based on their promotion count.

    Candidates are pulled from redis in batches; a candidate is added via
    ``RedisClient.add_users`` when its promotions page lists more than
    ``min_promotion_count`` promotions.
    """

    # A candidate needs strictly more promotions than this to qualify.
    min_promotion_count = 10

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        self.batch_size = 50

    def get_pre_users(self):
        """Pull up to ``batch_size`` candidates that are not users yet.

        Returns:
            List of candidate ids; shorter than ``batch_size`` when the
            candidate source runs dry first.
        """
        batch = []
        while len(batch) < self.batch_size:
            pre_user = self.redis_client.get_pre_users()
            if pre_user is None:
                # Assumes redis_client.get_pre_users() returns None once no
                # candidates are left - TODO confirm. Without this guard the
                # batch was padded with None entries.
                break
            if not self.is_user(pre_user):  # skip ids already in the user table
                batch.append(pre_user)
        return batch

    def count_pre_users(self):
        """Return ``redis_client.count_pre_users()``."""
        return self.redis_client.count_pre_users()

    def is_user(self, sec_user_id):
        """Return ``redis_client.is_user(sec_user_id)``."""
        return self.redis_client.is_user(sec_user_id)

    def check_qualification_by_promotion(self, sec_user_id):
        """Fetch a candidate's promotions page and add the user if it qualifies.

        Fetch and parse failures are logged and swallowed.

        Args:
            sec_user_id: Candidate id forwarded to the raw-data client.

        Returns:
            None in every case.
        """
        try:
            raw_data = self.get_raw_data.get_promotions(sec_user_id)
        except Exception as e:
            # str(...) on both parts: e.args[0] may be missing or not a str,
            # and the id may not be a str either.
            logger.error('get_promotions出错-' + str(e) + '-sec_user_id-' +
                         str(sec_user_id))
            return None

        try:
            # Reading the first column's name raises unless a page was
            # actually returned.
            raw_data.get('columns')[0].get('name')
            data = raw_data.get('promotions')
            if len(data) > self.min_promotion_count:
                self.redis_client.add_users(sec_user_id)
        except Exception as e:
            logger.error('解析promotions页面失败-sec_user_id-' +
                         str(sec_user_id) + '-' + str(e))
        return None

    def run(self):
        """Check one batch of candidates concurrently, or exit when none remain.

        Raises:
            SystemExit: When ``count_pre_users()`` reports no candidates.
        """
        if self.count_pre_users() > 0:
            batch = self.get_pre_users()
            tasks = [
                gevent.spawn(self.check_qualification_by_promotion,
                             sec_user_id) for sec_user_id in batch
            ]
            gevent.joinall(tasks)
        else:
            logger.info('pre_users列表空了,程序退出')
            sys.exit()