class GetRooms:
    """Harvest room ids and streamer ids from the channel feed.

    A room is kept when its owner has at least 10000 followers and the
    item list fetched for that room contains at least one promotion.
    Kept ids are pushed to Redis through ``RedisClient``.
    """

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        # Accumulators filled by parse_channel() and emptied by run().
        self.sec_user_id_list = []
        self.room_id_list = []

    def get_channel(self):
        """Fetch the channel feed, parse it and store the kept ids.

        Returns:
            Always ``None``. Failures while fetching or parsing are logged
            and end the call early, before anything is written to Redis.
        """
        try:
            channel_raw_data = self.get_raw_data.get_channel()
        except Exception as e:
            # str(e) rather than e.args[0]: args can be empty or hold a
            # non-string, which would make this handler raise itself.
            logger.error('get_channel出错-' + str(e))
            return None
        try:
            self.parse_channel(channel_raw_data)
        except Exception as e:
            logger.error('parse_channel出错-' + str(e))
            return None
        for room_id in self.room_id_list:
            self.redis_client.add_rooms(room_id)
        for sec_user_id in self.sec_user_id_list:
            # NOTE(review): the literal 1 is presumably an initial
            # score/status for the user; confirm against RedisClient.
            self.redis_client.add_users(sec_user_id, 1)

    def parse_channel(self, channel_raw_data):
        """Append qualifying room/user ids from one feed response.

        Args:
            channel_raw_data: Mapping whose ``'data'`` entry is a list of
                items shaped like ``{'data': {'id_str', 'owner': {...}}}``.

        Raises:
            Exception: Structural problems in the feed (missing keys,
                ``None`` where a mapping is expected) propagate to the
                caller, which logs them.
        """
        for each in channel_raw_data.get('data'):
            room = each.get('data')
            room_id = room.get('id_str')
            owner = room.get('owner')
            sec_user_id = owner.get('sec_uid')
            follower = owner.get('follow_info').get('follower_count')
            if follower < 10000:
                continue
            try:
                item_list = self.get_raw_data.get_item_list(
                    sec_user_id, room_id)
            except Exception as e:
                logger.error('get_item_list出错-' + str(e))
                # NOTE(review): a single failed lookup ends the whole
                # pass, as in the original; confirm that skipping just
                # this room would not be the better behaviour.
                return None
            # Truthiness also covers a missing 'promotions' entry, which
            # previously raised TypeError from len(None).
            if item_list.get('promotions'):
                self.room_id_list.append(room_id)
                self.sec_user_id_list.append(sec_user_id)

    def run(self):
        """Run one harvesting pass in a greenlet, log totals, reset state."""
        tasks = [gevent.spawn(self.get_channel) for _ in range(1)]
        gevent.joinall(tasks)
        logger.info('本批次共获得room_id和sec_user_id-' +
                    str(len(self.sec_user_id_list)) + '-' +
                    str(len(self.room_id_list)))
        self.sec_user_id_list.clear()
        self.room_id_list.clear()
class GetCurrentRoom:
    """Look up, per stored user, whether they currently own a live room.

    Room ids found by get_current_room() are collected in
    ``room_id_list``; save_rooms() writes that list to Redis.
    """

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        # Room ids discovered by get_current_room().
        self.room_id_list = []

    def get_users(self):
        """Return the users stored in Redis."""
        return self.redis_client.get_users()

    def save_rooms(self):
        """Write every collected room id to Redis."""
        for room_id in self.room_id_list:
            # NOTE(review): the literal 0 is presumably a "not yet
            # verified" status/score; confirm against RedisClient.
            self.redis_client.add_rooms(room_id, 0)

    def get_current_room(self, sec_user_id):
        """Fetch one user's current-room payload and record a live room.

        Args:
            sec_user_id: User identifier; concatenated into log messages,
                so it is expected to be a ``str``.

        Returns:
            Always ``None``; results are appended to ``room_id_list``.
        """
        try:
            raw_data = self.get_raw_data.get_current_room(sec_user_id)
        except Exception as e:
            # str(e) rather than e.args[0]: args can be empty or hold a
            # non-string, which would make this handler raise itself.
            logger.error('get_current_room出错-' + str(e) +
                         '-sec_user_id-' + sec_user_id)
            return None
        try:
            data = raw_data.get('data')
            # Probe a nested field purely to verify the payload has the
            # expected shape; the value itself is not used.
            data.get('pay_grade').get('grade_describe')
        except Exception as e:
            logger.error('parse_current_room出错' + str(e) +
                         '-sec_user_id-' + sec_user_id)
            return None
        # A populated own_room means the stream has already started.
        own_room = data.get('own_room')
        room_ids = own_room.get('room_ids_str') if own_room else None
        if room_ids:
            room_id = room_ids[0]
            self.room_id_list.append(room_id)
            logger.info(sec_user_id + '-正在直播,room_id-' + room_id)
        else:
            # Also reached when own_room carries no room ids, which
            # previously raised from the [0] lookup.
            logger.info(sec_user_id + '-未在直播')

    def run(self):
        """Check every stored user in greenlet batches of 20.

        NOTE(review): run() only fills ``room_id_list``; nothing here
        calls save_rooms(), so the caller presumably does. Confirm.
        """
        users = self.get_users()
        logger.info('共有users-' + str(len(users)))
        batch_size = 20
        for start in range(0, len(users), batch_size):
            stop = min(start + batch_size, len(users))
            logger.info('当前爬取用户序号-' + str(start + 1) + '-' + str(stop))
            tasks = [
                gevent.spawn(self.get_current_room, sec_user_id)
                for sec_user_id in users[start:stop]
            ]
            gevent.joinall(tasks)
class GetUserDongtai:
    """Detect live rooms from each stored user's "dongtai" payload.

    Room ids found by parse_user_dongtai() are collected in
    ``room_id_list``; save_rooms() writes that list to Redis.
    """

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        # Room ids (as strings) discovered by parse_user_dongtai().
        self.room_id_list = []

    def get_users(self):
        """Return the users stored in Redis."""
        return self.redis_client.get_users()

    def save_rooms(self):
        """Write every collected room id to Redis."""
        for room_id in self.room_id_list:
            # NOTE(review): the literal 0 is presumably a "not yet
            # verified" status/score; confirm against RedisClient.
            self.redis_client.add_rooms(room_id, 0)

    def get_user_dongtai(self, sec_user_id):
        """Fetch and parse one user's dongtai payload, logging failures.

        Args:
            sec_user_id: User identifier; concatenated into log messages,
                so it is expected to be a ``str``.

        Returns:
            Always ``None``.
        """
        try:
            raw_data = self.get_raw_data.get_user_dongtai(sec_user_id)
        except Exception as e:
            # str(e) rather than e.args[0]: args can be empty or hold a
            # non-string, which would make this handler raise itself.
            logger.error('get_user_dongtai出错-' + str(e) +
                         '-sec_user_id-' + sec_user_id)
            return None
        try:
            self.parse_user_dongtai(raw_data)
        except Exception as e:
            logger.error('parse_user_dongtai出错-' + str(e) +
                         '-sec_user_id-' + sec_user_id)

    def parse_user_dongtai(self, raw_data):
        """Record the author's room id from the first dongtai entry.

        Args:
            raw_data: Mapping with a non-empty ``'dongtai_list'`` whose
                first entry holds ``aweme -> author -> room_id``.

        Raises:
            Exception: A missing or empty list, or missing nested
                mappings, propagate to the caller, which logs them.
        """
        data = raw_data.get('dongtai_list')[0]
        room_id = data.get('aweme').get('author').get('room_id')
        # Truthiness instead of `!= 0`: an absent room_id (None) used to
        # count as live and stored the string 'None' as a room id.
        if room_id:
            self.room_id_list.append(str(room_id))
            logger.info('该主播已开始直播,room_id-' + str(room_id))
        else:
            logger.info('该主播尚未开始直播')

    def run(self):
        """Check every stored user in greenlet batches of 20.

        NOTE(review): run() only fills ``room_id_list``; nothing here
        calls save_rooms(), so the caller presumably does. Confirm.
        """
        users = self.get_users()
        logger.info('共有users-' + str(len(users)))
        # Original author's note: even at 20 per batch no data comes back.
        batch_size = 20
        for start in range(0, len(users), batch_size):
            stop = min(start + batch_size, len(users))
            logger.info('当前爬取用户序号-' + str(start + 1) + '-' + str(stop))
            tasks = [
                gevent.spawn(self.get_user_dongtai, sec_user_id)
                for sec_user_id in users[start:stop]
            ]
            gevent.joinall(tasks)
class CheckQualificationByRankList:
    """Vet stored users through the rank-list endpoint.

    Users whose nickname contains a blacklisted keyword are deleted from
    Redis. Users found with a live room are collected in
    ``live_user_list`` / ``room_id_list`` and persisted by run().
    """

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        # Accumulators filled by get_rank_list() and emptied by run().
        self.live_user_list = []
        self.room_id_list = []
        # Substrings that disqualify a nickname.
        self.stupid_key_words = STUPID_KEY_WORDS

    def get_users(self):
        """Return the users to check.

        Original author's note: each call fetches the 10000 users with
        the lowest score (behaviour lives in RedisClient; not visible
        here).
        """
        return self.redis_client.get_users()

    def is_live_user(self, sec_user_id):
        """Return RedisClient's answer on whether this is a live user."""
        return self.redis_client.is_live_user(sec_user_id)

    def save_rooms(self):
        """Write every collected room id to Redis."""
        for room_id in self.room_id_list:
            self.redis_client.add_rooms(room_id)

    def add_to_live_users(self):
        """Register every collected user as a live user in Redis."""
        for sec_user_id in self.live_user_list:
            # NOTE(review): the literal 1 is presumably a status/score;
            # confirm against RedisClient.
            self.redis_client.add_live_users(sec_user_id, 1)

    def increase_user_score(self, sec_user_id):
        """Bump the user's score in Redis."""
        self.redis_client.increase_user_score(sec_user_id)

    def is_qualified_user(self, user):
        """Return False when the nickname contains a blacklisted keyword.

        Args:
            user: Mapping with an optional ``'nickname'`` string.

        Returns:
            ``True`` when no keyword occurs in the nickname. A missing or
            ``None`` nickname counts as empty, so it qualifies; this used
            to raise ``TypeError``.
        """
        nickname = user.get('nickname') or ''
        return not any(word in nickname for word in self.stupid_key_words)

    def get_rank_list(self, sec_user_id):
        """Vet one user and record their room if they are live.

        Args:
            sec_user_id: User identifier; concatenated into log messages,
                so it is expected to be a ``str``.

        Returns:
            Always ``None``; results land in the accumulator lists and
            in Redis.
        """
        if self.is_live_user(sec_user_id):
            # Already registered as live: only bump the score.
            self.increase_user_score(sec_user_id)
            return None
        try:
            raw_data = self.get_raw_data.get_rank_list(sec_user_id)
        except Exception as e:
            # str(e) rather than e.args[0]: args can be empty or hold a
            # non-string, which would make this handler raise itself.
            logger.error('get_rank_list出错-' + str(e) +
                         '-sec_user_id-' + sec_user_id)
            return None
        try:
            user = raw_data.get('data').get('anchor_info').get('user')
            # Evaluated inside the try so that a missing user mapping is
            # logged as a parse failure instead of escaping uncaught.
            qualified = self.is_qualified_user(user)
        except Exception as e:
            logger.error('parse_current_room出错' + str(e) +
                         '-sec_user_id-' + sec_user_id)
            return None
        if not qualified:
            self.redis_client.delete_users(sec_user_id)
            logger.info('删除user-sec_user_id-' + sec_user_id)
            return None
        # A populated own_room means the stream has started.
        own_room = user.get('own_room')
        room_ids = own_room.get('room_ids_str') if own_room else None
        if room_ids:
            self.live_user_list.append(sec_user_id)
            self.room_id_list.append(room_ids[0])
            # NOTE(review): the block nesting was reconstructed; the
            # score bump is assumed to apply only to users found live.
            # Confirm it was not meant for every qualified user.
            self.increase_user_score(sec_user_id)

    def run(self):
        """Vet all fetched users in greenlet batches, then persist.

        NOTE(review): persistence and the summary log are placed after
        the batch loop; confirm they were not meant to run per batch.
        """
        users = self.get_users()
        # Original author's note: batches of 80 worked against this
        # endpoint (about 4000 users in total); larger was not tried.
        batch_size = 50
        for start in range(0, len(users), batch_size):
            stop = min(start + batch_size, len(users))
            logger.info('当前获取用户序号-' + str(start + 1) + '-' + str(stop))
            tasks = [
                gevent.spawn(self.get_rank_list, sec_user_id)
                for sec_user_id in users[start:stop]
            ]
            gevent.joinall(tasks)
        self.save_rooms()
        self.add_to_live_users()
        logger.info('新增room_id-' + str(len(self.room_id_list)))
        self.room_id_list.clear()
        self.live_user_list.clear()
class CheckRooms:
    """Re-check stored rooms and flag the ones that are live with goods.

    Rooms whose owner has fewer than 10000 followers are deleted from
    Redis together with that owner. Rooms with status 2 and a non-empty
    promotions list are collected in ``lives_on_list`` and re-added to
    Redis by change_room_status().
    """

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        # Room ids found live during the current run().
        self.lives_on_list = []

    def get_rooms(self):
        """Return the rooms to check.

        NOTE(review): the (0, 0) arguments presumably select rooms whose
        stored status is 0, i.e. not live at the last check, which is
        what run()'s log message suggests. Confirm against RedisClient.
        """
        return self.redis_client.get_rooms(0, 0)

    def change_room_status(self):
        """Re-add every newly live room to Redis with value 1."""
        for room_id in self.lives_on_list:
            self.redis_client.add_rooms(room_id, 1)

    def check_room(self, room_id):
        """Inspect one room and either prune it or record it as live.

        Args:
            room_id: Room identifier; concatenated into log messages, so
                it is expected to be a ``str``.

        Returns:
            Always ``None``; results land in ``lives_on_list`` and Redis.
        """
        try:
            room_raw_data = self.get_raw_data.get_live(room_id)
        except Exception as e:
            # str(e) rather than e.args[0]: args can be empty or hold a
            # non-string, which would make this handler raise itself.
            logger.error('get_live出错-' + str(e) + '-room_id-' + room_id)
            return None
        try:
            owner = room_raw_data.get('data').get('owner')
            follower_count = owner.get('follow_info').get('follower_count')
            sec_user_id = owner.get('sec_uid')
        except Exception as e:
            logger.error('解析room_raw_data出错-' + str(e) + '-room_id-' +
                         room_id)
            return None
        if follower_count < 10000:
            # Below the follower threshold: drop both owner and room.
            self.redis_client.delete_users(sec_user_id)
            self.redis_client.delete_rooms(room_id)
            return None
        # NOTE(review): status 2 is presumably "currently live"; confirm.
        if room_raw_data.get('data').get('status') != 2:
            return None
        try:
            # Decide whether this stream is selling goods.
            item_list_raw_data = self.get_raw_data.get_item_list(
                sec_user_id, room_id)
        except Exception as e:
            # str(sec_user_id): 'sec_uid' may be absent from the payload.
            logger.error('get_item_list出错' + str(e) +
                         '-sec_user_id和room_id-' + str(sec_user_id) + '-' +
                         room_id)
            return None
        # Truthiness also covers a missing 'promotions' entry, which
        # previously raised TypeError from len(None).
        if item_list_raw_data.get('promotions'):
            self.lives_on_list.append(room_id)

    def run(self):
        """Check all fetched rooms in greenlet batches of 200, then persist.

        NOTE(review): the status update and summary log are placed after
        the batch loop; confirm they were not meant to run per batch.
        """
        all_room_ids = self.get_rooms()
        logger.info('此前未在直播的直播间数量:' + str(len(all_room_ids)))
        batch_size = 200
        for start in range(0, len(all_room_ids), batch_size):
            stop = min(start + batch_size, len(all_room_ids))
            logger.info('待查看的此前未在直播的直播间-' + str(start + 1) + '-' +
                        str(stop))
            tasks = [
                gevent.spawn(self.check_room, room_id)
                for room_id in all_room_ids[start:stop]
            ]
            gevent.joinall(tasks)
        logger.info('新发现开始的直播数量-' + str(len(self.lives_on_list)))
        self.change_room_status()
        self.lives_on_list.clear()
class GetRankList:
    """Poll the rank-list endpoint to spot live users who just went live.

    Users found with a live room are collected in ``user_list`` and
    their rooms in ``room_id_list``; run() persists both to Redis.
    """

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        # Accumulators filled by get_rank_list() and emptied by run().
        self.user_list = []
        self.room_id_list = []

    def get_users(self):
        """Return the live users to poll.

        NOTE(review): the (0, 0) arguments presumably select live users
        whose stored status is 0, i.e. not streaming at the last check,
        which is what run()'s log message suggests. Confirm against
        RedisClient.
        """
        return self.redis_client.get_live_users(0, 0)

    def save_rooms(self):
        """Write every collected room id to Redis."""
        for room_id in self.room_id_list:
            self.redis_client.add_rooms(room_id)

    def change_user_status(self):
        """Re-add every user found live to the live-user set with value 1."""
        for sec_user_id in self.user_list:
            self.redis_client.add_live_users(sec_user_id, 1)

    def get_rank_list(self, sec_user_id):
        """Fetch one user's rank list and record their room if live.

        Args:
            sec_user_id: User identifier; concatenated into log messages,
                so it is expected to be a ``str``.

        Returns:
            Always ``None``; results land in the accumulator lists.
        """
        try:
            raw_data = self.get_raw_data.get_rank_list(sec_user_id)
        except Exception as e:
            # str(e) rather than e.args[0]: args can be empty or hold a
            # non-string, which would make this handler raise itself.
            logger.error('get_rank_list出错-' + str(e) +
                         '-sec_user_id-' + sec_user_id)
            return None
        try:
            user = raw_data.get('data').get('anchor_info').get('user')
            own_room = user.get('own_room')
        except Exception as e:
            logger.error('parse_current_room出错' + str(e) +
                         '-sec_user_id-' + sec_user_id)
            return None
        # A populated own_room means the stream has started. An own_room
        # without room ids is treated as not live; the bare [0] lookup
        # used to raise in that case.
        room_ids = own_room.get('room_ids_str') if own_room else None
        if room_ids:
            self.user_list.append(sec_user_id)
            self.room_id_list.append(room_ids[0])

    def run(self):
        """Poll all fetched users in greenlet batches of 50, then persist.

        NOTE(review): persistence and the summary log are placed after
        the batch loop; confirm they were not meant to run per batch.
        """
        users = self.get_users()
        logger.info('共有未在直播的users-' + str(len(users)))
        # Original author's note: batches of 80 worked against this
        # endpoint (about 4000 users in total); larger was not tried.
        batch_size = 50
        for start in range(0, len(users), batch_size):
            stop = min(start + batch_size, len(users))
            logger.info('当前获取用户序号-' + str(start + 1) + '-' + str(stop))
            tasks = [
                gevent.spawn(self.get_rank_list, sec_user_id)
                for sec_user_id in users[start:stop]
            ]
            gevent.joinall(tasks)
        self.save_rooms()
        self.change_user_status()
        logger.info('新增room_id-' + str(len(self.room_id_list)))
        self.room_id_list.clear()
        self.user_list.clear()