class GetUsers():
    """Crawl category listing pages and store the author ids found on them.

    For every (category, page) pair, the raw page is fetched through
    ``GetRawData.get_users``, the ``sec_uid`` of each listed author is
    extracted, and each id is handed to ``RedisClient.add_users``.
    """

    # Crawl bounds used by ``run``: category ids -1..14, pages 0..99.
    CATEGORY_IDS = range(-1, 15)
    PAGES_PER_CATEGORY = 100

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()

    def get_users(self, category_id, page):
        """Fetch one listing page, parse it and save the ids it contains.

        A failing fetch is logged and swallowed so that one bad page does not
        abort the other pages of the batch.

        Args:
            category_id: Category identifier (``run`` passes it as a string).
            page: Page index (``run`` passes it as a string).
        """
        try:
            raw_data = self.get_raw_data.get_users(category_id, page)
        except Exception as e:
            # ``e.args`` is a tuple, so concatenating it to a str raised
            # TypeError inside this handler; format the values instead.
            logger.error('get_users错误-{}-category_id-{}-page-{}'.format(
                e, category_id, page))
            return None
        if raw_data:
            self.save_to_redis(self.parse_users(raw_data))

    def parse_users(self, raw_data):
        """Return the ``sec_uid`` of every author listed in ``raw_data``.

        Args:
            raw_data: Mapping with an ``aweme_list`` sequence whose items
                carry ``author -> sec_uid``.

        Returns:
            A list of ids in page order. Items without an author or without
            a ``sec_uid`` are skipped; a missing ``aweme_list`` gives ``[]``.
        """
        sec_user_id_list = []
        for item in raw_data.get('aweme_list') or []:
            author = (item or {}).get('author') or {}
            sec_user_id = author.get('sec_uid')
            if sec_user_id:
                sec_user_id_list.append(sec_user_id)
        return sec_user_id_list

    def save_to_redis(self, sec_user_id_list):
        """Pass every id in ``sec_user_id_list`` to ``redis_client.add_users``."""
        for sec_user_id in sec_user_id_list:
            self.redis_client.add_users(sec_user_id)

    def run(self):
        """Crawl every category, fetching all of its pages concurrently.

        Categories are processed one after another; within a category one
        gevent greenlet is spawned per page and all are joined before the
        next category starts.
        """
        for cate in self.CATEGORY_IDS:
            logger.info('get_users当前爬取cate-' + str(cate))
            tasks = [
                gevent.spawn(self.get_users, str(cate), str(page))
                for page in range(self.PAGES_PER_CATEGORY)
            ]
            gevent.joinall(tasks)
class GetRooms():
    """Collect room ids and owner ids from the channel feed.

    The channel feed is fetched through ``GetRawData.get_channel``. Rooms
    whose owner has at least ``MIN_FOLLOWERS`` followers and whose item list
    contains promotions are accumulated on the instance and then written out
    through ``RedisClient.add_rooms`` / ``RedisClient.add_users``.
    """

    # Minimum follower count an owner needs for the room to be inspected.
    MIN_FOLLOWERS = 10000

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        # Accumulators filled by ``parse_channel`` and cleared by ``run``.
        self.sec_user_id_list = []
        self.room_id_list = []

    def get_channel(self):
        """Fetch the channel feed, parse it and persist the collected ids.

        Fetch and parse failures are logged and end the call without
        writing anything.
        """
        try:
            channel_raw_data = self.get_raw_data.get_channel()
        except Exception as e:
            # ``str(e)`` rather than ``e.args[0]``: the latter raises for
            # exceptions with no arguments or a non-string first argument.
            logger.error('get_channel出错-' + str(e))
            return None
        try:
            self.parse_channel(channel_raw_data)
        except Exception as e:
            logger.error('parse_channel出错-' + str(e))
            return None
        for room_id in self.room_id_list:
            self.redis_client.add_rooms(room_id)
        for sec_user_id in self.sec_user_id_list:
            # NOTE(review): the meaning of the second argument (1) is defined
            # by RedisClient.add_users, which is not visible here.
            self.redis_client.add_users(sec_user_id, 1)

    def parse_channel(self, channel_raw_data):
        """Append qualifying rooms and their owners to the instance lists.

        Entries that lack a follower count are skipped instead of failing
        the whole feed. If fetching an item list fails, the error is logged
        and parsing stops; ids collected before the failure are kept.

        Args:
            channel_raw_data: Mapping with a ``data`` sequence; each entry
                carries ``data -> id_str`` and
                ``data -> owner -> sec_uid / follow_info -> follower_count``.
        """
        for entry in channel_raw_data.get('data') or []:
            room = (entry or {}).get('data') or {}
            owner = room.get('owner') or {}
            room_id = room.get('id_str')
            sec_user_id = owner.get('sec_uid')
            follower = (owner.get('follow_info') or {}).get('follower_count')
            # A missing count cannot be compared against the threshold.
            if follower is None or follower < self.MIN_FOLLOWERS:
                continue
            try:
                item_list = self.get_raw_data.get_item_list(
                    sec_user_id, room_id)
            except Exception as e:
                logger.error('get_item_list出错-' + str(e))
                return None
            # Only rooms that actually list promotions are of interest.
            if (item_list or {}).get('promotions'):
                self.room_id_list.append(room_id)
                self.sec_user_id_list.append(sec_user_id)

    def run(self):
        """Run one fetch in a gevent greenlet, log the counts and reset."""
        gevent.joinall([gevent.spawn(self.get_channel)])
        logger.info('本批次共获得room_id和sec_user_id-' +
                    str(len(self.sec_user_id_list)) + '-' +
                    str(len(self.room_id_list)))
        self.sec_user_id_list.clear()
        self.room_id_list.clear()
class CheckQualificationByPromotion():
    """Promote candidate users ("pre_users") that list enough promotions.

    Candidates are drawn in batches through ``RedisClient.get_pre_users``.
    For each one the promotions page is fetched through
    ``GetRawData.get_promotions``; candidates with more than
    ``MIN_PROMOTIONS`` promotions are passed to ``RedisClient.add_users``.
    """

    # A candidate qualifies only with strictly more promotions than this.
    MIN_PROMOTIONS = 10

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        self.batch_size = 50

    def get_pre_users(self):
        """Return up to ``batch_size`` candidates that are not yet users.

        Returns:
            A list of candidate ids. It can be shorter than ``batch_size``
            when the client stops yielding candidates.
        """
        batch = []
        while len(batch) < self.batch_size:
            pre_user = self.redis_client.get_pre_users()
            if not pre_user:
                # Assumes the client returns a falsy value once no candidate
                # is left (TODO confirm against RedisClient). Without this
                # guard the batch was padded with empty entries, or the loop
                # never ended.
                break
            # Keep only candidates that are not already in the user table.
            if not self.is_user(pre_user):
                batch.append(pre_user)
        return batch

    def count_pre_users(self):
        """Return ``redis_client.count_pre_users()``."""
        return self.redis_client.count_pre_users()

    def is_user(self, sec_user_id):
        """Return ``redis_client.is_user(sec_user_id)``."""
        return self.redis_client.is_user(sec_user_id)

    def check_qualification_by_promotion(self, sec_user_id):
        """Fetch a candidate's promotions page and add the user if it qualifies.

        Fetch and parse failures are logged and swallowed so one bad
        candidate does not affect the rest of the batch.

        Args:
            sec_user_id: Candidate identifier passed to ``get_promotions``.
        """
        try:
            raw_data = self.get_raw_data.get_promotions(sec_user_id)
        except Exception as e:
            # Format instead of concatenating ``e.args[0]`` and the id: both
            # may be missing or non-string, which made this handler raise.
            logger.error('get_promotions出错-{}-sec_user_id-{}'.format(
                e, sec_user_id))
            return None
        try:
            # Probe the first column: this raises unless the page was really
            # retrieved, which routes bad responses to the handler below.
            raw_data.get('columns')[0].get('name')
            promotions = raw_data.get('promotions')
            # Page retrieved and it lists more promotions than the threshold.
            if len(promotions) > self.MIN_PROMOTIONS:
                self.redis_client.add_users(sec_user_id)
        except Exception as e:
            logger.error('解析promotions页面失败-sec_user_id-{}-{}'.format(
                sec_user_id, e))

    def run(self):
        """Process one batch of candidates, or exit when none are left.

        One gevent greenlet is spawned per candidate and all are joined.
        When ``count_pre_users()`` is not positive the process is terminated
        with ``sys.exit()``.
        """
        if self.count_pre_users() > 0:
            batch = self.get_pre_users()
            tasks = [
                gevent.spawn(self.check_qualification_by_promotion,
                             sec_user_id) for sec_user_id in batch
            ]
            gevent.joinall(tasks)
        else:
            logger.info('pre_users列表空了,程序退出')
            sys.exit()