class CheckQualificationByRankList():
    """Check stored users through the rank-list endpoint and record live rooms.

    For every user returned by ``get_users`` the rank-list payload is
    fetched; users whose nickname contains a banned keyword are deleted,
    users that expose an ``own_room`` are collected as live together with
    their first room id, and the user's score is increased after each check.
    """

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        # Users found live, and their room ids, since the last flush.
        self.live_user_list = []
        self.room_id_list = []
        self.stupid_key_words = STUPID_KEY_WORDS

    def get_users(self):
        """Return the users to check, as provided by ``redis_client.get_users``."""
        # Original author's note: each call fetches the 10000 lowest-scored users.
        return self.redis_client.get_users()

    def is_live_user(self, sec_user_id):
        """Return ``redis_client.is_live_user`` for ``sec_user_id``."""
        return self.redis_client.is_live_user(sec_user_id)

    def save_rooms(self):
        """Pass every collected room id to ``redis_client.add_rooms``."""
        for room_id in self.room_id_list:
            self.redis_client.add_rooms(room_id)

    def add_to_live_users(self):
        """Pass every collected live user to ``redis_client.add_live_users``."""
        for sec_user_id in self.live_user_list:
            self.redis_client.add_live_users(sec_user_id, 1)

    def increase_user_score(self, sec_user_id):
        """Delegate to ``redis_client.increase_user_score``."""
        self.redis_client.increase_user_score(sec_user_id)

    def is_qualified_user(self, user):
        """Return False when the user's nickname contains a banned keyword.

        A missing or ``None`` nickname cannot contain a keyword, so such a
        user is treated as qualified instead of raising ``TypeError``.
        """
        nickname = user.get('nickname') or ''
        return not any(word in nickname for word in self.stupid_key_words)

    def get_rank_list(self, sec_user_id):
        """Check one user and update the in-memory live/room lists.

        Always returns ``None``; results are recorded on the instance and
        through ``redis_client``. Fetch and parse failures are logged and
        leave the user untouched.
        """
        # Users already flagged live are not fetched again, only re-scored.
        if self.is_live_user(sec_user_id):
            self.increase_user_score(sec_user_id)
            return None

        try:
            raw_data = self.get_raw_data.get_rank_list(sec_user_id)
        except Exception as e:
            # str(e) instead of e.args[0]: args may be empty or non-string,
            # which would make the handler itself raise.
            logger.error('get_rank_list出错-' + str(e)
                         + '-sec_user_id-' + sec_user_id)
            return None

        try:
            user = raw_data.get('data').get('anchor_info').get('user')
            if user is None:
                raise ValueError('user missing')
        except Exception as e:
            logger.error('parse_current_room出错' + str(e)
                         + '-sec_user_id-' + sec_user_id)
            return None

        if not self.is_qualified_user(user):
            self.redis_client.delete_users(sec_user_id)
            logger.info('删除user-sec_user_id-' + sec_user_id)
            return None

        # Original author's note: the presence of own_room means a live
        # stream has started.
        own_room = user.get('own_room')
        if own_room:
            room_ids = own_room.get('room_ids_str') or []
            if room_ids:
                self.live_user_list.append(sec_user_id)
                self.room_id_list.append(room_ids[0])
        self.increase_user_score(sec_user_id)
        return None

    def run(self):
        """Check all users in batches of concurrent greenlets.

        After each batch the collected rooms and live users are written
        through ``redis_client`` and the in-memory lists are emptied.
        """
        users = self.get_users()
        # Original author's note: batches of 80 worked for this endpoint
        # (about 4000 users in total); larger batches were never tried.
        batch_size = 50
        for start in range(0, len(users), batch_size):
            stop = min(start + batch_size, len(users))
            logger.info('当前获取用户序号-' + str(start + 1) + '-' + str(stop))
            tasks = [gevent.spawn(self.get_rank_list, sec_user_id)
                     for sec_user_id in users[start:stop]]
            gevent.joinall(tasks)
            self.save_rooms()
            self.add_to_live_users()
            logger.info('新增room_id-' + str(len(self.room_id_list)))
            self.room_id_list.clear()
            self.live_user_list.clear()
class SaveLiveUsers():
    """One-off import/export jobs for scraped live-stream records.

    The ``run_*`` methods are independent jobs that read hard-coded input
    files or MySQL tables and write to MySQL, CSV or Excel.
    """

    def __init__(self):
        self.get_raw_data = GetRawData()
        # An earlier revision connected to a remote MySQL instance (db
        # 'bxdb'); the local 'bxmind' database is used now. Connection
        # settings are hard-coded here.
        self.db = pymysql.connect(host='localhost', port=3306, user='******',
                                  password='******', db='bxmind',
                                  charset='utf8mb4')
        self.cursor = self.db.cursor()
        self.mysql_client = MysqlClient()
        self.redis_client = RedisClient()
        self.stupid_key_words = STUPID_KEY_WORDS
        # a_list: rows loaded by select_users; b_list: rows kept by get_cates.
        self.a_list = []
        self.b_list = []

    @staticmethod
    def _read_json_lines(path):
        """Return the JSON value parsed from every non-blank line of ``path``."""
        with open(path, 'r', encoding='utf-8') as f:
            return [json.loads(line) for line in f if line.strip()]

    @staticmethod
    def _extract_mobile(text):
        """Return the 11-digit number starting with 1 found in ``text``, or None.

        The leading greedy ``.*`` means the last such number wins when the
        text contains several.
        """
        found = re.match(r'.*(1\d{10}).*', str(text), re.S)
        return found.group(1) if found else None

    def into_mysql(self, data, table):
        """Insert the dict ``data`` as one row of ``table`` and commit.

        Column names and the table name are interpolated into the SQL text
        (identifiers cannot be bound as parameters), so they must come from
        a trusted source; values are bound as parameters. Failures are
        logged and the transaction is rolled back; nothing is raised.
        """
        columns = ','.join(data.keys())
        placeholders = ','.join(['%s'] * len(data))
        sql = 'insert into %s (%s) values (%s)' % (table, columns, placeholders)
        try:
            self.cursor.execute(sql, tuple(data.values()))
            self.db.commit()
        except Exception as e:
            logger.error('into_mysql出错-' + str(e) + '-table-' + str(table))
            try:
                self.db.rollback()
            except Exception:
                # Best effort: the insert stays non-fatal even when the
                # connection itself is unusable.
                pass

    def is_qualified(self, nickname):
        """Return False when ``nickname`` contains a banned keyword.

        ``None`` is treated as an empty nickname instead of raising.
        """
        nickname = nickname or ''
        return not any(word in nickname for word in self.stupid_key_words)

    def run_a(self):
        """Load status-4 records from lives_20200614.txt into dy_live_lives.

        Status-4 records with a banned nickname are removed through
        ``redis_client`` instead of being inserted.
        """
        table = 'dy_live_lives'
        for data in self._read_json_lines('lives_20200614.txt'):
            if data.get('status') != 4:
                continue
            if self.is_qualified(data.get('nickname')):
                self.into_mysql(data, table)
            else:
                sec_user_id = data.get('sec_user_id')
                self.redis_client.delete_users(sec_user_id)
                self.redis_client.delete_live_users(sec_user_id)

    def replicate_table(self):
        """Create table dy_sample with the structure of dy_live_lives."""
        self.cursor.execute('CREATE TABLE dy_sample LIKE dy_live_lives')

    def select_users(self):
        """Append to ``a_list`` every dy_live_lives row above the thresholds.

        Each row becomes a dict keyed by the selected column names.
        """
        columns = ('room_id', 'sec_user_id', 'nickname', 'short_id',
                   'total_viewer', 'like_count', 'follower_count',
                   'signature', 'city')
        sql = 'SELECT room_id, sec_user_id, nickname, short_id, total_viewer, like_count, follower_count, signature, city FROM dy_live_lives WHERE total_viewer > 50000 AND follower_count > 500000'
        self.cursor.execute(sql)
        row = self.cursor.fetchone()
        while row:
            self.a_list.append(dict(zip(columns, row)))
            row = self.cursor.fetchone()

    def get_cates(self, data):
        """Append ``data`` to ``b_list`` when the user sells food items.

        A user matches when any shop category named 零食/食品/花茶/果茶 has a
        count of at least 3. Fetch failures are logged and skipped.
        """
        sec_user_id = data['sec_user_id']
        try:
            cates_raw_data = self.get_raw_data.get_cates(sec_user_id)
        except Exception as e:
            logger.error('get_cates出错-' + str(e)
                         + '-sec_user_id-' + sec_user_id)
            return None
        # A payload without categories is treated as "no categories".
        cate_list = cates_raw_data.get('user_shop_categories') or []
        for each in cate_list:
            if each['name'] in ['零食', '食品', '花茶', '果茶'] and each['count'] >= 3:
                self.b_list.append(data)
                break
        return None

    def run_b(self):
        """Copy selected users that pass ``get_cates`` into dy_sample."""
        self.select_users()
        logger.info('a_list共有数据-' + str(len(self.a_list)))
        batch_size = 100
        for start in range(0, len(self.a_list), batch_size):
            stop = min(start + batch_size, len(self.a_list))
            logger.info('当前爬取用户序号-' + str(start + 1) + '-' + str(stop))
            tasks = [gevent.spawn(self.get_cates, data)
                     for data in self.a_list[start:stop]]
            gevent.joinall(tasks)
        logger.info('b_list共有数据-' + str(len(self.b_list)))
        for data in self.b_list:
            self.into_mysql(data, 'dy_sample')

    def run_c(self):
        """Copy every selected user into dy_sample without category filtering."""
        self.select_users()
        for data in self.a_list:
            self.into_mysql(data, 'dy_sample')

    def select_rooms(self):
        """Return the room_id of every row in dy_sample."""
        room_list = []
        self.cursor.execute('SELECT room_id FROM dy_sample')
        row = self.cursor.fetchone()
        while row:
            room_list.append(row[0])
            row = self.cursor.fetchone()
        return room_list

    def get_txt(self):
        """Print the first line of lives_20200605.txt (debug helper)."""
        with open('lives_20200605.txt', 'r', encoding='utf-8') as f:
            first_line = f.readline()
        if first_line:
            print(first_line)

    def write_to_file(self, item_list):
        """Append the string ``item_list`` as one line of today's sample file."""
        today = time.strftime('%Y%m%d', time.localtime())
        path = FILE_DIRECTORY + '/' + 'item_lists_sample' + '_' + today + '.txt'
        with open(path, 'a', encoding='utf-8') as file:
            file.write(item_list + '\n')

    def run_d(self):
        """Fetch a hard-coded Taobao item-detail URL and print the response body."""
        url = 'https://detailskip.taobao.com/service/getData/1/p1/item/detail/sib.htm?itemId=606547898363&sellerId=2206709156233&modules=dynStock,qrcode,viewer,price,duty,xmpPromotion,delivery,activity,fqg,zjys,couponActivity,soldQuantity,page,originalPrice,tradeContract&callback=onSibRequestSuccess'
        headers = {
            'Referer': 'https://item.taobao.com/item.htm?id=606547898363',
            'Sec-Fetch-Mode': 'no-cors',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'
        }
        response = requests.get(url, headers=headers, allow_redirects=False)
        print(response.text)

    def _flush_batch(self, loop, batch, table):
        """Insert every record of ``batch`` through ``mysql_client`` and empty it."""
        if not batch:
            # asyncio.wait raises ValueError on an empty collection.
            return
        # Wrap each awaitable explicitly: asyncio.wait no longer accepts
        # bare coroutine objects on Python 3.11+.
        tasks = [
            asyncio.ensure_future(
                self.mysql_client.into_mysql(loop, item, table), loop=loop)
            for item in batch
        ]
        loop.run_until_complete(asyncio.wait(tasks))
        batch.clear()

    def run_e(self):
        """Load status-4 records from lives_20200609.txt in async batches.

        Records with a banned nickname are removed through ``redis_client``.
        The ``mobile`` field is dropped before insertion. The final partial
        batch is flushed after the loop so no trailing records are lost.
        """
        path = 'lives_20200609.txt'
        table = 'dy_live_lives'
        batch_size = 200
        data_batch = []
        loop = asyncio.get_event_loop()
        loop.run_until_complete(
            loop.create_task(self.mysql_client.connect_mysql(loop)))
        for record in self._read_json_lines(path):
            if not self.is_qualified(record.get('nickname')):
                self.redis_client.delete_users(record.get('sec_user_id'))
                self.redis_client.delete_live_users(record.get('sec_user_id'))
                print('删除user', record.get('nickname'))
                continue
            # Original author's note: status is the integer 4; comparing
            # against the string '4' once caused nothing to be stored.
            if record.get('status') != 4:
                continue
            record.pop('mobile', None)
            data_batch.append(record)
            if len(data_batch) >= batch_size:
                self._flush_batch(loop, data_batch, table)
        self._flush_batch(loop, data_batch, table)

    def run_f(self):
        """Append qualified status-4 records of lives_20200623.txt to a CSV.

        Column names are taken from the first record (minus ``mobile``);
        no header row is written. An empty input file is a no-op.
        """
        file_a = 'lives_20200623.txt'
        file_b = '第四批_抖音主播_去重前.csv'
        records = self._read_json_lines(file_a)
        if not records:
            return
        keys = [key for key in records[0] if key != 'mobile']
        with open(file_b, 'a', encoding='utf-8-sig', newline='') as g:
            writer = csv.DictWriter(g, fieldnames=keys)
            for record in records:
                if not self.is_qualified(record.get('nickname')):
                    continue
                if record.get('status') != 4:
                    continue
                record.pop('mobile', None)
                writer.writerow(record)

    def run_g(self):
        """Extract a mobile number from column 15 of each row into a new column.

        Reads Sheet1 of the source workbook, writes the number found in each
        data row (row 0 is skipped) to the first column past the existing
        ones, and saves the result under a new file name.
        """
        read_workbook = xlrd.open_workbook('C:/Users/百芯科技/scraping/douyin7/第四批_抖音主播_去重后.xlsx')
        # NOTE(review): `copy` is presumably xlutils.copy.copy - confirm
        # against the file's imports.
        write_workbook = copy(read_workbook)
        read_sheet = read_workbook.sheet_by_name('Sheet1')
        write_sheet = write_workbook.get_sheet(0)
        ncolumns = read_sheet.ncols
        for row in range(1, read_sheet.nrows):
            text = read_sheet.row(row)[15].value
            if not text:
                continue
            mobile = self._extract_mobile(text)
            if mobile:
                write_sheet.write(row, ncolumns, mobile)
        write_workbook.save('C:/Users/百芯科技/scraping/douyin7/第四批_抖音主播_去重后_电话.xlsx')
class GetPromotions():
    """Fetch each user's promoted products and append them to text files."""

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        # aweme ids of video promotions seen since the last save_awemes().
        self.aweme_id_list = []

    def get_users(self):
        """Return the users provided by ``redis_client.get_users``."""
        return self.redis_client.get_users()

    def get_promotions(self, sec_user_id):
        """Fetch, parse and persist the promotions of one user.

        Fetch and parse failures are logged and skipped. Nothing is written
        when ``parse_promotions`` returns ``None`` (user without promotions),
        so the output file never receives a literal ``null`` line.
        """
        try:
            raw_data = self.get_raw_data.get_promotions(sec_user_id)
        except Exception as e:
            # str(e) instead of e.args[0]: args may be empty or non-string.
            logger.error('get_promotions出错-' + str(e)
                         + '-sec_user_id-' + sec_user_id)
            return None
        try:
            promotions = self.parse_promotions(raw_data, sec_user_id)
        except Exception as e:
            # The original concatenated the e.args tuple to a str here,
            # which raised TypeError inside the handler.
            logger.error('parse_promotions错误-' + str(e)
                         + '-sec_user_id-' + sec_user_id)
            return None
        if promotions is None:
            return None
        self.write_to_file(json.dumps(promotions, ensure_ascii=False))
        return None

    def parse_promotions(self, raw_data, sec_user_id):
        """Convert the raw payload into a list of promotion dicts.

        Returns ``None`` (after deleting the user through ``redis_client``)
        when the payload carries an empty promotion list. Prices are divided
        by 100. Video promotions also record their aweme id on the instance.
        """
        data = raw_data.get('promotions')
        # Only an explicit empty list triggers deletion. A missing key makes
        # the loop below raise, so the caller reports a parse error and the
        # user is kept.
        if data == []:
            logger.info('该用户不带货,将删除,sec_user_id-' + sec_user_id)
            self.redis_client.delete_users(sec_user_id)
            return None

        promotions = []
        for each in data:
            promotion = {
                'sec_user_id': sec_user_id,
                'price': each.get('price') / 100,
                'cover_url': each.get('images')[0].get('url_list')[0],
                'title': each.get('title'),
                'product_id': each.get('product_id'),
                'product_url': each.get('detail_url'),
                'min_price': str(int(each.get('min_price')) / 100),
                'douyin_sales': each.get('sales'),
                'product_source': each.get('goods_source'),
                'create_time': str(int(time.time())),
            }
            if each.get('market_price'):
                promotion['market_price'] = each.get('market_price') / 100
            if each.get('last_aweme_id'):
                promotion['promotion_type'] = 'video'
                promotion['aweme_id'] = int(each.get('last_aweme_id'))
                self.aweme_id_list.append(promotion['aweme_id'])
            else:
                promotion['promotion_type'] = 'picture'
            taobao = each.get('taobao')
            if taobao:
                coupon = taobao.get('coupon')
                if coupon:
                    promotion['coupon_amount'] = coupon.get('coupon_amount')
                    promotion['price_after_coupon'] = (
                        promotion['price'] - float(promotion['coupon_amount']))
                    promotion['coupon_url'] = coupon.get('coupon_web_url')
            promotions.append(promotion)
        return promotions

    def write_to_file(self, promotions):
        """Append the string ``promotions`` to one of 100 randomly chosen files."""
        # The file suffix is a random number in 0..99, spreading the output
        # over up to 100 files.
        shard = str(random.choice(range(100)))
        path = FILE_DIRECTORY + '/' + 'promotions' + '_' + shard + '.txt'
        with open(path, 'a', encoding='utf-8') as file:
            file.write(promotions + '\n')

    def save_awemes(self):
        """Pass every collected aweme id to ``redis_client.add_awemes``."""
        for aweme_id in self.aweme_id_list:
            self.redis_client.add_awemes(aweme_id)

    def run(self):
        """Process all users in batches of concurrent greenlets.

        Collected aweme ids are saved and cleared after each batch.
        """
        users = self.get_users()
        logger.info('共有用户数量:' + str(len(users)))
        # Original author's note: even though requests are concurrent,
        # batches of 200 were as slow as running synchronously.
        batch_size = 50
        for start in range(0, len(users), batch_size):
            stop = min(start + batch_size, len(users))
            logger.info('get_promotions爬取当前用户序号-' + str(start + 1)
                        + '-' + str(stop))
            tasks = [
                gevent.spawn(self.get_promotions, sec_user_id)
                for sec_user_id in users[start:stop]
            ]
            gevent.joinall(tasks)
            self.save_awemes()
            self.aweme_id_list.clear()
class GetUserProfile():
    """Fetch user profiles, keep qualified ones and append them to a file."""

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        self.stupid_key_words = STUPID_KEY_WORDS

    def get_users(self):
        """Return the users provided by ``redis_client.test_b``."""
        return self.redis_client.test_b()

    def get_user_profile(self, sec_user_id):
        """Fetch, parse and persist the profile of one user.

        Fetch and parse failures are logged and skipped. When a profile is
        produced it is written to today's file and ``redis_client.test_c``
        is called for the user.
        """
        try:
            raw_data = self.get_raw_data.get_user_profile(sec_user_id)
        except Exception as e:
            # str(e) instead of e.args[0]: args may be empty or non-string,
            # which would make the handler itself raise.
            logger.error('get_user_profile出错-' + str(e)
                         + '-sec_user_id-' + sec_user_id)
            return None
        try:
            user_profile = self.parse_user_profile(raw_data)
        except Exception as e:
            logger.error('parse_user_profile出错-' + str(e)
                         + '-sec_user_id-' + sec_user_id)
            return None
        if user_profile:
            self.write_to_file(json.dumps(user_profile, ensure_ascii=False))
            self.redis_client.test_c(sec_user_id)
        return None

    def is_qualified_user(self, user):
        """Return True when the user passes every filter.

        Rejected: a nickname containing a banned keyword, government/media
        accounts, accounts whose enterprise or custom verification field is
        not the empty string, and accounts lacking any of the three
        commerce flags. A missing or ``None`` nickname is treated as empty.
        """
        nickname = user.get('nickname') or ''
        if any(word in nickname for word in self.stupid_key_words):
            return False
        if user.get('is_gov_media_vip'):
            return False
        # NOTE(review): a missing key yields None, which also differs from
        # '' and therefore rejects the user - confirm this is intended.
        if user.get('enterprise_verify_reason') != '':
            return False
        if user.get('custom_verify') != '':
            return False
        required_flags = ('with_fusion_shop_entry', 'live_commerce',
                          'with_commerce_entry')
        return all(user.get(flag) for flag in required_flags)

    def parse_user_profile(self, raw_data):
        """Build the profile dict for a qualified user.

        Returns ``None`` after deleting the user through ``redis_client``
        when the user is not qualified. Raises when the payload lacks the
        ``user`` object or a nested field; the caller logs that as a parse
        error.
        """
        data = raw_data.get('user')
        if data is None:
            raise ValueError('user missing')
        sec_uid = data.get('sec_uid')

        if not self.is_qualified_user(data):
            self.redis_client.delete_users(sec_uid)
            logger.info('删除user-sec_user_id-' + sec_uid)
            return None

        # Key order is preserved so the serialized output keeps its layout.
        user_profile = {'sec_uid': sec_uid}
        for key in ('follower_count', 'nickname', 'gender', 'location',
                    'birthday'):
            user_profile[key] = data.get(key)
        user_profile['avatar_url'] = data.get('avatar_larger').get('url_list')[0]
        for key in ('school_name', 'signature', 'uid', 'short_id',
                    'unique_id'):
            user_profile[key] = data.get(key)
        user_profile['star_atlas'] = data.get('commerce_user_info').get('star_atlas')
        for key in ('aweme_count', 'dongtai_count', 'following_count',
                    'favoriting_count', 'total_favorited', 'live_commerce'):
            user_profile[key] = data.get(key)
        user_profile['create_time'] = str(int(time.time()))
        return user_profile

    def write_to_file(self, user_profile):
        """Append the string ``user_profile`` as one line of today's file."""
        today = time.strftime('%Y%m%d', time.localtime())
        path = FILE_DIRECTORY + '/' + 'user_profiles' + '_' + today + '.txt'
        with open(path, 'a', encoding='utf-8') as file:
            file.write(user_profile + '\n')

    def run(self):
        """Process all users in batches of concurrent greenlets."""
        users = self.get_users()
        logger.info('共有users-' + str(len(users)))
        # Original author's note: batches of 50 returned no data.
        batch_size = 1
        for start in range(0, len(users), batch_size):
            stop = min(start + batch_size, len(users))
            logger.info('当前爬取用户序号-' + str(start + 1) + '-' + str(stop))
            tasks = [
                gevent.spawn(self.get_user_profile, sec_user_id)
                for sec_user_id in users[start:stop]
            ]
            gevent.joinall(tasks)
class CheckRooms():
    """Re-check stored rooms and flag those that are live and selling goods."""

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        # Room ids found live with promotions since the last flush.
        self.lives_on_list = []

    def get_rooms(self):
        """Return ``redis_client.get_rooms(0, 0)``."""
        return self.redis_client.get_rooms(0, 0)

    def change_room_status(self):
        """Call ``redis_client.add_rooms(room_id, 1)`` for every collected room."""
        for room_id in self.lives_on_list:
            self.redis_client.add_rooms(room_id, 1)

    def check_room(self, room_id):
        """Inspect one room and record it when it is live with promotions.

        Owners with fewer than 10000 followers have both the user and the
        room deleted through ``redis_client``. For other owners, a room with
        status 2 whose item list holds at least one promotion is appended to
        ``lives_on_list``. Fetch and parse failures are logged and skipped.
        """
        try:
            room_raw_data = self.get_raw_data.get_live(room_id)
        except Exception as e:
            # str(e) instead of e.args[0]: args may be empty or non-string,
            # which would make the handler itself raise.
            logger.error('get_live出错-' + str(e) + '-room_id-' + room_id)
            return None

        try:
            room = room_raw_data.get('data')
            owner = room.get('owner')
            follower_count = owner.get('follow_info').get('follower_count')
            sec_user_id = owner.get('sec_uid')
            if follower_count is None:
                # Without a count the comparison below would raise outside
                # this handler, so it is reported as a parse error instead.
                raise ValueError('follower_count missing')
        except Exception as e:
            logger.error('解析room_raw_data出错-' + str(e)
                         + '-room_id-' + room_id)
            return None

        if follower_count < 10000:
            self.redis_client.delete_users(sec_user_id)
            self.redis_client.delete_rooms(room_id)
            return None

        if room.get('status') != 2:
            return None

        # Check whether this live stream is selling goods.
        try:
            item_list_raw_data = self.get_raw_data.get_item_list(
                sec_user_id, room_id)
        except Exception as e:
            logger.error('get_item_list出错' + str(e)
                         + '-sec_user_id和room_id-' + sec_user_id
                         + '-' + room_id)
            return None
        # A missing/None payload or promotion list counts as "no promotions".
        if (item_list_raw_data or {}).get('promotions'):
            self.lives_on_list.append(room_id)
        return None

    def run(self):
        """Check all rooms in batches of concurrent greenlets.

        After each batch the rooms found live are written through
        ``redis_client`` and the in-memory list is emptied.
        """
        all_room_ids = self.get_rooms()
        logger.info('此前未在直播的直播间数量:' + str(len(all_room_ids)))
        batch_size = 200
        for start in range(0, len(all_room_ids), batch_size):
            stop = min(start + batch_size, len(all_room_ids))
            logger.info('待查看的此前未在直播的直播间-' + str(start + 1)
                        + '-' + str(stop))
            tasks = [
                gevent.spawn(self.check_room, room_id)
                for room_id in all_room_ids[start:stop]
            ]
            gevent.joinall(tasks)
            logger.info('新发现开始的直播数量-' + str(len(self.lives_on_list)))
            self.change_room_status()
            self.lives_on_list.clear()