            if barrage_info.get("type") == "NORMAL":
                temp_time = int(label["data-ts"])
                if temp_time >= update_time:  # only keep barrages not older than the last processed timestamp
                    update_time = temp_time
                    barrage_info["user_name"] = label["data-uname"]
                    barrage_info["user_id"] = label["data-uid"]
                    barrage_info["content"] = label["data-danmaku"]
                    barrage_list.append(barrage_info)

        mysql.insert_pure(table_name, barrage_list)  # write the newly collected barrages to MySQL

        # Update the running statistics and throttle to roughly one poll every 0.5 seconds
        total_num += 1
        total_time += 1000 * (time.time() - start_time)
        wait_time = 0.5
        if wait_time > (time.time() - start_time):
            time.sleep(wait_time - (time.time() - start_time))
        data_id_max += len(barrage_list)
        print("本次时间范围内新增弹幕:", len(barrage_list), "条,", "(共计:", data_id_max, ")", "|",
              "运行时间:", round(total_time / total_num), "毫秒",
              "(", round(total_time), "/", total_num, ")")


if __name__ == "__main__":
    crawler("20191110_LOL世界赛决赛(FPX vs G2)",
            "https://live.bilibili.com/blanc/6?liteVersion=true",
            tool.mysql_connect("Barrage"))
tweet_info["likes"] = tweet_likes tweet_list.append(tweet_info) # 向下滚动到最下面的一条推文 if last_label_tweet is not None: driver.execute_script("arguments[0].scrollIntoView();", last_label_tweet) # 滑动到推文标签 time.sleep(1) else: break return tweet_list if __name__ == "__main__": selenium = tool.open_chrome() # 打开Selenium控制的Chrome浏览器 mySQL = tool.mysql_connect("Huabang") # 构造MySQL数据库连接对象 if "Huabang" in env.DATA and "Media List" in env.DATA["Huabang"]: for media_item in env.DATA["Huabang"]["Media List"]: # if media_item[0] < 440: # continue print("开始抓取媒体:", media_item[1], "(", media_item[0], ")", "-", media_item[3], "(", media_item[2], ")") tweet_template = { "media_id": media_item[0], "media_name": media_item[1], "tweet_id": None, "is_retweet": 0, "time": None, "text": None, "replies": None, "retweets": None,
barrage_info["user_name"] = label.select_one( "li > div > span:nth-child(2)").text barrage_info["content"] = label.select_one( "li > div > div > span").text elif "msg-auditorSys" in category: # 处理msg-auditorSys类型提示(系统提示) barrage_info["type"] = "AS" barrage_info["other"] = label.text elif "msg-sys" in category: # 处理msg-sys类型提示(系统提示) barrage_info["type"] = "SY" barrage_info["other"] = label.text else: # 处理其他类型 barrage_info.update(type="OT", other="弹幕名称" + category) barrage_list.append(barrage_info) mysql.insert(table_name, barrage_list) total_num += 1 total_time += 1000 * (time.time() - start_time) wait_time = 0.5 if wait_time > (time.time() - start_time): time.sleep(0.5 - (time.time() - start_time)) print("本次时间范围内新增弹幕:", len(barrage_list), "条,", "(共计:", data_id_max, ")", "|", "运行时间:", round(total_time / total_num), "毫秒", "(", round(total_time), "/", total_num, ")") if __name__ == "__main__": crawler("神超", "https://www.huya.com/102411", tool.mysql_connect("Barrage"))
continue if "player_list" not in summoner_json["data"]: continue summoner_list = list() for summoner_item in summoner_json["data"]["player_list"]: if "tier_title" not in summoner_item: continue if "name" not in summoner_item: continue if "uuid" not in summoner_item: continue if "ranking" not in summoner_item: continue if "league_points" not in summoner_item: continue summoner_list.append({ "tier": summoner_item["tier_title"], "name": summoner_item["name"], "uuid": summoner_item["uuid"], "area": 1, "ranking": summoner_item["ranking"], "points": summoner_item["league_points"], "period": setting.PERIOD }) mysql.insert("summoner", summoner_list) time.sleep(3) if __name__ == "__main__": crawler(tool.mysql_connect("TFT"))
tool.console("报错", "未提取到热搜热度!") continue # 提取热搜标注 if label_icon := label_item.select_one("tr > td.td-03"): icon = label_icon.text if icon == "荐": # 跳过空热搜(广告热搜) empty_rank += 1 continue else: tool.console("报错", "未提取到热搜标注标签!") continue hot_list.append({ "ranking": ranking, "keyword": keyword, "heat": heat, "icon": icon }) # 将结果写入到数据库 if test: tool.console("测试", "准备写入数据:" + str(hot_list)) return True else: return mysql.insert(table_name=table_name, data=hot_list) if __name__ == "__main__": crawler(test=True, mysql=tool.mysql_connect("CxSpider"), table_name="weibo")
from apscheduler.schedulers.blocking import BlockingScheduler

import toolkit as tool
from weibo.hot_ranking import crawler as weibo_hot_ranking  # real-time Weibo hot-search ranking crawler

if __name__ == "__main__":
    # Open the MySQL database connection
    mysql = tool.mysql_connect("CxSpider")

    # Build the scheduler
    scheduler = BlockingScheduler()

    # Register the crawler jobs
    scheduler.add_job(weibo_hot_ranking, "interval", seconds=5 * 60,
                      kwargs={"test": True, "mysql": mysql, "table_name": "weibo"})  # Weibo hot ranking, every 5 minutes

    # Start the scheduler (blocks the main thread)
    scheduler.start()
tool.console("报错", "未能在推文反馈Json数据中找到的推文ID:" + str(item.get("tweet_id"))) return None feedback_item = feedback_dict[item.get("tweet_id")] item["tweet_url"] = feedback_item["result"]["data"]["feedback"][ "url"] # 提取:推文Url item["reaction"] = feedback_item["result"]["data"]["feedback"][ "reaction_count"]["count"] # 提取:推文点赞总数 item["comment"] = feedback_item["result"]["data"]["feedback"][ "comment_count"]["total_count"] # 提取:推文评论总数 item["share"] = feedback_item["result"]["data"]["feedback"]["share_count"][ "count"] # 提取:推文分享总数 if __name__ == "__main__": mysql_catalogue = tool.mysql_connect("Huabang(old)") # 构造MySQL数据库连接对象 mysql_saving = tool.mysql_connect("Huabang") # 构造MySQL数据库连接对象 time_start = int( time.mktime(time.strptime("2020-01-29 00:00:00", "%Y-%m-%d %H:%M:%S"))) # 开始时间 time_end = int( time.mktime(time.strptime("2020-01-29 23:59:59", "%Y-%m-%d %H:%M:%S"))) # 结束时间 # 读取榜单媒体名录中的Facebook账号Url列表 media_list = mysql_catalogue.select("media_list", ["media_id", "media_name", "media_fb"]) for media in media_list: # 判断媒体是否有Facebook用户主页Url