Example #1
            if barrage_info.get("type") == "NORMAL":
                temp_time = int(label["data-ts"])
                if temp_time >= update_time:
                    update_time = temp_time
                    barrage_info["user_name"] = label["data-uname"]
                    barrage_info["user_id"] = label["data-uid"]
                    barrage_info["content"] = label["data-danmaku"]
                    barrage_list.append(barrage_info)

        mysql.insert_pure(table_name, barrage_list)

        total_num += 1
        total_time += 1000 * (time.time() - start_time)

        # Throttle to one pass per 0.5 s; compute the remainder once so a
        # negative value never reaches time.sleep()
        wait_time = 0.5 - (time.time() - start_time)
        if wait_time > 0:
            time.sleep(wait_time)

        data_id_max += len(barrage_list)

        print("本次时间范围内新增弹幕:", len(barrage_list), "条,", "(共计:", data_id_max,
              ")", "|", "运行时间:", round(total_time / total_num), "毫秒", "(",
              round(total_time), "/", total_num, ")")


if __name__ == "__main__":
    crawler("20191110_LOL世界赛决赛(FPX vs G2)",
            "https://live.bilibili.com/blanc/6?liteVersion=true",
            tool.mysql_connect("Barrage"))
Example #2
                tweet_info["likes"] = tweet_likes
                tweet_list.append(tweet_info)

        # Scroll down to the lowest tweet currently on the page
        if last_label_tweet is not None:
            driver.execute_script("arguments[0].scrollIntoView();", last_label_tweet)  # scroll the tweet element into view
            time.sleep(1)
        else:
            break

    return tweet_list


if __name__ == "__main__":
    selenium = tool.open_chrome()  # Open a Selenium-controlled Chrome browser
    mySQL = tool.mysql_connect("Huabang")  # Build the MySQL connection object

    if "Huabang" in env.DATA and "Media List" in env.DATA["Huabang"]:
        for media_item in env.DATA["Huabang"]["Media List"]:
            # if media_item[0] < 440:
            #     continue
            print("开始抓取媒体:", media_item[1], "(", media_item[0], ")", "-", media_item[3], "(", media_item[2], ")")
            tweet_template = {
                "media_id": media_item[0],
                "media_name": media_item[1],
                "tweet_id": None,
                "is_retweet": 0,
                "time": None,
                "text": None,
                "replies": None,
                "retweets": None,
Example #3
                barrage_info["user_name"] = label.select_one(
                    "li > div > span:nth-child(2)").text
                barrage_info["content"] = label.select_one(
                    "li > div > div > span").text
            elif "msg-auditorSys" in category:  # 处理msg-auditorSys类型提示(系统提示)
                barrage_info["type"] = "AS"
                barrage_info["other"] = label.text
            elif "msg-sys" in category:  # 处理msg-sys类型提示(系统提示)
                barrage_info["type"] = "SY"
                barrage_info["other"] = label.text
            else:  # handle all other types
                # str() guards against category being a list of CSS classes
                barrage_info.update(type="OT", other="barrage class: " + str(category))
            barrage_list.append(barrage_info)

        mysql.insert(table_name, barrage_list)

        total_num += 1
        total_time += 1000 * (time.time() - start_time)

        # Throttle to one pass per 0.5 s; compute the remainder once so a
        # negative value never reaches time.sleep()
        wait_time = 0.5 - (time.time() - start_time)
        if wait_time > 0:
            time.sleep(wait_time)

        print("本次时间范围内新增弹幕:", len(barrage_list), "条,", "(共计:", data_id_max,
              ")", "|", "运行时间:", round(total_time / total_num), "毫秒", "(",
              round(total_time), "/", total_num, ")")


if __name__ == "__main__":
    crawler("神超", "https://www.huya.com/102411", tool.mysql_connect("Barrage"))
Example #4
            continue
        if "player_list" not in summoner_json["data"]:
            continue
        summoner_list = list()
        # Skip records missing any field required for the insert below
        required_keys = ("tier_title", "name", "uuid", "ranking", "league_points")
        for summoner_item in summoner_json["data"]["player_list"]:
            if any(key not in summoner_item for key in required_keys):
                continue
            summoner_list.append({
                "tier": summoner_item["tier_title"],
                "name": summoner_item["name"],
                "uuid": summoner_item["uuid"],
                "area": 1,
                "ranking": summoner_item["ranking"],
                "points": summoner_item["league_points"],
                "period": setting.PERIOD
            })
        mysql.insert("summoner", summoner_list)
        time.sleep(3)


if __name__ == "__main__":
    crawler(tool.mysql_connect("TFT"))
Example #5
            tool.console("报错", "未提取到热搜热度!")
            continue

        # Extract the trending-topic tag
        if label_icon := label_item.select_one("tr > td.td-03"):
            icon = label_icon.text
            if icon == "荐":  # skip promoted entries (ads), tagged "荐" on the page
                empty_rank += 1
                continue
        else:
            tool.console("报错", "未提取到热搜标注标签!")
            continue

        hot_list.append({
            "ranking": ranking,
            "keyword": keyword,
            "heat": heat,
            "icon": icon
        })

    # Write the results to the database
    if test:
        tool.console("测试", "准备写入数据:" + str(hot_list))
        return True
    else:
        return mysql.insert(table_name=table_name, data=hot_list)


if __name__ == "__main__":
    crawler(test=True, mysql=tool.mysql_connect("CxSpider"), table_name="weibo")
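
The extract-or-report pattern above (a walrus-operator match plus a tool.console error message) repeats for every field of a trending entry. A minimal sketch of the same pattern as a helper (select_text is a hypothetical name; tool.console is used as in this module):

def select_text(label_item, selector, field):
    # Return the text of the first node matching the selector,
    # or report the missing field and return None
    if node := label_item.select_one(selector):
        return node.text
    tool.console("Error", "failed to extract " + field + "!")
    return None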
Example #6
from apscheduler.schedulers.blocking import BlockingScheduler

import toolkit as tool
from weibo.hot_ranking import crawler as weibo_hot_ranking  # Weibo trending-topics real-time crawler

if __name__ == "__main__":
    # Open the MySQL database connection
    mysql = tool.mysql_connect("CxSpider")

    # Define the scheduling framework
    scheduler = BlockingScheduler()  # BlockingScheduler blocks the main thread while running

    # Add the crawler job
    scheduler.add_job(weibo_hot_ranking,
                      "interval",
                      seconds=5 * 60,
                      kwargs={
                          "test": True,
                          "mysql": mysql,
                          "table_name": "weibo"
                      })  # Weibo trending-topics crawler, every 5 minutes

    # Start the scheduling framework
    scheduler.start()
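
As a design note, the interval trigger above counts five minutes from process start; APScheduler's cron trigger aligns runs to the wall clock instead. A sketch of the same job with only the trigger changed:

    scheduler.add_job(weibo_hot_ranking,
                      "cron",
                      minute="*/5",  # run at :00, :05, :10, ... of every hour
                      kwargs={"test": True, "mysql": mysql, "table_name": "weibo"})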
Example #7
        tool.console("报错",
                     "未能在推文反馈Json数据中找到的推文ID:" + str(item.get("tweet_id")))
        return None
    feedback_item = feedback_dict[item.get("tweet_id")]
    item["tweet_url"] = feedback_item["result"]["data"]["feedback"][
        "url"]  # 提取:推文Url
    item["reaction"] = feedback_item["result"]["data"]["feedback"][
        "reaction_count"]["count"]  # 提取:推文点赞总数
    item["comment"] = feedback_item["result"]["data"]["feedback"][
        "comment_count"]["total_count"]  # 提取:推文评论总数
    item["share"] = feedback_item["result"]["data"]["feedback"]["share_count"][
        "count"]  # 提取:推文分享总数


if __name__ == "__main__":
    mysql_catalogue = tool.mysql_connect("Huabang(old)")  # MySQL connection for the media catalogue
    mysql_saving = tool.mysql_connect("Huabang")  # MySQL connection for saving results

    time_start = int(time.mktime(time.strptime(
        "2020-01-29 00:00:00", "%Y-%m-%d %H:%M:%S")))  # start of the window
    time_end = int(time.mktime(time.strptime(
        "2020-01-29 23:59:59", "%Y-%m-%d %H:%M:%S")))  # end of the window

    # Read the Facebook account URL list from the media catalogue
    media_list = mysql_catalogue.select("media_list",
                                        ["media_id", "media_name", "media_fb"])

    for media in media_list:
        # Check whether the media outlet has a Facebook profile URL