import time

from twitter_scraper import Profile  # twitter-scraper package

import mysql  # project-local database helper module
import tool  # project-local utility module (browser, delays, JSON I/O)


def crawler_item(browser, user_name: str, media_id: int, media_name: str, mt,
                 xpath):
    """ 抓取单个账号用户信息

    :param user_name: <str> 账号名称
    :param media_id: <int> 媒体ID
    :param media_name: <str> 媒体名称
    :return: <None> 已将结果存入数据库
    """
    # Fetch the account profile with the twitter-scraper package (its followers/following counts may be wrong)
    try:
        profile = Profile(user_name)
    except Exception:
        print("Account does not exist!")
        return
    writing_item = profile.to_dict()
    writing_item["media_id"] = media_id
    writing_item["media_name"] = media_name

    # Fetch the followers and following counts with Selenium
    browser.get("https://twitter.com/" + user_name)
    time.sleep(tool.get_scope_random(12))
    following_count = None
    followers_count = None
    try:
        following_count = browser.find_element_by_xpath(
            xpath["following_count"][0]).get_attribute("title")
        followers_count = browser.find_element_by_xpath(
            xpath["followers_count"][0]).get_attribute("title")
    except Exception:
        try:
            following_count = browser.find_element_by_xpath(
                xpath["following_count"][1]).get_attribute("title")
            followers_count = browser.find_element_by_xpath(
                xpath["followers_count"][1]).get_attribute("title")
        except Exception:
            print("Selenium failed to fetch the followers/following counts!")

    # Correct the scraped counts with the Selenium results
    if following_count is not None:
        following_count = following_count.replace(",", "")
        print("修正正在关注数量:", writing_item["following_count"], "→",
              following_count)
        writing_item["following_count"] = following_count
    if followers_count is not None:
        followers_count = followers_count.replace(",", "")
        print("修正关注者数量:", writing_item["followers_count"], "→",
              followers_count)
        writing_item["followers_count"] = followers_count

    # Write the record to the database
    writing_list = list()
    writing_list.append(writing_item)
    write_num = mysql.insert_pure(mt, writing_list)
    print("存储记录数:", write_num)
    print(writing_list)
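The helpers tool.get_scope_random and tool.open_chrome come from the author's local tool module, which is not shown. A minimal sketch of get_scope_random, assuming it simply returns a randomized delay around the given number of seconds (the name exists in the source, but the behavior here is a guess based on how it is called):

import random


def get_scope_random(seconds: int) -> float:
    """Hypothetical stand-in for tool.get_scope_random: a random delay
    between 0.5x and 1.5x of the given value, used to pace requests."""
    return random.uniform(seconds * 0.5, seconds * 1.5)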
Example #2
    def run_match_list(self):
        print("开始运行:场次信息爬虫......")
        # 统计需要抓取的场次ID列表
        need_match_id_list = dict()
        for race_id, match_id_list in self.data_race.items():
            for match_id in match_id_list:
                match_file_name = str(match_id) + ".json"
                if match_file_name not in self.data_list_match:
                    need_match_id_list[match_id] = race_id
        print("需要抓取的场次数量:", len(need_match_id_list))

        num = 1
        for match_id, race_id in need_match_id_list.items():
            print("正在抓取场次:", num, "/", len(need_match_id_list), "(", match_id, "-", race_id, ")")
            num += 1
            # 执行场次请求
            actual_url = self._match_list_url % match_id
            self._match_list_headers["referer"] = self._match_list_referer % race_id
            response = requests.get(actual_url, headers=self._match_list_headers)
            response_json = json.loads(response.content.decode())
            tool.write_json_to_file(os.path.join(self._path_match, str(match_id) + ".json"), response_json)  # save the match detail
            time.sleep(tool.get_scope_random(5))
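tool.write_json_to_file and tool.load_file_as_json, used by these spiders for all on-disk state, also come from the local tool module. A minimal sketch, assuming they are thin UTF-8 wrappers around json.dump and json.load:

import json


def write_json_to_file(path: str, data) -> None:
    # Hypothetical stand-in for tool.write_json_to_file
    with open(path, "w", encoding="utf-8") as file:
        json.dump(data, file, ensure_ascii=False, indent=2)


def load_file_as_json(path: str):
    # Hypothetical stand-in for tool.load_file_as_json
    with open(path, "r", encoding="utf-8") as file:
        return json.load(file)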
Example #3
    def run_race_list(self):
        print("开始运行:场次列表爬虫......")
        # 统计需要抓取的比赛ID列表
        need_race_id_list = list()
        for date_name, date_race_list in self.data_date.items():
            for race_item in date_race_list:
                if race_item["race_id"] not in self.data_race:
                    need_race_id_list.append(race_item["race_id"])
        print("需要抓取的比赛数量:", len(need_race_id_list))

        # 抓取需要的比赛数据
        for i in range(len(need_race_id_list)):
            need_race_id = str(need_race_id_list[i])
            print("正在抓取比赛:", i + 1, "/", len(need_race_id_list), "(", need_race_id, ")")
            match_id_list = list()  # 场次ID列表
            response = requests.get(self._race_list_url % need_race_id, headers=self._race_list_headers)
            bs = BeautifulSoup(response.content.decode(), 'lxml')
            game_labels = bs.select("body > div > div.content > div.left > div:nth-child(1) > div > a")
            for game_label in game_labels:
                if game_label.has_attr("data-matchid"):
                    match_id_list.append(game_label["data-matchid"])
            self.data_race[need_race_id] = match_id_list
            tool.write_json_to_file(self._path_race, self.data_race)  # save the race list
            time.sleep(tool.get_scope_random(5))
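The CSS selector above expects schedule links that carry a data-matchid attribute; only those anchors contribute match IDs. A small self-contained illustration on a hypothetical HTML fragment (the real page layout may differ):

from bs4 import BeautifulSoup

sample_html = """
<div><div class="content"><div class="left"><div><div>
  <a data-matchid="61234" href="#">Game 1</a>
  <a data-matchid="61235" href="#">Game 2</a>
  <a href="#">stats</a>
</div></div></div></div></div>
"""

bs = BeautifulSoup(sample_html, "lxml")
match_ids = [a["data-matchid"] for a in bs.find_all("a") if a.has_attr("data-matchid")]
print(match_ids)  # ['61234', '61235']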
Example #4
import datetime
import json
import os
import time

import requests

import tool  # project-local utility module (JSON I/O, random delays)


class WanPlusLolDataSpider:
    def __init__(self, saving_path):
        # Compute the crawl start and end dates
        self._start_date = datetime.datetime.today() + datetime.timedelta(days=-365)  # crawl start date
        self._end_date = (datetime.datetime.today() + datetime.timedelta(days=-1)).strftime("%Y%m%d")  # crawl end date

        # Data storage paths
        self._path_saving = saving_path  # root directory for stored files
        self._path_date = os.path.join(saving_path, "date_list.json")  # date list
        self._path_race = os.path.join(saving_path, "race_list.json")  # race list
        self._path_match = os.path.join(saving_path, "match")  # match detail directory

        # Data storage variables
        self.data_date = {}
        self.data_race = {}
        self.data_list_match = []

        # Request information
        self._date_list_url = "https://www.wanplus.com/ajax/schedule/list"  # URL of the date-list request
        self._date_list_headers = {
            "accept": "application/json, text/javascript, */*; q=0.01",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "cache-control": "no-cache",
            "content-length": "43",
            "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
            # "cookie": "UM_distinctid=16fd7fecc8499a-0b235e23a42caa-6701b35-1fa400-16fd7fecc856f0; wp_pvid=5198158261; gameType=2; wanplus_token=4cad6a33964b6e7332bbbecf75de892e; wanplus_storage=lf4m67eka3o; wanplus_sid=0d3b16b188a4c93171bc0d023a461bb3; wanplus_csrf=_csrf_tk_278248459; wp_info=ssid=s1273702015; Hm_lvt_f69cb5ec253c6012b2aa449fb925c1c2=1583294862,1585185668,1585185712; Hm_lpvt_f69cb5ec253c6012b2aa449fb925c1c2=1585208145; CNZZDATA1275078652=1738928189-1579872727-null%7C1585208374",
            "origin": "https://www.wanplus.com",
            "pragma": "no-cache",
            "referer": "https://www.wanplus.com/lol/schedule",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
            "x-csrf-token": "345357323",
            "x-requested-with": "XMLHttpRequest"
        }  # headers of the date-list request
        self._date_list_data = {
            "_gtk": "345357323",
            "game": "2",
            "time": "1571500800",
            "eids": "",
        }  # form data of the date-list request
        self._race_list_url = "https://www.wanplus.com/schedule/%s.html"  # URL of the race request
        self._race_list_headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "cache-control": "no-cache",
            # "cookie": "_uab_collina=157987680590179694225715; UM_distinctid=16fd7fecc8499a-0b235e23a42caa-6701b35-1fa400-16fd7fecc856f0; wp_pvid=5198158261; wanplus_token=4cad6a33964b6e7332bbbecf75de892e; wanplus_storage=lf4m67eka3o; wanplus_sid=0d3b16b188a4c93171bc0d023a461bb3; gameType=2; wanplus_csrf=_csrf_tk_278248459; wp_info=ssid=s8280898516; Hm_lvt_f69cb5ec253c6012b2aa449fb925c1c2=1585185668,1585185712,1585278669,1585474186; CNZZDATA1275078652=1738928189-1579872727-null%7C1585477331; Hm_lpvt_f69cb5ec253c6012b2aa449fb925c1c2=1585478088",
            "pragma": "no-cache",
            "referer": "https://www.wanplus.com/lol/schedule",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "none",
            "sec-fetch-user": "******",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
        }  # headers of the race request
        self._match_list_url = "https://www.wanplus.com/ajax/matchdetail/%s?_gtk=345357323"  # URL of the match request
        self._match_list_referer = "https://www.wanplus.com/schedule/%s.html"  # value of the referer header in match requests
        self._match_list_headers = {
            "accept": "application/json, text/javascript, */*; q=0.01",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "cache-control": "no-cache",
            # "cookie": "UM_distinctid=16fd7fecc8499a-0b235e23a42caa-6701b35-1fa400-16fd7fecc856f0; wp_pvid=5198158261; wanplus_token=4cad6a33964b6e7332bbbecf75de892e; wanplus_storage=lf4m67eka3o; wanplus_sid=0d3b16b188a4c93171bc0d023a461bb3; wanplus_csrf=_csrf_tk_278248459; gameType=2; wp_info=ssid=s1462349516; Hm_lvt_f69cb5ec253c6012b2aa449fb925c1c2=1585185712,1585278669,1585474186,1585693166; CNZZDATA1275078652=1738928189-1579872727-null%7C1585692760; Hm_lpvt_f69cb5ec253c6012b2aa449fb925c1c2=1585695009",
            "pragma": "no-cache",
            "referer": "",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
            "x-csrf-token": "345357323",
            "x-requested-with": "XMLHttpRequest"
        }  # headers of the match request

        # Load data that has already been fetched
        self.load()

    def load(self):
        """载入已经抓取的数据"""
        if os.path.exists(self._path_date):
            self.data_date = tool.load_file_as_json(self._path_date)  # 载入日期比赛表

        if os.path.exists(self._path_race):
            self.data_race = tool.load_file_as_json(self._path_race)  # 载入日期比赛表

        self.data_list_match = os.listdir(self._path_match)  # 载入游戏信息文件列表

    def run_date_list(self):
        print("开始运行:日期列表爬虫......")
        # 统计需要抓取的日期列表
        all_date_list = list()  # 需要获取的日期列表
        curr_date = datetime.datetime.now() + datetime.timedelta(days=-1)
        while curr_date >= self._start_date:
            if (curr_date_str := curr_date.strftime("%Y%m%d")) not in self.data_date:
                all_date_list.append(curr_date_str)
            curr_date += datetime.timedelta(days=-1)
        print("需要抓取的日期总数:", len(all_date_list))

        if len(all_date_list) == 0:  # stop if there are no dates left to fetch
            return

        # Collect one anchor date for each week that contains a missing date
        need_date_list = list()  # anchor dates to request (every Monday, weekday() == 0, plus the earliest day)
        for curr_date_str in all_date_list:
            curr_date = datetime.datetime.strptime(curr_date_str, "%Y%m%d")
            if curr_date.weekday() == 0:  # Monday
                need_date_list.append(curr_date_str)
        need_date_list.append(all_date_list[-1])  # add the earliest day
        print("Total number of anchor dates to fetch:", len(need_date_list))

        # Fetch the schedule data for each anchor date
        for i in range(len(need_date_list)):
            curr_date_str = need_date_list[i]
            print("正在抓取时间戳:", i + 1, "/", len(need_date_list), "(", curr_date_str, ")")
            self._date_list_data["time"] = curr_date_str  # 列表请求的表单数据
            response = requests.post(self._date_list_url, headers=self._date_list_headers, data=self._date_list_data)
            if response.status_code == 200:
                response_json = json.loads(response.content.decode())
                for curr_date_str, date_infor in response_json["data"]["scheduleList"].items():
                    if curr_date_str not in self.data_date and int(curr_date_str) <= int(self._end_date):
                        self.data_date[curr_date_str] = list()
                        if date_infor["list"]:
                            for match in date_infor["list"]:
                                self.data_date[curr_date_str].append({
                                    "race_id": match["scheduleid"],
                                    "team_a_name": match["oneseedname"],
                                    "team_b_name": match["twoseedname"],
                                    "start_time": match["starttime"],
                                    "team_a_score": match["onewin"],
                                    "team_b_score": match["twowin"],
                                    "contest_name": match["ename"],
                                    "match_name": match["groupname"],
                                    "team_a_score_per": match["oneScore"],
                                    "team_b_score_per": match["twoScore"],
                                })
                tool.write_json_to_file(self._path_date, self.data_date)  # save the date schedule table
            time.sleep(tool.get_scope_random(5))
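run_date_list only issues one request per week: it keeps every Monday (weekday() == 0) among the missing dates, plus the earliest missing day. A small standalone illustration of that selection over a hypothetical gap of missing dates, listed newest to oldest as the method builds them:

import datetime

all_date_list = ["20200407", "20200406", "20200405", "20200404", "20200403",
                 "20200402", "20200401", "20200331", "20200330", "20200329"]

need_date_list = [d for d in all_date_list
                  if datetime.datetime.strptime(d, "%Y%m%d").weekday() == 0]
need_date_list.append(all_date_list[-1])  # always include the earliest day
print(need_date_list)  # ['20200406', '20200330', '20200329']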
        writing_item["following_count"] = following_count
    if followers_count is not None:
        followers_count = followers_count.replace(",", "")
        print("修正关注者数量:", writing_item["followers_count"], "→",
              followers_count)
        writing_item["followers_count"] = followers_count

    # 将数据写入到数据库
    writing_list = list()
    writing_list.append(writing_item)
    write_num = mysql.insert_pure(mt, writing_list)
    print("存储记录数:", write_num)
    print(writing_list)


if __name__ == "__main__":
    SETTING_PATH = "E:\\【微云工作台】\\数据\\华榜爬虫数据\\media_list_setting.json"
    settings = tool.load_file_as_json(SETTING_PATH)
    mt = mysql.MysqlTable(settings["user_mysql_table_infor"])  # get the target table information

    browser = tool.open_chrome(use_user_dir=False)  # open a Chrome browser via Selenium

    for media_item in settings["media_list"]:
        print("开始抓取媒体:", media_item[1], "(", media_item[0], ")", "-",
              media_item[3], "(", media_item[2], ")")
        crawler_item(browser, media_item[2], media_item[0], media_item[1], mt,
                     settings["user_xpath"])
        time.sleep(tool.get_scope_random(1))

    browser.close()
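The media_list_setting.json file referenced above is not included. Judging from how the main block indexes media_item and how crawler_item indexes xpath, its structure presumably looks roughly like the sketch below; the actual keys inside user_mysql_table_infor and the XPath strings are not shown in the source and are left as placeholders:

# Hypothetical shape of media_list_setting.json, expressed as the Python dict
# that tool.load_file_as_json would return; all values are placeholders.
settings_example = {
    "user_mysql_table_infor": {"...": "..."},  # connection/table info consumed by mysql.MysqlTable
    "user_xpath": {
        "following_count": ["<primary XPath>", "<fallback XPath>"],
        "followers_count": ["<primary XPath>", "<fallback XPath>"],
    },
    "media_list": [
        # [media_id, media_name, twitter user_name, account display name]
        [1, "ExampleMedia", "example_account", "Example Account"],
    ],
}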