Exemplo n.º 1
0
def audio_crawler(path='songs'):
    """
    Download the songs of the Bilibili audio menus with sid 12032..19999.

    For every sid the song list and the menu info are fetched with
    ``url_get``. Each menu gets its own sub-directory (named after the menu
    title) below the download directory, and every song is saved there as
    ``<title> - <author>.m4a``. A song whose file already exists is skipped
    before any further request is made for it.

    :param path: name of the download directory, created next to this file
    :return: None
    """
    # Characters dropped from directory and file names
    # (presumably because Windows rejects them -- TODO confirm).
    illegal_chars = str.maketrans('', '', '/<>|:*?\\')

    # dirname() is '' when __file__ is a bare file name; fall back to "." so
    # the download directory does not end up at the filesystem root.
    base_dir = (dirname(__file__) or ".") + "/" + path + "/"
    if not exists(base_dir):
        mkdir(base_dir)

    # Walk the menu ids and build the song-list url of each menu
    for sid in range(12032, 20000):
        url = "https://www.bilibili.com/audio/music-service-c/web/song/of-menu?sid={}&pn=1&ps=100".format(sid)
        res = url_get(url=url, mode="json")
        data = dict_get(res, "data")
        # No data for this sid: nothing to download
        if data is None:
            continue
        # A missing song list is treated like an empty one
        items = dict_get(data, "data") or []

        # Request the menu info to learn the menu title
        info_url = "https://www.bilibili.com/audio/music-service-c/web/menu/info?sid={}".format(sid)
        info_get = url_get(url=info_url, mode="json")
        album_title = dict_get(info_get, "title")
        if album_title is None:
            # Without a title the menu id is used as directory name
            album_title = str(sid)
        album_title = album_title.translate(illegal_chars)

        album_dir = base_dir + album_title
        if not exists(album_dir):
            mkdir(album_dir)

        for item in items:
            author = dict_get(item, "author")  # singer
            title = dict_get(item, "title")  # song title
            song_id = dict_get(item, "id")  # song id, used to build the download-info url

            # Sanitize the file name the same way as the directory name;
            # a "/" in a song title would otherwise point into a missing directory.
            file_stem = "{} - {}".format(title, author).translate(illegal_chars)
            song_file_name = album_dir + "/" + file_stem + '.m4a'
            # Skip songs that were already downloaded
            if exists(song_file_name):
                continue

            songs_url = "https://www.bilibili.com/audio/music-service-c/web/url?sid={}".format(song_id)
            songs_get = url_get(url=songs_url, mode="json")
            cdns = dict_get(songs_get, "cdns")
            if not cdns:
                print("No download address, skip: {}".format(title))
                print("-" * 60)
                continue
            # Same choice as before (the greater of the first two addresses),
            # but a single address no longer raises IndexError.
            real_url = max(cdns[:2])
            # Size in MB; a missing size is reported as 0
            file_size = round((dict_get(songs_get, "size") or 0) / 1024 / 1024, 2)

            print("Downloading Audio")
            song_file_get = url_get(url=real_url, mode="content")
            with open(song_file_name, "wb") as song:
                song.write(song_file_get)

            # Progress output
            print("album_title: {}".format(album_title))
            print("author: {}".format(author))
            print("title: {}".format(title))
            print("file_size: {} MB".format(file_size))
            print("-" * 60)
Exemplo n.º 2
0
def game_crawler():
    """
    Store the entries of the Bilibili game list in the ``game_list`` table.

    The list is fetched from ``gamelist.json``; for every game the title,
    summary and website are read with ``dict_get``. A game whose name is
    already reported by the database lookup is skipped, every other game is
    inserted and echoed to stdout.

    :return: None
    """
    database = Database("localhost", "root", "", "bilibili")
    table_name = "game_list"
    game_list_url = "https://game.bilibili.com/gamelist.json"
    game_list_json = url_get(game_list_url, "json")
    # A missing response is treated like an empty game list
    for game in game_list_json or []:
        game_info = {
            'name': dict_get(game, "title"),
            'summary': dict_get(game, "summary"),
            'website': dict_get(game, "website"),
        }

        # Lookup by name; a non-zero result is treated as "already stored"
        if database.execute_sql(table_name=table_name,
                                key="name",
                                value=game_info['name']) != 0:
            print("{} 重复,跳过".format(game_info['name']))
            print("-" * 60)
            # Actually skip the duplicate; previously it was still inserted
            continue

        if database.execute_sql(table_name=table_name,
                                mode="insert",
                                keys=list(game_info.keys()),
                                values=list(game_info.values())):
            print("游戏名: {}".format(game_info['name']))
            print("游戏介绍: {}".format(game_info['summary']))
            print("游戏官网: {}".format(game_info['website']))
            print("-" * 60)
Exemplo n.º 3
0
def crawler(av, end_av=48544470):
    """
    Crawl video statistics for a range of av numbers into the ``video`` table.

    Every av number in ``range(av, end_av)`` is requested from the
    ``x/web-interface/view`` API. Responses whose ``code`` is not 0 are
    reported and skipped, videos already found in the table are reported and
    skipped, all other videos are inserted and echoed to stdout.

    :param av: first av number to crawl
    :param end_av: av number at which crawling stops (exclusive); defaults to
        the bound that used to be hard-coded
    :return: None
    """
    database = Database(host="localhost",
                        username="******",
                        password="",
                        db_name="bilibili")
    # (output template, key in ``data``) pairs, in print order
    report = (
        ("视频av号: {}", 'video_av'),
        ("作者: {}", 'video_up'),
        ("标题: {}", 'video_title'),
        ("视频分类: {}", 'video_classification'),
        ("观看数: {}", 'video_view'),
        ("分享数: {}", 'video_share'),
        ("点赞数: {}", 'video_like'),
        ("收藏数: {}", 'video_favorite'),
        ("投币数: {}", 'video_coin'),
        ("上传时间: {}", 'video_update'),
        ("评论数: {}", 'video_reply'),
        ("弹幕数: {}", 'video_danmaku'),
        ("性质: {}", 'video_reprint'),
    )
    for av_num in range(av, end_av):
        url = "https://api.bilibili.com/x/web-interface/view?aid={}".format(
            av_num)
        get_json = url_get(url=url, mode="json")
        if dict_get(get_json, "code") != 0:
            print('错误!没有此视频!av:{}'.format(av_num))
            print('-' * 60)
            continue

        # The key order is also the column order handed to the insert below
        data = {
            'video_av': str(av_num),
            'video_up': dict_get(get_json, "name"),
            'video_title': dict_get(get_json, "title"),
            'video_classification': dict_get(get_json, "tname"),
            'video_view': dict_get(get_json, "view"),
            'video_share': dict_get(get_json, "share"),
            'video_like': dict_get(get_json, "like"),
            'video_favorite': dict_get(get_json, "favorite"),
            'video_coin': dict_get(get_json, "coin"),
            # NOTE(review): gmtime(None) yields the current time, so a missing
            # ctime would be stored as "now" -- confirm ctime is always present.
            'video_update': strftime("%Y-%m-%d %H:%M:%S",
                                     gmtime(dict_get(get_json, "ctime"))),
            'video_reply': dict_get(get_json, "reply"),
            'video_danmaku': dict_get(get_json, "danmaku"),
        }
        # no_reprint == 0 is stored as "转载", anything else as "原创"
        video_reprint = dict_get(get_json, "no_reprint")
        data['video_reprint'] = "转载" if video_reprint == 0 else "原创"

        db_select = database.execute_sql(table_name="video",
                                         mode="search",
                                         key="video_av",
                                         value=data['video_av'])
        if db_select != 0:
            print('错误!此视频已存在!av:{}'.format(av_num))
            print('-' * 60)
            continue

        if database.execute_sql(table_name="video",
                                mode="insert",
                                keys=list(data.keys()),
                                values=list(data.values())):
            for template, key in report:
                print(template.format(data[key]))
            print("-" * 60)
Exemplo n.º 4
0
def crawler(type_, sort, path='save_picture', page_num=0):
    """
    Download the pictures of the Bilibili photo albums, page by page.

    Starting at ``page_num`` the album list is requested 20 albums at a time
    until a page without items is returned. Every album gets its own
    directory ``<title>-<uploader>`` below the save directory; pictures that
    already exist on disk are not downloaded again.

    :param type_: category --> cos or sifu
    :param sort: ordering --> hot or new
    :param path: save directory, located next to this file (the place the
        pictures are written to)
    :param page_num: first page to crawl, 0 by default
    :return: None
    """
    # Characters dropped from directory names
    # (presumably because Windows rejects them -- TODO confirm).
    illegal_chars = str.maketrans('', '', '/<>|:*?\\')

    # dirname() is '' when __file__ is a bare file name; fall back to "." so
    # the save directory does not end up at the filesystem root.
    base_dir = (dirname(__file__) or ".") + "/" + path + "/"
    # Create the directory the pictures are actually written to. It used to
    # be checked relative to the working directory, which only matched
    # base_dir when the script was started from its own directory.
    if not exists(base_dir):
        mkdir(base_dir)

    # Iterate over the pages instead of recursing once per page, so a long
    # crawl cannot exhaust the recursion limit.
    while True:
        url = "https://api.vc.bilibili.com/link_draw/v2/Photo/list?category={}&type={}&page_num={}&page_size=20".format(
            type_, sort, page_num)
        res = url_get(url=url, mode="json")
        items = dict_get(res, "items")
        # An empty or missing item list ends the crawl
        if not items:
            print("Current page have no any picture, Exit mission!")
            return

        for album in items:
            title = dict_get(album, "title")  # album title
            up = dict_get(album, "name")  # uploader
            directory_name = "{}-{}".format(title, up).translate(illegal_chars)
            album_dir = base_dir + directory_name
            if not exists(album_dir):
                mkdir(album_dir)

            # Picture urls of this album (a missing list counts as empty)
            picture_list = [picture['img_src']
                            for picture in dict_get(album, "pictures") or []]
            print("Downloading Pictures")
            for pic in picture_list:
                # The last url segment is used as file name
                full_pic_path = album_dir + "/" + pic.split("/")[-1]
                if exists(full_pic_path):
                    continue
                pic_get = url_get(url=pic, mode="content")
                with open(full_pic_path, "wb") as pic_file:
                    pic_file.write(pic_get)

            print("current page: {}".format(page_num + 1))
            print("title: {}".format(title))
            print("up: {}".format(up))
            print("picture: {}".format(len(picture_list)))
            print("-" * 60)

        page_num += 1
Exemplo n.º 5
0
def column_crawler():
    """
    Crawl the recommended articles of every column category into ``zhuanlan``.

    For each category id the ``x/article/recommends`` API is paged through
    (100 articles per page, starting at page 1) until a page without data is
    returned. Articles whose id is already reported by the database lookup
    are skipped; all others are inserted and echoed to stdout.

    :return: None
    """
    database = Database("localhost", "root", "", "bilibili")
    table_name = "zhuanlan"
    # Category name -> category id (cid) used by the API
    cid_dict = {
        "动画": 2,
        "游戏": 1,
        "影视": 28,
        "生活": 3,
        "兴趣": 29,
        "轻小说": 16,
        "科技": 17,
    }
    time_format = "%Y-%m-%d %H:%M:%S"
    # Item keys copied verbatim into the row as "art_<key>". With the id an
    # article url can be built: https://www.bilibili.com/read/cv[article id]
    article_fields = ("id", "title", "words", "like", "reply", "view",
                      "favorite", "coin", "share", "summary")
    # (output template, key in the row) pairs, in print order
    report = (
        ("作者id: {}", 'author_mid'),
        ("作者用户名: {}", 'author_name'),
        ("所属分类: {}", 'category'),
        ("上传时间: {}", 'update_time'),
        ("文章id: {}", 'art_id'),
        ("文章标题: {}", 'art_title'),
        ("文章字数: {}", 'art_words'),
        ("文章点赞数: {}", 'art_like'),
        ("文章评论数: {}", 'art_reply'),
        ("文章浏览数: {}", 'art_view'),
        ("文章收藏数: {}", 'art_favorite'),
        ("文章投币数: {}", 'art_coin'),
        ("文章分享数: {}", 'art_share'),
        ("文章摘要: {}", 'art_summary'),
        ("爬取时间: {}", 'crawl_time'),
    )

    for cid in cid_dict.values():
        pn = 1
        while True:
            column_url = "https://api.bilibili.com/x/article/recommends?cid={}&pn={}&ps=100&sort=0".format(
                cid, pn)
            column_get = url_get(column_url, mode="json")
            column_data = dict_get(column_get, "data")
            # An empty page marks the end of this category
            if len(column_data) == 0:
                print(pn)
                break

            for item in column_data:
                author_info = dict_get(item, "author")  # author info
                # The key order is also the column order handed to the insert
                data = {
                    'author_mid': author_info['mid'],  # author id
                    'author_name': author_info['name'],  # author user name
                    'category': dict_get(item, "category")['name'],  # category
                    'update_time': strftime(
                        time_format,
                        localtime(dict_get(item, 'update_time'))),  # upload time
                }
                for field in article_fields:
                    data['art_' + field] = dict_get(item, field)
                data['crawl_time'] = strftime(time_format, localtime())  # crawl time

                already_stored = database.execute_sql(table_name=table_name,
                                                      select="id",
                                                      key="art_id",
                                                      value=data['art_id']) != 0
                if already_stored:
                    print("id:{} 重复,跳过".format(data['art_id']))
                    print("-" * 60)
                    continue

                inserted = database.execute_sql(table_name=table_name,
                                                mode="insert",
                                                keys=list(data.keys()),
                                                values=list(data.values()))
                if not inserted:
                    print("id:{} 异常,跳过".format(data['art_id']))
                    print("-" * 60)
                    continue

                for template, key in report:
                    print(template.format(data[key]))
                print("-" * 60)
            pn += 1
Exemplo n.º 6
0
def member_crawler(mid):
    """
    Crawl member profiles into the ``member`` table, starting at ``mid``.

    The loop never terminates on its own: after every member -- stored,
    skipped or failed -- ``mid`` is increased by one and the next member is
    requested. Members without a name and members already found in the
    table are skipped; any exception raised while handling a member is
    printed and the crawl moves on.

    :param mid: member id to start with
    """
    database = Database("localhost", "root", "", "bilibili")
    # (output template, key in the row) pairs, in print order
    report = (
        ("用户id: {}", "member_id"),
        ("用户名: {}", "username"),
        ("性别: {}", "sex"),
        ("生日: {}", "birthday"),
        ("等级: {}", "level"),
        ("B币: {}", "coins"),
        ("个人签名: {}", "sign"),
        ("充电人数: {}", "charging"),
        ("视频数量: {}", "video_upload"),
        ("标签: {}", "tag"),
        ("B站大会员: {}", "vip_status"),
        ("Bilibili认证: {}", "official"),
        ("关注数: {}", "following"),
        ("粉丝数: {}", "follower"),
        ("播放量: {}", "view"),
    )
    while True:
        follow_url = "https://api.bilibili.com/x/relation/stat?vmid={}".format(
            mid)
        view_url = "https://api.bilibili.com/x/space/upstat?mid={}".format(mid)
        info_url = "https://api.bilibili.com/x/space/acc/info?mid={}".format(
            mid)
        tag_url = "https://space.bilibili.com/ajax/member/getTags?mids={}".format(
            mid)
        charging_url = "https://elec.bilibili.com/api/query.rank.do?mid={}".format(
            mid)
        upload_data_url = "https://api.bilibili.com/x/space/navnum?mid={}".format(
            mid)
        try:
            member_info = url_get(info_url, mode='json')
            username = dict_get(member_info, "name")
            if username is None:
                print("该会员不存在, 跳过 {}".format(mid))
                print("-" * 60)
                continue

            level = dict_get(member_info, "level")
            member_id = dict_get(member_info, "mid")
            sex = dict_get(member_info, "sex")
            coins = dict_get(member_info, "coins")
            official_data = dict_get(member_info, "official")

            follow_data = url_get(follow_url, mode="json")
            following = dict_get(follow_data, 'following')
            follower = dict_get(follow_data, 'follower')
            view = dict_get(url_get(view_url, mode="json"), "view")

            # Only role 1 carries a certification title
            official = official_data['title'] if official_data['role'] == 1 else "暂无认证"
            birthday = dict_get(member_info, "birthday")
            sign = dict_get(member_info, "sign")
            vip_status = "是" if dict_get(member_info, "status") == 1 else "否"

            # Every tag is followed by a single space
            tag = ''.join(
                x + ' '
                for x in dict_get(url_get(tag_url, mode="json"), "tags"))
            charging = dict_get(url_get(charging_url, mode="json"),
                                "total_count")
            video_upload = dict_get(url_get(upload_data_url, mode="json"),
                                    "video")

            already_stored = database.execute_sql(table_name="member",
                                                  mode="search",
                                                  key="member_id",
                                                  value=member_id) != 0
            if already_stored:
                print("该会员已存在, 跳过 {}".format(member_id))
                print("-" * 60)
                continue

            # The key order is also the column order handed to the insert
            insert_data = {
                "member_id": member_id,
                "username": username,
                "sex": sex,
                "birthday": birthday,
                "level": level,
                "coins": coins,
                "sign": sign,
                "charging": charging,
                "video_upload": video_upload,
                "tag": tag,
                "vip_status": vip_status,
                "official": official,
                "following": following,
                "follower": follower,
                "view": view,
            }

            if database.execute_sql(mode="insert",
                                    table_name="member",
                                    keys=list(insert_data.keys()),
                                    values=list(insert_data.values())):
                for template, key in report:
                    print(template.format(insert_data[key]))
                print("-" * 60)
        except Exception as e:
            # Best effort: report the failing member and carry on
            print("错误, 跳过 mid={}".format(mid))
            print(e)
            print("-" * 60)
        finally:
            # Runs for every outcome above, including the ``continue`` paths
            mid += 1
Exemplo n.º 7
0
def rank_crawler():
    """
    Save the Bilibili rankings of every zone and period to text files.

    For each combination of zone (rid) and period (day) the
    ``x/web-interface/ranking`` API is requested and every ranked video is
    appended to ``rank/Bilibili-<zone>-<period>.txt`` and echoed to stdout.
    Combinations without a ranking list are skipped without touching a file.

    :return: None
    """
    # Output directory, created on first use
    save_path = "rank"
    if not exists(save_path):
        mkdir(save_path)
    # Zone name -> rid used by the API
    rid_dict = {
        "全站": 0,
        "动画": 1,
        "国创相关": 168,
        "音乐": 3,
        "舞蹈": 129,
        "游戏": 4,
        "科技": 36,
        "数码": 188,
        "生活": 160,
        "鬼畜": 119,
        "时尚": 155,
        "娱乐": 5,
        "影视": 181,
    }
    # Period name -> number of days used by the API
    day_dict = {
        "日排行": 1,
        "三日排行": 3,
        "周排行": 7,
        "月排行": 30,
    }
    for zone_name, rid in rid_dict.items():
        for period_name, day in day_dict.items():
            url = "https://api.bilibili.com/x/web-interface/ranking?rid={}&day={}".format(rid, day)
            res = url_get(url=url, mode="json")
            rank_list = dict_get(res, "list")
            # Missing or empty ranking: nothing to write
            if not rank_list:
                continue

            # The data goes to a txt file; it could instead be stored in the
            # database through the functions.database package.
            # The file is opened once per ranking instead of once per entry.
            file_name = "{}/Bilibili-{}-{}.txt".format(save_path, zone_name, period_name)
            with open(file_name, "a", encoding="utf-8") as data_file:
                for rank, video in enumerate(rank_list, start=1):
                    # no_reprint == 1 is reported as "原创", anything else as "转载"
                    reprint = "原创" if dict_get(video, "no_reprint") == 1 else "转载"
                    # The same lines go to the file and to stdout
                    lines = [
                        "排名: {}".format(rank),
                        "视频id: {}".format(dict_get(video, "aid")),
                        "up主: {}".format(dict_get(video, "author")),
                        "投币数: {}".format(dict_get(video, "coins")),
                        "播放数: {}".format(dict_get(video, "play")),
                        "综合得分: {}".format(dict_get(video, "pts")),
                        "视频标题: {}".format(dict_get(video, "title")),
                        "视频弹幕数: {}".format(dict_get(video, "video_review")),
                        "是否原创: {}".format(reprint),
                        "-" * 60,
                    ]
                    data_file.writelines(line + "\n" for line in lines)
                    for line in lines:
                        print(line)
Exemplo n.º 8
0
def micro_video_crawler(order='', page_num=1):
    """
    Crawl Bilibili micro videos of every tag into the ``micro_video`` table.

    The tags are collected from the zone list first; for each tag the search
    API is paged through (50 videos per page) until a page without items is
    returned. Videos already reported by the database lookup are skipped,
    all others are inserted and echoed to stdout.

    :param order: ordering; ``new`` sorts by upload time, the empty default
        leaves the ordering to the system recommendation
    :param page_num: first page requested for every tag
    """
    database = Database("localhost", "root", "", "bilibili")
    table_name = "micro_video"
    page_size = 50  # page_size may be 50 at most
    # (output template, key in the row) pairs, in print order
    report = (
        ("视频标题: {}", 'title'),
        ("视频id: {}", 'video_id'),
        ("视频评论数: {}", 'reply'),
        ("视频上传时间: {}", 'upload_time'),
        ("视频大小(mb): {}", 'video_size'),
        ("视频时长: {}", 'video_time'),
        ("视频播放地址: {}", 'video_playurl'),
        ("视频观看数: {}", 'watched_num'),
        ("上传者用户名: {}", 'name'),
        ("上传者id: {}", 'uid'),
    )

    # Collect the tags of all zones; zones given as '' are ignored
    classification_url = "https://api.vc.bilibili.com/clip/v1/video/zonelist?page=total"
    classification_json = url_get(classification_url, "json")
    classification_data = dict_get(classification_json, "data")
    classification = []
    for zone in classification_data:
        zone_info = classification_data[zone]
        if zone_info == '':
            continue
        classification.extend(zone_info['tags'])

    for tag in classification:
        pn = page_num  # start page, can be chosen by the caller
        while True:
            next_offset = (pn - 1) * page_size
            micro_video_url = (
                "https://api.vc.bilibili.com/clip/v1/video/search?"
                "page_size={}&need_playurl=0&next_offset={}&order={}"
                "&tag={}"
            ).format(page_size, next_offset, order, tag)
            micro_video_json = url_get(micro_video_url, "json")
            items = dict_get(micro_video_json, "items")
            # An empty page marks the end of this tag
            if len(items) == 0:
                break

            for item in items:
                # The key order is also the column order handed to the insert
                video_info = {
                    "tag": tag,
                    'title': dict_get(item, "description").replace("\n", ""),  # video title
                    'video_id': dict_get(item, "id"),  # video id
                    'reply': dict_get(item, "reply"),  # number of comments
                    'upload_time': dict_get(item, "upload_time"),  # upload time
                    # file size converted to MB (float)
                    'video_size': round(float(dict_get(item, "video_size")) / 1024**2, 2),
                    'video_time': dict_get(item, "video_time"),  # duration in seconds
                    'video_playurl': dict_get(item, "video_playurl"),  # play url
                    'watched_num': dict_get(item, "watched_num"),  # number of plays
                    'name': dict_get(item, "name"),  # uploader user name
                    'uid': dict_get(item, "uid"),  # uploader uid
                }

                # To download the videos as well, uncomment the lines below
                # video_content = url_get(video_info['video_playurl'], "content")  # fetch the video content
                # video_file_name = video_info['title'][:30].replace("/", '').replace("<", '').replace(">", '').replace(
                #     "|", '').replace(":", '').replace("*", '').replace("?", '').replace("\\", '') + ".mp4"  # build the file name
                # # save the video
                # with open(video_file_name, "wb") as video_file:
                #     video_file.write(video_content)
                #     video_file.close()

                # Comment out the rest of this loop body if no database insert is wanted
                already_stored = database.execute_sql(
                    table_name=table_name, key="video_id", value=video_info['video_id']) != 0
                if already_stored:
                    print("视频id:{} 重复,跳过".format(video_info['video_id']))
                    print("-" * 60)
                    continue

                inserted = database.execute_sql(
                    table_name=table_name, mode="insert",
                    keys=list(video_info.keys()), values=list(video_info.values()))
                if inserted:
                    for template, key in report:
                        print(template.format(video_info[key]))
                    print("-" * 60)
            pn += 1