示例#1
0
def create_artists_table(
        read_path="/Volumes/nmusic/NetEase2020/data/simple_proxied_tracks_details",
        conn=None):
    '''
    Fill the ``artists`` table with every distinct artist found on disk.

    Walks ``read_path`` recursively, parses each file as JSON, and collects
    the ``(id, name)`` pair of every entry in ``content["songs"][0]["ar"]``.
    The number of distinct pairs is printed, then each pair is inserted into
    the ``artists`` table as ``nid`` / ``name``.

    Args:
        read_path: Root directory holding the track-detail JSON files.
            Defaults to the original hard-coded location.
        conn: Object exposing ``insert(table=..., settings=...)``. When
            ``None`` (the default) a new ``MyConn()`` is created.

    A file that cannot be opened, parsed, or does not have the expected
    structure is reported on stdout and skipped. A keyboard interrupt during
    the scan exits the process with status 0 before anything is inserted.
    '''
    if conn is None:
        conn = MyConn()
    artists_set = set()

    for root, _dirs, files in os.walk(read_path):
        for file in files:
            # Skips every file whose name contains "DS".
            # NOTE(review): presumably meant for macOS ".DS_Store" files — confirm.
            if "DS" in file:
                continue
            filepath = os.path.join(root, file)
            try:
                # Reading and parsing live inside the try so that a single
                # unreadable or malformed file does not abort the whole scan.
                with open(filepath, encoding="utf-8") as f:
                    content = json.load(f)
                for ar in content["songs"][0]["ar"]:
                    artists_set.add((ar["id"], ar["name"]))
            except KeyboardInterrupt:
                print("interrupted by keyboard.")
                sys.exit(0)
            except Exception as e:
                print(filepath, e)

    print(len(artists_set))
    for nid, name in artists_set:
        conn.insert(table="artists", settings={"nid": nid, "name": name})
示例#2
0
    def task(pid, task_args):
        """Worker loop: claim pending ``breakouts`` rows and store their feature words.

        Each iteration claims one row with ``have_words == 0`` (query and
        flag update happen while holding ``task_args["lock"]``), reads the
        file at the row's ``text_path``, runs ``tags_extractor`` on it with
        ``topk=10`` and the loaded Word2Vec model, and inserts the
        space-joined result into ``breakouts_feature_words_c3``.

        The loop ends when no pending row is left, or after the first
        failure; on failure the row's ``have_words`` flag is reset to 0 and
        the id and traceback are printed.

        Args:
            pid: Worker identifier; currently unused inside the function.
            task_args: Mapping whose ``"lock"`` entry is a lock object
                supporting the context-manager protocol.
        """
        conn = MyConn()
        # NOTE(review): the model file is "c4" while the target table is
        # suffixed "_c3" — confirm this pairing is intended.
        w2v_model = Word2Vec.load("../models/w2v/c4.mod")
        lock = task_args["lock"]

        while True:
            # Claim a row under the lock; the context manager guarantees the
            # lock is released even if the query or the update raises.
            with lock:
                res = conn.query(targets=["id", "text_path"],
                                 conditions={"have_words": 0},
                                 table="breakouts",
                                 fetchall=False)
                if res is None:
                    break
                id_, text_path = res
                conn.update(table="breakouts",
                            settings={"have_words": 1},
                            conditions={"id": id_})

            try:
                # Close the text file deterministically instead of relying
                # on garbage collection.
                with open(text_path) as f:
                    text = f.read()
                feature_words = tags_extractor(text,
                                               topk=10,
                                               w2v_model=w2v_model)
                conn.insert(table="breakouts_feature_words_c3",
                            settings={
                                "id": id_,
                                "feature_words": " ".join(feature_words)
                            })
            except BaseException:
                # Deliberately broad (same reach as the original bare
                # ``except``): the claim is rolled back on interrupts as well.
                conn.update(table="breakouts",
                            settings={"have_words": 0},
                            conditions={"id": id_})
                print(id_)
                print(traceback.format_exc())
                break
示例#3
0
def create_subtracks_table():
    '''
    Populate the ``sub_tracks`` table with the tracks that pass every filter.

    A track is kept when:
        + its ``tracks`` row has non-NULL ``bnum``, ``mp3_path``,
          ``lyrics_path`` and ``json_path`` (reviews);
        + if ``bnum > 0``, at least one of its breakouts has ``beta >= 50``,
          ``reviews_num >= 100`` and ``release_drive + capital_drive + fake``
          equal to 0 (tracks with ``bnum <= 0`` skip this check and get
          ``valid_bnum`` 0);
        + at least one name in its comma-separated ``details.artists`` value,
          lower-cased and stripped, is a key of the pickled artist-vector
          dict; the first such name is stored as ``artist``.
    '''
    # Artist name -> vector mapping; only its keys are used here.
    with open("../data/artists_vec_dict.pkl", "rb") as fh:
        artist_vectors = pickle.load(fh)
    conn = MyConn()

    # Candidate tracks: all required paths present.
    track_rows = conn.query(
        sql=
        "SELECT track_id, bnum, mp3_path, lyrics_path, json_path FROM tracks WHERE\
                           bnum IS NOT NULL and mp3_path IS NOT NULL and lyrics_path IS NOT NULL and json_path IS NOT NULL"
    )

    # Count, per track, the breakouts that satisfy the validity thresholds.
    targets = ("id", "track_id", "beta", "reviews_num", "release_drive",
               "capital_drive", "fake")
    valid_counts = {}
    for row in conn.query(table="breakouts", targets=targets):
        info = dict(zip(targets, row))
        passes = (info["beta"] >= 50 and info["reviews_num"] >= 100
                  and info["release_drive"] + info["capital_drive"]
                  + info["fake"] == 0)
        if not passes:
            continue
        tid = info["track_id"]
        valid_counts[tid] = valid_counts.get(tid, 0) + 1

    selected = []
    for row in track_rows:
        track_id, bnum = row[0], row[1]

        # Tracks that have breakouts need at least one valid breakout.
        valid_bnum = 0
        if bnum > 0:
            valid_bnum = valid_counts.get(track_id)
            if valid_bnum is None:
                continue

        # Tracks need at least one artist with a known vector.
        artist_row = conn.query(table="details",
                                targets=["artists"],
                                conditions={"track_id": track_id},
                                fetchall=False)
        if not artist_row:
            continue
        normalized = (name.lower().strip()
                      for name in artist_row[0].split(','))
        artist = next(
            (name for name in normalized if name in artist_vectors), None)
        if not artist:
            continue

        selected.append(
            [track_id, valid_bnum, artist, row[2], row[3], row[4]])

    # Write the surviving tracks to the database.
    columns = ("track_id", "valid_bnum", "artist", "mp3_path", "lyrics_path",
               "json_path")
    for values in selected:
        conn.insert(table="sub_tracks", settings=dict(zip(columns, values)))