Example #1
def get_reviews_topk_words(track_id, is_breakout, key):
    conn = MyConn()
    if key == "w_fake":
        col = "feature_words"
    elif key == "wo_fake":
        col = "feature_words_wo_fake"
    elif key == "tfidf":
        col = "feature_words_tfidf"
    elif key == "candidates":
        col = "feature_words_candidates"

    if is_breakout == 1:
        bids = [
            r[0] for r in conn.query(
                sql=
                "SELECT id FROM breakouts WHERE is_valid=1 and simi_score>=0.5 and track_id={}"
                .format(track_id))
        ]
        for bid in bids:
            feature_words = conn.query(
                sql="SELECT {} FROM breakouts_feature_words WHERE id='{}'".
                format(col, bid))
            if feature_words and feature_words[0][0]:
                break
    else:
        feature_words = conn.query(
            sql="SELECT {} FROM no_breakouts_feature_words WHERE track_id={}".
            format(col, track_id))

    if feature_words and feature_words[0][0]:
        feature_words = feature_words[0][0].split()
    else:
        feature_words = []
    return feature_words
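
A minimal usage sketch (the track id below is hypothetical; key must be one of the four values handled above):

top_words = get_reviews_topk_words(track_id=12345, is_breakout=1, key="wo_fake")
print(top_words)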
Example #2
def check_breakouts():
    conn = MyConn()
    w2v_model = Word2Vec.load("../models/w2v/c3.mod")
    tracks = conn.query(
        sql="SELECT track_id, json_path FROM sub_tracks WHERE bnum>0")

    for track_id, filepath in tracks[70:]:  # skip the first 70 tracks
        d_reviews_partitions = get_reviews_partitions(filepath,
                                                      w2v_model,
                                                      merge_num=2)
        # print(d_reviews_partitions)
        breakouts = conn.query(table="breakouts",
                               targets=["date", "reviews_num"],
                               conditions={
                                   "track_id": track_id,
                                   "release_drive": 0,
                                   "fake": 0,
                                   "capital_drive": 0
                               })
        if not breakouts: continue

        d_bcount = {k: 0 for k in d_reviews_partitions}
        for dt, reviews_num in breakouts:
            date = datetime.strftime(dt, '%Y-%m-%d')
            for k in d_reviews_partitions:
                if k[0] <= date <= k[1]:
                    d_bcount[k] += 1
                    break
        print(track_id)
        for k, v in d_bcount.items():
            if v > 0:
                print("{} - {}: {} [count: {}]".format(k[0], k[1],
                                                       d_reviews_partitions[k],
                                                       d_bcount[k]))
Example #3
def in_tags_analysis(breakouts_set, no_breakouts_set):
    '''
    Analyze the built-in tags of the given track sets.
    '''
    tags = open("../data/metadata/自带tags.txt").read().splitlines()
    breakouts_tags_d = {}
    no_breakouts_tags_d = {}
    for t in tags:
        breakouts_tags_d[t] = []
        no_breakouts_tags_d[t] = []

    conn = MyConn()
    for tid in breakouts_set:
        res = conn.query(targets=["tags"], conditions={"track_id":tid})[0]
        for t in res[0].split():
            breakouts_tags_d[t].append(tid)
    for tid in no_breakouts_set:
        res = conn.query(targets=["tags"], conditions={"track_id":tid})[0]
        for t in res[0].split():
            no_breakouts_tags_d[t].append(tid)

    tags_count = []
    for k in breakouts_tags_d:
        tags_count.append((k, (round(len(breakouts_tags_d[k]) / 1748 * 100, 2),
                               round(len(no_breakouts_tags_d[k]) / 10, 2))))

    tags_count = sorted(tags_count, key=lambda x:x[1][0], reverse=False)
    draw_bar(dict(tags_count), "../data/main_tagged_tracks/tags_count.html")
Example #4
def identify_release_drive_breakouts():
    '''
    Identify breakout samples driven by a new release (at the very start of the review timeline).
    '''
    conn = MyConn()
    breakouts = conn.query(targets=["id", "track_id", "date"],
                           table="breakouts")

    release_breakouts_count = 0
    release_breakouts_tracks_set = set()
    more_breakouts_tracks_set = set()
    for b in breakouts:
        track_first_review = conn.query(targets=["first_review"],
                                        conditions={"track_id": b[1]},
                                        fetchall=False)[0]
        if b[2] - track_first_review < datetime.timedelta(days=15):
            release_breakouts_count += 1
            release_breakouts_tracks_set.add(b[1])
            conn.update(table="breakouts",
                        settings={"release_drive": 1},
                        conditions={"id": b[0]})
        else:
            more_breakouts_tracks_set.add(b[1])

    print(release_breakouts_count)
    print(len(release_breakouts_tracks_set))
    print(len(more_breakouts_tracks_set))
Example #5
def refine_subtracks():
    '''
    Further refine the sub_tracks table.
    + Remove tracks whose only breakouts are invalid (release_drive / capital_drive / fake)
    + Require at least one breakout with beta>=50 and reviews_num>=100
    '''
    conn = MyConn()
    targets = ("id", "track_id", "beta", "reviews_num", "release_drive",
               "capital_drive", "fake")
    breakouts = conn.query(table="breakouts", targets=targets)
    d_track_valid_bnum = {}

    for b in breakouts:
        d_tmp = dict(zip(targets, b))
        if d_tmp["beta"]>=50 and d_tmp["reviews_num"]>=100 and \
            d_tmp["release_drive"]+d_tmp["capital_drive"]+d_tmp["fake"]==0:
            tid = d_tmp["track_id"]
            if tid in d_track_valid_bnum:
                d_track_valid_bnum[tid] += 1
            else:
                d_track_valid_bnum[tid] = 1

    subtracks = [
        r[0]
        for r in conn.query(sql="SELECT track_id FROM sub_tracks WHERE bnum>0")
    ]
    count_valid = 0
    for tid in subtracks:
        if tid in d_track_valid_bnum:
            count_valid += 1
        else:
            # print(tid, end=", ")
            conn.delete(table="sub_tracks", conditions={"track_id": tid})
    print("\n", count_valid)
Example #6
def artist_vec_from_tags(min_tags_num=2):
    conn = MyConn()
    artists = conn.query(table="artists", targets=["name", "nid"])
    tracks_artists = conn.query(table="details", targets=["track_id", "artists"])
    d_artist_tracks = {} # tracks per artist
    for ar, nid in artists:
        if nid=="0": continue
        d_artist_tracks[ar.lower().strip()] = []

    tracks = set()
    for tid, t_artists in tracks_artists:
        tracks.add(tid)
        t_artists = t_artists.lower().strip().split(",")
        for ar in t_artists:
            if ar in d_artist_tracks:
                d_artist_tracks[ar].append(tid)


    tracks_tags = conn.query(sql="SELECT track_id, tags FROM tracks")
    tags = open("../data_related/自带tags.txt").read().splitlines()
    d_tag_index = dict([(t, i) for i, t in enumerate(tags)])
    d_track_tags_count = {} # tag-count vector per track
    for tid, t_tags in tracks_tags:
        if tid not in tracks: continue
        t_vec = np.zeros((len(tags),))
        t_tags = t_tags.split()
        for t in t_tags:
            t_vec[d_tag_index[t]] += 1
        d_track_tags_count[tid] = t_vec

    d_artist_tags_count = {} # tag-count vector per artist
    for ar, ar_tracks in d_artist_tracks.items():
        if len(ar_tracks)==0: continue
        ar_vec = np.sum(np.array([d_track_tags_count[tid] for tid in ar_tracks]), axis=0)
        if np.sum(ar_vec, axis=None)>=min_tags_num:
            d_artist_tags_count[ar] = ar_vec

    artists = list(d_artist_tags_count.keys())
    ar_vecs = list(d_artist_tags_count.values())
    ar_vecs = np.mat(ar_vecs).T

    # scaled_ar_vecs = StandardScaler().fit_transform(ar_vecs) # mean=0, std=1
    scaled_ar_vecs = MinMaxScaler().fit_transform(ar_vecs) # [0,1]
    scaled_ar_vecs = np.mat(scaled_ar_vecs).T

    # statistics
    tags_count = np.sum(np.array(ar_vecs), axis=0)
    # for i in range(len(tags)):
    #     print(tags[i], tags_count[i])
    print(len(artists))

    d_artist_vec = {}
    for i in range(len(artists)):
        d_artist_vec[artists[i]] = np.array(scaled_ar_vecs[i]).ravel()

    # d_artist_vec = dict(zip(artists, scaled_ar_vecs))
    with open("../data/r_minmax_artists_vec_dict.pkl", "wb") as f:
        pickle.dump(d_artist_vec, f)
Example #7
def breakouts_complements():
    '''
    Add complementary breakout information (used for classifying breakouts).
    '''
    conn = MyConn()
    logspace = [(0, 100), (100, 180), (180, 326), (326, 589), (589, 1066),
                (1066, 3494), (3494, 30000)]
    blevel_num = len(logspace)
    logspace_count = dict(zip(logspace, blevel_num * [0]))
    breakout_tracks = [
        r[0] for r in conn.query(targets=["DISTINCT(track_id)"],
                                 table="breakouts",
                                 conditions={"release_drive": 0})
    ]

    for track_id in breakout_tracks:
        reviews_num, first_review, last_review = conn.query(
            targets=["reviews_num", "first_review", "last_review"],
            conditions={"track_id": track_id},
            fetchall=False)
        breakouts = conn.query(
            targets=["flag", "reviews_num", "beta", "release_drive"],
            table="breakouts",
            conditions={"track_id": track_id})
        days_num = (last_review - first_review).days
        # average daily review count excluding breakout days
        avg_normal = float((reviews_num - np.sum([b[1] for b in breakouts])) /
                           (days_num - len(breakouts)))

        blevel_vec = blevel_num * [0]
        for b in breakouts:
            if b[3] == 1: continue  # skip release_drive breakouts
            for i in range(blevel_num):
                if logspace[i][0] <= b[2] < logspace[i][1]:  # locate the beta interval
                    blevel_vec[i] += 1
                    logspace_count[logspace[i]] += 1
                    break

        breakouts_num = int(np.sum(blevel_vec))
        if breakouts_num == 0:  # avoid division by zero when no breakout falls in a beta interval
            continue
        blevel = 0
        for i in range(len(blevel_vec)):
            blevel += i * blevel_vec[i]
        blevel = blevel * 1.0 / breakouts_num
        settings = {
            "track_id": track_id,
            "average_reviews_num": avg_normal,
            "blevel_vec": ' '.join(map(str, blevel_vec)),
            "breakouts_num": breakouts_num,
            "blevel": blevel
        }
        conn.insert_or_update(table="breakouts_complements", settings=settings)
        # print(settings)
        print(track_id)
Example #8
def get_X(track_id,
          use_mp3,
          use_lyrics,
          use_artist,
          lyrics_d2v_model,
          d_artist_vec,
          music_datatype="mfcc"):
    conn = MyConn()

    rawmusic_path, vggish_embed_path, lyrics_path, artist = conn.query(
        table="sub_tracks",
        conditions={"track_id": track_id},
        fetchall=False,
        targets=[
            "rawmusic_path", "vggish_embed_path", "lyrics_path", "artist"
        ])

    vecs = []
    if use_mp3:
        if music_datatype == "mfcc":
            music_vec = get_mfcc(rawmusic_path).ravel()
        elif music_datatype == "vggish":
            with open(vggish_embed_path, "rb") as f:
                music_vec = pickle.load(f).detach().numpy()
        else:
            raise ValueError("unknown music_datatype: {}".format(music_datatype))
        vecs.append(music_vec)
    if use_lyrics:
        lyrics_vec = get_d2v_vector(lyrics_path, lyrics_d2v_model)
        vecs.append(lyrics_vec)
    if use_artist:
        artist_vec = d_artist_vec[artist.lower().strip()]
        vecs.append(artist_vec)

    features_vec = concatenate_features(vecs)

    return features_vec
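
A minimal usage sketch mirroring Example #13, which loads the same model and pickle paths (the track id is hypothetical):

lyrics_d2v_model = Doc2Vec.load("../models/d2v/d2v_b1.mod")
with open("../data/artists_vec_dict_r_minmax.pkl", "rb") as f:
    d_artist_vec = pickle.load(f)
features = get_X(track_id=12345, use_mp3=True, use_lyrics=True, use_artist=True,
                 lyrics_d2v_model=lyrics_d2v_model, d_artist_vec=d_artist_vec,
                 music_datatype="vggish")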
Example #9
def update_subtracks_music_words():
    conn = MyConn()
    valid_tracks_db = [
        r[0] for r in conn.query(
            sql="SELECT track_id FROM sub_tracks WHERE is_valid=1")
    ]
    with open("../data/reviews_feature_words_with_freqs/breakouts_wo_simi.json"
              ) as f:
        data = json.load(f)
        valid_tracks_pos = list(
            set([bid.split('-')[0] for bid in data if data[bid]["len"] >= 5]))
    with open(
            "../data/reviews_feature_words_with_freqs/no_breakouts_wo_simi.json"
    ) as f:
        data = json.load(f)
        valid_tracks_neg = [str(tid) for tid in data if data[tid]["len"] >= 5]
    valid_tracks = valid_tracks_pos + valid_tracks_neg
    print(len(valid_tracks_db))
    print(len(valid_tracks), len(valid_tracks_pos), len(valid_tracks_neg))
    for tid in valid_tracks_db:
        if tid not in valid_tracks:
            conn.update(table="sub_tracks",
                        settings={"is_valid": 0},
                        conditions={"track_id": tid})
            print(tid)
Example #10
def check_feature_words():
    conn = MyConn()
    breakouts_feature_words = Counter()
    res = [
        r[0].split() for r in conn.query(targets=["feature_words"],
                                         table="breakouts_feature_words_1")
    ]
    for r in res:
        breakouts_feature_words.update(r)

    valid_breakouts_feature_words = [
        p[0] for p in filter(lambda x: x[1] >= 30,
                             breakouts_feature_words.most_common())
    ]

    no_breakouts_feature_words = Counter()
    res = [
        r[0].split() for r in conn.query(targets=["feature_words"],
                                         table="no_breakouts_feature_words_1")
    ]
    for r in res:
        no_breakouts_feature_words.update(r)

    valid_no_breakouts_feature_words = [
        p[0] for p in filter(lambda x: x[1] >= 30,
                             no_breakouts_feature_words.most_common())
    ]

    intersection = set(valid_breakouts_feature_words).intersection(
        set(valid_no_breakouts_feature_words))
    print("intersection:\n", intersection)
    print("breakouts_unique:\n",
          set(valid_breakouts_feature_words) - intersection)
    print("no_breakouts_unique:\n",
          set(valid_no_breakouts_feature_words) - intersection)
Example #11
def rubbish_tags():
    '''
    + Count the rubbish_tags among each breakout's feature words
    + Filter out samples dominated by rubbish tags as noise
    + Strip rubbish tags from the feature words and upload the result to the database
    '''

    rubbish = open("../resources/rubbish_tags.txt").read().splitlines()
    conn = MyConn()

    records = []
    for res in conn.query(targets=["id", "feature_words"],
                          table="breakouts_feature_words_c3"):
        # if conn.query(table="breakouts", targets=["release_drive"], fetchall=False, conditions={"id": res[0]})[0] == 1:
        #     continue
        feature_words = res[1].split()
        rubbish_count = 0
        for w in feature_words:
            if w in rubbish:
                rubbish_count += 1
        records.append([res[0], rubbish_count, feature_words])

    records.sort(key=lambda x: x[1], reverse=True)
    for r in records:
        print(r)
Example #12
def update_path(table, key_col, col, root_dir, offset, overwrite=False):
    '''
    Update file paths in the database.
    '''
    conn = MyConn()
    count_update = 0
    for root, dirs, files in os.walk(root_dir):
        for file in files:
            if "OS" in file: continue
            filepath = os.path.join(root, file)
            key = file.split('/')[-1][:-offset]  # strip a suffix of length offset from the filename
            res = conn.query(table=table,
                             targets=[col],
                             conditions={key_col: key},
                             fetchall=False)
            if overwrite:
                conn.update(table=table,
                            settings={col: filepath},
                            conditions={key_col: key})
                count_update += 1
            else:
                if res and res[0] is None:
                    conn.update(table=table,
                                settings={col: filepath},
                                conditions={key_col: key})
                    count_update += 1
    print(count_update)
Example #13
def build_dataset():
    conn = MyConn()
    dataset_size = 1500
    # conditional_sql = "rawmusic_path IS NOT NULL AND language in ('ch', 'en')"
    pos_tracks = conn.query(
        sql=
        "SELECT track_id FROM sub_tracks WHERE valid_bnum>0 AND is_valid=1 LIMIT {}"
        .format(dataset_size))
    neg_tracks = conn.query(
        sql=
        "SELECT track_id FROM sub_tracks WHERE valid_bnum=0 AND is_valid=1 LIMIT {}"
        .format(dataset_size))
    lyrics_d2v_model = Doc2Vec.load("../models/d2v/d2v_b1.mod")  # 歌词d2v模型
    with open("../data/artists_vec_dict_r_minmax.pkl", "rb") as f:
        d_artist_vec = pickle.load(f)

    X, y = [], []
    args = {
        "lyrics_d2v_model": lyrics_d2v_model,
        "d_artist_vec": d_artist_vec,
        "use_mp3": True,
        "use_lyrics": True,
        "use_artist": True,
        "music_datatype": "vggish"
    }

    def add_data(tracks, label):
        for t in tracks:
            try:
                X.append(get_X(track_id=t, **args))
                y.append(label)
            except KeyboardInterrupt:
                print("KeyboardInterrupt")
                break
            except:
                print(label, t)
                print(traceback.format_exc())

    add_data(pos_tracks, 1)
    add_data(neg_tracks, 0)

    dataset_index = "0317_vggish"
    dataset_name = "m"*args["use_mp3"] + "l"*args["use_lyrics"] + "a"*args["use_artist"]\
                    + str(len(pos_tracks)) +'_'+ str(dataset_index)
    with open("../data/dataset/{}.pkl".format(dataset_name), 'wb') as f:
        pickle.dump([X, y], f)
Example #14
def chorus_duration_distribution():
    conn = MyConn()
    sql = "SELECT chorus_start, chorus_end FROM tracks WHERE chorus_start IS NOT NULL A ND chorus_end IS NOT NULL"
    res = conn.query(sql=sql)
    res = list(filter(lambda x: x[0] != 0, res))
    print(len(res))
    durations = [p[1] - p[0] for p in res]

    sns.displot(durations)
    plt.show()
Example #15
def get_feature_words_counter(table):
    conn = MyConn()
    counter = Counter()
    res = [
        r[0].split()
        for r in conn.query(targets=["feature_words"], table=table)
    ]
    for r in res:
        counter.update(r)
    return counter
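
A minimal usage sketch, assuming the tables from Example #10 exist:

b_counter = get_feature_words_counter("breakouts_feature_words_1")
nb_counter = get_feature_words_counter("no_breakouts_feature_words_1")
print(b_counter.most_common(10))
print(nb_counter.most_common(10))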
Example #16
def update_subtracks_havesimis():
    conn = MyConn()
    valid_tracks = set([
        r[0] for r in conn.query(
            sql="SELECT track_id FROM breakouts WHERE simi_score>=0.5")
    ])
    for tid in valid_tracks:
        conn.update(table="sub_tracks",
                    settings={"have_simis": 1},
                    conditions={"track_id": tid})
Example #17
def view_reviews_num_curve_html(track_id, save_dir, min_reviews=200):
    '''
    Plot the review-count curve for the given track id with pyecharts, showing:
    + breakouts mapped to dates
    + breakouts mapped to feature_words
    '''

    conn = MyConn()
    json_path = conn.query(targets=["reviews_path"],
                           conditions={"track_id": track_id})
    if len(json_path) > 0:
        json_path = "/Volumes/nmusic/NetEase2020/data" + json_path[0][0]
    else:
        return None
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    df = get_reviews_df(json_path)
    reviews_count, dates = get_reviews_count(df["date"].values)
    breakouts_group = get_breakouts(reviews_count, min_reviews=min_reviews)
    breakouts = [g[0] for g in breakouts_group]

    x, y = dates, reviews_count
    mark_points = []
    for flag, breakout in enumerate(breakouts):
        feature_words = conn.query(
            table="breakouts_feature_words_c3",
            targets=["filtered_feature_words"],
            conditions={"id": '-'.join([track_id, str(flag)])},
            fetchall=False)[0]
        px, beta = breakout
        mark_points.append(
            opts.MarkPointItem(name="{}{}".format(dates[px], feature_words),
                               coord=[dates[px], reviews_count[px]],
                               value=beta))
    c = (Line().add_xaxis(x).add_yaxis(
        "评论曲线",
        y,
        markpoint_opts=opts.MarkPointOpts(data=mark_points),
    ).set_global_opts(title_opts=opts.TitleOpts(
        title="{}".format(track_id))).render(
            os.path.join(save_dir, "{}.html".format(track_id))))
Example #18
def get_reviews_vec(track_id, breakout, w2v_model, key="wo_fake"):
    '''
    Get the review-text vectors for the given track.
    '''
    conn = MyConn()
    if key == "w_fake":
        col = "feature_words"
    elif key == "wo_fake":
        col = "feature_words_wo_fake"
    elif key == "tfidf":
        col = "feature_words_tfidf"
    elif key == "candidates":
        col = "feature_words_candidates"

    if breakout == 1:
        bids = [
            r[0] for r in conn.query(
                sql=
                "SELECT id FROM breakouts WHERE is_valid=1 and simi_score>=0.5 and track_id={}"
                .format(track_id))
        ]
        for bid in bids:
            feature_words = conn.query(
                sql="SELECT {} FROM breakouts_feature_words WHERE id='{}'".
                format(col, bid))
            if feature_words and feature_words[0][0]:
                break
    else:
        feature_words = conn.query(
            sql="SELECT {} FROM no_breakouts_feature_words WHERE track_id={}".
            format(col, track_id))

    if feature_words and feature_words[0][0]:
        feature_words = feature_words[0][0].split()
    else:
        feature_words = []
    # print(breakout, feature_words)
    reviews_vec = []
    for w in feature_words:
        vec = get_w2v_vector(w, w2v_model)
        if vec is not None:
            reviews_vec.append(vec)
    return reviews_vec
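
A minimal usage sketch (hypothetical track id; the w2v model path follows Example #19):

w2v_model = Word2Vec.load("../models/w2v/b1.mod")
reviews_vec = get_reviews_vec(track_id=12345, breakout=1, w2v_model=w2v_model, key="wo_fake")
print(len(reviews_vec))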
Example #19
def get_specific_reviews(track_id, date):
    conn = MyConn()
    w2v_model = Word2Vec.load("../models/w2v/b1.mod")
    filepath = "/Volumes/nmusic/NetEase2020/data" + conn.query(
        targets=["reviews_path"],
        conditions={"track_id": track_id},
        fetchall=False)[0]
    df = get_reviews_df(filepath)
    reviews = df[df["date"] == date]["content"].values
    reviews = "\n".join(reviews)
    # print(reviews)
    top_words = tags_extractor(reviews, topk=30, w2v_model=w2v_model)
    print(top_words)
Example #20
def get_breakouts_num():
    conn = MyConn()
    breakouts = conn.query(targets=["id", "track_id"], table="breakouts")
    track_2_bnum = {}
    for id_, track_id in breakouts:
        if track_id in track_2_bnum:
            track_2_bnum[track_id] += 1
        else:
            track_2_bnum[track_id] = 1
    for k, v in track_2_bnum.items():
        conn.update(table="sub_tracks",
                    settings={"bnum": v},
                    conditions={"track_id": k})
Example #21
def get_tracks_set_db(sql, conditions):
    '''
    Fetch the set of tracks matching the given conditions from the database.
    params:
        + sql: e.g. 'SELECT track_id FROM tracks WHERE have_lyrics=%s'
        + conditions: e.g. {"have_lyrics": 1}
    return: tracks_set
    '''
    conn = MyConn()
    res = conn.query(sql=sql, conditions=conditions)
    res = set([str(r[0]) for r in res])

    return res
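
A minimal usage sketch using the example parameters from the docstring:

tracks_set = get_tracks_set_db(
    sql="SELECT track_id FROM tracks WHERE have_lyrics=%s",
    conditions={"have_lyrics": 1})
print(len(tracks_set))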
Example #22
def divide_artists():
    '''
    Extract the artists involved in breakout and non-breakout samples.
    save: "breakouts_artists.txt": artists involved in breakouts
    save: "no_breakouts_artists.txt": artists involved in non-breakouts
    '''
    conn = MyConn()
    conditions = {"release_drive": 0, "capital_drive": 0, "fake": 0}
    b_tracks = [r[0] for r in conn.query(targets=["distinct(track_id)"], table="breakouts", conditions=conditions)]
    nb_tracks = [r[0] for r in conn.query(targets=["distinct(track_id)"], table="no_breakouts")]

    b_arts, nb_arts = set(), set()
    for t in b_tracks:
        arts = conn.query(targets=["artist"], table="details", conditions={"track_id": t}, fetchall=False)[0].split(",")
        b_arts.update(arts)
    for t in nb_tracks:
        arts = conn.query(targets=["artist"], table="details", conditions={"track_id": t}, fetchall=False)[0].split(",")
        nb_arts.update(arts)

    with open("../data_related/breakouts_artists.txt", 'w') as f:
        f.write("\n".join(b_arts))
    with open("../data_related/no_breakouts_artists.txt", 'w') as f:
        f.write("\n".join(nb_arts))
Example #23
def build_train_test_dataset():
    conn = MyConn()
    random.seed(21)
    train_size, test_size = 3000, 1000
    size = train_size + test_size

    breakouts = random.sample([
        r[0] for r in conn.query(targets=["id"],
                                 conditions={
                                     "have_words": 1,
                                     "have_rawmusic": 1
                                 },
                                 table="breakouts")
    ], size)
    breakouts_train, breakouts_test = breakouts[:train_size], breakouts[train_size:]
    no_breakouts = random.sample([
        r[0] for r in conn.query(targets=["id"],
                                 conditions={
                                     "have_words": 1,
                                     "have_rawmusic": 1
                                 },
                                 table="no_breakouts")
    ], size)
    no_breakouts_train, no_breakouts_test = no_breakouts[:train_size], no_breakouts[train_size:]

    with open("../data/dataset/breakouts_id_train_2.txt", 'w') as f:
        f.write('\n'.join(breakouts_train))
    with open("../data/dataset/breakouts_id_test_2.txt", 'w') as f:
        f.write('\n'.join(breakouts_test))
    with open("../data/dataset/no_breakouts_id_train_2.txt", 'w') as f:
        f.write('\n'.join(no_breakouts_train))
    with open("../data/dataset/no_breakouts_id_test_2.txt", 'w') as f:
        f.write('\n'.join(no_breakouts_test))
Example #24
def test_my_cluster():
    conn = MyConn()
    w2v_path = "../models/w2v/c4.mod"
    rubbish_tags = open(
        "../resources/rubbish_words_for_weather.txt").read().splitlines()
    w2v_model = Word2Vec.load(w2v_path)

    valid_breakouts = conn.query(
        sql=
        "SELECT id, date, reviews_num FROM breakouts WHERE release_drive=0 AND capital_drive=0 AND fake=0"
    )
    valid_breakouts_info_d = dict(
        zip([p[0] for p in valid_breakouts],
            [(p[1], p[2]) for p in valid_breakouts]))
    breakouts_id_tags_p = conn.query(table="breakouts_feature_words_c3",
                                     targets=["id", "clean_feature_words"])

    tags_pool = []
    for id_, tags in breakouts_id_tags_p:
        if id_ in valid_breakouts_info_d:
            b_date, b_size = valid_breakouts_info_d[id_]
            for t in tags.split():
                if t not in rubbish_tags and t in w2v_model.wv:
                    tags_pool.append(Tag(t, b_date, b_size))

    print(len(tags_pool))  # 24796

    cluster_index = "weather"
    my_cluster = ClustersSet(w2v_path=w2v_path, affinity=0.55)
    my_cluster.grow(tags_pool)
    my_cluster.save(
        model_path="../models/my_cluster/my_cluster_{}.pkl".format(
            cluster_index),
        txt_path="../results/my_cluster_{}.txt".format(cluster_index),
        csv_path="../results/my_cluster_{}.csv".format(cluster_index),
        bsizes_csv_path="../results/my_clusters_{}_bsizes.csv".format(
            cluster_index))
Example #25
def add_no_breakouts_feature_words_to_db():
    '''
    Insert rows into the no_breakouts_feature_words table.
    '''
    conn = MyConn()
    w2v_model = Word2Vec.load("../models/w2v/c4.mod")
    rubbish_words_fake = open(
        "../resources/rubbish_words_fake.txt").read().splitlines()
    candidates = open("../resources/music_words_cbm.txt").read().splitlines()
    # tfidf_model = models.TfidfModel.load("../models/bow/corpora_tfidf.model")
    # dictionary = corpora.Dictionary.load("../models/bow/corpora_dict.dict")
    # stoi = dictionary.token2id
    # itos = dict(zip(stoi.values(), stoi.keys()))
    data = conn.query(sql="SELECT id, track_id, text_path FROM no_breakouts")
    d_data = {}
    for id_, track_id, text_path in data:
        if track_id in d_data:
            d_data[track_id].append((id_, text_path))
        else:
            d_data[track_id] = [(id_, text_path)]
    print(len(d_data))

    for track_id, v in d_data.items():
        try:
            text = ""
            for id_, text_path in v:
                text += open(text_path).read()
            feature_words_mode = "candidates"  # raw, stop, tfidf
            col = "feature_words_candidates"  # feature_words, feature_words_wo_fake, feature_words_tfidf
            feature_words = get_feature_words(text,
                                              topk=10,
                                              mode=feature_words_mode,
                                              w2v_model=w2v_model,
                                              candidates=candidates,
                                              return_freq=True)
            for p in feature_words:
                print("{}:{:.3f}".format(p[0], p[1] * 100), end=" ")
            print()
            if len(feature_words) < 5:
                print(track_id, "not enough words.")
                continue
            # feature_words = " ".join(feature_words)
            # conn.insert(table="no_breakouts_feature_words", settings={"id":id_, "track_id":track_id, col:feature_words})
            # conn.update(table="no_breakouts_feature_words", settings={col:feature_words}, conditions={"track_id":track_id})
        except KeyboardInterrupt:
            break
        except:
            print(track_id)
            print(traceback.format_exc())
Example #26
def mark_language():
	'''
	Mark the language of every track in the lyrics library.
	'''
	conn = MyConn()
	enchant_dict = enchant.Dict("en_US")
	for track_id, lyrics_path in conn.query(sql="SELECT track_id, lyrics_path FROM tracks WHERE lyrics_path is not null"):
		with open(lyrics_path) as f:
			content = json.load(f)
		lyrics = replace_noise(content["lrc"]["lyric"])
		lyrics = re.sub(r"( )*[作词|作曲|编曲|制作人|录音|混母带|监制].*\n", "", lyrics)
		if len(lyrics)<10: # effectively no lyrics
			language = "empty"
		else:
			language = _mark_language(lyrics, enchant_dict)
		conn.update(table="tracks", settings={"language":language}, conditions={"track_id":track_id})
Example #27
def build_tfidf_model(tracks_set):
	'''
	Data sources: breakout_tracks_set, no_breakout_tracks_set
	Method: randomly sample at most 1000 reviews per track, take topk=20 words to build each doc
	'''

	conn = MyConn()
	w2v_model = models.Word2Vec.load("/Users/inkding/Desktop/partial_netease/models/word2vec/b1.mod")
	files = []
	for track_id in tracks_set:
		files.append(conn.query(targets=["text_path"], conditions={"track_id": track_id}, fetchall=False)[0])

	docs = []
	for i, file in enumerate(files):
		print(i)
		# content = open(file).read()[:1000]
		content = open(file).read().splitlines()
		content = random.sample(content, min(100, len(content)))
		content = "\n".join(content)
		docs.append(tags_extractor(content, topk=20, w2v_model=w2v_model))
		if i==50: 
			for d in docs:
				print(d)
			break


	dictionary = corpora.Dictionary(docs)
	bows = [dictionary.doc2bow(doc) for doc in docs]
	tfidf_model = models.TfidfModel(bows)

	dictionary.save('../models/bow/1/corpora_dict.dict') # reload with corpora.Dictionary.load(path)
	tfidf_model.save('../models/bow/1/corpora_tfidf.model') # reload with models.TfidfModel.load(path)

	# build the vocabulary mappings
	stoi = dictionary.token2id
	print("words num:",len(stoi))
	itos = dict(zip(stoi.values(), stoi.keys()))

	# test
	for i in range(20):
		test_doc = docs[i]
		test_bow = dictionary.doc2bow(test_doc)
		# get the tf-idf representation
		test_tfidf = sorted(tfidf_model[test_bow], key=lambda x:x[1], reverse=True)
		print(test_doc)
		for item in test_tfidf[:5]:
			print(itos[item[0]], item[1])
		print()
Example #28
def basic_analysis(tracks_set):
    '''
    Basic analysis of the given track set: review counts, time span, ...
    '''
    conn = MyConn()
    # data preparation
    data = []
    targets = ["track_id", "tags", "reviews_num", "first_review", "last_review"]
    for tid in tracks_set:
        res = conn.query(targets=targets, conditions={"track_id": int(tid)})
        data.append(res[0])
    
    df = pd.DataFrame(data, columns=targets)
    # df.to_csv("../results/main_tagged_tracks/basic_info.csv", encoding="utf_8_sig", index=False)

    draw_hist(df["reviews_num"].values, log_scale=True, color="tab:orange")
Example #29
def add_no_breakouts_feature_words_to_json():
    conn = MyConn()
    w2v_model = Word2Vec.load("../models/w2v/c4.mod")
    rubbish_words_fake = open(
        "../resources/rubbish_words_fake.txt").read().splitlines()
    candidates = open("../resources/music_words/music_words_cls_pos_pred.txt"
                      ).read().splitlines()
    # remove = open("../resources/music_words/music_words_similar.txt").read().splitlines()
    # candidates = [w for w in candidates if w not in remove]
    data = conn.query(sql="SELECT id, track_id, text_path FROM no_breakouts")
    d_data = {}
    for id_, track_id, text_path in data:
        if track_id in d_data:
            d_data[track_id].append((id_, text_path))
        else:
            d_data[track_id] = [(id_, text_path)]
    print(len(d_data))

    json_data = {}
    for track_id, v in list(d_data.items()):
        try:
            text = ""
            for id_, text_path in v:
                text += open(text_path).read()
            feature_words_mode = "candidates"  # raw, stop, tfidf
            feature_words = get_feature_words(text,
                                              topk=10,
                                              mode=feature_words_mode,
                                              w2v_model=w2v_model,
                                              candidates=candidates,
                                              return_freq=True)
            if len(feature_words) < 5:
                print(track_id, "not enough words.")
                if not feature_words:  # unpacking zip(*[]) below would fail
                    continue
            words, freqs = zip(*feature_words)
            json_data[track_id] = {
                "words": words,
                "freqs": freqs,
                "len": len(words)
            }
        except KeyboardInterrupt:
            break
        except:
            print(track_id)
            print(traceback.format_exc())
    with open("../data/reviews_feature_words_with_freqs/no_breakouts_cls.json",
              'w') as f:
        json.dump(json_data, f, indent=2, ensure_ascii=False)
Example #30
def view_reviews_num_curve(track_id, min_reviews=200, save_path=None):
    '''
    Plot the review-count curve for the given track id, with breakout points highlighted.
    '''
    conn = MyConn()

    json_path = conn.query(targets=["reviews_path"],
                           conditions={"track_id": track_id})
    if len(json_path) > 0:
        json_path = "/Volumes/nmusic/NetEase2020/data" + json_path[0][0]
    else:
        return None

    df = get_reviews_df(json_path)
    reviews_count, dates = get_reviews_count(df["date"].values)
    breakouts_group = get_breakouts(reviews_count, min_reviews=min_reviews)

    fig, ax = plt.subplots()
    x = list(range(len(reviews_count)))
    ax.plot(x, reviews_count)
    ax.xaxis.set_major_formatter(plt.NullFormatter())

    palette = plt.get_cmap('Paired')(np.linspace(0, 1, 10))
    y_head, beta_head = [], []
    for i in range(min(len(breakouts_group), 10)):
        x = list(zip(*breakouts_group[i]))[0]
        y = [reviews_count[i] for i in x]
        y_head.append(y[0])
        beta_head.append(breakouts_group[i][0][1])
        ax.scatter(x=x, y=y, color=palette[i])
        ax.xaxis.set_major_formatter(plt.NullFormatter())
    ax.set_xlabel("time")
    ax.set_ylabel("reviews_num")

    # text = '\n'.join(["count:{}, beta:{}".format(y_head[i], beta_head[i])
    #                      for i in range(len(y_head))])
    # ax.text(0, 1, text, verticalalignment="top", horizontalalignment="left", transform=ax.transAxes)

    if save_path:
        if not os.path.exists(os.path.dirname(save_path)):
            os.makedirs(os.path.dirname(save_path))
        plt.savefig(save_path)
    else:
        plt.show()

    plt.close()
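
A minimal usage sketch (track id and save path are hypothetical):

view_reviews_num_curve("25643093", min_reviews=200,
                       save_path="../results/curves/25643093.png")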