def get_X(track_id, use_mp3, use_lyrics, use_artist, lyrics_d2v_model, d_artist_vec, music_datatype="mfcc"):
    """Build the concatenated feature vector for one track.

    Params:
        track_id: key into the sub_tracks table.
        use_mp3 / use_lyrics / use_artist: which feature groups to include.
        lyrics_d2v_model: Doc2Vec model used to embed the lyrics file.
        d_artist_vec: dict mapping lower-cased, stripped artist name -> vector.
        music_datatype: "mfcc" (computed from raw audio) or "vggish"
            (pre-pickled torch embedding).

    Returns:
        The vector produced by concatenate_features over the selected parts.

    Raises:
        ValueError: if music_datatype is not a known option.
    """
    conn = MyConn()
    rawmusic_path, vggish_embed_path, lyrics_path, artist = conn.query(
        table="sub_tracks",
        conditions={"track_id": track_id},
        fetchall=False,
        targets=["rawmusic_path", "vggish_embed_path", "lyrics_path", "artist"])

    vecs = []
    if use_mp3:
        if music_datatype == "mfcc":
            music_vec = get_mfcc(rawmusic_path).ravel()
        elif music_datatype == "vggish":
            with open(vggish_embed_path, "rb") as f:
                music_vec = pickle.load(f).detach().numpy()
        else:
            # Fix: an unknown datatype used to fall through and crash with
            # NameError on the append below; fail fast with a clear message.
            raise ValueError("unknown music_datatype: {}".format(music_datatype))
        vecs.append(music_vec)
    if use_lyrics:
        vecs.append(get_d2v_vector(lyrics_path, lyrics_d2v_model))
    if use_artist:
        # Artist names are normalized the same way the d_artist_vec keys were built.
        vecs.append(d_artist_vec[artist.lower().strip()])
    return concatenate_features(vecs)
def get_reviews_topk_words(track_id, is_breakout, key):
    """Fetch the stored top-k feature words for a track.

    Params:
        track_id: track to look up.
        is_breakout: 1 -> read from breakouts_feature_words (first valid
            breakout with a non-empty row wins); otherwise read from
            no_breakouts_feature_words.
        key: which feature-word variant column to read.

    Returns:
        List of words (possibly empty when nothing is stored).

    Raises:
        ValueError: if key is not a known variant.
    """
    conn = MyConn()
    # Fix: unknown keys used to fall through the if/elif chain and raise
    # NameError on `col` below; dispatch via a dict and fail fast instead.
    d_cols = {
        "w_fake": "feature_words",
        "wo_fake": "feature_words_wo_fake",
        "tfidf": "feature_words_tfidf",
        "candidates": "feature_words_candidates",
    }
    if key not in d_cols:
        raise ValueError("unknown key: {}".format(key))
    col = d_cols[key]

    # Fix: when is_breakout==1 and no valid breakout rows exist, the loop
    # body never ran and `feature_words` was undefined (NameError).
    feature_words = []
    if is_breakout == 1:
        bids = [
            r[0] for r in conn.query(
                sql=
                "SELECT id FROM breakouts WHERE is_valid=1 and simi_score>=0.5 and track_id={}"
                .format(track_id))
        ]
        for bid in bids:
            feature_words = conn.query(
                sql="SELECT {} FROM breakouts_feature_words WHERE id='{}'".
                format(col, bid))
            # stop at the first breakout that actually has stored words
            if feature_words and feature_words[0][0]:
                break
    else:
        feature_words = conn.query(
            sql="SELECT {} FROM no_breakouts_feature_words WHERE track_id={}".
            format(col, track_id))
    if len(feature_words) > 0:
        feature_words = feature_words[0][0].split()
    return feature_words
def rubbish_tags():
    '''
    + Count the rubbish tags among each breakout's feature words.
    + Samples dominated by rubbish tags can then be filtered out as noise.
    + Prints the records sorted by rubbish count, highest first.
    '''
    # Use a set for O(1) membership tests (was an O(n) list scan per word)
    # and close the file handle deterministically.
    with open("../resources/rubbish_tags.txt") as f:
        rubbish = set(f.read().splitlines())
    conn = MyConn()
    records = []
    for res in conn.query(targets=["id", "feature_words"],
                          table="breakouts_feature_words_c3"):
        feature_words = res[1].split()
        rubbish_count = sum(1 for w in feature_words if w in rubbish)
        records.append([res[0], rubbish_count, feature_words])
    # Most rubbish-laden samples first.
    records.sort(key=lambda x: x[1], reverse=True)
    for r in records:
        print(r)
def create_artists_table():
    '''
    Populate the `artists` table (id, nid, name) in the database from the
    crawled track-detail JSON files.
    '''
    details_dir = "/Volumes/nmusic/NetEase2020/data/simple_proxied_tracks_details"
    pairs = set()  # (nid, name), deduplicated across all files
    conn = MyConn()
    for root, dirs, files in os.walk(details_dir):
        for fname in files:
            if "DS" in fname:  # skip .DS_Store artifacts
                continue
            path = os.path.join(root, fname)
            with open(path) as fp:
                payload = json.load(fp)
            try:
                for entry in payload["songs"][0]["ar"]:
                    pairs.add((entry["id"], entry["name"]))
            except KeyboardInterrupt:
                print("interrupted by keyboard.")
                sys.exit(0)
            except Exception as err:
                print(path, err)
    print(len(pairs))
    for nid, ar_name in pairs:
        conn.insert(table="artists", settings={"nid": nid, "name": ar_name})
def check_breakouts():
    """Cross-check detected breakouts against review-partition windows.

    For each sub_track with at least one breakout (bnum>0), partition its
    review timeline, count how many "clean" breakouts (not release-, fake-
    or capital-driven) fall inside each partition window, and print a
    per-track summary of the non-empty windows.
    """
    conn = MyConn()
    w2v_model = Word2Vec.load("../models/w2v/c3.mod")
    tracks = conn.query(
        sql="SELECT track_id, json_path FROM sub_tracks WHERE bnum>0")
    # NOTE(review): starts at offset 70 — presumably resuming an earlier
    # partial run; confirm before reusing.
    for track_id, filepath in tracks[70:]:
        d_reviews_partitions = get_reviews_partitions(filepath, w2v_model,
                                                      merge_num=2)
        # print(d_reviews_partitions)
        breakouts = conn.query(table="breakouts",
                               targets=["date", "reviews_num"],
                               conditions={
                                   "track_id": track_id,
                                   "release_drive": 0,
                                   "fake": 0,
                                   "capital_drive": 0
                               })
        if not breakouts:
            continue
        # One counter per partition window; keys look like (start, end) pairs.
        d_bcount = dict(
            zip(d_reviews_partitions.keys(), [0] * len(d_reviews_partitions)))
        for dt, reviews_num in breakouts:
            date = datetime.strftime(dt, '%Y-%m-%d')
            for k in d_reviews_partitions:
                # k[0]/k[1] are compared to an ISO date string, so they are
                # presumably 'YYYY-MM-DD' strings too — lexicographic
                # comparison then matches chronological order.
                if k[0] <= date and date <= k[1]:
                    d_bcount[k] += 1
                    break
        print(track_id)
        for k, v in d_bcount.items():
            if v > 0:
                print("{} - {}: {} [count: {}]".format(
                    k[0], k[1], d_reviews_partitions[k], d_bcount[k]))
def check_feature_words():
    """Compare frequent feature words of breakout vs non-breakout tracks.

    Collects words occurring >= 30 times in each table, then prints their
    intersection and the words unique to each side.
    """
    conn = MyConn()

    breakouts_feature_words = Counter()
    res = [
        r[0].split() for r in conn.query(targets=["feature_words"],
                                         table="breakouts_feature_words_1")
    ]
    for r in res:
        breakouts_feature_words.update(r)
    valid_breakouts_feature_words = [
        p[0] for p in filter(lambda x: x[1] >= 30,
                             breakouts_feature_words.most_common())
    ]

    # Fix: this computation was commented out, so the code below crashed
    # with NameError on valid_no_breakouts_feature_words.
    no_breakouts_feature_words = Counter()
    res = [
        r[0].split() for r in conn.query(targets=["feature_words"],
                                         table="no_breakouts_feature_words_1")
    ]
    for r in res:
        no_breakouts_feature_words.update(r)
    valid_no_breakouts_feature_words = [
        p[0] for p in filter(lambda x: x[1] >= 30,
                             no_breakouts_feature_words.most_common())
    ]

    intersection = set(valid_breakouts_feature_words).intersection(
        set(valid_no_breakouts_feature_words))
    print("intersection:\n", intersection)
    print("breakouts_unique:\n",
          set(valid_breakouts_feature_words) - intersection)
    print("no_breakouts_unique:\n",
          set(valid_no_breakouts_feature_words) - intersection)
def in_tags_analysis(breakouts_set, no_breakouts_set):
    '''
    Analyze the built-in (platform) tag distribution of the given track sets
    and draw a per-tag percentage comparison bar chart.

    Params:
        breakouts_set / no_breakouts_set: iterables of track ids.
    '''
    tags = open("../data/metadata/自带tags.txt").read().splitlines()
    # tag -> list of track ids carrying it, one dict per track set
    breakouts_tags_d = {}
    no_breakouts_tags_d = {}
    for t in tags:
        breakouts_tags_d[t] = []
        no_breakouts_tags_d[t] = []
    conn = MyConn()
    # NOTE(review): conn.query is called without table= here — presumably
    # the helper has a default table containing "tags"; confirm.
    for tid in breakouts_set:
        res = conn.query(targets=["tags"], conditions={"track_id": tid})[0]
        for t in res[0].split():
            breakouts_tags_d[t].append(tid)
    for tid in no_breakouts_set:
        res = conn.query(targets=["tags"], conditions={"track_id": tid})[0]
        for t in res[0].split():
            no_breakouts_tags_d[t].append(tid)
    tags_count = []
    for k in breakouts_tags_d:
        # NOTE(review): 1748 and the /10 (i.e. /1000*100) look like the
        # hard-coded sizes of the two track sets — confirm they still match
        # the inputs, otherwise the percentages are wrong.
        tags_count.append((k, (float(
            format(len(breakouts_tags_d[k]) / 1748 * 100, '.2f')),
                               float(format(len(no_breakouts_tags_d[k]) / 10,
                                            '.2f')))))
    # sort ascending by the breakout-set percentage
    tags_count = sorted(tags_count, key=lambda x: x[1][0], reverse=False)
    draw_bar(dict(tags_count), "../data/main_tagged_tracks/tags_count.html")
def update_subtracks_music_words():
    """Invalidate sub_tracks rows whose review feature words are too sparse.

    A track stays valid only if it has >= 5 feature words in either the
    breakout or the no-breakout feature-word JSON dump; any other currently
    valid track is flipped to is_valid=0.
    """
    conn = MyConn()
    valid_tracks_db = [
        r[0] for r in conn.query(
            sql="SELECT track_id FROM sub_tracks WHERE is_valid=1")
    ]
    with open("../data/reviews_feature_words_with_freqs/breakouts_wo_simi.json"
              ) as f:
        data = json.load(f)
    # Breakout ids look like "<track_id>-<n>": keep tracks that have at
    # least one sufficiently wordy breakout.
    valid_tracks_pos = list(
        set([bid.split('-')[0] for bid in data if data[bid]["len"] >= 5]))
    with open(
            "../data/reviews_feature_words_with_freqs/no_breakouts_wo_simi.json"
    ) as f:
        data = json.load(f)
    valid_tracks_neg = [str(tid) for tid in data if data[tid]["len"] >= 5]
    valid_tracks = valid_tracks_pos + valid_tracks_neg
    print(len(valid_tracks_db))
    print(len(valid_tracks), len(valid_tracks_pos), len(valid_tracks_neg))
    # Perf fix: membership test against a set instead of an O(n) list scan
    # inside the loop (was accidentally O(n*m)).
    valid_tracks_set = set(valid_tracks)
    for tid in valid_tracks_db:
        if tid not in valid_tracks_set:
            conn.update(table="sub_tracks",
                        settings={"is_valid": 0},
                        conditions={"track_id": tid})
            print(tid)
def artist_vec_from_tags(min_tags_num=2):
    """Build a MinMax-scaled tag-count vector per artist and pickle the dict.

    Params:
        min_tags_num: minimum total tag count an artist needs to be kept.
    """
    conn = MyConn()
    artists = conn.query(table="artists", targets=["name", "nid"])
    tracks_artists = conn.query(table="details", targets=["track_id", "artists"])
    d_artist_tracks = {}  # artist name -> list of that artist's track ids
    for ar, nid in artists:
        # nid "0" marks artists without a real platform id — skip them
        if nid=="0": continue
        d_artist_tracks[ar.lower().strip()] = []
    tracks = set()
    for tid, t_artists in tracks_artists:
        tracks.add(tid)
        # details.artists is a comma-separated name list; normalize casing
        t_artists = t_artists.lower().strip().split(",")
        for ar in t_artists:
            if ar in d_artist_tracks:
                d_artist_tracks[ar].append(tid)
    tracks_tags = conn.query(sql="SELECT track_id, tags FROM tracks")
    tags = open("../data_related/自带tags.txt").read().splitlines()
    d_tag_index = dict([(t, i) for i, t in enumerate(tags)])
    d_track_tags_count = {}  # track id -> tag count vector
    for tid, t_tags in tracks_tags:
        if tid not in tracks: continue
        t_vec = np.zeros((len(tags),))
        t_tags = t_tags.split()
        for t in t_tags:
            t_vec[d_tag_index[t]] += 1
        d_track_tags_count[tid] = t_vec
    d_artist_tags_count = {}  # artist -> summed tag vector over their tracks
    for ar, ar_tracks in d_artist_tracks.items():
        if len(ar_tracks)==0: continue
        # NOTE(review): raises KeyError if a details track id is missing from
        # the tracks table — presumably the data guarantees the overlap; confirm.
        ar_vec = np.sum(np.array([d_track_tags_count[tid] for tid in ar_tracks]), axis=0)
        if np.sum(ar_vec, axis=None)>=min_tags_num:
            d_artist_tags_count[ar] = ar_vec
    artists = list(d_artist_tags_count.keys())
    ar_vecs = list(d_artist_tags_count.values())
    # Transpose before scaling: the scaler works column-wise, so after .T
    # each column is one artist and each artist vector is scaled on its own.
    ar_vecs = np.mat(ar_vecs).T
    # scaled_ar_vecs = StandardScaler().fit_transform(ar_vecs) # mean=0, std=1
    scaled_ar_vecs = MinMaxScaler().fit_transform(ar_vecs)  # [0,1]
    scaled_ar_vecs = np.mat(scaled_ar_vecs).T
    # statistics
    tags_count = np.sum(np.array(ar_vecs), axis=0)
    # for i in range(len(tags)): print(tags[i], tags_count[i])
    print(len(artists))
    d_artist_vec = {}
    for i in range(len(artists)):
        d_artist_vec[artists[i]] = np.array(scaled_ar_vecs[i]).ravel()
    # d_artist_vec = dict(zip(artists, scaled_ar_vecs))
    with open("../data/r_minmax_artists_vec_dict.pkl", "wb") as f:
        pickle.dump(d_artist_vec, f)
def update_subtracks_havesimis():
    """Flag sub_tracks that have at least one breakout with simi_score >= 0.5."""
    conn = MyConn()
    rows = conn.query(
        sql="SELECT track_id FROM breakouts WHERE simi_score>=0.5")
    for tid in {row[0] for row in rows}:
        conn.update(table="sub_tracks",
                    settings={"have_simis": 1},
                    conditions={"track_id": tid})
def chorus_duration_distribution():
    """Plot the distribution of chorus durations (chorus_end - chorus_start).

    Rows with chorus_start == 0 are treated as unreliable and dropped.
    """
    conn = MyConn()
    # Fix: the statement previously read "... IS NOT NULL A ND ..." — the
    # split "A ND" keyword made the SQL invalid.
    sql = ("SELECT chorus_start, chorus_end FROM tracks "
           "WHERE chorus_start IS NOT NULL AND chorus_end IS NOT NULL")
    res = conn.query(sql=sql)
    res = list(filter(lambda x: x[0] != 0, res))
    print(len(res))
    durations = [p[1] - p[0] for p in res]
    sns.displot(durations)
    plt.show()
def get_feature_words_counter(table):
    """Return a Counter over all feature words stored in *table*."""
    conn = MyConn()
    counter = Counter()
    for row in conn.query(targets=["feature_words"], table=table):
        counter.update(row[0].split())
    return counter
def get_breakouts_num():
    """Count breakouts per track and persist the count as sub_tracks.bnum."""
    conn = MyConn()
    breakouts = conn.query(targets=["id", "track_id"], table="breakouts")
    # One tally per track_id (the breakout id itself is not needed).
    track_2_bnum = Counter(track_id for _, track_id in breakouts)
    for track_id, bnum in track_2_bnum.items():
        conn.update(table="sub_tracks",
                    settings={"bnum": bnum},
                    conditions={"track_id": track_id})
def get_tracks_set_db(sql, conditions):
    '''
    Fetch a set of matching track ids from the database.

    params:
        sql: e.g. 'SELECT track_id FROM tracks WHERE have_lyrics=%s'
        conditions: e.g. {"have_lyrics": 1}
    return:
        set of track ids as strings
    '''
    conn = MyConn()
    rows = conn.query(sql=sql, conditions=conditions)
    return {str(row[0]) for row in rows}
def get_specific_reviews(track_id, date):
    """Print the top review keywords for one track on one specific day."""
    conn = MyConn()
    w2v_model = Word2Vec.load("../models/w2v/b1.mod")
    rel_path = conn.query(targets=["reviews_path"],
                          conditions={"track_id": track_id},
                          fetchall=False)[0]
    filepath = "/Volumes/nmusic/NetEase2020/data" + rel_path
    df = get_reviews_df(filepath)
    # All of that day's review texts, joined into one document.
    day_reviews = "\n".join(df[df["date"] == date]["content"].values)
    # print(day_reviews)
    top_words = tags_extractor(day_reviews, topk=30, w2v_model=w2v_model)
    print(top_words)
def add_no_breakouts_feature_words_to_db():
    '''
    Add rows to the no_breakouts_feature_words table.

    Groups no_breakouts rows by track, concatenates each track's review
    texts, extracts candidate feature words (with frequencies) and — once
    the insert/update lines are re-enabled — persists them per track.
    '''
    conn = MyConn()
    w2v_model = Word2Vec.load("../models/w2v/c4.mod")
    # presumably inputs for the "wo_fake"/"tfidf" modes — unused while
    # feature_words_mode is "candidates"; confirm before removing
    rubbish_words_fake = open(
        "../resources/rubbish_words_fake.txt").read().splitlines()
    candidates = open("../resources/music_words_cbm.txt").read().splitlines()
    # tfidf_model = models.TfidfModel.load("../models/bow/corpora_tfidf.model")
    # dictionary = corpora.Dictionary.load("../models/bow/corpora_dict.dict")
    # stoi = dictionary.token2id
    # itos = dict(zip(stoi.values(), stoi.keys()))
    data = conn.query(sql="SELECT id, track_id, text_path FROM no_breakouts")
    # group (id, text_path) pairs by track id
    d_data = {}
    for id_, track_id, text_path in data:
        if track_id in d_data:
            d_data[track_id].append((id_, text_path))
        else:
            d_data[track_id] = [(id_, text_path)]
    print(len(d_data))
    for track_id, v in d_data.items():
        try:
            # concatenate all review texts belonging to this track
            text = ""
            for id_, text_path in v:
                text += open(text_path).read()
            feature_words_mode = "candidates"  # raw, stop, tfidf
            col = "feature_words_candidates"  # feature_words, feature_words_wo_fake, feature_words_tfidf
            feature_words = get_feature_words(text,
                                              topk=10,
                                              mode=feature_words_mode,
                                              w2v_model=w2v_model,
                                              candidates=candidates,
                                              return_freq=True)
            # (word, freq) pairs; freq printed as a percentage
            for p in feature_words:
                print("{}:{:.3f}".format(p[0], p[1] * 100), end=" ")
            print()
            # tracks with too few words are skipped, not persisted
            if len(feature_words) < 5:
                print(track_id, "not enough words.")
                continue
            # feature_words = " ".join(feature_words)
            # conn.insert(table="no_breakouts_feature_words", settings={"id":id_, "track_id":track_id, col:feature_words})
            # conn.update(table="no_breakouts_feature_words", settings={col:feature_words}, conditions={"track_id":track_id})
        except KeyboardInterrupt:
            break
        except:
            # log the failing track and continue with the rest
            print(track_id)
            print(traceback.format_exc())
def mark_language():
    '''
    Mark the language of every lyric-bearing track in the database.

    Tracks whose cleaned lyrics are shorter than 10 characters are marked
    "empty"; all others get the label computed by _mark_language().
    '''
    conn = MyConn()
    enchant_dict = enchant.Dict("en_US")
    for track_id, lyrics_path in conn.query(
            sql="SELECT track_id, lyrics_path FROM tracks WHERE lyrics_path is not null"):
        with open(lyrics_path) as f:
            content = json.load(f)
        lyrics = replace_noise(content["lrc"]["lyric"])
        # Strip credit lines (lyricist/composer/arranger/...) before detection.
        # Fix: the pattern used [作词|作曲|...] — a character class matching any
        # single listed character (or '|') — instead of the intended alternation.
        lyrics = re.sub(r"( )*(作词|作曲|编曲|制作人|录音|混母带|监制).*\n", "", lyrics)
        if len(lyrics) < 10:
            # Fix: "empty" used to be assigned and then unconditionally
            # overwritten by _mark_language(); keep it on this branch only.
            language = "empty"
        else:
            language = _mark_language(lyrics, enchant_dict)
        conn.update(table="tracks",
                    settings={"language": language},
                    conditions={"track_id": track_id})
def build_tfidf_model(tracks_set):
    '''
    Build a TF-IDF model over per-track review keyword documents.

    Data source: breakout_tracks_set, no_breakout_tracks_set.
    Method (per original note): randomly sample up to 1000 reviews per track
    and take topk=20 keywords to form each doc. (NOTE(review): the code
    actually samples at most 100 lines — confirm which is intended.)
    '''
    conn = MyConn()
    w2v_model = models.Word2Vec.load("/Users/inkding/Desktop/partial_netease/models/word2vec/b1.mod")
    files = []
    for track_id in tracks_set:
        files.append(conn.query(targets=["text_path"], conditions={"track_id": track_id}, fetchall=False)[0])
    docs = []
    for i, file in enumerate(files):
        print(i)
        # content = open(file).read()[:1000]
        content = open(file).read().splitlines()
        content = random.sample(content, min(100, len(content)))
        content = "\n".join(content)
        docs.append(tags_extractor(content, topk=20, w2v_model=w2v_model))
        # NOTE(review): stops after 51 files and dumps the docs — looks like
        # leftover debugging; confirm before a production run.
        if i==50:
            for d in docs:
                print(d)
            break
    dictionary = corpora.Dictionary(docs)
    bows = [dictionary.doc2bow(doc) for doc in docs]
    tfidf_model = models.TfidfModel(bows)
    dictionary.save('../models/bow/1/corpora_dict.dict')  # reload with corpora.Dictionary.load(path)
    tfidf_model.save('../models/bow/1/corpora_tfidf.model')  # reload with models.TfidfModel.load(path)
    # vocabulary lookup tables (token -> id and id -> token)
    stoi = dictionary.token2id
    print("words num:", len(stoi))
    itos = dict(zip(stoi.values(), stoi.keys()))
    # sanity check: show the top tf-idf terms of the first 20 docs
    for i in range(20):
        test_doc = docs[i]
        test_bow = dictionary.doc2bow(test_doc)
        # tf-idf representation, highest weight first
        test_tfidf = sorted(tfidf_model[test_bow], key=lambda x: x[1], reverse=True)
        print(test_doc)
        for item in test_tfidf[:5]:
            print(itos[item[0]], item[1])
        print()
def basic_analysis(tracks_set):
    '''
    Basic analysis of a given track set: review counts, time span, ...
    '''
    conn = MyConn()
    # gather one row of metadata per track
    targets = ["track_id", "tags", "reviews_num", "first_review", "last_review"]
    rows = [
        conn.query(targets=targets, conditions={"track_id": int(tid)})[0]
        for tid in tracks_set
    ]
    df = pd.DataFrame(rows, columns=targets)
    # df.to_csv("../results/main_tagged_tracks/basic_info.csv", encoding="utf_8_sig", index=False)
    draw_hist(df["reviews_num"].values, log_scale=True, color="tab:orange")
def get_reviews_vec_with_freq(track_id, breakout, w2v_model, d_breakouts, d_no_breakouts, d_pos_track_breakout, with_freq=True):
    """Build one w2v vector per feature word of a track, optionally with the
    word's (x100 scaled) frequency appended as an extra dimension."""
    conn = MyConn()
    if breakout:
        bid = d_pos_track_breakout[track_id]
        entry = d_breakouts[bid]
    else:
        entry = d_no_breakouts[track_id]
    feature_words = entry["words"]
    freqs = entry["freqs"]
    # flag suspiciously sparse tracks on stdout
    if len(feature_words) < 5:
        print(track_id, breakout)
    reviews_vec = []
    for idx, word in enumerate(feature_words):
        vec = get_w2v_vector(word, w2v_model)
        if with_freq:
            vec = np.concatenate((vec, np.array([freqs[idx] * 100])))
        reviews_vec.append(vec)
    return reviews_vec
def build_dataset_embed(w_path):
    """Build and pickle an embedded dataset [X, y] of breakout (label 1) vs
    non-breakout (label 0) tracks using the pretrained extractor/embed models.

    Params:
        w_path: output pickle path.
    """
    # ts1 = open("../data/main_tagged_tracks/tracks.txt").read().splitlines()[:1000]
    ts1 = list(
        pd.read_json("../data/breakouts-u2.json")["track_id"].unique())[:1000]
    ts2 = open("../data/no_breakouts_tracks.txt").read().splitlines()[:1000]
    print(len(ts1), len(ts2))
    # label: 1 = breakout, 0 = no breakout
    tracks_set = [(tid, 1) for tid in ts1]
    tracks_set += [(tid, 0) for tid in ts2]
    # load the models
    conn = MyConn()
    d2v_model = Doc2Vec.load("../models/d2v/d2v_a1.mod")
    config = Config()
    mf_path = "MyModel/models/3/mf_extractor-e3.pkl"
    if_path = "MyModel/models/3/if_embed-e3.pkl"
    music_feature_extractor = MusicFeatureExtractor(config)
    music_feature_extractor.load_state_dict(torch.load(mf_path))
    intrinsic_feature_embed = IntrinsicFeatureEmbed(config)
    intrinsic_feature_embed.load_state_dict(torch.load(if_path))
    # inference only — switch off training-mode behaviour
    music_feature_extractor.eval()
    intrinsic_feature_embed.eval()
    X, y = get_X_y_embed(dict(tracks_set), conn, d2v_model,
                         music_feature_extractor, intrinsic_feature_embed)
    with open(w_path, 'wb') as f:
        pickle.dump([X, y], f)
def add_no_breakouts_feature_words_to_json():
    """Extract candidate feature words (with frequencies) for every
    no-breakout track and dump the result to a JSON file."""
    conn = MyConn()
    w2v_model = Word2Vec.load("../models/w2v/c4.mod")
    # presumably an input for the "wo_fake" mode — unused while
    # feature_words_mode is "candidates"; confirm before removing
    rubbish_words_fake = open(
        "../resources/rubbish_words_fake.txt").read().splitlines()
    candidates = open("../resources/music_words/music_words_cls_pos_pred.txt"
                      ).read().splitlines()
    # remove = open("../resources/music_words/music_words_similar.txt").read().splitlines()
    # candidates = [w for w in candidates if w not in remove]
    data = conn.query(sql="SELECT id, track_id, text_path FROM no_breakouts")
    # group (id, text_path) pairs by track id
    d_data = {}
    for id_, track_id, text_path in data:
        if track_id in d_data:
            d_data[track_id].append((id_, text_path))
        else:
            d_data[track_id] = [(id_, text_path)]
    print(len(d_data))
    json_data = {}
    for track_id, v in list(d_data.items()):
        try:
            # concatenate all review texts of this track
            text = ""
            for id_, text_path in v:
                text += open(text_path).read()
            feature_words_mode = "candidates"  # raw, stop, tfidf
            feature_words = get_feature_words(text,
                                              topk=10,
                                              mode=feature_words_mode,
                                              w2v_model=w2v_model,
                                              candidates=candidates,
                                              return_freq=True)
            words, freqs = zip(*feature_words)
            json_data[track_id] = {
                "words": words,
                "freqs": freqs,
                "len": len(words)
            }
            # sparse tracks are still stored, but flagged on stdout
            if len(feature_words) < 5:
                print(track_id, "not enough words.")
        except KeyboardInterrupt:
            break
        except:
            # log the failing track and continue with the rest
            print(track_id)
            print(traceback.format_exc())
    with open("../data/reviews_feature_words_with_freqs/no_breakouts_cls.json",
              'w') as f:
        json.dump(json_data, f, indent=2, ensure_ascii=False)
def view_reviews_num_curve(track_id, min_reviews=200, save_path=None):
    '''
    Plot the daily review-count curve of the given track (breakout points marked).

    params:
        track_id: track to plot; silently returns None if it has no reviews_path.
        min_reviews: threshold passed to the breakout detector.
        save_path: if given, save the figure there (creating the directory);
            otherwise show it interactively.
    '''
    conn = MyConn()
    json_path = conn.query(targets=["reviews_path"],
                           conditions={"track_id": track_id})
    if len(json_path) > 0:
        json_path = "/Volumes/nmusic/NetEase2020/data" + json_path[0][0]
    else:
        return None
    df = get_reviews_df(json_path)
    reviews_count, dates = get_reviews_count(df["date"].values)
    breakouts_group = get_breakouts(reviews_count, min_reviews=min_reviews)
    fig, ax = plt.subplots()
    x = list(range(len(reviews_count)))
    ax.plot(x, reviews_count)
    ax.xaxis.set_major_formatter(plt.NullFormatter())
    # one colour per breakout group; at most 10 groups are drawn
    palette = plt.get_cmap('Paired')(np.linspace(0, 1, 10))
    y_head, beta_head = [], []
    for i in range(min(len(breakouts_group), 10)):
        # each group item is indexed as (day_index, beta) — presumably;
        # confirm against get_breakouts
        x = list(zip(*breakouts_group[i]))[0]
        y = [reviews_count[i] for i in x]
        y_head.append(y[0])
        beta_head.append(breakouts_group[i][0][1])
        ax.scatter(x=x, y=y, color=palette[i])
    ax.xaxis.set_major_formatter(plt.NullFormatter())
    ax.set_xlabel("time")
    ax.set_ylabel("reviews_num")
    # text = '\n'.join(["count:{}, beta:{}".format(y_head[i], beta_head[i])
    #                   for i in range(len(y_head))])
    # ax.text(0, 1, text, verticalalignment="top", horizontalalignment="left", transform=ax.transAxes)
    if save_path:
        if not os.path.exists(os.path.dirname(save_path)):
            os.makedirs(os.path.dirname(save_path))
        plt.savefig(save_path)
    else:
        plt.show()
    plt.close()
def build_dataset():
    """Build and pickle a feature dataset [X, y] from valid sub_tracks.

    Positive samples: valid tracks with breakouts (valid_bnum>0, label 1);
    negative samples: valid tracks without (valid_bnum=0, label 0).
    Features are assembled per track by get_X (vggish audio + lyrics d2v +
    artist vector).
    """
    conn = MyConn()
    dataset_size = 1500  # per-class LIMIT
    # conditional_sql = "rawmusic_path IS NOT NULL AND language in ('ch', 'en')"
    # NOTE(review): query rows are passed straight to get_X as track ids —
    # presumably conn.query returns scalars for single-column selects; confirm.
    pos_tracks = conn.query(
        sql=
        "SELECT track_id FROM sub_tracks WHERE valid_bnum>0 AND is_valid=1 LIMIT {}"
        .format(dataset_size))
    neg_tracks = conn.query(
        sql=
        "SELECT track_id FROM sub_tracks WHERE valid_bnum=0 AND is_valid=1 LIMIT {}"
        .format(dataset_size))
    lyrics_d2v_model = Doc2Vec.load("../models/d2v/d2v_b1.mod")  # lyrics d2v model
    with open("../data/artists_vec_dict_r_minmax.pkl", "rb") as f:
        d_artist_vec = pickle.load(f)
    X, y = [], []
    args = {
        "lyrics_d2v_model": lyrics_d2v_model,
        "d_artist_vec": d_artist_vec,
        "use_mp3": True,
        "use_lyrics": True,
        "use_artist": True,
        "music_datatype": "vggish"
    }

    def add_data(tracks, label):
        # Append one feature row per track; log and skip tracks whose
        # features cannot be built.
        for t in tracks:
            try:
                X.append(get_X(track_id=t, **args))
                y.append(label)
            except KeyboardInterrupt:
                print("KeyboardInterrupt")
                break
            except:
                print(label, t)
                print(traceback.format_exc())

    add_data(pos_tracks, 1)
    add_data(neg_tracks, 0)
    dataset_index = "0317_vggish"
    # Name encodes the enabled feature groups (bool * str keeps or drops each
    # letter), the positive-set size and the index, e.g. "mla1500_0317_vggish".
    dataset_name = "m"*args["use_mp3"] + "l"*args["use_lyrics"] + "a"*args["use_artist"]\
        + str(len(pos_tracks)) + '_' + str(dataset_index)
    with open("../data/dataset/{}.pkl".format(dataset_name), 'wb') as f:
        pickle.dump([X, y], f)
def update_path(table, key_col, col, root_dir, offset, overwrite=False):
    '''
    Update file-path columns in the database from the files found on disk.

    params:
        table / key_col / col: target table, key column and path column.
        root_dir: directory tree to walk.
        offset: number of trailing characters (the extension) to strip from a
            filename to recover the key value.
        overwrite: if True, always overwrite; otherwise only fill rows that
            exist and whose path is currently NULL.
    '''
    conn = MyConn()
    count_update = 0
    for root, dirs, files in os.walk(root_dir):
        for file in files:
            # Fix: was `"OS" in file`, which never matches ".DS_Store";
            # the other directory walkers in this module skip junk with "DS".
            if "DS" in file:
                continue
            filepath = os.path.join(root, file)
            key = file.split('/')[-1][:-offset]
            res = conn.query(table=table,
                             targets=[col],
                             conditions={key_col: key},
                             fetchall=False)
            if overwrite:
                conn.update(table=table,
                            settings={col: filepath},
                            conditions={key_col: key})
                count_update += 1
            else:
                if res and res[0] is None:
                    conn.update(table=table,
                                settings={col: filepath},
                                conditions={key_col: key})
                    count_update += 1
    print(count_update)
def copy_columns(t1, t2, col, key_col="track_id"):
    '''
    Copy one column's values from table t1 to table t2, matched on key_col.

    params:
        t1: source table
        t2: destination table
        col: column name to copy
        key_col: key column used to match rows
    '''
    conn = MyConn()
    data = conn.query(table=t1, targets=[key_col, col])
    for key_v, v in data:
        try:
            conn.update(table=t2,
                        settings={col: v},
                        conditions={key_col: key_v})
        # Fix: the bare `except:` also swallowed KeyboardInterrupt/SystemExit;
        # the other loops in this module let those propagate.
        except Exception:
            print("ERROR {}: {}".format(key_col, key_v))
def breakouts_curve():
    '''
    Render review-count breakout curves, grouped by cluster label (0-5),
    saving up to 100 PNGs per cluster.
    '''
    conn = MyConn()
    for label in range(6):
        save_dir = "../data/breakouts_curve_clusters/{}".format(label)
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        rows = conn.query(targets=["track_id"],
                          table="breakouts_complements",
                          conditions={"label6": label})
        for tid in [row[0] for row in rows][:100]:
            out = os.path.join(save_dir, "{}.png".format(tid))
            view_reviews_num_curve(tid, save_path=out)
def get_description_by_api():
    """Fetch artist descriptions through the local NetEase API service
    (routed through an HTTP proxy) and dump them to a JSON file."""
    conn = MyConn()
    res = conn.query(targets=["name", "nid"], table="artists")
    name_2_id = dict([(r[0].lower().strip(), r[1]) for r in res])
    artists = open("../data_related/query_artists.txt").read().splitlines()
    print(len(artists))
    url_base = 'http://127.0.0.1:3000'
    # proxy server
    proxyHost = "http-dyn.abuyun.com"
    proxyPort = "9020"
    # proxy tunnel credentials (redacted)
    proxyUser = "******"
    proxyPass = "******"
    proxy = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
        "host": proxyHost,
        "port": proxyPort,
        "user": proxyUser,
        "pass": proxyPass,
    }
    data = []
    for ar in artists:
        try:
            id_ = name_2_id[ar]
            # nid "0" presumably marks artists without a real platform id
            # (same sentinel as in artist_vec_from_tags) — skip them
            if id_ == "0":
                continue
            url = url_base + "/artist/desc?id={}&proxy={}".format(id_, proxy)
            res = requests.get(url, timeout=10).json()
            # attach the lookup keys to the API payload before saving
            res["id"] = id_
            res["artist"] = ar
            data.append(res)
        except KeyboardInterrupt:
            print("interrupted by keyboard.")
            sys.exit(0)
        except Exception as e:
            print(ar, e)
    with open("../data/sup_artists_desc.json", 'w') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
def upload_details():
    '''
    Upload basic track details to the database (track name, artist names,
    album name, popularity...), read from the crawled detail JSON files.
    '''
    def extract_details(filename):
        # Pull name / artists / popularity / album out of one detail JSON file.
        with open(filename) as f:
            content = json.load(f)
        details = {
            "name": content["songs"][0]["name"],
            "artist": ",".join(
                [item["name"] for item in content["songs"][0]["ar"]]),
            "pop": content["songs"][0]["pop"],
            "album": content["songs"][0]["al"]["name"]
        }
        return details

    read_path = "/Volumes/nmusic/NetEase2020/data/simple_proxied_tracks_details"
    conn = MyConn()
    for root, dirs, files in os.walk(read_path):
        for file in files:
            if "DS" in file:  # skip .DS_Store artifacts
                continue
            filepath = os.path.join(root, file)
            track_id = file[:-5]  # strip the ".json" extension
            try:
                details = extract_details(filepath)
            except Exception as e:
                print(filepath)
                # print(traceback.format_exc())
                print(e)
                # Fix: without this, a failed parse fell through and inserted
                # with an undefined (first file) or stale (previous file)
                # `details` dict.
                continue
            # print(details)
            conn.insert_or_update(table="details",
                                  settings={
                                      "track_id": track_id,
                                      "name": details["name"],
                                      "artist": details["artist"],
                                      "album": details["album"],
                                      "pop": details["pop"]
                                  })
def test_d2v_with_source(text, model, topn=5):
    """Sanity-check a lyrics Doc2Vec model: embed *text* and print its most
    similar source documents.

    params:
        text: raw lyrics text.
        model: trained gensim Doc2Vec model.
        topn: number of neighbours to print.
    """
    conn = MyConn()
    # NOTE(review): source_tracks is loaded but never used — presumably meant
    # to map the printed doc ids back to track ids; confirm or remove.
    source_tracks = open(
        "../data_related/lyrics_valid_tracks.txt").read().splitlines()
    text = replace_noise(text)
    # Strip credit lines before inference.
    # Fix: the pattern used a character class [作词|...] instead of the
    # intended alternation of credit keywords.
    text = re.sub(r"( )*(作词|作曲|编曲|制作人|录音|混母带|监制).*\n", "", text)
    words = cut(text, join_en=False)
    vec = model.infer_vector(words)
    # Fix: topn was accepted but ignored (the call hard-coded 10).
    s = model.docvecs.most_similar([vec], topn=topn)
    print(s)