Example #1
0
def update_path(table, key_col, col, root_dir, offset, overwrite=False):
    '''
    Update file paths in the database.
    '''
    conn = MyConn()
    count_update = 0
    for root, dirs, files in os.walk(root_dir):
        for file in files:
            if "OS" in file: continue
            filepath = os.path.join(root, file)
            key = file[:-offset]  # drop the last `offset` characters (e.g. the extension)
            res = conn.query(table=table,
                             targets=[col],
                             conditions={key_col: key},
                             fetchall=False)
            if overwrite:
                conn.update(table=table,
                            settings={col: filepath},
                            conditions={key_col: key})
                count_update += 1
            else:
                if res and res[0] is None:
                    conn.update(table=table,
                                settings={col: filepath},
                                conditions={key_col: key})
                    count_update += 1
    print(count_update)
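
A minimal usage sketch for update_path (the table, column, and directory names below are assumptions for illustration, not taken from the original code):

update_path(table="tracks", key_col="track_id", col="mp3_path",
            root_dir="../data/mp3", offset=len(".mp3"))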
Example #2
0
def identify_release_drive_breakouts():
    '''
    Identify breakout samples caused by a new-song release (the ones right at the
    start of a track's lifetime).
    '''
    conn = MyConn()
    breakouts = conn.query(targets=["id", "track_id", "date"],
                           table="breakouts")

    release_breakouts_count = 0
    release_breakouts_tracks_set = set()
    more_breakouts_tracks_set = set()
    for b in breakouts:
        track_first_review = conn.query(targets=["first_review"],
                                        table="tracks",  # table name assumed
                                        conditions={"track_id": b[1]},
                                        fetchall=False)[0]
        if b[2] - track_first_review < datetime.timedelta(days=15):
            release_breakouts_count += 1
            release_breakouts_tracks_set.add(b[1])
            conn.update(table="breakouts",
                        settings={"release_drive": 1},
                        conditions={"id": b[0]})
        else:
            more_breakouts_tracks_set.add(b[1])

    print(release_breakouts_count)
    print(len(release_breakouts_tracks_set))
    print(len(more_breakouts_tracks_set))
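
The 15-day check assumes that both the breakout date (b[2]) and first_review come back from the database as date/datetime values, so their difference is a timedelta; a tiny sketch of the comparison:

import datetime

first_review = datetime.date(2020, 1, 1)
breakout_date = datetime.date(2020, 1, 10)
print(breakout_date - first_review < datetime.timedelta(days=15))  # True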
Example #3
0
def update_subtracks_music_words():
    '''
    Set is_valid=0 in sub_tracks for tracks whose feature-word data is missing or
    too short (len < 5).
    '''
    conn = MyConn()
    valid_tracks_db = [
        r[0] for r in conn.query(
            sql="SELECT track_id FROM sub_tracks WHERE is_valid=1")
    ]
    with open("../data/reviews_feature_words_with_freqs/breakouts_wo_simi.json"
              ) as f:
        data = json.load(f)
        valid_tracks_pos = list(
            set([bid.split('-')[0] for bid in data if data[bid]["len"] >= 5]))
    with open(
            "../data/reviews_feature_words_with_freqs/no_breakouts_wo_simi.json"
    ) as f:
        data = json.load(f)
        valid_tracks_neg = [str(tid) for tid in data if data[tid]["len"] >= 5]
    valid_tracks = valid_tracks_pos + valid_tracks_neg
    print(len(valid_tracks_db))
    print(len(valid_tracks), len(valid_tracks_pos), len(valid_tracks_neg))
    for tid in valid_tracks_db:
        if tid not in valid_tracks:
            conn.update(table="sub_tracks",
                        settings={"is_valid": 0},
                        conditions={"track_id": tid})
            print(tid)
Example #4
0
def update_subtracks_havesimis():
    '''
    Mark sub_tracks entries that have at least one breakout with simi_score >= 0.5.
    '''
    conn = MyConn()
    valid_tracks = set([
        r[0] for r in conn.query(
            sql="SELECT track_id FROM breakouts WHERE simi_score>=0.5")
    ])
    for tid in valid_tracks:
        conn.update(table="sub_tracks",
                    settings={"have_simis": 1},
                    conditions={"track_id": tid})
Example #5
0
def get_breakouts_num():
    '''
    Count the breakouts per track and write the count into sub_tracks.bnum.
    '''
    conn = MyConn()
    breakouts = conn.query(targets=["id", "track_id"], table="breakouts")
    track_2_bnum = {}
    for id_, track_id in breakouts:
        if track_id in track_2_bnum:
            track_2_bnum[track_id] += 1
        else:
            track_2_bnum[track_id] = 1
    for k, v in track_2_bnum.items():
        conn.update(table="sub_tracks",
                    settings={"bnum": v},
                    conditions={"track_id": k})
Example #6
0
def mark_language():
	'''
	Mark the language of every track in the lyrics library.
	'''
	conn = MyConn()
	enchant_dict = enchant.Dict("en_US")
	for track_id, lyrics_path in conn.query(sql="SELECT track_id, lyrics_path FROM tracks WHERE lyrics_path is not null"):
		with open(lyrics_path) as f:
			content = json.load(f)
		lyrics = replace_noise(content["lrc"]["lyric"])
		# Strip credit lines (lyricist, composer, arranger, producer, recording,
		# mixing/mastering, executive producer)
		lyrics = re.sub(r"( )*(作词|作曲|编曲|制作人|录音|混母带|监制).*\n", "", lyrics)
		if len(lyrics) < 10:  # effectively no lyrics left
			language = "empty"
		else:
			language = _mark_language(lyrics, enchant_dict)
		conn.update(table="tracks", settings={"language":language}, conditions={"track_id":track_id})
Example #7
0
def copy_columns(t1, t2, col, key_col="track_id"):
    '''
    Copy one column of a table to another table in the database.
    params:
        t1: source table (copied from)
        t2: destination table (copied to)
        col: column name
        key_col: key column
    '''
    conn = MyConn()
    data = conn.query(table=t1, targets=[key_col, col])
    for key_v, v in data:
        try:
            conn.update(table=t2,
                        settings={col: v},
                        conditions={key_col: key_v})
        except Exception:
            print("ERROR {}: {}".format(key_col, key_v))
Example #8
0
def add_desc_source_2_database():
    '''
    Write each artist's description source into the database table.
    '''
    conn = MyConn()
    with open("../data/name_2_KB_ents.pkl", "rb") as f:
        name_2_KB_ents = pickle.load(f)
    ents = set(name_2_KB_ents.keys())

    with open("../data/sup_artists_desc_2.json") as f:
        content = json.load(f)
    sup_arts = set([item["artist"].lower() for item in content])
    # print("KB: {}, Netease: {}, intersection: {}".format(len(ents), len(sup_arts), len(sup_arts.intersection(ents))))

    # source=1: artist found in the knowledge base; source=2: artist covered by
    # the supplementary descriptions (the latter overwrites the former if both apply)
    for key in ents:
        conn.update(table="artists", settings={"source":1}, conditions={"lower_name":key})
    for name in sup_arts:
        conn.update(table="artists", settings={"source":2}, conditions={"lower_name":name})
Example #9
0
def update_special_tag1():
    '''
    Update special_tag in sub_tracks from the previously generated special_words table.
    '''
    conn = MyConn()
    sql = "SELECT id, special_words FROM breakouts_feature_words_c3 WHERE LENGTH(special_words)>0"
    data = conn.query(sql=sql)
    special_words = ["高考", "翻唱", "节日", "抖音"]  # gaokao, cover song, festival, Douyin
    for bid, text in data:
        words = text.split()
        for w in words:
            if w in special_words:
                tid = bid.split('-')[0]
                try:
                    conn.update(table="sub_tracks",
                                settings={"special_tag": w},
                                conditions={"track_id": tid})
                except Exception:
                    print(tid)
                break
Example #10
0
    def task(thread_id, task_args):
        '''
        Detect chorus_start and upload it to the database (tracks & sub_tracks).
        '''
        conn = MyConn()
        while 1:
            sql = "SELECT track_id, mp3_path from sub_tracks WHERE chorus_start is NULL AND valid_bnum=0"
            task_args["lock"].acquire()
            res = conn.query(sql=sql, fetchall=False)
            if len(res) == 0:
                return  # sub_tracks表格中不存在chorus_start is null的歌曲
            tid, mp3_path = res[0], res[1]
            conn.update(table="sub_tracks",
                        settings={"chorus_start": 0},
                        conditions={"track_id": tid})
            task_args["lock"].release()

            try:
                # Detect and clip the chorus of the track; clip length is 20 s
                duration = 20
                chorus = get_chorus(mp3_path, clip_length=duration)
                if chorus is None:
                    conn.update(table="sub_tracks",
                                settings={"chorus_start": -1},
                                conditions={"track_id": tid})
                else:
                    chorus_start, chorus_end = chorus
                    conn.update(table="sub_tracks",
                                settings={"chorus_start": chorus_start},
                                conditions={"track_id": tid})
                    conn.update(table="tracks",
                                settings={"chorus_start": chorus_start},
                                conditions={"track_id": tid})
            except KeyboardInterrupt:
                conn.update(
                    sql=
                    "UPDATE sub_tracks SET chorus_start=NULL WHERE track_id={}"
                    .format(tid))
                return
            except Exception:
                print(tid, traceback.format_exc())
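
A hypothetical launcher for the worker above (the original driver code is not shown, so the thread count and the layout of task_args are assumptions):

import threading

task_args = {"lock": threading.Lock()}
threads = [threading.Thread(target=task, args=(i, task_args)) for i in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()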
Example #11
0
    def task(pid, task_args):
        '''
        For each breakout not yet processed, extract feature words from its review
        text and store them in breakouts_feature_words_c3.
        '''
        conn = MyConn()
        w2v_model = Word2Vec.load("../models/w2v/c4.mod")
        while 1:
            task_args["lock"].acquire()
            res = conn.query(targets=["id", "text_path"],
                             conditions={"have_words": 0},
                             table="breakouts",
                             fetchall=False)
            if res is not None:
                id_, text_path = res
                conn.update(table="breakouts",
                            settings={"have_words": 1},
                            conditions={"id": id_})
                task_args["lock"].release()

                try:
                    with open(text_path) as tf:
                        text = tf.read()
                    feature_words = tags_extractor(text,
                                                   topk=10,
                                                   w2v_model=w2v_model)
                    conn.insert(table="breakouts_feature_words_c3",
                                settings={
                                    "id": id_,
                                    "feature_words": " ".join(feature_words)
                                })

                    # print("[Process-{}] id: {}, feature_words: {}".format(pid, id_, feature_words))
                except Exception:
                    conn.update(table="breakouts",
                                settings={"have_words": 0},
                                conditions={"id": id_})
                    print(id_)
                    print(traceback.format_exc())
                    break

            else:
                task_args["lock"].release()
                break
Example #12
0
def filtered_feature_words():
    '''
    Filter rubbish words out of each breakout's feature words, merge near-synonyms
    with the w2v model, and write the cleaned words back to the database.
    '''
    conn = MyConn()
    w2v_model = Word2Vec.load("../models/w2v/c3.mod")
    rubbish = open("../resources/rubbish_words.txt").read().splitlines()
    with open("../resources/feature_words_counter_d.pkl", "rb") as f:
        feature_words_counter_d = pickle.load(f)

    for id_, text in conn.query(targets=["id", "feature_words"],
                                table="breakouts_feature_words_c3"):
        # Filter out rubbish words
        feature_words = text.split()
        filtered_feature_words = [w for w in feature_words if w not in rubbish]

        # conn.update(table="breakouts_feature_words_c3",
        #     settings={"filtered_feature_words": " ".join(filtered_feature_words)},
        #     conditions={"id": id_})

        # Merge words with similar meanings based on the w2v model
        simi_matrix = []
        size = len(filtered_feature_words)
        for i in range(size):
            tmp = []
            for j in range(size):
                w1, w2 = filtered_feature_words[i], filtered_feature_words[j]
                tmp.append(w2v_model.wv.similarity(w1, w2))
            simi_matrix.append(tmp)

        selected = [0] * size
        groups = []
        for i in range(size):
            if selected[i] == 1:
                continue
            selected[i] = 1
            group_wi = [i]
            for j in range(size):
                if selected[j] != 1:
                    for wi in group_wi:
                        if simi_matrix[wi][j] >= 0.55:
                            group_wi.append(j)
                            selected[j] = 1
                            break
            group = [filtered_feature_words[wi] for wi in group_wi]
            groups.append(group)

        # print(groups)
        clean_feature_words = []
        for g in groups:
            if len(g) > 1:
                clean_feature_words.append(g[np.argmax(
                    [feature_words_counter_d[w] for w in g])])
            else:
                clean_feature_words.append(g[0])
        # print(clean_feature_words)

        conn.update(
            table="breakouts_feature_words_c3",
            settings={"clean_feature_words": " ".join(clean_feature_words)},
            conditions={"id": id_})