示例#1
0
    def _generate_answers(self, song_meta_json, train, questions):
        """Build one answer per question from genre-specific song candidates.

        The genres of a question's seed songs are tallied.  When at least one
        genre was seen, the candidate songs are taken from the
        ``_song_mp_per_genre`` entry of the most frequent genre; otherwise
        the song list returned by ``most_popular(train, "songs", 200)`` is
        used.  Tag candidates always come from
        ``most_popular(train, "tags", 100)``.

        Args:
            song_meta_json: iterable of song dicts with an ``"id"`` and a
                ``"song_gn_gnr_basket"`` entry.
            train: training playlists, forwarded to ``most_popular``.
            questions: iterable of dicts with ``"id"``, ``"songs"`` and
                ``"tags"``.

        Returns:
            list of ``{"id", "songs", "tags"}`` dicts holding at most 100
            songs and 10 tags, with the question's own items filtered out by
            ``remove_seen``.
        """
        meta_by_id = {int(entry["id"]): entry for entry in song_meta_json}
        song_mp_counter, song_mp = most_popular(train, "songs", 200)
        _, tag_mp = most_popular(train, "tags", 100)
        genre_to_songs = self._song_mp_per_genre(meta_by_id, song_mp_counter)

        answers = []
        for question in tqdm(questions):
            seed_songs = question["songs"]

            # One count per (song, genre) occurrence, in playlist order.
            genre_tally = Counter(
                genre
                for sid in seed_songs
                for genre in meta_by_id[sid]["song_gn_gnr_basket"]
            )
            dominant = genre_tally.most_common(1)

            # No seed songs (or no genres) -> fall back to the global list.
            candidates = (genre_to_songs[dominant[0][0]]
                          if dominant else song_mp)

            answers.append({
                "id": question["id"],
                "songs": remove_seen(seed_songs, candidates)[:100],
                "tags": remove_seen(question["tags"], tag_mp)[:10],
            })

        return answers
示例#2
0
    def _generate_answers(self, song_meta_json, train, questions):
        """Answer each question with songs drawn from its dominant genre.

        Args:
            song_meta_json: iterable of song dicts; ``"id"`` and
                ``"song_gn_gnr_basket"`` are read.
            train: training playlists, passed through to ``most_popular``.
            questions: iterable of dicts with ``"id"``, ``"songs"``,
                ``"tags"``.

        Returns:
            list of dicts with keys ``"id"``, ``"songs"`` (at most 100) and
            ``"tags"`` (at most 10).
        """
        # Map each song id (as int) to that song's metadata record.
        song_meta = {}
        for song in song_meta_json:
            song_meta[int(song["id"])] = song

        # Top 200 songs and top 100 tags, as returned by most_popular.
        song_counts, popular_songs = most_popular(train, "songs", 200)
        _, popular_tags = most_popular(train, "tags", 100)
        songs_by_genre = self._song_mp_per_genre(song_meta, song_counts)

        def pick_candidates(seed_songs):
            """Return the candidate song list for one playlist's seed songs."""
            tally = Counter()
            for sid in seed_songs:
                for genre in song_meta[sid]["song_gn_gnr_basket"]:
                    tally[genre] += 1

            best = tally.most_common(1)
            if not best:
                # No genre could be counted: recommend the overall
                # most frequent songs.
                return popular_songs
            # Otherwise recommend the songs listed for the top genre.
            return songs_by_genre[best[0][0]]

        answers = []
        for q in tqdm(questions):
            pool = pick_candidates(q["songs"])
            answers.append({
                "id": q["id"],
                "songs": remove_seen(q["songs"], pool)[:100],
                "tags": remove_seen(q["tags"], popular_tags)[:10],
            })

        return answers
示例#3
0
def fill_X(train, val):
    """Fill in missing songs/tags of every playlist in ``val`` (in place).

    Each playlist gets ``song_dirty`` / ``tag_dirty`` flags (0 by default):

    * songs and tags both present: left untouched;
    * songs present, tags empty: tags come from ``embed.tag_by_songs`` and
      are topped up via ``put_most_popular`` when fewer than 10 are
      returned; ``tag_dirty`` becomes 1;
    * songs empty: songs and tags come from ``t2r.title2rec`` and the dirty
      flags are set to the signs it returns.

    Relies on the module-level ``s2v_path``, ``cluster_path``,
    ``song_const`` and ``tag_const``.

    Args:
        train: training playlists used to fit the embedding / Title2Rec
            models and to compute the popularity fallbacks.
        val: iterable of playlist dicts to complete.

    Returns:
        ``val`` itself, after mutation.
    """
    # Embedding model; the s2v model file must have been produced beforehand
    # (by ply_tag_embedding.py according to the original note).
    embed = PlyEmbedding(train)
    embed.load_s2v(s2v_path)

    # Titles plus mean song vectors, requested as plain values rather than
    # KeyedVectors.
    titles, vectors = embed.song_based(mode='s2v',
                                       by='mean',
                                       keyedvector=False)

    # Title2Rec: clean/tokenize titles, then fit fasttext and title2rec on
    # the cluster-ordered data.  The cluster pickle must already exist.
    t2r = Title2Rec()
    tokens, token_vecs, plylst_ids = t2r.preprocess_clustering(titles,
                                                               vectors,
                                                               ID=True,
                                                               khaiii=True,
                                                               verbose=True)
    t2r.load_cluster(cluster_path)
    ordered = t2r.pre_fasttext(tokens, token_vecs)
    t2r.fit_fasttext(ordered)
    t2r.fit_title2rec(tokens, plylst_ids)

    # Popularity fallbacks.
    _, pop_songs = most_popular(train, 'songs', 100)
    _, pop_tags = most_popular(train, 'tags', 10)

    for ply in tqdm(val):
        ply['song_dirty'] = 0
        ply['tag_dirty'] = 0

        if ply['songs'] != []:
            if ply['tags'] != []:
                # Nothing is missing for this playlist.
                continue

            # Songs are known, tags are not: infer tags from the songs.
            ply['tags'] = embed.tag_by_songs(ply, 10, 3.9)
            if len(ply['tags']) < 10:
                ply['tags'] = put_most_popular(ply['tags'], pop_tags)
            ply['tag_dirty'] = 1
            continue

        # No songs at all: predict both songs and tags from the title.
        songs, tags, song_sign, tag_sign = t2r.title2rec(
            ply, 100, 10, song_const, tag_const)

        # NOTE(review): songs are topped up only when the prediction is
        # empty, whereas tags are topped up whenever fewer than 10 came
        # back -- confirm this asymmetry is intended.
        if song_sign and len(songs) == 0:
            songs = put_most_popular(songs, pop_songs)
        if tag_sign and len(tags) < 10:
            tags = put_most_popular(tags, pop_tags)

        ply['songs'] = songs
        ply['tags'] = tags
        ply['song_dirty'] = song_sign
        ply['tag_dirty'] = tag_sign

    return val
示例#4
0
    def _generate_answers(self, train, questions):
        """Answer every question with the globally most popular items.

        Args:
            train: training playlists, forwarded to ``most_popular``.
            questions: iterable of dicts with ``"id"``, ``"songs"``,
                ``"tags"``.

        Returns:
            list of ``{"id", "songs", "tags"}`` dicts; songs are capped at
            100 and tags at 10 after ``remove_seen`` drops the question's
            own items.
        """
        _, popular_songs = most_popular(train, "songs", 200)
        _, popular_tags = most_popular(train, "tags", 100)

        return [
            {
                "id": question["id"],
                "songs": remove_seen(question["songs"], popular_songs)[:100],
                "tags": remove_seen(question["tags"], popular_tags)[:10],
            }
            for question in tqdm(questions)
        ]
示例#5
0
def generate_answers(train, questions):
    """Build one answer per question, falling back to popular items.

    A question that already has both songs and tags is answered with those
    same lists, unchanged.  If either list is empty, both fields are instead
    filled from the ``most_popular`` lists (seen items removed, at most 100
    songs / 10 tags).

    NOTE(review): echoing a complete playlist's own songs/tags back as the
    recommendation looks unusual -- confirm that callers expect this.

    Args:
        train: training playlists, forwarded to ``most_popular``.
        questions: iterable of dicts with ``"id"``, ``"songs"``, ``"tags"``.

    Returns:
        list of ``{"id", "songs", "tags"}`` dicts in question order.
    """
    _, song_mp = most_popular(train, "songs", 200)
    _, tag_mp = most_popular(train, "tags", 100)

    def answer_for(question):
        """Return the answer dict for a single question."""
        songs = question["songs"]
        tags = question["tags"]
        if len(songs) == 0 or len(tags) == 0:
            # Something is missing: recommend popular items for both fields.
            songs, tags = (remove_seen(songs, song_mp)[:100],
                           remove_seen(tags, tag_mp)[:10])
        return {"id": question["id"], "songs": songs, "tags": tags}

    return [answer_for(question) for question in questions]
示例#6
0
    def _generate_answers(self, song_meta_json, train, questions):
        """Recommend songs by artist/genre and tags by song co-occurrence.

        Song candidates for a question are chosen in this order:

        1. if its seed songs involve exactly one artist name and that name is
           a key of ``self._artist_songs(...)``, that artist's songs;
        2. else, if any genre was seen, the ``_song_mp_per_genre`` entry of
           the most frequent genre;
        3. else the song list from ``most_popular(train, "songs", 200)``.

        Tags: when the question has songs, the tags listed for those songs in
        ``self._songs_most_tag(train)`` are ranked by count and topped up
        from the popular tags to at most 10; otherwise the popular tags minus
        the question's own tags are used.

        Args:
            song_meta_json: iterable of song dicts; ``"id"``,
                ``"artist_name_basket"`` and ``"song_gn_gnr_basket"`` are
                read.
            train: training playlists (the original note points at
                arena_data/orig/train.json).
            questions: iterable of dicts with ``"id"``, ``"songs"``,
                ``"tags"``.

        Returns:
            list of ``{"id", "songs", "tags"}`` dicts, at most 100 songs and
            10 tags each.
        """
        # song id -> metadata record of that song.
        song_meta = {int(song["id"]): song for song in song_meta_json}
        # Counter of song frequencies plus the 200 most frequent songs.
        song_mp_counter, song_mp = most_popular(train, "songs", 200)
        # The 100 most frequent tags (the counter is not needed here).
        _, tag_mp = most_popular(train, "tags", 100)
        # genre -> songs, as built by the sibling helper.
        song_mp_per_genre = self._song_mp_per_genre(song_meta,
                                                    song_mp_counter)
        # artist name -> songs; per the original note this only covers
        # artists above a song-count threshold -- see _artist_songs.
        art_dic = self._artist_songs(song_meta, song_mp_counter)
        # song id -> tags associated with that song.
        tag_id = self._songs_most_tag(train)

        answers = []
        for q in tqdm(questions):
            q_songs = q["songs"]

            # Tally artists and genres of the seed songs in a single pass.
            art_c = Counter()
            genre_counter = Counter()
            for sid in q_songs:
                meta = song_meta[sid]
                for artist in meta["artist_name_basket"]:
                    art_c[artist] += 1
                for genre in meta["song_gn_gnr_basket"]:
                    genre_counter[genre] += 1

            artist_name = list(art_c)
            top_genre = genre_counter.most_common(1)

            if len(artist_name) == 1 and artist_name[0] in art_dic:
                cur_songs = list(art_dic[artist_name[0]])
            elif top_genre:
                cur_songs = song_mp_per_genre[top_genre[0][0]]
            else:
                cur_songs = song_mp

            if q_songs != []:
                tag_c = Counter()
                for sid in q_songs:
                    if sid in tag_id:
                        for tag in tag_id[sid]:
                            tag_c[tag] += 1
                # Rank once, after all songs are counted (the ranking used
                # to be rebuilt on every loop iteration).
                tag_list = [tag for tag, _ in tag_c.most_common()]

                if len(tag_list) > 10:
                    cur_tags = tag_list[:10]
                else:
                    # Not enough song-derived tags: pad with popular ones.
                    filler = remove_seen(tag_list, tag_mp)[:10]
                    cur_tags = (tag_list + filler)[:10]
            else:
                cur_tags = remove_seen(q["tags"], tag_mp)[:10]

            answers.append({
                "id": q["id"],
                "songs": remove_seen(q_songs, cur_songs)[:100],
                "tags": cur_tags
            })

        return answers
示例#7
0
    def _generate_answers(self, song_meta_json, train, questions):
        """Recommend songs with ALS (title TF-IDF for song-less playlists).

        Pipeline, as visible in this method:

        1. Build a playlist x song matrix from ``train`` and ``questions``
           (songs indexed through ``self._build_vocadict``) and fit an
           ``implicit`` ALS model on its transpose.
        2. Build a playlist x title-token matrix over train + questions and
           turn it into a TF-IDF matrix for playlists without songs.
        3. Per question: tags come from ``self._tag_per_plylst`` (replaced by
           the popular tags when fewer than 10 remain); songs come from the
           ALS model, or from the most title-similar playlists when the
           question has no songs.  If fewer than 100 songs result, the
           genre/popularity candidates replace them.

        Args:
            song_meta_json: iterable of song dicts; ``"id"`` and
                ``"song_gn_gnr_basket"`` are read.
            train: sequence of training playlists (``len(train)`` is used as
                the row offset of the question playlists).
            questions: sequence of dicts with ``"id"``, ``"songs"``,
                ``"tags"``.

        Returns:
            list of ``{"id", "songs", "tags"}`` dicts in question order.
        """
        song_meta = {int(song["id"]): song for song in song_meta_json}
        train_meta = {int(plylst["id"]): plylst for plylst in train}

        song_mp_counter, song_mp = most_popular(train, "songs", 200)
        _, tag_mp = most_popular(train, "tags", 100)
        song_mp_per_genre = self._song_mp_per_genre(song_meta, song_mp_counter)

        tag_per_song = self._tag_per_song(train_meta)

        ## song prediction: pre-process the train set
        _, song_pop = most_popular(train, "songs", 200000)
        voca_dict, voca_dict_t = self._build_vocadict(song_pop)

        # filter the song list
        num_users = len(train)
        f_song_lst, f_usr_lst = self._const_filtered_lst(train,
                                                         voca_dict,
                                                         num_users,
                                                         to_idx=num_users,
                                                         val=False)
        num_items = len(set(f_song_lst))
        data_len = len(f_song_lst)

        # re-index the filtered songs; the set() drops duplicate
        # (playlist, song) pairs before the matrix is built
        item_ids = np.array([voca_dict[i] for i in f_song_lst])
        data = np.ones(data_len)
        rows, cols, data = zip(*set(zip(f_usr_lst, item_ids, data)))
        print('train preproc done', num_items)

        ## song prediction: pre-process the valid/test set
        v_num_users = len(questions)
        f_song_lst_v, f_usr_lst_v = self._const_filtered_lst(
            questions, voca_dict, num_users, to_idx=v_num_users, val=True)
        data_len_v = len(f_song_lst_v)

        v_item_ids = np.array([voca_dict[i] for i in f_song_lst_v])
        v_data = np.ones(data_len_v)
        v_rows, v_cols, v_data = zip(
            *set(zip(f_usr_lst_v, v_item_ids, v_data)))
        print('valid preproc done', num_items)

        # tuples from zip(): "+" concatenates train and question triples
        n_rows = rows + v_rows
        n_cols = cols + v_cols
        n_data = data + v_data
        t_num_users = num_users + v_num_users

        usr_item_mat = sp.csr_matrix((n_data, (n_rows, n_cols)),
                                     shape=(t_num_users, num_items))
        item_usr_mat = usr_item_mat.T
        als_model = implicit.als.AlternatingLeastSquares(factors=100,
                                                         regularization=0.05,
                                                         iterations=50)
        als_model.fit(item_usr_mat)
        print("als model fitting done")

        ### cold-start playlists (no songs): title-token TF-IDF
        title_to_tok, vocab = title_to_token(train)
        v_title_to_tok, v_vocab = title_to_token(questions)
        title_to_tok.extend(v_title_to_tok)
        vocab.extend(v_vocab)
        print("title to token converted", len(title_to_tok), len(vocab))

        fin_vocab = get_fin_vocab(vocab)
        print("final vocab size", len(fin_vocab))

        title_to_idx = []
        for plylst in title_to_tok:
            res_idx = tok_to_idx(plylst, fin_vocab)
            title_to_idx.append(res_idx)

        user_lst, vocab_lst = preproc_for_csr(title_to_idx, 0)
        cb_rows = np.array(user_lst)
        cb_cols = np.array(vocab_lst)
        cb_data = np.ones(len(user_lst))
        plylst_tt_mat = sp.csr_matrix(
            (cb_data, (cb_rows, cb_cols)),
            shape=(len(title_to_tok), len(fin_vocab)))
        print("csr matrix for tf-idf matrix made")

        tfidf_mat = build_tfidf_mat(plylst_tt_mat)

        answers = []
        # enumerate(tqdm(...)) rather than tqdm(enumerate(...)): tqdm can
        # then read len(questions) and show a total / ETA.
        for idx, q in enumerate(tqdm(questions)):
            genre_counter = Counter()

            for sid in q["songs"]:
                for genre in song_meta[sid]["song_gn_gnr_basket"]:
                    genre_counter.update({genre: 1})

            top_genre = genre_counter.most_common(1)

            # fallback song candidates: top genre, else overall popular
            if len(top_genre) != 0:
                cur_songs = song_mp_per_genre[top_genre[0][0]]
            else:
                cur_songs = song_mp

            ## tag prediction
            tag_lst = self._tag_per_plylst(q, tag_per_song)
            tag_res = remove_seen(q["tags"], tag_lst)[:10]
            if len(tag_res) < 10:
                # too few: the popular tags replace (not extend) the result
                tag_res = remove_seen(q["tags"], tag_mp)[:10]

            ## song prediction
            if len(q["songs"]) == 0:
                # question rows follow the train rows in the title matrix
                n_idx = idx + len(train)
                most_sim_lst = get_sim_plylst(tfidf_mat, given=n_idx, topn=30)
                cands = gather_cand(train, questions, most_sim_lst)
                song_res = remove_seen(q["songs"], cands)[:100]
            else:
                song_lst = self._cal_alsmodel(idx, num_users, usr_item_mat,
                                              als_model, voca_dict_t)
                song_res = remove_seen(q["songs"], song_lst)[:100]

            if len(song_res) < 100:
                print('checked here', idx)
                song_res = remove_seen(q["songs"], cur_songs)[:100]

            answers.append({"id": q["id"], "songs": song_res, "tags": tag_res})

        return answers
示例#8
0
def Recommender(train,
                questions,
                n_msp,
                n_mtp,
                mode,
                sim_measure,
                song_meta,
                freq_song,
                save=False):
    """Recommend songs and tags for every playlist in ``questions``.

    Scores candidate songs/tags from the playlists most similar to each
    question (similarities are loaded from precomputed ``scores/*.npy``
    files) and falls back to ``most_popular`` items when too few candidates
    are found.

    Args:
        train: training playlists; used for the popularity fallbacks and the
            lookup dictionaries built by ``DicGenerator``.
        questions: iterable of playlist dicts; ``"id"``, ``"songs"``,
            ``"tags"`` and ``"updt_date"`` are read.
        n_msp: number of similar playlists used for song scoring.
        n_mtp: number of similar playlists used for tag scoring.
        mode: part of the score file names and of the result file name.
        sim_measure: part of the score file names.
        song_meta: song metadata, forwarded to ``DicGenerator``.
        freq_song: container of songs whose similarity score is weighted x4.
        save: when truthy, write the result list under ``results/`` as JSON.

    Returns:
        list of ``{"id", "songs", "tags"}`` dicts, one per question.
    """
    ## Final recommendation list
    rec_list = []

    ## Step 1: preprocessing
    # 1) most_popular fallbacks for missing / insufficient results
    _, song_mp = most_popular(train, "songs", 200)
    _, tag_mp = most_popular(train, "tags", 20)

    # 2) lookup dictionaries for fast access
    (song_plylst_dic, song_tag_dic, plylst_song_dic, plylst_tag_dic,
     tag_plylst_dic, tag_song_dic, song_issue_dic,
     song_artist_dic) = DicGenerator(train, song_meta)

    # 3) load the precomputed question-vs-train playlist similarities
    #    sim_scores:   autoencoder based
    #    gnr_scores:   with genre information added
    #    title_scores: Word2vec based
    # NOTE(review): allow_pickle=True unpickles the file contents; only load
    # score files from a trusted source.
    sim_scores = np.load(f'scores/{mode}_scores_bias_{sim_measure}.npy',
                         allow_pickle=True).item()
    gnr_scores = np.load(f'scores/{mode}_scores_bias_{sim_measure}_gnr.npy',
                         allow_pickle=True).item()
    title_scores = np.load(
        f'scores/{mode}_scores_title_{sim_measure}_24000.npy',
        allow_pickle=True).item()

    ## Step 2: helper functions
    # 1) keys of the topk most frequent entries of a Counter
    def most_similar(cnt, topk):
        cnt_topk = cnt.most_common(topk)
        return [k for k, v in cnt_topk]

    # 2) playlists and scores of the topk most similar playlists according
    #    to the selected precomputed table
    def most_similar_emb(q_id, topk, title=False, genre=False):
        if title:
            table = title_scores
        elif genre:
            table = gnr_scores
        else:
            table = sim_scores
        top = table[q_id][:topk]
        plylsts = [t[0] for t in top]
        scores = [t[1] for t in top]
        return plylsts, scores

    # 3) song -> set of the given playlists that contain it
    def get_new_song_plylst_dict(plylst_ms):
        new_song_plylst_dict = defaultdict(set)
        for plylst in plylst_ms:
            for _song in plylst_song_dic[plylst]:
                new_song_plylst_dict[_song].add(plylst)
        return new_song_plylst_dict

    ## Step 3: recommend for each question playlist
    for q in tqdm(questions):

        # 1) information of the question playlist
        # seed songs / tags
        q_songs = q['songs']
        q_tags = q['tags']

        # frequencies of songs/tags/playlists co-occurring with the seeds
        # (song_tag_C and tag_plylst_C are filled but not read below; they
        # are kept so the dictionary lookups behave exactly as before)
        song_plylst_C = Counter()
        song_tag_C = Counter()
        tag_plylst_C = Counter()
        tag_song_C = Counter()

        # both seeds missing, or few seed songs
        no_songs_tags, few_songs_tags = False, False
        if len(q_songs) == 0 and len(q_tags) == 0:
            no_songs_tags = True
        elif len(q_songs) <= 3:
            few_songs_tags = True

        # 2) counts for the frequency-based recommendation
        for q_s in q_songs:
            song_plylst_C.update(song_plylst_dic[q_s])
            song_tag_C.update(song_tag_dic[q_s])
        for q_t in q_tags:
            tag_plylst_C.update(tag_plylst_dic[q_t])
            tag_song_C.update(tag_song_dic[q_t])
        # turn the playlist counts into ratios of the playlist's song count
        for i, j in list(song_plylst_C.items()):
            if len(plylst_song_dic[i]) > 0:
                song_plylst_C[i] = (j / len(plylst_song_dic[i]))

        # 3) scores for the similarity-based recommendation
        plylst_song_scores = defaultdict(lambda: 0)
        plylst_tag_scores = defaultdict(lambda: 0)

        # Case 1: neither songs nor tags
        if no_songs_tags:
            # title_scores for both the song and the tag playlists
            plylst_ms, song_scores = most_similar_emb(q['id'],
                                                      n_msp,
                                                      title=True)
            plylst_mt, tag_scores = most_similar_emb(q['id'],
                                                     n_mtp,
                                                     title=True)
            plylst_add, add_scores = most_similar_emb(q['id'], n_mtp)

        # Case 2: few songs
        elif few_songs_tags:
            # sim_scores for songs, title_scores for tags
            plylst_ms, song_scores = most_similar_emb(q['id'], n_msp)
            plylst_mt, tag_scores = most_similar_emb(q['id'],
                                                     n_mtp,
                                                     title=True)
            plylst_add, add_scores = most_similar_emb(q['id'],
                                                      n_mtp,
                                                      genre=True)

        # Case 3: enough songs
        else:
            # sim_scores for songs, gnr_scores for tags
            plylst_ms, song_scores = most_similar_emb(q['id'], n_msp)
            plylst_mt, tag_scores = most_similar_emb(q['id'],
                                                     n_mtp,
                                                     genre=True)
            plylst_add, add_scores = most_similar_emb(q['id'],
                                                      n_mtp,
                                                      title=True)

        new_song_plylst_dict = get_new_song_plylst_dict(plylst_ms)

        # Similar playlists containing each seed song.  Seed songs found in
        # none of them have no overlap ratio and are skipped; this replaces
        # a bare ``except: pass`` that hid the ZeroDivisionError (and every
        # other exception, KeyboardInterrupt included).
        seed_plylsts = [new_song_plylst_dict[q_s] for q_s in q_songs]
        seed_plylsts = [plylsts for plylsts in seed_plylsts if plylsts]

        # 3-1. plylst_song_scores
        for idx, ms_p in enumerate(plylst_ms):
            for song in plylst_song_dic[ms_p]:
                song_plylsts = new_song_plylst_dict[song]
                song_score = 0
                for seed_set in seed_plylsts:
                    song_score += len(seed_set & song_plylsts) / len(seed_set)
                if song in freq_song:
                    plylst_song_scores[song] += song_plylst_C[
                        ms_p] * song_score * song_scores[idx] * (n_msp -
                                                                 idx) * 4
                else:
                    plylst_song_scores[song] += song_plylst_C[
                        ms_p] * song_score * song_scores[idx] * (n_msp - idx)
            # NOTE(review): this indexes tag_scores (from plylst_mt) with a
            # position in plylst_ms; it raises IndexError if plylst_ms is
            # longer than tag_scores -- confirm song_scores was not meant.
            for tag in plylst_tag_dic[ms_p]:
                plylst_tag_scores[tag] += tag_scores[idx] * (n_msp - idx)

        # 3-2. plylst_tag_scores
        for idx, mt_p in enumerate(plylst_mt):
            for tag in plylst_tag_dic[mt_p]:
                plylst_tag_scores[tag] += tag_scores[idx] * (n_mtp - idx)
            for song in plylst_song_dic[mt_p]:
                plylst_song_scores[song] += tag_scores[idx]

        # 3-3. tag score adjustment from the additional playlists
        for idx, mt_p in enumerate(plylst_add):
            for tag in plylst_tag_dic[mt_p]:
                plylst_tag_scores[tag] += add_scores[idx] * (n_mtp - idx)

        # 4) neither songs nor tags: predict seeds from the scores so far
        if no_songs_tags:
            # predicted seed songs (from title-similar playlists)
            pre_songs = sorted(plylst_song_scores.items(),
                               key=lambda x: x[1],
                               reverse=True)
            pre_songs = [scores[0] for scores in pre_songs][:200]
            pre_songs = pre_songs + remove_seen(pre_songs, song_mp)
            q_songs = pre_songs[:100]

            # predicted seed tags (from title-similar playlists)
            pre_tags = sorted(plylst_tag_scores.items(),
                              key=lambda x: x[1],
                              reverse=True)
            pre_tags = [scores[0] for scores in pre_tags][:20]
            pre_tags = pre_tags + remove_seen(pre_tags, tag_mp)
            q_tags = pre_tags[:10]

        # 5) recommendation for the question playlist
        ## songs
        lt_song_art = []
        if len(q_songs) > 0:
            # seed songs available: rank by similarity score
            plylst_song_scores = sorted(plylst_song_scores.items(),
                                        key=lambda x: x[1],
                                        reverse=True)

            # artists of the seed songs, most frequent first
            lt_artist = []
            for w_song in q_songs:
                lt_artist.extend(song_artist_dic[w_song])
            counter_artist = Counter(lt_artist)
            counter_artist = sorted(counter_artist.items(),
                                    key=lambda x: x[1],
                                    reverse=True)
            if few_songs_tags:
                artist = [art[0] for art in counter_artist]
            else:
                artist = [x[0] for x in counter_artist if x[1] > 1]

            # one extra song per remaining artist, taken from lower-ranked
            # candidates
            cand_ms = [scores[0] for scores in plylst_song_scores
                       ][(100 - len(artist)):1000]
            for cand in cand_ms:
                if artist == []:
                    break
                # NOTE(review): this stops the whole scan at the first
                # already-seen candidate -- confirm ``continue`` was not
                # intended.
                if cand in q_songs:
                    break
                for art in song_artist_dic[cand]:
                    if art in artist:
                        lt_song_art.append(cand)
                        artist.remove(art)
                        break
            song_ms = [scores[0] for scores in plylst_song_scores][:200]

        else:
            # no seed songs but tags: songs co-occurring with the tags
            song_ms = most_similar(tag_song_C, 200)

        ## tags (same ranking whether or not seed tags exist)
        plylst_tag_scores = sorted(plylst_tag_scores.items(),
                                   key=lambda x: x[1],
                                   reverse=True)
        tag_ms = [scores[0] for scores in plylst_tag_scores][:20]

        ## drop songs issued on/after the playlist's update date
        if q['updt_date']:
            # characters 0-3, 5-6 and 8-9 joined; presumably turns
            # "YYYY-MM-DD ..." into "YYYYMMDD" -- verify the input format
            q_updt_date = q['updt_date'][:4] + q['updt_date'][5:7] + q[
                'updt_date'][8:10]
            song_ms = [x for x in song_ms if song_issue_dic[x] < q_updt_date]

        ## de-duplicate and top up with most_popular when short
        song_candidate = song_ms + remove_seen(song_ms, song_mp)
        tag_candidate = tag_ms + remove_seen(tag_ms, tag_mp)

        song_remove = q_songs
        tag_remove = q_tags

        song_candidate = song_candidate[:100] if no_songs_tags else remove_seen(
            song_remove, song_candidate)[:100]
        if len(lt_song_art) > 0:
            # overwrite the tail of the list with the artist-based picks
            lt_song_art = [x for x in lt_song_art if x not in song_candidate]
            song_candidate[(100 - len(lt_song_art)):100] = lt_song_art

        rec_list.append({
            "id":
            q["id"],
            "songs":
            song_candidate,
            "tags":
            tag_candidate[:10] if no_songs_tags else remove_seen(
                tag_remove, tag_candidate)[:10]
        })

    # 6) optionally save the results as JSON
    if save:
        write_json(
            rec_list, 'results/results_' +
            dt.datetime.now().strftime("%y%m%d-%H%M%S") + '_' + mode + '.json')

    return rec_list