def song_based(self, mode='s2v', by='mean', keyedvector=True):
    # Build playlist vectors from per-song vectors (Song2Vec or Doc2Vec).
    if mode == 's2v':
        if not self.s2v:
            print("Song2Vec does not exist.\nRun make_s2v first.")
            return
    elif mode == 'd2v':
        if not self.d2v:
            print("Doc2Vec does not exist.\nRun make_d2v first.")
            return
    else:
        print("'mode' must be 's2v' or 'd2v'")
        return
    if by not in ('mean', 'sum'):
        raise RuntimeError("'by' must be 'mean' or 'sum'")

    ply_id = []
    ply_vec = []
    for p in tqdm(self.data):
        tmp = [] if by == 'mean' else 0
        for song in p['songs']:
            try:
                if by == 'mean':
                    if mode == 's2v':
                        tmp.append(self.s2v.wv.get_vector(str(song)))
                    else:
                        tmp.append(self.d2v.wv.get_vector(str(song)))
                else:
                    if mode == 's2v':
                        tmp += self.s2v.wv.get_vector(str(song))
                    else:
                        tmp += self.d2v.wv.get_vector(str(song))
            except KeyError:
                # Song not in the vocabulary; skip it.
                pass
        # Keep the playlist only if at least one song had a vector.
        if by == 'mean':
            if tmp != []:
                ply_id.append('(' + str(p['id']) + ') ' + p['plylst_title'])
                ply_vec.append(np.mean(tmp, axis=0))
        else:
            if type(tmp) != int:
                ply_id.append('(' + str(p['id']) + ') ' + p['plylst_title'])
                ply_vec.append(tmp)

    print("Original data length: ", len(self.data))
    print("Embedded data length: ", len(ply_id))
    if not keyedvector:
        return ply_id, ply_vec
    out = WordEmbeddingsKeyedVectors(vector_size=100)
    out.add(ply_id, ply_vec)
    return out
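The same build-and-query pattern outside the class, as a minimal self-contained sketch (gensim 3.x API; the song vectors and playlist ids below are made up for illustration):

import numpy as np
from gensim.models.keyedvectors import WordEmbeddingsKeyedVectors

# Toy song vectors standing in for Song2Vec/Doc2Vec output.
song_vecs = {"101": np.random.rand(100), "102": np.random.rand(100), "103": np.random.rand(100)}
playlists = {"(1) rainy day": ["101", "102"], "(2) workout": ["102", "103"]}

out = WordEmbeddingsKeyedVectors(vector_size=100)
ply_ids = list(playlists)
ply_vecs = [np.mean([song_vecs[s] for s in songs], axis=0) for songs in playlists.values()]
out.add(ply_ids, ply_vecs)
print(out.most_similar("(1) rainy day", topn=1))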
def train(args):
    # Log progress during training.
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    # Use the text8 corpus as training data; haikus don't provide sufficient context.
    training_data = api.load('text8')

    # Use the Phrases model to recognize bigrams like "White House" or "Climate Change".
    bigram_model = Phrases(training_data)

    # Export the trained phrase model: less RAM, faster processing, but no further updates.
    bigrams = Phraser(bigram_model)

    # Create and train the Word2Vec model on the bigram-transformed corpus.
    model = Word2Vec(bigrams[training_data], size=args.embedding_dim)

    word_list = list(model.wv.vocab.keys())
    vector_list = [model[word] for word in word_list]

    # The basic model does not seem to support item assignment,
    # but WordEmbeddingsKeyedVectors does.
    kv = WordEmbeddingsKeyedVectors(args.embedding_dim)
    kv.add(word_list, vector_list)
    kv.add(["<eos>", "<n>", "<unk>"], np.random.rand(3, args.embedding_dim))

    # Just to be safe, clear the cache of normalized vectors;
    # see https://github.com/RaRe-Technologies/gensim/issues/2532 for a similar issue.
    del kv.vectors_norm

    # Save the new models.
    bigrams.save(f"{args.model_path}/bigram.model")
    kv.save(f"{args.model_path}/word2vec.model")
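A hedged usage sketch for the two files saved above (the directory is illustrative, and the printed bigram depends on what Phrases actually learned from text8):

from gensim.models import KeyedVectors
from gensim.models.phrases import Phraser

bigrams = Phraser.load("models/bigram.model")    # illustrative path
kv = KeyedVectors.load("models/word2vec.model")

print(bigrams["the white house said".split()])   # detected bigrams are joined with '_'
print(kv.most_similar("house", topn=5))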
def to_keyed_vectors(self, embd_matrix, dim, delete_unknown=True):
    """
    Transform to gensim's keyed vectors structure for further usage.
    https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/models/keyedvectors.py
    """
    vectors = WordEmbeddingsKeyedVectors(vector_size=dim)
    tokens = self.corpus.vocab.tokens
    if delete_unknown:
        # Delete the last row (the <UNK> token).
        embd_matrix = np.delete(embd_matrix, (-1), axis=0)
    else:
        # The last row belongs to the <UNK> token, so append it to the token list.
        tokens.append("<UNK>")
    vectors.add(tokens, embd_matrix)
    return vectors
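The returned object behaves like any other gensim keyed-vectors index. A stand-alone sketch of the delete_unknown=True branch with a made-up token list and matrix:

import numpy as np
from gensim.models.keyedvectors import WordEmbeddingsKeyedVectors

tokens = ["cat", "dog", "tree"]
embd_matrix = np.random.rand(4, 50)   # last row would hold the <UNK> vector

vectors = WordEmbeddingsKeyedVectors(vector_size=50)
vectors.add(tokens, np.delete(embd_matrix, -1, axis=0))
print(vectors.word_vec("dog").shape)  # (50,)
print(vectors.most_similar("cat", topn=2))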
def item2vec(dataset, min_count=3, size=300, sg=5):
    # Use the longest playlist as the window so every item pair co-occurs.
    window = max(list(map(len, dataset)))
    p2v_model = WordEmbeddingsKeyedVectors(size)  # note: created but not used here
    w2v_model = Word2Vec(dataset, min_count=min_count, size=size,
                         window=window, sg=sg, seed=1025)
    w2v_model.save('item2vec.model')
    return w2v_model
def compute_similar_nn(early, later):
    # Build one keyed-vector index per time period.
    M_early = WordEmbeddingsKeyedVectors(300)
    M_early.add(words, early.to_numpy())
    M_later = WordEmbeddingsKeyedVectors(300)
    M_later.add(words, later.to_numpy())

    # Score each word by how many of its nearest neighbours are shared across periods.
    scores = list()
    for word in words:
        early_similar = get_similar_set(word, M_early)
        later_similar = get_similar_set(word, M_later)
        count = len(early_similar.intersection(later_similar))
        scores.append(count)
    return scores
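get_similar_set is not defined in this snippet. A minimal sketch of what it presumably does, returning a word's top-n neighbours as a set; the helper name, the topn value, and the use of most_similar are assumptions:

def get_similar_set(word, keyed_vectors, topn=20):
    # Hypothetical helper: the set of a word's nearest neighbours in one period.
    return {neighbour for neighbour, _ in keyed_vectors.most_similar(word, topn=topn)}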
def getRecos(liked, disliked, watched, threshold=1000):
    liked = [str(l) for l in liked]
    disliked = [str(l) for l in disliked]
    watched = [str(w) for w in watched
               if str(w) not in liked and str(w) not in disliked]
    df_restr = df_movies[~df_movies["movieId"].isin(watched)].sort_values(
        by="count", ascending=False)
    kv = WordEmbeddingsKeyedVectors(movie_embedding_size)
    kv.add(df_restr['movieId'].apply(str).values, w[df_restr.movieId])
    idlist = [int(x[0]) for x in kv.most_similar(positive=liked,
                                                 negative=disliked,
                                                 restrict_vocab=4000,
                                                 topn=12)]
    return getAll(idlist=idlist)
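This function relies on module-level objects defined elsewhere (df_movies, the embedding matrix w, movie_embedding_size, getAll). Because most_similar adds the liked vectors and subtracts the disliked ones before ranking, a hedged call could look like this (the movieIds are illustrative):

recos = getRecos(liked=[1, 318], disliked=[3578], watched=[1, 318, 296, 3578])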
def compute_scaled_average_sim(early, later):
    M_early = WordEmbeddingsKeyedVectors(300)
    M_early.add(words, early.to_numpy())
    M_later = WordEmbeddingsKeyedVectors(300)
    M_later.add(words, later.to_numpy())

    scores = list()
    for word in words:
        early_values = M_early.most_similar(word, topn=20)
        later_values = M_later.most_similar(word, topn=20)
        early_dict = {v[0]: v[1] for v in early_values}
        later_dict = {v[0]: v[1] for v in later_values}

        # Neighbours that appear in both periods' top-20 lists.
        overlap = set([w for w in early_dict.keys() if w in later_dict])

        # Average similarity of the shared neighbours in each period.
        early_avg = 0
        later_avg = 0
        for entry in overlap:
            early_avg += early_dict[entry]
            later_avg += later_dict[entry]
        early_avg = early_avg / len(overlap) if len(overlap) else 0
        later_avg = later_avg / len(overlap) if len(overlap) else 0

        # Score = overlap size plus a bonus for how little the average similarity changed.
        scores.append(len(overlap) + (1 - abs(later_avg - early_avg)))
    return scores
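A worked example of the scoring rule (the numbers are made up): with 12 shared top-20 neighbours and average similarities of 0.71 and 0.65 over that overlap, the word scores about 12.94.

overlap_size = 12                    # shared top-20 neighbours across the two periods
early_avg, later_avg = 0.71, 0.65    # average similarity over the overlap in each period
score = overlap_size + (1 - abs(later_avg - early_avg))   # ~12.94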
def make_prediction(favorite_movie):
    """
    Input:
        favorite_movie: the key of the user's favourite movie.
    Output:
        A list of the keys of the most similar movies.
    """
    movie = favorite_movie
    threshold = 100
    mainstream_movies = movies_df[movies_df.n_ratings >= threshold].reset_index(drop=True)
    movie_embedding_size = w.shape[1]
    kv = WordEmbeddingsKeyedVectors(movie_embedding_size)
    kv.add(mainstream_movies['key'].values, w[mainstream_movies.movieId])
    results = kv.most_similar(movie)
    return [result[0] for result in results]
def createKVs(DFFile, COFIle, type):
    # createWordandVectorList()  # to create word and vector lists for wiki 50
    # wordList - list of words
    # vectorList - list of the vectors corresponding to the words
    wordListW2V, vectorListW2V = loadWordANdVectorsW2V()
    wordListPCA, vectorListPCA = loadWordANdVectorsPCA(DFFile, COFIle)
    w2v_len = 50
    PCA_len = 10
    kv1 = WordEmbeddingsKeyedVectors(w2v_len)
    kv2 = WordEmbeddingsKeyedVectors(PCA_len)
    kv1.add(wordListW2V, list(vectorListW2V))
    kv2.add(wordListPCA, vectorListPCA)
    filename = 'KV' + type + '.obj'
    with open(filename, "wb") as f:
        pickle.dump(kv1, f)
        pickle.dump(kv2, f)
    print(kv1.most_similar('love'))  # words most similar to 'love'
    return filename
def get_p2v_model(train, val, w2v_model):
    p2v_model = WordEmbeddingsKeyedVectors(100)
    ID = []
    vec = []
    data = pd.concat([train, val], axis=0)
    for id_, songs, tags in zip(data['id'], data['songs'], data['tags']):
        # Sum the vectors of every song and tag the playlist contains.
        tmp_vec = 0
        for token in songs + tags:
            try:
                tmp_vec += w2v_model.wv.get_vector(str(token))
            except KeyError:
                pass
        # Keep the playlist only if at least one token had a vector.
        if type(tmp_vec) != int:
            ID.append(str(id_))
            vec.append(tmp_vec)
    p2v_model.add(ID, vec)
    file_name = "./manual_emb/p2v_mdl_" + get_time() + ".model"
    p2v_model.save(file_name)
    return p2v_model
def main():
    args = parse_args()
    if not args.input.is_file():
        raise FileNotFoundError('%r is not a file' % args.input)
    if not args.outputdir.is_dir():
        raise FileNotFoundError('%r is not a directory' % args.outputdir)
    if args.outputname:
        outfile = args.outputdir / args.outputname
    else:
        name = args.input.stem + '-small.pkl'
        outfile = args.outputdir / name

    # Collect every token that occurs in the given splits, plus the special tokens.
    kv = load_fasttext_embeddings(args.input)
    vector_size = kv.vector_size
    token_iter = chain.from_iterable(iterate_tokens(s) for s in args.splits)
    words = list(set(token_iter)) + ["__UNK__", "__PAD__"]

    # Copy only the needed vectors into a new, much smaller keyed-vectors object.
    embeddings = np.zeros((len(words), vector_size))
    for row, word in tqdm.tqdm(enumerate(words)):
        embeddings[row, :] = kv.word_vec(word)
    new_kv = WordEmbeddingsKeyedVectors(vector_size)
    new_kv.add(words, embeddings)
    new_kv.save(str(outfile))
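A hedged sketch of consuming the shrunken file downstream; the path is illustrative and the __UNK__ fallback is an assumption about how the special token is meant to be used:

from gensim.models import KeyedVectors

small_kv = KeyedVectors.load("embeddings-small.pkl")   # illustrative path

def lookup(token):
    # Fall back to the __UNK__ vector for out-of-vocabulary tokens.
    return small_kv[token] if token in small_kv.vocab else small_kv["__UNK__"]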
class PlaylistEmbedding: # java의 생성자 같은 존재 __init__ def __init__(self, FILE_PATH): self.FILE_PATH = FILE_PATH # word2vec의 요소들 # 최소 1번 이상 연관이 있어야 학습한다. self.min_count = 2 # 의미를 담을 벡터를 150차원으로 만든다. self.size = 150 # 중심단어 기준으로 앞뒤로 210개 범위까지 학습시킨다. self.window = 210 # sg = 1이면 skip-gram 아니면 CBOW self.sg = 1 # 키 + 벡터를 저장함 # KeyedVectors는 추가 교육을 지원하지 않는 대신 더 작고 RAM을 덜 차지한다. self.p2v_model = WordEmbeddingsKeyedVectors(self.size) # 유니코드 한글 시작: 44032, 끝:55199 self.BASE_CODE, self.CHOSUNG, self.JUNGSUNG = 44032, 588, 28 # 초성 리스트0~18 self.CHOSUNG_LIST = [ 'ㄱ', 'ㄲ', 'ㄴ', 'ㄷ', 'ㄸ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅃ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅉ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ' ] # 중성 리스트 0~20 self.JUNGSUNG_LIST = [ 'ㅏ', 'ㅐ', 'ㅑ', 'ㅒ', 'ㅓ', 'ㅔ', 'ㅕ', 'ㅖ', 'ㅗ', 'ㅘ', 'ㅙ', 'ㅚ', 'ㅛ', 'ㅜ', 'ㅝ', 'ㅞ', 'ㅟ', 'ㅠ', 'ㅡ', 'ㅢ', 'ㅣ' ] # 종성 리스트 0~27 self.JONGSUNG_LIST = [ '', 'ㄱ', 'ㄲ', 'ㄳ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ', 'ㄼ', 'ㄽ', 'ㄾ', 'ㄿ', 'ㅀ', 'ㅁ', 'ㅂ', 'ㅄ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ' ] # 여기에 자모분리된 train의 플레이리스트 제목들이 담긴다. self.title_list_detach = [] # FILE_PATH가 가리키는 곳에 train, test와 most_popular_res.json, song_meta.json이 있어야 합니다. with open(FILE_PATH + '/train.json', encoding="utf-8") as f: self.train = json.load(f) self.train = random.sample(self.train, 30000) with open(FILE_PATH + '/val2.json', encoding="utf-8") as f: self.val = json.load(f) with open(FILE_PATH + '/most_popular_res.json', encoding="utf-8") as f: self.most_results = json.load(f) # song_meta 데이터를 가져온다. with open(FILE_PATH + '/song_meta.json', encoding="utf-8") as f: self.song_meta = json.load(f) def write_json(self, data, fname): def _conv(o): if isinstance(o, (np.int64, np.int32)): return int(o) raise TypeError parent = os.path.dirname(fname) distutils.dir_util.mkpath( "C:/Users/hwang in beom/Desktop/final/full/" + parent) with io.open("C:/Users/hwang in beom/Desktop/final/full/" + fname, "w", encoding="utf-8") as f: json_str = json.dumps(data, ensure_ascii=False, default=_conv) f.write(json_str) def remove_seen(self, seen, l): seen = set(seen) return [x for x in l if not (x in seen)] # train, val의 곡과 태그를 플레이리스트 id를 key값으로 가지는 딕셔너리에 저장 def get_dic(self, train, val): song_dic = {} tag_dic = {} data = train + val for q in tqdm(data): song_dic[str(q['id'])] = q['songs'] tag_dic[str(q['id'])] = q['tags'] print() self.song_dic = song_dic self.tag_dic = tag_dic # 여기서 토탈로 train의 곡과 태그만 보내기 때문에 모델에는 train만 학습됨 total = list( map(lambda x: list(map(str, x['songs'])) + list(x['tags']), data)) total = [x for x in total if len(x) > 1] self.total = total def get_w2v(self, total, min_count, size, window, sg): try: print("get_w2v 실행") if not (os.path.isfile( "C:/Users/hwang in beom/Desktop/final/full/w2v_model.model" )): print("get_w2v 모델 학습 시작") # window가 210인 이유는 태그 10개와 곡 200개 꽉차있는 플레이리스트도 존재하기 때문이다. . 
iter는 반복횟수 w2v_model = Word2Vec(total, min_count=min_count, size=size, window=window, sg=sg, iter=25) print("get_w2v 모델 학습 완료") self.w2v_model = w2v_model w2v_model.save( "C:/Users/hwang in beom/Desktop/final/full/w2v_model.model" ) print("w2v_model 모델 로드") self.w2v_model = Word2Vec.load( "C:/Users/hwang in beom/Desktop/final/full/w2v_model.model") except OSError as e: print("failed to create directory!") raise def update_p2v(self, train, val, w2v_model): ID = [] vec = [] # val에 있는 곡이나 태그들 중 train에는 없어서 예외처리되는 것을 확인하기 위한 카운트 # 이 부분은 나중에 제거해도 상관 없음 self.yes_songs_count = 0 self.yes_tags_count = 0 self.no_songs_count = 0 self.no_tags_count = 0 # 두개를 합치고 for q in tqdm(train + val): tmp_vec = 0 songs_vec = 0 tags_vec = 0 # 둘다 1 이상일 때 확인 if len(q['songs']) >= 1 or len(q['tags']) >= 1: # 노래를 가지고 for문을 돌리고 for x in q['songs']: # word2vec 을 통해 백터를 가지고 온다. 이때 song의 x를 하나씩 넣어서 추가해주고 이것에 대한 개수를 센다 try: songs_vec += w2v_model.wv.get_vector(str(x)) self.yes_songs_count += 1 except: self.no_songs_count += 1 # song에 했던 것과 똑같이 한다. for y in q['tags']: try: tags_vec += w2v_model.wv.get_vector(str(y)) self.yes_tags_count += 1 except: self.no_tags_count += 1 # 2개를 더한다. tmp_vec = songs_vec + tags_vec # 만약에 타입이 int가 아니면 ID와 vec를 append 한다 if type(tmp_vec) != int: ID.append(str(q['id'])) vec.append(tmp_vec) # train, val의 플레이리스트 id에 해당하는 vector값을 구함 self.p2v_model.add(ID, vec) # FastText def get_title(self, train): title_list = [] for q in train: title_list.append(q['plylst_title']) self.title_list = title_list def jamo_str(self, text, BASE_CODE, CHOSUNG, JUNGSUNG, CHOSUNG_LIST, JUNGSUNG_LIST, JONGSUNG_LIST): # 데이터 정제 def clean_str(text): pattern = '([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)' # E-mail제거 text = re.sub(pattern=pattern, repl='', string=text) pattern = '(http|ftp|https)://(?:[-\w.]|(?:%[\da-fA-F]{2}))+' # URL제거 text = re.sub(pattern=pattern, repl='', string=text) pattern = '([ㄱ-ㅎㅏ-ㅣ]+)' # 한글 자음, 모음 제거 text = re.sub(pattern=pattern, repl=' ', string=text) pattern = '<[^>]*>' # HTML 태그 제거 text = re.sub(pattern=pattern, repl=' ', string=text) pattern = '[^\w\s]' # 특수기호제거 text = re.sub(pattern=pattern, repl=' ', string=text) return text string = text string = clean_str(string) # print(string) # 리스트로 형변환 sp_list = list(string) # print(sp_list) result = [] for keyword in sp_list: # 한글 여부 check 후 분리 (keyword가 none이 아니면) if re.match('.*[ㄱ-ㅎㅏ-ㅣ가-힣]+.*', keyword) is not None: # 만약 keyword 가 ' '면 그냥 result에 넣는다. if keyword == ' ': result.append(' ') # 키워드안에 초성리스트 , 중성리스트, 종성 리스트가 들어가면 '' 을 넣는다. if keyword in CHOSUNG_LIST or keyword in JUNGSUNG_LIST or keyword in JONGSUNG_LIST: result.append('') else: # 초성 ord->문자의 코드값을 구한다 # keyword의 아스키 코드값 - basecode를 뺀다. char_code = ord(keyword) - BASE_CODE # char_code - 초성 char1 = int(char_code / CHOSUNG) # 초성 리스트에서 char1의 인덱스에 해당하는 값을 넣는다. result.append(CHOSUNG_LIST[char1]) # 중성 char2 = int((char_code - (CHOSUNG * char1)) / JUNGSUNG) result.append(JUNGSUNG_LIST[char2]) # 종성 char3 = int( (char_code - (CHOSUNG * char1) - (JUNGSUNG * char2))) if char3 == 0: result.append('-') result.append(JONGSUNG_LIST[char3]) # 아니면 그냥 넣는다. else: result.append(keyword) results_all = [] # 리스트에서 문자열로 변환 results_all = ("".join(result)) # 저장 self.results_all = results_all def get_title_list(self, results_all): # print("".join(result)) #자모 분리 결과 출력? 
title_list_detach = [] title_list_detach.append(results_all) self.title_list_detach.append(title_list_detach) def make_title_model(self, title_list_detach): try: print("make_title_model 실행") if not (os.path.isfile( "C:/Users/hwang in beom/Desktop/final/full/FT_title_model.model" )): print("make_title_model 모델 학습 시작") FT_title_model = FT_gensim(title_list_detach, size=300, window=100, min_count=1, sg=1, iter=2000) print("make_title_model2 모델 학습 완료") self.FT_title_model = FT_title_model FT_title_model.save( "C:/Users/hwang in beom/Desktop/final/full/FT_title_model.model" ) self.FT_title_model = FT_gensim.load( "C:/Users/hwang in beom/Desktop/final/full/FT_title_model.model" ) print("make_title_model 모델 로드됨") except OSError as e: print("failed to create directory!") raise # Fasttext끝 def get_result(self, p2v_model, song_dic, tag_dic, most_results, val, train, FT_title_model, song_meta): title_sentence_train = [] # train에서 한글 정제 작업을 한다. 그때 plylst_title 부분을 한다. for x in train: self.jamo_str(x['plylst_title'], self.BASE_CODE, self.CHOSUNG, self.JUNGSUNG, self.CHOSUNG_LIST, self.JUNGSUNG_LIST, self.JONGSUNG_LIST) # 위에꺼를 하면 results_all이 나오고 이 값을 title_sentence_train 넣는다. title_sentence_train.append(self.results_all) answers = [] # 제대로 진행되고 있는지 알기 위해 세는 카운트 # most_id는 제대로 뽑히고 있는가? self.most_id = [] # ply_embedding 추천이 제대로된 플레이리스트는 몇개인가 self.p2v_count = 0 # 예외처리된 플레이리스트는 몇개인가 self.except_count = 0 # 어디서 끊기는지 정확히 알고 싶으면 옮기면서 카운트해보는 변수 self.when_stop = 0 # 문제유형별로 몇개의 플레이리스트가 있는 세는 카운트 self.TNSN = 0 self.TYSN = 0 self.TNSY = 0 self.TYSY = 0 # 곡이나 태그가 100, 10개 안채워졌을 때 채우는 카운트 self.update_song_count = 0 self.update_tag_count = 0 # tqdm으로 진행도를 나타내고 val의 개수 만큼 돌아가는데 enumerate를통해 몇번째 돌고 있는지를 n에 넣어서 보여준다. for n, q in tqdm(enumerate(val), total=len(val)): # 제목, 곡, 태그 유무 파악 및 개수 세기 songs = q['songs'] tags = q['tags'] songs_count = len(songs) tags_count = len(tags) try: # 플레이리스트 임베딩하는 알고리즘(곡으로 곡추천할 때 씀) def ply_em(q): # test or val 값을 넣고 이제 id 값을 넣었을때 유사한 것들을 가져와 most_id에 넣는다. most_id = [ x[0] for x in p2v_model.most_similar(str(q['id']), topn=15) ] # most_vec = [x[1] for x in p2v_model.most_similar(str(q['id']), topn=15)] # 원본 get_song = [] get_tag = [] # most_id의 각각의 id 값을 song_dic 와 tag_dic에 넣어 노래와 태그를 얻는다. for ID in most_id: get_song += song_dic[ID] get_tag += tag_dic[ID] # 반복되는 노래에 대해 카운트를 추가하면서 카운트를 늘린다 count = {} for i in get_song: try: count[i] += 1 except: count[i] = 1 count = sorted(count.items(), key=lambda x: x[1], reverse=True) # 반복되는 태그에 대해 카운트를 추가하면서 카운트를 늘린다 count2 = {} for i in get_tag: try: count2[i] += 1 except: count2[i] = 1 count2 = sorted(count2.items(), key=lambda x: x[1], reverse=True) # 이거는 위에서 봤을때 몇번째 돌고 멈추는지 체크할 때 쓰는거 같았음 self.when_stop += 1 real_get_song = [] real_get_tag = [] for song in count: real_get_song.append(song[0]) for tag in count2: real_get_tag.append(tag[0]) # get_song = list(pd.value_counts(get_song)[:500].index) # get_tag = list(pd.value_counts(get_tag)[:20].index) def to_integer(dt_time): return 10000 * dt_time.year + 100 * dt_time.month + dt_time.day # 시간을 가지고 몰 하는거 같은데 utc_time = datetime.strptime(q['updt_date'][:26], '%Y-%m-%d %H:%M:%S.%f') updt = int(to_integer(utc_time)) true_get_song = [] # 위에서 얻은 real_get_song의 song_id를 하나씩 꺼내서 for song_id in real_get_song: # songmeta의 song_id로 값을 찾은 뒤 issue_data를 꺼낸다. issue = int(song_meta[song_id]['issue_date']) # 여기서 업데이트 했던 내역 - issue를 했을때 값이 0보다 크면 넣고 적으면 안넣는다. 
아마도 이상치 처리하는 듯 if updt - issue >= 0: true_get_song.append(song_id) else: pass answers.append({ "id": q["id"], "songs": self.remove_seen(q["songs"], true_get_song)[:100], "tags": self.remove_seen(q["tags"], real_get_tag)[:10], }) # 여기까지 오면 카운트 추가 self.p2v_count += 1 # FastText 알고리즘 (여기서는 곡 정보가 없을때 나머지 것들 이용했다.) def fasttext_title(q): train_ids = [] get_song = [] get_tag = [] # 한글정제 (자음모음) self.jamo_str(q['plylst_title'], self.BASE_CODE, self.CHOSUNG, self.JUNGSUNG, self.CHOSUNG_LIST, self.JUNGSUNG_LIST, self.JONGSUNG_LIST) # 나온 값을 title에 저장 title = self.results_all # FT_title_model이라는 것을 사용하는데 title을 넣고 가장 유사한 것들을 뽑아내는거 같음? F_list = FT_title_model.wv.most_similar(title, topn=60) # 여기서 나온 값들을 하나씩 뽑아내서 for x in F_list: # title_sentence_train의 인덱스에서 F_list에서 뽑아낸 x의 [0]번째 값을 넣고 number라는 곳에 저장하고 number = title_sentence_train.index(x[0]) # train에 해당 인덱스에 id값을 빼서 train_ids에 넣는다. train_ids.append(train[number]['id']) # train ids의 하나하나 id 의 값을 빼서 for ids in train_ids: # song_dic에서 해당 하는 id값을 찾아서 get_song을 만들고 get_song += song_dic[str(ids)] # tag_dix에서 해당 하는 id값을 찾아서 get_tag에 넣는다. get_tag += tag_dic[str(ids)] # 여러번 나오는 값들에 +1을 계속하고 아닌 것들에 대해서는 1만 넣는다. count = {} for i in get_song: try: count[i] += 1 except: count[i] = 1 # 이것을 sorted 해서 많이 나온 순서대로 정렬한다. count = sorted(count.items(), key=lambda x: x[1], reverse=True) # 태그또한 여러번 나오는 i에 대해 +1을 계속한다. count2 = {} for i in get_tag: try: count2[i] += 1 except: count2[i] = 1 count2 = sorted(count2.items(), key=lambda x: x[1], reverse=True) real_get_song = [] real_get_tag = [] # 노래에 대해 카운트한걸 하나씩 뽑아내서 그 값에 0번째 인덱스를 real_get_song에 추가한다. for song in count: real_get_song.append(song[0]) # 태그에 대해 카운트한걸 하나씩 뽑아내서 그 값에 0번째 인덱스를 real_get_tag 추가한다. for tag in count2: real_get_tag.append(tag[0]) # get_song = list(pd.value_counts(real_get_song)[:200].index) # get_tag = list(pd.value_counts(real_get_tag)[:20].index) # 예외처리하는 부분 현재 플레이 리스트를 만든 연도와 들어가는 노래의 연도를 비교하여 플레이 리스트가 더 앞에 있으면 해당 하는 노래를 뺀다. def to_integer(dt_time): return 10000 * dt_time.year + 100 * dt_time.month + dt_time.day utc_time = datetime.strptime(q['updt_date'][:26], '%Y-%m-%d %H:%M:%S.%f') updt = int(to_integer(utc_time)) true_get_song = [] for song_id in real_get_song: issue = int(song_meta[song_id]['issue_date']) if updt - issue >= 0: true_get_song.append(song_id) else: pass answers.append({ "id": q["id"], "songs": self.remove_seen(q["songs"], true_get_song)[:100], "tags": self.remove_seen(q["tags"], real_get_tag)[:10], }) # 4가지 경우의 수로 나눠 예측을 하였다.곡 자체가 없을때는 fasttext_title 로 곡정보가 있을때는 ply_em 으로 하였다. 
# 태그 X 곡 X 제목 O if tags_count == 0 and songs_count == 0: self.TNSN += 1 fasttext_title(q) # 태그 O 곡 X 제목 X elif tags_count > 0 and songs_count == 0: self.TYSN += 1 fasttext_title(q) # 태그 x 곡 O elif tags_count == 0 and songs_count > 0: self.TNSY += 1 ply_em(q) # 태그 O 곡 O elif tags_count > 0 and songs_count > 0: self.TYSY += 1 ply_em(q) except: # 예외처리되면 카운터 추가 self.except_count += 1 answers.append({ "id": q["id"], "songs": most_results[n]['songs'], "tags": most_results[n]["tags"], }) # check and update answer for n, q in enumerate(answers): if len(q['songs']) != 100: answers[n]['songs'] += self.remove_seen( q['songs'], self.most_results[n]['songs'])[:100 - len(q['songs'])] self.update_song_count += 1 if len(q['tags']) != 10: answers[n]['tags'] += self.remove_seen( q['tags'], self.most_results[n]['tags'])[:10 - len(q['tags'])] self.update_tag_count += 1 self.answers = answers def run(self): # Word2Vec ply_embedding - Word2Vec를 통해 플레이 리스트를 밀집으로 표현 # train, val의 곡과 태그를 플레이리스트 id를 key값으로 가지는 딕셔너리에 저장 self.get_dic(self.train, self.val) # word2vec의 요소들을 넣어서 w2v를 실행함 - 옵션에 맞춰서 word2vec을 실행한다. # total - train과 val 데이터를 합치고 그 합친것에서 곡과 태그를 빼내 하나의 리스트로 만들어준다. # , min_count - 1번이상 연관이 있어야 학습, size - 의미를 담을 벡터를 150차원으로 만든다.? # window - 중심단어 기준으로 앞뒤로 210개 범위까지 학습, sg - 1이면 skip-gram, 아니면 CBOW # CBOW - 주변 단어들을 통해 중간의 단어를 예측하는 모델 # Skip-Gram 은 중심 단어를 통해 주변단어를 예측하는 모델 # 이 값을 word2vec에 넣고 model을 학습하고 저장한다. 그다음 이걸 로드해서 self에 저장해놓는다. self.get_w2v(self.total, self.min_count, self.size, self.window, self.sg) # p2v_model 에 값을 추가하는 작업 self.update_p2v(self.train, self.val, self.w2v_model) # FastText ply_title - facebook에서 제공해주는 것으로 play title을 생성 # train의 playlist title을 title_list라는 것에 저장한다 만든다. self.get_title(self.train) # title list와 초성,중성,종성등의 값을 넣고 데이터를 정제한다. for string in self.title_list: self.jamo_str(string, self.BASE_CODE, self.CHOSUNG, self.JUNGSUNG, self.CHOSUNG_LIST, self.JUNGSUNG_LIST, self.JONGSUNG_LIST) # 위에꺼를 하면 results_all이 나오고 이 값을 title_list_detach에 넣는다. self.get_title_list(self.results_all) self.make_title_model(self.title_list_detach) # 곡과 태그 채우는 함수 # WordEmbeddingsKeyedVectors / song_dic / tag_dic / most_popular_res 데이터 / val 데이터 / train 데이터 / FT_gensim 해서 나온 결과 / song_meta 넣기 self.get_result(self.p2v_model, self.song_dic, self.tag_dic, self.most_results, self.val, self.train, self.FT_title_model, self.song_meta) # self.write_json(self.answers, '/content/drive/MyDrive/Colab Notebooks/final/test10/results2.json') # self.write_json(self.answers, 'results50000.json') print("results 작성 완료") def train_model(self): # Word2Vec ply_embedding self.get_dic(self.train, self.val) self.get_w2v(self.total, self.min_count, self.size, self.window, self.sg) self.update_p2v(self.train, self.val, self.w2v_model) # FastText ply_title self.get_title(self.train) for string in self.title_list: self.jamo_str(string, self.BASE_CODE, self.CHOSUNG, self.JUNGSUNG, self.CHOSUNG_LIST, self.JUNGSUNG_LIST, self.JONGSUNG_LIST) self.get_title_list(self.results_all) self.make_title_model(self.title_list_detach)
def run(song_meta_data, train_data, test_data): train_data['updt_year'] = train_data['updt_date'].str.slice(start=0, stop=4) test_data['updt_year'] = test_data['updt_date'].str.slice(start=0, stop=4) song_meta_data['issue_year'] = song_meta_data['issue_date'].str.slice( start=0, stop=4) song_meta_data['id'] = song_meta_data['id'].astype(str) print("Tokenize...") tokenize(train_data, test_data) train_data = train_data.sort_values(by='updt_date').reset_index(drop=True) test_data = test_data.sort_values(by='updt_date').reset_index(drop=True) print("Total Dict Loading") if os.path.exists( BASE_DIR + 'model/total_data_final.pickle') and os.path.exists( BASE_DIR + 'model/song_dict_final.pickle') and os.path.exists( BASE_DIR + 'model/tag_dict_final.pickle') and os.path.exists( BASE_DIR + 'model/title_dict_final.pickle'): with open(BASE_DIR + 'model/total_data_final.pickle', 'rb') as handle: total_data = pickle.load(handle) with open(BASE_DIR + 'model/song_dict_final.pickle', 'rb') as handle: song_dict = pickle.load(handle) with open(BASE_DIR + 'model/tag_dict_final.pickle', 'rb') as handle: tag_dict = pickle.load(handle) with open(BASE_DIR + 'model/title_dict_final.pickle', 'rb') as handle: title_dict = pickle.load(handle) else: print("Total Dict Not Existing... Calculating") total_data, song_dict, tag_dict, title_dict = getTotalDict( train_data, test_data) with open(BASE_DIR + 'model/total_data_final.pickle', 'wb') as handle: pickle.dump(total_data, handle, protocol=pickle.HIGHEST_PROTOCOL) with open(BASE_DIR + 'model/song_dict_final.pickle', 'wb') as handle: pickle.dump(song_dict, handle, protocol=pickle.HIGHEST_PROTOCOL) with open(BASE_DIR + 'model/tag_dict_final.pickle', 'wb') as handle: pickle.dump(tag_dict, handle, protocol=pickle.HIGHEST_PROTOCOL) with open(BASE_DIR + 'model/title_dict_final.pickle', 'wb') as handle: pickle.dump(title_dict, handle, protocol=pickle.HIGHEST_PROTOCOL) print("Frequency Loading...") if os.path.exists(BASE_DIR + 'model/tag_freq_by_song.pickle') and os.path.exists( BASE_DIR + 'model/song_freq_by_tag.pickle'): with open(BASE_DIR + 'model/tag_freq_by_song.pickle', 'rb') as handle: tag_freq_by_song = pickle.load(handle) with open(BASE_DIR + 'model/song_freq_by_tag.pickle', 'rb') as handle: song_freq_by_tag = pickle.load(handle) else: print("Frequency Not Existing... Calculating") tag_freq_by_song, song_freq_by_tag = getFreqDict(train_data) with open(BASE_DIR + 'model/tag_freq_by_song.pickle', 'wb') as handle: pickle.dump(tag_freq_by_song, handle, protocol=pickle.HIGHEST_PROTOCOL) with open(BASE_DIR + 'model/song_freq_by_tag.pickle', 'wb') as handle: pickle.dump(song_freq_by_tag, handle, protocol=pickle.HIGHEST_PROTOCOL) print("Update Date Loading...") if os.path.exists(BASE_DIR + 'model/updt_dict.pickle'): with open(BASE_DIR + 'model/updt_dict.pickle', 'rb') as handle: updt_dict = pickle.load(handle) else: print("Update Date Not Existing... Calculating") updt_dict = getUpdtDict(song_meta_data) with open(BASE_DIR + 'model/updt_dict.pickle', 'wb') as handle: pickle.dump(updt_dict, handle, protocol=pickle.HIGHEST_PROTOCOL) print("Song Popularity Loading...") if os.path.exists(BASE_DIR + 'model/popular_song_by_year.pickle') and os.path.exists( BASE_DIR + 'model/popular_tag_by_year.pickle'): with open(BASE_DIR + 'model/popular_tag_by_year.pickle', 'rb') as handle: popular_tag_by_year = pickle.load(handle) with open(BASE_DIR + 'model/popular_song_by_year.pickle', 'rb') as handle: popular_song_by_year = pickle.load(handle) else: print("Song Popularity Not Existing... 
Calculating") popular_song_by_year, popular_tag_by_year = getPopularDict(train_data) with open(BASE_DIR + 'model/popular_tag_by_year.pickle', 'wb') as handle: pickle.dump(popular_tag_by_year, handle, protocol=pickle.HIGHEST_PROTOCOL) with open(BASE_DIR + 'model/popular_song_by_year.pickle', 'wb') as handle: pickle.dump(popular_song_by_year, handle, protocol=pickle.HIGHEST_PROTOCOL) print("Word2Vec Model Loading...") if os.path.exists(BASE_DIR + 'model/w2v_model_sg_title.model'): w2v_model = Word2Vec.load(BASE_DIR + 'model/w2v_model_sg_title.model') else: print("Word2Vec Model Not Found !") print("Training...") w2v_model = Word2Vec(total_data, min_count=3, size=100, window=210, sg=1) w2v_model.save(BASE_DIR + 'model/w2v_model_sg_title.model') print("Training...") p2v_model = WordEmbeddingsKeyedVectors(100) updateP2V(train_data, test_data, w2v_model, p2v_model, song_dict, tag_dict, title_dict) print("Word2Vec Second Model Loading...") if os.path.exists(BASE_DIR + 'model/w2v_tag_final.model') and os.path.exists( BASE_DIR + 'model/w2v_song_final.model'): tag_model = Word2Vec.load(BASE_DIR + 'model/w2v_tag_final.model') song_model = Word2Vec.load(BASE_DIR + 'model/w2v_song_final.model') mt = W2VModel(tag_model, "tags") ms = W2VModel(song_model, "songs") else: print("Word2Vec Second Model Not Found !") print("Tag Training...") mt = W2VModel(pd.concat([train_data, test_data]), "tags") mt.model.save(BASE_DIR + 'model/w2v_tag_final.model') print("Song Training...") ms = W2VModel(pd.concat([train_data, test_data]), "songs") ms.model.save(BASE_DIR + 'model/w2v_song_final.model') print("start") answer = [] for i, row in tqdm(test_data.iterrows()): year = str(row['updt_year']) id = str(row['id']) songs = [] tags = [] try: most_id_list = [x[0] for x in p2v_model.most_similar(id, topn=200)] fillAnswer(getItemById(most_id_list, song_dict, 200), songs, 100, song_dict, id, updt_dict, year) fillAnswer(getItemById(most_id_list, tag_dict, 20), tags, 10, tag_dict, id) except: pass if len(songs) < 100: fillAnswer(ms.recommand(test_data, int(row['id']), 200), songs, 100, song_dict, id, updt_dict, year) if len(tags) < 10: fillAnswer(mt.recommand(test_data, int(row['id']), 20), tags, 10, tag_dict, id) if len(songs) < 100: fillAnswer(getSongByTagFreq(song_freq_by_tag, row['tags'], 200), songs, 100, song_dict, id, updt_dict, year) if len(tags) < 10: fillAnswer(getTagBySongFreq(tag_freq_by_song, row['songs'], 20), tags, 10, tag_dict, id) if len(songs) < 100: fillAnswer(getSongByYear(popular_song_by_year, year, 200), songs, 100, song_dict, id, updt_dict, year) if len(tags) < 10: fillAnswer(getTagByYear(popular_tag_by_year, year, 20), tags, 10, tag_dict, id) if len(songs) < 100: try: fillAnswer( getSongByYear(popular_song_by_year, str(int(year) - 1), 20), songs, 100, song_dict, id, updt_dict, year) except: fillAnswer( getSongByYear(popular_song_by_year, str(int(year) + 1), 200), songs, 100, song_dict, id, updt_dict, year) if len(tags) < 10: try: fillAnswer( getTagByYear(popular_tag_by_year, str(int(year) - 1), 20), tags, 10, tag_dict, id) except: fillAnswer( getTagByYear(popular_tag_by_year, str(int(year) + 1), 200), tags, 10, tag_dict, id) if len(songs) < 100: print("song 의 개수가 적습니다. id : ", str(row['id']), str(year)) if len(tags) < 10: print("tag 의 개수가 적습니다. id : ", str(row['id']), str(year)) answer.append({"id": row["id"], "songs": songs, "tags": tags}) write_json(answer, "results.json")
weights = net.emb.weight.detach().cpu().numpy()

# embedding = WordEmbeddingsKeyedVectors(vector_size=300)
# for i, n in enumerate(word2index.keys()):
#     embedding.add(entities=n, weights=net.word_embeddings(n).cpu().detach())
#     if not i % 100:
#         print(f'{i}, {n}')
#
# embedding.save(os.path.join(data_path, 'keyed_values.dir'))

# =====================================================================================


def analogy(x1, x2, y1):
    result = embedding.most_similar(positive=[y1, x2], negative=[x1])
    return result[0][0]


embedding = WordEmbeddingsKeyedVectors.load(os.path.join(data_path, 'keyed_values.dir'))
print(analogy('estimate', 'estimates', 'find'))

accuracy, result = embedding.evaluate_word_analogies(
    os.path.join(data_path, 'intrinsic_test.txt'))
print(accuracy)
for r in result:
    correct_len = len(r['correct'])
    incorrect_len = len(r['incorrect'])
    print(f'{r["section"]}: {correct_len} / {(correct_len + incorrect_len)}')

# =====================================================================================
from gensim.test.utils import datapath

print(embedding.n_similarity(["king"], ["duke"]))
class Title2Rec: def __init__(self): super().__init__() self.cluster_model = None self.fasttext = None self.t2r = None self.good_tags = ['NNG', 'NNP', 'NNB', 'NP', 'NR', 'VA', 'MAG', 'SN', 'SL'] self.khaiii = KhaiiiApi() ## fit clustering def fit_clustering(self, vectors, n_clusters, verbose=0, max_iter=50): self.cluster_model = KMeans(n_clusters=n_clusters, verbose=verbose, max_iter=max_iter) print("Data length: ", len(vectors)) print("Fit KMeans...") self.cluster_model.fit(vectors) print("done.") ## preprocess for clustering def preprocess_clustering(self, titles, vectors, ID=True, khaiii=True, verbose=False): ## t: title / v: vectors / i : plylst id if ID: id_list = list(map(lambda x: x.split(' ')[0][1:-1], titles)) titles = list(map(lambda x: ' '.join(x.split(' ')[1:]), titles)) else: id_list = list(range(len(titles))) t_v = list(zip(titles, vectors, id_list)) stable = [(t, v, i) for t, v, i in t_v if re.findall('[가-힣a-zA-Z&]+', t) != []] stable = [(' '.join(re.findall('[가-힣a-zA-Z&]+|90|80|70', t)), v, i) for t, v, i in stable] stable = [(t, v, i) for t, v, i in stable if t != ''] ## title morph analysis by Khaiii def tag_process(title, khaiii, good_tags): token = khaiii.analyze(title) ## join : space bar between list element return ' '.join([morph.lex for to in token for morph in to.morphs if morph.tag in good_tags]) if khaiii: if verbose: stable = [(tag_process(t, self.khaiii, self.good_tags), v, i) for t, v, i in tqdm(stable)] stable = [(t, v, i) for t, v, i in stable if t != ''] else: stable = [(tag_process(t, self.khaiii, self.good_tags), v, i) for t, v, i in stable] stable = [(t, v, i) for t, v, i in stable if t != ''] titles = [t for t, v, i in stable] vectors = [v for t, v, i in stable] id_list = [i for t, v, i in stable] if verbose: print("Original lenght: ", len(t_v)) print("Processed length: ", len(titles)) return titles, vectors, id_list ## cleansing text before Khaiii @staticmethod def text_process(titles, ID=True): if ID: titles = list(map(lambda x: ' '.join(x.split(' ')[1:]), titles)) stable = [x for x in titles if re.findall('[가-힣a-zA-Z&]+', x) != []] stable = [' '.join(re.findall('[가-힣a-zA-Z&]+|90|80|70', x)) for x in stable] stable = [x for x in stable if x != ''] print("Only hangul & alpha & and sign.") print("Original lenght: ", len(titles)) print("Processed length: ", len(stable)) return stable ## predict cluster with cluster model, return clusters sorted by distance def pre_fasttext(self, titles, vectors): if not self.cluster_model: raise RuntimeError("Please fit clustering model.") cluster_out = self.cluster_model.predict(vectors) transform = self.cluster_model.transform(vectors) dist = [distance[cluster] for cluster, distance in zip(cluster_out, transform)] data = pd.DataFrame({'title': titles, 'cluster': cluster_out, 'distance': dist}) return data.sort_values(['cluster', 'distance']) ## mk Fasttext model with cluster(500) def fit_fasttext(self, data): sentence = data.groupby('cluster')['title'].apply(list).tolist() print("Fit fasttext...") self.fasttext = FastText(sentence) print('done.') ## mk title2rec model def fit_title2rec(self, titles, ID): keys = [i + " " + t for t, i in zip(titles, ID)] print('Fit title2rec...') vectors = list(map(self.fasttext.wv.get_vector, titles)) self.t2r = WordEmbeddingsKeyedVectors(vector_size=100) self.t2r.add(keys, vectors) print('done.') ## get title vectors from fasttext model ( most similar 10 - default) def forward(self, titles, topn=10): ft = list(map(self.fasttext.wv.get_vector, titles)) out = 
[self.t2r.wv.similar_by_vector(t, topn=topn) for t in ft] return out ## load cluster model def load_cluster(self, fname): self.cluster_model = joblib.load(fname) print("load complete") ## load fasttext model def load_fasttext(self, path): self.fasttext = gensim.models.FastText.load(path) ## load title to songs model def load_t2r(self, path): self.t2r = gensim.models.KeyedVectors.load(path) def title2rec(self, ply, song_n, tag_n, song_const, tag_const, khaiii=True): title, _, _ = self.preprocess_clustering([ply['plylst_title']], [None], ID=False, khaiii=khaiii, verbose=False) if title == []: if ply['tags'] != []: return ply['songs'], ply['tags'], 1, 0 else: return ply['songs'], ply['tags'], 1, 1 title = title[0] similars = self.forward([title], topn=200)[0] ID = [int(sim[0].split(" ")[0]) for sim in similars] similar = [sim[1] for sim in similars] tmp_df = pd.DataFrame({'id':ID, 'similar':similar}) tmp_df = pd.merge(tmp_df, train_df[['id', 'songs', 'tags']], how='left', on='id') tmp_df['song_len'] = tmp_df['songs'].apply(len) tmp_df['song_len'] = tmp_df['song_len'].cumsum().shift(1).fillna(0) song_df = tmp_df[tmp_df['song_len'] < 2000] score_dict = {} for sim, songs in zip(song_df['similar'], song_df['songs']): for i, song in enumerate(songs): score = (-math.log(i+1, 2) + song_const) * sim try: score_dict[song] += score except KeyError: score_dict[song] = score pick = sorted(score_dict.items(), key=lambda x: x[1], reverse=True) pick = [p[0] for p in pick] song_res = pick[:song_n] # date = pd.to_datetime(ply['updt_date']) # pick = [p for p in pick if song_date[p] <= date] # song_res = pick[:song_n] if len(song_res) < song_n: song_df = tmp_df[tmp_df['song_len'] >= 2000] for sim, songs in zip(song_df['similar'], song_df['songs']): for i, song in enumerate(songs): score = (-math.log(i+1, 2) + song_const) * sim try: score_dict[song] += score except KeyError: score_dict[song] = score pick = sorted(score_dict.items(), key=lambda x: x[1], reverse=True) pick = [p[0] for p in pick] # pick = [p for p in pick if song_date[p] <= date] song_res = pick[:song_n] # assert len(song_res) == song_n # song_res = [p[0] for p in pick] if ply['tags'] != []: return song_res, ply['tags'], 1, 0 tmp_df['tag_len'] = tmp_df['tags'].apply(len) tmp_df['tag_len'] = tmp_df['tag_len'].cumsum().shift(1).fillna(0) tag_df = tmp_df[tmp_df['tag_len'] < 150] score_dict = {} for sim, tags in zip(tag_df['similar'], tag_df['tags']): for i, tag in enumerate(tags): score = (-math.log(i+1, 2) + tag_const) * sim try: score_dict[tag] += score except KeyError: score_dict[tag] = score pick = sorted(score_dict.items(), key=lambda x: x[1], reverse=True)[:tag_n] tag_res = [p[0] for p in pick] return song_res, tag_res, 1, 1
if __name__ == "__main__":
    _, words, freq = extract_words(5000)
    _, _, finder = build_word_context_model(words)
    M1_plus = compute_ppmi(finder, words)
    M2_10 = apply_pca(M1_plus, 10, words)
    M2_100 = apply_pca(M1_plus, 100, words)
    M2_300 = apply_pca(M1_plus, 300, words)
    W2V = KeyedVectors.load_word2vec_format(
        './GoogleNews-vectors-negative300.bin', binary=True)

    SM2_300 = evaluate_cosine(M2_300)
    print("pca_300 correlation {}".format(calc_pearson(SM2_300)[0]))
    SW2V = evaluate_cosine(W2V, False)
    print("word2vec correlation {}".format(calc_pearson(SW2V)[0]))

    # Wrap the PPMI+PCA matrices as keyed vectors so they expose the same API as word2vec.
    SM_keyed = WordEmbeddingsKeyedVectors(300)
    SM_keyed.add(words, M2_300.to_numpy())
    M10_keyed = WordEmbeddingsKeyedVectors(10)
    M10_keyed.add(words, M2_10.to_numpy())
    M100_keyed = WordEmbeddingsKeyedVectors(100)
    M100_keyed.add(words, M2_100.to_numpy())

    tests = [(W2V, './word-test.v1.txt'), (W2V, './filtered-test.txt'),
             (M10_keyed, './word-test.v1.txt'), (M10_keyed, './filtered-test.txt'),
             (M100_keyed, './word-test.v1.txt'), (M100_keyed, './filtered-test.txt'),
             (SM_keyed, './word-test.v1.txt'), (SM_keyed, './filtered-test.txt')]
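The snippet stops after building the tests list. A hedged continuation that runs each analogy file against each model (the loop and the reporting format are assumptions):

for model, test_path in tests:
    # evaluate_word_analogies returns (overall_accuracy, per_section_results).
    accuracy, _ = model.evaluate_word_analogies(test_path)
    print("{} on {}: {:.3f}".format(type(model).__name__, test_path, accuracy))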
def infer(MODE="Test"): mode_opt = { "Valid": { "train_path": "arena_data/orig/train.json", "test_path": "arena_data/questions/val.json", "results_path": "cf2/val/results.json", "eval": True }, "Dev": { "train_path": "res/train.json", "test_path": "res/val.json", "results_path": "cf2/dev/results.json", "eval": False }, "Test": { "train_path": "res/train.json", "test_path": "res/test.json", "results_path": "cf2/test/results.json", "eval": False } } opt = mode_opt[MODE] train = pd.read_json(opt["train_path"]) test = pd.read_json(opt["test_path"]) if MODE != "Dev": dev = pd.read_json("res/val.json") if MODE != "Test": test_res = pd.read_json("res/test.json") print("Preprocessing dates") test_date = {} for i in tqdm(test.index): test_date[test.at[i, 'id']] = test.at[i, 'updt_date'] song_meta = pd.read_json("res/song_meta.json") song_date = {} for i in tqdm(song_meta.index): song_date[song_meta.at[i, "id"]] = str(song_meta.at[i, "issue_date"]) del song_meta song_update_date = [] for i in train.index: updt_date = train.loc[i, 'updt_date'][:4] + train.loc[ i, 'updt_date'][5:7] + train.loc[i, 'updt_date'][8:10] for t in train.loc[i, 'songs']: if song_date[t] > updt_date: song_date[t] = updt_date song_update_date.append(t) for i in test.index: updt_date = test.loc[i, 'updt_date'][:4] + test.loc[ i, 'updt_date'][5:7] + test.loc[i, 'updt_date'][8:10] for t in test.loc[i, 'songs']: if song_date[t] > updt_date: song_date[t] = updt_date song_update_date.append(t) if MODE != "Dev": for i in dev.index: updt_date = dev.loc[i, 'updt_date'][:4] + dev.loc[ i, 'updt_date'][5:7] + dev.loc[i, 'updt_date'][8:10] for t in dev.loc[i, 'songs']: if song_date[t] > updt_date: song_date[t] = updt_date song_update_date.append(t) if MODE != "Test": for i in test_res.index: updt_date = test_res.loc[i, 'updt_date'][:4] + test_res.loc[ i, 'updt_date'][5:7] + test_res.loc[i, 'updt_date'][8:10] for t in test_res.loc[i, 'songs']: if song_date[t] > updt_date: song_date[t] = updt_date song_update_date.append(t) print("The number of processed songs :", len(set(song_update_date))) # Loading tags extracted from tiltle pred_tag = load_json("arena_data/model/pred_tag.json") dic_pred_tag = {} for p_t in pred_tag: dic_pred_tag[p_t['id']] = p_t['predict_tag'] train['tags_org'] = train['tags'] for i in train.index: train.at[i, 'tags'] = train.at[i, 'tags'] + dic_pred_tag[train.at[i, 'id']] test['tags_org'] = test['tags'] for i in test.index: test.at[i, 'tags'] = test.at[i, 'tags'] + dic_pred_tag[test.at[i, 'id']] if MODE != "Dev": dev['tags_org'] = dev['tags'] for i in dev.index: dev.at[i, 'tags'] = dev.at[i, 'tags'] + dic_pred_tag[dev.at[i, 'id']] if MODE != "Test": test_res['tags_org'] = test_res['tags'] for i in test_res.index: test_res.at[i, 'tags'] = test_res.at[i, 'tags'] + dic_pred_tag[ test_res.at[i, 'id']] # Calculating IDF inv_doc_freq = {} for d in train['songs'] + train['tags']: for i in d: if i in inv_doc_freq: inv_doc_freq[i] += 1 else: inv_doc_freq[i] = 1 for d in test['songs'] + test['tags']: for i in d: if i in inv_doc_freq: inv_doc_freq[i] += 1 else: inv_doc_freq[i] = 1 if MODE != "Dev": for d in dev['songs'] + dev['tags']: for i in d: if i in inv_doc_freq: inv_doc_freq[i] += 1 else: inv_doc_freq[i] = 1 if MODE != "Test": for d in test_res['songs'] + test_res['tags']: for i in d: if i in inv_doc_freq: inv_doc_freq[i] += 1 else: inv_doc_freq[i] = 1 for k in inv_doc_freq: if MODE == "Valid": inv_doc_freq[k] = math.log10( (len(train) + len(test) + len(dev) + len(test_res)) / inv_doc_freq[k]) elif MODE == "Dev": 
inv_doc_freq[k] = math.log10( (len(train) + len(test) + len(test_res)) / inv_doc_freq[k]) else: inv_doc_freq[k] = math.log10( (len(train) + len(test) + len(dev)) / inv_doc_freq[k]) # Preprocessing data for CF matrix if MODE == "Valid": n_train = len(train) + len(dev) + len(test_res) elif MODE == "Dev": n_train = len(train) + len(test_res) else: n_train = len(train) + len(dev) n_test = len(test) # train + test if MODE == "Valid": plylst = pd.concat([train, dev, test_res, test], ignore_index=True) elif MODE == "Dev": plylst = pd.concat([train, test_res, test], ignore_index=True) else: plylst = pd.concat([train, dev, test], ignore_index=True) # playlist id plylst["nid"] = range(n_train + n_test) # nid -> id plylst_nid_id = dict(zip(plylst["nid"], plylst["id"])) plylst_tag = plylst['tags'] tag_counter = Counter([tg for tgs in plylst_tag for tg in tgs]) tag_dict = {x: tag_counter[x] for x in tag_counter} id_type = dict() tag_id_tid = dict() tag_tid_id = dict() for i, t in enumerate(tag_dict): tag_id_tid[t] = i tag_tid_id[i] = t id_type[t] = 1 n_tags = len(tag_dict) plylst_song = plylst['songs'] song_counter = Counter([sg for sgs in plylst_song for sg in sgs]) song_dict = {x: song_counter[x] for x in song_counter} song_id_sid = dict() song_sid_id = dict() for i, t in enumerate(song_dict): song_id_sid[t] = i song_sid_id[i] = t id_type[t] = 1 n_songs = len(song_dict) plylst_st = plylst['songs'] + plylst['tags'] st_counter = Counter([st for sts in plylst_st for st in sts]) st_dict = {x: st_counter[x] for x in st_counter} st_id_tid = dict() st_tid_id = dict() for i, t in enumerate(st_dict): st_id_tid[t] = i st_tid_id[i] = t n_sts = len(st_dict) print("Tags : ", n_tags, ", Songs : ", n_songs, ", Total : ", n_sts) plylst['songs_id'] = plylst['songs'].map( lambda x: [song_id_sid.get(s) for s in x if song_id_sid.get(s) != None]) plylst['tags_id'] = plylst['tags_org'].map( lambda x: [tag_id_tid.get(t) for t in x if tag_id_tid.get(t) != None]) plylst['sts_id'] = (plylst['songs'] + plylst['tags']).map( lambda x: [st_id_tid.get(st) for st in x if st_id_tid.get(st) != None]) plylst_use = plylst[['nid', 'updt_date', 'songs_id', 'tags_id', 'sts_id']] plylst_use.loc[:, 'num_songs'] = plylst_use['songs_id'].map(len) plylst_use.loc[:, 'num_tags'] = plylst_use['tags_id'].map(len) plylst_use.loc[:, 'num_sts'] = plylst_use['sts_id'].map(len) plylst_use = plylst_use.set_index('nid') plylst_train = plylst_use.iloc[:, :] plylst_test = plylst_use.iloc[n_train:, :] n_train = len(plylst_train) np.random.seed(33) test_set = plylst_test print("The number of test samples : ", len(test_set)) # Building CF matrices avg_len_songs = 0 for songs in plylst_train['songs_id']: avg_len_songs += len(songs) avg_len_songs /= len(plylst_train['songs_id']) avg_len_tags = 0 for tags in plylst_train['tags_id']: avg_len_tags += len(tags) avg_len_tags /= len(plylst_train['tags_id']) avg_len_sts = 0 for sts in plylst_train['sts_id']: avg_len_sts += len(sts) avg_len_sts /= len(plylst_train['sts_id']) row = np.repeat(range(n_train), plylst_train['num_songs']) col = [song for songs in plylst_train['songs_id'] for song in songs] dat = [1 for songs in plylst_train['songs_id'] for song in songs] train_songs_A = spr.csr_matrix((dat, (row, col)), shape=(n_train, n_songs)) row = np.repeat(range(n_train), plylst_train['num_tags']) col = [tag for tags in plylst_train['tags_id'] for tag in tags] dat = [1 for tags in plylst_train['tags_id'] for tag in tags] train_tags_A = spr.csr_matrix((dat, (row, col)), shape=(n_train, n_tags)) row = 
np.repeat(range(n_train), plylst_train['num_sts']) col = [st for sts in plylst_train['sts_id'] for st in sts] dat = [ inv_doc_freq[st_tid_id[st]] / (len(sts) + 50) for sts in plylst_train['sts_id'] for st in sts ] train_sts_A = spr.csr_matrix((dat, (row, col)), shape=(n_train, n_sts)) train_songs_A_T = train_songs_A.T.tocsr() train_tags_A_T = train_tags_A.T.tocsr() # Building map playlist id to songs or tags for playlist2vec if MODE == "Valid": p2v_targets = [train, test, dev, test_res] elif MODE == "Dev": p2v_targets = [train, test, test_res] else: p2v_targets = [train, test, dev] song_dic = {} tag_dic = {} for i, q in tqdm(pd.concat(p2v_targets).iterrows()): song_dic[str(q['id'])] = q['songs'] tag_dic[str(q['id'])] = q['tags_org'] # Loading playlist embedding vectors p2v_song = WordEmbeddingsKeyedVectors.load( "arena_data/model/p2v_song.model") p2v_tag = WordEmbeddingsKeyedVectors.load("arena_data/model/p2v_tag.model") print("Predicting") res = [] filtered_lot_song = [] filtered_lot_tag = [] for pid in tqdm(test_set.index): songs_already = test_set.loc[pid, "songs_id"] tags_already = test_set.loc[pid, "tags_id"] # Song prediction - 1. Query vector to predict songs p = np.zeros((n_sts, 1)) if len(test_set.loc[pid, 'sts_id']) > 0: for st in test_set.loc[pid, 'sts_id']: if st_tid_id[st] in inv_doc_freq: p[st] = inv_doc_freq[st_tid_id[st]] / ( len(test_set.loc[pid, 'sts_id']) + 50) # Song prediction - 2. K-nn playlists val = train_sts_A.dot(p).reshape(-1) val_idx = val.reshape(-1).argsort()[-250:][::-1] val_knn = np.zeros((n_train)) val_knn[val_idx] = val[val_idx] val = val_knn**2 # Song prediction - 3. Candidates cand_song = train_songs_A_T.dot(val) # Song prediction - 4. Rescoring using playlist2vec dic_song_score = {} if str(plylst_nid_id[pid]) in p2v_song.wv.vocab: most_id = [ x for x in p2v_song.most_similar(str(plylst_nid_id[pid]), topn=50) ] for ID in most_id: for s in song_dic[ID[0]]: if s in dic_song_score: dic_song_score[s] += ID[1] else: dic_song_score[s] = ID[1] for k in dic_song_score: cand_song[song_id_sid[k]] *= dic_song_score[k]**0.2 cand_song_idx = cand_song.reshape(-1).argsort()[-5000:][::-1] # Song prediction - 5. Filtering by score and date cand_song_idx_filtered = [] for cand in cand_song_idx: if cand_song[cand] > 0 and song_date[song_sid_id[ cand]] <= test_date[plylst_nid_id[pid]][:4] + test_date[ plylst_nid_id[pid]][5:7] + test_date[ plylst_nid_id[pid]][8:10]: cand_song_idx_filtered.append(cand) if len(cand_song_idx_filtered) < 400: filtered_lot_song.append(len(cand_song_idx_filtered)) cand_song_idx = np.array(cand_song_idx_filtered) # Song prediction - 6. Rescoring using heuristics dict_score = {} for idx in cand_song_idx: dict_score[idx] = cand_song[idx] mean_doc_freq = 0 std_doc_freq = 0 list_doc_freq = [] mean_song_date = 0 list_song_date = [] if len(test_set.loc[pid, "songs_id"]) > 0: for t in test_set.loc[pid, "songs_id"]: if song_sid_id[t] in inv_doc_freq: list_doc_freq.append(inv_doc_freq[song_sid_id[t]]) song_d = int(song_date[song_sid_id[t]]) if song_d > 19000000 and song_d < 20210000: list_song_date.append(song_d) if len(list_doc_freq) > 0: mean_doc_freq = np.mean(list_doc_freq) std_doc_freq = np.std(list_doc_freq) if len(list_song_date) > 0: mean_song_date = np.mean(list_song_date) # Song prediction - 6-1. 
Rescoring by IDF comparison if len(list_doc_freq) > 0: for c in dict_score: if song_sid_id[c] in inv_doc_freq: dict_score[c] = 1 / ( len(list_doc_freq)**0.5) * dict_score[c] + ( 1 - 1 / (len(list_doc_freq)**0.5)) * dict_score[c] * 2 / ( np.abs(inv_doc_freq[song_sid_id[c]] - mean_doc_freq) / (std_doc_freq + 1) + 2) else: dict_score[c] = 1 / (len(list_doc_freq)** 0.5) * dict_score[c] # Song prediction - 6-2. Rescoring by Date comparison if len(list_song_date) > 0: for c in dict_score: song_d = int(song_date[song_sid_id[c]]) if song_d > 19000000 and song_d < 20210000: dict_score[c] = 1 / ( len(list_song_date)**0.5) * dict_score[c] + ( 1 - 1 / (len(list_song_date)**0.5)) * dict_score[c] / ( np.abs(song_d - mean_song_date) / 500000 + 1) else: dict_score[c] = 1 / (len(list_song_date)** 0.5) * dict_score[c] score_sorted = sorted(dict_score.items(), key=lambda x: x[1], reverse=True) cand_song_idx = [] for t in score_sorted: cand_song_idx.append(t[0]) cand_song_idx = np.array(cand_song_idx) cand_song_idx = cand_song_idx[np.isin(cand_song_idx, songs_already) == False][:300] rec_song_idx = [song_sid_id[i] for i in cand_song_idx] # Tag prediction - 1. Query vector to predict tags p = np.zeros((n_sts, 1)) p[test_set.loc[pid, 'sts_id']] = 1 # Tag prediction - 2. K-nn playlists val = train_sts_A.dot(p).reshape(-1) val_idx = val.reshape(-1).argsort()[-250:][::-1] val_knn = np.zeros((n_train)) val_knn[val_idx] = val[val_idx] val = val_knn**2 # Tag prediction - 3. Candidates cand_tag = train_tags_A_T.dot(val) # Tag prediction - 4. Rescoring using playlist2vec dic_tag_score = {} if str(plylst_nid_id[pid]) in p2v_tag.wv.vocab: most_id = [ x for x in p2v_tag.most_similar(str(plylst_nid_id[pid]), topn=50) ] for ID in most_id: for t in tag_dic[ID[0]]: if t in dic_tag_score: dic_tag_score[t] += ID[1] else: dic_tag_score[t] = ID[1] for k in dic_tag_score: cand_tag[tag_id_tid[k]] *= dic_tag_score[k]**0.5 cand_tag_idx = cand_tag.reshape(-1).argsort()[-35:][::-1] # Tag prediction - 5. Filtering by score cand_tag_idx_filtered = [] for cand in cand_tag_idx: if cand_tag[cand] > 0: cand_tag_idx_filtered.append(cand) if len(cand_tag_idx_filtered) != 35: filtered_lot_tag.append(len(cand_tag_idx_filtered)) cand_tag_idx = np.array(cand_tag_idx_filtered) cand_tag_idx = cand_tag_idx[np.isin(cand_tag_idx, tags_already) == False][:30] rec_tag_idx = [tag_tid_id[i] for i in cand_tag_idx] res.append({ "id": plylst_nid_id[pid], "songs": rec_song_idx, "tags": rec_tag_idx }) print(len(filtered_lot_song), filtered_lot_song) print(len(filtered_lot_tag), filtered_lot_tag) write_json(res, "results/" + opt["results_path"]) if opt["eval"]: evaluator = CustomEvaluator() evaluator.evaluate("arena_data/answers/val.json", "arena_data/results/" + opt["results_path"])
def __call__(self, docs, summaries):
    tfs = []
    df = OrderedDict()
    weights = []
    entities = []

    # Per-document term frequencies and corpus document frequencies,
    # plus one word-embedding entity per newly seen token.
    for doc in docs:
        tf = OrderedDict()
        token_found = set()
        doc_token = []
        for sent in doc[1]:
            sent = sent2tokens_wostop(sent, set(stopwords.words(LANGUAGE)), LANGUAGE)
            for token in sent:
                if token in tf:
                    tf[token] += 1
                else:
                    tf[token] = 1
                if token not in token_found:
                    token_found.add(token)
                    if token in df:
                        df[token] += 1
                    else:
                        df[token] = 1
                    embedding = np.zeros(300)
                    try:
                        embedding += self.word_embedding[token]
                    except KeyError:
                        pass
                    embedding /= len(embedding)
                    weights.append(embedding)
                    entities.append(str(len(entities)))
        tfs.append(tf)

    id2word = {i: word for i, word in enumerate(df.keys())}
    word2id = {id2word[id]: id for id in id2word.keys()}
    corpora = [[(word2id[token], tf[token]) for token in tf.keys()] for tf in tfs]

    # One entity per document: TF/DF-weighted average of its word embeddings.
    self.doc_entities = []
    for i, tf in enumerate(tfs):
        divisor = sum([tf[token] / df[token] for token in tf.keys()])
        embedding = []
        for token in tf.keys():
            try:
                embedding.append(self.word_embedding[token] * tf[token] / df[token])
            except KeyError:
                pass
        embedding = np.sum(np.array(embedding), 0) / (len(embedding) * divisor)
        weights.append(embedding)
        entities.append('d' + str(i))
        self.doc_entities.append('d' + str(i))

    # Topic entities from an LDA model over the same corpus.
    self.lda = LdaModel(corpus=corpora, num_topics=10, id2word=id2word, passes=10)
    self.topic_entities = []
    for i in range(10):
        topic_words = self.lda.show_topic(i, topn=30)
        embedding = []
        divisor = sum([w_p_pair[1] for w_p_pair in topic_words])
        for w_p_pair in topic_words:
            try:
                embedding.append(self.word_embedding[w_p_pair[0]] * w_p_pair[1] / divisor)
            except KeyError:
                pass
        embedding = np.sum(np.array(embedding), 0) / len(embedding)
        weights.append(embedding)
        entities.append('t' + str(i))
        self.topic_entities.append('t' + str(i))

    # Store every entity (word, document, topic) in one keyed-vectors index.
    self.sent_embedding = WordEmbeddingsKeyedVectors(300)
    self.sent_embedding.add(entities, np.array(weights), replace=True)

    return (self.distributional_semantic_similarity(summaries),
            self.topic_relevance(summaries),
            self.coherence(summaries))
def run(total_concat, apply_data):
    total_concat['id'] = total_concat['id'].astype(str)

    # Tags that occur more than five times can also be matched inside playlist titles.
    c = Counter()
    for i in total_concat['tags']:
        c.update(i)
    tag_list = list(
        map(lambda y: y[0], (filter(lambda x: x[1] > 5, c.items()))))
    p = re.compile('|'.join(tag_list))
    total_concat['tag_in_title'] = total_concat['plylst_title'].apply(
        lambda x: p.findall(x))

    data = []
    for i in total_concat.index:
        temp = total_concat.loc[i]
        data.append({
            'id': temp['id'],
            'songs': temp['songs'],
            'tags': temp['tags'],
            'tag_in_title': temp['tag_in_title']
        })

    song_dic = {}
    tag_dic = {}
    for q in data:
        song_dic[q['id']] = q['songs']
        tag_dic[q['id']] = q['tags']

    total = list(
        map(lambda x: list(map(str, x['songs'])) + x['tags'] + x['tag_in_title'],
            data))
    total = [x for x in total if len(x) > 1]

    print("start training item2Vec")
    size = 300
    if 'item2vec.model' in os.listdir():
        w2v_model = Word2Vec.load('item2vec.model')
    else:
        w2v_model = train.item2vec(total, size=size)
    print("done. \n")

    # Build playlist vectors by summing the vectors of their songs, tags and title tags.
    p2v_model = WordEmbeddingsKeyedVectors(size)
    ID = []
    vec = []
    for q in data:
        tmp_vec = 0
        for song in list(map(str, q['songs'])) + q['tags'] + q['tag_in_title']:
            try:
                tmp_vec += w2v_model.wv.get_vector(song)
            except KeyError:
                pass
        if type(tmp_vec) != int:
            ID.append(str(q['id']))
            vec.append(tmp_vec)
    p2v_model.add(ID, vec)

    with open("./arena_data/pre_tag.json", encoding="utf-8") as f:
        our_best = json.load(f)

    not_in = 0
    answers = []
    for i, q in enumerate(apply_data.index):
        q = apply_data.loc[q]
        try:
            most_id = [
                x[0] for x in p2v_model.most_similar(str(q['id']), topn=200)
            ]
            get_song = []
            get_tag = []
            for ID in most_id:
                get_song += song_dic[ID]
                get_tag += tag_dic[ID]
            get_song = list(pd.value_counts(get_song)[:300].index)
            get_tag = list(pd.value_counts(get_tag)[:30].index)
            output_song = remove_seen(q["songs"], get_song)[:100]
            output_tag = remove_seen(q["tags"], get_tag)[:10]
            answers.append({
                "id": q["id"],
                "songs": output_song,
                "tags": output_tag,
            })
        except KeyError:
            # Fall back to the precomputed answers for playlists without a vector.
            not_in += 1
            answers.append({
                "id": our_best[i]["id"],
                "songs": our_best[i]['songs'],
                "tags": our_best[i]["tags"],
            })

    # Pad songs/tags up to 100/10 items from the fallback answers.
    for n, q in enumerate(answers):
        if len(q['songs']) != 100:
            answers[n]['songs'] += remove_seen(
                q['songs'], our_best[n]['songs'])[:100 - len(q['songs'])]
        if len(q['tags']) != 10:
            answers[n]['tags'] += remove_seen(
                q['tags'], our_best[n]['tags'])[:10 - len(q['tags'])]

    write_json(answers, 'final_tags.json')
    return answers
class PlaylistEmbedding:
    # __init__ acts as the constructor, like in Java
    def __init__(self, FILE_PATH):
        self.FILE_PATH = FILE_PATH
        # Word2Vec hyperparameters
        # Minimum number of occurrences required for an item to be learned.
        self.min_count = 2
        # Use 150-dimensional vectors to encode meaning.
        self.size = 150
        # Train on a window of up to 210 items before and after the center item.
        self.window = 210
        # sg = 1 means skip-gram, otherwise CBOW
        self.sg = 1
        # Stores keys + vectors.
        # KeyedVectors does not support further training, but is smaller and uses less RAM.
        self.p2v_model = WordEmbeddingsKeyedVectors(self.size)
        # Unicode Hangul syllables: start 44032, end 55199
        self.BASE_CODE, self.CHOSUNG, self.JUNGSUNG = 44032, 588, 28
        # Initial consonants (choseong), indices 0-18
        self.CHOSUNG_LIST = [
            'ㄱ', 'ㄲ', 'ㄴ', 'ㄷ', 'ㄸ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅃ', 'ㅅ', 'ㅆ', 'ㅇ',
            'ㅈ', 'ㅉ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ'
        ]
        # Medial vowels (jungseong), indices 0-20
        self.JUNGSUNG_LIST = [
            'ㅏ', 'ㅐ', 'ㅑ', 'ㅒ', 'ㅓ', 'ㅔ', 'ㅕ', 'ㅖ', 'ㅗ', 'ㅘ', 'ㅙ', 'ㅚ',
            'ㅛ', 'ㅜ', 'ㅝ', 'ㅞ', 'ㅟ', 'ㅠ', 'ㅡ', 'ㅢ', 'ㅣ'
        ]
        # Final consonants (jongseong), indices 0-27
        self.JONGSUNG_LIST = [
            '', 'ㄱ', 'ㄲ', 'ㄳ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ', 'ㄼ',
            'ㄽ', 'ㄾ', 'ㄿ', 'ㅀ', 'ㅁ', 'ㅂ', 'ㅄ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ',
            'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ'
        ]
        # Jamo-decomposed playlist titles from train are collected here.
        self.title_list_detach = []
        # The directory FILE_PATH points to must contain train, test, most_popular_res.json and song_meta.json.
        with open(FILE_PATH + '/train.json', encoding="utf-8") as f:
            self.train = json.load(f)
        with open(FILE_PATH + '/test.json', encoding="utf-8") as f:
            self.val = json.load(f)
        with open(FILE_PATH + '/most_popular_res.json', encoding="utf-8") as f:
            self.most_results = json.load(f)
        # Load the song_meta data.
        with open(FILE_PATH + '/song_meta.json', encoding="utf-8") as f:
            self.song_meta = json.load(f)

    def write_json(self, data, fname):
        def _conv(o):
            if isinstance(o, (np.int64, np.int32)):
                return int(o)
            raise TypeError

        parent = os.path.dirname(fname)
        distutils.dir_util.mkpath("./arena_data/" + parent)
        with io.open("./arena_data/" + fname, "w", encoding="utf-8") as f:
            json_str = json.dumps(data, ensure_ascii=False, default=_conv)
            f.write(json_str)

    def remove_seen(self, seen, l):
        seen = set(seen)
        return [x for x in l if not (x in seen)]

    # Store the songs and tags of train and val in dictionaries keyed by playlist id
    def get_dic(self, train, val):
        song_dic = {}
        tag_dic = {}
        data = train + val
        for q in tqdm(data):
            song_dic[str(q['id'])] = q['songs']
            tag_dic[str(q['id'])] = q['tags']
        self.song_dic = song_dic
        self.tag_dic = tag_dic
        # Only songs and tags go into 'total'; this is what the Word2Vec model is trained on
        total = list(map(lambda x: list(map(str, x['songs'])) + list(x['tags']), data))
        total = [x for x in total if len(x) > 1]
        self.total = total

    def get_w2v(self, total, min_count, size, window, sg):
        try:
            print("running get_w2v")
            if not (os.path.isfile("./w2v_model.model")):
                print("training w2v model")
                # window is 210 because some playlists are completely full with 10 tags and 200 songs.
                w2v_model = Word2Vec(total, min_count=min_count, size=size, window=window, sg=sg, iter=25)
                print("finished training w2v model")
                self.w2v_model = w2v_model
                w2v_model.save("w2v_model.model")
            print("loading w2v_model")
            self.w2v_model = Word2Vec.load("./w2v_model.model")
        except OSError as e:
            print("failed to create directory!")
            raise

    def update_p2v(self, train, val, w2v_model):
        ID = []
        vec = []
        # Counters to check how many songs/tags in val are missing from train and get skipped
        # These can safely be removed later
        self.yes_songs_count = 0
        self.yes_tags_count = 0
        self.no_songs_count = 0
        self.no_tags_count = 0
        for q in tqdm(train + val):
            tmp_vec = 0
            songs_vec = 0
            tags_vec = 0
            if len(q['songs']) >= 1 or len(q['tags']) >= 1:
                for x in q['songs']:
                    try:
                        songs_vec += w2v_model.wv.get_vector(str(x))
                        self.yes_songs_count += 1
                    except KeyError:
                        self.no_songs_count += 1
                for y in q['tags']:
                    try:
                        tags_vec += w2v_model.wv.get_vector(str(y))
                        self.yes_tags_count += 1
                    except KeyError:
                        self.no_tags_count += 1
                tmp_vec = songs_vec + tags_vec
            if type(tmp_vec) != int:
                ID.append(str(q['id']))
                vec.append(tmp_vec)
        # Register the vector for each train/val playlist id
        self.p2v_model.add(ID, vec)

    # FastText
    def get_title(self, train):
        title_list = []
        for q in train:
            title_list.append(q['plylst_title'])
        self.title_list = title_list

    def jamo_str(self, text, BASE_CODE, CHOSUNG, JUNGSUNG, CHOSUNG_LIST, JUNGSUNG_LIST, JONGSUNG_LIST):
        def clean_str(text):
            pattern = '([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)'  # remove e-mail addresses
            text = re.sub(pattern=pattern, repl='', string=text)
            pattern = '(http|ftp|https)://(?:[-\w.]|(?:%[\da-fA-F]{2}))+'  # remove URLs
            text = re.sub(pattern=pattern, repl='', string=text)
            pattern = '([ㄱ-ㅎㅏ-ㅣ]+)'  # remove stand-alone Hangul consonants/vowels
            text = re.sub(pattern=pattern, repl=' ', string=text)
            pattern = '<[^>]*>'  # remove HTML tags
            text = re.sub(pattern=pattern, repl=' ', string=text)
            pattern = '[^\w\s]'  # remove special characters
            text = re.sub(pattern=pattern, repl=' ', string=text)
            return text

        string = text
        string = clean_str(string)
        sp_list = list(string)
        result = []
        for keyword in sp_list:
            # check whether the character is Hangul, then decompose it
            if re.match('.*[ㄱ-ㅎㅏ-ㅣ가-힣]+.*', keyword) is not None:
                if keyword == ' ':
                    result.append(' ')
                if keyword in CHOSUNG_LIST or keyword in JUNGSUNG_LIST or keyword in JONGSUNG_LIST:
                    result.append('')
                else:
                    # initial consonant; ord() gives the character's code point
                    char_code = ord(keyword) - BASE_CODE
                    char1 = int(char_code / CHOSUNG)
                    result.append(CHOSUNG_LIST[char1])
                    # medial vowel
                    char2 = int((char_code - (CHOSUNG * char1)) / JUNGSUNG)
                    result.append(JUNGSUNG_LIST[char2])
                    # final consonant
                    char3 = int((char_code - (CHOSUNG * char1) - (JUNGSUNG * char2)))
                    if char3 == 0:
                        result.append('-')
                    result.append(JONGSUNG_LIST[char3])
            else:
                result.append(keyword)
        results_all = ("".join(result))
        self.results_all = results_all

    def get_title_list(self, results_all):
        title_list_detach = []
        title_list_detach.append(results_all)
        self.title_list_detach.append(title_list_detach)

    def make_title_model(self, title_list_detach):
        try:
            print("running make_title_model")
            if not (os.path.isfile("./FT_title_model.model")):
                print("training title model")
                FT_title_model = FT_gensim(title_list_detach, size=300, window=100, min_count=1, sg=1, iter=2000)
                print("finished training title model")
                self.FT_title_model = FT_title_model
                FT_title_model.save("FT_title_model.model")
            self.FT_title_model = FT_gensim.load("./FT_title_model.model")
            print("title model loaded")
        except OSError as e:
            print("failed to create directory!")
            raise
    # end of the FastText part

    def get_result(self, p2v_model, song_dic, tag_dic, most_results, val, train, FT_title_model, song_meta):
        title_sentence_train = []
        for x in train:
            self.jamo_str(x['plylst_title'], self.BASE_CODE, self.CHOSUNG, self.JUNGSUNG,
                          self.CHOSUNG_LIST, self.JUNGSUNG_LIST, self.JONGSUNG_LIST)
            title_sentence_train.append(self.results_all)

        answers = []
        # Counters used to check that processing is going as expected
        # Is most_id being picked correctly?
        self.most_id = []
        # How many playlists got a proper playlist-embedding recommendation?
        self.p2v_count = 0
        # How many playlists fell through to the exception handler?
        self.except_count = 0
        # Move this counter around to find out exactly where processing stops
        self.when_stop = 0
        # Counters for how many playlists fall into each problem type
        self.TNSN = 0
        self.TYSN = 0
        self.TNSY = 0
        self.TYSY = 0
        # Counters for answers that had to be topped up to 100 songs / 10 tags
        self.update_song_count = 0
        self.update_tag_count = 0

        for n, q in tqdm(enumerate(val), total=len(val)):
            # Check whether the playlist has songs and tags, and count them
            songs = q['songs']
            tags = q['tags']
            songs_count = len(songs)
            tags_count = len(tags)
            try:
                # Playlist-embedding algorithm (used when the playlist has songs)
                def ply_em(q):
                    most_id = [x[0] for x in p2v_model.most_similar(str(q['id']), topn=15)]
                    get_song = []
                    get_tag = []
                    for ID in most_id:
                        get_song += song_dic[ID]
                        get_tag += tag_dic[ID]
                    count = {}
                    for i in get_song:
                        try:
                            count[i] += 1
                        except KeyError:
                            count[i] = 1
                    count = sorted(count.items(), key=lambda x: x[1], reverse=True)
                    count2 = {}
                    for i in get_tag:
                        try:
                            count2[i] += 1
                        except KeyError:
                            count2[i] = 1
                    count2 = sorted(count2.items(), key=lambda x: x[1], reverse=True)
                    self.when_stop += 1
                    real_get_song = []
                    real_get_tag = []
                    for song in count:
                        real_get_song.append(song[0])
                    for tag in count2:
                        real_get_tag.append(tag[0])

                    def to_integer(dt_time):
                        return 10000 * dt_time.year + 100 * dt_time.month + dt_time.day

                    utc_time = datetime.strptime(q['updt_date'][:26], '%Y-%m-%d %H:%M:%S.%f')
                    updt = int(to_integer(utc_time))
                    # keep only songs released before the playlist was last updated
                    true_get_song = []
                    for song_id in real_get_song:
                        issue = int(song_meta[song_id]['issue_date'])
                        if updt - issue >= 0:
                            true_get_song.append(song_id)
                    answers.append({
                        "id": q["id"],
                        "songs": self.remove_seen(q["songs"], true_get_song)[:100],
                        "tags": self.remove_seen(q["tags"], real_get_tag)[:10],
                    })
                    # reaching this point counts as a successful embedding-based recommendation
                    self.p2v_count += 1

                # FastText (title-based) algorithm
                def fasttext_title(q):
                    train_ids = []
                    get_song = []
                    get_tag = []
                    self.jamo_str(q['plylst_title'], self.BASE_CODE, self.CHOSUNG, self.JUNGSUNG,
                                  self.CHOSUNG_LIST, self.JUNGSUNG_LIST, self.JONGSUNG_LIST)
                    title = self.results_all
                    F_list = FT_title_model.wv.most_similar(title, topn=60)
                    for x in F_list:
                        number = title_sentence_train.index(x[0])
                        train_ids.append(train[number]['id'])
                    for ids in train_ids:
                        get_song += song_dic[str(ids)]
                        get_tag += tag_dic[str(ids)]
                    count = {}
                    for i in get_song:
                        try:
                            count[i] += 1
                        except KeyError:
                            count[i] = 1
                    count = sorted(count.items(), key=lambda x: x[1], reverse=True)
                    count2 = {}
                    for i in get_tag:
                        try:
                            count2[i] += 1
                        except KeyError:
                            count2[i] = 1
                    count2 = sorted(count2.items(), key=lambda x: x[1], reverse=True)
                    real_get_song = []
                    real_get_tag = []
                    for song in count:
                        real_get_song.append(song[0])
                    for tag in count2:
                        real_get_tag.append(tag[0])

                    def to_integer(dt_time):
                        return 10000 * dt_time.year + 100 * dt_time.month + dt_time.day

                    utc_time = datetime.strptime(q['updt_date'][:26], '%Y-%m-%d %H:%M:%S.%f')
                    updt = int(to_integer(utc_time))
                    true_get_song = []
                    for song_id in real_get_song:
                        issue = int(song_meta[song_id]['issue_date'])
                        if updt - issue >= 0:
                            true_get_song.append(song_id)
                    answers.append({
                        "id": q["id"],
                        "songs": self.remove_seen(q["songs"], true_get_song)[:100],
                        "tags": self.remove_seen(q["tags"], real_get_tag)[:10],
                    })

                # no tags, no songs (title only)
                if tags_count == 0 and songs_count == 0:
                    self.TNSN += 1
                    fasttext_title(q)
                # tags but no songs
                elif tags_count > 0 and songs_count == 0:
                    self.TYSN += 1
                    fasttext_title(q)
                # songs but no tags
                elif tags_count == 0 and songs_count > 0:
                    self.TNSY += 1
                    ply_em(q)
                # both tags and songs
                elif tags_count > 0 and songs_count > 0:
                    self.TYSY += 1
                    ply_em(q)
            except:
                # count playlists that fell back to the most-popular results
                self.except_count += 1
                answers.append({
                    "id": q["id"],
                    "songs": most_results[n]['songs'],
                    "tags": most_results[n]["tags"],
                })

        # check and pad incomplete answers
        for n, q in enumerate(answers):
            if len(q['songs']) != 100:
                answers[n]['songs'] += self.remove_seen(q['songs'], self.most_results[n]['songs'])[:100 - len(q['songs'])]
                self.update_song_count += 1
            if len(q['tags']) != 10:
                answers[n]['tags'] += self.remove_seen(q['tags'], self.most_results[n]['tags'])[:10 - len(q['tags'])]
                self.update_tag_count += 1
        self.answers = answers

    def run(self):
        # Word2Vec playlist embedding
        self.get_dic(self.train, self.val)
        self.get_w2v(self.total, self.min_count, self.size, self.window, self.sg)
        self.update_p2v(self.train, self.val, self.w2v_model)
        # FastText playlist titles
        self.get_title(self.train)
        for string in self.title_list:
            self.jamo_str(string, self.BASE_CODE, self.CHOSUNG, self.JUNGSUNG,
                          self.CHOSUNG_LIST, self.JUNGSUNG_LIST, self.JONGSUNG_LIST)
            self.get_title_list(self.results_all)
        self.make_title_model(self.title_list_detach)
        # fill in songs and tags
        self.get_result(self.p2v_model, self.song_dic, self.tag_dic, self.most_results,
                        self.val, self.train, self.FT_title_model, self.song_meta)
        self.write_json(self.answers, 'results.json')
        print("finished writing results")

    def train_model(self):
        # Word2Vec playlist embedding
        self.get_dic(self.train, self.val)
        self.get_w2v(self.total, self.min_count, self.size, self.window, self.sg)
        self.update_p2v(self.train, self.val, self.w2v_model)
        # FastText playlist titles
        self.get_title(self.train)
        for string in self.title_list:
            self.jamo_str(string, self.BASE_CODE, self.CHOSUNG, self.JUNGSUNG,
                          self.CHOSUNG_LIST, self.JUNGSUNG_LIST, self.JONGSUNG_LIST)
            self.get_title_list(self.results_all)
        self.make_title_model(self.title_list_detach)
def train():
    MODE = "Test"
    if MODE == "Valid":
        train = load_json("arena_data/orig/train.json") + load_json("arena_data/questions/val.json")
        dev = load_json("res/val.json")
        test = load_json("res/test.json")
    else:
        train = load_json("res/train.json")
        dev = load_json("res/val.json")
        test = load_json("res/test.json")

    # Append predicted tags to each playlist, keeping the originals in 'tags_org'
    pred_tag = load_json("arena_data/model/pred_tag.json")
    dic_pred_tag = {}
    for p_t in pred_tag:
        dic_pred_tag[p_t['id']] = p_t['predict_tag']
    for doc in train:
        doc['tags_org'] = doc['tags'].copy()
        doc['tags'] += dic_pred_tag[doc['id']]
    for doc in dev:
        doc['tags_org'] = doc['tags'].copy()
        doc['tags'] += dic_pred_tag[doc['id']]
    for doc in test:
        doc['tags_org'] = doc['tags'].copy()
        doc['tags'] += dic_pred_tag[doc['id']]

    # Each playlist becomes one "sentence" of song ids and tags
    item_list = []
    len_item = []
    for doc in train + dev + test:
        song_list = []
        for i in doc['songs']:
            song_list.append(str(i))
        item_list.append(song_list + doc['tags'])
        len_item.append(len(song_list + doc['tags']))
    print("Max length of item list :", max(len_item), ", Min :", min(len_item))
    item_list = [x for x in item_list if len(x) > 1]
    print("Train set :", len(item_list))

    print("Training Item2Vec model")
    SIZE = 100
    model = Word2Vec(sentences=item_list, size=SIZE, window=240, min_count=2, sg=1,
                     workers=8, iter=10, negative=7, compute_loss=True, callbacks=[LossPrinter()])
    model.save("arena_data/model/word2vec.model")
    print("Vocab : ", len(model.wv.vocab))

    print("Building and saving playlist embeddings")
    song_dic = {}
    tag_dic = {}
    for q in tqdm(train + test + dev):
        song_dic[str(q['id'])] = q['songs']
        tag_dic[str(q['id'])] = q['tags_org']

    # Song-oriented playlist vectors: song vectors get twice the weight of tag vectors
    p2v_song = WordEmbeddingsKeyedVectors(SIZE)
    ID = []
    vec = []
    for q in tqdm(train + test + dev):
        tmp_vec = 0
        cnt_vocab = 0
        if len(q['songs']) >= 1:
            for item in q['songs']:
                try:
                    tmp_vec += model.wv.get_vector(str(item)) * 2
                    cnt_vocab += 1
                except KeyError:
                    pass
        if len(q['tags']) >= 1:
            for item in q['tags']:
                try:
                    tmp_vec += model.wv.get_vector(str(item))
                    cnt_vocab += 1
                except KeyError:
                    pass
        if type(tmp_vec) != int:
            ID.append(str(q['id']))
            vec.append(tmp_vec)
    p2v_song.add(ID, vec)
    p2v_song.save("arena_data/model/p2v_song.model")

    # Tag-oriented playlist vectors: tag vectors get twice the weight of song vectors
    p2v_tag = WordEmbeddingsKeyedVectors(SIZE)
    ID = []
    vec = []
    for q in tqdm(train + test + dev):
        tmp_vec = 0
        cnt_vocab = 0
        if len(q['songs']) >= 1:
            for item in q['songs']:
                try:
                    tmp_vec += model.wv.get_vector(str(item))
                    cnt_vocab += 1
                except KeyError:
                    pass
        if len(q['tags']) >= 1:
            for item in q['tags']:
                try:
                    tmp_vec += model.wv.get_vector(str(item)) * 2
                    cnt_vocab += 1
                except KeyError:
                    pass
        if type(tmp_vec) != int:
            ID.append(str(q['id']))
            vec.append(tmp_vec)
    p2v_tag.add(ID, vec)
    p2v_tag.save("arena_data/model/p2v_tag.model")

    if MODE == "Valid":
        print("Testing")
        questions = load_json("arena_data/questions/val.json")
        cnt_wv_song = 0
        cnt_wv_tag = 0
        res = []
        for q in tqdm(questions):
            dic_song_score = {}
            dic_tag_score = {}
            song_result = []
            tag_result = []
            if str(q['id']) in p2v_song.wv.vocab:
                most_id = [x for x in p2v_song.most_similar(str(q['id']), topn=50)]
                for ID in most_id:
                    for s in song_dic[ID[0]]:
                        if s in dic_song_score:
                            dic_song_score[s] += ID[1]
                        else:
                            dic_song_score[s] = ID[1]
            if str(q['id']) in p2v_tag.wv.vocab:
                most_id = [x for x in p2v_tag.most_similar(str(q['id']), topn=50)]
                # loop over every similar playlist (assumed intent, mirroring the song branch above)
                for ID in most_id:
                    for t in tag_dic[ID[0]]:
                        if t in dic_tag_score:
                            dic_tag_score[t] += ID[1]
                        else:
                            dic_tag_score[t] = ID[1]
            if len(dic_song_score) > 0:
                sort_song_score = sorted(dic_song_score.items(), key=lambda x: x[1], reverse=True)
                for s in sort_song_score:
                    song_result.append(s[0])
                cnt_wv_song += 1
            if len(dic_tag_score) > 0:
                sort_tag_score = sorted(dic_tag_score.items(), key=lambda x: x[1], reverse=True)
                for s in sort_tag_score:
                    tag_result.append(s[0])
                cnt_wv_tag += 1
            res.append({
                "id": q["id"],
                "songs": remove_seen(q["songs"], song_result)[:100],
                "tags": remove_seen(q["tags"], tag_result)[:10],
            })
        print(len(questions), cnt_wv_song, cnt_wv_tag)

        ans = load_json("arena_data/answers/val.json")
        evaluator = CustomEvaluator()
        evaluator._evaluate(ans, res)
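Because the playlist vectors are persisted as WordEmbeddingsKeyedVectors, they can be reloaded later without retraining the Word2Vec model. A short sketch, assuming the save paths used above and the gensim 3.x API; the playlist id is a placeholder:

from gensim.models.keyedvectors import WordEmbeddingsKeyedVectors

# Reload the playlist embeddings written by train() above.
p2v_song = WordEmbeddingsKeyedVectors.load("arena_data/model/p2v_song.model")

# Nearest playlists to a given playlist id, if that id made it into the index.
some_id = "61281"  # placeholder playlist id
if some_id in p2v_song.vocab:
    print(p2v_song.most_similar(some_id, topn=5))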
w2v_model = Word2Vec(tot_repr, min_count=min_count, size=size, window=window, sg=sg, workers=8, hashfxn=hash)

#%% save the w2v model
with open(os.path.join(MODEL_PATH, '0007ky.w2v'), 'wb') as f:
    pickle.dump(w2v_model, f)

# In[ ]:
# make p2v model
p2v_model = WordEmbeddingsKeyedVectors(size)

#%%
tot = train + test + val
song_dic = {}
tag_dic = {}
for q in tqdm(tot):
    song_dic[q['id']] = q['songs']
    tag_dic[q['id']] = q['tags']

#%%
ID = []
vec = []
for q in tqdm(tot, leave=True, position=0):
    tmp_vec = 0
    if len(q['repr']) >= 1:
        for word in q['repr']:
            # (the original snippet breaks off here; the remainder is assumed to follow
            # the same vector-summation pattern used in the snippets above)
            try:
                tmp_vec += w2v_model.wv.get_vector(str(word))
            except KeyError:
                pass
    if type(tmp_vec) != int:
        ID.append(str(q['id']))
        vec.append(tmp_vec)
p2v_model.add(ID, vec)