예제 #1
0
    def make_dataset(self, shuffle=True):
        """Build (padded title input, encoded label) pairs from ``self.dataset``.

        Playlists whose title is empty after ``util.remove_special_char`` are
        skipped. The label of a playlist is its songs found in
        ``self.all_songs_set`` followed by its tags found in
        ``self.all_tags_set``, passed through ``label_encoder.transform``.

        Args:
            shuffle: When true, ``self.dataset`` is shuffled in place before it
                is traversed.

        Returns:
            dict with two parallel lists, ``"model_input"`` and ``"label"``.
        """
        pad_idx = self.sentencepiece.piece_to_id(parameters.pad_token)
        model_inputs = []
        labels = []

        if shuffle:
            # NOTE: in-place shuffle, so the order of self.dataset changes for every later user.
            random.shuffle(self.dataset)

        for playlist in tqdm(self.dataset, total=len(self.dataset)):
            title = util.remove_special_char(playlist['plylst_title'])
            if not title:
                continue

            # Keep only the songs / tags that belong to the known vocabularies.
            known_songs = [song for song in playlist['songs'] if song in self.all_songs_set]
            known_tags = [tag for tag in playlist['tags'] if tag in self.all_tags_set]
            raw_label = known_songs + known_tags

            token_ids = convert_model_input(title, self.sentencepiece, self.model_input_size)
            # Right-pad with the pad-token id up to model_input_size.
            missing = self.model_input_size - len(token_ids)
            padded_ids = token_ids + [pad_idx] * missing
            encoded_label = self.label_info.label_encoder.transform(raw_label)

            model_inputs.append(padded_ids)
            labels.append(encoded_label)

        return {"model_input": model_inputs, 'label': labels}
예제 #2
0
    def make_ndcg_check_dataset(self, question):
        """Collect everything needed to score recommendations for *question*.

        Playlists whose title is empty after ``util.remove_special_char`` are
        skipped. For every remaining playlist the padded title input, its id,
        the sets of already-known songs and tags, the converted update date and
        the ground-truth entry are recorded.

        Args:
            question: Iterable (with ``len``) of playlist dicts; the keys
                ``'plylst_title'``, ``'id'``, ``'songs'``, ``'tags'`` and
                ``'updt_date'`` are read.

        Returns:
            dict of parallel lists keyed ``'model_input'``, ``'id_list'``,
            ``'seen_songs_set'``, ``'seen_tags_set'``, ``'plylst_updt_date'``
            and ``'gt'``.
        """
        pad_idx = self.sentencepiece.piece_to_id(parameters.pad_token)

        model_inputs = []
        id_list = []
        seen_songs_sets = []
        seen_tags_sets = []
        updt_dates = []
        ground_truths = []

        for playlist in tqdm(question, total=len(question)):
            title = util.remove_special_char(playlist['plylst_title'])
            if not title:
                continue

            token_ids = convert_model_input(title, self.sentencepiece, self.model_input_size)
            # Right-pad with the pad-token id up to model_input_size.
            padded_ids = token_ids + [pad_idx] * (self.model_input_size - len(token_ids))

            playlist_id = playlist["id"]
            # NOTE: this is the entry stored in self.answer_plylst_id_songs_tags_dict itself,
            # so writing 'id' below also updates that shared dict.
            answer = self.answer_plylst_id_songs_tags_dict[playlist_id]
            answer['id'] = playlist_id

            # Songs / tags of the question that belong to the known vocabularies.
            seen_songs = {song for song in playlist['songs'] if song in self.all_songs_set}
            seen_tags = {tag for tag in playlist['tags'] if tag in self.all_tags_set}

            ground_truths.append(answer)
            model_inputs.append(padded_ids)
            id_list.append(playlist_id)
            seen_songs_sets.append(seen_songs)
            seen_tags_sets.append(seen_tags)
            updt_dates.append(util.convert_updt_date(playlist["updt_date"]))

        return {
            'model_input': model_inputs,
            'id_list': id_list,
            'seen_songs_set': seen_songs_sets,
            'seen_tags_set': seen_tags_sets,
            'plylst_updt_date': updt_dates,
            'gt': ground_truths,
        }
예제 #3
0
    def make_loss_check_dataset(self, question):
        """Build (padded title input, encoded label) pairs for a loss check.

        The label of a playlist combines what the question already contains
        with its entry in ``self.answer_plylst_id_songs_tags_dict``, in the
        order: question songs, answer songs, question tags, answer tags. Only
        items present in ``self.all_songs_set`` / ``self.all_tags_set`` are
        kept. Playlists with an empty cleaned title or an empty label are
        skipped.

        Args:
            question: Iterable (with ``len``) of playlist dicts; the keys
                ``'plylst_title'``, ``'id'``, ``'songs'`` and ``'tags'`` are
                read.

        Returns:
            dict with two parallel lists, ``"model_input"`` and ``"label"``.
        """

        def keep_known(items, vocabulary):
            # Drop every item that is not part of the given vocabulary.
            return [item for item in items if item in vocabulary]

        pad_idx = self.sentencepiece.piece_to_id(parameters.pad_token)
        model_inputs = []
        labels = []

        for playlist in tqdm(question, total=len(question)):
            title = util.remove_special_char(playlist['plylst_title'])
            if not title:
                continue

            question_songs = keep_known(playlist['songs'], self.all_songs_set)
            question_tags = keep_known(playlist['tags'], self.all_tags_set)

            answer = self.answer_plylst_id_songs_tags_dict[playlist['id']]
            answer_songs = keep_known(answer['songs'], self.all_songs_set)
            answer_tags = keep_known(answer['tags'], self.all_tags_set)

            raw_label = question_songs + answer_songs + question_tags + answer_tags
            if not raw_label:
                continue
            encoded_label = self.label_info.label_encoder.transform(raw_label)

            token_ids = convert_model_input(title, self.sentencepiece, self.model_input_size)
            # Right-pad with the pad-token id up to model_input_size.
            padded_ids = token_ids + [pad_idx] * (self.model_input_size - len(token_ids))

            model_inputs.append(padded_ids)
            labels.append(encoded_label)

        return {"model_input": model_inputs, 'label': labels}
예제 #4
0
def dump_plylst_title(dataset, fout):
    """Write the cleaned title of each playlist in *dataset* to *fout*, one per line.

    Titles that are empty after ``util.remove_special_char`` are left out.

    Args:
        dataset: Iterable of playlist dicts; only ``'plylst_title'`` is read.
        fout: Path of the output file. It is opened in ``'w'`` mode as UTF-8
            with ``errors='ignore'``, so an existing file is overwritten and
            characters that cannot be encoded are dropped.
    """
    with open(fout, 'w', encoding='utf-8', errors='ignore') as out_file:
        cleaned_titles = (util.remove_special_char(each['plylst_title']) for each in dataset)
        out_file.writelines(title + '\n' for title in cleaned_titles if title)
예제 #5
0
    def make_pre_train_dataset(self, shuffle=True):
        """Build the pre-training dataset from the titles in ``self.dataset``.

        Each cleaned, non-empty title is converted with ``convert_model_input``
        and then passed to ``make_mask_dataset``, which returns the model
        input, the mask labels and a boolean mask. Examples for which
        ``make_mask_dataset`` returns an empty model input are skipped.

        Args:
            shuffle: When true, ``self.dataset`` is shuffled in place before it
                is traversed.

        Returns:
            dict of parallel lists keyed ``"model_input"`` (padded with the
            pad-token id), ``'mask_label'`` (stored exactly as returned by
            ``make_mask_dataset``, not padded) and ``'boolean_mask'`` (padded
            with ``False``).
        """
        pad_idx = self.sentencepiece.piece_to_id(parameters.pad_token)
        model_inputs = []
        mask_labels = []
        boolean_masks = []

        if shuffle:
            # NOTE: in-place shuffle, so the order of self.dataset changes for every later user.
            random.shuffle(self.dataset)

        for playlist in tqdm(self.dataset, total=len(self.dataset)):
            title = util.remove_special_char(playlist['plylst_title'])
            if not title:
                continue

            token_ids = convert_model_input(title, self.sentencepiece, self.model_input_size)
            masked_ids, mask_label, boolean_mask = make_mask_dataset(token_ids, self.sentencepiece)
            if not masked_ids:
                continue

            # Pad both sequences to model_input_size; padded positions are never marked as masked.
            ids_padding = [pad_idx] * (self.model_input_size - len(masked_ids))
            mask_padding = [False] * (self.model_input_size - len(boolean_mask))

            model_inputs.append(masked_ids + ids_padding)
            mask_labels.append(mask_label)
            boolean_masks.append(boolean_mask + mask_padding)

        return {"model_input": model_inputs, 'mask_label': mask_labels, 'boolean_mask': boolean_masks}
    def do_reco(self, question_path, batch_size=128, title_importance=0.85, title_tag_weight=0.8):
        """Recommend songs and tags for every playlist in a question file.

        A playlist is scored by up to two models and the scores are summed:
        the songs/tags/artists model (used when the playlist has known songs
        or tags) and the playlist-title model (used when the cleaned title is
        non-empty). Playlists with none of the three are handed to
        ``self.coldstart_do_reco``.

        With ``n`` the number of known songs plus known tags of a playlist,
        song scores are blended as::

            songs/tags/artists model: score * n / (title_importance + n)
            title model:              score * title_importance / (title_importance + n)

        Tag scores of the songs/tags/artists model are added unweighted; tag
        scores of the title model are multiplied by ``title_tag_weight``.

        Side effects: for every playlist the seen songs, seen tags and the
        converted update date are stored in ``self.plylst_id_seen_songs_dict``,
        ``self.plylst_id_seen_tags_dict`` and
        ``self.plylst_id_plylst_updt_date_dict``.

        Args:
            question_path: Path handed to ``util.load_json``. Every loaded
                entry is read for ``'songs'``, ``'tags'``, ``'plylst_title'``,
                ``'id'`` and ``'updt_date'``.
            batch_size: Number of playlists per ``self.songs_tags_do_reco``
                call.
            title_importance: Weight of the title model in the song-score
                blend (see formula above).
            title_tag_weight: Multiplier for the title model's tag scores.

        Returns:
            list of ``{"id", "songs", "tags"}`` dicts. Model-scored playlists
            come first, each with at most 100 songs and at most 10 tags sorted
            by descending total score; cold-start playlists follow with
            whatever ``self.coldstart_do_reco`` returns.
        """
        songs_tags_artists_data = {'model_input': [], 'plylst_id_list': []}
        plylst_title_data = {'model_input': [], 'plylst_id_list': []}
        coldstart_plylst_id_list = []
        plylst_id_songs_tags_num = {}

        question = util.load_json(question_path)
        for each in tqdm(question, total=len(question), desc='Preprocess'):
            # Only songs / tags from the known vocabularies can be fed to the models.
            songs = [song for song in each['songs'] if song in self.all_songs_set]
            tags = [tag for tag in each['tags'] if tag in self.all_tags_set]
            artists = util.get_artists(songs, self.label_info.song_artist_dict)
            plylst_title = util.remove_special_char(each['plylst_title'])
            plylst_id = each['id']
            plylst_updt_date = each['updt_date']

            self.plylst_id_seen_songs_dict[plylst_id] = set(songs)
            self.plylst_id_seen_tags_dict[plylst_id] = set(tags)
            self.plylst_id_plylst_updt_date_dict[plylst_id] = util.convert_updt_date(plylst_updt_date)

            plylst_id_songs_tags_num[plylst_id] = len(songs) + len(tags)

            if songs or tags:
                model_input = songs_tags_artists_util.convert_model_input(songs, tags, artists,
                                                                          self.label_info.label_encoder)
                model_input += [self.songs_tags_artists_model_pad_idx] * (
                        parameters.songs_tags_artists_model_max_sequence_length - len(model_input))
                songs_tags_artists_data['model_input'].append(model_input)
                songs_tags_artists_data['plylst_id_list'].append(plylst_id)

            if plylst_title:
                model_input = plylst_title_util.convert_model_input(plylst_title, self.sp,
                                                                    parameters.title_model_max_sequence_length)
                model_input += [self.plylst_title_model_pad_idx] * (
                        parameters.title_model_max_sequence_length - len(model_input))
                plylst_title_data['model_input'].append(model_input)
                plylst_title_data['plylst_id_list'].append(plylst_id)

            if not (songs or tags or plylst_title):
                # Nothing either model could use.
                coldstart_plylst_id_list.append(plylst_id)

        total_plylst_id_reco_song_score_dict = {}
        total_plylst_id_reco_tag_score_dict = {}

        def accumulate_scores(model, data, desc, weigh_song, weigh_tag):
            # Run `model` over `data` in batches and add its weighted scores to the totals.
            model_inputs = data['model_input']
            plylst_ids = data['plylst_id_list']
            for start in tqdm(range(0, len(model_inputs), batch_size), desc=desc):
                stop = start + batch_size
                song_scores_by_id, tag_scores_by_id = self.songs_tags_do_reco(
                    model,
                    model_input=model_inputs[start:stop],
                    plylst_id_list=plylst_ids[start:stop])

                for plylst_id, song_scores in song_scores_by_id.items():
                    merged = total_plylst_id_reco_song_score_dict.setdefault(plylst_id, {})
                    songs_tags_num = plylst_id_songs_tags_num[plylst_id]
                    for song, score in song_scores.items():
                        merged[song] = merged.get(song, 0) + weigh_song(score, songs_tags_num)

                for plylst_id, tag_scores in tag_scores_by_id.items():
                    merged = total_plylst_id_reco_tag_score_dict.setdefault(plylst_id, {})
                    for tag, score in tag_scores.items():
                        merged[tag] = merged.get(tag, 0) + weigh_tag(score)

        # Songs/tags/artists model: its song scores gain weight as more songs and tags are known.
        accumulate_scores(
            self.songs_tags_artists_model, songs_tags_artists_data, 'songs_tags_artists_model',
            weigh_song=lambda score, num: score * num / (title_importance + num),
            weigh_tag=lambda score: score)

        # Playlist-title model: its song scores take the complementary share of the blend.
        accumulate_scores(
            self.plylst_title_model, plylst_title_data, 'plylst_title_model',
            weigh_song=lambda score, num: score * title_importance / (title_importance + num),
            weigh_tag=lambda score: score * title_tag_weight)

        # Combine the scores of both models into the final recommendation.
        answers = []
        for plylst_id, song_scores in total_plylst_id_reco_song_score_dict.items():
            # A playlist without any tag scores gets an empty tag list instead of a KeyError.
            tag_scores = total_plylst_id_reco_tag_score_dict.get(plylst_id, {})
            ranked_songs = sorted(song_scores.items(), key=lambda pair: pair[1], reverse=True)[:100]
            ranked_tags = sorted(tag_scores.items(), key=lambda pair: pair[1], reverse=True)[:10]
            answers.append({
                "id": plylst_id,
                "songs": [song for song, _ in ranked_songs],
                "tags": [tag for tag, _ in ranked_tags],
            })

        # cold_start
        for plylst_id in tqdm(coldstart_plylst_id_list, total=len(coldstart_plylst_id_list), desc='coldstart_reco'):
            reco_songs, reco_tags = self.coldstart_do_reco(plylst_id)
            answers.append({
                "id": plylst_id,
                "songs": reco_songs,
                "tags": reco_tags,
            })

        return answers