Example #1
def load_bpe_model(path):
    from sentencepiece import SentencePieceProcessor
    spm = SentencePieceProcessor()
    # Load() returns True on success, so load once and check the result.
    if spm.Load(path):
        return spm
    raise Exception("Error loading model")
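A minimal usage sketch for the loader above; "my_bpe.model" is a hypothetical path to an already-trained SentencePiece model, not a file from the example.

# Hypothetical usage of load_bpe_model; the model path is a placeholder.
spm = load_bpe_model("my_bpe.model")
print(spm.EncodeAsPieces("Hello world"))  # e.g. ['▁Hello', '▁world']
print(spm.EncodeAsIds("Hello world"))     # the corresponding piece ids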
Example #2
File: core.py Project: fastai/fastai
class SentencePieceTokenizer():#TODO: pass the special tokens symbol to sp
    "SentencePiece tokenizer for `lang`"
    def __init__(self, lang='en', special_toks=None, sp_model=None, vocab_sz=None, max_vocab_sz=30000,
                 model_type='unigram', char_coverage=None, cache_dir='tmp'):
        try: from sentencepiece import SentencePieceTrainer,SentencePieceProcessor
        except ImportError:
            raise Exception('sentencepiece module is missing: run `pip install sentencepiece!=0.1.90,!=0.1.91`')
        self.sp_model,self.cache_dir = sp_model,Path(cache_dir)
        self.vocab_sz,self.max_vocab_sz,self.model_type = vocab_sz,max_vocab_sz,model_type
        self.char_coverage = ifnone(char_coverage, 0.99999 if lang in eu_langs else 0.9998)
        self.special_toks = ifnone(special_toks, defaults.text_spec_tok)
        if sp_model is None: self.tok = None
        else:
            self.tok = SentencePieceProcessor()
            self.tok.Load(str(sp_model))
        os.makedirs(self.cache_dir, exist_ok=True)

    def _get_vocab_sz(self, raw_text_path):
        cnt = Counter()
        with open(raw_text_path, 'r') as f:
            for line in f.readlines():
                cnt.update(line.split())
                if len(cnt)//4 > self.max_vocab_sz: return self.max_vocab_sz
        res = len(cnt)//4
        while res%8 != 0: res+=1
        return max(res,29)

    def train(self, raw_text_path):
        "Train a sentencepiece tokenizer on `texts` and save it in `path/tmp_dir`"
        from sentencepiece import SentencePieceTrainer
        vocab_sz = self._get_vocab_sz(raw_text_path) if self.vocab_sz is None else self.vocab_sz
        spec_tokens = ['\u2581'+s for s in self.special_toks]
        SentencePieceTrainer.Train(" ".join([
            f"--input={raw_text_path} --vocab_size={vocab_sz} --model_prefix={self.cache_dir/'spm'}",
            f"--character_coverage={self.char_coverage} --model_type={self.model_type}",
            f"--unk_id={len(spec_tokens)} --pad_id=-1 --bos_id=-1 --eos_id=-1 --minloglevel=2",
            f"--user_defined_symbols={','.join(spec_tokens)} --hard_vocab_limit=false"]))
        raw_text_path.unlink()
        return self.cache_dir/'spm.model'

    def setup(self, items, rules=None):
        from sentencepiece import SentencePieceProcessor
        if rules is None: rules = []
        if self.tok is not None: return {'sp_model': self.sp_model}
        raw_text_path = self.cache_dir/'texts.out'
        with open(raw_text_path, 'w') as f:
            for t in progress_bar(maps(*rules, items), total=len(items), leave=False):
                f.write(f'{t}\n')
        sp_model = self.train(raw_text_path)
        self.tok = SentencePieceProcessor()
        self.tok.Load(str(sp_model))
        return {'sp_model': sp_model}

    def __call__(self, items):
        if self.tok is None: self.setup(items)
        for t in items: yield self.tok.EncodeAsPieces(t)
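The train method above only assembles a flag string for SentencePieceTrainer.Train. A standalone sketch of that train/load round trip, using an illustrative toy corpus and options rather than the fastai defaults:

# Toy sketch (not fastai-specific); corpus content and options are illustrative.
from sentencepiece import SentencePieceTrainer, SentencePieceProcessor

with open("tiny_corpus.txt", "w") as f:
    f.write("this is a tiny corpus\njust enough text to train a toy model\n")

SentencePieceTrainer.Train(
    "--input=tiny_corpus.txt --model_prefix=toy_spm "
    "--vocab_size=60 --model_type=unigram --hard_vocab_limit=false")

tok = SentencePieceProcessor()
tok.Load("toy_spm.model")
print(tok.EncodeAsPieces("a tiny corpus"))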
Example #3
def main(train_path, val_path, test_path, config_path, subword_model_path,
         out_dir):
    params = Params.from_file(config_path)
    reader_params = params.pop("reader", default=Params({}))
    reader = DatasetReader.from_params(reader_params)
    processor = SentencePieceProcessor()
    processor.Load(subword_model_path)
    train_text_file = os.path.join(out_dir, "train.text.txt")
    train_summary_file = os.path.join(out_dir, "train.summary.txt")
    val_text_file = os.path.join(out_dir, "val.text.txt")
    val_summary_file = os.path.join(out_dir, "val.summary.txt")
    test_text_file = os.path.join(out_dir, "test.text.txt")
    test_summary_file = os.path.join(out_dir, "test.summary.txt")
    files = ((train_path, train_text_file,
              train_summary_file), (val_path, val_text_file, val_summary_file),
             (test_path, test_text_file, test_summary_file))
    for path, text_file_name, summary_file_name in files:
        with open(text_file_name,
                  "w") as text_file, open(summary_file_name,
                                          "w") as summary_file:
            for text, summary in reader.parse_set(path):
                text_subwords = processor.EncodeAsPieces(text)
                summary_subwords = processor.EncodeAsPieces(summary)
                text_subwords.insert(0, "<t>")
                text_subwords.append("</t>")
                summary_subwords.insert(0, "<t>")
                summary_subwords.append("</t>")
                text_file.write(" ".join(text_subwords) + "\n")
                summary_file.write((" ".join(summary_subwords)) + "\n")
Example #4
class SentencePieceExtractor:
    """
    Extractor implementation for SentencePiece trained models.
    https://github.com/google/sentencepiece
    """
    def __init__(self, model: str):
        # Get SentencePiece
        self.sp = SentencePieceProcessor()
        self.sp.Load(model)

    def extract(self) -> Tuple[Dict[str, int], List[Tuple]]:
        sp = self.sp
        vocab = {
            sp.id_to_piece(index): index
            for index in trange(sp.GetPieceSize())
        }

        # Merges
        merges = []
        for piece_l in tqdm(vocab.keys(), total=sp.GetPieceSize()):
            for piece_r in vocab.keys():
                if piece_l != piece_r:
                    merge = sp.PieceToId(f"{piece_l}{piece_r}")
                    score = sp.GetScore(merge)

                    if score != 0.:
                        merges += [(piece_l, piece_r)]

        return vocab, merges
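A hedged usage sketch of the extractor above; "spiece.model" is a placeholder path to a trained model. Note that extract() compares every pair of pieces, so it scales quadratically with the vocabulary size.

# Hypothetical usage; the model path is a placeholder.
extractor = SentencePieceExtractor("spiece.model")
vocab, merges = extractor.extract()
print(len(vocab), "pieces,", len(merges), "merge pairs")
print(list(vocab.items())[:5])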
Example #5
class SentencePieceTokenizer(Tokenizer, CppProcessorMixin):
    """Sentence piece tokenizer."""

    class Config(ConfigBase):
        sp_model_path: str = ""

    def __init__(self, sp_model_path: str = ""):
        self.sp_model_path = sp_model_path
        self._load_processor()

    @classmethod
    def from_config(cls, config: Config):
        return cls(config.sp_model_path)

    def tokenize(self, input_str: str) -> List[Token]:
        pieces = self.processor.EncodeAsPieces(input_str)
        tokens = []
        # calculate start and end indices of each piece.
        end = 0
        for piece in pieces:
            original_piece = piece.lstrip("\u2581")
            start = input_str.find(original_piece, end)
            end = start + len(original_piece)
            tokens.append(Token(piece, start, end))
        return tokens

    def _load_processor(self):
        self.processor = SentencePieceProcessor()
        self.processor.Load(self.sp_model_path)
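The offset bookkeeping in tokenize() can be reproduced with a bare SentencePieceProcessor. A minimal sketch without the Token wrapper used above; "spm.model" is a placeholder model path.

from sentencepiece import SentencePieceProcessor

sp = SentencePieceProcessor()
sp.Load("spm.model")  # placeholder path

text = "hello world"
end = 0
for piece in sp.EncodeAsPieces(text):
    surface = piece.lstrip("\u2581")   # strip the leading word-boundary marker
    start = text.find(surface, end)
    end = start + len(surface)
    print(piece, start, end)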
Example #6
class SentencePieceTokenizer():#TODO: pass the special tokens symbol to sp
    "Spacy tokenizer for `lang`"
    def __init__(self, lang='en', special_toks=None, sp_model=None, vocab_sz=None, max_vocab_sz=30000,
                 model_type='unigram', char_coverage=None, cache_dir='tmp'):
        try: from sentencepiece import SentencePieceTrainer,SentencePieceProcessor
        except ImportError:
            raise Exception('sentencepiece module is missing: run `pip install sentencepiece`')
        self.sp_model,self.cache_dir = sp_model,Path(cache_dir)
        self.vocab_sz,self.max_vocab_sz,self.model_type = vocab_sz,max_vocab_sz,model_type
        self.char_coverage = ifnone(char_coverage, 0.99999 if lang in eu_langs else 0.9998)
        self.special_toks = ifnone(special_toks, defaults.text_spec_tok)
        if sp_model is None: self.tok = None
        else:
            self.tok = SentencePieceProcessor()
            self.tok.Load(str(sp_model))
        os.makedirs(self.cache_dir, exist_ok=True)

    def _get_vocab_sz(self, raw_text_path):
        cnt = Counter()
        with open(raw_text_path, 'r') as f:
            for line in f.readlines():
                cnt.update(line.split())
                if len(cnt)//4 > self.max_vocab_sz: return self.max_vocab_sz
        res = len(cnt)//4
        while res%8 != 0: res+=1
        return res

    def train(self, raw_text_path):
        "Train a sentencepiece tokenizer on `texts` and save it in `path/tmp_dir`"
Example #7
class SentencePieceExtractor:
    """
    Extractor implementation for SentencePiece trained model_files. https://github.com/google/sentencepiece
    """
    def __init__(self, model: str):
        requires_sentencepiece(self)
        from sentencepiece import SentencePieceProcessor

        self.sp = SentencePieceProcessor()
        self.sp.Load(model)

    def extract(self) -> Tuple[Dict[str, int], List[Tuple]]:
        sp = self.sp
        vocab = {
            sp.id_to_piece(index): index
            for index in range(sp.GetPieceSize())
        }

        # Merges
        merges = []
        for piece_l in vocab.keys():
            for piece_r in vocab.keys():
                merge = f"{piece_l}{piece_r}"
                piece_id = vocab.get(merge, None)
                if piece_id:
                    merges += [(piece_l, piece_r, piece_id)]
        merges = sorted(merges, key=lambda val: val[2])
        merges = [(val[0], val[1]) for val in merges]

        return vocab, merges
Example #8
class SentencePieceTokenizer(Tokenizer, CppProcessorMixin):
    """Sentence piece tokenizer."""

    class Config(ConfigBase):
        sp_model_path: str = ""
        max_input_text_length: Optional[int] = None
        use_fb_sentencepiece: Optional[bool] = False

    def __init__(
        self,
        sp_model_path: str = "",
        max_input_text_length: Optional[int] = None,
        use_fb_sentencepiece: Optional[bool] = None,
    ):
        self.sp_model_path = sp_model_path
        self.max_input_text_length = max_input_text_length
        self.use_fb_sentencepiece = use_fb_sentencepiece
        self._load_processor()
        log_class_usage(__class__)

    @classmethod
    def from_config(cls, config: Config):
        return cls(
            config.sp_model_path,
            config.max_input_text_length,
            config.use_fb_sentencepiece,
        )

    def tokenize(self, input_str: str) -> List[Token]:
        if (
            hasattr(self, "max_input_text_length")
            and self.max_input_text_length is not None
        ):
            input_str = input_str[: self.max_input_text_length]
        pieces = self.processor.EncodeAsPieces(input_str)
        tokens = []
        # calculate start and end indices of each piece.
        end = 0
        for piece in pieces:
            original_piece = piece.lstrip("\u2581")
            start = input_str.find(original_piece, end)
            end = start + len(original_piece)
            tokens.append(Token(piece, start, end))
        return tokens

    def _load_processor(self):
        sp_model_path = PathManager.get_local_path(self.sp_model_path)
        if self.use_fb_sentencepiece:
            self.processor = torch.classes.fb.SentencePiece.fromFile(sp_model_path)
        else:
            from sentencepiece import SentencePieceProcessor

            self.processor = SentencePieceProcessor()
            self.processor.Load(sp_model_path)

    def torchscriptify(self):
        return ScriptDoNothingTokenizer()
Example #9
def main():

	parser = ArgumentParser()
	parser.add_argument("--model", required=True, help="sentencepiece model to use for encoding")
	parser.add_argument("--inputs", nargs="+", default=["-"], help="input files to filter/encode")
	parser.add_argument("--outputs", nargs="+", default=["-"], help="path to save encoded outputs")
	parser.add_argument("--output_format", choices=["piece", "id"], default="piece")
	parser.add_argument("--min-len", type=int, metavar="N", help="filter sentence pairs with fewer than N tokens")
	parser.add_argument("--max-len", type=int, metavar="N", help="filter sentence pairs with more than N tokens")
	args = parser.parse_args()

	sp = SentencePieceProcessor()
	sp.Load(args.model)

	if args.output_format == "piece":

		def encode(l):
			return sp.EncodeAsPieces(l)

	elif args.output_format == "id":

		def encode(l):
			return list(map(str, sp.EncodeAsIds(l)))

	if args.min_len is not None or args.max_len is not None:

		def valid(line):
			return (args.min_len is None or len(line) >= args.min_len) and (args.max_len is None or len(line) <= args.max_len)

	else:

		def valid(lines):
			return True

	with ExitStack() as stack:
		inputs = [stack.enter_context(open(input, "r", encoding="utf-8")) if input != "-" else sys.stdin for input in args.inputs]
		outputs = [stack.enter_context(open(output, "w", encoding="utf-8")) if output != "-" else sys.stdout for output in args.outputs]

		def encode_line(line):
			line = line.strip()
			if len(line) > 0:
				line = encode(line)
				if valid(line):
					return line
			return None

		for i, lines in enumerate(zip(*inputs), start=1):
			enc_lines = list(map(encode_line, lines))
			if not any(enc_line is None for enc_line in enc_lines):
				for enc_line, output_h in zip(enc_lines, outputs):
					print(" ".join(enc_line), file=output_h)
			if i % 10000 == 0:
				print("processed {} lines".format(i), file=sys.stderr)
Example #10
def train_dataloader():
    config = ConveRTTrainConfig(train_batch_size=10, split_size=5)
    tokenizer = SentencePieceProcessor()
    tokenizer.Load(config.sp_model_path)

    instances = load_instances_from_reddit_dataset(
        "data/sample-dataset.json")[:100]
    dataset = ConveRTDataset(instances, tokenizer)
    data_loader = DataLoader(dataset,
                             batch_size=config.train_batch_size,
                             collate_fn=convert_collate_fn)
    return data_loader
Example #11
def make_title_tdm(df, path):
    if "{}.model".format(path) not in os.listdir():
        makeSentencepieceModel(df, path)
    sp = SentencePieceProcessor()
    sp.Load("{}.model".format(path))

    cv = CountVectorizer(max_features=3000, tokenizer=sp.encode_as_pieces)
    content = df['plylst_title']
    tdm = cv.fit_transform(content)

    title_tdm = tdm.toarray()
    return cv, title_tdm
Example #12
def main(train_path,
         val_path,
         test_path,
         config_path,
         subword_model_path,
         out_dir,
         max_text_subwords,
         max_summary_subwords,
         source_suffix,
         target_suffix,
         insert_tags=False,
         lowercase=False):
    params = Params.from_file(config_path)
    reader_params = params.pop("dataset_reader", default=Params({}))
    reader = DatasetReader.from_params(reader_params)

    processor = SentencePieceProcessor()
    processor.Load(subword_model_path)

    train_text_file = os.path.join(out_dir, "train.{}".format(source_suffix))
    train_summary_file = os.path.join(out_dir,
                                      "train.{}".format(target_suffix))
    val_text_file = os.path.join(out_dir, "val.{}".format(source_suffix))
    val_summary_file = os.path.join(out_dir, "val.{}".format(target_suffix))
    test_text_file = os.path.join(out_dir, "test.{}".format(source_suffix))
    test_summary_file = os.path.join(out_dir, "test.{}".format(target_suffix))

    files = ((train_path, train_text_file,
              train_summary_file), (val_path, val_text_file, val_summary_file),
             (test_path, test_text_file, test_summary_file))
    for path, text_file_name, summary_file_name in files:
        with open(text_file_name,
                  "w") as text_file, open(summary_file_name,
                                          "w") as summary_file:
            for text, summary in reader.parse_set(path):
                if lowercase:
                    text = text.lower()
                    summary = summary.lower()
                text_subwords = processor.EncodeAsPieces(text)
                if max_text_subwords:
                    text_subwords = text_subwords[:max_text_subwords]
                summary_subwords = processor.EncodeAsPieces(summary)
                if max_summary_subwords:
                    summary_subwords = summary_subwords[:max_summary_subwords]
                if insert_tags:
                    text_subwords.insert(0, "<t>")
                    text_subwords.append("</t>")
                    summary_subwords.insert(0, "<t>")
                    summary_subwords.append("</t>")
                text_file.write(" ".join(text_subwords) + "\n")
                summary_file.write((" ".join(summary_subwords)) + "\n")
Example #13
def main(train_path,
         val_path,
         test_path,
         mode,
         subword_model_path,
         output_dir,
         max_source_subwords,
         max_target_subwords,
         source_suffix,
         target_suffix,
         lowercase=False):
    processor = SentencePieceProcessor()
    processor.Load(subword_model_path)

    os.makedirs(output_dir, exist_ok=True)
    train_source_file = os.path.join(output_dir,
                                     "train.{}".format(source_suffix))
    train_target_file = os.path.join(output_dir,
                                     "train.{}".format(target_suffix))
    val_source_file = os.path.join(output_dir, "val.{}".format(source_suffix))
    val_target_file = os.path.join(output_dir, "val.{}".format(target_suffix))
    test_source_file = os.path.join(output_dir,
                                    "test.{}".format(source_suffix))
    test_target_file = os.path.join(output_dir,
                                    "test.{}".format(target_suffix))

    parse = MODES.get(mode, None)
    assert parse is not None

    files = ((train_path, train_source_file,
              train_target_file), (val_path, val_source_file, val_target_file),
             (test_path, test_source_file, test_target_file))
    for path, source_file_name, target_file_name in files:
        with open(source_file_name,
                  "w") as source_file, open(target_file_name,
                                            "w") as target_file:
            for record in parse(path):
                source = record["source"]
                target = record["target"]
                if lowercase:
                    source = source.lower()
                    target = target.lower()
                source_subwords = processor.EncodeAsPieces(source)
                if max_source_subwords:
                    source_subwords = source_subwords[:max_source_subwords]
                target_subwords = processor.EncodeAsPieces(target)
                if max_target_subwords:
                    target_subwords = target_subwords[:max_target_subwords]
                source_file.write(" ".join(source_subwords) + "\n")
                target_file.write((" ".join(target_subwords)) + "\n")
Example #14
class SentencePieceTokenizer:
    def __init__(self, spm_file, do_lower_case=True):
        self.processor = SentencePieceProcessor()
        self.processor.Load(spm_file)
        self.do_lower_case = do_lower_case

    def tokenize(self, text):
        text = preprocess_text(text, lower=self.do_lower_case)
        pieces = encode_pieces(self.processor, text, sample=False)
        return pieces

    def convert_tokens_to_ids(self, tokens):
        return [self.processor.PieceToId(piece) for piece in tokens]

    def convert_ids_to_tokens(self, ids):
        pieces = [self.processor.IdToPiece(_id) for _id in ids]
        return pieces
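The id/piece conversions above are thin wrappers around the raw processor API. A minimal round-trip sketch; "spiece.model" is a placeholder path.

from sentencepiece import SentencePieceProcessor

sp = SentencePieceProcessor()
sp.Load("spiece.model")  # placeholder path

pieces = sp.EncodeAsPieces("sentencepiece is handy")
ids = [sp.PieceToId(p) for p in pieces]
assert [sp.IdToPiece(i) for i in ids] == pieces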
Example #15
def main() -> int:
    train_config = get_train_config()
    model_config = ConveRTModelConfig()

    logger = logger_setup(train_config.log_dir)
    device = torch.device(train_config.device if torch.cuda.is_available() else "cpu")

    tokenizer = SentencePieceProcessor()
    tokenizer.Load(train_config.sp_model_path)

    instance_load_fn = load_instances_from_reddit_dataset if train_config.is_reddit else load_instances_from_tsv_dataset
    train_instances = instance_load_fn(train_config.train_dataset_path)
    test_instances = instance_load_fn(train_config.test_dataset_path)

    train_dataset = ConveRTDataset(train_instances, tokenizer)
    test_dataset = ConveRTDataset(test_instances, tokenizer)
    train_dataloader = DataLoader(
        train_dataset, train_config.train_batch_size, collate_fn=convert_collate_fn, drop_last=True
    )
    test_dataloader = DataLoader(
        test_dataset, train_config.test_batch_size, collate_fn=convert_collate_fn, drop_last=True
    )

    model = ConveRTDualEncoder(model_config)
    criterion = ConveRTCosineLoss(split_size=train_config.split_size)

    model.to(device)
    criterion.to(device)

    if train_config.use_data_paraller and torch.cuda.is_available():
        model = nn.DataParallel(model)
        criterion = nn.DataParallel(criterion)

    trainer = ConveRTTrainer(
        model=model,
        criterion=criterion,
        train_config=train_config,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        logger=logger,
        device=device,
    )
    trainer.train()
    torch.save(trainer.model, 'final_model.pkl')
    return 0
Example #16
def load_sentencepiece_tokenizer(
        tokenizer_path: str) -> SentencePieceProcessor:
    ''' Loads an already pretrained sentencepiece tokenizer.

    Args:
        tokenizer_path: path to the files of the pretrained sentencepiece tokenizer.
    Returns:
        tokenizer: pretrained sentencepiece tokenizer.
    '''
    if not os.path.isfile(tokenizer_path):
        print("SentencePiece tokenizer not found!")
        sys.exit()

    tokenizer = SentencePieceProcessor()
    tokenizer.Load(tokenizer_path)
    # enable inserting <s> and </s> tags automatically at start/end of a sentence.
    tokenizer.set_encode_extra_options('bos:eos')
    return tokenizer
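A short sketch of what the 'bos:eos' extra option changes, assuming "tok.model" is a placeholder model trained with bos/eos symbols enabled:

from sentencepiece import SentencePieceProcessor

tok = SentencePieceProcessor()
tok.Load("tok.model")  # placeholder path

print(tok.EncodeAsIds("hello"))         # plain piece ids
tok.set_encode_extra_options('bos:eos')
print(tok.EncodeAsIds("hello"))         # now wrapped with tok.bos_id() / tok.eos_id()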
Example #17
def main(**kwargs):
    set_seed(1)
    train_config = ConveRTTrainConfig()
    model_config = ConveRTModelConfig()
    tokenizer = SentencePieceProcessor()
    args = _parse_args()
    tokenizer.Load(train_config.sp_model_path)
    train_instances = load_instances_from_reddit_json(train_config.dataset_path)
    RD = RedditData(train_instances, tokenizer, 60)
    dm = DataModule()
    train_loader = dm.train_dataloader(RD)
    model = SingleContextConvert(model_config, train_config)
    lr_decay = LearningRateDecayCallback(train_config)
    model.register_subword_params()

    trainer = (
        pl.Trainer.from_argparse_args(args, callbacks = [lr_decay],**kwargs)
    )  # ,checkpoint_callback = checkpoint_callback)  # ,resume_from_checkpoint=)
    trainer.fit(model, train_dataloader = train_loader, val_dataloaders = train_loader)
Example #18
class SentencePieceTokenizer(Tokenizer, CppProcessorMixin):
    """Sentence piece tokenizer."""
    class Config(ConfigBase):
        sp_model_path: str = ""

    def __init__(self, sp_model_path: str = ""):
        self.sp_model_path = sp_model_path
        self._load_processor()

    @classmethod
    def from_config(cls, config: Config):
        return cls(config.sp_model_path)

    def tokenize(self, input_str: str) -> List[Token]:
        pieces = self.processor.EncodeAsPieces(input_str)
        return [Token(piece, -1, -1) for piece in pieces]

    def _load_processor(self):
        self.processor = SentencePieceProcessor()
        self.processor.Load(self.sp_model_path)
Example #19
class SubwordTokenizer(Tokenizer):
    def __init__(self,
                 model_path: str = None,
                 nbest_size: int = None,
                 alpha: float = None):
        self._model_path = cached_path(model_path)
        self._processor = SentencePieceProcessor()
        self._processor.Load(self._model_path)
        self._nbest_size = nbest_size
        self._alpha = alpha

    def tokenize(self, text: str) -> List[Token]:
        if self._nbest_size and self._alpha:
            subwords = self._processor.SampleEncodeAsPieces(text, self._nbest_size, self._alpha)
        else:
            subwords = self._processor.EncodeAsPieces(text)
        tokens = [Token(s) for s in subwords]
        return tokens

    def batch_tokenize(self, texts: List[str]) -> List[List[Token]]:
        return [self.tokenize(text) for text in texts]
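For reference, a minimal standalone sketch of the sampling path used above (subword regularization); "spm.model" is a placeholder path, nbest_size=-1 samples from the full lattice, and alpha controls the smoothing:

from sentencepiece import SentencePieceProcessor

sp = SentencePieceProcessor()
sp.Load("spm.model")  # placeholder path

text = "subword regularization"
print(sp.EncodeAsPieces(text))                     # deterministic best segmentation
for _ in range(3):
    print(sp.SampleEncodeAsPieces(text, -1, 0.1))  # sampled segmentations can differ per call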
Example #20
def main():
	parser = ArgumentParser()
	parser.add_argument("--model", required=True, help="sentencepiece model to use for decoding")
	parser.add_argument("--input", default="-", help="input file to decode")
	parser.add_argument("--input_format", choices=["piece", "id"], default="piece")
	args = parser.parse_args()

	sp = SentencePieceProcessor()
	sp.Load(args.model)

	if args.input_format == "piece":

		def decode(l):
			return "".join(sp.DecodePieces(l))

	elif args.input_format == "id":

		def decode(l):
			return "".join(sp.DecodeIds(l))

	def tok2int(tok):
		# remap reference-side <unk> to 0
		return int(tok) if tok != "<unk>" else 0

	if args.input == "-":
		if args.input_format == "id":
			for line in sys.stdin:
				print(decode(list(map(tok2int, line.rstrip().split()))))
		elif args.input_format == "piece":
			for line in sys.stdin:
				print(decode(line.rstrip().split()))
	else:
		with open(args.input, "r", encoding="utf-8") as h:
			if args.input_format == "id":
				for line in h:
					print(decode(list(map(tok2int, line.rstrip().split()))))
			elif args.input_format == "piece":
				for line in h:
					print(decode(line.rstrip().split()))
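The decode helpers above are thin wrappers over DecodePieces/DecodeIds. A round-trip sketch with a placeholder model path; SentencePiece detokenization is lossless for ordinary text:

from sentencepiece import SentencePieceProcessor

sp = SentencePieceProcessor()
sp.Load("spm.model")  # placeholder path

text = "round trip test"
assert sp.DecodePieces(sp.EncodeAsPieces(text)) == text
assert sp.DecodeIds(sp.EncodeAsIds(text)) == text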
Example #21
class SentencePieceTokenizer:
    def __init__(self, spm_file, do_lower_case=True):
        if not os.path.exists(spm_file):
            raise ValueError(
                "Can't find spm_file \"%s\". "
                "Please pass the correct path of sentence-piece model file, "
                "e.g.`spiece.model`." % spm_file
            )
        self.processor = SentencePieceProcessor()
        self.processor.Load(spm_file)
        self.do_lower_case = do_lower_case

    def tokenize(self, text):
        text = preprocess_text(text, lower=self.do_lower_case)
        pieces = encode_pieces(self.processor, text, sample=False)
        return pieces

    def convert_tokens_to_ids(self, tokens):
        return [self.processor.PieceToId(piece) for piece in tokens]

    def convert_ids_to_tokens(self, ids):
        pieces = [self.processor.IdToPiece(_id) for _id in ids]
        return pieces
Example #22
def main():
    options = parse_args()
    torch.manual_seed(options.seed)
    basename = os.path.splitext(os.path.basename(options.input))[0]
    out_dir = options.out_dir or "data/{}/".format(basename)
    spinner = Halo(spinner="dots", placement="right")

    with open(options.input, "r", encoding="utf8") as fd:
        reader = csv.reader(fd, delimiter="\t", quoting=csv.QUOTE_NONE, quotechar="")
        lines = [[line[0]] for line in reader]

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    output_full = os.path.join(out_dir, "{}.tsv".format(basename))
    with open(output_full, "w", encoding="utf8") as fd:
        writer = csv.writer(fd, delimiter="\t", quoting=csv.QUOTE_NONE, quotechar="")
        writer.writerows(lines)

    vocab_size = 32000
    spiece_out = os.path.join(out_dir, "spiece")
    spiece_args = (
        "--input={} "
        "--model_prefix={} "
        "--vocab_size={} "
        "--character_coverage=1.0"
    ).format(output_full, spiece_out, vocab_size)
    SentencePieceTrainer.Train(spiece_args)
    # Load the generated vocabulary
    with open("{}.vocab".format(spiece_out), "r", encoding="utf8") as fd:
        reader = csv.reader(
            fd, delimiter="\t", quoting=csv.QUOTE_NONE, quotechar=""
        )
        vocab = [line[0] for line in reader]
    # Remove the special tokens <unk>, <s>, </s>
    vocab = vocab[3:]

    # Convert to BERT style
    bert_vocab = [
        v[1:] if v.startswith("▁") else "##{}".format(v) for v in vocab if v != "▁"
    ]
    # Add BERT's special tokens to the beginning
    bert_vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"] + bert_vocab
    # Fill up with unused tokens
    pad_size = vocab_size - len(bert_vocab)
    bert_vocab += ["unused{}".format(i) for i in range(pad_size)]
    with open(os.path.join(out_dir, "vocab.txt"), "w", encoding="utf8") as fd:
        writer = csv.writer(
            fd, delimiter="\t", quoting=csv.QUOTE_NONE, quotechar=""
        )
        writer.writerows([[b] for b in bert_vocab])

    # Convert to GPT-2 style
    # Unfortunately it's slow and tedious.
    spinner.start(text="Generating BPE vocabulary")
    gpt2_vocab = ["Ġ{}".format(v[1:]) if v.startswith("▁") else v for v in vocab]
    # Add the GPT-2 special token to the end
    gpt2_vocab.append("<|endoftext|>")
    with open(os.path.join(out_dir, "vocab.json"), "w", encoding="utf8") as fd:
        json.dump({v: i for i, v in enumerate(gpt2_vocab)}, fd, ensure_ascii=False)
    spiece_processor = SentencePieceProcessor()
    spiece_processor.Load("{}.model".format(spiece_out))
    # Encode the whole text
    encoded = [
        [" ".join(spiece_processor.EncodeAsPieces(line[0])).replace("▁", "Ġ")]
        for line in lines
    ]
    tmp_encoded_fd, tmp_encoded_path = tempfile.mkstemp()
    tmp_bpe_fd, tmp_bpe_path = tempfile.mkstemp()
    try:
        # Write the encoded text to a temporary file.
        with os.fdopen(tmp_encoded_fd, "w", encoding="utf8") as fd:
            writer = csv.writer(
                fd, delimiter="\t", quoting=csv.QUOTE_NONE, quotechar=""
            )
            writer.writerows(encoded)
        learn_bpe(
            open(tmp_encoded_path, "r", encoding="utf8"),
            open(tmp_bpe_path, "w", encoding="utf8"),
            num_symbols=vocab_size,
        )
        with open(tmp_bpe_path, "r", encoding="utf8") as fd:
            reader = csv.reader(
                fd, delimiter="\t", quoting=csv.QUOTE_NONE, quotechar=""
            )
            seen = set()
            merges = []
            for line in reader:
                # Get rid of the </w> tokens
                line = line[0].replace("</w>", "")
                # Remove duplicates (due to </w> tokens)
                if line not in seen:
                    seen.add(line)
                    merges.append([line])
        with open(os.path.join(out_dir, "merges.txt"), "w", encoding="utf8") as fd:
            writer = csv.writer(
                fd, delimiter="\t", quoting=csv.QUOTE_NONE, quotechar=""
            )
            writer.writerows(merges)
    finally:
        os.remove(tmp_encoded_path)
        os.remove(tmp_bpe_path)
    spinner.stop()
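The vocabulary conversions in this script reduce to two symbol rewrites. A toy illustration with a made-up piece list (the pieces are not from the trained model):

# Toy illustration of the two rewrites used above.
vocab = ["▁the", "▁quick", "ly", "▁fox"]

# BERT style: drop the word-initial marker, prefix continuation pieces with "##".
bert_vocab = [v[1:] if v.startswith("▁") else "##{}".format(v) for v in vocab if v != "▁"]
print(bert_vocab)  # ['the', 'quick', '##ly', 'fox']

# GPT-2 style: map the word-initial marker to "Ġ".
gpt2_vocab = ["Ġ{}".format(v[1:]) if v.startswith("▁") else v for v in vocab]
print(gpt2_vocab)  # ['Ġthe', 'Ġquick', 'ly', 'Ġfox']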
Example #23
def main(input_dir,
         subword_model_path,
         output_dir,
         max_source_subwords,
         max_target_subwords,
         source_suffix,
         target_suffix,
         lowercase=False):
    processor = SentencePieceProcessor()
    processor.Load(subword_model_path)

    os.makedirs(output_dir, exist_ok=True)
    train_source_file = os.path.join(output_dir,
                                     "train.{}".format(source_suffix))
    train_target_file = os.path.join(output_dir,
                                     "train.{}".format(target_suffix))
    val_source_file = os.path.join(output_dir, "val.{}".format(source_suffix))
    val_target_file = os.path.join(output_dir, "val.{}".format(target_suffix))
    test_source_file = os.path.join(output_dir,
                                    "test.{}".format(source_suffix))
    test_target_file = os.path.join(output_dir,
                                    "test.{}".format(target_suffix))

    dirs = list(os.listdir(input_dir))
    tasks = []
    for d in dirs:
        if d.startswith("_"):
            continue
        mode = d.lower()
        parse = MODES.get(mode, None)
        assert parse is not None
        tasks.append((os.path.join(input_dir, d), mode, parse))

    files = (("train.jsonl", train_source_file, train_target_file),
             ("val.jsonl", val_source_file, val_target_file),
             ("test.jsonl", test_source_file, test_target_file))
    for orig_file_name, source_file_name, target_file_name in files:
        records = []
        for d, mode, parse in tasks:
            if orig_file_name != "test.jsonl" and mode == "lidirus":
                continue
            elif orig_file_name == "test.jsonl" and mode == "lidirus":
                path = os.path.join(d, "LiDiRuS.jsonl")
            else:
                path = os.path.join(d, orig_file_name)
            for record in parse(path):
                source = mode + SEPARATOR + str(
                    record["idx"]) + SEPARATOR + record["source"]
                target = record["target"]
                if lowercase:
                    source = source.lower()
                    target = target.lower()
                source_subwords = processor.EncodeAsPieces(source)
                if max_source_subwords:
                    source_subwords = source_subwords[:max_source_subwords]
                target_subwords = processor.EncodeAsPieces(target)
                if max_target_subwords:
                    target_subwords = target_subwords[:max_target_subwords]
                source = " ".join(source_subwords)
                target = " ".join(target_subwords)
                records.append((source, target))
        random.shuffle(records)
        with open(source_file_name,
                  "w") as source_file, open(target_file_name,
                                            "w") as target_file:
            for source, target in records:
                source_file.write(source + "\n")
                target_file.write(target + "\n")
Example #24
def tokenizer() -> SentencePieceProcessor:
    config = ConveRTTrainConfig()
    tokenizer = SentencePieceProcessor()
    tokenizer.Load(config.sp_model_path)
    return tokenizer
Example #25
class SentencepieceFasttextEmbed(EmbedderInterface):
    class Config(EmbedderInterface.Config):
        pass

    @classmethod
    def from_config(cls, config: Config):
        spm_model_file = os.path.join(config.preproc_dir, "spm.model")
        fasttext_model_file = os.path.join(config.preproc_dir,
                                           "fasttext-model.bin")
        return cls(spm_model_file, fasttext_model_file, config.max_pieces)

    def __init__(self,
                 spm_model_file: str,
                 fasttext_model_file: str = '',
                 max_pieces: int = -1):
        super().__init__(max_pieces=max_pieces)

        self.spm = SentencePieceProcessor()
        self.spm.Load(spm_model_file)
        self.pad_idx = self.spm.pad_id()
        self.pad_token = self.spm.IdToPiece(self.pad_idx)
        self.unk_idx = self.spm.unk_id()
        self.unk_token = self.spm.IdToPiece(self.unk_idx)
        self.bos_idx = self.spm.bos_id()
        self.bos_token = self.spm.IdToPiece(self.bos_idx)
        self.eos_idx = self.spm.eos_id()
        self.eos_token = self.spm.IdToPiece(self.eos_idx)

        if fasttext_model_file:
            self.fasttext = fasttext.load_model(fasttext_model_file)

    @property
    def embed_dim(self):
        return self.fasttext.dim

    @property
    def n_vocab(self):
        return self.spm.get_piece_size()

    def encode_text_as_ids(self, text: str) -> np.array:
        """
    Doesn't produce BOS, EOS ids.
    """
        return np.asarray(self.spm.EncodeAsIds(text)[self.pieces_slice],
                          dtype=np.int32)

    def encode_text_as_tokens(self, text: str) -> List[str]:
        """
    Doesn't produce BOS, EOS tokens.
    """
        return self.spm.EncodeAsPieces(text)[self.pieces_slice]

    def tokenize(self, text: str) -> List[str]:
        """
    Alias for `encode_text_as_tokens`.
    Doesn't produce BOS, EOS tokens.
    """
        return self.encode_text_as_tokens(text)[self.pieces_slice]

    def decode_ids_as_text(self, ids: List[int], strip_special=True) -> str:
        """
    Doesn't produce PAD, BOS, or EOS text.
    i.e. PAD, BOS, EOS ids are stripped out before decoding.
    UNK is decoded but unintelligible.
    """
        if strip_special:
            ids = [
                int(id) for id in ids
                if id not in (self.pad_idx, self.bos_idx, self.eos_idx)
            ]
        else:
            ids = [int(id) for id in ids]
        return self.spm.DecodeIds(ids)

    def decode_tokens_as_text(self, toks: List[str]) -> str:
        """
    Doesn't produce PAD, BOS, or EOS text.
    i.e. PAD, BOS, EOS tokens are stripped out before decoding.
    UNK is decoded but unintelligible.
    """
        return self.spm.DecodePieces(toks[self.pieces_slice])

    @functools.lru_cache(maxsize=1024)
    def decode_id_as_token(self, id: int) -> str:
        return self.spm.IdToPiece(id)

    def decode_ids_as_tokens(self,
                             ids: List[int],
                             strip_special: bool = True) -> List[str]:
        """
    By default, doesn't produce PAD, BOS, EOS tokens.

    Avoids problematic intermediate string representation that causes length mismatch.
    In other words, SentencePiece isn't isomorphic with respect to the string representation.
    """
        if strip_special:
            ids = [
                id for id in ids
                if id not in (self.pad_idx, self.bos_idx, self.eos_idx)
            ]
        return [self.decode_id_as_token(int(ix)) for ix in ids]

    @functools.lru_cache(maxsize=1024)
    def embed_tok(self, tok: str) -> np.array:
        """
    When given PAD, returns all zeros
    """
        if tok == self.pad_token:
            return np.zeros(self.fasttext.dim)
        return np.asarray(self.fasttext[tok])

    def embed_text(self, text: str) -> np.array:
        """
    Doesn't produce PAD, BOS, EOS embeddings.
    i.e. PAD, BOS, EOS are stripped out during tokenization before embedding.
    """
        return np.asarray([self.embed_tok(tok) for tok in self.tokenize(text)])

    def embed_ids(self,
                  ids: List[int],
                  strip_special: bool = True) -> List[np.array]:
        """
    By default, doesn't produce PAD, BOS, EOS tokens.

    Avoids problematic intermediate string representation that causes length mismatch.
    In other words, SentencePiece isn't isomorphic with respect to the string representation.
    """
        return [
            self.embed_tok(t)
            for t in self.decode_ids_as_tokens(ids,
                                               strip_special=strip_special)
        ]

    def embed_ids_batch(self, ids: np.array) -> torch.tensor:
        emb = [self.embed_ids(turn, strip_special=False) for turn in ids]
        emb = torch.tensor(emb)
        return emb
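The constructor above assumes the model was trained with pad/unk/bos/eos symbols defined. A minimal sketch of those accessors; "spm.model" is a placeholder path and the id layout depends on the training flags:

from sentencepiece import SentencePieceProcessor

spm = SentencePieceProcessor()
spm.Load("spm.model")  # placeholder; assumed trained with pad/bos/eos enabled

for name, idx in [("pad", spm.pad_id()), ("unk", spm.unk_id()),
                  ("bos", spm.bos_id()), ("eos", spm.eos_id())]:
    # an id of -1 means the symbol was disabled at training time
    print(name, idx, spm.IdToPiece(idx) if idx >= 0 else None)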
Example #26
def song_inference():
    sp_total_model_path = "sp_total"
    train = pd.read_json('./dataset/train.json', typ='frame', encoding='utf-8')
    song = pd.read_json('./dataset/song_meta.json',
                        typ='frame',
                        encoding='utf-8')
    plylst_tag = train['tags']
    tag_counter = Counter([tg for tgs in plylst_tag for tg in tgs])
    tag_dict = {x: tag_counter[x] for x in tag_counter}

    tag_id_tid = dict()
    tag_tid_id = dict()
    for i, t in enumerate(tag_dict):
        tag_id_tid[t] = i
        tag_tid_id[i] = t
    n_tags = len(tag_dict)

    plylst_song = train['songs']
    song_dict = {x: x for x in song['id']}

    n_songs = len(song_dict)

    train['tags_id'] = train['tags'].map(
        lambda x: [tag_id_tid.get(t) for t in x if tag_id_tid.get(t) != None])
    # Collect the genre codes of each playlist's songs.
    song_cate = []

    for i in range(len(train)):
        gnr = []
        songs = train.iloc[i, 3]

        for j in songs:
            for k in song.loc[j, 'song_gn_dtl_gnr_basket']:
                gnr.append(k)
        song_cate.append(gnr)

    train['plylst_genre'] = song_cate

    plylst_genre = train['plylst_genre']
    genre_counter = Counter([gen for genre in plylst_genre for gen in genre])
    genre_dict = {x: genre_counter[x] for x in genre_counter}

    genre_id_tid = dict()
    genre_tid_id = dict()
    for i, t in enumerate(genre_dict):
        genre_id_tid[t] = i
        genre_tid_id[i] = t
    n_genre = len(genre_dict)
    train['plylst_genre_id'] = train['plylst_genre'].map(
        lambda x:
        [genre_id_tid.get(s) for s in x if genre_id_tid.get(s) != None])

    gnr_array = np.zeros((len(train), n_genre))
    for i, index in enumerate(train.index):
        if i % 10000 == 0:
            print(i)
        counter = Counter(train.loc[index]['plylst_genre_id'])
        for (k, c) in counter.items():
            gnr_array[i][k] = c
    gnr_array.shape

    song['issue_date'] = song['issue_date'].astype('str').map(lambda x: x[:6])

    plylst_use = train[['plylst_title', 'updt_date', 'tags_id', 'songs']]
    plylst_use.loc[:, 'num_songs'] = plylst_use['songs'].map(len)
    plylst_use.loc[:, 'num_tags'] = plylst_use['tags_id'].map(len)

    plylst_train = plylst_use

    n_train = len(plylst_train)
    row = np.repeat(range(n_train),
                    plylst_train['num_songs'])  # repeat each playlist index once per song it contains
    col = [song for songs in plylst_train['songs']
           for song in songs]  # flattened song ids
    dat = np.repeat(1, plylst_train['num_songs'].sum()
                    )  # a 1 for every (playlist, song) pair
    train_user_songs_A = spr.csr_matrix(
        (dat, (row, col)), shape=(n_train, n_songs))  # build the csr_matrix

    row = np.repeat(range(n_train), plylst_train['num_tags'])
    col = [tag for tags in plylst_train['tags_id'] for tag in tags]
    dat = np.repeat(1, plylst_train['num_tags'].sum())
    train_user_tags_A = spr.csr_matrix((dat, (row, col)),
                                       shape=(n_train, n_tags))

    train_user_songs_A_T = train_user_songs_A.T.tocsr()
    train_user_songs_A_T  # rows are songs, columns are playlists

    train_user_tags_A_T = train_user_tags_A.T.tocsr()
    train_user_tags_A_T  # rows are tags, columns are playlists

    val = pd.read_json('./dataset/val.json', typ='frame', encoding='utf-8')

    song_cate = []

    for i in range(len(val)):
        gnr = []
        songs = val.iloc[i, 3]

        for j in songs:
            for k in song.loc[j, 'song_gn_dtl_gnr_basket']:
                gnr.append(k)
        song_cate.append(gnr)

    val['plylst_genre'] = song_cate

    val['tags_id'] = val['tags'].map(
        lambda x: [tag_id_tid.get(t) for t in x if tag_id_tid.get(t) != None])
    val['plylst_genre_id'] = val['plylst_genre'].map(
        lambda x:
        [genre_id_tid.get(s) for s in x if genre_id_tid.get(s) != None])
    val.loc[:, 'num_songs'] = val['songs'].map(len)
    val.loc[:, 'num_tags'] = val['tags_id'].map(len)
    # val_title = cv.transform(val['plylst_title']).toarray()

    gnr_val = np.zeros((len(val), n_genre))
    for i, index in enumerate(val.index):
        if i % 10000 == 0:
            print(i)
        counter = Counter(val.loc[index]['plylst_genre_id'])
        for (k, c) in counter.items():
            gnr_val[i][k] = c
    gnr_val.shape

    n_val = len(val)
    row = np.repeat(range(n_val), val['num_songs'])  # repeat each playlist index once per song it contains
    col = [song for songs in val['songs']
           for song in songs]  # flattened song ids
    dat = np.repeat(
        1,
        val['num_songs'].sum())  # a 1 for every (playlist, song) pair
    val_user_songs_A = spr.csr_matrix((dat, (row, col)),
                                      shape=(n_val, n_songs))  # build the csr_matrix

    row = np.repeat(range(n_val), val['num_tags'])
    col = [tag for tags in val['tags_id'] for tag in tags]
    dat = np.repeat(1, val['num_tags'].sum())
    val_user_tags_A = spr.csr_matrix((dat, (row, col)), shape=(n_val, n_tags))

    val_user_songs_A_T = val_user_songs_A.T.tocsr()
    val_user_tags_A_T = val_user_tags_A.T.tocsr()

    test = pd.read_json('./dataset/test.json', typ='frame', encoding='utf-8')

    song_cate = []

    for i in range(len(test)):
        gnr = []
        songs = test.iloc[i, 3]

        for j in songs:
            for k in song.loc[j, 'song_gn_dtl_gnr_basket']:
                gnr.append(k)
        song_cate.append(gnr)

    test['plylst_genre'] = song_cate

    test['tags_id'] = test['tags'].map(
        lambda x: [tag_id_tid.get(t) for t in x if tag_id_tid.get(t) != None])
    test['plylst_genre_id'] = test['plylst_genre'].map(
        lambda x:
        [genre_id_tid.get(s) for s in x if genre_id_tid.get(s) != None])
    test.loc[:, 'num_songs'] = test['songs'].map(len)
    test.loc[:, 'num_tags'] = test['tags_id'].map(len)
    # test_title = cv.transform(test['plylst_title']).toarray()

    gnr_test = np.zeros((len(test), n_genre))
    for i, index in enumerate(test.index):
        if i % 10000 == 0:
            print(i)
        counter = Counter(test.loc[index]['plylst_genre_id'])
        for (k, c) in counter.items():
            gnr_test[i][k] = c
    gnr_test.shape

    n_test = len(test)
    row = np.repeat(range(n_test),
                    test['num_songs'])  # repeat each playlist index once per song it contains
    col = [song for songs in test['songs']
           for song in songs]  # flattened song ids
    dat = np.repeat(
        1,
        test['num_songs'].sum())  # a 1 for every (playlist, song) pair
    test_user_songs_A = spr.csr_matrix(
        (dat, (row, col)), shape=(n_test, n_songs))  # build the csr_matrix

    row = np.repeat(range(n_test), test['num_tags'])
    col = [tag for tags in test['tags_id'] for tag in tags]
    dat = np.repeat(1, test['num_tags'].sum())
    test_user_tags_A = spr.csr_matrix((dat, (row, col)),
                                      shape=(n_test, n_tags))

    test_user_songs_A_T = test_user_songs_A.T.tocsr()
    test_user_tags_A_T = test_user_tags_A.T.tocsr()

    data_all = pd.concat([train, val, test])
    data_all.index = range(len(data_all))

    arts = song['artist_id_basket'].map(lambda x: x[0])

    arts = pd.DataFrame(arts)

    art_counts = arts['artist_id_basket'].value_counts().reset_index()
    art_counts.columns = ['artist_id_basket', 'counts']

    arts2 = pd.merge(arts, art_counts, how='left', on=['artist_id_basket'])

    song_art = song.iloc[arts2.query('counts >= 12')['artist_id_basket'].index]

    song_art = song_art[['artist_id_basket']]

    # Artist top-level categories
    ART_cate = []

    for i in tqdm_notebook(range(len(data_all))):
        ART = []
        songs = data_all.loc[i, 'songs']

        for j in songs:
            if j in song_art.index:
                for k in song_art.loc[j, 'artist_id_basket']:
                    ART.append(k)
        ART_cate.append(ART)

    data_all['plylst_ARTIST'] = ART_cate

    plylst_ARTIST = data_all['plylst_ARTIST']
    ARTIST_counter = Counter(
        [ART for ARTIST in plylst_ARTIST for ART in ARTIST])
    ARTIST_dict = {x: ARTIST_counter[x] for x in ARTIST_counter}

    ARTIST_id_tid = dict()
    ARTIST_tid_id = dict()
    for i, t in enumerate(ARTIST_dict):
        ARTIST_id_tid[t] = i
        ARTIST_tid_id[i] = t
    n_ARTIST = len(ARTIST_dict)
    data_all['plylst_ARTIST_id'] = data_all['plylst_ARTIST'].map(
        lambda x:
        [ARTIST_id_tid.get(s) for s in x if ARTIST_id_tid.get(s) != None])

    ART_data_all = np.zeros((len(data_all), n_ARTIST))
    for i, index in enumerate(data_all.index):
        if i % 10000 == 0:
            print(i)
        counter = Counter(data_all.loc[index]['plylst_ARTIST_id'])
        for (k, c) in counter.items():
            ART_data_all[i][k] = c
    ART_data_all.shape

    ART_array = ART_data_all[:len(train)]
    ART_val = ART_data_all[len(train):len(train) + len(val)]
    ART_test = ART_data_all[len(train) + len(val):len(train) + len(val) +
                            len(test)]

    # ART_data_all = sparse.csr_matrix(ART_data_all)
    del ART_data_all

    ART_array = sparse.csr_matrix(ART_array)
    ART_val = sparse.csr_matrix(ART_val)
    ART_test = sparse.csr_matrix(ART_test)

    # Collect each playlist's song issue-date info.
    tim_cate = []

    for i in tqdm_notebook(range(len(data_all))):
        tim = []
        songs = data_all.loc[i, 'songs']

        for j in songs:
            tim.append(song.loc[j, 'issue_date'])
        tim_cate.append(tim)

    data_all['plylst_times'] = tim_cate

    plylst_times = data_all['plylst_times']
    times_counter = Counter([tim for times in plylst_times for tim in times])
    times_dict = {x: times_counter[x] for x in times_counter}

    times_id_tid = dict()
    times_tid_id = dict()
    for i, t in enumerate(times_dict):
        times_id_tid[t] = i
        times_tid_id[i] = t
    n_times = len(times_dict)
    data_all['plylst_times_id'] = data_all['plylst_times'].map(
        lambda x:
        [times_id_tid.get(s) for s in x if times_id_tid.get(s) != None])

    tim_data_all = np.zeros((len(data_all), n_times))
    for i, index in enumerate(data_all.index):
        if i % 10000 == 0:
            print(i)
        counter = Counter(data_all.loc[index]['plylst_times_id'])
        for (k, c) in counter.items():
            tim_data_all[i][k] = c

    tim_array = tim_data_all[:len(train)]
    tim_val = tim_data_all[len(train):len(train) + len(val)]
    tim_test = tim_data_all[len(train) + len(val):len(train) + len(val) +
                            len(test)]

    # tim_data_all = sparse.csr_matrix(tim_data_all)
    del tim_data_all

    tim_array = sparse.csr_matrix(tim_array)
    tim_val = sparse.csr_matrix(tim_val)
    tim_test = sparse.csr_matrix(tim_test)

    # Genre top-level categories
    GEN_cate = []

    for i in tqdm_notebook(range(len(data_all))):
        GEN = []
        songs = data_all.loc[i, 'songs']

        for j in songs:
            for k in song.loc[j, 'song_gn_gnr_basket']:
                GEN.append(k)
        GEN_cate.append(GEN)

    data_all['plylst_GENRE'] = GEN_cate

    plylst_GENRE = data_all['plylst_GENRE']
    GENRE_counter = Counter([GEN for GENRE in plylst_GENRE for GEN in GENRE])
    GENRE_dict = {x: GENRE_counter[x] for x in GENRE_counter}

    GENRE_id_tid = dict()
    GENRE_tid_id = dict()
    for i, t in enumerate(GENRE_dict):
        GENRE_id_tid[t] = i
        GENRE_tid_id[i] = t
    n_GENRE = len(GENRE_dict)
    data_all['plylst_GENRE_id'] = data_all['plylst_GENRE'].map(
        lambda x:
        [GENRE_id_tid.get(s) for s in x if GENRE_id_tid.get(s) != None])

    GEN_data_all = np.zeros((len(data_all), n_GENRE))
    for i, index in enumerate(data_all.index):
        if i % 10000 == 0:
            print(i)
        counter = Counter(data_all.loc[index]['plylst_GENRE_id'])
        for (k, c) in counter.items():
            GEN_data_all[i][k] = c

    GEN_array = GEN_data_all[:len(train)]
    GEN_val = GEN_data_all[len(train):len(train) + len(val)]
    GEN_test = GEN_data_all[len(train) + len(val):len(train) + len(val) +
                            len(test)]
    # GEN_data_all = sparse.csr_matrix(GEN_data_all)
    del GEN_data_all

    GEN_array = sparse.csr_matrix(GEN_array)
    GEN_val = sparse.csr_matrix(GEN_val)
    GEN_test = sparse.csr_matrix(GEN_test)

    content = data_all['plylst_title']
    if "{}.model".format(sp_total_model_path) not in os.listdir():
        makeSentencepieceModel(data_all, sp_total_model_path)
    sp = SentencePieceProcessor()
    sp.Load("{}.model".format(sp_total_model_path))

    cv = CountVectorizer(max_features=3000, tokenizer=sp.encode_as_pieces)
    content = data_all['plylst_title']
    tdm = cv.fit_transform(content)

    title_tdm = tdm.toarray()

    title_tr = title_tdm[:len(train)]
    title_va = title_tdm[len(train):len(train) + len(val)]
    title_ts = title_tdm[len(train) + len(val):len(train) + len(val) +
                         len(test)]

    title_gnr = np.concatenate((gnr_array, title_tr), axis=1)
    val_title_gnr = np.concatenate((gnr_val, title_va), axis=1)
    test_title_gnr = np.concatenate((gnr_test, title_ts), axis=1)

    title_sp = sparse.csr_matrix(title_tdm)

    title_gnr = sparse.csr_matrix(title_gnr)
    val_title_gnr = sparse.csr_matrix(val_title_gnr)
    test_title_gnr = sparse.csr_matrix(test_title_gnr)

    title_gnr = vstack([title_gnr, val_title_gnr, test_title_gnr])
    song_sp = vstack([train_user_songs_A, val_user_songs_A, test_user_songs_A])
    tag_sp = vstack([train_user_tags_A, val_user_tags_A, test_user_tags_A])
    times_sp = vstack([tim_array, tim_val, tim_test])
    GEN_sp = vstack([GEN_array, GEN_val, GEN_test])

    ART_sp = vstack([ART_array, ART_val, ART_test])

    # song_sp_T = song_sp.T.tocsr()
    # tag_sp_T = tag_sp.T.tocsr()

    model_knn_song25 = NearestNeighbors(metric='cosine',
                                        algorithm='brute',
                                        n_neighbors=25,
                                        n_jobs=-1)
    model_knn_tag25 = NearestNeighbors(metric='cosine',
                                       algorithm='brute',
                                       n_neighbors=25,
                                       n_jobs=-1)
    model_knn_title25 = NearestNeighbors(metric='cosine',
                                         algorithm='brute',
                                         n_neighbors=25,
                                         n_jobs=-1)
    model_knn_title_gnr25 = NearestNeighbors(metric='cosine',
                                             algorithm='brute',
                                             n_neighbors=25,
                                             n_jobs=-1)
    model_knn_times25 = NearestNeighbors(metric='cosine',
                                         algorithm='brute',
                                         n_neighbors=25,
                                         n_jobs=-1)
    model_knn_GEN25 = NearestNeighbors(metric='cosine',
                                       algorithm='brute',
                                       n_neighbors=25,
                                       n_jobs=-1)
    model_knn_ART25 = NearestNeighbors(metric='cosine',
                                       algorithm='brute',
                                       n_neighbors=25,
                                       n_jobs=-1)

    model_knn_song40 = NearestNeighbors(metric='cosine',
                                        algorithm='brute',
                                        n_neighbors=40,
                                        n_jobs=-1)
    model_knn_tag40 = NearestNeighbors(metric='cosine',
                                       algorithm='brute',
                                       n_neighbors=40,
                                       n_jobs=-1)
    model_knn_title40 = NearestNeighbors(metric='cosine',
                                         algorithm='brute',
                                         n_neighbors=40,
                                         n_jobs=-1)
    model_knn_title_gnr40 = NearestNeighbors(metric='cosine',
                                             algorithm='brute',
                                             n_neighbors=40,
                                             n_jobs=-1)
    model_knn_times40 = NearestNeighbors(metric='cosine',
                                         algorithm='brute',
                                         n_neighbors=40,
                                         n_jobs=-1)
    model_knn_GEN40 = NearestNeighbors(metric='cosine',
                                       algorithm='brute',
                                       n_neighbors=40,
                                       n_jobs=-1)
    model_knn_ART40 = NearestNeighbors(metric='cosine',
                                       algorithm='brute',
                                       n_neighbors=40,
                                       n_jobs=-1)

    model_knn_song25.fit(song_sp)
    model_knn_tag25.fit(tag_sp)
    model_knn_title25.fit(title_sp)
    model_knn_title_gnr25.fit(title_gnr)
    model_knn_times25.fit(times_sp)
    model_knn_GEN25.fit(GEN_sp)
    model_knn_ART25.fit(ART_sp)

    model_knn_song40.fit(song_sp)
    model_knn_tag40.fit(tag_sp)
    model_knn_title40.fit(title_sp)
    model_knn_title_gnr40.fit(title_gnr)
    model_knn_times40.fit(times_sp)
    model_knn_GEN40.fit(GEN_sp)
    model_knn_ART40.fit(ART_sp)

    train.loc[:, 'num_songs'] = train['songs'].map(len)
    train.loc[:, 'num_tags'] = train['tags_id'].map(len)

    data_all = pd.concat([train, val, test])

    data_all.index = range(len(data_all))

    res = []
    for i in tqdm_notebook(range(len(test))):
        data = test.iloc[i]
        pid = i

        if len(data['songs']) >= 2 and len(data['tags_id']) >= 2:
            p = np.zeros((707989, 1))
            p[data['songs']] = 1

            pp = np.zeros((n_tags, 1))
            pp[data['tags_id']] = 1

            tra_song = data_all.iloc[model_knn_song25.kneighbors(p.T)[1][0]]
            row = np.repeat(range(25),
                            tra_song['num_songs'])  # repeat each neighbor index once per song it contains
            col = [song for songs in tra_song['songs']
                   for song in songs]  # flattened song ids
            dat = np.repeat(1, tra_song['num_songs'].sum()
                            )  # a 1 for every (neighbor, song) pair
            tra_song_sp = spr.csr_matrix((dat, (row, col)),
                                         shape=(25, n_songs))  # build the csr_matrix
            tra_song_sp_T = tra_song_sp.T.tocsr()

            tra_tag = data_all.iloc[model_knn_tag25.kneighbors(pp.T)[1][0]]
            row = np.repeat(range(25), tra_tag['num_tags'])
            col = [tag for tags in tra_tag['tags_id'] for tag in tags]
            dat = np.repeat(1, tra_tag['num_tags'].sum())
            tra_tag_sp = spr.csr_matrix((dat, (row, col)), shape=(25, n_tags))
            tra_tag_sp_T = tra_tag_sp.T.tocsr()

            tra_tim = times_sp[model_knn_times25.kneighbors(tim_test[i:(i + 1)])[1][0]]
            tra_GEN = GEN_sp[model_knn_GEN25.kneighbors(GEN_test[i:(i + 1)])[1][0]]
            tra_ART = ART_sp[model_knn_ART25.kneighbors(ART_test[i:(i + 1)])[1][0]]
            tra_title_gnr = title_gnr[model_knn_title_gnr25.kneighbors(test_title_gnr[i:(i + 1)])[1][0]]

            songs_already = data["songs"]
            tags_already = data["tags_id"]

            test_song = cosine_similarity(tra_song_sp, p.T)
            test_tag = cosine_similarity(tra_tag_sp, pp.T)

            test_tim = cosine_similarity(tra_tim, tim_test[i:(i + 1)])
            test_GEN = cosine_similarity(tra_GEN, GEN_test[i:(i + 1)])
            test_ART = cosine_similarity(tra_ART, ART_test[i:(i + 1)])
            test_title_genre = cosine_similarity(tra_title_gnr,
                                                 test_title_gnr[i:(i + 1)])

            testi = test_song * test_tag * test_title_genre * test_tim * test_GEN * test_ART

            # songs-by-neighbors matrix times neighbor similarities: songs of similar playlists get high scores
            cand_song = tra_song_sp_T.dot(testi)
            cand_song_idx = cand_song.reshape(-1).argsort()[-300:][::-1]  # top 300 songs by score

            cand_song_idx = cand_song_idx[~np.isin(cand_song_idx, songs_already)]  # drop songs already in the playlist
            cand1 = pd.DataFrame(cand_song).iloc[cand_song_idx].reset_index()
            ####### 40 ####################################################
            tra_song = data_all.iloc[model_knn_song40.kneighbors(p.T)[1][0]]
            row = np.repeat(range(40), tra_song['num_songs'])  # repeat each neighbor index by its song count
            col = [song for songs in tra_song['songs'] for song in songs]  # flatten the neighbors' song ids
            dat = np.repeat(1, tra_song['num_songs'].sum())  # one 1 per (neighbor, song) pair
            tra_song_sp = spr.csr_matrix((dat, (row, col)), shape=(40, n_songs))  # neighbor-by-song csr_matrix
            tra_song_sp_T = tra_song_sp.T.tocsr()

            tra_tag = data_all.iloc[model_knn_tag40.kneighbors(pp.T)[1][0]]
            row = np.repeat(range(40), tra_tag['num_tags'])
            col = [tag for tags in tra_tag['tags_id'] for tag in tags]
            dat = np.repeat(1, tra_tag['num_tags'].sum())
            tra_tag_sp = spr.csr_matrix((dat, (row, col)), shape=(40, n_tags))
            tra_tag_sp_T = tra_tag_sp.T.tocsr()

            tra_tim = times_sp[model_knn_times40.kneighbors(tim_test[i:(i + 1)])[1][0]]
            tra_GEN = GEN_sp[model_knn_GEN40.kneighbors(GEN_test[i:(i + 1)])[1][0]]
            tra_ART = ART_sp[model_knn_ART40.kneighbors(ART_test[i:(i + 1)])[1][0]]
            tra_title_gnr = title_gnr[model_knn_title_gnr40.kneighbors(test_title_gnr[i:(i + 1)])[1][0]]

            test_song = cosine_similarity(tra_song_sp, p.T)
            test_tag = cosine_similarity(tra_tag_sp, pp.T)

            test_tim = cosine_similarity(tra_tim, tim_test[i:(i + 1)])
            test_GEN = cosine_similarity(tra_GEN, GEN_test[i:(i + 1)])
            test_ART = cosine_similarity(tra_ART, ART_test[i:(i + 1)])
            test_title_genre = cosine_similarity(tra_title_gnr,
                                                 test_title_gnr[i:(i + 1)])

            testi = test_song * test_tag * test_title_genre * test_tim * test_GEN * test_ART

            # songs-by-neighbors matrix times neighbor similarities: songs of similar playlists get high scores
            cand_song = tra_song_sp_T.dot(testi)
            cand_song_idx = cand_song.reshape(-1).argsort()[-300:][::-1]  # top 300 songs by score

            cand_song_idx = cand_song_idx[~np.isin(cand_song_idx, songs_already)]  # drop songs already in the playlist
            cand2 = pd.DataFrame(cand_song).iloc[cand_song_idx].reset_index()

            cand_all = pd.merge(cand1, cand2, how='outer', on='index')
            cand_all = cand_all.fillna(0)
            cand_all['pred'] = (cand_all['0_x'] + cand_all['0_y']) / 2
            cand_song_idx = list(
                cand_all.sort_values(by=['pred'],
                                     ascending=False)[:100]['index'])

            ######tag######
            cand_tag = tra_tag_sp_T.dot(testi)  # same procedure for tags
            cand_tag_idx = cand_tag.reshape(-1).argsort()[-30:][::-1]

            cand_tag_idx = cand_tag_idx[~np.isin(cand_tag_idx, tags_already)][:10]
            rec_tag_idx = [tag_tid_id[i] for i in cand_tag_idx]

            res.append({
                "id": test.loc[pid, 'id'],
                "songs": cand_song_idx,
                "tags": rec_tag_idx
            })

        elif len(data['songs']) != 0:
            p = np.zeros((707989, 1))
            p[data['songs']] = 1

            tra_song = data_all.iloc[model_knn_song25.kneighbors(p.T)[1][0]]
            row = np.repeat(range(25), tra_song['num_songs'])  # repeat each neighbor index by its song count
            col = [song for songs in tra_song['songs'] for song in songs]  # flatten the neighbors' song ids
            dat = np.repeat(1, tra_song['num_songs'].sum())  # one 1 per (neighbor, song) pair
            tra_song_sp = spr.csr_matrix((dat, (row, col)), shape=(25, n_songs))  # neighbor-by-song csr_matrix
            tra_song_sp_T = tra_song_sp.T.tocsr()

            # no tags in this playlist, so build the tag matrix from the song-based neighbors instead
            row = np.repeat(range(25), tra_song['num_tags'])
            col = [tag for tags in tra_song['tags_id'] for tag in tags]
            dat = np.repeat(1, tra_song['num_tags'].sum())
            tra_tag_sp = spr.csr_matrix((dat, (row, col)), shape=(25, n_tags))
            tra_tag_sp_T = tra_tag_sp.T.tocsr()

            tra_tim = times_sp[model_knn_times25.kneighbors(tim_test[i:(i + 1)])[1][0]]
            tra_GEN = GEN_sp[model_knn_GEN25.kneighbors(GEN_test[i:(i + 1)])[1][0]]
            tra_ART = ART_sp[model_knn_ART25.kneighbors(ART_test[i:(i + 1)])[1][0]]
            tra_title_gnr = title_gnr[model_knn_title_gnr25.kneighbors(test_title_gnr[i:(i + 1)])[1][0]]

            songs_already = data["songs"]
            tags_already = data["tags_id"]

            test_song = cosine_similarity(tra_song_sp, p.T)
            test_tim = cosine_similarity(tra_tim, tim_test[i:(i + 1)])
            test_GEN = cosine_similarity(tra_GEN, GEN_test[i:(i + 1)])
            test_ART = cosine_similarity(tra_ART, ART_test[i:(i + 1)])
            test_title_genre = cosine_similarity(tra_title_gnr,
                                                 test_title_gnr[i:(i + 1)])

            testi = test_song * test_title_genre * test_tim * test_GEN * test_ART

            # songs-by-neighbors matrix times neighbor similarities: songs of similar playlists get high scores
            cand_song = tra_song_sp_T.dot(testi)
            cand_song_idx = cand_song.reshape(-1).argsort()[-300:][::-1]  # top 300 songs by score

            cand_song_idx = cand_song_idx[~np.isin(cand_song_idx, songs_already)]  # drop songs already in the playlist
            cand1 = pd.DataFrame(cand_song).iloc[cand_song_idx].reset_index()
            ####### 40 ####################################################
            tra_song = data_all.iloc[model_knn_song40.kneighbors(p.T)[1][0]]
            row = np.repeat(range(40), tra_song['num_songs'])  # repeat each neighbor index by its song count
            col = [song for songs in tra_song['songs'] for song in songs]  # flatten the neighbors' song ids
            dat = np.repeat(1, tra_song['num_songs'].sum())  # one 1 per (neighbor, song) pair
            tra_song_sp = spr.csr_matrix((dat, (row, col)), shape=(40, n_songs))  # neighbor-by-song csr_matrix
            tra_song_sp_T = tra_song_sp.T.tocsr()

            row = np.repeat(range(40), tra_song['num_tags'])
            col = [tag for tags in tra_song['tags_id'] for tag in tags]
            dat = np.repeat(1, tra_song['num_tags'].sum())
            tra_tag_sp = spr.csr_matrix((dat, (row, col)), shape=(40, n_tags))
            tra_tag_sp_T = tra_tag_sp.T.tocsr()

            tra_tim = times_sp[model_knn_times40.kneighbors(tim_test[i:(i + 1)])[1][0]]
            tra_GEN = GEN_sp[model_knn_GEN40.kneighbors(GEN_test[i:(i + 1)])[1][0]]
            tra_ART = ART_sp[model_knn_ART40.kneighbors(ART_test[i:(i + 1)])[1][0]]
            tra_title_gnr = title_gnr[model_knn_title_gnr40.kneighbors(test_title_gnr[i:(i + 1)])[1][0]]

            test_song = cosine_similarity(tra_song_sp, p.T)
            test_tim = cosine_similarity(tra_tim, tim_test[i:(i + 1)])
            test_GEN = cosine_similarity(tra_GEN, GEN_test[i:(i + 1)])
            test_ART = cosine_similarity(tra_ART, ART_test[i:(i + 1)])
            test_title_genre = cosine_similarity(tra_title_gnr,
                                                 test_title_gnr[i:(i + 1)])

            testi = test_song * test_title_genre * test_tim * test_GEN * test_ART

            # songs-by-neighbors matrix times neighbor similarities: songs of similar playlists get high scores
            cand_song = tra_song_sp_T.dot(testi)
            cand_song_idx = cand_song.reshape(-1).argsort()[-300:][::-1]  # top 300 songs by score

            cand_song_idx = cand_song_idx[~np.isin(cand_song_idx, songs_already)]  # drop songs already in the playlist
            cand2 = pd.DataFrame(cand_song).iloc[cand_song_idx].reset_index()

            cand_all = pd.merge(cand1, cand2, how='outer', on='index')
            cand_all = cand_all.fillna(0)
            cand_all['pred'] = (cand_all['0_x'] + cand_all['0_y']) / 2
            cand_song_idx = list(
                cand_all.sort_values(by=['pred'],
                                     ascending=False)[:100]['index'])

            #######tag########
            cand_tag = tra_tag_sp_T.dot(testi)  # same procedure for tags
            cand_tag_idx = cand_tag.reshape(-1).argsort()[-30:][::-1]

            cand_tag_idx = cand_tag_idx[~np.isin(cand_tag_idx, tags_already)][:10]
            rec_tag_idx = [tag_tid_id[i] for i in cand_tag_idx]

            res.append({
                "id": test.loc[pid, 'id'],
                "songs": cand_song_idx,
                "tags": rec_tag_idx
            })

        elif len(data['tags_id']) != 0:
            p = np.zeros((n_tags, 1))
            p[data['tags_id']] = 1

            tra_tag = data_all.iloc[model_knn_tag25.kneighbors(p.T)[1][0]]
            row = np.repeat(range(25), tra_tag['num_songs'])  # repeat each neighbor index by its song count
            col = [song for songs in tra_tag['songs'] for song in songs]  # flatten the neighbors' song ids
            dat = np.repeat(1, tra_tag['num_songs'].sum())  # one 1 per (neighbor, song) pair
            tra_song_sp = spr.csr_matrix((dat, (row, col)), shape=(25, n_songs))  # neighbor-by-song csr_matrix
            tra_song_sp_T = tra_song_sp.T.tocsr()

            row = np.repeat(range(25), tra_tag['num_tags'])
            col = [tag for tags in tra_tag['tags_id'] for tag in tags]
            dat = np.repeat(1, tra_tag['num_tags'].sum())
            tra_tag_sp = spr.csr_matrix((dat, (row, col)), shape=(25, n_tags))
            tra_tag_sp_T = tra_tag_sp.T.tocsr()

            songs_already = data["songs"]
            tags_already = data["tags_id"]

            testi = cosine_similarity(tra_tag_sp, p.T)  # p holds the tag one-hot in this branch (pp is not defined here)

            if len(data['plylst_title']) != 0:
                tra_title_gnr = title_tdm[model_knn_title25.kneighbors(
                    title_ts[i:(i + 1)])[1][0]]
                testi_title = cosine_similarity(tra_title_gnr,
                                                title_ts[i:(i + 1)])
                testi = testi * testi_title

            # songs-by-neighbors matrix times neighbor similarities: songs of similar playlists get high scores
            cand_song = tra_song_sp_T.dot(testi)
            cand_song_idx = cand_song.reshape(-1).argsort()[-300:][::-1]  # top 300 songs by score

            cand_song_idx = cand_song_idx[~np.isin(cand_song_idx, songs_already)][:100]  # drop known songs, keep top 100

            cand_tag = tra_tag_sp_T.dot(testi)  # same procedure for tags
            cand_tag_idx = cand_tag.reshape(-1).argsort()[-30:][::-1]

            cand_tag_idx = cand_tag_idx[~np.isin(cand_tag_idx, tags_already)][:10]
            rec_tag_idx = [tag_tid_id[i] for i in cand_tag_idx]

            res.append({
                "id": test.loc[pid, 'id'],
                "songs": list(cand_song_idx),
                "tags": rec_tag_idx
            })

        else:
            cand_song = []
            for li in data_all.iloc[model_knn_title25.kneighbors(
                    title_ts[i:(i + 1)])[1][0]].songs.to_list():
                for j in li:
                    cand_song.append(j)

            cand_tag = []
            for li in data_all.iloc[model_knn_title25.kneighbors(
                    title_ts[i:(i + 1)])[1][0]].tags.to_list():
                for j in li:
                    cand_tag.append(j)

            cand_song_idx = list(
                pd.DataFrame(cand_song)[0].value_counts()[:100].index)
            rec_tag_idx = list(
                pd.DataFrame(cand_tag)[0].value_counts()[:10].index)

            res.append({
                "id": test.loc[pid, 'id'],
                "songs": cand_song_idx,
                "tags": rec_tag_idx
            })

    for i in range(len(res)):
        if len(res[i]['songs']) != 100:
            print('songs: entry {} does not have 100 recommendations'.format(i))

        if len(res[i]['tags']) != 10:
            print('tags: entry {} does not have 10 recommendations'.format(i))

    rec = []
    for i in range(len(res)):
        rec.append({
            "id": res[i]['id'],
            "songs": list(res[i]['songs']),
            "tags": res[i]['tags']
        })

    result1 = pd.DataFrame(rec)

    model_knn_song = NearestNeighbors(metric='cosine',
                                      algorithm='brute',
                                      n_neighbors=50,
                                      n_jobs=-1)
    model_knn_tag = NearestNeighbors(metric='cosine',
                                     algorithm='brute',
                                     n_neighbors=50,
                                     n_jobs=-1)
    model_knn_title = NearestNeighbors(metric='cosine',
                                       algorithm='brute',
                                       n_neighbors=50,
                                       n_jobs=-1)
    model_knn_title_gnr = NearestNeighbors(metric='cosine',
                                           algorithm='brute',
                                           n_neighbors=50,
                                           n_jobs=-1)
    model_knn_times = NearestNeighbors(metric='cosine',
                                       algorithm='brute',
                                       n_neighbors=50,
                                       n_jobs=-1)
    model_knn_GEN = NearestNeighbors(metric='cosine',
                                     algorithm='brute',
                                     n_neighbors=50,
                                     n_jobs=-1)
    model_knn_ART = NearestNeighbors(metric='cosine',
                                     algorithm='brute',
                                     n_neighbors=50,
                                     n_jobs=-1)

    model_knn_song.fit(song_sp)
    model_knn_tag.fit(tag_sp)
    model_knn_title.fit(title_sp)
    model_knn_title_gnr.fit(title_gnr)
    model_knn_times.fit(times_sp)
    model_knn_GEN.fit(GEN_sp)
    model_knn_ART.fit(ART_sp)

    res2 = []
    for i in tqdm_notebook([1960, 6361, 8705, 9310, 10498]):
        data = test.iloc[i]
        pid = i

        if len(data['songs']) != 0 and len(data['tags_id']) != 0:
            p = np.zeros((707989, 1))
            p[data['songs']] = 1

            pp = np.zeros((n_tags, 1))
            pp[data['tags_id']] = 1

            tra_song = data_all.iloc[model_knn_song.kneighbors(p.T)[1][0]]
            row = np.repeat(range(50), tra_song['num_songs'])  # repeat each neighbor index by its song count
            col = [song for songs in tra_song['songs'] for song in songs]  # flatten the neighbors' song ids
            dat = np.repeat(1, tra_song['num_songs'].sum())  # one 1 per (neighbor, song) pair
            tra_song_sp = spr.csr_matrix((dat, (row, col)), shape=(50, n_songs))  # neighbor-by-song csr_matrix
            tra_song_sp_T = tra_song_sp.T.tocsr()

            tra_tag = data_all.iloc[model_knn_tag.kneighbors(pp.T)[1][0]]
            row = np.repeat(range(50), tra_tag['num_tags'])
            col = [tag for tags in tra_tag['tags_id'] for tag in tags]
            dat = np.repeat(1, tra_tag['num_tags'].sum())
            tra_tag_sp = spr.csr_matrix((dat, (row, col)), shape=(50, n_tags))
            tra_tag_sp_T = tra_tag_sp.T.tocsr()

            tra_tim = times_sp[model_knn_times.kneighbors(tim_test[i:(i + 1)])[1][0]]
            tra_GEN = GEN_sp[model_knn_GEN.kneighbors(GEN_test[i:(i + 1)])[1][0]]
            tra_title_gnr = title_gnr[model_knn_title_gnr.kneighbors(test_title_gnr[i:(i + 1)])[1][0]]

            songs_already = data["songs"]
            tags_already = data["tags_id"]

            test_song = cosine_similarity(tra_song_sp, p.T)
            test_tag = cosine_similarity(tra_tag_sp, pp.T)

            test_tim = cosine_similarity(tra_tim, tim_test[i:(i + 1)])
            test_GEN = cosine_similarity(tra_GEN, GEN_test[i:(i + 1)])
            test_title_genre = cosine_similarity(tra_title_gnr,
                                                 test_title_gnr[i:(i + 1)])

            testi = test_song * test_tag * test_title_genre * test_GEN

            # songs-by-neighbors matrix times neighbor similarities: songs of similar playlists get high scores
            cand_song = tra_song_sp_T.dot(testi)
            cand_song_idx = cand_song.reshape(-1).argsort()[-300:][::-1]  # top 300 songs by score

            cand_song_idx = cand_song_idx[~np.isin(cand_song_idx, songs_already)][:100]  # drop known songs, keep top 100

            cand_tag = tra_tag_sp_T.dot(testi)  # same procedure for tags
            cand_tag_idx = cand_tag.reshape(-1).argsort()[-30:][::-1]

            cand_tag_idx = cand_tag_idx[~np.isin(cand_tag_idx, tags_already)][:10]
            rec_tag_idx = [tag_tid_id[i] for i in cand_tag_idx]

            res2.append({
                "id": test.loc[pid, 'id'],
                "songs": cand_song_idx,
                "tags": rec_tag_idx
            })

        elif len(data['songs']) != 0:
            p = np.zeros((707989, 1))
            p[data['songs']] = 1

            tra_song = data_all.iloc[model_knn_song.kneighbors(p.T)[1][0]]
            row = np.repeat(range(50), tra_song['num_songs'])  # repeat each neighbor index by its song count
            col = [song for songs in tra_song['songs'] for song in songs]  # flatten the neighbors' song ids
            dat = np.repeat(1, tra_song['num_songs'].sum())  # one 1 per (neighbor, song) pair
            tra_song_sp = spr.csr_matrix((dat, (row, col)), shape=(50, n_songs))  # neighbor-by-song csr_matrix
            tra_song_sp_T = tra_song_sp.T.tocsr()

            row = np.repeat(range(50), tra_song['num_tags'])
            col = [tag for tags in tra_song['tags_id'] for tag in tags]
            dat = np.repeat(1, tra_song['num_tags'].sum())
            tra_tag_sp = spr.csr_matrix((dat, (row, col)), shape=(50, n_tags))
            tra_tag_sp_T = tra_tag_sp.T.tocsr()

            songs_already = data["songs"]
            tags_already = data["tags_id"]

            tra_tim = times_sp[model_knn_times.kneighbors(tim_test[i:(i + 1)])[1][0]]
            tra_GEN = GEN_sp[model_knn_GEN.kneighbors(GEN_test[i:(i + 1)])[1][0]]
            tra_title_gnr = title_gnr[model_knn_title_gnr.kneighbors(test_title_gnr[i:(i + 1)])[1][0]]

            test_song = cosine_similarity(tra_song_sp, p.T)

            test_tim = cosine_similarity(tra_tim, tim_test[i:(i + 1)])
            test_GEN = cosine_similarity(tra_GEN, GEN_test[i:(i + 1)])
            test_title_genre = cosine_similarity(tra_title_gnr,
                                                 test_title_gnr[i:(i + 1)])
            testi = test_song * test_title_genre * test_tim * test_GEN

            # songs-by-neighbors matrix times neighbor similarities: songs of similar playlists get high scores
            cand_song = tra_song_sp_T.dot(testi)
            cand_song_idx = cand_song.reshape(-1).argsort()[-200:][::-1]  # top 200 songs by score

            cand_song_idx = cand_song_idx[~np.isin(cand_song_idx, songs_already)][:100]  # drop known songs, keep top 100

            cand_tag = tra_tag_sp_T.dot(testi)  # same procedure for tags
            cand_tag_idx = cand_tag.reshape(-1).argsort()[-30:][::-1]

            cand_tag_idx = cand_tag_idx[~np.isin(cand_tag_idx, tags_already)][:10]
            rec_tag_idx = [tag_tid_id[i] for i in cand_tag_idx]

            res2.append({
                "id": test.loc[pid, 'id'],
                "songs": cand_song_idx,
                "tags": rec_tag_idx
            })

        elif len(data['tags_id']) != 0:
            p = np.zeros((n_tags, 1))
            p[data['tags_id']] = 1

            tra_tag = data_all.iloc[model_knn_tag.kneighbors(p.T)[1][0]]
            row = np.repeat(range(50), tra_tag['num_songs'])  # repeat each neighbor index by its song count
            col = [song for songs in tra_tag['songs'] for song in songs]  # flatten the neighbors' song ids
            dat = np.repeat(1, tra_tag['num_songs'].sum())  # one 1 per (neighbor, song) pair
            tra_song_sp = spr.csr_matrix((dat, (row, col)), shape=(50, n_songs))  # neighbor-by-song csr_matrix
            tra_song_sp_T = tra_song_sp.T.tocsr()

            row = np.repeat(range(50), tra_tag['num_tags'])
            col = [tag for tags in tra_tag['tags_id'] for tag in tags]
            dat = np.repeat(1, tra_tag['num_tags'].sum())
            tra_tag_sp = spr.csr_matrix((dat, (row, col)), shape=(50, n_tags))
            tra_tag_sp_T = tra_tag_sp.T.tocsr()

            songs_already = data["songs"]
            tags_already = data["tags_id"]

            testi = cosine_similarity(tra_tag_sp, p.T)  # p holds the tag one-hot in this branch (pp is not defined here)

            if len(data['plylst_title']) != 0:
                tra_title_gnr = title_tdm[model_knn_title.kneighbors(
                    title_ts[i:(i + 1)])[1][0]]
                testi_title = cosine_similarity(tra_title_gnr,
                                                title_ts[i:(i + 1)])
                testi = testi * testi_title

            # songs-by-neighbors matrix times neighbor similarities: songs of similar playlists get high scores
            cand_song = tra_song_sp_T.dot(testi)
            cand_song_idx = cand_song.reshape(-1).argsort()[-300:][::-1]  # top 300 songs by score

            cand_song_idx = cand_song_idx[~np.isin(cand_song_idx, songs_already)][:100]  # drop known songs, keep top 100

            cand_tag = tra_tag_sp_T.dot(testi)  # same procedure for tags
            cand_tag_idx = cand_tag.reshape(-1).argsort()[-30:][::-1]

            cand_tag_idx = cand_tag_idx[~np.isin(cand_tag_idx, tags_already)][:10]
            rec_tag_idx = [tag_tid_id[i] for i in cand_tag_idx]

            res2.append({
                "id": test.loc[pid, 'id'],
                "songs": cand_song_idx,
                "tags": rec_tag_idx
            })

        else:
            cand_song = []
            for li in data_all.iloc[model_knn_title.kneighbors(
                    title_ts[i:(i + 1)])[1][0]].songs.to_list():
                for j in li:
                    cand_song.append(j)

            cand_tag = []
            for li in data_all.iloc[model_knn_title.kneighbors(
                    title_ts[i:(i + 1)])[1][0]].tags.to_list():
                for j in li:
                    cand_tag.append(j)

            cand_song_idx = list(
                pd.DataFrame(cand_song)[0].value_counts()[:100].index)
            rec_tag_idx = list(
                pd.DataFrame(cand_tag)[0].value_counts()[:10].index)

            res2.append({
                "id": test.loc[pid, 'id'],
                "songs": cand_song_idx,
                "tags": rec_tag_idx
            })

    pd.DataFrame(res2)

    rec2 = []
    for i in range(len(res2)):
        rec2.append({
            "id": res2[i]['id'],
            "songs": list(res2[i]['songs']),
            "tags": res2[i]['tags']
        })

    result2 = pd.DataFrame(rec2)['songs']

    n_index = [10498, 6361, 1960, 8705, 9310]

    result2.index = n_index

    result1.loc[n_index, 'songs'] = result2

    result1['songs'].apply(len).sort_values()
    # Playlist 6361 is still not filled, so fall back to recommending the overall top-100 songs
    s = []
    for song in train.songs.tolist():
        s += song
    r1 = dict(Counter(s))

    r_song = sorted(r1.items(), key=lambda x: -x[1])
    r_song_top = r_song[:100]  # how many fallback songs to keep still needs tuning; 100 for now

    list_song = list(dict(r_song_top).keys())
    len(list_song)

    sub = []
    for j in range(len(result1)):
        sub.append(result1.loc[j].to_dict())

    sub[6361]['songs'] = list_song

    pd.DataFrame(sub)['songs'].apply(len).sort_values()
    write_json(sub, 'final_songs.json')
    return sub
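
The function above repeats the same neighbor-to-sparse-matrix construction for every feature and for both the 25- and 40-neighbor models. Below is a minimal sketch of that pattern; the names (neighbors_to_csr, neighbors_df, item_col, count_col) are illustrative and not part of the original code:

import numpy as np
import scipy.sparse as spr

def neighbors_to_csr(neighbors_df, item_col, count_col, n_items):
    """Build an (n_neighbors x n_items) binary csr_matrix from neighbor playlists.

    Illustrative helper: neighbors_df is a slice such as
    data_all.iloc[knn.kneighbors(query)[1][0]], item_col is 'songs' or 'tags_id',
    and count_col is the matching 'num_songs' or 'num_tags' column.
    """
    n_neighbors = len(neighbors_df)
    row = np.repeat(range(n_neighbors), neighbors_df[count_col])  # one row index per item of each neighbor
    col = [item for items in neighbors_df[item_col] for item in items]  # flatten the neighbors' item ids
    dat = np.repeat(1, neighbors_df[count_col].sum())  # binary occurrence values
    return spr.csr_matrix((dat, (row, col)), shape=(n_neighbors, n_items))

With such a helper, each branch reduces to one call per feature, e.g. neighbors_to_csr(tra_song, 'songs', 'num_songs', n_songs), and the candidate scores are still matrix.T.dot(similarities), averaged over the 25- and 40-neighbor runs exactly as in the loop above.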
示例#27
0
from sentencepiece import SentencePieceProcessor, SentencePieceTrainer


class SentencePieceTokenizer:
    def __init__(self, model_path: str = None):
        self.unk = '<unk>'
        self.pad = '<pad>'
        self.sos = '<s>'
        self.eos = '</s>'

        if model_path:
            self.load(model_path)
        else:
            self.tokenizer = None

    def tokenize(self, sent: str):
        return self.tokenizer.encode_as_pieces(sent)

    def text_to_id(self, sent: str):
        return self.tokenizer.encode_as_ids(sent)

    def id_to_text(self, idxs: list):
        return self.tokenizer.decode_ids(idxs)

    def token_to_id(self, token: str):
        return self.tokenizer.piece_to_id(token)

    def train(self,
              sent_path: str,
              model_prefix: str,
              character_coverage=0.9995,
              vocab_size=None,
              model_type: str = "bpe",
              control_symbols: list = ['<pad>']):

        if character_coverage is None and vocab_size is None:
            raise ValueError("at least character_coverage or vocab_size should be given!")

        if character_coverage is not None:
            coverage_condition = f" --character_coverage={character_coverage} "
        else:
            coverage_condition = f" --vocab_size={vocab_size} "

        symbol_list = ""
        for i in control_symbols:
            symbol_list += i + ","

        args = ("--input={} "
                "--model_prefix={} "
                "--model_type={} "
                "--control_symbols={} ".format(sent_path, model_prefix,
                                               model_type, symbol_list))

        args += coverage_condition

        SentencePieceTrainer.Train(args)

    def load(self, model_path: str):
        self.tokenizer = SentencePieceProcessor()
        self.tokenizer.Load(model_path)

    def __repr__(self):
        unk = '"{}"'.format(self.unk) if self.unk else "None"
        return "Vocab(size={}, unk={}, pad={})".format(len(self.tokenizer),
                                                       unk, self.pad)

    def __len__(self):
        return len(self.tokenizer)
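
A minimal usage sketch for the class above; "corpus.txt" and "spm_demo" are hypothetical names, not from the original:

# "corpus.txt" is any UTF-8 text file, one sentence per line; "spm_demo" is the model prefix (both hypothetical)
tokenizer = SentencePieceTokenizer()
tokenizer.train(sent_path="corpus.txt", model_prefix="spm_demo",
                character_coverage=None, vocab_size=2000)  # writes spm_demo.model and spm_demo.vocab
tokenizer.load("spm_demo.model")

pieces = tokenizer.tokenize("machine learning is fun")  # list of subword pieces
ids = tokenizer.text_to_id("machine learning is fun")   # list of subword ids
print(pieces, ids, tokenizer.id_to_text(ids))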
示例#28
0
def _encode_batch(self, texts):
    import numpy as np
    from sentencepiece import SentencePieceProcessor
    tok = SentencePieceProcessor()
    tok.Load(str(self.sp_model))
    return [np.array(tok.EncodeAsIds(t)) for t in texts]
示例#29
0
def sentencepiece_load(file):
    """Load a SentencePiece model"""
    from sentencepiece import SentencePieceProcessor
    spm = SentencePieceProcessor()
    spm.Load(str(file))
    return spm
示例#30
0
def load_sentencepiece(model_path):
  from sentencepiece import SentencePieceProcessor
  sp = SentencePieceProcessor()
  sp.Load(model_path)
  return sp
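
A small round-trip sketch using the loader above; "m.model" is a placeholder path for an already-trained model file:

sp = load_sentencepiece("m.model")  # "m.model" is a placeholder, not from the original
ids = sp.EncodeAsIds("hello world")
pieces = sp.EncodeAsPieces("hello world")
print(pieces, ids)
print(sp.DecodeIds(ids))  # decodes back to the text when all characters are covered by the model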