def load_bpe_model(path):
    from sentencepiece import SentencePieceProcessor
    spm = SentencePieceProcessor()
    # Load returns True on success, so load once and check the result
    # (the original called spm.Load(path) twice).
    if spm.Load(path):
        return spm
    raise Exception("Error loading model")

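# Hedged usage sketch for load_bpe_model; "m.model" is a hypothetical path to
# an already-trained SentencePiece model file.
spm = load_bpe_model("m.model")
pieces = spm.EncodeAsPieces("Hello world")   # e.g. ['▁Hello', '▁world']
ids = spm.EncodeAsIds("Hello world")
assert spm.DecodePieces(pieces) == "Hello world"
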
class SentencePieceTokenizer():  # TODO: pass the special tokens symbol to sp
    "SentencePiece tokenizer for `lang`"
    def __init__(self, lang='en', special_toks=None, sp_model=None, vocab_sz=None, max_vocab_sz=30000,
                 model_type='unigram', char_coverage=None, cache_dir='tmp'):
        try: from sentencepiece import SentencePieceTrainer,SentencePieceProcessor
        except ImportError:
            raise Exception('sentencepiece module is missing: run `pip install sentencepiece!=0.1.90,!=0.1.91`')
        self.sp_model,self.cache_dir = sp_model,Path(cache_dir)
        self.vocab_sz,self.max_vocab_sz,self.model_type = vocab_sz,max_vocab_sz,model_type
        self.char_coverage = ifnone(char_coverage, 0.99999 if lang in eu_langs else 0.9998)
        self.special_toks = ifnone(special_toks, defaults.text_spec_tok)
        if sp_model is None: self.tok = None
        else:
            self.tok = SentencePieceProcessor()
            self.tok.Load(str(sp_model))
        os.makedirs(self.cache_dir, exist_ok=True)

    def _get_vocab_sz(self, raw_text_path):
        # Estimate a vocab size as a quarter of the unique tokens, rounded up
        # to a multiple of 8 and capped at max_vocab_sz.
        cnt = Counter()
        with open(raw_text_path, 'r') as f:
            for line in f.readlines():
                cnt.update(line.split())
                if len(cnt)//4 > self.max_vocab_sz: return self.max_vocab_sz
        res = len(cnt)//4
        while res%8 != 0: res+=1
        return max(res,29)

    def train(self, raw_text_path):
        "Train a sentencepiece tokenizer on `texts` and save it in `path/tmp_dir`"
        from sentencepiece import SentencePieceTrainer
        vocab_sz = self._get_vocab_sz(raw_text_path) if self.vocab_sz is None else self.vocab_sz
        spec_tokens = ['\u2581'+s for s in self.special_toks]
        SentencePieceTrainer.Train(" ".join([
            f"--input={raw_text_path} --vocab_size={vocab_sz} --model_prefix={self.cache_dir/'spm'}",
            f"--character_coverage={self.char_coverage} --model_type={self.model_type}",
            f"--unk_id={len(spec_tokens)} --pad_id=-1 --bos_id=-1 --eos_id=-1 --minloglevel=2",
            f"--user_defined_symbols={','.join(spec_tokens)} --hard_vocab_limit=false"]))
        raw_text_path.unlink()
        return self.cache_dir/'spm.model'

    def setup(self, items, rules=None):
        from sentencepiece import SentencePieceProcessor
        if rules is None: rules = []
        if self.tok is not None: return {'sp_model': self.sp_model}
        raw_text_path = self.cache_dir/'texts.out'
        with open(raw_text_path, 'w') as f:
            for t in progress_bar(maps(*rules, items), total=len(items), leave=False):
                f.write(f'{t}\n')
        sp_model = self.train(raw_text_path)
        self.tok = SentencePieceProcessor()
        self.tok.Load(str(sp_model))
        return {'sp_model': sp_model}

    def __call__(self, items):
        if self.tok is None: self.setup(items)
        for t in items: yield self.tok.EncodeAsPieces(t)

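# Hedged usage sketch for the tokenizer above (assumes the fastai helpers
# `ifnone`, `defaults`, `eu_langs`, `progress_bar` and `maps` are in scope):
texts = ["first example text", "second example text"]
tok = SentencePieceTokenizer(lang='en', vocab_sz=1000, cache_dir='tmp')
tok.setup(texts)                  # trains tmp/spm.model on the texts
first = next(iter(tok(texts)))    # subword pieces of the first text
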
def main(train_path, val_path, test_path, config_path, subword_model_path, out_dir):
    params = Params.from_file(config_path)
    reader_params = params.pop("reader", default=Params({}))
    reader = DatasetReader.from_params(reader_params)

    processor = SentencePieceProcessor()
    processor.Load(subword_model_path)

    train_text_file = os.path.join(out_dir, "train.text.txt")
    train_summary_file = os.path.join(out_dir, "train.summary.txt")
    val_text_file = os.path.join(out_dir, "val.text.txt")
    val_summary_file = os.path.join(out_dir, "val.summary.txt")
    test_text_file = os.path.join(out_dir, "test.text.txt")
    test_summary_file = os.path.join(out_dir, "test.summary.txt")

    files = ((train_path, train_text_file, train_summary_file),
             (val_path, val_text_file, val_summary_file),
             (test_path, test_text_file, test_summary_file))
    for path, text_file_name, summary_file_name in files:
        with open(text_file_name, "w") as text_file, open(summary_file_name, "w") as summary_file:
            for text, summary in reader.parse_set(path):
                text_subwords = processor.EncodeAsPieces(text)
                summary_subwords = processor.EncodeAsPieces(summary)
                # Wrap both sides in <t> ... </t> sentence tags.
                text_subwords.insert(0, "<t>")
                text_subwords.append("</t>")
                summary_subwords.insert(0, "<t>")
                summary_subwords.append("</t>")
                text_file.write(" ".join(text_subwords) + "\n")
                summary_file.write(" ".join(summary_subwords) + "\n")

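# For reference, each written line is space-joined subword pieces wrapped in
# the sentence tags; the pieces below are illustrative only:
#
#   <t> ▁the ▁cat ▁sat </t>
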
from typing import Dict, List, Tuple

from sentencepiece import SentencePieceProcessor
from tqdm import tqdm, trange


class SentencePieceExtractor:
    """
    Extractor implementation for SentencePiece trained models.
    https://github.com/google/sentencepiece
    """

    def __init__(self, model: str):
        # Get SentencePiece
        self.sp = SentencePieceProcessor()
        self.sp.Load(model)

    def extract(self) -> Tuple[Dict[str, int], List[Tuple]]:
        sp = self.sp
        vocab = {sp.id_to_piece(index): index for index in trange(sp.GetPieceSize())}

        # Merges: (l, r) counts as a merge when the concatenation l+r is itself
        # a piece with a non-zero score.
        merges = []
        for piece_l in tqdm(vocab.keys(), total=sp.GetPieceSize()):
            for piece_r in vocab.keys():
                if piece_l != piece_r:
                    merge = sp.PieceToId(f"{piece_l}{piece_r}")
                    score = sp.GetScore(merge)
                    if score != 0.:
                        merges += [(piece_l, piece_r)]

        return vocab, merges

class SentencePieceTokenizer(Tokenizer, CppProcessorMixin):
    """Sentence piece tokenizer."""

    class Config(ConfigBase):
        sp_model_path: str = ""

    def __init__(self, sp_model_path: str = ""):
        self.sp_model_path = sp_model_path
        self._load_processor()

    @classmethod
    def from_config(cls, config: Config):
        return cls(config.sp_model_path)

    def tokenize(self, input_str: str) -> List[Token]:
        pieces = self.processor.EncodeAsPieces(input_str)
        tokens = []
        # Calculate start and end indices of each piece.
        end = 0
        for piece in pieces:
            original_piece = piece.lstrip("\u2581")
            start = input_str.find(original_piece, end)
            end = start + len(original_piece)
            tokens.append(Token(piece, start, end))
        return tokens

    def _load_processor(self):
        self.processor = SentencePieceProcessor()
        self.processor.Load(self.sp_model_path)

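# Illustration of the offset bookkeeping above: for input "hello world" and
# pieces ['▁hello', '▁world'], stripping the leading "\u2581" gives
# "hello"/"world", and str.find from the previous end yields
# Token('▁hello', 0, 5) and Token('▁world', 6, 11).
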
class SentencePieceTokenizer():  # TODO: pass the special tokens symbol to sp
    "SentencePiece tokenizer for `lang`"
    def __init__(self, lang='en', special_toks=None, sp_model=None, vocab_sz=None, max_vocab_sz=30000,
                 model_type='unigram', char_coverage=None, cache_dir='tmp'):
        try: from sentencepiece import SentencePieceTrainer,SentencePieceProcessor
        except ImportError:
            raise Exception('sentencepiece module is missing: run `pip install sentencepiece`')
        self.sp_model,self.cache_dir = sp_model,Path(cache_dir)
        self.vocab_sz,self.max_vocab_sz,self.model_type = vocab_sz,max_vocab_sz,model_type
        self.char_coverage = ifnone(char_coverage, 0.99999 if lang in eu_langs else 0.9998)
        self.special_toks = ifnone(special_toks, defaults.text_spec_tok)
        if sp_model is None: self.tok = None
        else:
            self.tok = SentencePieceProcessor()
            self.tok.Load(str(sp_model))
        os.makedirs(self.cache_dir, exist_ok=True)

    def _get_vocab_sz(self, raw_text_path):
        cnt = Counter()
        with open(raw_text_path, 'r') as f:
            for line in f.readlines():
                cnt.update(line.split())
                if len(cnt)//4 > self.max_vocab_sz: return self.max_vocab_sz
        res = len(cnt)//4
        while res%8 != 0: res+=1
        return res

    def train(self, raw_text_path):
        "Train a sentencepiece tokenizer on `texts` and save it in `path/tmp_dir`"

class SentencePieceExtractor:
    """
    Extractor implementation for SentencePiece trained model_files.
    https://github.com/google/sentencepiece
    """

    def __init__(self, model: str):
        requires_sentencepiece(self)
        from sentencepiece import SentencePieceProcessor

        self.sp = SentencePieceProcessor()
        self.sp.Load(model)

    def extract(self) -> Tuple[Dict[str, int], List[Tuple]]:
        sp = self.sp
        vocab = {sp.id_to_piece(index): index for index in range(sp.GetPieceSize())}

        # Merges
        merges = []
        for piece_l in vocab.keys():
            for piece_r in vocab.keys():
                merge = f"{piece_l}{piece_r}"
                piece_id = vocab.get(merge, None)
                if piece_id:
                    merges += [(piece_l, piece_r, piece_id)]
        merges = sorted(merges, key=lambda val: val[2])
        merges = [(val[0], val[1]) for val in merges]

        return vocab, merges

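# Compared with the score-based extractor earlier, this variant marks (l, r) as
# a merge whenever l+r is itself in the vocab, then orders merges by the merged
# piece's id so pieces learned earlier come first, avoiding GetScore calls
# entirely. Hedged usage sketch; "spm.model" is a hypothetical model path:
vocab, merges = SentencePieceExtractor("spm.model").extract()
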
class SentencePieceTokenizer(Tokenizer, CppProcessorMixin):
    """Sentence piece tokenizer."""

    class Config(ConfigBase):
        sp_model_path: str = ""
        max_input_text_length: Optional[int] = None
        use_fb_sentencepiece: Optional[bool] = False

    def __init__(
        self,
        sp_model_path: str = "",
        max_input_text_length: Optional[int] = None,
        use_fb_sentencepiece: Optional[bool] = None,
    ):
        self.sp_model_path = sp_model_path
        self.max_input_text_length = max_input_text_length
        self.use_fb_sentencepiece = use_fb_sentencepiece
        self._load_processor()
        log_class_usage(__class__)

    @classmethod
    def from_config(cls, config: Config):
        return cls(
            config.sp_model_path,
            config.max_input_text_length,
            config.use_fb_sentencepiece,
        )

    def tokenize(self, input_str: str) -> List[Token]:
        if (
            hasattr(self, "max_input_text_length")
            and self.max_input_text_length is not None
        ):
            input_str = input_str[: self.max_input_text_length]
        pieces = self.processor.EncodeAsPieces(input_str)
        tokens = []
        # Calculate start and end indices of each piece.
        end = 0
        for piece in pieces:
            original_piece = piece.lstrip("\u2581")
            start = input_str.find(original_piece, end)
            end = start + len(original_piece)
            tokens.append(Token(piece, start, end))
        return tokens

    def _load_processor(self):
        sp_model_path = PathManager.get_local_path(self.sp_model_path)
        if self.use_fb_sentencepiece:
            self.processor = torch.classes.fb.SentencePiece.fromFile(sp_model_path)
        else:
            from sentencepiece import SentencePieceProcessor

            self.processor = SentencePieceProcessor()
            self.processor.Load(sp_model_path)

    def torchscriptify(self):
        return ScriptDoNothingTokenizer()

def main():
    parser = ArgumentParser()
    parser.add_argument("--model", required=True, help="sentencepiece model to use for encoding")
    parser.add_argument("--inputs", nargs="+", default=["-"], help="input files to filter/encode")
    parser.add_argument("--outputs", nargs="+", default=["-"], help="path to save encoded outputs")
    parser.add_argument("--output_format", choices=["piece", "id"], default="piece")
    parser.add_argument("--min-len", type=int, metavar="N", help="filter sentence pairs with fewer than N tokens")
    parser.add_argument("--max-len", type=int, metavar="N", help="filter sentence pairs with more than N tokens")
    args = parser.parse_args()

    sp = SentencePieceProcessor()
    sp.Load(args.model)

    if args.output_format == "piece":
        def encode(l):
            return sp.EncodeAsPieces(l)
    elif args.output_format == "id":
        def encode(l):
            return list(map(str, sp.EncodeAsIds(l)))

    if args.min_len is not None or args.max_len is not None:
        def valid(line):
            return (args.min_len is None or len(line) >= args.min_len) and \
                   (args.max_len is None or len(line) <= args.max_len)
    else:
        def valid(line):
            return True

    with ExitStack() as stack:
        inputs = [
            stack.enter_context(open(input, "r", encoding="utf-8")) if input != "-" else sys.stdin
            for input in args.inputs
        ]
        outputs = [
            stack.enter_context(open(output, "w", encoding="utf-8")) if output != "-" else sys.stdout
            for output in args.outputs
        ]

        def encode_line(line):
            line = line.strip()
            if len(line) > 0:
                line = encode(line)
                if valid(line):
                    return line
            return None

        for i, lines in enumerate(zip(*inputs), start=1):
            enc_lines = list(map(encode_line, lines))
            if not any(enc_line is None for enc_line in enc_lines):
                for enc_line, output_h in zip(enc_lines, outputs):
                    print(" ".join(enc_line), file=output_h)
            if i % 10000 == 0:
                print("processed {} lines".format(i), file=sys.stderr)

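# Hedged example invocation (the script name is hypothetical):
#
#   python spm_encode.py --model spm.model \
#       --inputs train.de train.en --outputs train.sp.de train.sp.en \
#       --output_format piece --min-len 1 --max-len 250
#
# Paired lines are read in lockstep; a pair is dropped whenever either side
# fails the length filter after encoding.
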
def train_dataloader():
    config = ConveRTTrainConfig(train_batch_size=10, split_size=5)
    tokenizer = SentencePieceProcessor()
    tokenizer.Load(config.sp_model_path)
    instances = load_instances_from_reddit_dataset("data/sample-dataset.json")[:100]
    dataset = ConveRTDataset(instances, tokenizer)
    data_loader = DataLoader(dataset, batch_size=config.train_batch_size, collate_fn=convert_collate_fn)
    return data_loader

def make_title_tdm(df, path):
    # Train a SentencePiece model on the titles if one is not cached yet.
    if "{}.model".format(path) not in os.listdir():
        makeSentencepieceModel(df, path)
    sp = SentencePieceProcessor()
    sp.Load("{}.model".format(path))
    # Term-document matrix over subword pieces of the playlist titles.
    cv = CountVectorizer(max_features=3000, tokenizer=sp.encode_as_pieces)
    content = df['plylst_title']
    tdm = cv.fit_transform(content)
    title_tdm = tdm.toarray()
    return cv, title_tdm

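# Design note: CountVectorizer normally splits on a word regex; passing
# `sp.encode_as_pieces` as its tokenizer makes the term-document matrix count
# SentencePiece subwords instead, so short titles can share features without
# exact word overlap. Hedged sketch ("sp_title" is a hypothetical model prefix):
#
#   cv, title_tdm = make_title_tdm(df, "sp_title")
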
def main(train_path, val_path, test_path, config_path, subword_model_path, out_dir,
         max_text_subwords, max_summary_subwords, source_suffix, target_suffix,
         insert_tags=False, lowercase=False):
    params = Params.from_file(config_path)
    reader_params = params.pop("dataset_reader", default=Params({}))
    reader = DatasetReader.from_params(reader_params)

    processor = SentencePieceProcessor()
    processor.Load(subword_model_path)

    train_text_file = os.path.join(out_dir, "train.{}".format(source_suffix))
    train_summary_file = os.path.join(out_dir, "train.{}".format(target_suffix))
    val_text_file = os.path.join(out_dir, "val.{}".format(source_suffix))
    val_summary_file = os.path.join(out_dir, "val.{}".format(target_suffix))
    test_text_file = os.path.join(out_dir, "test.{}".format(source_suffix))
    test_summary_file = os.path.join(out_dir, "test.{}".format(target_suffix))

    files = ((train_path, train_text_file, train_summary_file),
             (val_path, val_text_file, val_summary_file),
             (test_path, test_text_file, test_summary_file))
    for path, text_file_name, summary_file_name in files:
        with open(text_file_name, "w") as text_file, open(summary_file_name, "w") as summary_file:
            for text, summary in reader.parse_set(path):
                if lowercase:
                    text = text.lower()
                    summary = summary.lower()
                text_subwords = processor.EncodeAsPieces(text)
                if max_text_subwords:
                    text_subwords = text_subwords[:max_text_subwords]
                summary_subwords = processor.EncodeAsPieces(summary)
                if max_summary_subwords:
                    summary_subwords = summary_subwords[:max_summary_subwords]
                if insert_tags:
                    text_subwords.insert(0, "<t>")
                    text_subwords.append("</t>")
                    summary_subwords.insert(0, "<t>")
                    summary_subwords.append("</t>")
                text_file.write(" ".join(text_subwords) + "\n")
                summary_file.write(" ".join(summary_subwords) + "\n")

def main(train_path, val_path, test_path, mode, subword_model_path, output_dir,
         max_source_subwords, max_target_subwords, source_suffix, target_suffix,
         lowercase=False):
    processor = SentencePieceProcessor()
    processor.Load(subword_model_path)

    os.makedirs(output_dir, exist_ok=True)
    train_source_file = os.path.join(output_dir, "train.{}".format(source_suffix))
    train_target_file = os.path.join(output_dir, "train.{}".format(target_suffix))
    val_source_file = os.path.join(output_dir, "val.{}".format(source_suffix))
    val_target_file = os.path.join(output_dir, "val.{}".format(target_suffix))
    test_source_file = os.path.join(output_dir, "test.{}".format(source_suffix))
    test_target_file = os.path.join(output_dir, "test.{}".format(target_suffix))

    parse = MODES.get(mode, None)
    assert parse is not None

    files = ((train_path, train_source_file, train_target_file),
             (val_path, val_source_file, val_target_file),
             (test_path, test_source_file, test_target_file))
    for path, source_file_name, target_file_name in files:
        with open(source_file_name, "w") as source_file, open(target_file_name, "w") as target_file:
            for record in parse(path):
                source = record["source"]
                target = record["target"]
                if lowercase:
                    source = source.lower()
                    target = target.lower()
                source_subwords = processor.EncodeAsPieces(source)
                if max_source_subwords:
                    source_subwords = source_subwords[:max_source_subwords]
                target_subwords = processor.EncodeAsPieces(target)
                if max_target_subwords:
                    target_subwords = target_subwords[:max_target_subwords]
                source_file.write(" ".join(source_subwords) + "\n")
                target_file.write(" ".join(target_subwords) + "\n")

class SentencePieceTokenizer:
    def __init__(self, spm_file, do_lower_case=True):
        self.processor = SentencePieceProcessor()
        self.processor.Load(spm_file)
        self.do_lower_case = do_lower_case

    def tokenize(self, text):
        text = preprocess_text(text, lower=self.do_lower_case)
        pieces = encode_pieces(self.processor, text, sample=False)
        return pieces

    def convert_tokens_to_ids(self, tokens):
        return [self.processor.PieceToId(piece) for piece in tokens]

    def convert_ids_to_tokens(self, ids):
        pieces = [self.processor.IdToPiece(_id) for _id in ids]
        return pieces

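# Hedged round-trip sketch; "spiece.model" is a hypothetical model file and
# preprocess_text/encode_pieces come from the surrounding module.
tok = SentencePieceTokenizer("spiece.model", do_lower_case=True)
pieces = tok.tokenize("Hello World")
ids = tok.convert_tokens_to_ids(pieces)
assert tok.convert_ids_to_tokens(ids) == pieces
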
def main() -> int:
    train_config = get_train_config()
    model_config = ConveRTModelConfig()
    logger = logger_setup(train_config.log_dir)
    device = torch.device(train_config.device if torch.cuda.is_available() else "cpu")

    tokenizer = SentencePieceProcessor()
    tokenizer.Load(train_config.sp_model_path)

    instance_load_fn = load_instances_from_reddit_dataset if train_config.is_reddit else load_instances_from_tsv_dataset
    train_instances = instance_load_fn(train_config.train_dataset_path)
    test_instances = instance_load_fn(train_config.test_dataset_path)

    train_dataset = ConveRTDataset(train_instances, tokenizer)
    test_dataset = ConveRTDataset(test_instances, tokenizer)
    train_dataloader = DataLoader(
        train_dataset, train_config.train_batch_size, collate_fn=convert_collate_fn, drop_last=True
    )
    test_dataloader = DataLoader(
        test_dataset, train_config.test_batch_size, collate_fn=convert_collate_fn, drop_last=True
    )

    model = ConveRTDualEncoder(model_config)
    criterion = ConveRTCosineLoss(split_size=train_config.split_size)

    model.to(device)
    criterion.to(device)

    if train_config.use_data_paraller and torch.cuda.is_available():
        model = nn.DataParallel(model)
        criterion = nn.DataParallel(criterion)

    trainer = ConveRTTrainer(
        model=model,
        criterion=criterion,
        train_config=train_config,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        logger=logger,
        device=device,
    )
    trainer.train()
    torch.save(trainer.model, 'final_model.pkl')
    return 0

def load_sentencepiece_tokenizer(tokenizer_path: str) -> SentencePieceProcessor:
    '''
    Loads an already pretrained sentencepiece tokenizer.

    Args:
        tokenizer_path: path to the files of the pretrained sentencepiece tokenizer.

    Returns:
        tokenizer: pretrained sentencepiece tokenizer.
    '''
    if not os.path.isfile(tokenizer_path):
        print("SentencePiece tokenizer not found!")
        sys.exit()
    tokenizer = SentencePieceProcessor()
    tokenizer.Load(tokenizer_path)
    # Enable inserting <s> and </s> tags automatically at the start/end of a sentence.
    tokenizer.set_encode_extra_options('bos:eos')
    return tokenizer

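# With 'bos:eos' enabled above, encoded id sequences gain <s>/</s> at the ends.
# Hedged sketch; "sp.model" is a hypothetical path:
tok = load_sentencepiece_tokenizer("sp.model")
ids = tok.EncodeAsIds("hello")
assert ids[0] == tok.bos_id() and ids[-1] == tok.eos_id()
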
def main(**kwargs):
    set_seed(1)
    train_config = ConveRTTrainConfig()
    model_config = ConveRTModelConfig()
    tokenizer = SentencePieceProcessor()
    args = _parse_args()
    tokenizer.Load(train_config.sp_model_path)
    train_instances = load_instances_from_reddit_json(train_config.dataset_path)
    RD = RedditData(train_instances, tokenizer, 60)
    dm = DataModule()
    train_loader = dm.train_dataloader(RD)
    model = SingleContextConvert(model_config, train_config)
    lr_decay = LearningRateDecayCallback(train_config)
    model.register_subword_params()

    trainer = pl.Trainer.from_argparse_args(args, callbacks=[lr_decay], **kwargs)
    # Optional: checkpoint_callback=checkpoint_callback, resume_from_checkpoint=...
    trainer.fit(model, train_dataloader=train_loader, val_dataloaders=train_loader)

class SentencePieceTokenizer(Tokenizer, CppProcessorMixin):
    """Sentence piece tokenizer."""

    class Config(ConfigBase):
        sp_model_path: str = ""

    def __init__(self, sp_model_path: str = ""):
        self.sp_model_path = sp_model_path
        self._load_processor()

    @classmethod
    def from_config(cls, config: Config):
        return cls(config.sp_model_path)

    def tokenize(self, input_str: str) -> List[Token]:
        pieces = self.processor.EncodeAsPieces(input_str)
        # No offset tracking here: every token gets the placeholder span (-1, -1).
        return [Token(piece, -1, -1) for piece in pieces]

    def _load_processor(self):
        self.processor = SentencePieceProcessor()
        self.processor.Load(self.sp_model_path)

class SubwordTokenizer(Tokenizer):
    def __init__(self, model_path: str = None, nbest_size: int = None, alpha: float = None):
        self._model_path = cached_path(model_path)
        self._processor = SentencePieceProcessor()
        self._processor.Load(self._model_path)
        self._nbest_size = nbest_size
        self._alpha = alpha

    def tokenize(self, text: str) -> List[Token]:
        if self._nbest_size and self._alpha:
            subwords = self._processor.SampleEncodeAsPieces(text, self._nbest_size, self._alpha)
        else:
            subwords = self._processor.EncodeAsPieces(text)
        tokens = [Token(s) for s in subwords]
        return tokens

    def batch_tokenize(self, texts: List[str]) -> List[List[Token]]:
        return [self.tokenize(text) for text in texts]

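# With nbest_size and alpha set, SampleEncodeAsPieces draws a different
# segmentation per call (subword regularization), so repeated calls on the same
# text may disagree. Hedged sketch; "spm.model" is a hypothetical path:
tok = SubwordTokenizer("spm.model", nbest_size=64, alpha=0.1)
a = tok.tokenize("internationalization")
b = tok.tokenize("internationalization")  # may differ from `a`
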
def main():
    parser = ArgumentParser()
    parser.add_argument("--model", required=True, help="sentencepiece model to use for decoding")
    parser.add_argument("--input", default="-", help="input file to decode")
    parser.add_argument("--input_format", choices=["piece", "id"], default="piece")
    args = parser.parse_args()

    sp = SentencePieceProcessor()
    sp.Load(args.model)

    if args.input_format == "piece":
        def decode(l):
            return "".join(sp.DecodePieces(l))
    elif args.input_format == "id":
        def decode(l):
            return "".join(sp.DecodeIds(l))

    def tok2int(tok):
        # remap reference-side <unk> to 0
        return int(tok) if tok != "<unk>" else 0

    if args.input == "-":
        if args.input_format == "id":
            for line in sys.stdin:
                print(decode(list(map(tok2int, line.rstrip().split()))))
        elif args.input_format == "piece":
            for line in sys.stdin:
                print(decode(line.rstrip().split()))
    else:
        with open(args.input, "r", encoding="utf-8") as h:
            if args.input_format == "id":
                for line in h:
                    print(decode(list(map(tok2int, line.rstrip().split()))))
            elif args.input_format == "piece":
                for line in h:
                    print(decode(line.rstrip().split()))

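# Hedged example invocation (the script name is hypothetical):
#
#   python spm_decode.py --model spm.model --input train.sp.en --input_format piece
#
# For id input, reference-side "<unk>" tokens are remapped to id 0 before
# DecodeIds is called.
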
class SentencePieceTokenizer:
    def __init__(self, spm_file, do_lower_case=True):
        if not os.path.exists(spm_file):
            raise ValueError(
                "Can't find spm_file \"%s\". "
                "Please pass the correct path of sentence-piece model file, "
                "e.g. `spiece.model`." % spm_file
            )
        self.processor = SentencePieceProcessor()
        self.processor.Load(spm_file)
        self.do_lower_case = do_lower_case

    def tokenize(self, text):
        text = preprocess_text(text, lower=self.do_lower_case)
        pieces = encode_pieces(self.processor, text, sample=False)
        return pieces

    def convert_tokens_to_ids(self, tokens):
        return [self.processor.PieceToId(piece) for piece in tokens]

    def convert_ids_to_tokens(self, ids):
        pieces = [self.processor.IdToPiece(_id) for _id in ids]
        return pieces

def main():
    options = parse_args()
    torch.manual_seed(options.seed)
    basename = os.path.splitext(os.path.basename(options.input))[0]
    out_dir = options.out_dir or "data/{}/".format(basename)
    spinner = Halo(spinner="dots", placement="right")

    with open(options.input, "r", encoding="utf8") as fd:
        reader = csv.reader(fd, delimiter="\t", quoting=csv.QUOTE_NONE, quotechar="")
        lines = [[line[0]] for line in reader]

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    output_full = os.path.join(out_dir, "{}.tsv".format(basename))
    with open(output_full, "w", encoding="utf8") as fd:
        writer = csv.writer(fd, delimiter="\t", quoting=csv.QUOTE_NONE, quotechar="")
        writer.writerows(lines)

    vocab_size = 32000
    spiece_out = os.path.join(out_dir, "spiece")
    spiece_args = (
        "--input={} "
        "--model_prefix={} "
        "--vocab_size={} "
        "--character_coverage=1.0"
    ).format(output_full, spiece_out, vocab_size)
    SentencePieceTrainer.Train(spiece_args)

    # Load the generated vocabulary
    with open("{}.vocab".format(spiece_out), "r", encoding="utf8") as fd:
        reader = csv.reader(fd, delimiter="\t", quoting=csv.QUOTE_NONE, quotechar="")
        vocab = [line[0] for line in reader]
    # Remove the special tokens <unk>, <s>, </s>
    vocab = vocab[3:]

    # Convert to BERT style
    bert_vocab = [v[1:] if v.startswith("▁") else "##{}".format(v) for v in vocab if v != "▁"]
    # Add BERT's special tokens to the beginning
    bert_vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"] + bert_vocab
    # Fill up with unused tokens
    pad_size = vocab_size - len(bert_vocab)
    bert_vocab += ["unused{}".format(i) for i in range(pad_size)]
    with open(os.path.join(out_dir, "vocab.txt"), "w", encoding="utf8") as fd:
        writer = csv.writer(fd, delimiter="\t", quoting=csv.QUOTE_NONE, quotechar="")
        writer.writerows([[b] for b in bert_vocab])

    # Convert to GPT-2 style
    # Unfortunately it's slow and tedious.
    spinner.start(text="Generating BPE vocabulary")
    gpt2_vocab = ["Ġ{}".format(v[1:]) if v.startswith("▁") else v for v in vocab]
    # Add the GPT-2 special token to the end
    gpt2_vocab.append("<|endoftext|>")
    with open(os.path.join(out_dir, "vocab.json"), "w", encoding="utf8") as fd:
        json.dump({v: i for i, v in enumerate(gpt2_vocab)}, fd, ensure_ascii=False)

    spiece_processor = SentencePieceProcessor()
    spiece_processor.Load("{}.model".format(spiece_out))

    # Encode the whole text
    encoded = [
        [" ".join(spiece_processor.EncodeAsPieces(line[0])).replace("▁", "Ġ")]
        for line in lines
    ]

    tmp_encoded_fd, tmp_encoded_path = tempfile.mkstemp()
    tmp_bpe_fd, tmp_bpe_path = tempfile.mkstemp()
    try:
        # Write the encoded text to a temporary file.
        with os.fdopen(tmp_encoded_fd, "w", encoding="utf8") as fd:
            writer = csv.writer(fd, delimiter="\t", quoting=csv.QUOTE_NONE, quotechar="")
            writer.writerows(encoded)

        learn_bpe(
            open(tmp_encoded_path, "r", encoding="utf8"),
            open(tmp_bpe_path, "w", encoding="utf8"),
            num_symbols=vocab_size,
        )

        with open(tmp_bpe_path, "r", encoding="utf8") as fd:
            reader = csv.reader(fd, delimiter="\t", quoting=csv.QUOTE_NONE, quotechar="")
            seen = set()
            merges = []
            for line in reader:
                # Get rid of the </w> tokens
                line = line[0].replace("</w>", "")
                # Remove duplicates (due to </w> tokens)
                if line not in seen:
                    seen.add(line)
                    merges.append([line])

        with open(os.path.join(out_dir, "merges.txt"), "w", encoding="utf8") as fd:
            writer = csv.writer(fd, delimiter="\t", quoting=csv.QUOTE_NONE, quotechar="")
            writer.writerows(merges)
    finally:
        os.remove(tmp_encoded_path)
        os.remove(tmp_bpe_path)

    spinner.stop()

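# The piece-notation conversions above on an illustrative example:
# SentencePiece marks a word start with "▁", BERT marks word continuations
# with "##", and GPT-2 marks a preceding space with "Ġ":
#
#   SentencePiece: ▁un   believ    able
#   BERT:          un    ##believ  ##able
#   GPT-2:         Ġun   believ    able
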
def main(input_dir, subword_model_path, output_dir, max_source_subwords, max_target_subwords,
         source_suffix, target_suffix, lowercase=False):
    processor = SentencePieceProcessor()
    processor.Load(subword_model_path)

    os.makedirs(output_dir, exist_ok=True)
    train_source_file = os.path.join(output_dir, "train.{}".format(source_suffix))
    train_target_file = os.path.join(output_dir, "train.{}".format(target_suffix))
    val_source_file = os.path.join(output_dir, "val.{}".format(source_suffix))
    val_target_file = os.path.join(output_dir, "val.{}".format(target_suffix))
    test_source_file = os.path.join(output_dir, "test.{}".format(source_suffix))
    test_target_file = os.path.join(output_dir, "test.{}".format(target_suffix))

    dirs = list(os.listdir(input_dir))
    tasks = []
    for d in dirs:
        if d.startswith("_"):
            continue
        mode = d.lower()
        parse = MODES.get(mode, None)
        assert parse is not None
        tasks.append((os.path.join(input_dir, d), mode, parse))

    files = (("train.jsonl", train_source_file, train_target_file),
             ("val.jsonl", val_source_file, val_target_file),
             ("test.jsonl", test_source_file, test_target_file))
    for orig_file_name, source_file_name, target_file_name in files:
        records = []
        for d, mode, parse in tasks:
            # LiDiRuS only contributes to the test split.
            if orig_file_name != "test.jsonl" and mode == "lidirus":
                continue
            elif orig_file_name == "test.jsonl" and mode == "lidirus":
                path = os.path.join(d, "LiDiRuS.jsonl")
            else:
                path = os.path.join(d, orig_file_name)
            for record in parse(path):
                source = mode + SEPARATOR + str(record["idx"]) + SEPARATOR + record["source"]
                target = record["target"]
                if lowercase:
                    source = source.lower()
                    target = target.lower()
                source_subwords = processor.EncodeAsPieces(source)
                if max_source_subwords:
                    source_subwords = source_subwords[:max_source_subwords]
                target_subwords = processor.EncodeAsPieces(target)
                if max_target_subwords:
                    target_subwords = target_subwords[:max_target_subwords]
                source = " ".join(source_subwords)
                target = " ".join(target_subwords)
                records.append((source, target))
        random.shuffle(records)
        with open(source_file_name, "w") as source_file, open(target_file_name, "w") as target_file:
            for source, target in records:
                source_file.write(source + "\n")
                target_file.write(target + "\n")

def tokenizer() -> SentencePieceProcessor:
    config = ConveRTTrainConfig()
    tokenizer = SentencePieceProcessor()
    tokenizer.Load(config.sp_model_path)
    return tokenizer

class SentencepieceFasttextEmbed(EmbedderInterface):
    class Config(EmbedderInterface.Config):
        pass

    @classmethod
    def from_config(cls, config: Config):
        spm_model_file = os.path.join(config.preproc_dir, "spm.model")
        fasttext_model_file = os.path.join(config.preproc_dir, "fasttext-model.bin")
        return cls(spm_model_file, fasttext_model_file, config.max_pieces)

    def __init__(self, spm_model_file: str, fasttext_model_file: str = '', max_pieces: int = -1):
        super().__init__(max_pieces=max_pieces)
        self.spm = SentencePieceProcessor()
        self.spm.Load(spm_model_file)
        self.pad_idx = self.spm.pad_id()
        self.pad_token = self.spm.IdToPiece(self.pad_idx)
        self.unk_idx = self.spm.unk_id()
        self.unk_token = self.spm.IdToPiece(self.unk_idx)
        self.bos_idx = self.spm.bos_id()
        self.bos_token = self.spm.IdToPiece(self.bos_idx)
        self.eos_idx = self.spm.eos_id()
        self.eos_token = self.spm.IdToPiece(self.eos_idx)
        if fasttext_model_file:
            self.fasttext = fasttext.load_model(fasttext_model_file)

    @property
    def embed_dim(self):
        return self.fasttext.dim

    @property
    def n_vocab(self):
        return self.spm.get_piece_size()

    def encode_text_as_ids(self, text: str) -> np.array:
        """
        Doesn't produce BOS, EOS ids.
        """
        return np.asarray(self.spm.EncodeAsIds(text)[self.pieces_slice], dtype=np.int32)

    def encode_text_as_tokens(self, text: str) -> List[str]:
        """
        Doesn't produce BOS, EOS tokens.
        """
        return self.spm.EncodeAsPieces(text)[self.pieces_slice]

    def tokenize(self, text: str) -> List[str]:
        """
        Alias for `encode_text_as_tokens`. Doesn't produce BOS, EOS tokens.
        """
        return self.encode_text_as_tokens(text)[self.pieces_slice]

    def decode_ids_as_text(self, ids: List[int], strip_special=True) -> str:
        """
        Doesn't produce PAD, BOS, or EOS text, i.e. PAD, BOS, EOS ids are
        stripped out before decoding. UNK is decoded but unintelligible.
        """
        if strip_special:
            ids = [
                int(id) for id in ids
                if id not in (self.pad_idx, self.bos_idx, self.eos_idx)
            ]
        else:
            ids = [int(id) for id in ids]
        return self.spm.DecodeIds(ids)

    def decode_tokens_as_text(self, toks: List[str]) -> str:
        """
        Doesn't produce PAD, BOS, or EOS text, i.e. PAD, BOS, EOS tokens are
        stripped out before decoding. UNK is decoded but unintelligible.
        """
        return self.spm.DecodePieces(toks[self.pieces_slice])

    @functools.lru_cache(maxsize=1024)
    def decode_id_as_token(self, id: int) -> str:
        return self.spm.IdToPiece(id)

    def decode_ids_as_tokens(self, ids: List[int], strip_special: bool = True) -> List[str]:
        """
        By default, doesn't produce PAD, BOS, EOS tokens.

        Avoids the problematic intermediate string representation that causes a
        length mismatch; in other words, SentencePiece isn't isomorphic with
        respect to the string representation.
        """
        if strip_special:
            ids = [
                id for id in ids
                if id not in (self.pad_idx, self.bos_idx, self.eos_idx)
            ]
        return [self.decode_id_as_token(int(ix)) for ix in ids]

    @functools.lru_cache(maxsize=1024)
    def embed_tok(self, tok: str) -> np.array:
        """
        When given PAD, returns all zeros.
        """
        if tok == self.pad_token:
            return np.zeros(self.fasttext.dim)
        return np.asarray(self.fasttext[tok])

    def embed_text(self, text: str) -> np.array:
        """
        Doesn't produce PAD, BOS, EOS embeddings, i.e. PAD, BOS, EOS are
        stripped out during tokenization before embedding.
        """
        return np.asarray([self.embed_tok(tok) for tok in self.tokenize(text)])

    def embed_ids(self, ids: List[int], strip_special: bool = True) -> List[np.array]:
        """
        By default, doesn't produce PAD, BOS, EOS tokens.

        Avoids the problematic intermediate string representation that causes a
        length mismatch; in other words, SentencePiece isn't isomorphic with
        respect to the string representation.
        """
        return [
            self.embed_tok(t)
            for t in self.decode_ids_as_tokens(ids, strip_special=strip_special)
        ]

    def embed_ids_batch(self, ids: np.array) -> torch.tensor:
        emb = [self.embed_ids(turn, strip_special=False) for turn in ids]
        emb = torch.tensor(emb)
        return emb

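# Hedged usage sketch; the paths are hypothetical and point at a preproc dir
# holding spm.model and fasttext-model.bin.
emb = SentencepieceFasttextEmbed("preproc/spm.model", "preproc/fasttext-model.bin")
ids = emb.encode_text_as_ids("hello world")
vectors = emb.embed_ids(ids)             # one fastText vector per subword piece
matrix = emb.embed_text("hello world")   # same vectors, straight from text
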
def song_inference(): sp_total_model_path = "sp_total" train = pd.read_json('./dataset/train.json', typ='frame', encoding='utf-8') song = pd.read_json('./dataset/song_meta.json', typ='frame', encoding='utf-8') plylst_tag = train['tags'] tag_counter = Counter([tg for tgs in plylst_tag for tg in tgs]) tag_dict = {x: tag_counter[x] for x in tag_counter} tag_id_tid = dict() tag_tid_id = dict() for i, t in enumerate(tag_dict): tag_id_tid[t] = i tag_tid_id[i] = t n_tags = len(tag_dict) plylst_song = train['songs'] song_dict = {x: x for x in song['id']} n_songs = len(song_dict) train['tags_id'] = train['tags'].map( lambda x: [tag_id_tid.get(t) for t in x if tag_id_tid.get(t) != None]) # song genre 내용 가져오기. song_cate = [] for i in range(len(train)): gnr = [] songs = train.iloc[i, 3] for j in songs: for k in song.loc[j, 'song_gn_dtl_gnr_basket']: gnr.append(k) song_cate.append(gnr) train['plylst_genre'] = song_cate plylst_genre = train['plylst_genre'] genre_counter = Counter([gen for genre in plylst_genre for gen in genre]) genre_dict = {x: genre_counter[x] for x in genre_counter} genre_id_tid = dict() genre_tid_id = dict() for i, t in enumerate(genre_dict): genre_id_tid[t] = i genre_tid_id[i] = t n_genre = len(genre_dict) train['plylst_genre_id'] = train['plylst_genre'].map( lambda x: [genre_id_tid.get(s) for s in x if genre_id_tid.get(s) != None]) gnr_array = np.zeros((len(train), n_genre)) for i, index in enumerate(train.index): if i % 10000 == 0: print(i) counter = Counter(train.loc[index]['plylst_genre_id']) for (k, c) in counter.items(): gnr_array[i][k] = c gnr_array.shape song['issue_date'] = song['issue_date'].astype('str').map(lambda x: x[:6]) plylst_use = train[['plylst_title', 'updt_date', 'tags_id', 'songs']] plylst_use.loc[:, 'num_songs'] = plylst_use['songs'].map(len) plylst_use.loc[:, 'num_tags'] = plylst_use['tags_id'].map(len) plylst_train = plylst_use n_train = len(plylst_train) row = np.repeat(range(n_train), plylst_train['num_songs']) # User Index 별 노래 개수만큼 만듦 col = [song for songs in plylst_train['songs'] for song in songs] # Song dic number 추출 dat = np.repeat(1, plylst_train['num_songs'].sum() ) # User별 Song이 있는 부분에 1을 넣기위해 1과 전체 노래 개수만큼 만듦 train_user_songs_A = spr.csr_matrix( (dat, (row, col)), shape=(n_train, n_songs)) # csr_matrix 제작 row = np.repeat(range(n_train), plylst_train['num_tags']) col = [tag for tags in plylst_train['tags_id'] for tag in tags] dat = np.repeat(1, plylst_train['num_tags'].sum()) train_user_tags_A = spr.csr_matrix((dat, (row, col)), shape=(n_train, n_tags)) train_user_songs_A_T = train_user_songs_A.T.tocsr() train_user_songs_A_T # 행에는 노래 columns에는 User 정보 삽입 train_user_tags_A_T = train_user_tags_A.T.tocsr() train_user_tags_A_T # 행에는 Tangs columns에는 User 정보 삽입 val = pd.read_json('./dataset/val.json', typ='frame', encoding='utf-8') song_cate = [] for i in range(len(val)): gnr = [] songs = val.iloc[i, 3] for j in songs: for k in song.loc[j, 'song_gn_dtl_gnr_basket']: gnr.append(k) song_cate.append(gnr) val['plylst_genre'] = song_cate val['tags_id'] = val['tags'].map( lambda x: [tag_id_tid.get(t) for t in x if tag_id_tid.get(t) != None]) val['plylst_genre_id'] = val['plylst_genre'].map( lambda x: [genre_id_tid.get(s) for s in x if genre_id_tid.get(s) != None]) val.loc[:, 'num_songs'] = val['songs'].map(len) val.loc[:, 'num_tags'] = val['tags_id'].map(len) # val_title = cv.transform(val['plylst_title']).toarray() gnr_val = np.zeros((len(val), n_genre)) for i, index in enumerate(val.index): if i % 10000 == 0: print(i) counter = 
Counter(val.loc[index]['plylst_genre_id']) for (k, c) in counter.items(): gnr_val[i][k] = c gnr_val.shape n_val = len(val) row = np.repeat(range(n_val), val['num_songs']) # User Index 별 노래 개수만큼 만듦 col = [song for songs in val['songs'] for song in songs] # Song dic number 추출 dat = np.repeat( 1, val['num_songs'].sum()) # User별 Song이 있는 부분에 1을 넣기위해 1과 전체 노래 개수만큼 만듦 val_user_songs_A = spr.csr_matrix((dat, (row, col)), shape=(n_val, n_songs)) # csr_matrix 제작 row = np.repeat(range(n_val), val['num_tags']) col = [tag for tags in val['tags_id'] for tag in tags] dat = np.repeat(1, val['num_tags'].sum()) val_user_tags_A = spr.csr_matrix((dat, (row, col)), shape=(n_val, n_tags)) val_user_songs_A_T = val_user_songs_A.T.tocsr() val_user_tags_A_T = val_user_tags_A.T.tocsr() test = pd.read_json('./dataset/test.json', typ='frame', encoding='utf-8') song_cate = [] for i in range(len(test)): gnr = [] songs = test.iloc[i, 3] for j in songs: for k in song.loc[j, 'song_gn_dtl_gnr_basket']: gnr.append(k) song_cate.append(gnr) test['plylst_genre'] = song_cate test['tags_id'] = test['tags'].map( lambda x: [tag_id_tid.get(t) for t in x if tag_id_tid.get(t) != None]) test['plylst_genre_id'] = test['plylst_genre'].map( lambda x: [genre_id_tid.get(s) for s in x if genre_id_tid.get(s) != None]) test.loc[:, 'num_songs'] = test['songs'].map(len) test.loc[:, 'num_tags'] = test['tags_id'].map(len) # test_title = cv.transform(test['plylst_title']).toarray() gnr_test = np.zeros((len(test), n_genre)) for i, index in enumerate(test.index): if i % 10000 == 0: print(i) counter = Counter(test.loc[index]['plylst_genre_id']) for (k, c) in counter.items(): gnr_test[i][k] = c gnr_test.shape n_test = len(test) row = np.repeat(range(n_test), test['num_songs']) # User Index 별 노래 개수만큼 만듦 col = [song for songs in test['songs'] for song in songs] # Song dic number 추출 dat = np.repeat( 1, test['num_songs'].sum()) # User별 Song이 있는 부분에 1을 넣기위해 1과 전체 노래 개수만큼 만듦 test_user_songs_A = spr.csr_matrix( (dat, (row, col)), shape=(n_test, n_songs)) # csr_matrix 제작 row = np.repeat(range(n_test), test['num_tags']) col = [tag for tags in test['tags_id'] for tag in tags] dat = np.repeat(1, test['num_tags'].sum()) test_user_tags_A = spr.csr_matrix((dat, (row, col)), shape=(n_test, n_tags)) test_user_songs_A_T = test_user_songs_A.T.tocsr() test_user_tags_A_T = test_user_tags_A.T.tocsr() data_all = pd.concat([train, val, test]) data_all.index = range(len(data_all)) arts = song['artist_id_basket'].map(lambda x: x[0]) arts = pd.DataFrame(arts) art_counts = arts['artist_id_basket'].value_counts().reset_index() art_counts.columns = ['artist_id_basket', 'counts'] arts2 = pd.merge(arts, art_counts, how='left', on=['artist_id_basket']) song_art = song.iloc[arts2.query('counts >= 12')['artist_id_basket'].index] song_art = song_art[['artist_id_basket']] #아티스트 대분류 ART_cate = [] for i in tqdm_notebook(range(len(data_all))): ART = [] songs = data_all.loc[i, 'songs'] for j in songs: if j in song_art.index: for k in song_art.loc[j, 'artist_id_basket']: ART.append(k) ART_cate.append(ART) data_all['plylst_ARTIST'] = ART_cate plylst_ARTIST = data_all['plylst_ARTIST'] ARTIST_counter = Counter( [ART for ARTIST in plylst_ARTIST for ART in ARTIST]) ARTIST_dict = {x: ARTIST_counter[x] for x in ARTIST_counter} ARTIST_id_tid = dict() ARTIST_tid_id = dict() for i, t in enumerate(ARTIST_dict): ARTIST_id_tid[t] = i ARTIST_tid_id[i] = t n_ARTIST = len(ARTIST_dict) data_all['plylst_ARTIST_id'] = data_all['plylst_ARTIST'].map( lambda x: [ARTIST_id_tid.get(s) for s in x if 
ARTIST_id_tid.get(s) != None]) ART_data_all = np.zeros((len(data_all), n_ARTIST)) for i, index in enumerate(data_all.index): if i % 10000 == 0: print(i) counter = Counter(data_all.loc[index]['plylst_ARTIST_id']) for (k, c) in counter.items(): ART_data_all[i][k] = c ART_data_all.shape ART_array = ART_data_all[:len(train)] ART_val = ART_data_all[len(train):len(train) + len(val)] ART_test = ART_data_all[len(train) + len(val):len(train) + len(val) + len(test)] # ART_data_all = sparse.csr_matrix(ART_data_all) del ART_data_all ART_array = sparse.csr_matrix(ART_array) ART_val = sparse.csr_matrix(ART_val) ART_test = sparse.csr_matrix(ART_test) # song tim 내용 가져오기. tim_cate = [] for i in tqdm_notebook(range(len(data_all))): tim = [] songs = data_all.loc[i, 'songs'] for j in songs: tim.append(song.loc[j, 'issue_date']) tim_cate.append(tim) data_all['plylst_times'] = tim_cate plylst_times = data_all['plylst_times'] times_counter = Counter([tim for times in plylst_times for tim in times]) times_dict = {x: times_counter[x] for x in times_counter} times_id_tid = dict() times_tid_id = dict() for i, t in enumerate(times_dict): times_id_tid[t] = i times_tid_id[i] = t n_times = len(times_dict) data_all['plylst_times_id'] = data_all['plylst_times'].map( lambda x: [times_id_tid.get(s) for s in x if times_id_tid.get(s) != None]) tim_data_all = np.zeros((len(data_all), n_times)) for i, index in enumerate(data_all.index): if i % 10000 == 0: print(i) counter = Counter(data_all.loc[index]['plylst_times_id']) for (k, c) in counter.items(): tim_data_all[i][k] = c tim_array = tim_data_all[:len(train)] tim_val = tim_data_all[len(train):len(train) + len(val)] tim_test = tim_data_all[len(train) + len(val):len(train) + len(val) + len(test)] # tim_data_all = sparse.csr_matrix(tim_data_all) del tim_data_all tim_array = sparse.csr_matrix(tim_array) tim_val = sparse.csr_matrix(tim_val) tim_test = sparse.csr_matrix(tim_test) #장르 대분류 GEN_cate = [] for i in tqdm_notebook(range(len(data_all))): GEN = [] songs = data_all.loc[i, 'songs'] for j in songs: for k in song.loc[j, 'song_gn_gnr_basket']: GEN.append(k) GEN_cate.append(GEN) data_all['plylst_GENRE'] = GEN_cate plylst_GENRE = data_all['plylst_GENRE'] GENRE_counter = Counter([GEN for GENRE in plylst_GENRE for GEN in GENRE]) GENRE_dict = {x: GENRE_counter[x] for x in GENRE_counter} GENRE_id_tid = dict() GENRE_tid_id = dict() for i, t in enumerate(GENRE_dict): GENRE_id_tid[t] = i GENRE_tid_id[i] = t n_GENRE = len(GENRE_dict) data_all['plylst_GENRE_id'] = data_all['plylst_GENRE'].map( lambda x: [GENRE_id_tid.get(s) for s in x if GENRE_id_tid.get(s) != None]) GEN_data_all = np.zeros((len(data_all), n_GENRE)) for i, index in enumerate(data_all.index): if i % 10000 == 0: print(i) counter = Counter(data_all.loc[index]['plylst_GENRE_id']) for (k, c) in counter.items(): GEN_data_all[i][k] = c GEN_array = GEN_data_all[:len(train)] GEN_val = GEN_data_all[len(train):len(train) + len(val)] GEN_test = GEN_data_all[len(train) + len(val):len(train) + len(val) + len(test)] # GEN_data_all = sparse.csr_matrix(GEN_data_all) del GEN_data_all GEN_array = sparse.csr_matrix(GEN_array) GEN_val = sparse.csr_matrix(GEN_val) GEN_test = sparse.csr_matrix(GEN_test) content = data_all['plylst_title'] if "{}.model".format(sp_total_model_path) not in os.listdir(): makeSentencepieceModel(data_all, sp_total_model_path) sp = SentencePieceProcessor() sp.Load("{}.model".format(sp_total_model_path)) cv = CountVectorizer(max_features=3000, tokenizer=sp.encode_as_pieces) content = data_all['plylst_title'] tdm = 
cv.fit_transform(content) title_tdm = tdm.toarray() title_tr = title_tdm[:len(train)] title_va = title_tdm[len(train):len(train) + len(val)] title_ts = title_tdm[len(train) + len(val):len(train) + len(val) + len(test)] title_gnr = np.concatenate((gnr_array, title_tr), axis=1) val_title_gnr = np.concatenate((gnr_val, title_va), axis=1) test_title_gnr = np.concatenate((gnr_test, title_ts), axis=1) title_sp = sparse.csr_matrix(title_tdm) title_gnr = sparse.csr_matrix(title_gnr) val_title_gnr = sparse.csr_matrix(val_title_gnr) test_title_gnr = sparse.csr_matrix(test_title_gnr) title_gnr = vstack([title_gnr, val_title_gnr, test_title_gnr]) song_sp = vstack([train_user_songs_A, val_user_songs_A, test_user_songs_A]) tag_sp = vstack([train_user_tags_A, val_user_tags_A, test_user_tags_A]) times_sp = vstack([tim_array, tim_val, tim_test]) GEN_sp = vstack([GEN_array, GEN_val, GEN_test]) ART_sp = vstack([ART_array, ART_val, ART_test]) # song_sp_T = song_sp.T.tocsr() # tag_sp_T = tag_sp.T.tocsr() model_knn_song25 = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=25, n_jobs=-1) model_knn_tag25 = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=25, n_jobs=-1) model_knn_title25 = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=25, n_jobs=-1) model_knn_title_gnr25 = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=25, n_jobs=-1) model_knn_times25 = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=25, n_jobs=-1) model_knn_GEN25 = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=25, n_jobs=-1) model_knn_ART25 = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=25, n_jobs=-1) model_knn_song40 = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=40, n_jobs=-1) model_knn_tag40 = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=40, n_jobs=-1) model_knn_title40 = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=40, n_jobs=-1) model_knn_title_gnr40 = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=40, n_jobs=-1) model_knn_times40 = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=40, n_jobs=-1) model_knn_GEN40 = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=40, n_jobs=-1) model_knn_ART40 = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=40, n_jobs=-1) model_knn_song25.fit(song_sp) model_knn_tag25.fit(tag_sp) model_knn_title25.fit(title_sp) model_knn_title_gnr25.fit(title_gnr) model_knn_times25.fit(times_sp) model_knn_GEN25.fit(GEN_sp) model_knn_ART25.fit(ART_sp) model_knn_song40.fit(song_sp) model_knn_tag40.fit(tag_sp) model_knn_title40.fit(title_sp) model_knn_title_gnr40.fit(title_gnr) model_knn_times40.fit(times_sp) model_knn_GEN40.fit(GEN_sp) model_knn_ART40.fit(ART_sp) train.loc[:, 'num_songs'] = train['songs'].map(len) train.loc[:, 'num_tags'] = train['tags_id'].map(len) data_all = pd.concat([train, val, test]) data_all.index = range(len(data_all)) res = [] for i in tqdm_notebook(range(len(test))): data = test.iloc[i] pid = i if len(data['songs']) >= 2 and len(data['tags_id']) >= 2: p = np.zeros((707989, 1)) p[data['songs']] = 1 pp = np.zeros((n_tags, 1)) pp[data['tags_id']] = 1 tra_song = data_all.iloc[model_knn_song25.kneighbors(p.T)[1][0]] row = np.repeat(range(25), tra_song['num_songs']) # User Index 별 노래 개수만큼 만듦 col = [song for songs in tra_song['songs'] for song in songs] # Song dic number 추출 dat = np.repeat(1, tra_song['num_songs'].sum() ) # User별 Song이 있는 부분에 1을 넣기위해 1과 전체 노래 
개수만큼 만듦 tra_song_sp = spr.csr_matrix((dat, (row, col)), shape=(25, n_songs)) # csr_matrix 제작 tra_song_sp_T = tra_song_sp.T.tocsr() tra_tag = data_all.iloc[model_knn_tag25.kneighbors(pp.T)[1][0]] row = np.repeat(range(25), tra_tag['num_tags']) col = [tag for tags in tra_tag['tags_id'] for tag in tags] dat = np.repeat(1, tra_tag['num_tags'].sum()) tra_tag_sp = spr.csr_matrix((dat, (row, col)), shape=(25, n_tags)) tra_tag_sp_T = tra_tag_sp.T.tocsr() tra_tim = times_sp[model_knn_times25.kneighbors( tim_test[i:(i + 1)])[1][0]] tra_GEN = GEN_sp[model_knn_GEN25.kneighbors(GEN_test[i:(i + 1)])[1][0]] tra_ART = ART_sp[model_knn_ART25.kneighbors(ART_test[i:(i + 1)])[1][0]] tra_title_gnr = title_gnr[model_knn_title_gnr25.kneighbors( test_title_gnr[i:(i + 1)])[1][0]] songs_already = data["songs"] tags_already = data["tags_id"] test_song = cosine_similarity(tra_song_sp, p.T) test_tag = cosine_similarity(tra_tag_sp, pp.T) test_tim = cosine_similarity(tra_tim, tim_test[i:(i + 1)]) test_GEN = cosine_similarity(tra_GEN, GEN_test[i:(i + 1)]) test_ART = cosine_similarity(tra_ART, ART_test[i:(i + 1)]) test_title_genre = cosine_similarity(tra_title_gnr, test_title_gnr[i:(i + 1)]) testi = test_song * test_tag * test_title_genre * test_tim * test_GEN * test_ART cand_song = tra_song_sp_T.dot( testi) # 행에는 노래 열에는 유저 정보 %*% 유사한 유저 -> 유사한 노래에 대하여 높은 값 나옴 cand_song_idx = cand_song.reshape( -1).argsort()[-300:][::-1] # 값이 높은 상위 120개 노래 추출 cand_song_idx = cand_song_idx[np.isin( cand_song_idx, songs_already) == False] # 중복제거 cand1 = pd.DataFrame(cand_song).iloc[cand_song_idx].reset_index() ####### 40 #################################################### tra_song = data_all.iloc[model_knn_song40.kneighbors(p.T)[1][0]] row = np.repeat(range(40), tra_song['num_songs']) # User Index 별 노래 개수만큼 만듦 col = [song for songs in tra_song['songs'] for song in songs] # Song dic number 추출 dat = np.repeat(1, tra_song['num_songs'].sum() ) # User별 Song이 있는 부분에 1을 넣기위해 1과 전체 노래 개수만큼 만듦 tra_song_sp = spr.csr_matrix((dat, (row, col)), shape=(40, n_songs)) # csr_matrix 제작 tra_song_sp_T = tra_song_sp.T.tocsr() tra_tag = data_all.iloc[model_knn_tag40.kneighbors(pp.T)[1][0]] row = np.repeat(range(40), tra_tag['num_tags']) col = [tag for tags in tra_tag['tags_id'] for tag in tags] dat = np.repeat(1, tra_tag['num_tags'].sum()) tra_tag_sp = spr.csr_matrix((dat, (row, col)), shape=(40, n_tags)) tra_tag_sp_T = tra_tag_sp.T.tocsr() tra_tim = times_sp[model_knn_times40.kneighbors( tim_test[i:(i + 1)])[1][0]] tra_GEN = GEN_sp[model_knn_GEN40.kneighbors(GEN_test[i:(i + 1)])[1][0]] tra_ART = ART_sp[model_knn_ART40.kneighbors(ART_test[i:(i + 1)])[1][0]] tra_title_gnr = title_gnr[model_knn_title_gnr40.kneighbors( test_title_gnr[i:(i + 1)])[1][0]] test_song = cosine_similarity(tra_song_sp, p.T) test_tag = cosine_similarity(tra_tag_sp, pp.T) test_tim = cosine_similarity(tra_tim, tim_test[i:(i + 1)]) test_GEN = cosine_similarity(tra_GEN, GEN_test[i:(i + 1)]) test_ART = cosine_similarity(tra_ART, ART_test[i:(i + 1)]) test_title_genre = cosine_similarity(tra_title_gnr, test_title_gnr[i:(i + 1)]) testi = test_song * test_tag * test_title_genre * test_tim * test_GEN * test_ART cand_song = tra_song_sp_T.dot( testi) # 행에는 노래 열에는 유저 정보 %*% 유사한 유저 -> 유사한 노래에 대하여 높은 값 나옴 cand_song_idx = cand_song.reshape( -1).argsort()[-300:][::-1] # 값이 높은 상위 120개 노래 추출 cand_song_idx = cand_song_idx[np.isin( cand_song_idx, songs_already) == False] # 중복제거 cand2 = pd.DataFrame(cand_song).iloc[cand_song_idx].reset_index() cand_all = pd.merge(cand1, cand2, how='outer', on='index') cand_all 
= cand_all.fillna(0) cand_all['pred'] = (cand_all['0_x'] + cand_all['0_y']) / 2 cand_song_idx = list( cand_all.sort_values(by=['pred'], ascending=False)[:100]['index']) ######tag###### cand_tag = tra_tag_sp_T.dot(testi) # 똑같은 작업 실시 cand_tag_idx = cand_tag.reshape(-1).argsort()[-30:][::-1] cand_tag_idx = cand_tag_idx[np.isin(cand_tag_idx, tags_already) == False][:10] rec_tag_idx = [tag_tid_id[i] for i in cand_tag_idx] res.append({ "id": test.loc[pid, 'id'], "songs": cand_song_idx, "tags": rec_tag_idx }) elif len(data['songs']) != 0: p = np.zeros((707989, 1)) p[data['songs']] = 1 tra_song = data_all.iloc[model_knn_song25.kneighbors(p.T)[1][0]] row = np.repeat(range(25), tra_song['num_songs']) # User Index 별 노래 개수만큼 만듦 col = [song for songs in tra_song['songs'] for song in songs] # Song dic number 추출 dat = np.repeat(1, tra_song['num_songs'].sum() ) # User별 Song이 있는 부분에 1을 넣기위해 1과 전체 노래 개수만큼 만듦 tra_song_sp = spr.csr_matrix((dat, (row, col)), shape=(25, n_songs)) # csr_matrix 제작 tra_song_sp_T = tra_song_sp.T.tocsr() # tra_tag = data_all.iloc[model_knn_tag25.kneighbors(pp.T)[1][0]] row = np.repeat(range(25), tra_song['num_tags']) col = [tag for tags in tra_song['tags_id'] for tag in tags] dat = np.repeat(1, tra_song['num_tags'].sum()) tra_tag_sp = spr.csr_matrix((dat, (row, col)), shape=(25, n_tags)) tra_tag_sp_T = tra_tag_sp.T.tocsr() tra_tim = times_sp[model_knn_times25.kneighbors( tim_test[i:(i + 1)])[1][0]] tra_GEN = GEN_sp[model_knn_GEN25.kneighbors(GEN_test[i:(i + 1)])[1][0]] tra_ART = ART_sp[model_knn_ART25.kneighbors(ART_test[i:(i + 1)])[1][0]] tra_title_gnr = title_gnr[model_knn_title_gnr25.kneighbors( test_title_gnr[i:(i + 1)])[1][0]] songs_already = data["songs"] tags_already = data["tags_id"] test_song = cosine_similarity(tra_song_sp, p.T) test_tim = cosine_similarity(tra_tim, tim_test[i:(i + 1)]) test_GEN = cosine_similarity(tra_GEN, GEN_test[i:(i + 1)]) test_ART = cosine_similarity(tra_ART, ART_test[i:(i + 1)]) test_title_genre = cosine_similarity(tra_title_gnr, test_title_gnr[i:(i + 1)]) testi = test_song * test_title_genre * test_tim * test_GEN * test_ART cand_song = tra_song_sp_T.dot( testi) # 행에는 노래 열에는 유저 정보 %*% 유사한 유저 -> 유사한 노래에 대하여 높은 값 나옴 cand_song_idx = cand_song.reshape( -1).argsort()[-300:][::-1] # 값이 높은 상위 120개 노래 추출 cand_song_idx = cand_song_idx[np.isin( cand_song_idx, songs_already) == False] # 중복제거 cand1 = pd.DataFrame(cand_song).iloc[cand_song_idx].reset_index() ####### 40 #################################################### tra_song = data_all.iloc[model_knn_song40.kneighbors(p.T)[1][0]] row = np.repeat(range(40), tra_song['num_songs']) # User Index 별 노래 개수만큼 만듦 col = [song for songs in tra_song['songs'] for song in songs] # Song dic number 추출 dat = np.repeat(1, tra_song['num_songs'].sum() ) # User별 Song이 있는 부분에 1을 넣기위해 1과 전체 노래 개수만큼 만듦 tra_song_sp = spr.csr_matrix((dat, (row, col)), shape=(40, n_songs)) # csr_matrix 제작 tra_song_sp_T = tra_song_sp.T.tocsr() row = np.repeat(range(40), tra_song['num_tags']) col = [tag for tags in tra_song['tags_id'] for tag in tags] dat = np.repeat(1, tra_song['num_tags'].sum()) tra_tag_sp = spr.csr_matrix((dat, (row, col)), shape=(40, n_tags)) tra_tag_sp_T = tra_tag_sp.T.tocsr() tra_tim = times_sp[model_knn_times40.kneighbors( tim_test[i:(i + 1)])[1][0]] tra_GEN = GEN_sp[model_knn_GEN40.kneighbors(GEN_test[i:(i + 1)])[1][0]] tra_ART = ART_sp[model_knn_ART40.kneighbors(ART_test[i:(i + 1)])[1][0]] tra_title_gnr = title_gnr[model_knn_title_gnr40.kneighbors( test_title_gnr[i:(i + 1)])[1][0]] test_song = cosine_similarity(tra_song_sp, 
p.T) test_tim = cosine_similarity(tra_tim, tim_test[i:(i + 1)]) test_GEN = cosine_similarity(tra_GEN, GEN_test[i:(i + 1)]) test_ART = cosine_similarity(tra_ART, ART_test[i:(i + 1)]) test_title_genre = cosine_similarity(tra_title_gnr, test_title_gnr[i:(i + 1)]) testi = test_song * test_title_genre * test_tim * test_GEN * test_ART cand_song = tra_song_sp_T.dot( testi) # 행에는 노래 열에는 유저 정보 %*% 유사한 유저 -> 유사한 노래에 대하여 높은 값 나옴 cand_song_idx = cand_song.reshape( -1).argsort()[-300:][::-1] # 값이 높은 상위 120개 노래 추출 cand_song_idx = cand_song_idx[np.isin( cand_song_idx, songs_already) == False] # 중복제거 cand2 = pd.DataFrame(cand_song).iloc[cand_song_idx].reset_index() cand_all = pd.merge(cand1, cand2, how='outer', on='index') cand_all = cand_all.fillna(0) cand_all['pred'] = (cand_all['0_x'] + cand_all['0_y']) / 2 cand_song_idx = list( cand_all.sort_values(by=['pred'], ascending=False)[:100]['index']) #######tag######## cand_tag = tra_tag_sp_T.dot(testi) # 똑같은 작업 실시 cand_tag_idx = cand_tag.reshape(-1).argsort()[-30:][::-1] cand_tag_idx = cand_tag_idx[np.isin(cand_tag_idx, tags_already) == False][:10] rec_tag_idx = [tag_tid_id[i] for i in cand_tag_idx] res.append({ "id": test.loc[pid, 'id'], "songs": cand_song_idx, "tags": rec_tag_idx }) elif len(data['tags_id']) != 0: p = np.zeros((n_tags, 1)) p[data['tags_id']] = 1 tra_tag = data_all.iloc[model_knn_tag25.kneighbors(p.T)[1][0]] row = np.repeat(range(25), tra_tag['num_songs']) # User Index 별 노래 개수만큼 만듦 col = [song for songs in tra_tag['songs'] for song in songs] # Song dic number 추출 dat = np.repeat(1, tra_tag['num_songs'].sum() ) # User별 Song이 있는 부분에 1을 넣기위해 1과 전체 노래 개수만큼 만듦 tra_song_sp = spr.csr_matrix((dat, (row, col)), shape=(25, n_songs)) # csr_matrix 제작 tra_song_sp_T = tra_song_sp.T.tocsr() row = np.repeat(range(25), tra_tag['num_tags']) col = [tag for tags in tra_tag['tags_id'] for tag in tags] dat = np.repeat(1, tra_tag['num_tags'].sum()) tra_tag_sp = spr.csr_matrix((dat, (row, col)), shape=(25, n_tags)) tra_tag_sp_T = tra_tag_sp.T.tocsr() songs_already = data["songs"] tags_already = data["tags_id"] testi = cosine_similarity(tra_tag_sp, pp.T) if len(data['plylst_title']) != 0: tra_title_gnr = title_tdm[model_knn_title25.kneighbors( title_ts[i:(i + 1)])[1][0]] testi_title = cosine_similarity(tra_title_gnr, title_ts[i:(i + 1)]) testi = testi * testi_title cand_song = tra_song_sp_T.dot( testi) # 행에는 노래 열에는 유저 정보 %*% 유사한 유저 -> 유사한 노래에 대하여 높은 값 나옴 cand_song_idx = cand_song.reshape( -1).argsort()[-300:][::-1] # 값이 높은 상위 120개 노래 추출 cand_song_idx = cand_song_idx[ np.isin(cand_song_idx, songs_already) == False][:100] # 중복되는 노래 있는지 확인하고 100개 추출 cand_tag = tra_tag_sp_T.dot(testi) # 똑같은 작업 실시 cand_tag_idx = cand_tag.reshape(-1).argsort()[-30:][::-1] cand_tag_idx = cand_tag_idx[np.isin(cand_tag_idx, tags_already) == False][:10] rec_tag_idx = [tag_tid_id[i] for i in cand_tag_idx] res.append({ "id": test.loc[pid, 'id'], "songs": list(cand_song_idx), "tags": rec_tag_idx }) else: cand_song = [] for li in data_all.iloc[model_knn_title25.kneighbors( title_ts[i:(i + 1)])[1][0]].songs.to_list(): for j in li: cand_song.append(j) cand_tag = [] for li in data_all.iloc[model_knn_title25.kneighbors( title_ts[i:(i + 1)])[1][0]].tags.to_list(): for j in li: cand_tag.append(j) cand_song_idx = list( pd.DataFrame(cand_song)[0].value_counts()[:100].index) rec_tag_idx = list( pd.DataFrame(cand_tag)[0].value_counts()[:10].index) res.append({ "id": test.loc[pid, 'id'], "songs": cand_song_idx, "tags": rec_tag_idx }) for i in range(len(res)): if len(res[i]['songs']) != 100: print('song 에서 
{}번째 오류 발생'.format(i)) if len(res[i]['tags']) != 10: print('tag 에서 {}번째 오류 발생'.format(i)) rec = [] for i in range(len(res)): rec.append({ "id": res[i]['id'], "songs": list(res[i]['songs']), "tags": res[i]['tags'] }) result1 = pd.DataFrame(rec) model_knn_song = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=50, n_jobs=-1) model_knn_tag = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=50, n_jobs=-1) model_knn_title = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=50, n_jobs=-1) model_knn_title_gnr = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=50, n_jobs=-1) model_knn_times = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=50, n_jobs=-1) model_knn_GEN = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=50, n_jobs=-1) model_knn_ART = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=50, n_jobs=-1) model_knn_song.fit(song_sp) model_knn_tag.fit(tag_sp) model_knn_title.fit(title_sp) model_knn_title_gnr.fit(title_gnr) model_knn_times.fit(times_sp) model_knn_GEN.fit(GEN_sp) model_knn_ART.fit(ART_sp) res2 = [] for i in tqdm_notebook([1960, 6361, 8705, 9310, 10498]): data = test.iloc[i] pid = i if len(data['songs']) != 0 and len(data['tags_id']) != 0: p = np.zeros((707989, 1)) p[data['songs']] = 1 pp = np.zeros((n_tags, 1)) pp[data['tags_id']] = 1 tra_song = data_all.iloc[model_knn_song.kneighbors(p.T)[1][0]] row = np.repeat(range(50), tra_song['num_songs']) # User Index 별 노래 개수만큼 만듦 col = [song for songs in tra_song['songs'] for song in songs] # Song dic number 추출 dat = np.repeat(1, tra_song['num_songs'].sum() ) # User별 Song이 있는 부분에 1을 넣기위해 1과 전체 노래 개수만큼 만듦 tra_song_sp = spr.csr_matrix((dat, (row, col)), shape=(50, n_songs)) # csr_matrix 제작 tra_song_sp_T = tra_song_sp.T.tocsr() tra_tag = data_all.iloc[model_knn_tag.kneighbors(pp.T)[1][0]] row = np.repeat(range(50), tra_tag['num_tags']) col = [tag for tags in tra_tag['tags_id'] for tag in tags] dat = np.repeat(1, tra_tag['num_tags'].sum()) tra_tag_sp = spr.csr_matrix((dat, (row, col)), shape=(50, n_tags)) tra_tag_sp_T = tra_tag_sp.T.tocsr() tra_tim = times_sp[model_knn_times.kneighbors( tim_test[i:(i + 1)])[1][0]] tra_GEN = GEN_sp[model_knn_GEN.kneighbors(GEN_test[i:(i + 1)])[1][0]] tra_title_gnr = title_gnr[model_knn_title_gnr.kneighbors( test_title_gnr[i:(i + 1)])[1][0]] songs_already = data["songs"] tags_already = data["tags_id"] test_song = cosine_similarity(tra_song_sp, p.T) test_tag = cosine_similarity(tra_tag_sp, pp.T) test_tim = cosine_similarity(tra_tim, tim_test[i:(i + 1)]) test_GEN = cosine_similarity(tra_GEN, GEN_test[i:(i + 1)]) test_title_genre = cosine_similarity(tra_title_gnr, test_title_gnr[i:(i + 1)]) testi = test_song * test_tag * test_title_genre * test_GEN cand_song = tra_song_sp_T.dot( testi) # 행에는 노래 열에는 유저 정보 %*% 유사한 유저 -> 유사한 노래에 대하여 높은 값 나옴 cand_song_idx = cand_song.reshape( -1).argsort()[-300:][::-1] # 값이 높은 상위 120개 노래 추출 cand_song_idx = cand_song_idx[ np.isin(cand_song_idx, songs_already) == False][:100] # 중복되는 노래 있는지 확인하고 100개 추출 cand_tag = tra_tag_sp_T.dot(testi) # 똑같은 작업 실시 cand_tag_idx = cand_tag.reshape(-1).argsort()[-30:][::-1] cand_tag_idx = cand_tag_idx[np.isin(cand_tag_idx, tags_already) == False][:10] rec_tag_idx = [tag_tid_id[i] for i in cand_tag_idx] res2.append({ "id": test.loc[pid, 'id'], "songs": cand_song_idx, "tags": rec_tag_idx }) elif len(data['songs']) != 0: p = np.zeros((707989, 1)) p[data['songs']] = 1 tra_song = data_all.iloc[model_knn_song.kneighbors(p.T)[1][0]] row = 
            col = [song for songs in tra_song['songs'] for song in songs]  # song dictionary indices
            dat = np.repeat(1, tra_song['num_songs'].sum())  # a 1 for every (user, song) pair
            tra_song_sp = spr.csr_matrix((dat, (row, col)), shape=(50, n_songs))  # build the csr_matrix
            tra_song_sp_T = tra_song_sp.T.tocsr()
            # Tag matrix built from the song-based neighbors.
            row = np.repeat(range(50), tra_song['num_tags'])
            col = [tag for tags in tra_song['tags_id'] for tag in tags]
            dat = np.repeat(1, tra_song['num_tags'].sum())
            tra_tag_sp = spr.csr_matrix((dat, (row, col)), shape=(50, n_tags))
            tra_tag_sp_T = tra_tag_sp.T.tocsr()
            songs_already = data["songs"]
            tags_already = data["tags_id"]
            tra_tim = times_sp[model_knn_times.kneighbors(tim_test[i:(i + 1)])[1][0]]
            tra_GEN = GEN_sp[model_knn_GEN.kneighbors(GEN_test[i:(i + 1)])[1][0]]
            tra_title_gnr = title_gnr[model_knn_title_gnr.kneighbors(test_title_gnr[i:(i + 1)])[1][0]]
            test_song = cosine_similarity(tra_song_sp, p.T)
            test_tim = cosine_similarity(tra_tim, tim_test[i:(i + 1)])
            test_GEN = cosine_similarity(tra_GEN, GEN_test[i:(i + 1)])
            test_title_genre = cosine_similarity(tra_title_gnr, test_title_gnr[i:(i + 1)])
            testi = test_song * test_title_genre * test_tim * test_GEN
            # (song x user) @ (user similarity): songs favored by similar users score high.
            cand_song = tra_song_sp_T.dot(testi)
            # Top 200 songs, keep 100 after removing ones already in the playlist.
            cand_song_idx = cand_song.reshape(-1).argsort()[-200:][::-1]
            cand_song_idx = cand_song_idx[np.isin(cand_song_idx, songs_already) == False][:100]
            cand_tag = tra_tag_sp_T.dot(testi)  # same procedure for tags
            cand_tag_idx = cand_tag.reshape(-1).argsort()[-30:][::-1]
            cand_tag_idx = cand_tag_idx[np.isin(cand_tag_idx, tags_already) == False][:10]
            rec_tag_idx = [tag_tid_id[i] for i in cand_tag_idx]
            res2.append({
                "id": test.loc[pid, 'id'],
                "songs": cand_song_idx,
                "tags": rec_tag_idx
            })
        elif len(data['tags_id']) != 0:
            p = np.zeros((n_tags, 1))
            p[data['tags_id']] = 1
            tra_tag = data_all.iloc[model_knn_tag.kneighbors(p.T)[1][0]]
            row = np.repeat(range(50), tra_tag['num_songs'])  # repeat each neighbor index by its song count
            col = [song for songs in tra_tag['songs'] for song in songs]  # song dictionary indices
            dat = np.repeat(1, tra_tag['num_songs'].sum())  # a 1 for every (user, song) pair
            tra_song_sp = spr.csr_matrix((dat, (row, col)), shape=(50, n_songs))  # build the csr_matrix
            tra_song_sp_T = tra_song_sp.T.tocsr()
            row = np.repeat(range(50), tra_tag['num_tags'])
            col = [tag for tags in tra_tag['tags_id'] for tag in tags]
            dat = np.repeat(1, tra_tag['num_tags'].sum())
            tra_tag_sp = spr.csr_matrix((dat, (row, col)), shape=(50, n_tags))
            tra_tag_sp_T = tra_tag_sp.T.tocsr()
            songs_already = data["songs"]
            tags_already = data["tags_id"]
            # Fixed: the original used the stale `pp` from a previous iteration
            # here; `p` is this playlist's tag vector.
            testi = cosine_similarity(tra_tag_sp, p.T)
            if len(data['plylst_title']) != 0:
                tra_title_gnr = title_tdm[model_knn_title.kneighbors(title_ts[i:(i + 1)])[1][0]]
                testi_title = cosine_similarity(tra_title_gnr, title_ts[i:(i + 1)])
                testi = testi * testi_title
            cand_song = tra_song_sp_T.dot(testi)
            # Top 300 songs, keep 100 after removing ones already in the playlist.
            cand_song_idx = cand_song.reshape(-1).argsort()[-300:][::-1]
            cand_song_idx = cand_song_idx[np.isin(cand_song_idx, songs_already) == False][:100]
            cand_tag = tra_tag_sp_T.dot(testi)  # same procedure for tags
            cand_tag_idx = cand_tag.reshape(-1).argsort()[-30:][::-1]
            cand_tag_idx = cand_tag_idx[np.isin(cand_tag_idx, tags_already) == False][:10]
            rec_tag_idx = [tag_tid_id[i] for i in cand_tag_idx]
            res2.append({
                "id": test.loc[pid, 'id'],
                "songs": cand_song_idx,
                "tags": rec_tag_idx
            })
        else:
            # Title-based fallback: most frequent songs and tags among title neighbors.
            cand_song = []
            for li in data_all.iloc[model_knn_title.kneighbors(title_ts[i:(i + 1)])[1][0]].songs.to_list():
                for j in li:
                    cand_song.append(j)
            cand_tag = []
            for li in data_all.iloc[model_knn_title.kneighbors(title_ts[i:(i + 1)])[1][0]].tags.to_list():
                for j in li:
                    cand_tag.append(j)
            cand_song_idx = list(pd.DataFrame(cand_song)[0].value_counts()[:100].index)
            rec_tag_idx = list(pd.DataFrame(cand_tag)[0].value_counts()[:10].index)
            res2.append({
                "id": test.loc[pid, 'id'],
                "songs": cand_song_idx,
                "tags": rec_tag_idx
            })
    pd.DataFrame(res2)  # notebook inspection (no effect)
    rec2 = []
    for i in range(len(res2)):
        rec2.append({
            "id": res2[i]['id'],
            "songs": list(res2[i]['songs']),
            "tags": res2[i]['tags']
        })
    # Overwrite the five second-pass playlists in the first-pass result.
    result2 = pd.DataFrame(rec2)['songs']
    n_index = [10498, 6361, 1960, 8705, 9310]
    result2.index = n_index
    result1.loc[n_index, 'songs'] = result2
    result1['songs'].apply(len).sort_values()  # notebook inspection: shortest lists first

    # Playlist 6361 is still unfilled, so fall back to the overall top-100 songs.
    s = []
    for song in train.songs.tolist():
        s += song
    r1 = dict(Counter(s))
    r_song = sorted(r1.items(), key=lambda x: -x[1])
    r_song_top = r_song[:100]  # how many songs to take is itself a tunable choice
    list_song = list(dict(r_song_top).keys())
    len(list_song)  # notebook inspection (no effect)
    sub = []
    for j in range(len(result1)):
        sub.append(result1.loc[j].to_dict())
    sub[6361]['songs'] = list_song
    pd.DataFrame(sub)['songs'].apply(len).sort_values()  # notebook inspection (no effect)
    write_json(sub, 'final_songs.json')
    return sub
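# A minimal, self-contained sketch of the scoring pattern used throughout the
# function above, on toy data: weight each neighbor playlist by its cosine
# similarity to the query, score songs with (song x user) @ weights, then keep
# the top unseen songs. All names and values here are illustrative only.
import numpy as np
import scipy.sparse as spr
from sklearn.metrics.pairwise import cosine_similarity

# Three neighbor playlists over six songs (rows: playlists, cols: songs).
neighbors = spr.csr_matrix(np.array([[1, 1, 0, 0, 0, 1],
                                     [0, 1, 1, 0, 1, 0],
                                     [0, 0, 1, 1, 0, 0]]))
query = np.zeros((6, 1))
query[[0, 1]] = 1                                  # query playlist already has songs 0 and 1
weights = cosine_similarity(neighbors, query.T)    # (3, 1): one weight per neighbor
scores = neighbors.T.dot(weights).reshape(-1)      # one aggregated score per song
cand = scores.argsort()[::-1]                      # best-scoring songs first
cand = cand[np.isin(cand, [0, 1]) == False][:3]    # drop already-present songs
print(cand)  # [5 4 2] for this toy data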
class SentencePieceTokenizer:
    def __init__(self, model_path: str = None):
        self.unk = '<unk>'
        self.pad = '<pad>'
        self.sos = '<s>'
        self.eos = '</s>'
        if model_path:
            self.load(model_path)
        else:
            self.tokenizer = None

    def tokenize(self, sent: str):
        return self.tokenizer.encode_as_pieces(sent)

    def text_to_id(self, sent: str):
        return self.tokenizer.encode_as_ids(sent)

    def id_to_text(self, idxs: list):
        return self.tokenizer.decode_ids(idxs)

    def token_to_id(self, token: str):
        return self.tokenizer.piece_to_id(token)

    def train(self, sent_path: str, model_prefix: str, character_coverage=0.9995,
              vocab_size=None, model_type: str = "bpe",
              control_symbols: list = ['<pad>']):
        from sentencepiece import SentencePieceTrainer
        if character_coverage is None and vocab_size is None:
            print("at least character_coverage or vocab_size should be given!")
        assert character_coverage or vocab_size
        # Fixed: the original assigned to a misspelled `coverage_conditions`.
        if character_coverage is not None:
            coverage_condition = f" --character_coverage={str(character_coverage)} "
        else:
            coverage_condition = f" --vocab_size={str(vocab_size)} "
        symbol_list = ",".join(control_symbols)  # the original left a trailing comma
        args = ("--input={} "
                "--model_prefix={} "
                "--model_type={} "
                "--control_symbols={} ".format(sent_path, model_prefix,
                                               model_type, symbol_list))
        args += coverage_condition
        SentencePieceTrainer.Train(args)

    def load(self, model_path: str):
        from sentencepiece import SentencePieceProcessor
        self.tokenizer = SentencePieceProcessor()
        self.tokenizer.Load(model_path)

    def __repr__(self):
        unk = '"{}"'.format(self.unk) if self.unk else "None"
        return "Vocab(size={}, unk={}, pad={})".format(len(self.tokenizer), unk, self.pad)

    def __len__(self):
        return len(self.tokenizer)
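# Hypothetical usage sketch for the class above. "corpus.txt" and the
# "spm_demo" prefix are placeholders, not files that ship with this code;
# training writes spm_demo.model / spm_demo.vocab to the working directory.
tok = SentencePieceTokenizer()
tok.train(sent_path="corpus.txt", model_prefix="spm_demo",
          character_coverage=0.9995, model_type="bpe")
tok.load("spm_demo.model")
print(tok.tokenize("hello world"))    # subword pieces, e.g. ['▁hello', '▁world']
print(tok.text_to_id("hello world"))  # the same pieces as vocabulary ids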
def _encode_batch(self, texts):
    from sentencepiece import SentencePieceProcessor
    tok = SentencePieceProcessor()
    tok.Load(str(self.sp_model))
    return [np.array(tok.EncodeAsIds(t)) for t in texts]
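# Minimal check of _encode_batch above: the method only reads `self.sp_model`,
# so a SimpleNamespace can stand in for the owning object. "spm_demo.model" is
# a placeholder path to any trained SentencePiece model.
from types import SimpleNamespace
batch = _encode_batch(SimpleNamespace(sp_model="spm_demo.model"),
                      ["first text", "second text"])
print([a.shape for a in batch])  # one id array per input text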
def sentencepiece_load(file):
    """Load a SentencePiece model"""
    from sentencepiece import SentencePieceProcessor
    spm = SentencePieceProcessor()
    spm.Load(str(file))
    return spm
def load_sentencepiece(model_path):
    from sentencepiece import SentencePieceProcessor
    sp = SentencePieceProcessor()
    sp.Load(model_path)
    return sp
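# Hedged round-trip sketch for the loaders above; "spm_demo.model" is a
# placeholder for any trained SentencePiece model file.
sp = load_sentencepiece("spm_demo.model")
pieces = sp.EncodeAsPieces("a quick test")
ids = sp.EncodeAsIds("a quick test")
# Both pieces and ids decode back to the original string.
assert sp.DecodePieces(pieces) == sp.DecodeIds(ids) == "a quick test"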