def _predict_chars( model: tf.keras.Sequential, sp: spm.SentencePieceProcessor, start_string: str, store: _BaseConfig, ) -> str: """ Evaluation step (generating text using the learned model). Args: model: tf.keras.Sequential model sp: SentencePiece tokenizer start_string: string to bootstrap model store: our config object Returns: Yields line of text per iteration """ # Converting our start string to numbers (vectorizing) input_eval = sp.EncodeAsIds(start_string) input_eval = tf.expand_dims(input_eval, 0) # Empty string to store each line sentence_ids = [] # Here batch size == 1 model.reset_states() while True: predictions = model(input_eval) # remove the batch dimension predictions = tf.squeeze(predictions, 0) # using a categorical distribution to # predict the word returned by the model predictions = predictions / store.gen_temp predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy() # We pass the predicted word as the next input to the model # along with the previous hidden state input_eval = tf.expand_dims([predicted_id], 0) sentence_ids.append(int(predicted_id)) decoded = sp.DecodeIds(sentence_ids) if store.field_delimiter is not None: decoded = decoded.replace(store.field_delimiter_token, store.field_delimiter) if "<n>" in decoded: return _pred_string(decoded.replace("<n>", "")) elif 0 < store.gen_chars <= len(decoded): return _pred_string(decoded)
class SentencepieceFasttextEmbed(EmbedderInterface):
    """Embedder backed by a SentencePiece tokenizer and fastText vectors.

    Tokenization (text -> pieces/ids) is delegated to SentencePiece; token
    embeddings are looked up in an optional fastText model. The PAD token
    embeds as an all-zero vector.
    """

    class Config(EmbedderInterface.Config):
        pass

    @classmethod
    def from_config(cls, config: Config):
        """Build the embedder from the artifacts in ``config.preproc_dir``."""
        spm_model_file = os.path.join(config.preproc_dir, "spm.model")
        fasttext_model_file = os.path.join(config.preproc_dir, "fasttext-model.bin")
        return cls(spm_model_file, fasttext_model_file, config.max_pieces)

    def __init__(self, spm_model_file: str, fasttext_model_file: str = '',
                 max_pieces: int = -1):
        """
        Args:
            spm_model_file: path to a serialized SentencePiece model.
            fasttext_model_file: optional path to a fastText binary model;
                when empty, the embed_* methods are unavailable.
            max_pieces: maximum number of pieces kept per text (via the
                parent's ``pieces_slice``); -1 means unlimited.
        """
        super().__init__(max_pieces=max_pieces)
        self.spm = SentencePieceProcessor()
        self.spm.Load(spm_model_file)
        # Cache the special ids and their piece strings once at load time
        self.pad_idx = self.spm.pad_id()
        self.pad_token = self.spm.IdToPiece(self.pad_idx)
        self.unk_idx = self.spm.unk_id()
        self.unk_token = self.spm.IdToPiece(self.unk_idx)
        self.bos_idx = self.spm.bos_id()
        self.bos_token = self.spm.IdToPiece(self.bos_idx)
        self.eos_idx = self.spm.eos_id()
        self.eos_token = self.spm.IdToPiece(self.eos_idx)
        # fastText is optional; only load when a path was given
        if fasttext_model_file:
            self.fasttext = fasttext.load_model(fasttext_model_file)

    @property
    def embed_dim(self):
        # Requires a fastText model to have been loaded in __init__
        return self.fasttext.dim

    @property
    def n_vocab(self):
        return self.spm.get_piece_size()

    def encode_text_as_ids(self, text: str) -> np.array:
        """ Doesn't produce BOS, EOS ids. """
        return np.asarray(self.spm.EncodeAsIds(text)[self.pieces_slice], dtype=np.int32)

    def encode_text_as_tokens(self, text: str) -> List[str]:
        """ Doesn't produce BOS, EOS tokens. """
        return self.spm.EncodeAsPieces(text)[self.pieces_slice]

    def tokenize(self, text: str) -> List[str]:
        """ Alias for `encode_text_as_tokens`. Doesn't produce BOS, EOS tokens. """
        # Fix: `encode_text_as_tokens` already applies `self.pieces_slice`;
        # re-slicing its result here was redundant.
        return self.encode_text_as_tokens(text)

    def decode_ids_as_text(self, ids: List[int], strip_special=True) -> str:
        """ Doesn't produce PAD, BOS, or EOS text. i.e. PAD, BOS, EOS ids are
        stripped out before decoding. UNK is decoded but unintelligible. """
        if strip_special:
            ids = [
                int(id) for id in ids
                if id not in (self.pad_idx, self.bos_idx, self.eos_idx)
            ]
        else:
            ids = [int(id) for id in ids]
        return self.spm.DecodeIds(ids)

    def decode_tokens_as_text(self, toks: List[str]) -> str:
        """ Doesn't produce PAD, BOS, or EOS text. i.e. PAD, BOS, EOS tokens are
        stripped out before decoding. UNK is decoded but unintelligible. """
        return self.spm.DecodePieces(toks[self.pieces_slice])

    # NOTE(review): lru_cache on an instance method keys on `self` and keeps
    # the instance alive for the cache's lifetime — acceptable for a
    # long-lived singleton embedder, but worth confirming.
    @functools.lru_cache(maxsize=1024)
    def decode_id_as_token(self, id: int) -> str:
        return self.spm.IdToPiece(id)

    def decode_ids_as_tokens(self, ids: List[int],
                             strip_special: bool = True) -> List[str]:
        """ By default, doesn't produce PAD, BOS, EOS tokens.

        Avoids problematic intermediate string representation that causes
        length mismatch. In other words, SentencePiece isn't isomorphic with
        respect to the string representation. """
        if strip_special:
            ids = [
                id for id in ids
                if id not in (self.pad_idx, self.bos_idx, self.eos_idx)
            ]
        return [self.decode_id_as_token(int(ix)) for ix in ids]

    @functools.lru_cache(maxsize=1024)
    def embed_tok(self, tok: str) -> np.array:
        """ When given PAD, returns all zeros """
        if tok == self.pad_token:
            return np.zeros(self.fasttext.dim)
        return np.asarray(self.fasttext[tok])

    def embed_text(self, text: str) -> np.array:
        """ Doesn't produce PAD, BOS, EOS embeddings. i.e. PAD, BOS, EOS are
        stripped out during tokenization before embedding. """
        return np.asarray([self.embed_tok(tok) for tok in self.tokenize(text)])

    def embed_ids(self, ids: List[int],
                  strip_special: bool = True) -> List[np.array]:
        """ By default, doesn't produce PAD, BOS, EOS tokens.

        Avoids problematic intermediate string representation that causes
        length mismatch. In other words, SentencePiece isn't isomorphic with
        respect to the string representation. """
        return [
            self.embed_tok(t)
            for t in self.decode_ids_as_tokens(ids, strip_special=strip_special)
        ]

    def embed_ids_batch(self, ids: np.array) -> torch.tensor:
        """Embed a batch of id sequences; special tokens are kept (PAD embeds
        as zeros), so ragged rows stay aligned for the tensor conversion."""
        emb = [self.embed_ids(turn, strip_special=False) for turn in ids]
        emb = torch.tensor(emb)
        return emb
# Token id 0 is the equivalent of a "start" token model_infer.state = initial_state # stateの初期化 cur_inputs = np.zeros((1, 1), dtype=np.int32) # 初期値=0の挿入 all_samples = [] if prompt is not None: prompt = np.asarray(TOKENIZER.EncodeAsIds(prompt)) for iteration in range(length): logits = model_infer(cur_inputs) if prompt is not None and iteration < prompt.shape[0]: cur_samples = onp.array(prompt[iteration], dtype=int) else: logits = onp.array(logits)[0, 0, :] probs = onp.exp(logits) cur_samples = onp.random.choice(probs.shape[-1], p=probs[:]) cur_samples = onp.array(cur_samples, dtype=int) all_samples.append(cur_samples) cur_inputs = np.array(cur_samples[None, None]) all_samples = onp.stack(all_samples, -1) return all_samples prefix = [5, 3, 5, 2, 1, 6] pred = prediction(10, "家康は") print(TOKENIZER.DecodeIds(pred.tolist()))
class TVAEDataset(Dataset):
    """Dataset of SentencePiece-encoded sequences exported as .npy files.

    On first use (when the exported files are missing) it runs
    :meth:`preprocess`, which trains a SentencePiece model (train part only),
    optionally trains Word2Vec embeddings, and exports ``part``.tsv to
    ``part``.npy under ``root``/``prefix``.
    """

    def __init__(self, root, prefix, part, max_sequence_length=150, **kwargs):
        """
        Args:
            root: dataset root directory; must contain ``part``.tsv.
            prefix: preprocessing sub-directory name under ``root``.
            part: dataset part name (e.g. "train", "develop").
            max_sequence_length: hard cap on sequence length. Defaults to 150.
            **kwargs: forwarded to :meth:`preprocess`; may also carry a
                ready-made ``spm_model`` to reuse.
        """
        self.root = root
        self.prefix = prefix
        self.preprocess_args = kwargs
        data_file_name = os.path.join(root, prefix, part + ".npy")
        spm_file_name = os.path.join(root, prefix, "spm.model")

        # Build the exported files on demand
        if not TVAEDataset.exist(root, prefix, part):
            logging.info("Start preprocessing %s/%s/%s dataset", root, prefix, part)
            self.preprocess(root, prefix, part, **kwargs)

        if 'spm_model' in self.preprocess_args:
            logging.info("Use existed sentencepiece model")
            self.spm_model = self.preprocess_args['spm_model']
        else:
            logging.info("Load sentencepiece model from disk")
            self.spm_model = SentencePieceProcessor()
            self.spm_model.load(spm_file_name)

        self._data = np.load(data_file_name)
        self.pad_symbol = self.spm_model.pad_id()
        self.eos_symbol = self.spm_model.eos_id()
        self._len = self._data.shape[0]
        self.limit = max_sequence_length
        sequence_lens = [len(seq) for seq in self._data]
        # Effective length: the shorter of the configured cap and the data
        self.max_sequence_length = min(self.limit, max(sequence_lens))

    def __getitem__(self, index):
        return self._data[index]

    def __len__(self):
        return self._len

    def preprocess(self, directory: str, prefix: str, part: str,
                   spm_model: SentencePieceProcessor = None,
                   pretrain_emb=True, vocab_size=3000, embedding_size=600,
                   max_sentence_length=16384, workers=3, skip_gramm=False):
        """Train tokenizer/embeddings if needed and export ``part`` to .npy.

        Args:
            directory: dataset root containing ``part``.tsv.
            prefix: working sub-directory name for generated artifacts.
            part: dataset part; SentencePiece is trained only for
                'train'/'develop', otherwise ``spm_model`` must be given.
            spm_model: pre-trained SentencePiece model for non-train parts.
            pretrain_emb: whether to train Word2Vec embeddings.
            vocab_size: SentencePiece vocabulary size.
            embedding_size: Word2Vec embedding dimensionality.
            max_sentence_length: SentencePiece max sentence length.
            workers: Word2Vec worker count.
            skip_gramm: use skip-gram (True) or CBOW (False) for Word2Vec.

        Raises:
            FileNotFoundError: if ``directory``/``part``.tsv doesn't exist.
        """
        # Check data files existing
        workdir = os.path.join(directory, prefix)
        os.makedirs(workdir, exist_ok=True)
        data_part_file = os.path.join(directory, part + ".tsv")
        if not os.path.exists(data_part_file):
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT),
                                    data_part_file)
        if part not in ['train', 'develop']:
            assert spm_model is not None, "For non train part, `spm_model` must be specified."
        else:
            # Train sentecepiece:
            logging.info("Start training sentecepiece")
            spm_directory = os.path.join(workdir, "spm")
            spm_params = (
                "--pad_id=0 --unk_id=1 --bos_id=2 --eos_id=3 "
                "--input={} --model_prefix={} --vocab_size={} --max_sentence_length={}".format(
                    data_part_file, spm_directory, vocab_size, max_sentence_length
                )
            )
            SentencePieceTrainer.Train(spm_params)
            spm_model = SentencePieceProcessor()
            spm_model.load(spm_directory + ".model")
            # NOTE(review): embeddings are (re)trained only on the train
            # branch in this reading of the collapsed source — confirm.
            if pretrain_emb:
                # Train word2vec
                logging.info("Start training word2vec")
                train_senteces = SentenceIterator(data_part_file, spm_model)
                logging.info("Loaded train sentences")
                w2v_model = Word2Vec(train_senteces, min_count=0, workers=workers,
                                     size=embedding_size, sg=int(skip_gramm))
                w2v_model_filename = os.path.join(workdir, "word2vec.model")
                w2v_model.save(w2v_model_filename)
                # Export embeddings
                logging.info("Export embeddings")
                embeddings_filename = os.path.join(workdir, "embedding.npy")
                export_embeddings(embeddings_filename, spm_model, w2v_model)
                logging.info("Embeddings have been saved into {}".format(embeddings_filename))

        logging.info("Start exporting data file")
        source_file_name = os.path.join(directory, part + ".tsv")
        exported_file_name = os.path.join(workdir, part + ".npy")
        sentence_iterator = SentenceIterator(source_file_name, spm_model)
        sentence_iterator.export(exported_file_name)
        logging.info("{} exported".format(exported_file_name))
        logging.info("Data preprocessing completed")

    @staticmethod
    def exist(root: str, prefix: str, parts: TypeVar("P", str, List[str])) -> bool:
        """Check that every ``part``.npy and the spm.model file exist.

        Accepts a single part name or a list of part names.
        """
        if isinstance(parts, str):
            parts = [parts]
        parts_file_name = [os.path.join(root, prefix, part + ".npy") for part in parts]
        smp_file_name = os.path.join(root, prefix, "spm.model")
        necessary_files = parts_file_name + [smp_file_name]
        # Fix: `all` replaces `reduce(and_, ...)` — same result, idiomatic,
        # safe on an empty list, and consistent with BPEDataset.exist.
        return all(os.path.exists(filename) for filename in necessary_files)

    @staticmethod
    def _pad_sequence(sequences, pad_symbol=0):
        """Right-pad all sequences to the batch maximum length.

        Note: pads the input lists in place.

        Returns:
            tuple: (padded sequences, original lengths).
        """
        sequence_lengths = [len(sequence) for sequence in sequences]
        max_len = max(sequence_lengths)
        for i, length in enumerate(sequence_lengths):
            to_add = max_len - length
            sequences[i] += [pad_symbol] * to_add
        return sequences, sequence_lengths

    def collate_function(self, batch):
        """Collate raw id sequences into a dict with a padded LongTensor."""
        src_list, src_length_list = TVAEDataset._pad_sequence(
            [example[:self.limit] for example in batch],
            self.pad_symbol)
        batch = {
            "src": torch.LongTensor(src_list)
        }
        return batch

    def get_embeddings(self):
        """Load pretrain embeddings.

        Returns:
            np.array: Array with word2vec embeddings if this one exists,
                otherwise `None`.
        """
        embedinds_path = os.path.join(self.root, self.prefix, "embedding.npy")
        if not os.path.exists(embedinds_path):
            logging.info("Embedding file does not founded")
            return None
        else:
            logging.info("Loading embedding dump file")
            return np.load(embedinds_path)

    def decode(self, sequences):
        """Decode id sequences to text, truncating each at the first EOS."""
        sequences = [list(takewhile(lambda x: x != self.eos_symbol, sequence))
                     for sequence in sequences]
        return [self.spm_model.DecodeIds([token.item() for token in sentence])
                for sentence in sequences]
class BPEDataset(SummarizationDataset):
    """Summarization dataset with Byte-Pair encoding.

    Args:
        directory (str): Dataset directory.
        prefix (str): Dataset preprocessing prefix.
        part (str): Dataset part name. :attr:`directory` must contain
            :attr:`part`.tsv file. Use `None` for sampling.
        max_sequence_length (int, optional): Defaults to 150. Maximum
            sequence length.

    Note:
        Use **kwargs to set up preprocessing arguments.
    """

    def __init__(self, directory: str, prefix: str, part: str,
                 max_sequence_length=150, **kwargs):
        self.data_workdir = os.path.join(directory, prefix)
        self.spm_file = os.path.join(self.data_workdir, "spm.model")

        # `part is None` selects sampling mode: tokenizer only, no data
        if part is None:
            self._sample_init(self.spm_file, max_sequence_length)
            return

        self.source_part_file = os.path.join(directory, part + ".tsv")
        self.part_file = os.path.join(self.data_workdir, part + ".npy")

        # Build exported files on demand
        if not self.exist(directory, prefix, part):
            logger.info("Dataset part {}/{} not founded".format(self.data_workdir, part))
            self.preprocess(directory, prefix, part, **kwargs)

        self.data = np.load(self.part_file, allow_pickle=True)

        if "spm" in kwargs:
            logger.info("Use existing spm model")
            self.spm = kwargs["spm"]
        else:
            logger.info("Load spm model")
            self.spm = SentencePieceProcessor()
            self.spm.load(self.spm_file)

        self.pad_symbol = self.spm.pad_id()
        self.eos_symbol = self.spm.eos_id()
        self._len = self.data.shape[0]
        # Each example holds several sequences (e.g. a source/target pair)
        sequence_lens = [len(seq) for example in self.data for seq in example]
        self.max_sequence_length = min(max_sequence_length, max(sequence_lens))

    def _sample_init(self, spm_file_name, max_sequence_length):
        """Sampling-mode init: load only the tokenizer and special symbols."""
        if not os.path.exists(spm_file_name):
            raise RuntimeError("Firstly preprocess dataset")
        self.spm = SentencePieceProcessor()
        self.spm.load(spm_file_name)
        self.pad_symbol = self.spm.pad_id()
        self.eos_symbol = self.spm.eos_id()
        self._len = 0
        self.data = []
        self.max_sequence_length = max_sequence_length

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return self._len

    @staticmethod
    def exist(directory: str, prefix: str, part: str) -> bool:
        """Check dataset existence.

        Args:
            directory (str): Dataset directory.
            prefix (str): Dataset prefix.
            part (str): Dataset part.

        Returns:
            bool: Existence status.
        """
        data_workdir = os.path.join(directory, prefix)
        part_filename = os.path.join(data_workdir, part + ".npy")
        spm_filename = os.path.join(data_workdir, "spm.model")
        necessary_files = [part_filename, spm_filename]
        existing = [os.path.exists(filename) for filename in necessary_files]
        return all(existing)

    @staticmethod
    def preprocess(directory: str, prefix: str, part: str,
                   spm: SentencePieceProcessor = None, pretrain_emb=True,
                   vocab_size=30000, embedding_size=300,
                   max_sentence_length=16384, workers=3, skip_gramm=False):
        """Preprocess dataset.

        Args:
            directory (str): Dataset directory.
            prefix (str): Dataset preprocessing prefix.
            part (str): Dataset part. :attr:`directory` must contain
                :attr:`part`.tsv file with data.
            spm (SentencePieceProcessor, optional): Defaults to None.
                Sentecepiece model.
            pretrain_emb (bool, optional): Defaults to True. Whether to
                pretrain embeddings.
            vocab_size (int, optional): Defaults to 30000. Vocabulary size.
            embedding_size (int, optional): Defaults to 300. Pretrained
                embedding size.
            max_sentence_length (int, optional): Defaults to 16384. Maximum
                sentence length for sentencepiece.
            workers (int, optional): Defaults to 3. Number of workers.
            skip_gramm (bool, optional): Defaults to False. Whether to use
                skip-gram type of Word2Vec training.

        Raises:
            FileNotFoundError: Raises if source data file doesn't exist.
        """
        data_workdir = os.path.join(directory, prefix)
        part_source_filename = os.path.join(directory, part + ".tsv")
        part_exported_filename = os.path.join(data_workdir, part + ".npy")
        spm_filename = os.path.join(data_workdir, "spm.model")
        spm_directory = os.path.join(data_workdir, "spm")
        w2v_model_filename = os.path.join(data_workdir, "word2vec.model")
        embeddings_filename = os.path.join(data_workdir, "embedding.npy")

        logger.info("Preprocess {}/{} dataset.".format(data_workdir, part))
        os.makedirs(data_workdir, exist_ok=True)

        if not os.path.exists(part_source_filename):
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT),
                                    part_source_filename)

        if part not in ["train", "dev"]:
            assert spm is not None, "For non train part, `spm` must be specified."
        else:
            logger.info("Start training sentencepiece")
            spm_params = (
                "--pad_id=0 --unk_id=1 --bos_id=2 --eos_id=3 "
                "--input={} --model_prefix={} --vocab_size={} --max_sentence_length={}".format(
                    part_source_filename, spm_directory, vocab_size, max_sentence_length
                )
            )
            SentencePieceTrainer.Train(spm_params)
            spm = SentencePieceProcessor()
            spm.load(spm_filename)
            # NOTE(review): embeddings are (re)trained only on the train
            # branch in this reading of the collapsed source — confirm.
            if pretrain_emb:
                logger.info("Start training Word2Vec embeddings")
                train_senteces = SentenceIterator(part_source_filename, spm)
                logger.info("Loaded train sentences")
                w2v_model = Word2Vec(train_senteces, min_count=0, workers=workers,
                                     size=embedding_size, sg=int(skip_gramm))
                w2v_model.save(w2v_model_filename)
                # Export embeddings
                logger.info("Export embeddings")
                export_embeddings(embeddings_filename, spm, w2v_model)
                logger.info("Embeddings have been saved into {}".format(embeddings_filename))

        logger.info("Start exporting data file")
        sentence_iterator = SentenceIterator(part_source_filename, spm)
        sentence_iterator.export(part_exported_filename)
        logger.info("{} exported".format(part_exported_filename))

    def get_embeddings(self) -> np.array:
        """Load pretrain embeddings.

        Returns:
            np.array: Array with word2vec embeddings if this one exists,
                otherwise `None`.
        """
        embedinds_path = os.path.join(self.data_workdir, "embedding.npy")
        if not os.path.exists(embedinds_path):
            # Fix: use the module-level `logger` like the rest of the class
            # (the root `logging` module bypassed the configured logger)
            logger.info("Embedding file does not founded")
            return None
        else:
            logger.info("Loading embedding dump file")
            return np.load(embedinds_path)

    def get_spm(self) -> SentencePieceProcessor:
        return self.spm

    def encode(self, sequences):
        """Encode texts to id tensors, truncated to `max_sequence_length`."""
        sequences = [self.spm.EncodeAsIds(s)[:self.max_sequence_length]
                     for s in sequences]
        return torch.LongTensor(sequences)

    def decode(self, sequences):
        """Decode id sequences to text, truncating each at the first EOS."""
        sequences = [list(takewhile(lambda x: x != self.eos_symbol, sequence))
                     for sequence in sequences]
        return [self.spm.DecodeIds([token.item() for token in sentence])
                for sentence in sequences]

    def collate_function(self, batch):
        """Pad source/target pairs and collate them into LongTensor batches."""
        src_list, src_length_list = self._pad_sequence(
            [example[0][:self.max_sequence_length] for example in batch],
            self.pad_symbol)
        trg_list, trg_length_list = self._pad_sequence(
            [example[1][:self.max_sequence_length] for example in batch],
            self.pad_symbol)
        batch = {
            "src": torch.LongTensor(src_list),
            "trg": torch.LongTensor(trg_list),
            "src_length": src_length_list,
            "trg_length": trg_length_list,
        }
        return batch

    @staticmethod
    def _pad_sequence(sequences, pad_symbol=0):
        """Right-pad all sequences in place to the batch maximum length.

        Returns:
            tuple: (padded sequences, original lengths).
        """
        sequence_lengths = [len(sequence) for sequence in sequences]
        max_len = max(sequence_lengths)
        for i, length in enumerate(sequence_lengths):
            to_add = max_len - length
            sequences[i] += [pad_symbol] * to_add
        return sequences, sequence_lengths