def test():
    bpemb_en = BPEmb(lang="en", dim=100)
    s = "Stratford"
    res1 = bpemb_en.encode(s)
    res2 = bpemb_en.encode_ids(s)
    print(res1)
    print(res2)

    # ~40 MB download; the larger the vocabulary, the less aggressively text is split
    bpemb_en_100k = BPEmb(lang="en", vs=100000, dim=100)
    s = "hello world !"
    res1 = bpemb_en_100k.encode(s)
    res2 = bpemb_en_100k.encode_ids(s)
    print(res1)
    print(res2)
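# A minimal round-trip sketch (not part of the original tests): decode_ids
# inverts encode_ids. The dim=100 English model mirrors the test above.
from bpemb import BPEmb

bpemb_en = BPEmb(lang="en", dim=100)
ids = bpemb_en.encode_ids("Stratford")
print(bpemb_en.decode_ids(ids))      # "stratford" -- BPEmb lowercases by default
print(bpemb_en.vectors[ids].shape)   # (num_subwords, 100)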
class DataLoader():
    def __init__(self, vocab_size, embedding_dim, max_sequence_len):
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.max_sequence_len = max_sequence_len
        # ~40 MB download; the larger the vocabulary, the less aggressively text is split
        self.bpemb_en_100k = BPEmb(lang="en", vs=self.vocab_size, dim=self.embedding_dim)

    def get_x_data(self, sentences):
        sentences_ids = self.bpemb_en_100k.encode_ids(sentences)  # BPE-encode to subword ids
        x = pad_sequences(sentences_ids, maxlen=self.max_sequence_len)
        return x

    def get_train_data(self):
        nrows = 100
        train_df = pd.read_csv("../data/train_preprocessed.csv", nrows=nrows)
        X_train = self.get_x_data(train_df["comment_text"])
        Y_train = train_df['target'].values
        x, y = np.asarray(X_train), np.asarray(Y_train)
        print(x.shape, y.shape)
        return x, y

    def get_test_data(self):
        nrows = 100
        test_df = pd.read_csv("../data/test_preprocessed.csv", nrows=nrows)
        X_test = self.get_x_data(test_df["comment_text"])
        return np.asarray(X_test), test_df
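# Hypothetical usage of the DataLoader above; the hyperparameter values are
# arbitrary choices, and the CSV paths are the ones hard-coded in the class.
loader = DataLoader(vocab_size=100000, embedding_dim=100, max_sequence_len=220)
x_train, y_train = loader.get_train_data()
x_test, test_df = loader.get_test_data()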
class BPEmbEmbedding(AbstractEmbedding):
    def __init__(self, lang: str, dim: int = 300, vs: int = 100000, add_pad_emb: bool = True):
        super().__init__()
        try:
            from bpemb import BPEmb
            self.embedder = BPEmb(lang=lang, dim=dim, vs=vs, add_pad_emb=add_pad_emb,
                                  cache_dir=Path(aikido.cache_root) / "embeddings")
            # with add_pad_emb=True the pad vector is appended last, so its index is vs
            self.embeddings_ = nn.Embedding.from_pretrained(
                tensor(self.embedder.vectors, dtype=torch.float), padding_idx=vs)
            self.dim_ = dim
            self.vs_ = vs
        except ImportError:
            logging.error("-" * 100)
            logging.error("no bpemb installation found. see https://github.com/bheinzerling/bpemb")
            logging.error("-" * 100)

    @property
    def embedding_length(self) -> int:
        return self.dim_

    @property
    def vocabulary_length(self) -> int:
        return self.vs_

    def encode_ids(self, word):
        return self.embedder.encode_ids(word)

    def embed(self, x):
        return self.embeddings_(x)

    def raw_embedding(self) -> nn.Embedding:
        return self.embeddings_
class SubWordVocab(object):
    def __init__(self, size):
        self.encoder = BPEmb(lang='en', vs=size)
        assert self.sos_id == 1
        assert self.eos_id == 2

    def __len__(self):
        return self.encoder.vs

    @property
    def sos_id(self):
        return 1

    @property
    def eos_id(self):
        return self.encoder.EOS

    def encode(self, syms):
        return self.encoder.encode_ids(syms)

    def decode(self, ids):
        syms = self.encoder.decode_ids(ids)
        # decode_ids returns a str for a single sequence; a list means a batch
        # was passed in, which this vocab does not support
        if isinstance(syms, list):
            return ''
        return syms
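# Brief usage sketch (hypothetical): encode to subword ids and decode back.
# The vocab size of 10000 is an arbitrary choice.
vocab = SubWordVocab(10000)
ids = vocab.encode("hello world")   # list of subword ids
text = vocab.decode(ids)            # "hello world" (lowercased by BPEmb)
print(len(vocab), vocab.sos_id, vocab.eos_id)  # 10000 1 2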
class TopicCEncSimpleBPemb(_TopicCBase):
    def __init__(self, embed_size, output_size, enc_hidden_size):
        super(TopicCEncSimpleBPemb, self).__init__()
        print("init: TopicCEncSimpleBPemb model")
        self.embedding_model = BPEmb(dim=embed_size, lang="en", vs=100000)
        self.encoder = nn.GRU(input_size=embed_size,
                              hidden_size=enc_hidden_size,
                              num_layers=1,
                              bidirectional=True)
        self.seq_to_output_map = nn.Linear(2 * enc_hidden_size, output_size, bias=False)

    def embed_sequence(self, sequence: str) -> torch.Tensor:
        v_ids = self.embedding_model.encode_ids(sequence)
        return torch.tensor(self.embedding_model.vectors[v_ids]).to(self._device)

    def create_seq_vecs(
            self, sequences: List[str]
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        # returns the padded seq vector, lengths and original order index
        # sequences are sorted by length, and can be reverted to their original
        # order with the unsorting index vector

        # start by embedding the sequence vectors
        seq_vecs = [self.embed_sequence(s) for s in sequences]
        # sort lengths and set the device
        lengths = torch.tensor([seq_v.shape[0] for seq_v in seq_vecs])
        lengths, sort_i = lengths.sort(descending=True)
        _, orig_i = sort_i.sort()
        # pad the seq vecs and sort by length (dim 1 is the batch dimension)
        seq_vec_pad = rnn.pad_sequence(seq_vecs).to(self._device)
        seq_vec_pad = seq_vec_pad[:, sort_i, :]
        return seq_vec_pad, lengths, orig_i

    def forward(self, sequences: List[str]) -> torch.Tensor:
        # Make the word embeddings for each sequence
        pad_seq_vecs, lengths, orig_i = self.create_seq_vecs(sequences)
        # pack the sequence for the GRU
        # pad_seq_vecs.shape = max_seq_len, batch_size, embedding_dim
        packed_seq_vecs = rnn.pack_padded_sequence(pad_seq_vecs, lengths)
        # run through the GRU
        _, h_n = self.encoder(packed_seq_vecs)
        seq_output = torch.cat((h_n[0], h_n[1]), dim=1)
        output = self.seq_to_output_map(seq_output)
        # sort the output to match the original order
        output = output[orig_i, :]
        return nn.functional.log_softmax(output, dim=1)
class TextEncoder():
    def __init__(self, vocab_path):
        with open(vocab_path, 'rb') as file:
            self.vocab = pickle.load(file)
        self.bpemb_en = BPEmb(lang="en", dim=300, vs=1000)

    def encode(self, text):
        token_ids = self.bpemb_en.encode_ids(text)
        # remap BPEmb subword ids to this project's own vocabulary indices
        ids = np.array([self.vocab[t] for t in token_ids])
        return ids
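# Hypothetical sketch of how the vocab pickle consumed above might be built:
# map each BPEmb subword id seen in a corpus to a dense local index. The corpus
# and the output path are assumptions, not from the original code.
import pickle
from bpemb import BPEmb

bpemb_en = BPEmb(lang="en", dim=300, vs=1000)
corpus = ["first document", "second document"]
seen = {t for text in corpus for t in bpemb_en.encode_ids(text)}
vocab = {bpe_id: i for i, bpe_id in enumerate(sorted(seen))}
with open("vocab.pkl", "wb") as f:
    pickle.dump(vocab, f)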
def test_encoding():
    text = ["This is Stratford", "<pad>"]
    bpemb_en = BPEmb(lang="en", add_pad_emb=True)
    # BPEmb can prepend/append BOS and EOS tokens during encoding, but it does
    # not treat "<pad>" in the input specially: padding has to be done outside
    # the encoder, using the pad index (the last embedding row when
    # add_pad_emb=True).
    print(bpemb_en.encode(text))
    print(bpemb_en.encode_with_eos(text))
    print(bpemb_en.encode_with_bos_eos(text))
    print(bpemb_en.encode_ids(text))
    print(bpemb_en.encode_ids_with_eos(text))
    print(bpemb_en.encode_ids_with_bos_eos(text))
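# Sketch of the manual padding the comment above calls for. With
# add_pad_emb=True the pad vector is appended as the last embedding row, so its
# id equals the vocabulary size (bpemb_en.vs, 10000 for the default English
# model). max_len is an arbitrary choice.
bpemb_en = BPEmb(lang="en", add_pad_emb=True)
pad_id = bpemb_en.vs
max_len = 8
ids = bpemb_en.encode_ids("This is Stratford")
padded = ids[:max_len] + [pad_id] * (max_len - len(ids))
print(padded)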
def build_index(self, meta_parquet_path, index_path, vocab_size=50000, trees=100):
    '''
    Build an Annoy index of metadata titles. Uses BPEmb, so a small vocab size is fine.

    index_path: path with filename, where filename ends with '.ann'
    trees: Number of trees to use for the Annoy index. More is better but slower to build.
    '''
    from bpemb import BPEmb
    from compare_tools.hathimeta import clean_title
    metadf = pd.read_parquet(meta_parquet_path, columns=['htid', 'title'])
    bpemb_en = BPEmb(lang="en", dim=self.dims, vs=vocab_size)

    # Insert vectors for documents into the Annoy index, using the integer from
    # the metadf index as the id
    for i, row in metadf.reset_index().fillna('').astype(str).iterrows():
        bpe_ids = bpemb_en.encode_ids(row.title)
        # Sum over the full title. Imperfect; would work better if the BPE
        # vectors for each word were averaged first.
        vec = bpemb_en.vectors[bpe_ids].sum(0)
        trimmed_bpe_ids = bpemb_en.encode_ids(clean_title(row.title))
        trimmed_vec = bpemb_en.vectors[trimmed_bpe_ids].sum(0)
        # Weighted average, with more weight on the cleaned title.
        weighted = np.average([vec, trimmed_vec], axis=0, weights=[.3, .7])
        self.u.add_item(i, weighted)
        if i % 100000 == 0:
            print(i, end=',')
    print()
    # will take about 30m for 100 dims and 8M titles
    self.u.build(trees)
    self.u.save(index_path)
    metadf.reset_index()['htid'].to_csv(self.index_reference_path, compression='gzip')
class TweetTokenizer():
    def __init__(self, dim=50, vocab_size=10000, mode='get_id'):
        self.dim = dim
        self.vocab_size = vocab_size
        self.bpemb_en = BPEmb(lang="en", dim=dim, vs=vocab_size)
        self.embedding_weight = self.bpemb_en.vectors
        self.mode = mode

    def __call__(self, tweet, mode='get_id'):
        if mode == 'get_id':
            return torch.tensor(self.bpemb_en.encode_ids(tweet), dtype=torch.long)
        elif mode == 'raw':
            return self.bpemb_en.encode(tweet)
        else:
            raise ValueError('Invalid mode')
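# Hypothetical usage of TweetTokenizer; the sample tweet is arbitrary.
tokenizer = TweetTokenizer(dim=50, vocab_size=10000)
ids = tokenizer("hello twitter")                  # LongTensor of subword ids
pieces = tokenizer("hello twitter", mode='raw')   # list of subword strings
weights = tokenizer.embedding_weight              # (10000, 50) pretrained matrix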
class BPembTokenizer(Tokenizer):
    def __init__(self, vocab_size=50000, emb_dim=300, lang='en'):
        super(BPembTokenizer, self).__init__()
        from bpemb import BPEmb
        self.bpemb_en = BPEmb(lang=lang, vs=vocab_size, dim=emb_dim)

    def get_embeddings(self):
        return self.bpemb_en.vectors

    def encode_ids(self, text):
        return self.bpemb_en.encode_ids(text)

    def decode_ids(self, ids):
        return self.bpemb_en.decode_ids(ids)

    def tokenize(self, text):
        return self.bpemb_en.encode(text)
class LoadedBPEEmbeddingTextVectorizer(LoadedTextVectorizer):
    def __init__(self, predictor_config):
        predictor_config = predictor_config['vectorizer']
        self.bpemb = BPEmb(lang='en',
                           dim=predictor_config['embedding_dim'],
                           vs=predictor_config['max_vocab_size'],
                           add_pad_emb=True)
        self.max_seq_len = predictor_config['max_seq_len']

    def get_cutoff_ratios(self, texts: List[str]) -> List[float]:
        # a ratio above 1 means the encoded text will be truncated to max_seq_len
        sequences = self.bpemb.encode_ids(texts)
        return [len(sequence) / self.max_seq_len for sequence in sequences]

    def vectorize(self, texts: List[str]):
        vectorized = _vectorize_padded(bpemb=self.bpemb,
                                       max_seq_len=self.max_seq_len,
                                       texts=texts)
        cut_off_ratios = self.get_cutoff_ratios(texts)
        return vectorized, cut_off_ratios
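# _vectorize_padded is referenced above but not shown; this is a hypothetical
# sketch of what it might do under the same conventions: encode, truncate to
# max_seq_len, and pad with the pad id (the last row when add_pad_emb=True).
import numpy as np

def _vectorize_padded(bpemb, max_seq_len, texts):
    pad_id = bpemb.vs  # index of the appended <pad> embedding
    rows = []
    for ids in bpemb.encode_ids(texts):
        ids = ids[:max_seq_len]
        rows.append(ids + [pad_id] * (max_seq_len - len(ids)))
    return np.array(rows)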
class StatusDataset(Dataset):
    def __init__(self, path=config.path_to_data, mode='train'):
        self.path_to_data = path
        self.mode = mode
        print(f"Loading {self.mode} data...")
        self.data = self.read_data()
        self.preprocess_data()
        self.bpemb_ru = BPEmb(lang="ru", dim=300, vs=50000)
        self.placeholder = torch.zeros(config.max_seq_length, dtype=torch.long)

    def read_data(self):
        if self.mode == 'train':
            data = pd.read_csv(self.path_to_data)
            data = data[data.isTest == 0][['text_orig', 'label']]
        elif self.mode == 'val':
            data = pd.read_csv(self.path_to_data)
            data = data[data.isTest == 1][['text_orig', 'label']]
        elif self.mode == 'test':
            data = pd.read_parquet(self.path_to_data)
        return data

    def preprocess_data(self):
        self.data['text_orig'] = self.data['text_orig'].map(self.remove_urls)
        self.data = shuffle(self.data)
        self.data.reset_index(drop=True, inplace=True)

    def remove_urls(self, v_text):
        v_text = re.sub(r"(/[\w\-?=$&:;#@/]+)", '', v_text, flags=re.MULTILINE)
        v_text = re.sub(r'(https?:[/.]*)', '', v_text, flags=re.MULTILINE)
        return v_text

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, idx):
        text = self.data['text_orig'][idx]
        label = self.data['label'][idx]
        ids_tokens = self.bpemb_ru.encode_ids(text)
        placeholder = self.placeholder.clone()
        # copy at most max_seq_length ids into the fixed-size zero tensor
        placeholder[:len(ids_tokens)] = torch.tensor(ids_tokens)[:config.max_seq_length]
        return placeholder, torch.tensor(label)
def dump(self, dump_path, bpe: BPEmb, max_sents=1, shuffle=0):
    with ExitStack() as stack:
        article_content_file = stack.enter_context(
            open(dump_path + '.content', "w", encoding='utf-8'))
        article_label_file = stack.enter_context(
            open(dump_path + '.labels', "w", encoding='utf-8'))

        if shuffle:
            indices = (ind for ind in
                       np.random.RandomState(seed=shuffle).permutation(len(self.articles)))
            print("Write dataset to file (shuffled)")
        else:
            indices = range(len(self.articles))
            print("Write dataset to file (no shuffling)")

        for ind in indices:
            article = self.articles[ind]
            sent_cnt = min(max_sents, len(article.sentences))
            flat_sents = list(itertools.chain.from_iterable(article.sentences[:sent_cnt]))
            sent_bpe_ids = [str(bpe_id) for bpe_id in bpe.encode_ids(" ".join(flat_sents))]
            label_ids = [str(self.label_dict.null_index)] * len(sent_bpe_ids)
            article_content_file.write(" ".join(sent_bpe_ids) + '\n')
            article_label_file.write(" ".join(label_ids) + '\n')
def dump(self, dump_path, bpe: BPEmb, shuffle=0):
    with ExitStack() as stack:
        boxes_content_file = stack.enter_context(
            open(dump_path + '.content', "w", encoding='utf-8'))
        boxes_label_file = stack.enter_context(
            open(dump_path + '.labels', "w", encoding='utf-8'))
        boxes_positions_file = stack.enter_context(
            open(dump_path + '.pos', "w", encoding='utf-8'))

        if shuffle:
            indices = (ind for ind in
                       np.random.RandomState(seed=shuffle).permutation(len(self.infoboxes)))
            print("Write dataset to file (shuffled)")
        else:
            indices = range(len(self.infoboxes))
            print("Write dataset to file (no shuffling)")

        for ind in indices:
            infobox = self.infoboxes[ind]
            box_content = []
            box_labels = []
            box_positions = []
            for record in infobox.records:
                rec_content = " ".join(record.content)
                rec_bpe_ids = [str(bpe_id) for bpe_id in bpe.encode_ids(rec_content)]
                num_tokens = len(rec_bpe_ids)
                if record.field_label in self.label_dict.word2id:
                    rec_label_id = self.label_dict.word2id[record.field_label]
                else:
                    print("Unknown field label %s" % record.field_label)
                    rec_label_id = self.label_dict.unk_index
                label_ids = [str(rec_label_id)] * num_tokens
                positions = [str(num + 1) for num in range(num_tokens)]
                box_content.extend(rec_bpe_ids)
                box_labels.extend(label_ids)
                box_positions.extend(positions)
            boxes_content_file.write(" ".join(box_content) + '\n')
            boxes_label_file.write(" ".join(box_labels) + '\n')
            boxes_positions_file.write(" ".join(box_positions) + '\n')
class DatasetBase(ABC):
    is_multilingual = False

    def __init__(self, conf, lang, bert=None):
        self.conf = conf
        self.lang = lang
        self.bert = bert
        self.device = torch.device(f"cuda:{conf.gpu_id}")
        self.name = conf.dataset
        self.tag = conf.tag
        self.batch_size = conf.batch_size
        self.eval_batch_size = conf.eval_batch_size
        self.examples_to_print = conf.n_examples
        if self.conf.tag_scheme:
            self.convert_tags = iob_to[self.tag_scheme]
        self.load_data_raw()
        self.NO_TAG = "NO_TAG"
        tags = self.get_tags()
        print(Counter(tags).most_common())
        shapes = self.get_shapes()
        char_enc = None
        if conf.char_enc_file:
            assert Path(conf.char_enc_file).exists()
            char_enc = joblib.load(conf.char_enc_file)
        if self.name.endswith("multi_finetune"):
            assert char_enc
        if char_enc:
            self.char_enc = char_enc
        else:
            chars = self.get_chars()
            self.char_enc = LabelEncoder(
                to_torch=True, device=self.device).fit(chars)
        tag_enc = None
        if conf.tag_enc_file:
            assert Path(conf.tag_enc_file).exists()
            tag_enc = joblib.load(conf.tag_enc_file)
        if tag_enc:
            self.tag_enc = tag_enc
        else:
            self.tag_enc = LabelEncoder(
                to_torch=True, device=self.device).fit(tags)
        self.shape_enc = LabelEncoder(
            to_torch=True, device=self.device).fit(shapes)
        self.bpemb = BPEmb(
            lang=conf.bpemb_lang,
            vs=conf.vocab_size,
            dim=conf.bpemb_dim,
            add_pad_emb=True)
        if conf.use_fasttext:
            f = conf.fasttext_emb_file.format(dataset=self.name, lang=lang)
            self.fasttext_emb = load_word2vec_file(f, add_unk=True)
        self.pad_idx = self.bpemb.emb.key_to_index["<pad>"]
        if not conf.no_dataset_tensorize:
            self.tensorize()

    @abstractmethod
    def load_data_raw(self):
        pass

    @abstractmethod
    def get_chars(self):
        pass

    @abstractmethod
    def get_tags(self):
        pass

    @abstractmethod
    def get_shapes(self):
        pass

    @abstractmethod
    def tensorize(self):
        pass

    def tensorize_sent(self, sent):
        tags_str = [token[self.tag] or self.NO_TAG for token in sent]
        tags = self.tag_enc.transform(tags_str)
        tokens = [token["form"] for token in sent]
        token_shape = self.shape_enc.transform(token_shapes(tokens))
        bpe_ids = [
            self.bpemb.encode_ids([token["form"]])[0] for token in sent]
        bpe_token_start_mask = self.start_mask(bpe_ids)
        bpe_token_end_mask = self.end_mask(bpe_ids)
        bpe_ids = tensor(list(flatten(bpe_ids))).to(device=self.device)
        assert bpe_token_start_mask.shape == bpe_ids.shape
        assert bpe_token_start_mask.sum().item() == len(tags)
        assert bpe_token_end_mask.shape == bpe_ids.shape
        assert bpe_token_end_mask.sum().item() == len(tags)
        try:
            chars = self.char_enc.transform([
                [char for char in token["form"]] for token in sent])
        except ValueError as e:
            print(e)
            return None
        char_token_start_mask = self.start_mask(chars)
        char_token_end_mask = self.end_mask(chars)
        chars = tensor(list(flatten(chars))).to(device=self.device)
        char_token, char_token_len = self.sub_token_and_len(
            chars, char_token_start_mask)
        bpe_token, bpe_token_len = self.sub_token_and_len(
            bpe_ids, bpe_token_start_mask)
        tensorized = {
            "token": tokens,
            "tag": tags,
            "token_shape": token_shape,
            "bpe": bpe_ids,
            "bpe_token": bpe_token,
            "bpe_token_len": bpe_token_len,
            "bpe_token_start_mask": bpe_token_start_mask,
            "bpe_token_end_mask": bpe_token_end_mask,
            "char": chars,
            "char_token_start_mask": char_token_start_mask,
            "char_token_end_mask": char_token_end_mask,
            "char_token": char_token,
            "char_token_len": char_token_len,
        }
        if hasattr(self, "fasttext_emb"):
            tensorized["fasttext"] = tensor(
                to_word_indexes(
                    [token["form"].lower() for token in sent],
                    self.fasttext_emb,
                    unk="<unk>")).to(device=self.device)
        if self.bert is not None:
            try:
                tensorized["bert_ids"], \
                    tensorized["bert_mask"], \
                    tensorized["bert_token_starts"] = \
                    self.bert.subword_tokenize_to_ids(tokens)
                assert len(tensorized["bert_ids"]) <= self.conf.bert_max_seq_len
                if self.examples_to_print > 0:
                    print(tokens)
                    print(
                        self.bert.model_name,
                        self.bert.subword_tokenize(tokens))
                    self.examples_to_print -= 1
            except AssertionError as e:
                print(e)
                return None
            # TODO: ta (Tamil) WikiAnn has weird whitespace characters that
            # are treated differently by the BERT tokenizer, leading to
            # mismatches in tag and token counts
            if len(tags) != tensorized["bert_token_starts"].sum():
                print("Skipping instance with inconsistent tokenization:")
                print(" ## ".join(tags_str))
                print(" ## ".join(tokens))
                return None
        return tensorized

    @staticmethod
    def start_mask(subsegments):
        mask = list(flatten(
            [[1] + [0] * (len(ids) - 1) for ids in subsegments]))
        return tensor(mask).cuda().byte()

    @staticmethod
    def end_mask(subsegments):
        mask = list(flatten(
            [([0] * (len(ids) - 1)) + [1] for ids in subsegments]))
        return tensor(mask).cuda().byte()

    @staticmethod
    def sub_token_and_len(sub, sub_token_mask):
        char_token_start = sub_token_mask.nonzero().squeeze(1)
        char_token_end = cat([
            char_token_start[1:],
            tensor([sub_token_mask.size(0)]).to(char_token_start)])
        char_token = [
            sub[s:e] for s, e in zip(char_token_start, char_token_end)]
        char_token_len = char_token_end - char_token_start
        return char_token, char_token_len

    def token_texts(self, split_name):
        split = getattr(self, split_name)
        return [instance["token"] for instance in split]

    def assert_batch_size(self):
        if not hasattr(self, "batch_size"):
            raise ValueError(
                "Need to set batch_size before calling train_loader")

    def assert_eval_batch_size(self):
        if not hasattr(self, "eval_batch_size"):
            raise ValueError(
                "Need to set eval_batch_size before calling "
                "dev_loader or test_loader")

    def loader(self, dataset, **kwargs):
        return DataLoader(dataset, collate_fn=collate_fn, **kwargs)
class Predictor(PredictorBase):
    def __init__(self, config):
        super(Predictor, self).__init__(config)
        self.config = config
        self.model = None
        self.sess = None
        # self.builder = tf.saved_model.builder.SavedModelBuilder("savedModel")

        if self.config["use_bpe"]:
            self.bpe_zh = BPEmb(lang="zh", vs=config["vocab_size"])
        else:
            # load the vocabulary
            self.word_to_idx = self.load_vocab()
            self.idx_to_label = {value: key for key, value in self.word_to_idx.items()}

        # initialize the model
        self.create_model()
        print("load model finished")
        # load the computation graph
        self.load_graph()
        print("load graph finished")

    def load_vocab(self):
        # load the word-to-index mapping
        with open(os.path.join(self.output_path, "word_to_index.pkl"), "rb") as f:
            word_to_index = pickle.load(f)
        return word_to_index

    def sentence_to_encode(self, sentence):
        """
        Encode a sentence into model input.
        :return:
        """
        if not sentence:
            return None
        if len(sentence) > 20:
            return None
        if self.config["use_bpe"]:
            word_idx = self.bpe_zh.encode_ids(sentence)
            # shift all ids by 1 because index 0 is reserved for <pad>
            word_idx = list(map(lambda x: x + 1, word_idx))
        else:
            word_idx = [self.word_to_idx.get(token, self.word_to_idx["UNK"])
                        for token in sentence]
        new_word_idx = self.process_data(word_idx)
        return new_word_idx

    @staticmethod
    def process_data(sentence):
        """
        Preprocess the data.
        :param sentence:
        :return:
        """
        encoder_inputs = [sentence]
        return dict(encoder_inputs=encoder_inputs)

    def response(self, tokens_list):
        sents = []
        for i in range(self.config["beam_size"]):
            sent_token = tokens_list[:, i]
            if self.config["use_bpe"]:
                # undo the +1 id shift before decoding
                sent = self.bpe_zh.decode_ids(list(map(lambda x: x - 1, sent_token)))
            else:
                sent = "".join([self.idx_to_label[token] for token in sent_token])
            sents.append(sent)
        return sents

    def create_model(self):
        """
        Select the model named in the config file and initialize it.
        :return:
        """
        if self.config["model_name"] == "seq2seq_lstm":
            self.model = Seq2SeqTransformer(config=self.config,
                                            vocab_size=len(self.word_to_idx),
                                            word_vectors=None)
        if self.config["model_name"] == "seq2seq_bilstm":
            self.model = Seq2SeqBiLstmModel(config=self.config,
                                            vocab_size=len(self.word_to_idx),
                                            word_vectors=None)

    def load_graph(self):
        """
        Load the computation graph.
        :return:
        """
        self.sess = tf.Session()
        ckpt = tf.train.get_checkpoint_state(
            os.path.join(os.path.abspath(os.path.dirname(os.getcwd())),
                         self.config["ckpt_model_path"]))
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
            print('Reloading model parameters..')
            self.model.saver.restore(self.sess, ckpt.model_checkpoint_path)
        else:
            raise ValueError('No such file:[{}]'.format(self.config["ckpt_model_path"]))

        # inputs = {"inputs": tf.saved_model.utils.build_tensor_info(self.model.encoder_inputs),
        #           "inputs_length": tf.saved_model.utils.build_tensor_info(self.model.encoder_inputs_length),
        #           "keep_prob": tf.saved_model.utils.build_tensor_info(self.model.keep_prob)}
        #
        # outputs = {"predictions": tf.saved_model.utils.build_tensor_info(self.model.predictions)}
        #
        # prediction_signature = tf.saved_model.signature_def_utils.build_signature_def(
        #     inputs=inputs, outputs=outputs,
        #     method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME)
        # legacy_init_op = tf.group(tf.tables_initializer(), name="legacy_init_op")
        # self.builder.add_meta_graph_and_variables(
        #     self.sess, [tf.saved_model.tag_constants.SERVING],
        #     signature_def_map={"dialogue": prediction_signature},
        #     legacy_init_op=legacy_init_op)
        # self.builder.save()

    def predict(self, sentence):
        """
        Predict the response for a single sentence.
        :return:
        """
        sentence_ids = self.sentence_to_encode(sentence)
        prediction_ = self.model.infer(sentence_ids["encoder_inputs"])
        prediction = self.sess.run(prediction_)
        print(prediction.shape)
        response = self.response(prediction)
        return response
class seq2seqDataset(data.Dataset):
    def __init__(self, root_path, seq_length, embed_dim, embed_vec_space):
        """Initialize the dataset"""
        self.root_path = root_path
        self.seq_length = seq_length
        self.tokenized_data = self.tokenize(root_path)
        self.embed = BPEmb(lang="en", vs=embed_vec_space, add_pad_emb=True)
        self.pad = 1002
        self.sos = 1001
        self.eos = 1000
        self.augmentator = nac.KeyboardAug(aug_char_min=0,
                                           aug_char_p=0.4,
                                           aug_word_p=0.5,
                                           aug_word_min=0,
                                           aug_word_max=self.seq_length // 5,
                                           special_char=False)

    def tokenize(self, root_path):
        with open(root_path, 'r') as f:
            text = f.read()
        lt = len(text)
        print(lt)
        splitted = []
        start = 0
        while True:
            flag = (1 if self.seq_length >= lt - start else 0)
            if lt <= start:
                break
            cur_text = text[start:start + min(self.seq_length, lt - start)]
            last_chunk_len = None
            try:
                # length of the trailing (possibly cut-off) word; 0 if the
                # chunk ends on whitespace
                last_chunk_len = (0 if cur_text[-1].isspace()
                                  else len(cur_text.split()[-1]))
            except IndexError:
                print(cur_text)
                start += self.seq_length
                continue
            start += self.seq_length - last_chunk_len
            if last_chunk_len == len(cur_text.strip()):
                start += self.seq_length
                continue
            st = cur_text[:-last_chunk_len].strip()
            if st == st.swapcase():
                # skip chunks that contain no letters at all
                start += self.seq_length
                continue
            splitted.append(st)
            if flag:
                break
        return splitted

    # def one_hot_encode(self, arr, n_labels):
    #     one_hot = np.zeros((arr.size, n_labels), dtype=np.float32)
    #     # Fill the appropriate elements with ones
    #     one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1.
    #     # Finally reshape it to get back to the original array
    #     one_hot = one_hot.reshape((*arr.shape, n_labels))
    #     return one_hot

    def augment(self, seq):
        # apply random keyboard-typo noise to the sequence
        return self.augmentator.augment(seq)

    def get_encodes(self, seq, use_aug=False):
        if use_aug:
            seq = self.augment(seq)

        def padded_encode(x):
            res = np.full((self.seq_length, self.embed.dim), self.embed['<pad>'])
            res1hot = np.full((self.seq_length), self.pad, dtype=np.int32)
            enc = self.embed.encode_ids(x)
            res[:len(enc)] = self.embed.vectors[enc]
            res1hot[:len(enc)] = np.array(enc)
            length = len(enc)
            res = np.insert(res, 0, np.full((self.embed.dim), self.sos), 0)
            length += 1
            if use_aug:
                res = np.insert(res, length, np.full((self.embed.dim), self.eos), 0)
                length += 1
                return length, res
            res1hot = np.insert(res1hot, length, self.eos, 0)
            return length, res, res1hot

        return padded_encode(seq)

    def get_params(self):
        return None

    def __len__(self):
        return len(self.tokenized_data)

    def __getitem__(self, index):
        sequence = self.tokenized_data[index]
        lengths_x, X = self.get_encodes(sequence, use_aug=True)
        lengths_y, Y, y1hot = self.get_encodes(sequence)
        return (lengths_x, torch.from_numpy(X),
                lengths_y, torch.from_numpy(Y), torch.from_numpy(y1hot))
class BytePairFeaturizer(DenseFeaturizer, GraphComponent):
    @classmethod
    def required_components(cls) -> List[Type]:
        """Components that should be included in the pipeline before this component."""
        return [Tokenizer]

    @staticmethod
    def required_packages() -> List[Text]:
        """Any extra python dependencies required for this component to run."""
        return ["bpemb"]

    @staticmethod
    def get_default_config() -> Dict[Text, Any]:
        """Returns the component's default config."""
        return {
            **DenseFeaturizer.get_default_config(),
            # specifies the language of the subword segmentation model
            "lang": None,
            # specifies the dimension of the subword embeddings
            "dim": None,
            # specifies the vocabulary size of the segmentation model
            "vs": None,
            # if set to True and the given vocabulary size can't be loaded for
            # the given model, the closest size is chosen
            "vs_fallback": True,
        }

    def __init__(
        self,
        config: Dict[Text, Any],
        name: Text,
    ) -> None:
        """Constructs a new byte pair vectorizer."""
        super().__init__(name, config)
        # The configuration dictionary is saved in `self._config` for reference.
        self.model = BPEmb(
            lang=self._config["lang"],
            dim=self._config["dim"],
            vs=self._config["vs"],
            vs_fallback=self._config["vs_fallback"],
        )

    @classmethod
    def create(
        cls,
        config: Dict[Text, Any],
        model_storage: ModelStorage,
        resource: Resource,
        execution_context: ExecutionContext,
    ) -> GraphComponent:
        """Creates a new component (see parent class for full docstring)."""
        return cls(config, execution_context.node_name)

    def process(self, messages: List[Message]) -> List[Message]:
        """Processes incoming messages and computes and sets features."""
        for message in messages:
            for attribute in DENSE_FEATURIZABLE_ATTRIBUTES:
                self._set_features(message, attribute)
        return messages

    def process_training_data(self, training_data: TrainingData) -> TrainingData:
        """Processes the training examples in the given training data in-place."""
        self.process(training_data.training_examples)
        return training_data

    def _create_word_vector(self, document: Text) -> np.ndarray:
        """Creates a word vector from a text. Utility method."""
        encoded_ids = self.model.encode_ids(document)
        if encoded_ids:
            return self.model.vectors[encoded_ids[0]]
        return np.zeros((self._config["dim"],), dtype=np.float32)

    def _set_features(self, message: Message, attribute: Text = TEXT) -> None:
        """Sets the features on a single message. Utility method."""
        tokens = message.get(TEXT_TOKENS)
        # If the message doesn't have tokens, we can't create features.
        if not tokens:
            return None
        # We need to reshape here such that the shape is equivalent to that of
        # sparsely generated features. Without it, it'd be a 1D tensor.
        # We need 2D (n_utterance, n_dim).
        text_vector = self._create_word_vector(document=message.get(TEXT)).reshape(1, -1)
        word_vectors = np.array(
            [self._create_word_vector(document=t.text) for t in tokens]
        )
        final_sequence_features = Features(
            word_vectors,
            FEATURE_TYPE_SEQUENCE,
            attribute,
            self._config[FEATURIZER_CLASS_ALIAS],
        )
        message.add_features(final_sequence_features)
        final_sentence_features = Features(
            text_vector,
            FEATURE_TYPE_SENTENCE,
            attribute,
            self._config[FEATURIZER_CLASS_ALIAS],
        )
        message.add_features(final_sentence_features)

    @classmethod
    def validate_config(cls, config: Dict[Text, Any]) -> None:
        """Validates that the component is configured properly."""
        if not config["lang"]:
            raise ValueError("BytePairFeaturizer needs language setting via `lang`.")
        if not config["dim"]:
            raise ValueError(
                "BytePairFeaturizer needs dimensionality setting via `dim`."
            )
        if not config["vs"]:
            raise ValueError("BytePairFeaturizer needs a vector size setting via `vs`.")
class BytePairFeaturizer(DenseFeaturizer):
    """This component adds BPEmb features."""

    @classmethod
    def required_components(cls) -> List[Type[Component]]:
        return [Tokenizer]

    @classmethod
    def required_packages(cls) -> List[Text]:
        return ["bpemb"]

    defaults = {
        # specifies the language of the subword segmentation model
        "lang": None,
        # specifies the dimension of the subword embeddings
        "dim": None,
        # specifies the vocabulary size of the segmentation model
        "vs": None,
        # if set to True and the given vocabulary size can't be loaded for the
        # given model, the closest size is chosen
        "vs_fallback": True,
        # specifies the folder in which downloaded BPEmb files will be cached
        "cache_dir": str(Path.home() / Path(".cache/bpemb")),
        # specifies the path to a custom SentencePiece model file
        "model_file": None,
        # specifies the path to a custom embedding file
        "emb_file": None,
    }

    language_list = [
        "mt", "sd", "cr", "ba", "ht", "scn", "bi", "stq", "sm", "diq", "no", "yi",
        "vec", "bug", "am", "tl", "mn", "atj", "ko", "mai", "lij", "tcy", "sl", "bn",
        "dv", "rm", "ng", "ml", "kg", "koi", "war", "et", "mhr", "als", "bar", "ii",
        "sco", "got", "pnb", "ss", "bpy", "tum", "ru", "qu", "hy", "tw", "bm", "vep",
        "dty", "udm", "gd", "lbe", "rmy", "azb", "kw", "ja", "wuu", "pag", "ro", "tet",
        "ee", "min", "su", "ha", "glk", "pcd", "tk", "nrm", "ku", "gn", "ty", "bh",
        "pap", "fr", "ia", "cs", "ky", "ff", "kab", "rn", "csb", "tt", "cy", "ilo",
        "kaa", "hif", "ak", "pa", "crh", "ti", "myv", "ur", "se", "uz", "cdo", "lez",
        "srn", "kk", "pih", "de", "an", "tyv", "ext", "gan", "wo", "si", "lmo", "hak",
        "az", "ka", "ik", "frr", "hsb", "ho", "af", "nds", "pam", "el", "fur", "cu",
        "hr", "my", "nl", "da", "ch", "vls", "es", "as", "lt", "ny", "so", "oc",
        "lad", "pnt", "ms", "bcl", "os", "co", "ks", "or", "ay", "wa", "nah", "fa",
        "pl", "mzn", "za", "th", "fj", "kbp", "be", "zh", "ce", "sh", "sr", "id",
        "chy", "ps", "lo", "tr", "st", "he", "ang", "sah", "io", "gom", "ki", "sn",
        "kbd", "jam", "bo", "pms", "sk", "kv", "ckb", "nv", "dsb", "zea", "xmf", "fi",
        "ltg", "ksh", "ve", "new", "na", "jv", "tn", "sw", "rw", "ln", "bs", "gag",
        "ab", "olo", "is", "bjn", "ceb", "om", "vi", "ast", "uk", "mg", "mwl", "arz",
        "li", "mrj", "yo", "frp", "gl", "la", "km", "sv", "nap", "jbo", "bxr", "gv",
        "br", "fo", "ug", "pi", "bg", "ie", "din", "sa", "pdc", "cho", "lb", "ig",
        "aa", "sc", "fy", "kj", "eo", "eu", "kl", "sq", "to", "mi", "tpi", "kr",
        "hi", "arc", "ga", "nov", "mdf", "vo", "pfl", "rue", "haw", "kn", "mh", "mr",
        "te", "ca", "ace", "cv", "zu", "it", "iu", "av", "sg", "hz", "lv", "ts",
        "lrc", "ar", "hu", "nn", "nso", "krc", "mk", "tg", "ne", "dz", "ta", "mus",
        "ady", "en", "lg", "xal", "gu", "pt", "xh", "szl", "chr",
    ]

    def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None:
        super().__init__(component_config)
        model_file, emb_file = (self.component_config[k]
                                for k in ["model_file", "emb_file"])
        if model_file:
            if not os.path.exists(model_file):
                raise FileNotFoundError(
                    f"BytePair model {model_file} not found. Please check config."
                )
        if emb_file:
            if not os.path.exists(emb_file):
                raise FileNotFoundError(
                    f"BytePair embedding file {emb_file} not found. Please check config."
                )
        if not self.component_config["lang"]:
            raise ValueError(
                "You must specify the `lang` parameter for BytePairEmbedding in `config.yml`."
            )
        if not self.component_config["vs"]:
            raise ValueError(
                "You must specify the `vs` parameter for BytePairEmbedding in `config.yml`."
            )
        if not self.component_config["dim"]:
            raise ValueError(
                "You must specify the `dim` parameter for BytePairEmbedding in `config.yml`."
            )
        self.model = BPEmb(
            lang=self.component_config["lang"],
            dim=self.component_config["dim"],
            vs=self.component_config["vs"],
            vs_fallback=self.component_config["vs_fallback"],
            cache_dir=self.component_config["cache_dir"],
            model_file=self.component_config["model_file"],
            emb_file=self.component_config["emb_file"],
        )

    def train(
        self,
        training_data: TrainingData,
        config: Optional[RasaNLUModelConfig] = None,
        **kwargs: Any,
    ) -> None:
        for example in training_data.intent_examples:
            for attribute in DENSE_FEATURIZABLE_ATTRIBUTES:
                self.set_bpemb_features(example, attribute)

    def create_word_vector(self, document: Text) -> np.ndarray:
        encoded_ids = self.model.encode_ids(document)
        if encoded_ids:
            return self.model.vectors[encoded_ids[0]]
        return np.zeros((self.component_config["dim"],), dtype=np.float32)

    def set_bpemb_features(self, message: Message, attribute: Text = TEXT) -> None:
        tokens = message.get(TOKENS_NAMES[attribute])
        if not tokens:
            return None
        # We need to reshape here such that the shape is equivalent to that of
        # sparsely generated features. Without it, it'd be a 1D tensor.
        # We need 2D (n_utterance, n_dim).
        text_vector = self.create_word_vector(document=message.get(TEXT)).reshape(1, -1)
        word_vectors = np.array(
            [self.create_word_vector(document=t.text) for t in tokens])
        final_sequence_features = Features(
            word_vectors,
            FEATURE_TYPE_SEQUENCE,
            attribute,
            self.component_config[FEATURIZER_CLASS_ALIAS],
        )
        message.add_features(final_sequence_features)
        final_sentence_features = Features(
            text_vector,
            FEATURE_TYPE_SENTENCE,
            attribute,
            self.component_config[FEATURIZER_CLASS_ALIAS],
        )
        message.add_features(final_sentence_features)

    def process(self, message: Message, **kwargs: Any) -> None:
        self.set_bpemb_features(message)

    def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]]:
        pass

    @classmethod
    def load(
        cls,
        meta: Dict[Text, Any],
        model_dir: Optional[Text] = None,
        model_metadata: Optional["Metadata"] = None,
        cached_component: Optional["Component"] = None,
        **kwargs: Any,
    ) -> "Component":
        if cached_component:
            return cached_component
        return cls(meta)
class BpeTrainData(object):
    def __init__(self, config):
        self._train_data_path = os.path.join(
            os.path.abspath(os.path.dirname(os.getcwd())), config["train_data"])
        self._output_path = os.path.join(
            os.path.abspath(os.path.dirname(os.getcwd())), config["output_path"])
        if not os.path.exists(self._output_path):
            os.makedirs(self._output_path)
        self._embedding_size = config["embedding_size"]  # length of the word vectors
        self.bpe_zh = BPEmb(lang="zh", vs=config["vocab_size"])
        self.vocab_size = None
        self.word_vectors = None
        self.pad_token = 0
        self.go_token = 2
        self.eos_token = 3

    def read_data(self):
        """
        Read the data.
        :return: the tokenized dialogue pairs, questions, responses = [[]]
        """
        with open(self._train_data_path, "r", encoding="utf8") as f:
            requests = []
            responses = []
            for line in f.readlines():
                request, response = line.strip().split("<SEP>")
                requests.append(request.strip())
                responses.append(response.strip())
        return requests, responses

    def get_word_vectors(self):
        """
        Load the word vectors directly from the pretrained BPE model and build
        the corresponding embedding matrix.
        :return:
        """
        vocab = self.bpe_zh.words
        # the BPE model does not contain <pad>, so add a <pad> token at position 0
        # (matching the pad vector stacked at row 0 below and the +1 id shift)
        vocab.insert(0, "<pad>")
        vectors = self.bpe_zh.vectors
        print(vectors.shape)
        pad_vector = np.random.randn(self._embedding_size)
        word_vectors = np.vstack((pad_vector, vectors))
        return vocab, word_vectors

    def trans_to_index(self, data):
        """
        Convert the input into index representation.
        :param data: the questions and responses
        :return:
        """
        data_ids = []
        for sentence in data:
            token_ids = self.bpe_zh.encode_ids(sentence)
            # a <pad> token was added at position 0 of the BPE vocab, so the
            # BPE ids now start from 1; shift all ids by 1 accordingly
            token_ids = list(map(lambda x: x + 1, token_ids))
            data_ids.append(token_ids)
        return data_ids

    def padding(self, batch):
        """
        Pad every batch to the length of its longest sentence.
        :param batch:
        :return:
        """
        question_length = [len(sample[0]) for sample in batch]
        max_question_length = max(question_length)
        questions = [sample[0] + [self.pad_token] * (max_question_length - len(sample[0]))
                     for sample in batch]

        # append an end-of-sequence token <eos> to each response first
        responses = [sample[1] + [self.eos_token] for sample in batch]
        response_length = [len(response) for response in responses]
        max_response_length = max(response_length)
        # pad responses to the maximum length
        pad_responses = [response + [self.pad_token] * (max_response_length - len(response))
                         for response in responses]

        return dict(questions=questions,
                    responses=pad_responses,
                    question_length=question_length,
                    response_length=response_length)

    def gen_data(self):
        """
        Generate data that can be fed to the model.
        :return:
        """
        # if the data has been preprocessed before, just load it
        if os.path.exists(os.path.join(self._output_path, "train_data.pkl")):
            print("load existed train data")
            with open(os.path.join(self._output_path, "train_data.pkl"), "rb") as f:
                train_data = pickle.load(f)
            return train_data

        # 1. read the raw data
        questions, responses = self.read_data()

        # 2. build the vocabulary and the word vectors
        vocab, word_vectors = self.get_word_vectors()
        self.vocab_size = len(vocab)
        self.word_vectors = word_vectors

        # 3. convert the input to indices
        questions_idx = self.trans_to_index(questions)
        responses_idx = self.trans_to_index(responses)

        # generate the data and save it
        train_data = [[questions_idx[i], responses_idx[i]]
                      for i in range(len(questions_idx))]
        with open(os.path.join(self._output_path, "train_data.pkl"), "wb") as fw:
            pickle.dump(train_data, fw)
        return train_data

    def next_batch(self, data, batch_size):
        """
        Yield batches of data.
        :param data: the input
        :param batch_size: size of each batch
        :return:
        """
        random.shuffle(data)
        batch_num = len(data) // batch_size
        for i in range(batch_num):
            batch_data = data[batch_size * i: batch_size * (i + 1)]
            new_batch = self.padding(batch_data)
            yield new_batch
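# Minimal standalone sketch (not from the original repo) of the id-shift
# convention used above: <pad> occupies index 0, so BPE ids are shifted by +1
# on encode and by -1 before decode.
from bpemb import BPEmb

bpe_zh = BPEmb(lang="zh", vs=50000)
ids = [i + 1 for i in bpe_zh.encode_ids("你好世界")]
print(bpe_zh.decode_ids([i - 1 for i in ids]))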
class Model():
    '''
    Model class: loads the saved model and restores punctuation in a sentence.
    '''

    def __init__(self, export_dir, vocab_size=5000, emb_dim=200, dict_punct=None):
        self.vocab_size = vocab_size
        self.emb_dim = emb_dim
        self.bpemb_ru = BPEmb(lang='ru', vs=vocab_size, dim=emb_dim)
        self.export_dir = export_dir
        self.predict_fn = predictor.from_saved_model(export_dir)
        if dict_punct is None:
            # maps predicted punctuation classes to the BPE ids of the
            # corresponding punctuation symbols
            self.d = {
                1: 4922,
                2: 4921,
                3: 4978,
                4: 4985,
                5: 4947,
                6: 4963,
                7: 4936
            }
        else:
            self.d = dict_punct

    def parse_fn(self, line):
        '''
        Encode a line into model input.
        line - the input string
        '''
        feature = np.array([self.bpemb_ru.encode_ids(line)]).astype(np.int32)
        return feature, np.array([len(feature[0])])

    def to_capital_latter(self, sentence):
        '''Capitalize the first letter after a sentence-ending punctuation mark.'''
        tmp = ''
        flag = True
        for c in sentence:
            if flag and c != ' ':
                tmp += c.upper()
                flag = False
            else:
                tmp += c
            if c in '.?!':
                flag = True
        return tmp

    def predict(self, line):
        x, x_len = self.parse_fn(line)
        predict = self.predict_fn({'x': x, 'len': x_len})
        a = []
        for i in range(predict['lengths'][0]):
            a.append(predict['sequences'][0][i])
            if predict['prediction'][0][i] != 0:
                a.append(self.d[predict['prediction'][0][i]])
        return self.to_capital_latter(self.bpemb_ru.decode_ids(np.array(a)))
class BpeEvalData(object):
    def __init__(self, config):
        self._eval_data_path = os.path.join(
            os.path.abspath(os.path.dirname(os.getcwd())), config["eval_data"])
        self._output_path = os.path.join(
            os.path.abspath(os.path.dirname(os.getcwd())), config["output_path"])
        self.bpe_zh = BPEmb(lang="zh", vs=config["vocab_size"])
        self.pad_token = 0
        self.eos_token = 3

    def read_data(self):
        """
        Read the data.
        :return: the tokenized text content and labels, questions, responses = [[]]
        """
        with open(self._eval_data_path, "r", encoding="utf8") as f:
            requests = []
            responses = []
            for line in f.readlines():
                request, response = line.strip().split("<SEP>")
                requests.append(request.strip())
                responses.append(response.strip())
        return requests, responses

    def trans_to_index(self, data):
        """
        Convert the input into index representation.
        :param data: the questions and responses
        :return:
        """
        data_ids = []
        for sentence in data:
            token_ids = self.bpe_zh.encode_ids(sentence)
            # a <pad> token was added at position 0 of the BPE vocab, so the
            # BPE ids now start from 1; shift all ids by 1 accordingly
            token_ids = list(map(lambda x: x + 1, token_ids))
            data_ids.append(token_ids)
        return data_ids

    def padding(self, batch):
        """
        Pad every batch to the length of its longest sentence.
        :param batch:
        :return:
        """
        question_length = [len(sample[0]) for sample in batch]
        max_question_length = max(question_length)
        questions = [
            sample[0] + [self.pad_token] * (max_question_length - len(sample[0]))
            for sample in batch
        ]

        # append an end-of-sequence token <eos> to each response first
        responses = [sample[1] + [self.eos_token] for sample in batch]
        response_length = [len(response) for response in responses]
        max_response_length = max(response_length)
        # pad responses to the maximum length
        pad_responses = [
            response + [self.pad_token] * (max_response_length - len(response))
            for response in responses
        ]

        return dict(questions=questions,
                    responses=pad_responses,
                    question_length=question_length,
                    response_length=response_length)

    def gen_data(self):
        """
        Generate data that can be fed to the model.
        :return:
        """
        # if the data has been preprocessed before, just load it
        if os.path.exists(os.path.join(self._output_path, "eval_data.pkl")):
            print("load existed eval data")
            with open(os.path.join(self._output_path, "eval_data.pkl"), "rb") as f:
                eval_data = pickle.load(f)
            return eval_data

        # 1. read the raw data
        questions, responses = self.read_data()

        # 2. convert the input to indices
        questions_idx = self.trans_to_index(questions)
        responses_idx = self.trans_to_index(responses)

        # generate the data and save it
        eval_data = [[questions_idx[i], responses_idx[i]]
                     for i in range(len(questions_idx))]
        with open(os.path.join(self._output_path, "eval_data.pkl"), "wb") as fw:
            pickle.dump(eval_data, fw)
        return eval_data

    def next_batch(self, data, batch_size):
        """
        Yield batches of data.
        :param data: the input
        :param batch_size: size of each batch
        :return:
        """
        random.shuffle(data)
        batch_num = len(data) // batch_size
        for i in range(batch_num):
            batch_data = data[batch_size * i:batch_size * (i + 1)]
            new_batch = self.padding(batch_data)
            yield new_batch
class TopicKeyEncBPemb(_TopicKeyBase):
    def __init__(self, embed_size, enc_hidden_size, dense_size):
        super(TopicKeyEncBPemb, self).__init__()
        print("init: TopicKeyEncBPemb model")
        self.embedding_model = BPEmb(dim=embed_size, lang="en", vs=100000)
        self.encoder = nn.LSTM(input_size=embed_size,
                               hidden_size=enc_hidden_size,
                               num_layers=2,
                               bidirectional=True)
        self._lstm_layers = 2
        self._lstm_directions = 2
        self.enc_to_dense_map = nn.Linear(2 * enc_hidden_size, dense_size)
        self.dense_to_output_map = nn.Linear(dense_size, 1)

    def embed_sequence(self, sequence: List[str]) -> torch.Tensor:
        # sequence is a list of words (strings);
        # average the per-word encoding vectors

        def _enc(word):
            v_ids = self.embedding_model.encode_ids(word)
            return self.embedding_model.vectors[v_ids]

        return torch.tensor([_enc(word).mean(axis=0)
                             for word in sequence]).to(self._device)

    def create_seq_vecs(
            self, sequences: List[List[str]]
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        # returns the padded seq vector, lengths and original order index
        # sequences are sorted by length, and can be reverted to their original
        # order with the unsorting index vector

        # start by embedding the sequence vectors
        seq_vecs = [self.embed_sequence(s) for s in sequences]
        # sort lengths and set the device
        lengths = torch.tensor([seq_v.shape[0] for seq_v in seq_vecs])
        lengths, sort_i = lengths.sort(descending=True)
        _, orig_i = sort_i.sort()
        # pad the seq vecs and sort by length (dim 1 is the batch dimension)
        seq_vec_pad = rnn.pad_sequence(seq_vecs).to(self._device)
        seq_vec_pad = seq_vec_pad[:, sort_i, :]
        return seq_vec_pad, lengths, orig_i

    def forward(
            self,
            sequences: List[List[str]]) -> Tuple[torch.Tensor, torch.Tensor]:
        # Make the word embeddings for each sequence
        pad_seq_vecs, lengths, orig_i = self.create_seq_vecs(sequences)

        # masks to indicate which parts of the sequences are padding
        # lengths are sorted largest to smallest, so lengths[0] is max_len
        pad_mask = torch.zeros(len(sequences), lengths[0],
                               device=self._device, dtype=torch.bool)
        for i, length in enumerate(lengths):
            pad_mask[i, length:] = True

        # pack the sequence for the LSTM
        # pad_seq_vecs.shape = max_seq_len, batch_size, embedding_dim
        packed_seq_vecs = rnn.pack_padded_sequence(pad_seq_vecs, lengths)

        # run through the LSTM
        enc_outputs, _ = self.encoder(packed_seq_vecs)
        # enc_outputs.shape: max_seq_len, batch_size, 2*enc_hidden_size
        enc_outputs, _ = nn.utils.rnn.pad_packed_sequence(enc_outputs)

        # re-order so batch dim is first
        # enc_outputs.shape: batch_size, max_seq_len, 2*enc_hidden_size
        enc_outputs = enc_outputs.permute(1, 0, 2)

        # dense.shape: batch_size, max_seq_len, dense_size
        dense = self.enc_to_dense_map(enc_outputs)
        dense = torch.tanh(dense)

        # final output layer, and remove the last dimension
        # output.shape = batch_size, max_seq_len
        output = self.dense_to_output_map(dense).squeeze(dim=2)
        return output[orig_i, :], pad_mask[orig_i, :]
from torch import nn, tensor

# In[69]:

emb_layer = nn.Embedding.from_pretrained(tensor(bpemb_fr.vectors))

# In[70]:

emb_layer

# In[71]:

ids = bpemb_fr.encode_ids("Ceci est une phrase française")

# In[72]:

ids

# In[73]:

bpemb_fr.vectors[ids].shape

# In[74]:

emb_layer(tensor(ids)).shape
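# Follow-up sketch (not in the original notebook): from_pretrained freezes the
# weights by default; pass freeze=False to fine-tune the subword embeddings.
# bpemb_fr is assumed to be the French BPEmb model loaded earlier.
trainable_emb = nn.Embedding.from_pretrained(tensor(bpemb_fr.vectors), freeze=False)
print(trainable_emb.weight.requires_grad)  # True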
class TopicCEncBPemb(_TopicCBase):
    def __init__(self, embed_size, output_size, enc_hidden_size, attention_size,
                 dense_size):
        super(TopicCEncBPemb, self).__init__()
        print("init: TopicCEncBPemb model")
        self.embedding_model = BPEmb(dim=embed_size, lang="en", vs=100000)
        self.encoder = nn.LSTM(input_size=embed_size,
                               hidden_size=enc_hidden_size,
                               num_layers=2,
                               bidirectional=True)
        self._lstm_layers = 2
        self._lstm_directions = 2
        self.enc_to_att_map = nn.Linear(self._lstm_directions * enc_hidden_size,
                                        attention_size,
                                        bias=False)
        # The attention vector is used like the decoder states in the attention
        # component of a seq-to-seq model; however, it is a single learnable
        # vector in this case
        self.att_vec = nn.Linear(attention_size, 1, bias=False)
        self.seq_to_dense_map = nn.Linear(4 * enc_hidden_size, dense_size)
        self.dense_to_output_map = nn.Linear(dense_size, output_size)

    def embed_sequence(self, sequence: str) -> torch.Tensor:
        v_ids = self.embedding_model.encode_ids(sequence)
        return torch.tensor(self.embedding_model.vectors[v_ids]).to(self._device)

    def create_seq_vecs(
            self, sequences: List[str]
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        # returns the padded seq vector, lengths and original order index
        # sequences are sorted by length, and can be reverted to their original
        # order with the unsorting index vector

        # start by embedding the sequence vectors
        seq_vecs = [self.embed_sequence(s) for s in sequences]
        # sort lengths and set the device
        lengths = torch.tensor([seq_v.shape[0] for seq_v in seq_vecs])
        lengths, sort_i = lengths.sort(descending=True)
        _, orig_i = sort_i.sort()
        # pad the seq vecs and sort by length (dim 1 is the batch dimension)
        seq_vec_pad = rnn.pad_sequence(seq_vecs).to(self._device)
        seq_vec_pad = seq_vec_pad[:, sort_i, :]
        return seq_vec_pad, lengths, orig_i

    def forward(self, sequences: List[str]) -> torch.Tensor:
        # Make the word embeddings for each sequence
        pad_seq_vecs, lengths, orig_i = self.create_seq_vecs(sequences)

        # pack the sequence for the LSTM
        # pad_seq_vecs.shape = max_seq_len, batch_size, embedding_dim
        packed_seq_vecs = rnn.pack_padded_sequence(pad_seq_vecs, lengths)

        # run through the LSTM
        # h_n.shape = num_layers*num_directions, batch, hidden_size
        # Note from docs: the layers can be separated using
        # h_n.view(num_layers, num_directions, batch, hidden_size)
        enc_outputs, (h_n, _) = self.encoder(packed_seq_vecs)
        h_n = h_n.view(self._lstm_layers, self._lstm_directions,
                       h_n.shape[1], h_n.shape[2])

        # unpack the sequence
        # enc_outputs.shape = max_seq_len, batch_size, 2*enc_hidden_size
        enc_outputs, _ = nn.utils.rnn.pad_packed_sequence(enc_outputs)

        # encoder masks to indicate which parts of the sequence should be considered
        enc_masks = torch.zeros(enc_outputs.shape[0], enc_outputs.shape[1], 1,
                                dtype=torch.bool, device=self._device)
        for i, length in enumerate(lengths):
            enc_masks[length:, i, 0] = True

        # encoder outputs projected to the dimension of the attention vector
        # att_proj.shape = max_seq_len, batch_size, attention_size
        att_proj = self.enc_to_att_map(enc_outputs)
        # weights for how much attention is given to each index of the sequence
        # att_w.shape = max_seq_len, batch_size, 1
        att_w = self.att_vec(att_proj)
        # mask out sections which are not part of the sequence
        att_w = att_w.masked_fill(enc_masks, -float('inf'))
        # turn into a normalised probability with softmax
        # att_w.shape = max_seq_len, batch_size, 1
        att_w = nn.functional.softmax(att_w, dim=0)
        # permute so the batch dimension is first instead of second
        # att_w.shape = batch_size, max_seq_len, 1
        att_w = att_w.permute(1, 0, 2)

        # permute so the batch dimension is first, and the seq_len dim is summed
        # enc_outputs.shape = batch_size, 2*enc_hidden_size, max_seq_len
        enc_outputs = enc_outputs.permute(1, 2, 0)
        # weighted sum of states
        # att_output.shape = batch_size, 2*enc_hidden_size, 1
        att_output = torch.bmm(enc_outputs, att_w)
        # remove the last dimension
        # att_output.shape = batch_size, 2*enc_hidden_size
        att_output = att_output.squeeze(dim=2)

        # combine with the hidden states: take the last layer (index0 = -1)
        # for both directions (index1 = 0, 1)
        seq_output = torch.cat((att_output, h_n[-1][0], h_n[-1][1]), dim=1)

        # a dense non-linear layer
        # dense.shape = batch_size, dense_size
        dense = self.seq_to_dense_map(seq_output)
        dense = torch.tanh(dense)
        # final output layer
        # output.shape = batch_size, n_categories
        output = self.dense_to_output_map(dense)
        # sort the output to match the original order
        output = output[orig_i, :]
        return nn.functional.log_softmax(output, dim=1)
class BytePairFeaturizer(DenseFeaturizer):
    """This component adds BPEmb features."""

    @classmethod
    def required_components(cls) -> List[Type[Component]]:
        return [Tokenizer]

    @classmethod
    def required_packages(cls) -> List[Text]:
        return ["bpemb"]

    defaults = {
        # specifies the language of the subword segmentation model
        "lang": "en",
        # specifies the dimension of the subword embeddings
        "dim": 25,
        # specifies the vocabulary size of the segmentation model
        "vs": 1000,
        # if set to True and the given vocabulary size can't be loaded for the
        # given model, the closest size is chosen
        "vs_fallback": True,
        # specifies the folder in which downloaded BPEmb files will be cached
        "cache_dir": Path.home() / Path(".cache/bpemb"),
        # specifies the path to a custom SentencePiece model file
        "model_file": None,
        # specifies the path to a custom embedding file. Supported formats are
        # Word2Vec plain text and GenSim binary.
        "emb_file": None,
    }

    language_list = [
        "mt", "sd", "cr", "ba", "ht", "scn", "bi", "stq", "sm", "diq", "no", "yi",
        "vec", "bug", "am", "tl", "mn", "atj", "ko", "mai", "lij", "tcy", "sl", "bn",
        "dv", "rm", "ng", "ml", "kg", "koi", "war", "et", "mhr", "als", "bar", "ii",
        "sco", "got", "pnb", "ss", "bpy", "tum", "ru", "qu", "hy", "tw", "bm", "vep",
        "dty", "udm", "gd", "lbe", "rmy", "azb", "kw", "ja", "wuu", "pag", "ro", "tet",
        "ee", "min", "su", "ha", "glk", "pcd", "tk", "nrm", "ku", "gn", "ty", "bh",
        "pap", "fr", "ia", "cs", "ky", "ff", "kab", "rn", "csb", "tt", "cy", "ilo",
        "kaa", "hif", "ak", "pa", "crh", "ti", "myv", "ur", "se", "uz", "cdo", "lez",
        "srn", "kk", "pih", "de", "an", "tyv", "ext", "gan", "wo", "si", "lmo", "hak",
        "az", "ka", "ik", "frr", "hsb", "ho", "af", "nds", "pam", "el", "fur", "cu",
        "hr", "my", "nl", "da", "ch", "vls", "es", "as", "lt", "ny", "so", "oc",
        "lad", "pnt", "ms", "bcl", "os", "co", "ks", "or", "ay", "wa", "nah", "fa",
        "pl", "mzn", "za", "th", "fj", "kbp", "be", "zh", "ce", "sh", "sr", "id",
        "chy", "ps", "lo", "tr", "st", "he", "ang", "sah", "io", "gom", "ki", "sn",
        "kbd", "jam", "bo", "pms", "sk", "kv", "ckb", "nv", "dsb", "zea", "xmf", "fi",
        "ltg", "ksh", "ve", "new", "na", "jv", "tn", "sw", "rw", "ln", "bs", "gag",
        "ab", "olo", "is", "bjn", "ceb", "om", "vi", "ast", "uk", "mg", "mwl", "arz",
        "li", "mrj", "yo", "frp", "gl", "la", "km", "sv", "nap", "jbo", "bxr", "gv",
        "br", "fo", "ug", "pi", "bg", "ie", "din", "sa", "pdc", "cho", "lb", "ig",
        "aa", "sc", "fy", "kj", "eo", "eu", "kl", "sq", "to", "mi", "tpi", "kr",
        "hi", "arc", "ga", "nov", "mdf", "vo", "pfl", "rue", "haw", "kn", "mh", "mr",
        "te", "ca", "ace", "cv", "zu", "it", "iu", "av", "sg", "hz", "lv", "ts",
        "lrc", "ar", "hu", "nn", "nso", "krc", "mk", "tg", "ne", "dz", "ta", "mus",
        "ady", "en", "lg", "xal", "gu", "pt", "xh", "szl", "chr",
    ]

    def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None:
        super().__init__(component_config)
        self.model = BPEmb(
            lang=self.component_config["lang"],
            dim=self.component_config["dim"],
            vs=self.component_config["vs"],
            vs_fallback=self.component_config["vs_fallback"],
            cache_dir=self.component_config["cache_dir"],
        )

    def train(
        self,
        training_data: TrainingData,
        config: Optional[RasaNLUModelConfig] = None,
        **kwargs: Any,
    ) -> None:
        for example in training_data.intent_examples:
            for attribute in DENSE_FEATURIZABLE_ATTRIBUTES:
                self.set_bpemb_features(example, attribute)

    def create_word_vector(self, document: Text) -> np.ndarray:
        encoded_ids = self.model.encode_ids(document)
        if encoded_ids:
            return self.model.vectors[encoded_ids[0]]
        return np.zeros((self.component_config["dim"],), dtype=np.float32)

    def set_bpemb_features(self, message: Message, attribute: Text = TEXT) -> None:
        tokens = message.get(TOKENS_NAMES[attribute])
        if not tokens:
            return None
        text_vector = self.create_word_vector(document=message.text)
        word_vectors = [
            self.create_word_vector(document=t.text)
            for t in train_utils.tokens_without_cls(message, attribute)
        ]
        X = np.array(word_vectors + [text_vector])
        features = self._combine_with_existing_dense_features(
            message,
            additional_features=X,
            feature_name=DENSE_FEATURE_NAMES[attribute])
        message.set(DENSE_FEATURE_NAMES[attribute], features)

    def process(self, message: Message, **kwargs: Any) -> None:
        self.set_bpemb_features(message)

    def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]]:
        pass

    @classmethod
    def load(
        cls,
        meta: Dict[Text, Any],
        model_dir: Optional[Text] = None,
        model_metadata: Optional["Metadata"] = None,
        cached_component: Optional["Component"] = None,
        **kwargs: Any,
    ) -> "Component":
        if cached_component:
            return cached_component
        return cls(meta)
# Build our DALL-E model
dalle = DALLE(
    dim=CODEBOOK_DIM,                # codebook dimension
    vae=vae,                         # DiscreteVAE instance: image sequence length
                                     # and number of image tokens are inferred
    num_text_tokens=VOCAB_SIZE + 1,  # vocab size for text; add 1 for <pad>
    text_sequence_len=TEXT_SEQ_LEN,  # text sequence length
    depth=DEPTH,                     # transformer depth: should aim to be 64
    heads=HEADS,                     # attention heads
    dim_head=DIM_HEAD,               # attention head dimension
    reversible=REVERSIBLE,           # whether to use ReversibleSequence or SequentialSequence
    attn_dropout=ATTN_DROPOUT,       # attention dropout
    ff_dropout=FF_DROPOUT            # feedforward dropout
)

dalle.load_weights("./dalle_tensorflow/model_weights/dalle/dalle_weights")

text = "A running horse."
bpe_encoder = BPEmb(lang="en", vs=VOCAB_SIZE, add_pad_emb=True)
text = bpe_encoder.encode_ids(text)
text = np.array(text)
text = np.pad(array=text, pad_width=[0, TEXT_SEQ_LEN - len(text)])
text = tf.expand_dims(text, axis=0)
# mark non-pad positions (pad id is 0 here)
mask = tf.cast(tf.where(text != 0, 1, 0), dtype=tf.bool)

output_images = dalle.generate_images(text, mask=mask)
output_images = tf.reshape(tensor=output_images, shape=[IMG_SIZE, IMG_SIZE, 3])
save_img(path="dalle_out.jpg", x=output_images)