def __init__(self, vocab_size, embedding_dim, max_sequence_len):
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.max_sequence_len = max_sequence_len
    # 40 M; the larger the vocabulary, the fewer subword splits
    self.bpemb_en_100k = BPEmb(lang="en", vs=self.vocab_size, dim=self.embedding_dim)
class SubWordVocab(object):
    def __init__(self, size):
        self.encoder = BPEmb(lang='en', vs=size)
        assert self.sos_id == 1
        assert self.eos_id == 2

    def __len__(self):
        return self.encoder.vs

    @property
    def sos_id(self):
        return 1

    @property
    def eos_id(self):
        return self.encoder.EOS

    def encode(self, syms):
        return self.encoder.encode_ids(syms)

    def decode(self, ids):
        syms = self.encoder.decode_ids(ids)
        if isinstance(syms, list):
            return ''
        return syms
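# Hedged usage sketch for SubWordVocab above; the input string is illustrative and
# the resulting ids depend on the pretrained English model that BPEmb downloads.
vocab = SubWordVocab(10000)
ids = vocab.encode("This is Stratford")
print(len(vocab))                  # 10000
print(vocab.sos_id, vocab.eos_id)  # 1 2
print(vocab.decode(ids))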
class BPEmbVaeSampler(VAESampler):
    def __init__(self, lang, vs, dim, decode_from, params, cuda=False):
        self.bp = BPEmb(lang=lang, vs=vs, dim=dim)
        super().__init__(decode_from, params, cuda)

    def to_s(self, decoded):
        out = []
        for item in decoded:
            s = self.bp.decode(item).replace('▁', ' ').strip()
            s = s[0].upper() + s[1:]
            s = re.sub(r'\bi\b', 'I', s)
            s = re.sub(r'[.!?]\s+(\w)',
                       lambda m: m.group()[:-1] + m.group()[-1].upper(), s)
            out.append(s)
        return out

    def str2ids(self, s):
        """
        Encode string s with BPEmb. BPEmb has a fixed vocabulary size, but the
        model only has outputs for vocab items that are used in the training
        data, so this function replaces any BPEmb ids *not* in the training
        vocabulary with the model's "unknown" id.
        """
        encoded = self.bp.encode(s)
        ids = [self.vocab.word2id.get(item, self.vocab.unk_id)
               for item in encoded]
        return ids
def test_multi_language():
    text = ["This is Stratford", "Kitap okuyordu."]
    bpemb_multi = BPEmb(lang="multi", add_pad_emb=True)
    print(bpemb_multi.encode_ids_with_bos_eos(text))
    print(bpemb_multi.decode_ids([[1, 5496, 200, 23866, 3927, 2],
                                  [1, 45350, 44934, 67191, 94777, 2]]))
def __init__(self, output_dim, vocab_size=10000, embed_dim=50, lang='en',
             embedding_preload=True, gpu_id=-1, dropout=0):
    super(LanguagePeripheral, self).__init__()
    self.gpu_id = gpu_id
    self.pad_char = vocab_size
    self.bpe_encoder = BPEmb(lang=lang, vs=vocab_size, dim=embed_dim,
                             add_pad_emb=True)
    # Add an extra padding character
    self.embed_layer = nn.Embedding(vocab_size + 1, embed_dim,
                                    padding_idx=self.pad_char)
    if embedding_preload:
        self.embed_layer.load_state_dict(
            {'weight': torch.tensor(self.bpe_encoder.emb.vectors)})
        print("Loading pretrained word embeddings.")
    self.enc_dropout = nn.Dropout(dropout)
    self.output = nn.Linear(embed_dim, output_dim)
def test_punctuation():
    text = [
        "Leonidas: This's Sparta!!",
        "Leonidas : This ' s Sparta ! !",
        "Leonidas This s Sparta"
    ]
    bpemb_multi = BPEmb(lang="multi", add_pad_emb=True)
    print(bpemb_multi.encode(text))
class Hparams:
    parser = argparse.ArgumentParser()
    bpemb_en = BPEmb(lang="en", dim=50)
    bpemb_de = BPEmb(lang='de', dim=50)

    # preprocess
    parser.add_argument('--BUFFER_SIZE', default=10000)
    parser.add_argument('--batch_size', default=64)
    parser.add_argument('--maxlen', default=40, help='max length of sentences')
    parser.add_argument('--tokenizer_de', default=bpemb_de, help='encoding method')
    parser.add_argument('--tokenizer_en', default=bpemb_en, help='decoding method')

    # train
    parser.add_argument('--num_layers', default=4, help='number of encoder and decoder blocks')
    parser.add_argument('--d_model', default=128)
    parser.add_argument('--dff', default=512)
    parser.add_argument('--num_heads', default=8)
    parser.add_argument('--dropout_rate', default=0.1)
    parser.add_argument('--checkpoint_dir', default='./checkpoints/train')
    parser.add_argument('--checkpoint_dir_de', default='./checkpoints/de_en')
    parser.add_argument('--epochs', default=10)
def __init__(self, predictor_config):
    predictor_config = predictor_config['vectorizer']
    self.bpemb = BPEmb(lang='en',
                       dim=predictor_config['embedding_dim'],
                       vs=predictor_config['max_vocab_size'],
                       add_pad_emb=True)
    self.max_seq_len = predictor_config['max_seq_len']
def get_embedding_vec(self, word):
    if self.model is None:
        self.model = BPEmb(lang="en", dim=self.dim, vs=self.bp_vocab_size)
    if not self.case_sensitive:
        word = word.lower()
    vecs = self.model.embed(word)
    return np.reshape(np.sum(vecs, axis=0), (self.dim,))
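# Standalone sketch of what get_embedding_vec above computes: BPEmb.embed returns one
# vector per subword piece, and summing over axis 0 collapses them into a single
# fixed-size vector. The dim/vs values here are illustrative, not taken from the snippet.
import numpy as np
from bpemb import BPEmb

bpemb = BPEmb(lang="en", dim=50, vs=10000)
pieces = bpemb.embed("stratford")   # shape: (n_subwords, 50)
word_vec = np.sum(pieces, axis=0)   # shape: (50,)
print(pieces.shape, word_vec.shape)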
def __init__(self, lang='ru', pretrained=True, vocab_size=100000, dim=300):
    self.lang = lang
    self.pretrained = pretrained
    self.bpe = BPEmb(lang=self.lang, vs=vocab_size, dim=dim, vs_fallback=True)
def get_transformer(ff_dim: int, n_layers: int, n_heads: int, dropout_prob: float):
    """
    Creates a new transformer and tokenizer using the given parameters
    :param ff_dim:
    :param n_layers:
    :param n_heads:
    :param dropout_prob:
    :return:
    """
    # Load English model with 25k word-pieces
    tokenizer = BPEmb(lang='en', dim=300, vs=25000)
    # Extract the embeddings and add a zero-initialized embedding for our extra [PAD] token
    pretrained_embeddings = np.concatenate(
        [tokenizer.emb.vectors, np.zeros(shape=(1, 300))], axis=0)
    # Extract the vocab and add an extra [PAD] token
    vocabulary = tokenizer.emb.index2word + ['[PAD]']
    tokenizer.pad_token_id = len(vocabulary) - 1
    model = TransformerClassifier(
        torch.tensor(pretrained_embeddings).type(torch.FloatTensor),
        ff_dim=ff_dim,
        d_model=300,
        n_heads=n_heads,
        n_layers=n_layers,
        dropout_prob=dropout_prob).to(device)
    return model, tokenizer
def get_cnn(in_channels, out_channels, kernel_heights, stride, padding, dropout_prob):
    """
    Creates a new CNN and tokenizer using the given parameters
    :return:
    """
    # Load English model with 25k word-pieces
    tokenizer = BPEmb(lang='en', dim=300, vs=25000)
    # Extract the embeddings and add a zero-initialized embedding for our extra [PAD] token
    pretrained_embeddings = np.concatenate(
        [tokenizer.emb.vectors, np.zeros(shape=(1, 300))], axis=0)
    # Extract the vocab and add an extra [PAD] token
    vocabulary = tokenizer.emb.index2word + ['[PAD]']
    tokenizer.pad_token_id = len(vocabulary) - 1
    model = CNN(
        torch.tensor(pretrained_embeddings).type(torch.FloatTensor),
        n_labels=2,
        in_channels=in_channels,
        out_channels=out_channels,
        kernel_heights=kernel_heights,
        stride=stride,
        padding=padding,
        dropout=dropout_prob
    ).to(device)
    return model, tokenizer
def __init__(self, lang="en", dim=200, vs=200000, distance_metric="cosine"): from bpemb import BPEmb self.bpemb = BPEmb(lang=lang, dim=dim, vs=vs) self.distance_metric = distance_metric
def __init__(self, path=config.path_to_data, mode='train'):
    self.path_to_data = path
    self.mode = mode
    print(f"Loading {self.mode} data...")
    self.data = self.read_data()
    self.preprocess_data()
    self.bpemb_ru = BPEmb(lang="ru", dim=300, vs=50000)
    self.placeholder = torch.zeros(config.max_seq_length, dtype=torch.long)
def __init__(
    self, lang, vs=10000, dim=100, cache_dir=Path.home() / Path(".cache/bpemb")
):
    self.lang = lang
    self.vs = vs
    self.dim = dim
    self.cache_dir = cache_dir
    self.module = BPEmb(lang=lang, vs=vs, dim=dim, cache_dir=cache_dir)
def load_bpe(vocab_size):
    """
    Load pre-trained byte pair embedding models.
    Return src, trg
    """
    bpemb_tr = BPEmb(lang="tr", vs=vocab_size)
    bpemb_en = BPEmb(lang="en", vs=vocab_size)
    return bpemb_tr, bpemb_en
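# Hedged usage sketch for load_bpe above; the vocab size of 10000 and the example
# sentences are illustrative values only.
bpemb_tr, bpemb_en = load_bpe(10000)
print(bpemb_en.encode_ids("This is Stratford"))
print(bpemb_tr.encode("Kitap okuyordu."))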
def __init__(self, **kwargs):
    lang = kwargs.get("lang", "en")
    vs = kwargs.get("limit", 200000)
    self.bpemb = BPEmb(lang=lang, vs=vs)
    self.tokenizer = SpacyTokenizer(model="en", annotators=["lemma", "pos", "ner"])
    self.annotators = self.tokenizer.annotators
def test_decoding():
    # Although a <pad> embedding is added, decode_ids cannot handle the pad id,
    # so remove any padding before decoding.
    # Decoding strips the start/end tokens.
    bpemb_en = BPEmb(lang="en", add_pad_emb=True)
    # ids = [1, 215, 80, 8526, 1221, 2]
    ids = [[1, 215, 80, 8526, 1221, 2], [1, 215, 80, 8526, 1221, 2]]
    # ids = [1, 215, 80, 8526, 1221, 2, 10000, 10000]
    # print(bpemb_en.vectors[10000])
    print(bpemb_en.decode_ids(ids))
def clean_sub_word_sentence(word_ids: np.array, bpemb: BPEmb):
    # Undo the +1 id shift introduced by the extra padding token before decoding with BPEmb
    word_ids = word_ids - 1
    try:
        index = list(word_ids).index(bpemb.EOS)
        words = bpemb.decode_ids(word_ids[:index])
    except ValueError:
        # No EOS found in the sequence
        words = bpemb.decode_ids(word_ids)
    return words
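# Illustrative call to clean_sub_word_sentence above: ids coming from a model whose
# embedding table was shifted by one (index 0 reserved for padding), followed by a
# shifted EOS and padding. The literal id values are made up for the example.
import numpy as np
from bpemb import BPEmb

bpemb_en = BPEmb(lang="en", vs=10000, dim=50)
shifted_ids = np.array([216, 81, 8527, 1222, bpemb_en.EOS + 1, 1, 1])
print(clean_sub_word_sentence(shifted_ids, bpemb_en))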
def __init__(self, config):
    self._eval_data_path = os.path.join(
        os.path.abspath(os.path.dirname(os.getcwd())), config["eval_data"])
    self._output_path = os.path.join(
        os.path.abspath(os.path.dirname(os.getcwd())), config["output_path"])
    self.bpe_zh = BPEmb(lang="zh", vs=config["vocab_size"])
    self.pad_token = 0
    self.eos_token = 3
def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None:
    super().__init__(component_config)
    self.model = BPEmb(
        lang=self.component_config["lang"],
        dim=self.component_config["dim"],
        vs=self.component_config["vs"],
        vs_fallback=self.component_config["vs_fallback"],
        cache_dir=self.component_config["cache_dir"],
    )
def __init__(self, embed_size, output_size, enc_hidden_size):
    super(TopicCEncSimpleBPemb, self).__init__()
    print("init: TopicCEncSimpleBPemb model")
    self.embedding_model = BPEmb(dim=embed_size, lang="en", vs=100000)
    self.encoder = nn.GRU(input_size=embed_size,
                          hidden_size=enc_hidden_size,
                          num_layers=1,
                          bidirectional=True)
    self.seq_to_output_map = nn.Linear(2 * enc_hidden_size, output_size, bias=False)
def process(texts, vocab_size=25000, dim=300):
    emb = BPEmb(lang='de', vs=vocab_size, dim=dim)
    texts = [emb.encode(t) for t in texts]
    unique_words = set([w for t in texts for w in t])
    vecs = [
        wv for (i, wv) in enumerate(zip(emb.words, emb.vectors))
        if i < 3 or wv[0] in unique_words  # reserve the special tokens
    ]
    return texts, vecs
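# Hedged usage sketch for process above (German model, as in the snippet); the
# sentences are only examples.
texts, vecs = process(["Das Buch ist gut.", "Sie liest ein Buch."])
print(texts[0])                      # subword strings of the first sentence
print(len(vecs))                     # special tokens plus subwords that actually occur
print(vecs[0][0], vecs[0][1].shape)  # (word, vector) pairs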
def __init__(self, conf, lang, bert=None):
    self.conf = conf
    self.lang = lang
    self.bert = bert
    self.device = torch.device(f"cuda:{conf.gpu_id}")
    self.name = conf.dataset
    self.tag = conf.tag
    self.batch_size = conf.batch_size
    self.eval_batch_size = conf.eval_batch_size
    self.examples_to_print = conf.n_examples
    if self.conf.tag_scheme:
        self.convert_tags = iob_to[self.tag_scheme]
    self.load_data_raw()
    self.NO_TAG = "NO_TAG"
    tags = self.get_tags()
    print(Counter(tags).most_common())
    shapes = self.get_shapes()

    char_enc = None
    if conf.char_enc_file:
        assert Path(conf.char_enc_file).exists()
        char_enc = joblib.load(conf.char_enc_file)
    if self.name.endswith("multi_finetune"):
        assert char_enc
    if char_enc:
        self.char_enc = char_enc
    else:
        chars = self.get_chars()
        self.char_enc = LabelEncoder(
            to_torch=True, device=self.device).fit(chars)

    tag_enc = None
    if conf.tag_enc_file:
        assert Path(conf.tag_enc_file).exists()
        tag_enc = joblib.load(conf.tag_enc_file)
    if tag_enc:
        self.tag_enc = tag_enc
    else:
        self.tag_enc = LabelEncoder(
            to_torch=True, device=self.device).fit(tags)

    self.shape_enc = LabelEncoder(
        to_torch=True, device=self.device).fit(shapes)
    self.bpemb = BPEmb(
        lang=conf.bpemb_lang,
        vs=conf.vocab_size,
        dim=conf.bpemb_dim,
        add_pad_emb=True)
    if conf.use_fasttext:
        f = conf.fasttext_emb_file.format(dataset=self.name, lang=lang)
        self.fasttext_emb = load_word2vec_file(f, add_unk=True)
    self.pad_idx = self.bpemb.emb.key_to_index["<pad>"]
    if not conf.no_dataset_tensorize:
        self.tensorize()
def __init__(
    self,
    config: Dict[Text, Any],
    name: Text,
) -> None:
    """Constructs a new byte pair vectorizer."""
    super().__init__(name, config)
    # The configuration dictionary is saved in `self._config` for reference.
    self.model = BPEmb(
        lang=self._config["lang"],
        dim=self._config["dim"],
        vs=self._config["vs"],
        vs_fallback=self._config["vs_fallback"],
    )
def get_multibpe_embeddings(x: List[str], multibpemb=None, vs=1000000, dim=300):
    if multibpemb is None:
        multibpemb = BPEmb(lang="multi", vs=vs, dim=dim)
    embeddings = []
    for sentence in x:
        features = multibpemb.embed(sentence)
        embeddings.append(features)
    embeddings = pad(embeddings, [0 for _ in range(dim)], 32)
    return embeddings
def __init__(self, lang: str, dim: int = 300, vs: int = 100000, add_pad_emb: bool = True):
    super().__init__()
    try:
        from bpemb import BPEmb
        self.embedder = BPEmb(lang=lang, dim=dim, vs=vs, add_pad_emb=add_pad_emb,
                              cache_dir=Path(aikido.cache_root) / "embeddings")
        self.embeddings_ = nn.Embedding.from_pretrained(
            tensor(self.embedder.vectors, dtype=torch.float), padding_idx=vs)
        self.dim_ = dim
        self.vs_ = vs
    except ImportError:
        logging.error("-" * 100)
        logging.error("no bpemb installation found. see https://github.com/bheinzerling/bpemb")
        logging.error("-" * 100)
def test():
    bpemb_en = BPEmb(lang="en", dim=100)
    s = "Stratford"
    res1 = bpemb_en.encode(s)
    res2 = bpemb_en.encode_ids(s)
    print(res1)
    print(res2)

    # 40 M; the larger the vocabulary, the fewer subword splits
    bpemb_en_100k = BPEmb(lang="en", vs=100000, dim=100)
    s = "hello world !"
    bpemb_en_100k.encode_ids(s)
    res1 = bpemb_en_100k.encode(s)
    res2 = bpemb_en_100k.encode_ids(s)
    print(res1)
    print(res2)
def make_byte_pair(corpus):
    '''This function applies byte-pair encoding to the corpus'''
    # the BPE model
    bpemb_en = BPEmb(lang="en")
    # remove the stopwords first so that memory usage stays low
    tokenized_corpus = tokenize_preprocess_corpus(corpus)
    documents = []
    for word_tokens in tokenized_corpus:
        sentence = ' '.join(word_tokens)
        documents.append(bpemb_en.encode(sentence))
    return documents
class TweetTokenizer():
    def __init__(self, dim=50, vocab_size=10000, mode='get_id'):
        self.dim = dim
        self.vocab_size = vocab_size
        self.bpemb_en = BPEmb(lang="en", dim=dim, vs=vocab_size)
        self.embedding_weight = self.bpemb_en.vectors
        self.mode = mode

    def __call__(self, tweet, mode='get_id'):
        if mode == 'get_id':
            return torch.tensor(self.bpemb_en.encode_ids(tweet), dtype=torch.long)
        elif mode == 'raw':
            return self.bpemb_en.encode(tweet)
        else:
            raise ValueError('Invalid mode')
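# Hedged usage sketch for TweetTokenizer above; the tweet text is illustrative.
tokenizer = TweetTokenizer(dim=50, vocab_size=10000)
ids = tokenizer("this is stratford")                  # LongTensor of subword ids
pieces = tokenizer("this is stratford", mode='raw')   # list of subword strings
print(ids.shape, pieces)
print(tokenizer.embedding_weight.shape)               # (10000, 50)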