from bpemb import BPEmb


def test():
    bpemb_en = BPEmb(lang="en", dim=100)
    s = "Stratford"
    res1 = bpemb_en.encode(s)
    res2 = bpemb_en.encode_ids(s)
    print(res1)
    print(res2)

    # ~40 M; the larger the vocabulary, the fewer splits per word
    bpemb_en_100k = BPEmb(lang="en", vs=100000, dim=100)
    s = "hello world !"
    res1 = bpemb_en_100k.encode(s)
    res2 = bpemb_en_100k.encode_ids(s)
    print(res1)
    print(res2)
class BPETokenizer(Tokenizer):
    def __init__(self, **kwargs):
        lang = kwargs.get("lang", "en")
        vs = kwargs.get("limit", 200000)
        self.bpemb = BPEmb(lang=lang, vs=vs)
        self.tokenizer = SpacyTokenizer(model="en", annotators=["lemma", "pos", "ner"])
        self.annotators = self.tokenizer.annotators

    def tokenize(self, text):
        data_spacy = self.tokenizer.tokenize(text).data
        data = []
        start_ws = 0
        for i in range(len(data_spacy)):
            subwords = self.bpemb.encode(data_spacy[i][0])
            for j, sub in enumerate(subwords):
                tuple_idx = [-2, -2]
                if j == 0:
                    tuple_idx[0] = data_spacy[i][2][0]
                if j + 1 == len(subwords):
                    tuple_idx[1] = data_spacy[i][2][1]
                data.append((sub,
                             data_spacy[i][1] if j == 0 else "",
                             tuple_idx,
                             data_spacy[i][3],
                             data_spacy[i][4],
                             data_spacy[i][5]))
        return Tokens(data, self.annotators, opts={'non_ent': ''})
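# A minimal usage sketch of BPETokenizer above (assumes the DrQA-style
# Tokenizer, SpacyTokenizer and Tokens classes are importable): each spaCy
# token is split into BPE pieces, and only the first/last piece keeps the
# original character offsets, so span extraction still lines up.
if __name__ == "__main__":
    bpe_tok = BPETokenizer(lang="en", limit=200000)
    tokens = bpe_tok.tokenize("Stratford is a town in Ontario.")
    for entry in tokens.data:
        print(entry[0], entry[2])  # subword piece and its [start, end] offsets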
def test_punctuation():
    text = [
        "Leonidas: This's Sparta!!",
        "Leonidas : This ' s Sparta ! !",
        "Leonidas This s Sparta"
    ]
    bpemb_multi = BPEmb(lang="multi", add_pad_emb=True)
    print(bpemb_multi.encode(text))
class BPEmbVaeSampler(VAESampler):
    def __init__(self, lang, vs, dim, decode_from, params, cuda=False):
        self.bp = BPEmb(lang=lang, vs=vs, dim=dim)
        super().__init__(decode_from, params, cuda)

    def to_s(self, decoded):
        out = []
        for item in decoded:
            s = self.bp.decode(item).replace('▁', ' ').strip()
            s = s[0].upper() + s[1:]
            s = re.sub(r'\bi\b', 'I', s)
            s = re.sub(r'[.!?]\s+(\w)',
                       lambda m: m.group()[:-1] + m.group()[-1].upper(), s)
            out.append(s)
        return out

    def str2ids(self, s):
        """
        Encode string s with BPEmb. BPEmb has a fixed vocabulary size, but the
        model only has outputs for vocab items that are used in the training
        data, so this function replaces any BPEmb ids *not* in the training
        vocabulary with the model's "unknown" id.
        """
        encoded = self.bp.encode(s)
        ids = [self.vocab.word2id.get(item, self.vocab.unk_id)
               for item in encoded]
        return ids
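# A standalone sketch (not part of the class above) of the id-remapping idea
# behind str2ids: BPEmb pieces that never appeared in the training corpus are
# mapped to a single unknown id. The toy word2id dict here is hypothetical.
from bpemb import BPEmb

bp = BPEmb(lang="en", vs=10000, dim=100)
word2id = {piece: i for i, piece in enumerate(bp.encode("the quick brown fox"))}
unk_id = len(word2id)  # one id reserved for everything unseen
ids = [word2id.get(piece, unk_id) for piece in bp.encode("the quick red fox")]
print(ids)  # the piece for "red" falls back to unk_id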
class BPEmbAug:
    """
    Thai text augmentation using word2vec from BPEmb

    BPEmb: `github.com/bheinzerling/bpemb <https://github.com/bheinzerling/bpemb>`_
    """
    def __init__(self, lang: str = "th", vs: int = 100000, dim: int = 300):
        self.bpemb_temp = BPEmb(lang=lang, dim=dim, vs=vs)
        self.model = self.bpemb_temp.emb
        self.load_w2v()

    def tokenizer(self, text: str) -> List[str]:
        """
        :param str text: Thai text

        :rtype: List[str]
        """
        return self.bpemb_temp.encode(text)

    def load_w2v(self):
        """
        Load the BPEmb model
        """
        self.aug = Word2VecAug(self.model, tokenize=self.tokenizer, type="model")

    def augment(self, sentence: str, n_sent: int = 1,
                p: float = 0.7) -> List[str]:
        """
        Text augmentation using word2vec from BPEmb

        :param str sentence: Thai sentence
        :param int n_sent: number of sentences to generate
        :param float p: probability of replacing each word

        :return: list of synonymous sentences
        :rtype: List[str]

        :Example:
        ::

            from pythainlp.augment.word2vec.bpemb_wv import BPEmbAug

            aug = BPEmbAug()
            aug.augment("ผมเรียน", n_sent=2, p=0.5)
            # output: ['ผมสอน', 'ผมเข้าเรียน']
        """
        self.sentence = sentence.replace(" ", "▁")
        self.temp = self.aug.augment(self.sentence, n_sent, p=p)
        self.temp_new = []
        for i in self.temp:
            self.t = ""
            for j in i:
                self.t += j.replace('▁', '')
            self.temp_new.append(self.t)
        return self.temp_new
def process(texts, vocab_size=25000, dim=300):
    emb = BPEmb(lang='de', vs=vocab_size, dim=dim)
    texts = [emb.encode(t) for t in texts]
    unique_words = set([w for t in texts for w in t])
    vecs = [
        wv for (i, wv) in enumerate(zip(emb.words, emb.vectors))
        if i < 3 or wv[0] in unique_words
    ]  # i < 3 keeps the first three (special) tokens even if they never occur
    return texts, vecs
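# A brief usage sketch of process() (the German sample sentences are made up):
# the returned `vecs` pairs can be turned into a word->vector dict or stacked
# into an embedding matrix restricted to the subwords that actually occur.
import numpy as np

texts, vecs = process(["Das ist ein Test.", "Noch ein Satz."])
word2vec = dict(vecs)
emb_matrix = np.stack([v for _, v in vecs])
print(len(texts), emb_matrix.shape)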
class TokenBPESegmenter(TokenSubwordSegmenter):
    def __init__(self, language, vocab_size, *args, **kwargs):
        self.__bpe = BPEmb(lang=language, vs=vocab_size)

    def segment(self, token: str):
        subwords = list(self.__bpe.encode(token))
        # Strip the leading '▁' (U+2581) that SentencePiece uses to mark the
        # start of a word; drop the piece entirely if it is only '▁'.
        if subwords[0].startswith('\u2581'):
            if len(subwords[0]) > 1:
                subwords[0] = subwords[0][1:]
            else:
                subwords.pop(0)
        return subwords
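# A minimal usage sketch (assuming the TokenSubwordSegmenter base class needs
# no further setup): the leading word-boundary marker is stripped, so the
# first piece lines up with the start of the original token.
segmenter = TokenBPESegmenter("en", 25000)
print(segmenter.segment("Stratford"))  # e.g. ['strat', 'ford']; exact split depends on the vocab size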
def test_encoding():
    text = ["This is Stratford", "<pad>"]
    bpemb_en = BPEmb(lang="en", add_pad_emb=True)
    # The encode_* helpers can add BOS/EOS markers automatically, but the
    # encoder does not treat "<pad>" specially. Padding has to be done outside,
    # using the pad embedding's index (the last row when add_pad_emb=True).
    print(bpemb_en.encode(text))
    print(bpemb_en.encode_with_eos(text))
    print(bpemb_en.encode_with_bos_eos(text))
    print(bpemb_en.encode_ids(text))
    print(bpemb_en.encode_ids_with_eos(text))
    print(bpemb_en.encode_ids_with_bos_eos(text))
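# A sketch of the manual padding described in the comment above (the target
# length of 10 is arbitrary): with add_pad_emb=True the pad vector is appended
# as the last row of the embedding matrix, so its id is len(vectors) - 1.
bpemb_pad = BPEmb(lang="en", add_pad_emb=True)
pad_id = len(bpemb_pad.vectors) - 1
ids = bpemb_pad.encode_ids("This is Stratford")
padded = ids + [pad_id] * (10 - len(ids))
print(padded)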
def make_byte_pair(corpus):
    """Apply byte-pair encoding to every document in the corpus."""
    # the BPE model
    bpemb_en = BPEmb(lang="en")
    # remove stopwords first to keep memory usage low
    tokenized_corpus = tokenize_preprocess_corpus(corpus)
    documents = []
    for word_tokens in tokenized_corpus:
        sentence = ' '.join(word_tokens)
        documents.append(bpemb_en.encode(sentence))
    return documents
class TweetTokenizer():
    def __init__(self, dim=50, vocab_size=10000, mode='get_id'):
        self.dim = dim
        self.vocab_size = vocab_size
        self.bpemb_en = BPEmb(lang="en", dim=dim, vs=vocab_size)
        self.embedding_weight = self.bpemb_en.vectors
        self.mode = mode

    def __call__(self, tweet, mode='get_id'):
        if mode == 'get_id':
            return torch.tensor(self.bpemb_en.encode_ids(tweet), dtype=torch.long)
        elif mode == 'raw':
            return self.bpemb_en.encode(tweet)
        else:
            raise ValueError('Invalid mode')
class BPembTokenizer(Tokenizer):
    def __init__(self, vocab_size=50000, emb_dim=300, lang='en'):
        super(BPembTokenizer, self).__init__()
        from bpemb import BPEmb
        self.bpemb_en = BPEmb(lang=lang, vs=vocab_size, dim=emb_dim)

    def get_embeddings(self):
        return self.bpemb_en.vectors

    def encode_ids(self, text):
        return self.bpemb_en.encode_ids(text)

    def decode_ids(self, ids):
        return self.bpemb_en.decode_ids(ids)

    def tokenize(self, text):
        return self.bpemb_en.encode(text)
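# A quick roundtrip sketch with the wrapper above (assuming the Tokenizer base
# class takes no required arguments): ids from encode_ids decode back to the
# original, lowercased text.
tokenizer = BPembTokenizer(vocab_size=50000, emb_dim=300, lang='en')
ids = tokenizer.encode_ids("Stratford is a town in Ontario")
print(ids)
print(tokenizer.decode_ids(ids))  # -> "stratford is a town in ontario"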
class BPEmbVaeSampler(VAESampler):
    def __init__(self, lang, vs, dim, decode_from, params, cuda=False):
        self.bp = BPEmb(lang=lang, vs=vs, dim=dim, add_pad_emb=True)
        super().__init__(decode_from, params, cuda)

    def _load_train_data(self):
        class Defaulter(dict):
            def __missing__(self, item):
                return 0
        word2idx = Defaulter(
            **{item: self.bp.emb.vocab[item].index
               for item in self.bp.emb.vocab})
        train_data = MonoTextData(self.params.train_data, label=False,
                                  vocab=word2idx)
        return train_data

    def to_s(self, decoded):
        out = []
        for item in decoded:
            s = self.bp.decode(item).replace('▁', ' ').strip()
            s = s[0].upper() + s[1:]
            s = re.sub(r'\bi\b', 'I', s)
            s = re.sub(r'[.!?]\s+(\w)',
                       lambda m: m.group()[:-1] + m.group()[-1].upper(), s)
            out.append(s)
        return out

    def str2ids(self, s):
        """
        Encode string s with BPEmb. BPEmb has a fixed vocabulary size, but the
        model only has outputs for vocab items that are used in the training
        data, so this function replaces any BPEmb ids *not* in the training
        vocabulary with the model's "unknown" id.
        """
        encoded = self.bp.encode(s)
        ids = [self.vocab.word2id.get(item, self.vocab.unk_id)
               for item in encoded]
        return ids
f=open("dev.en") contents = f.read() print(contents) import dev_test data= dev_test !pip install utils !pip install bpemb from bpemb import BPEmb bpemb_en = BPEmb(lang="en", dim=100) bpemb_hi = BPEmb(lang="hi", vs=1000) bpemb_en.encode(contents) from utils import * from bigan import BiGAN import dev_test training_epochs = 50 batch_size = 128 display_step = 1 # learning_rate=0.001 learning_rate=0.0002 n_samples = int(dev_test.train) training_epochs = 5000 batch_size = 128 display_step = 1
import random
import math
import time

SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

bpemb = BPEmb(lang="ru", vs=50000)

field = Field(tokenize=lambda line: bpemb.encode(line.strip('\n')),
              init_token='<sos>',
              eos_token='<eos>',
              lower=True,
              batch_first=True)

data = TabularDataset(path='train.csv', format='csv',
                      fields=[('original', field), ('paraphrase', field)])
test_data, train_data = data.split(split_ratio=0.05)
field.build_vocab(train_data, min_freq=2)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BATCH_SIZE = 256
class Corpus(object):
    def __init__(self, path, maxlen, vocab_size=20000, lowercase=False,
                 max_lines=-1, test_size=-1,
                 train_path='train.txt', test_path='test.txt',
                 load_vocab_file='',
                 apply_bpe=False  # apply BPE now
                 ):
        self.dictionary = Dictionary()
        self.maxlen = maxlen
        self.lowercase = lowercase
        self.vocab_size = vocab_size
        self.train_path = os.path.join(path, train_path)
        self.test_path = os.path.join(path, test_path)
        self.max_lines = max_lines
        self.apply_bpe = apply_bpe  # NL new: option to apply BPE now
        if self.apply_bpe:
            from bpemb import BPEmb
            BPE_VOCAB_SIZE = 25000
            BPE_DIM = 300
            self.bpemb_en = BPEmb(lang="en", vs=BPE_VOCAB_SIZE, dim=BPE_DIM)
            print("\n\n--Applying BPE live--\n\n")

        # load existing vocabulary or make it from training set
        if len(load_vocab_file) > 0:
            self.load_vocab(load_vocab_file)
        else:
            self.make_vocab()
        assert len(self.dictionary) > 1

        self.train = self.tokenize(self.train_path)

        if test_size > 0 and len(test_path) > 0:
            print("Test size and test path cannot both be present!")
            exit()
        if test_size > 0:
            print("Using {} in training set as test set".format(test_size))
            self.train, self.test = self.train[:-test_size], self.train[-test_size:]
            return
        elif len(test_path) > 0:
            print("Using {} as test set".format(test_path))
            self.test = self.tokenize(self.test_path)

    def make_vocab(self):
        assert os.path.exists(self.train_path)
        # Add words to the dictionary
        with open(self.train_path, 'r') as f:
            linecount = 0
            for line in f:
                linecount += 1
                if self.max_lines > 1 and linecount >= self.max_lines:
                    break
                if self.lowercase:
                    words = line.strip().lower().split()
                else:
                    words = line.strip().split()
                words = words[1:]  # exclude tag
                for word in words:
                    if self.apply_bpe:
                        for bp in self.bpemb_en.encode(word):
                            self.dictionary.add_word(bp)
                    else:
                        self.dictionary.add_word(word)

        # prune the vocabulary
        self.dictionary.prune_vocab(k=self.vocab_size, cnt=False)

    def load_vocab(self, vocab_file='vocab.json'):
        assert os.path.exists(vocab_file)
        import json
        self.dictionary.word2idx = json.load(open("{}".format(vocab_file), "r"))
        self.dictionary.idx2word = {v: k for k, v in self.dictionary.word2idx.items()}
        print("Loaded vocab file {} with {} words".
              format(vocab_file, len(self.dictionary.word2idx)))

    def tokenize(self, path):
        """Tokenizes a text file."""
        # Convert class 1,2 to 0,1
        # print("Convert class 1,2 to 0,1")
        dropped = cropped = 0
        oov_count = 0.
        word_count = 0.
        with open(path, 'r') as f:
            linecount = 0
            lines = []
            tags = []
            for line in f:
                linecount += 1
                if self.max_lines > 1 and linecount >= self.max_lines:
                    break
                if self.lowercase:
                    words = line.lower().strip().split()
                else:
                    words = line.strip().split()
                tag, words = int(words[0]), words[1:]
                # if applying BPE
                if self.apply_bpe:
                    words = [bp for word in words for bp in self.bpemb_en.encode(word)]
                if len(words) > self.maxlen:
                    cropped += 1
                    words = words[:self.maxlen]
                    # try:
                    #     crop_words = words[:maxlen]
                    #     last_period = max(rindex(crop_words, '.'), rindex(crop_words, '!'), rindex(crop_words, ','))
                    # except:
                    #     last_period = self.maxlen
                    # if last_period < 10:
                    #     print("Sentence too short! {}".format(words))
                    # words = words[:last_period]
                if len(words) < 3:
                    dropped += 1
                    # print(words)
                    continue
                words = ['<sos>'] + words
                words += ['<eos>']

                # vectorize
                vocab = self.dictionary.word2idx
                unk_idx = vocab['<oov>']
                indices = [vocab[w] if w in vocab else unk_idx for w in words]
                word_count += len(indices)
                oov_count += sum([1 if ii == unk_idx else 0 for ii in indices])
                # add to output list
                lines.append(indices)
                # Convert class 1,2 to 0,1
                # tag = tag - 1
                tags.append(tag)

        # tags = to_class_id(tags)
        print("Number of sentences cropped from {}: {} out of {} total, dropped {}. OOV rate {:.3f}".
              format(path, cropped, linecount, dropped, oov_count / word_count))

        return list(zip(tags, lines))
lines = pd.read_table('eng_fre.txt', names=['eng', 'fre'])

!pip install bpemb
from bpemb import BPEmb
import pickle

lines.fre
type(lines.eng)

bpemb_en = BPEmb(lang="en")
bpemb_fr = BPEmb(lang="fr")

subword_eng = bpemb_en.encode(lines.eng)
subword_fre = bpemb_fr.encode(lines.fre)
lines.eng

pickle.dump(subword_eng, open("sub_only_eng.pkl", "wb"))
pickle.dump(subword_fre, open("sub_only_fre.pkl", "wb"))

from google.colab import files
file = files.upload()
type(lines.English)


# In[17]:

bpemb_en = BPEmb(lang="en")


# In[18]:

bpemb_fr = BPEmb(lang="fr")


# In[19]:

subword_English = bpemb_en.encode(lines.English)


# In[20]:

subword_English


# In[21]:

type(subword_English)


# In[22]:

import pandas as pd
from bpemb import BPEmb

# set up BPE
bpemb = BPEmb(lang='an', vs=100000)

# write all encoded train tweets to file
with open('train.csv', 'r') as file:
    with open('encodedTrain.csv', 'w') as encoded:
        for tweet in file:
            tweet = bpemb.encode(tweet)
            tweet = ' '.join(tweet)
            encoded.write(tweet)
# `path`, `in_file_name`, `size`, `multibpemb` and `inside_word_token` are
# defined in earlier cells of this notebook.
with open(os.path.join(path, in_file_name)) as in_file:
    words, tags = [], []
    for line in in_file.readlines():
        split = line.strip().split('\t')
        if len(split) > 1:
            word, tag = split[0], split[1]
            words.append(word)
            tags.append(tag)
        else:
            words.append('')
            tags.append('')

out_file_name = 'ml-' + in_file_name + f'.bpe-{size}'
with open(os.path.join(path, out_file_name), 'w') as out_file:
    encoded_words = multibpemb.encode(words)
    assert len(encoded_words) == len(tags)
    for encoded_word, tag in zip(encoded_words, tags):
        if tag == '':
            out_file.write('\n')
        else:
            # the first subword keeps the word's tag; the remaining
            # subwords get the placeholder `inside_word_token`
            for i, token in enumerate(encoded_word):
                if i == 0:
                    out_file.write(f'{token}\t{tag}\n')
                else:
                    out_file.write(f'{token}\t{inside_word_token}\n')

# %%

# %%
import csv
import numpy
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from bpemb import BPEmb
from tqdm import tqdm

bpemb_en = BPEmb(lang="en", dim=50)
# print(bpemb_en.encode("Stratford"))
# print(bpemb_en.embed("Stratford").shape)

with open("datasets/Chatbot/train.csv") as f:
    reader = csv.reader(f, delimiter='\t')
    max_len = 0
    y = []
    for row in tqdm(reader):
        y.append(row[1])
        sample_len = len(bpemb_en.encode(row[0]))
        max_len = sample_len if sample_len > max_len else max_len
# print(max_len)
# print(y[:10])

# label encoder
le = LabelEncoder()
encoded_labels = le.fit_transform(y)
# print(encoded_labels)
print(le.classes_)

x = None
y = to_categorical(encoded_labels, num_classes=len(le.classes_))
# print(y)
def map_word_to_sub_words(instance, bpemb: BPEmb):
    return tuple(bpemb.encode(instance))