Example No. 1
def test():
    bpemb_en = BPEmb(lang="en", dim=100)
    s = "Stratford"
    res1 = bpemb_en.encode(s)
    res2 = bpemb_en.encode_ids(s)
    print(res1)
    print(res2)

    bpemb_en_100k = BPEmb(lang="en", vs=100000, dim=100)  # 40 M;词表越大切分越少
    s = "hello world !"
    bpemb_en_100k.encode_ids(s)
    res1 = bpemb_en_100k.encode(s)
    res2 = bpemb_en_100k.encode_ids(s)
    print(res1)
    print(res2)
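
# A quick check of the effect noted above: with a larger BPE vocabulary the same word is
# split into fewer pieces (the exact segmentations depend on the pretrained models).
def compare_vocab_sizes(word="stratford"):
    small = BPEmb(lang="en", vs=10000, dim=100)
    large = BPEmb(lang="en", vs=100000, dim=100)
    print(len(small.encode(word)), small.encode(word))
    print(len(large.encode(word)), large.encode(word))
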
class BPETokenizer(Tokenizer):
    def __init__(self, **kwargs):
        lang = kwargs.get("lang", "en")
        vs = kwargs.get("limit", 200000)

        self.bpemb = BPEmb(lang=lang, vs=vs)
        self.tokenizer = SpacyTokenizer(model="en",
                                        annotators=["lemma", "pos", "ner"])
        self.annotators = self.tokenizer.annotators

    def tokenize(self, text):
        data_spacy = self.tokenizer.tokenize(text).data

        data = []
        start_ws = 0
        for i in range(len(data_spacy)):
            subwords = self.bpemb.encode(data_spacy[i][0])
            for j, sub in enumerate(subwords):
                # keep the spaCy token's start offset for the first subword and its
                # end offset for the last; -2 marks "no character offset"
                tuple_idx = [-2, -2]
                if j == 0:
                    tuple_idx[0] = data_spacy[i][2][0]
                if j + 1 == len(subwords):
                    tuple_idx[1] = data_spacy[i][2][1]
                data.append(
                    (sub, data_spacy[i][1] if j == 0 else "", tuple_idx,
                     data_spacy[i][3], data_spacy[i][4], data_spacy[i][5]))

        return Tokens(data, self.annotators, opts={'non_ent': ''})
Example No. 3
def test_punctuation():
    text = [
        "Leonidas: This's Sparta!!", "Leonidas : This ' s Sparta ! !",
        "Leonidas This s Sparta"
    ]
    bpemb_multi = BPEmb(lang="multi", add_pad_emb=True)
    print(bpemb_multi.encode(text))
Example No. 4
class BPEmbVaeSampler(VAESampler):
    def __init__(self, lang, vs, dim, decode_from, params, cuda=False):
        self.bp = BPEmb(lang=lang, vs=vs, dim=dim)
        super().__init__(decode_from, params, cuda)

    def to_s(self, decoded):
        out = []
        for item in decoded:
            s = self.bp.decode(item).replace('▁', ' ').strip()
            s = s[0].upper() + s[1:]
            s = re.sub(r'\bi\b', 'I', s)
            s = re.sub(r'[.!?]\s+(\w)',
                       lambda m: m.group()[:-1] + m.group()[-1].upper(), s)
            out.append(s)
        return out

    def str2ids(self, s):
        """
        Encode string s with BPEmb. BPEmb has a fixed vocabulary size, but
        the model only has outputs for vocab items that are used in the
        training data, so this function replaces any BPEmb ids *not* in the
        training vocabulary with the model's "unknown" id.
        """
        encoded = self.bp.encode(s)
        ids = [self.vocab.word2id.get(item, self.vocab.unk_id) \
                for item in encoded]
        return ids
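
# A toy illustration of the unknown-id fallback in str2ids (the vocabulary here is made up):
toy_word2id = {"▁strat": 4, "ford": 7}
toy_unk_id = 0
pieces = ["▁strat", "ford", "▁zzz"]
print([toy_word2id.get(p, toy_unk_id) for p in pieces])  # -> [4, 7, 0]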
Example No. 5
class BPEmbAug:
    """
    Thai text augmentation using word2vec embeddings from BPEmb

    BPEmb:
    `github.com/bheinzerling/bpemb <https://github.com/bheinzerling/bpemb>`_
    """
    def __init__(self, lang: str = "th", vs: int = 100000, dim: int = 300):
        self.bpemb_temp = BPEmb(lang=lang, dim=dim, vs=vs)
        self.model = self.bpemb_temp.emb
        self.load_w2v()

    def tokenizer(self, text: str) -> List[str]:
        """
        :param str text: Thai text
        :rtype: List[str]
        """
        return self.bpemb_temp.encode(text)

    def load_w2v(self):
        """
        Load BPEmb model
        """
        self.aug = Word2VecAug(self.model,
                               tokenize=self.tokenizer,
                               type="model")

    def augment(self,
                sentence: str,
                n_sent: int = 1,
                p: float = 0.7) -> List[str]:
        """
        Text augmentation using word2vec embeddings from BPEmb

        :param str sentence: Thai sentence
        :param int n_sent: number of augmented sentences to generate
        :param float p: probability of replacing a word

        :return: list of synonyms
        :rtype: List[str]
        :Example:
        ::

            from pythainlp.augment.word2vec.bpemb_wv import BPEmbAug

            aug = BPEmbAug()
            aug.augment("ผมเรียน", n_sent=2, p=0.5)
            # output: ['ผมสอน', 'ผมเข้าเรียน']
        """
        self.sentence = sentence.replace(" ", "▁")
        self.temp = self.aug.augment(self.sentence, n_sent, p=p)
        self.temp_new = []
        for i in self.temp:
            self.t = ""
            for j in i:
                self.t += j.replace('▁', '')
            self.temp_new.append(self.t)
        return self.temp_new
Example No. 6
def process(texts, vocab_size=25000, dim=300):
    emb = BPEmb(lang='de', vs=vocab_size, dim=dim)

    texts = [emb.encode(t) for t in texts]

    unique_words = set([w for t in texts for w in t])
    vecs = [
        wv for (i, wv) in enumerate(zip(emb.words, emb.vectors))
        if i < 3 or wv[0] in unique_words
    ]  # keep the first three rows (special tokens) plus subwords that occur in the texts

    return texts, vecs
Example No. 7
class TokenBPESegmenter(TokenSubwordSegmenter):
    def __init__(self, language, vocab_size, *args, **kwargs):
        self.__bpe = BPEmb(lang=language, vs=vocab_size)

    def segment(self, token: str):
        subwords = self.__bpe.encode(token)
        # drop the leading SentencePiece word-boundary marker (U+2581) from the first subword
        if subwords[0].startswith('\u2581'):
            if len(subwords[0]) > 1:
                subwords[0] = subwords[0][1:]
            else:
                subwords.pop(0)
        return subwords
Example No. 8
def test_encoding():
    text = ["This is Stratford", "<pad>"]

    bpemb_en = BPEmb(lang="en", add_pad_emb=True)

    # encode() can automatically add start/end tokens, but it cannot handle <pad> directly.
    # Padding must be done outside, using the pad index (the last embedding row when add_pad_emb=True).
    print(bpemb_en.encode(text))
    print(bpemb_en.encode_with_eos(text))
    print(bpemb_en.encode_with_bos_eos(text))
    print(bpemb_en.encode_ids(text))
    print(bpemb_en.encode_ids_with_eos(text))
    print(bpemb_en.encode_ids_with_bos_eos(text))
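
# A minimal padding sketch (assuming add_pad_emb=True as above): the pad vector is the last
# row of the embedding matrix, so its id is len(bpemb_en.vectors) - 1.
def pad_batch(bpemb, sentences, max_len=16):
    pad_id = len(bpemb.vectors) - 1
    batch = []
    for ids in bpemb.encode_ids_with_bos_eos(sentences):
        ids = ids[:max_len]
        batch.append(ids + [pad_id] * (max_len - len(ids)))
    return batch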
Example No. 9
def make_byte_pair(corpus):
    '''This function implements byte-pair encodings'''

    # the bpe model
    bpemb_en = BPEmb(lang="en")

    # remove stopwords with this preprocessing helper to keep memory usage low
    tokenized_corpus = tokenize_preprocess_corpus(corpus)
    documents = []

    for word_tokens in tokenized_corpus:
        sentence = ' '.join(word_tokens)
        documents.append(bpemb_en.encode(sentence))

    return documents
Example No. 10
class TweetTokenizer():
    def __init__(self, dim=50, vocab_size=10000, mode='get_id'):
        self.dim = dim
        self.vocab_size = vocab_size
        self.bpemb_en = BPEmb(lang="en", dim=dim, vs=vocab_size)
        self.embedding_weight = self.bpemb_en.vectors
        self.mode = mode
    
    def __call__(self, tweet, mode=None):
        mode = mode or self.mode  # fall back to the mode chosen at construction
        if mode == 'get_id':
            return torch.tensor(self.bpemb_en.encode_ids(tweet), dtype=torch.long)
        elif mode == 'raw':
            return self.bpemb_en.encode(tweet)
        else:
            raise ValueError('Invalid mode')
Example No. 11
class BPembTokenizer(Tokenizer):
    def __init__(self, vocab_size=50000, emb_dim=300, lang='en'):
        super(BPembTokenizer, self).__init__()
        from bpemb import BPEmb
        self.bpemb_en = BPEmb(lang=lang, vs=vocab_size, dim=emb_dim)

    def get_embeddings(self):
        return self.bpemb_en.vectors

    def encode_ids(self, text):
        return self.bpemb_en.encode_ids(text)

    def decode_ids(self, ids):
        return self.bpemb_en.decode_ids(ids)

    def tokenize(self, text):
        return self.bpemb_en.encode(text)
Example No. 12
class BPEmbVaeSampler(VAESampler):
    def __init__(self, lang, vs, dim, decode_from, params, cuda=False):
        self.bp = BPEmb(lang=lang, vs=vs, dim=dim, add_pad_emb=True)
        super().__init__(decode_from, params, cuda)

    def _load_train_data(self):
        class Defaulter(dict):
            def __missing__(self, item):
                return 0
        word2idx = Defaulter(
                **{item: self.bp.emb.vocab[item].index \
                        for item in self.bp.emb.vocab})
        train_data = MonoTextData(self.params.train_data,
                                  label=False,
                                  vocab=word2idx)
        return train_data

    def to_s(self, decoded):
        out = []
        for item in decoded:
            s = self.bp.decode(item).replace('▁', ' ').strip()
            s = s[0].upper() + s[1:]
            s = re.sub(r'\bi\b', 'I', s)
            s = re.sub(r'[.!?]\s+(\w)',
                       lambda m: m.group()[:-1] + m.group()[-1].upper(), s)
            out.append(s)
        return out

    def str2ids(self, s):
        """
        Encode string s with BPEmb. BPEmb has a fixed vocabulary size, but
        the model only has outputs for vocab items that are used in the
        training data, so this function replaces any BPEmb ids *not* in the
        training vocabulary with the model's "unknown" id.
        """
        encoded = self.bp.encode(s)
        ids = [self.vocab.word2id.get(item, self.vocab.unk_id) \
                for item in encoded]
        return ids
Example No. 13
f=open("dev.en")
contents = f.read()
print(contents)

import dev_test
data= dev_test

!pip install utils

!pip install bpemb
from bpemb import BPEmb

bpemb_en = BPEmb(lang="en", dim=100)
bpemb_hi = BPEmb(lang="hi", vs=1000)

bpemb_en.encode(contents)

from utils import *
from bigan import BiGAN
import dev_test

training_epochs = 50
batch_size = 128
display_step = 1
# learning_rate=0.001
learning_rate=0.0002
n_samples = int(dev_test.train)

training_epochs = 5000
batch_size = 128
display_step = 1
import random
import math
import time

import numpy as np
import torch
from torchtext.data import Field, TabularDataset  # torchtext.legacy.data on newer torchtext versions

SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

bpemb = BPEmb(lang="ru", vs=50000)

field = Field(tokenize=lambda line: bpemb.encode(line.strip('\n')),
              init_token='<sos>',
              eos_token='<eos>',
              lower=True,
              batch_first=True)

data = TabularDataset(path='train.csv',
                      format='csv',
                      fields=[('original', field), ('paraphrase', field)])
test_data, train_data = data.split(split_ratio=0.05)

field.build_vocab(train_data, min_freq=2)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

BATCH_SIZE = 256
Example No. 15
class Corpus(object):
    def __init__(self, path, maxlen, vocab_size=20000, lowercase=False,
                 max_lines=-1, test_size=-1, train_path='train.txt',
                 test_path='test.txt', load_vocab_file='',
                 apply_bpe=False  # apply BPE now
                 ):
        self.dictionary = Dictionary()
        self.maxlen = maxlen
        self.lowercase = lowercase
        self.vocab_size = vocab_size
        self.train_path = os.path.join(path, train_path)
        self.test_path = os.path.join(path, test_path)
        self.max_lines = max_lines
        self.apply_bpe = apply_bpe

        # NL new: option to apply BPE now
        if self.apply_bpe:
            from bpemb import BPEmb
            BPE_VOCAB_SIZE = 25000
            BPE_DIM = 300
            self.bpemb_en = BPEmb(lang="en", vs=BPE_VOCAB_SIZE, dim=BPE_DIM)
            print("\n\n--Applying BPE live--\n\n")
            
        # load existing vocabulary or make it from training set
        if len(load_vocab_file) > 0:
            self.load_vocab(load_vocab_file)
        else:
            self.make_vocab()
        assert len(self.dictionary) > 1
        self.train = self.tokenize(self.train_path)
        if test_size > 0 and len(test_path) > 0:
            print("Test size and test path cannot both be present!")
            exit()
        if test_size > 0:
            print("Using {} in training set as test set".format(test_size))
            self.train, self.test = self.train[:-test_size], self.train[-test_size:]
            return
        elif len(test_path) > 0:
            print("Using {} as test set".format(test_path))
            self.test = self.tokenize(self.test_path)

    def make_vocab(self):
        assert os.path.exists(self.train_path)
        # Add words to the dictionary
        with open(self.train_path, 'r') as f:
            linecount = 0
            for line in f:
                linecount += 1
                if self.max_lines > 1 and linecount >= self.max_lines:
                    break
                if self.lowercase:
                    words = line.strip().lower().split()
                else:
                    words = line.strip().split()
                words = words[1:] # exclude tag
                for word in words:
                    if self.apply_bpe:
                        for bp in self.bpemb_en.encode(word):
                            self.dictionary.add_word(bp)
                    else:
                        self.dictionary.add_word(word)

        # prune the vocabulary
        self.dictionary.prune_vocab(k=self.vocab_size, cnt=False)

    def load_vocab(self, vocab_file='vocab.json'):
        assert os.path.exists(vocab_file)
        import json
        self.dictionary.word2idx = json.load(open("{}".format(vocab_file), "r"))
        self.dictionary.idx2word = {v: k for k, v in self.dictionary.word2idx.items()}
        print("Loaded vocab file {} with {} words".
              format(vocab_file, len(self.dictionary.word2idx)))

    def tokenize(self, path):
        """Tokenizes a text file."""
        # Convert class 1,2 to 0,1
        dropped = cropped = 0
        oov_count = 0.
        word_count = 0.
        with open(path, 'r') as f:
            linecount = 0
            lines = []
            tags = []
            for line in f:
                linecount += 1
                if self.max_lines > 1 and linecount >= self.max_lines:
                    break
                if self.lowercase:
                    words = line.lower().strip().split()
                else:
                    words = line.strip().split()
                tag, words = int(words[0]), words[1:]

                # if applying BPE
                if self.apply_bpe:
                    words = [bp for word in words
                             for bp in self.bpemb_en.encode(word)]

                if len(words) > self.maxlen:
                    cropped += 1
                    words = words[:self.maxlen]
#                     try:
#                         crop_words = words[:maxlen]
#                         last_period = max(rindex(crop_words, '.'), rindex(crop_words, '!'), rindex(crop_words, ','))
#                     except:
#                         last_period = self.maxlen
#                     if last_period < 10:
#                         print("Sentence too short! {}".format(words))
#                     words = words[:last_period]
                if len(words) < 3:
                    dropped += 1
#                     print(words)
                    continue
                words = ['<sos>'] + words
                words += ['<eos>']

                # vectorize
                vocab = self.dictionary.word2idx
                unk_idx = vocab['<oov>']
                indices = [vocab[w] if w in vocab else unk_idx for w in words]
                word_count += len(indices)
                oov_count += sum([1 if ii==unk_idx else 0 for ii in indices])
                # add to output list
                lines.append(indices)
                # Convert class 1,2 to 0,1
                # tag = tag - 1
                tags.append(tag)
        # tags = to_class_id(tags)
        print("Number of sentences cropped from {}: {} out of {} total, dropped {}. OOV rate {:.3f}".
              format(path, cropped, linecount, dropped, oov_count/word_count))

        return list(zip(tags, lines))
Example No. 16
import pandas as pd
import pickle

lines = pd.read_table('eng_fre.txt', names=['eng', 'fre'])

!pip install bpemb

from bpemb import BPEmb

lines.fre

type(lines.eng)

bpemb_en = BPEmb(lang="en")

bpemb_fr = BPEmb(lang="fr")

subword_eng = bpemb_en.encode(lines.eng)

subword_fre = bpemb_fr.encode(lines.fre)  # use the French model for the French column

lines.eng



pickle.dump(subword_eng, open("sub_only_eng.pkl", "wb"))

pickle.dump(subword_fre, open("sub_only_fre.pkl", "wb"))

from google.colab import files
file=files.upload()

import pickle
type(lines.English)


bpemb_en = BPEmb(lang="en")


bpemb_fr = BPEmb(lang="fr")


subword_English=bpemb_en.encode(lines.English)


subword_English


type(subword_English)


import pandas as pd
Example No. 18
from bpemb import BPEmb

# set up BPE
bpemb = BPEmb(lang='an', vs=100000)

# write all encoded train tweets to file
with open('train.csv', 'r') as file:
    with open('encodedTrain.csv', 'w') as encoded:
        for tweet in file:
            tweet = bpemb.encode(tweet.strip())
            tweet = ' '.join(tweet)
            encoded.write(tweet + '\n')  # one encoded tweet per line
Example No. 19
        ]:
            with open(os.path.join(path, in_file_name)) as in_file:
                words, tags = [], []
                for line in in_file.readlines():
                    split = line.strip().split('\t')
                    if len(split) > 1:
                        word, tag = split[0], split[1]
                        words.append(word)
                        tags.append(tag)
                    else:
                        words.append('')
                        tags.append('')
                out_file_name = 'ml-' + in_file_name + f'.bpe-{size}'

                with open(os.path.join(path, out_file_name), 'w') as out_file:
                    encoded_words = multibpemb.encode(words)
                    assert len(encoded_words) == len(tags)
                    for encoded_word, tag in zip(encoded_words, tags):
                        if tag == '':
                            out_file.write('\n')
                        else:
                            for i, token in enumerate(encoded_word):
                                if i == 0:
                                    out_file.write(f'{token}\t{tag}\n')
                                else:
                                    out_file.write(
                                        f'{token}\t{inside_word_token}\n')

Example No. 20
import csv
import numpy
from tqdm import tqdm
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from bpemb import BPEmb

bpemb_en = BPEmb(lang="en", dim=50)

#print(bpemb_en.encode("Stratford"))
#print(bpemb_en.embed("Stratford").shape)

with open("datasets/Chatbot/train.csv") as f:
    reader = csv.reader(f, delimiter='\t')
    max_len = 0
    y = []
    for row in tqdm(reader):
        y.append(row[1])
        sample_len = len(bpemb_en.encode(row[0]))
        max_len = sample_len if sample_len > max_len else max_len

#print(max_len)
#print(y[:10])

# label encoder
le = LabelEncoder()
encoded_labels = le.fit_transform(y)
#print(encoded_labels)
print(le.classes_)

x = None

y = to_categorical(encoded_labels, num_classes=len(le.classes_))
#print(y)
Example No. 21
def map_word_to_sub_words(instance, bpemb: BPEmb):
    return tuple(bpemb.encode(instance))