    def test_train_from_iterator(self):
        text = ["A first sentence", "Another sentence", "And a last one"]
        tokenizer = CharBPETokenizer()
        tokenizer.train_from_iterator(text, show_progress=False)

        output = tokenizer.encode("A sentence")
        assert output.tokens == ["A</w>", "sentence</w>"]
Example no. 2
    def test_basic_encode(self, openai_files):
        tokenizer = CharBPETokenizer(openai_files["vocab"],
                                     openai_files["merges"])

        output = tokenizer.encode("My name is John", "pair")
        assert output.ids == [0, 253, 1362, 544, 0, 7, 12662, 2688]
        assert output.tokens == [
            "<unk>",
            "y</w>",
            "name</w>",
            "is</w>",
            "<unk>",
            "o",
            "hn</w>",
            "pair</w>",
        ]
        assert output.offsets == [
            (0, 1),
            (1, 2),
            (3, 7),
            (8, 10),
            (11, 12),
            (12, 13),
            (13, 15),
            (0, 4),
        ]
        assert output.type_ids == [0, 0, 0, 0, 0, 0, 0, 1]
Example no. 3
class HuggingFaceTokenizer:
    def __init__(self, cache_dir, max_length=None, vocab_size=400):
        self.vocab_size = vocab_size
        self.max_length = max_length
        self.cache_dir = cache_dir
        self.name = "%d-%s" % (vocab_size, max_length)
        self.tokenizer = None

        vocab = os.path.join(self.cache_dir, self.name + '-vocab.json')
        merges = os.path.join(self.cache_dir, self.name + '-merges.txt')
        if os.path.exists(vocab) and os.path.exists(merges):
            self.tokenizer = CharBPETokenizer(vocab, merges, lowercase=True)
            print('Using cached HuggingFaceTokenizer')

    def build(self, texts):
        if self.tokenizer is not None:
            return

        tmp_file = tempfile.NamedTemporaryFile()

        with open(tmp_file.name, "w") as f:
            f.write(' '.join(texts).lower())

        self.tokenizer = CharBPETokenizer(lowercase=True)
        self.tokenizer.train(
            [tmp_file.name],
            vocab_size=self.vocab_size,
            special_tokens=[
                NUL_token,
                PAD_token,
                BOS_token,
                UNK_token,
            ],
        )
        os.makedirs(self.cache_dir, exist_ok=True)
        self.tokenizer.save(self.cache_dir, self.name)

    def encode(self, text):
        token_ids = self.tokenizer.encode(text.lower()).ids
        token_ids = token_ids[:self.max_length]

        return token_ids

    def decode(self, tokens, skip_special_tokens=True):
        # Note: in practice the special tokens were observed to survive decoding
        # even with skip_special_tokens=True; manually filtering the ids (the
        # commented-out line below) is an alternative workaround.
        text = self.tokenizer.decode(
            tokens,
            # [token for token in tokens if token > 3],
            skip_special_tokens=skip_special_tokens,
        )
        return text

    def decode_plus(self, token_batch):
        sentences = []
        for tokens in token_batch:
            sentences.append(self.decode(tokens))
        return sentences
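
A minimal usage sketch for the wrapper above, added here for illustration: the special-token strings and the cache directory are placeholder assumptions, and the older `tokenizer.save(folder, name)` API that the class relies on is assumed to be available.

# Hypothetical placeholders for the special-token constants the class expects.
NUL_token, PAD_token, BOS_token, UNK_token = "<nul>", "<pad>", "<bos>", "<unk>"

hf_tok = HuggingFaceTokenizer(cache_dir="cache/hf_tok", max_length=32, vocab_size=400)
hf_tok.build(["a first sentence", "another sentence", "and a last one"])

ids = hf_tok.encode("Another sentence")   # lowercased and truncated to max_length
print(ids)
print(hf_tok.decode(ids))                 # round-trip back to text
print(hf_tok.decode_plus([ids, ids]))     # batch decoding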
Example no. 4
    def test_lowercase(self, openai_files):
        tokenizer = CharBPETokenizer(openai_files["vocab"],
                                     openai_files["merges"],
                                     lowercase=True)
        output = tokenizer.encode("My name is John",
                                  "pair",
                                  add_special_tokens=False)
        assert output.ids == [547, 1362, 544, 2476, 2688]
        assert output.tokens == [
            "my</w>", "name</w>", "is</w>", "john</w>", "pair</w>"
        ]
        assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15), (0, 4)]
        assert output.type_ids == [0, 0, 0, 0, 1]
Example no. 5
def test():
    """Test trained tokenizer"""

    tokenizer = CharBPETokenizer('./thyme-tokenizer-vocab.json',
                                 './thyme-tokenizer-merges.txt')

    vocab = tokenizer.get_vocab()
    print('vocab size:', len(vocab))

    encoded = tokenizer.encode('patient dr. who diagnosed with brain abc')
    encoded.pad(15)

    print('encoded:', encoded.ids)
    print('decoded:', tokenizer.decode(encoded.ids))

    print(encoded.tokens)
    print(encoded.attention_mask)
Example no. 6
class SubwordEncoder:
    "Subword tokenization" 

    def __init__(self, path='subword/'):
        """ 
        Args:
            path: str, a path to vocab file.
        """
        
        # Load vocab
        self.subword_tokenizer = CharBPETokenizer(
            vocab_file=path + "/bpe-vocab.json",
            merges_file=path + "/bpe-merges.txt")

        self.encode = self._encode_subwords
        self.id_to_token = self._id_to_subword()
        self.token_to_id = self._subword_to_id()

    def get_vocab_size(self):
        return self.subword_tokenizer.get_vocab_size()   

    def _encode_subwords(self, sentence, with_eos):
        """ 
        Args:
            sentence: str, text to be encoded.
            with_eos: bool, whether to append the <EOS> token.
        Returns:
            tokens: list, encoded sequence of token ids.
        """
        tokens = self.subword_tokenizer.encode(sentence).ids
        if with_eos:
            tokens += [2] # 2 is the id of <EOS> token
        return tokens

    def _id_to_subword(self):
        id2subword = {}
        for i in range(self.get_vocab_size()):
            id2subword[i] = self.subword_tokenizer.id_to_token(i)
        return id2subword

    def _subword_to_id(self):
        subword2id = {}
        for i in range(self.get_vocab_size()):
            subword2id[self.subword_tokenizer.id_to_token(i)] = i
        return subword2id
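
A short usage sketch, not part of the original example: it assumes a `subword/` directory that already contains `bpe-vocab.json` and `bpe-merges.txt` from a previous CharBPETokenizer training run.

encoder = SubwordEncoder(path='subword')
print('vocab size:', encoder.get_vocab_size())

ids = encoder.encode("a sample sentence", with_eos=True)  # appends id 2, the <EOS> token
print(ids)
print([encoder.id_to_token[i] for i in ids])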
Example no. 7
def to_lstm_inputs(texts, max_len=None):
  """Matrix of token ids, padded at the beginning rather than at the end"""

  tokenizer = CharBPETokenizer(
    '../Tokenize/thyme-tokenizer-vocab.json',
    '../Tokenize/thyme-tokenizer-merges.txt')
  seqs = [tokenizer.encode(text).ids for text in texts]

  if max_len is None:
    # set max_len to the length of the longest sequence
    max_len = max(len(id_seq) for id_seq in seqs)

  ids = torch.zeros(len(seqs), max_len, dtype=torch.long)

  for i, seq in enumerate(seqs):
    if len(seq) > max_len:
      seq = seq[:max_len]
    ids[i, -len(seq):] = torch.tensor(seq)

  return ids
Example no. 8
def to_token_id_sequences(texts, max_len=None):
  """Matrix of token ids"""

  tokenizer = CharBPETokenizer(
    '../Tokenize/thyme-tokenizer-vocab.json',
    '../Tokenize/thyme-tokenizer-merges.txt')
  seqs = [tokenizer.encode(text).ids for text in texts]

  if max_len is None:
    # set max_len to the length of the longest sequence
    max_len = max(len(id_seq) for id_seq in seqs)

  ids = torch.zeros(len(seqs), max_len, dtype=torch.long)

  for i, seq in enumerate(seqs):
    if len(seq) > max_len:
      seq = seq[:max_len]
    ids[i, :len(seq)] = torch.tensor(seq)

  return ids
Example no. 9
def to_transformer_inputs(texts, max_len=None):
  """Matrix of token ids and a square attention mask for each sample"""

  tokenizer = CharBPETokenizer(
    '../Tokenize/thyme-tokenizer-vocab.json',
    '../Tokenize/thyme-tokenizer-merges.txt')
  seqs = [tokenizer.encode(text).ids for text in texts]

  if max_len is None:
    # set max_len to the length of the longest sequence
    max_len = max(len(id_seq) for id_seq in seqs)

  ids = torch.zeros(len(seqs), max_len, dtype=torch.long)
  mask = torch.zeros(len(seqs), max_len, max_len, dtype=torch.long)

  for i, seq in enumerate(seqs):
    if len(seq) > max_len:
      seq = seq[:max_len]
    ids[i, :len(seq)] = torch.tensor(seq)
    mask[i, :len(seq), :len(seq)] = 1

  return ids, mask
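
A rough illustration, added here, of how the three padding helpers above differ; it assumes the `../Tokenize/thyme-tokenizer-*` files they reference are available.

texts = ['patient diagnosed with brain abc', 'short note']

ids, mask = to_transformer_inputs(texts, max_len=10)
print(ids.shape)     # torch.Size([2, 10]) -- token ids, right-padded with zeros
print(mask.shape)    # torch.Size([2, 10, 10]) -- square attention mask per sample

left_padded = to_lstm_inputs(texts, max_len=10)   # same ids, but padded on the left
print(left_padded[1])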
Example no. 10
class BPETokenizer:
    def __init__(self, text_list, vocab_size, lazy=False):
        if not lazy:
            self.tokenizer = CharBPETokenizer()
            self.tokenizer.train(text_list,
                                 vocab_size=vocab_size,
                                 special_tokens=[PAD, BOS, EOS, "<unk>"])
            self.tokenizer.add_special_tokens([PAD, BOS, EOS])
        else:
            self.tokenizer = None

    def tokens_to_ids(self, tokens):
        return [self.tokenizer.token_to_id(t) for t in tokens]

    def ids_to_tokens(self, ids):
        return [self.tokenizer.id_to_token(i) for i in ids]

    def encode(self, text):
        encodes = self.tokenizer.encode(text)
        return encodes.ids

    def decode(self, ids, skip_special=True):
        return self.tokenizer.decode(ids, skip_special_tokens=skip_special)

    def save(self, path, file_name):
        self.tokenizer.save(path, file_name)

    @classmethod
    def load(cls, vocab, merges):
        tkz = cls(None, None, lazy=True)
        tkz.tokenizer = CharBPETokenizer(vocab, merges)
        tkz.tokenizer.add_special_tokens([PAD, BOS, EOS])
        return tkz

    def __len__(self):
        return self.tokenizer.get_vocab_size()
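
A quick sketch of training and reloading the wrapper above, added for illustration; PAD/BOS/EOS are placeholder token strings, the file names are hypothetical, and the older `save(folder, name)` API is assumed.

# Placeholder special-token strings assumed by BPETokenizer.
PAD, BOS, EOS = "<pad>", "<bos>", "<eos>"

bpe = BPETokenizer(["corpus.txt"], vocab_size=8000)   # trains on the listed text files
bpe.save("tokenizer_dir", "demo")                     # writes demo-vocab.json / demo-merges.txt

restored = BPETokenizer.load("tokenizer_dir/demo-vocab.json",
                             "tokenizer_dir/demo-merges.txt")
print(restored.decode(restored.encode("a quick test")))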
Example no. 11
from tokenizers import CharBPETokenizer
import json
import tqdm

if __name__ == "__main__":
    # Initialize a tokenizer
    tokenizer = CharBPETokenizer()

    # Then train it!
    tokenizer.train(
        [
            "data\\train.txt",
            "D:/数据/wikitext-2-raw-v1/wikitext-2-raw/wiki.train.raw",
            "D:/数据/webtext2019zh/web_text_raw.txt"
        ],
        vocab_size=30000,
        min_frequency=2,
        special_tokens=['<UNK>', '<BOS>', '<EOS>', '<PAD>', '<CLS>', '<SEP>'])

    # Now, let's use it:
    encoded = tokenizer.encode("I can feel the magic, can you?")

    # And finally save it somewhere
    tokenizer.save("./", "bpe.tokenizer.json")
Example no. 12
class EngGerNewstest(Dataset):
    """
    The newstest 2014 dataset used for testing
    """
    def __init__(self,
                 data_folder,
                 rank=0,
                 val_set=False,
                 world_size=1,
                 seed=0,
                 eng_to_ger=True,
                 vocab_size=37000,
                 MASK="<MASK>",
                 START="<START>",
                 STOP="<STOP>",
                 exp_name="",
                 max_context=None,
                 batch_size=128,
                 val_size=30000,
                 **kwargs):
        """
        rank: int
            the rank in the distributed training
        val_set: bool
            if true, this dataset is created as the validation set
        world_size: int
            the number of processes if using distributed training
        seed: int
            random seed
        data_folder: str
            the path to the folder that should contain a `newstest2014.en`
            and a `newstest2014.de` file.
        eng_to_ger: bool
            if true, the x values are returned as English ids and the
            y values are German ids. If false, then vice versa
        vocab_size: int
            the number of encodings for the byte-pair encoding scheme
        MASK: str
            the mask token
        START: str
            the start token
        STOP: str
            the stop token
        exp_name: str
            name of the experiment
        max_context: int
            the maximum sequence length
        val_size: int
            the number of samples to be set aside for validation
        """
        self.rank = rank
        print("rank:", self.rank)
        self.world_size = world_size
        self.val_set = val_set
        self.val_size = val_size
        self.batch_size = batch_size
        self.data_folder = os.path.expanduser(data_folder)
        self.en_path = os.path.join(data_folder, "newstest2014.en")
        self.de_path = os.path.join(data_folder, "newstest2014.de")
        self.eng_to_ger = eng_to_ger
        self.vocab_size = vocab_size
        self.MASK = MASK
        self.START = START
        self.STOP = STOP
        self.max_context = max_context
        self.en_tok_path = os.path.join(self.data_folder, "en_tokenizer")
        self.de_tok_path = os.path.join(self.data_folder, "de_tokenizer")
        self.en_arr_path = os.path.join(self.data_folder, "en_bcolz")
        self.de_arr_path = os.path.join(self.data_folder, "de_bcolz")
        self.en_lens_path = os.path.join(self.data_folder, "en_bcolz_lens")
        self.de_lens_path = os.path.join(self.data_folder, "de_bcolz_lens")

        # Train tokenizers
        if rank == 0: print("Tokenizing english..")
        self.en_tokenizer = CharBPETokenizer()
        if os.path.exists(self.en_tok_path):  # Load trained tokenizer
            if rank == 0:
                print("loading from pretrained tokenizer", self.en_tok_path)
            self.en_tokenizer = ml_utils.datas.load_tokenizer(
                self.en_tokenizer, self.en_tok_path)
        else:
            self.en_tokenizer.train([self.en_path], vocab_size=vocab_size)
            os.mkdir(self.en_tok_path)
            self.en_tokenizer.save_model(self.en_tok_path)
        self.en_tokenizer.add_special_tokens([self.MASK])
        self.en_tokenizer.add_tokens([self.START])
        self.en_tokenizer.add_tokens([self.STOP])
        self.en_mask_idx = self.en_tokenizer.token_to_id(self.MASK)
        self.en_start_idx = self.en_tokenizer.token_to_id(self.START)
        self.en_stop_idx = self.en_tokenizer.token_to_id(self.STOP)

        if rank == 0: print("Tokenizing german..")
        self.de_tokenizer = CharBPETokenizer()
        if os.path.exists(self.de_tok_path):  # Load trained tokenizer
            if rank == 0:
                print("loading from pretrained tokenizer", self.de_tok_path)
            self.de_tokenizer = ml_utils.datas.load_tokenizer(
                self.de_tokenizer, self.de_tok_path)
        else:
            self.de_tokenizer.train([self.de_path], vocab_size=vocab_size)
            os.mkdir(self.de_tok_path)
            self.de_tokenizer.save_model(self.de_tok_path)
        self.de_tokenizer.add_special_tokens([self.MASK])
        self.de_tokenizer.add_tokens([self.START])
        self.de_tokenizer.add_tokens([self.STOP])
        self.de_mask_idx = self.de_tokenizer.token_to_id(self.MASK)
        self.de_start_idx = self.de_tokenizer.token_to_id(self.START)
        self.de_stop_idx = self.de_tokenizer.token_to_id(self.STOP)

        # Get English sentence lists
        if rank == 0: print("Making english idxs")
        self.en_max_len = 0
        self.en_idxs = []
        self.en_lens = []
        with open(self.en_path, 'r') as f:
            for i, l in tqdm(enumerate(f.readlines())):
                l = l.strip()
                if len(l) > 0:
                    output = self.en_tokenizer.encode(l)
                    ids = [self.en_start_idx]+list(output.ids)\
                                             +[self.en_stop_idx]
                    self.en_idxs.append(ids)
                    self.en_lens.append(len(ids))
                    if len(ids) > self.en_max_len:
                        self.en_max_len = len(ids)
                if exp_name == "test" and i > 100:
                    break
        mask = [self.en_mask_idx for i in range(self.en_max_len)]
        l = 0
        if rank == 0: print("Padding english idxs")
        for i in tqdm(range(len(self.en_idxs))):
            diff = self.en_max_len - len(self.en_idxs[i])
            self.en_idxs[i] = self.en_idxs[i] + mask[:diff]

        # Get German Sentence Lists
        if rank == 0: print("Making german idxs")
        self.de_max_len = 0
        self.de_idxs = []
        self.de_lens = []
        with open(self.de_path, 'r') as f:
            for i, l in tqdm(enumerate(f.readlines())):
                l = l.strip()
                if len(l) > 0:
                    output = self.de_tokenizer.encode(l)
                    ids = [self.de_start_idx]+list(output.ids)\
                                             +[self.de_stop_idx]
                    self.de_idxs.append(ids)
                    self.de_lens.append(len(ids))
                    if len(ids) > self.de_max_len:
                        self.de_max_len = len(ids)
                if exp_name == "test" and i > 100:
                    break
        mask = [self.de_mask_idx for i in range(self.de_max_len)]
        if rank == 0: print("Padding german idxs")
        for i in tqdm(range(len(self.de_idxs))):
            diff = self.de_max_len - len(self.de_idxs[i])
            self.de_idxs[i] = self.de_idxs[i] + mask[:diff]

        if rank == 0: print("Converting to numpy arrays")
        if self.eng_to_ger:
            self.X = np.asarray(self.en_idxs)
            self.X_lens = np.asarray(self.en_lens)
            self.X_tokenizer = self.en_tokenizer
            self.X_mask_idx = self.en_mask_idx
            self.X_start_idx = self.en_start_idx
            self.X_stop_idx = self.en_stop_idx
            self.X_max_len = self.en_max_len

            self.Y = np.asarray(self.de_idxs)
            self.Y_lens = np.asarray(self.de_lens)
            self.Y_tokenizer = self.de_tokenizer
            self.Y_mask_idx = self.de_mask_idx
            self.Y_start_idx = self.de_start_idx
            self.Y_stop_idx = self.de_stop_idx
            self.Y_max_len = self.de_max_len
        else:
            self.X = np.asarray(self.de_idxs)
            self.X_lens = np.asarray(self.de_lens)
            self.X_tokenizer = self.de_tokenizer
            self.X_mask_idx = self.de_mask_idx
            self.X_start_idx = self.de_start_idx
            self.X_stop_idx = self.de_stop_idx
            self.X_max_len = self.de_max_len

            self.Y = np.asarray(self.en_idxs)
            self.Y_lens = np.asarray(self.en_lens)
            self.Y_tokenizer = self.en_tokenizer
            self.Y_mask_idx = self.en_mask_idx
            self.Y_start_idx = self.en_start_idx
            self.Y_stop_idx = self.en_stop_idx
            self.Y_max_len = self.en_max_len

    def __len__(self):
        return len(self.en_idxs)

    #def __getitem__(self,i,l=None):
    #    if l is None:
    #        l = self.X_lens[int(i)]
    #    idxs = np.zeros(1)
    #    margin = 5
    #    while idxs.sum()<25 and margin < 400:
    #        min_l = l-margin
    #        max_l = l+margin
    #        idxs = (self.X_lens>min_l)&(self.X_lens<max_l)
    #        margin += 5
    #    max_l = min(np.max(self.X_lens[idxs]),self.max_context)
    #    if max_l <   50 : batch_size = self.batch_size
    #    elif max_l < 70: batch_size = self.batch_size//2
    #    elif max_l < 100: batch_size = self.batch_size//4
    #    elif max_l < 120: batch_size = self.batch_size//8
    #    elif max_l < 140: batch_size = self.batch_size//16
    #    elif max_l < 160: batch_size = self.batch_size//32
    #    else: batch_size = self.batch_size//64
    #    batch_size = max(16,batch_size)
    #    perm = np.random.permutation(idxs.sum())[:batch_size]
    #    max_l = np.max(self.X_lens[idxs][perm])
    #    x = np.asarray(self.X[idxs][perm,:max_l])
    #    max_l = np.max(self.Y_lens[idxs][perm])
    #    y = np.asarray(self.Y[idxs][perm,:max_l])
    #    return torch.LongTensor(x), torch.LongTensor(y)

    def __getitem__(self, idx):
        return torch.LongTensor(self.X[idx]), torch.LongTensor(self.Y[idx])

    def get_largest_batch(self, size_num):
        l = 10
        if size_num == 1:
            l = 25
        elif size_num == 2:
            l = 400
        elif size_num == 3:
            l = 130
        elif size_num == 4:
            l = 75
        elif size_num == 5:
            l = 44
        elif size_num == 6:
            l = 94
        elif size_num == 7:
            l = 200
        elif size_num == 8:
            l = 300
        return self.__getitem__(0, l)

    def X_idxs2tokens(self, idxs):
        """
        idxs: LongTensor (N,)
            converts an array of tokens to a sentence
        """
        return self.X_tokenizer.decode(idxs)

    def Y_idxs2tokens(self, idxs):
        """
        idxs: LongTensor (N,)
            converts an array of tokens to a sentence
        """
        return self.Y_tokenizer.decode(idxs)
Example no. 13
class EngGerDataset(Dataset):
    """
    Can be English to German or German to English.
    """
    def __init__(self,
                 data_folder,
                 rank=0,
                 val_set=False,
                 world_size=1,
                 seed=0,
                 eng_to_ger=True,
                 vocab_size=37000,
                 MASK="<MASK>",
                 START="<START>",
                 STOP="<STOP>",
                 exp_name="",
                 max_context=None,
                 batch_size=128,
                 val_size=30000,
                 **kwargs):
        """
        rank: int
            the rank in the distributed training
        val_set: bool
            if true, this dataset is created as the validation set
        world_size: int
            the number of processes if using distributed training
        seed: int
            random seed
        data_folder: str
            the path to the folder that should contain a `train.en` and
            a `train.de` file.
        eng_to_ger: bool
            if true, the x values are returned as English ids and the
            y values are German ids. If false, then vice versa
        vocab_size: int
            the number of encodings for the byte-pair encoding scheme
        MASK: str
            the mask token
        START: str
            the start token
        STOP: str
            the stop token
        exp_name: str
            name of the experiment
        max_context: int
            the maximum sequence length
        val_size: int
            the number of samples to be set aside for validation
        """
        self.rank = rank
        print("rank:", self.rank)
        self.world_size = world_size
        self.val_set = val_set
        self.val_size = val_size
        self.batch_size = batch_size
        self.data_folder = os.path.expanduser(data_folder)
        self.en_path = os.path.join(data_folder, "train.en")
        self.de_path = os.path.join(data_folder, "train.de")
        self.eng_to_ger = eng_to_ger
        self.vocab_size = vocab_size
        self.MASK = MASK
        self.START = START
        self.STOP = STOP
        self.max_context = max_context
        self.en_tok_path = os.path.join(self.data_folder, "en_tokenizer")
        self.de_tok_path = os.path.join(self.data_folder, "de_tokenizer")
        self.en_arr_path = os.path.join(self.data_folder, "en_bcolz")
        self.de_arr_path = os.path.join(self.data_folder, "de_bcolz")
        self.en_lens_path = os.path.join(self.data_folder, "en_bcolz_lens")
        self.de_lens_path = os.path.join(self.data_folder, "de_bcolz_lens")

        # Train tokenizers
        if rank == 0: print("Tokenizing english..")
        self.en_tokenizer = CharBPETokenizer()
        if os.path.exists(self.en_tok_path):  # Load trained tokenizer
            if rank == 0:
                print("loading from pretrained tokenizer", self.en_tok_path)
            self.en_tokenizer = ml_utils.datas.load_tokenizer(
                self.en_tokenizer, self.en_tok_path)
        else:
            self.en_tokenizer.train([self.en_path], vocab_size=vocab_size)
            os.mkdir(self.en_tok_path)
            self.en_tokenizer.save_model(self.en_tok_path)
        self.en_tokenizer.add_special_tokens([self.MASK])
        self.en_tokenizer.add_tokens([self.START])
        self.en_tokenizer.add_tokens([self.STOP])
        self.en_mask_idx = self.en_tokenizer.token_to_id(self.MASK)
        self.en_start_idx = self.en_tokenizer.token_to_id(self.START)
        self.en_stop_idx = self.en_tokenizer.token_to_id(self.STOP)

        if rank == 0: print("Tokenizing german..")
        self.de_tokenizer = CharBPETokenizer()
        if os.path.exists(self.de_tok_path):  # Load trained tokenizer
            if rank == 0:
                print("loading from pretrained tokenizer", self.de_tok_path)
            self.de_tokenizer = ml_utils.datas.load_tokenizer(
                self.de_tokenizer, self.de_tok_path)
        else:
            self.de_tokenizer.train([self.de_path], vocab_size=vocab_size)
            os.mkdir(self.de_tok_path)
            self.de_tokenizer.save_model(self.de_tok_path)
        self.de_tokenizer.add_special_tokens([self.MASK])
        self.de_tokenizer.add_tokens([self.START])
        self.de_tokenizer.add_tokens([self.STOP])
        self.de_mask_idx = self.de_tokenizer.token_to_id(self.MASK)
        self.de_start_idx = self.de_tokenizer.token_to_id(self.START)
        self.de_stop_idx = self.de_tokenizer.token_to_id(self.STOP)

        # Get English sentence lists
        if rank == 0: print("Making english idxs")
        if os.path.exists(self.en_arr_path):
            if rank == 0: print("loading from bcolz", self.en_arr_path)
            self.en_idxs = bcolz.carray(rootdir=self.en_arr_path)
            self.en_lens = bcolz.carray(rootdir=self.en_lens_path)
            self.en_max_len = self.en_idxs.shape[-1]
            if exp_name == "test":
                self.val_size = 250
                self.en_idxs = self.en_idxs[:1000]
                self.en_lens = self.en_lens[:1000]
            if self.world_size > 1:
                with temp_seed(seed - rank):
                    sample_perm = np.random.permutation(len(self.en_idxs))
                if not self.val_set:
                    n_samps = (len(self.en_idxs) - self.val_size)
                    n_samps = n_samps // self.world_size
                    indices = sample_perm[rank * n_samps:(rank + 1) * n_samps]
                else:
                    indices = sample_perm[-self.val_size:]
                try:
                    if rank == 0:
                        print("splitting dataset.. ", end="")
                        starttime = time.time()
                    self.en_idxs = self.en_idxs[indices]
                    self.en_lens = self.en_lens[indices]
                    if rank == 0: print("duration:", time.time() - starttime)
                except:
                    temp_idxs = []
                    temp_lens = []
                    if rank == 0:
                        print("Collecting data")
                        rnge = tqdm(indices)
                    else:
                        rnge = indices
                    for i in rnge:
                        temp_idxs.append(self.en_idxs[i])
                        temp_lens.append(self.en_lens[i])
                    self.en_idxs = np.asarray(temp_idxs)
                    self.en_lens = np.asarray(temp_lens)
                    if rank == 0: print("duration:", time.time() - starttime)
        elif world_size == 1:
            self.en_max_len = 0
            self.en_idxs = []
            self.en_lens = []
            with open(self.en_path, 'r') as f:
                for i, l in tqdm(enumerate(f.readlines())):
                    l = l.strip()
                    if len(l) > 0:
                        output = self.en_tokenizer.encode(l)
                        ids = [self.en_start_idx]+list(output.ids)\
                                                 +[self.en_stop_idx]
                        self.en_idxs.append(ids)
                        self.en_lens.append(len(ids))
                        if len(ids) > self.en_max_len:
                            self.en_max_len = len(ids)
                    if exp_name == "test" and i > 100:
                        break
            mask = [self.en_mask_idx for i in range(self.en_max_len)]
            l = 0
            if rank == 0: print("Padding english idxs")
            for i in tqdm(range(len(self.en_idxs))):
                diff = self.en_max_len - len(self.en_idxs[i])
                self.en_idxs[i] = self.en_idxs[i] + mask[:diff]
            if rank == 0: print("Saving to bcolz")
            self.en_idxs = bcolz.carray(self.en_idxs,
                                        rootdir=self.en_arr_path,
                                        dtype="int32")
            self.en_idxs.flush()
            self.en_lens = bcolz.carray(self.en_lens,
                                        rootdir=self.en_lens_path,
                                        dtype="int32")
            self.en_lens.flush()
        else:
            print("Make dataset without using multi-processing!!")
            assert False
        if self.en_max_len > max_context:
            if rank == 0:
                print("Truncating context from", self.en_max_len, "to",
                      self.max_context)
            self.en_max_len = self.max_context

        # Get German Sentence Lists
        if rank == 0: print("Making german idxs")
        if os.path.exists(self.de_arr_path):
            if rank == 0: print("loading from bcolz", self.de_arr_path)
            self.de_idxs = bcolz.carray(rootdir=self.de_arr_path)
            self.de_lens = bcolz.carray(rootdir=self.de_lens_path)
            self.de_max_len = self.de_idxs.shape[-1]
            if exp_name == "test":
                self.val_size = 250
                self.en_idxs = self.en_idxs[:1000]
                self.en_lens = self.en_lens[:1000]
            if self.world_size > 1:
                try:
                    if rank == 0:
                        print("splitting dataset.. ", end="")
                        starttime = time.time()
                    self.de_idxs = self.de_idxs[indices]
                    self.de_lens = self.de_lens[indices]
                    if rank == 0: print("duration:", time.time() - starttime)
                except:
                    temp_idxs = []
                    temp_lens = []
                    try:
                        if rank == 0: print("Collecting data")
                        for i in rnge:
                            temp_idxs.append(self.de_idxs[i])
                            temp_lens.append(self.de_lens[i])
                    except Exception as e:
                        print("Likely error caused by bcolz existing "+\
                                               "for en but not de data")
                        print(e)
                        assert False
                    self.de_idxs = np.asarray(temp_idxs)
                    self.de_lens = np.asarray(temp_lens)
                    if rank == 0: print("duration:", time.time() - starttime)
        else:
            self.de_max_len = 0
            self.de_idxs = []
            self.de_lens = []
            with open(self.de_path, 'r') as f:
                for i, l in tqdm(enumerate(f.readlines())):
                    l = l.strip()
                    if len(l) > 0:
                        output = self.de_tokenizer.encode(l)
                        ids = [self.de_start_idx]+list(output.ids)\
                                                 +[self.de_stop_idx]
                        self.de_idxs.append(ids)
                        self.de_lens.append(len(ids))
                        if len(ids) > self.de_max_len:
                            self.de_max_len = len(ids)
                    if exp_name == "test" and i > 100:
                        break
            mask = [self.de_mask_idx for i in range(self.de_max_len)]
            if rank == 0: print("Padding german idxs")
            for i in tqdm(range(len(self.de_idxs))):
                diff = self.de_max_len - len(self.de_idxs[i])
                self.de_idxs[i] = self.de_idxs[i] + mask[:diff]
            if rank == 0: print("Saving to bcolz")
            self.de_idxs = bcolz.carray(self.de_idxs,
                                        rootdir=self.de_arr_path,
                                        dtype="int32")
            self.de_idxs.flush()
            self.de_lens = bcolz.carray(self.de_lens,
                                        rootdir=self.de_lens_path,
                                        dtype="int32")
            self.de_lens.flush()
        if self.de_max_len > max_context:
            if rank == 0:
                print("Truncating context from", self.de_max_len, "to",
                      self.max_context)
            self.de_max_len = self.max_context

        if rank == 0: print("Converting to numpy arrays")
        if self.eng_to_ger:
            self.X = np.asarray(self.en_idxs)
            self.X_lens = np.asarray(self.en_lens)
            self.X_tokenizer = self.en_tokenizer
            self.X_mask_idx = self.en_mask_idx
            self.X_start_idx = self.en_start_idx
            self.X_stop_idx = self.en_stop_idx
            self.X_max_len = self.en_max_len

            self.Y = np.asarray(self.de_idxs)
            self.Y_lens = np.asarray(self.de_lens)
            self.Y_tokenizer = self.de_tokenizer
            self.Y_mask_idx = self.de_mask_idx
            self.Y_start_idx = self.de_start_idx
            self.Y_stop_idx = self.de_stop_idx
            self.Y_max_len = self.de_max_len
        else:
            self.X = np.asarray(self.de_idxs)
            self.X_lens = np.asarray(self.de_lens)
            self.X_tokenizer = self.de_tokenizer
            self.X_mask_idx = self.de_mask_idx
            self.X_start_idx = self.de_start_idx
            self.X_stop_idx = self.de_stop_idx
            self.X_max_len = self.de_max_len

            self.Y = np.asarray(self.en_idxs)
            self.Y_lens = np.asarray(self.en_lens)
            self.Y_tokenizer = self.en_tokenizer
            self.Y_mask_idx = self.en_mask_idx
            self.Y_start_idx = self.en_start_idx
            self.Y_stop_idx = self.en_stop_idx
            self.Y_max_len = self.en_max_len

    def __len__(self):
        return len(self.en_idxs)

    def __getitem__(self, i, l=None):
        if l is None:
            l = self.X_lens[int(i)]
        idxs = np.zeros(1)
        margin = 5
        while idxs.sum() < 25 and margin < 400:
            min_l = l - margin
            max_l = l + margin
            idxs = (self.X_lens > min_l) & (self.X_lens < max_l)
            margin += 5
        max_l = min(np.max(self.X_lens[idxs]), self.max_context)
        if max_l < 50: batch_size = self.batch_size
        elif max_l < 70: batch_size = self.batch_size // 2
        elif max_l < 100: batch_size = self.batch_size // 4
        elif max_l < 120: batch_size = self.batch_size // 8
        elif max_l < 140: batch_size = self.batch_size // 16
        elif max_l < 160: batch_size = self.batch_size // 32
        else: batch_size = self.batch_size // 64
        batch_size = max(16, batch_size)
        perm = np.random.permutation(idxs.sum())[:batch_size]
        max_l = np.max(self.X_lens[idxs][perm])
        x = np.asarray(self.X[idxs][perm, :max_l])
        max_l = np.max(self.Y_lens[idxs][perm])
        y = np.asarray(self.Y[idxs][perm, :max_l])
        return torch.LongTensor(x), torch.LongTensor(y)

    def get_largest_batch(self, size_num):
        l = 10
        if size_num == 1:
            l = 25
        elif size_num == 2:
            l = 400
        elif size_num == 3:
            l = 130
        elif size_num == 4:
            l = 75
        elif size_num == 5:
            l = 44
        elif size_num == 6:
            l = 94
        elif size_num == 7:
            l = 200
        elif size_num == 8:
            l = 300
        return self.__getitem__(0, l)

    def X_idxs2tokens(self, idxs):
        """
        idxs: LongTensor (N,)
            converts an array of tokens to a sentence
        """
        return self.X_tokenizer.decode(idxs)

    def Y_idxs2tokens(self, idxs):
        """
        idxs: LongTensor (N,)
            converts an array of tokens to a sentence
        """
        return self.Y_tokenizer.decode(idxs)
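
An illustrative construction of the dataset class above, not from the original source; the data folder is a placeholder expected to contain `train.en` and `train.de`, and bcolz plus the `ml_utils` helpers must be importable.

dataset = EngGerDataset(data_folder="wmt14_en_de",
                        eng_to_ger=True,
                        vocab_size=37000,
                        max_context=512,
                        exp_name="test")      # "test" keeps only ~100 lines per language
x, y = dataset[0]                             # a length-bucketed batch of id tensors
print(x.shape, y.shape)
print(dataset.X_idxs2tokens(x[0].tolist()))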
Example no. 14
print(tokenizer_code.get_vocab())
print(tokenizer_doc.get_vocab())

# Use the trained tokenizer_code to encode each line and write it to the output file

file_dir = "data/ncs_preprocessed_data/train-CoDesc/"
src_file_name = "code.original_subtoken"
tgt_file_name = "code.bpe"

output_file = open(file_dir + "/" + tgt_file_name, "w")
output_file.close()
output_file = open(file_dir + "/" + tgt_file_name, "a")
with open(file_dir + "/" + src_file_name, 'r') as file:
    for line in file:
        output = tokenizer_code.encode(line)
        line = ' '.join(output.tokens).replace("</w>", "")
        output_file.write(line)
        output_file.write("\n")
output_file.close()

# Use the trained tokenizer_doc to encode each line and write it to the output file

file_dir = "data/ncs_preprocessed_data/train-CoDesc/"
src_file_name = "javadoc.original"
tgt_file_name = "javadoc.bpe"

output_file = open(file_dir + "/" + tgt_file_name, "w")
output_file.close()
output_file = open(file_dir + "/" + tgt_file_name, "a")
with open(file_dir + "/" + src_file_name, 'r') as file:
Example no. 15
def torchtext_iterators(args):
    """
    Builds torchtext iterators from the files.
    """
    logger = logging.getLogger('logger')
    logger.info('Starting to load data and create iterators.')

    # Tokenizer.
    if args['model_name'] == 'roberta':
        tokenizer = lambda x: [x]
    elif args['subword']:
        tokenizer = 'subword'
    elif args['bpe']:
        bpe_tokenizer = CharBPETokenizer('log/bpe-trained-vocab.json',
                                         'log/bpe-trained-merges.txt')
        tokenizer = lambda x: bpe_tokenizer.encode(x).tokens
    else:
        tokenizer = None

    # `sequential` does not tokenize the label.
    label = data.Field(batch_first=True, sequential=False)
    text = data.Field(batch_first=True, lower=True, tokenize=tokenizer)

    fields = [('text', text), ('label', label)]
    train = data.TabularDataset(args['train_path'],
                                'tsv',
                                fields,
                                skip_header=True)
    valid = data.TabularDataset(args['valid_path'],
                                'tsv',
                                fields,
                                skip_header=True)
    test = data.TabularDataset(args['test_path'],
                               'tsv', [('text', text)],
                               skip_header=True)

    text.build_vocab(train, min_freq=args['min_freq'])
    label.build_vocab(train)

    train_iter, valid_iter, test_iter = torchtext.data.BucketIterator.splits(
        (train, valid, test),
        batch_size=args['batch_size'],
        repeat=False,
        device=torch.device(args['device']),
        sort=False,
        sort_within_batch=False)

    if not args['no_pretrained_vectors']:
        if not args['load_vectors_manually']:
            logger.info('Starting to load vectors from Glove.')
            text.vocab.load_vectors(vectors=GloVe(name='6B'))
        else:
            logger.info('Starting to manually load vectors from FastText.')
            vector_map, stoi = load_vectors(args['fasttext_path'], text.vocab,
                                            torch.device(args['device']))
            average_embed = get_average_embedding(vector_map)
            text.vocab.set_vectors(stoi,
                                   vector_map,
                                   300,
                                   unk_init=lambda x: average_embed.clone())
            text.vocab.vectors[
                text.vocab.stoi['<unk>']] = average_embed.clone()

    logger.info('Built train vocabulary of {} words'.format(len(text.vocab)))
    return train_iter, valid_iter, test_iter, text, label
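
A hypothetical `args` dictionary for the builder above, added for illustration; all paths and values are placeholders, and with `'bpe': True` the `log/bpe-trained-*` files are assumed to exist.

args = {
    'model_name': 'lstm', 'subword': False, 'bpe': True,
    'train_path': 'data/train.tsv', 'valid_path': 'data/valid.tsv',
    'test_path': 'data/test.tsv', 'min_freq': 2, 'batch_size': 32,
    'device': 'cpu', 'no_pretrained_vectors': True,
    'load_vectors_manually': False, 'fasttext_path': None,
}
train_iter, valid_iter, test_iter, text, label = torchtext_iterators(args)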
Example no. 16
tokenizer = CharBPETokenizer()

# And then train
tokenizer.train(
    files,
    vocab_size=args.vocab_size,
    min_frequency=2,
    show_progress=True,
    special_tokens=['<unk>'],
    suffix='</w>',
    limit_alphabet=args.limit_alphabet,
)

# Save the files
tokenizer.save(args.out, args.name)

# Restoring model from learned vocab/merges
tokenizer = CharBPETokenizer(
    join(args.out, '{}-vocab.json'.format(args.name)),
    join(args.out, '{}-merges.txt'.format(args.name)),
)

# Test encoding
logger.info(
    'Tokens and their ids from CharBPETokenizer with GFP protein sequence: \n MSKGEE LFTGVVPILVELDGDVNGHKFSVSGEGEG DAT'
)
encoded = tokenizer.encode('MSKGEE LFTGVVPILVELDGDVNGHKFSVSGEGEG DAT')
logger.info(encoded.tokens)
logger.info(encoded.ids)
logger.info('done!')
Example no. 17
        special_tokens=[
            "<blank>",
            "<bos>",
            "<unk>",
        ],
    )

    # os.makedirs('./BPE-1000', exist_ok=True)
    tokenizer.save(f'./BPE-1000', '')

    tokenizer = CharBPETokenizer('./BPE-1000/-vocab.json',
                                 './BPE-1000/-merges.txt')
    # with open('.test.pkl', 'w') as f:
    #     pickle.dump(tokenizer, f)

    tokenizer = HuggingFaceTokenizer()
    print(
        tokenizer.encode(
            'might have a solution it might take a long time nobody'))

    print(
        tokenizer.decode(
            tokenizer.encode(
                'might have a solution it might take a long time nobody'), ))

    # transforms = torchaudio.transforms.MFCC(n_mfcc=40)
    # concat = ConcatFeature()
    # waveform = transforms(data)
    # print(waveform.shape)
    # waveform = concat(waveform)
    # print(waveform[:, -1])
Example no. 18
    def test_decoding(self, openai_files):
        tokenizer = CharBPETokenizer(openai_files["vocab"],
                                     openai_files["merges"],
                                     lowercase=True)
        decoded = tokenizer.decode(tokenizer.encode("my name is john").ids)
        assert decoded == "my name is john"
Example no. 19
# coding: utf-8
from tokenizers import CharBPETokenizer

# Initialize a tokenizer
merges = "./saved_tokenizer/wiki_sunyang/merges.txt"
vocab = "./saved_tokenizer/wiki_sunyang/vocab.json"
tokenizer = CharBPETokenizer(vocab, merges)

# And then encode:
encoded = tokenizer.encode(
    "In 2012, Sun became the first Chinese man to win an Olympic gold medal in swimming."
)
print(encoded.ids)
print(encoded.tokens)