Exemplo n.º 1
0
class VocabBuilder:
    """Lazily build, cache and persist a token vocabulary for a dataset.

    The vocabulary is loaded from ``<save_path>/vocab.pt`` when that file
    exists. Otherwise it is built by tokenizing every dataset item and, when
    a save path was given, written to that file.
    """

    def __init__(self, dataset, tokenizer, save_path):
        """Store the inputs; nothing is loaded or built until first use.

        Args:
            dataset: Collection supporting ``len()`` and integer indexing.
            tokenizer: Callable turning one dataset item into an iterable
                of tokens.
            save_path: Directory that holds ``vocab.pt``, or ``None`` to
                keep the vocabulary in memory only.
        """
        self.dataset = dataset
        self.tokenizer = tokenizer
        # rebuild_vocab() reads self.save_path, so it must be kept here.
        self.save_path = save_path
        if save_path is None:
            # No directory given: there is nothing to load from or save to.
            self.vocab_path = None
        else:
            self.vocab_path = os.path.join(save_path, 'vocab.pt')
        self._vocab = None

    def vocab(self):
        """Return the vocabulary, loading or building it on first call."""
        if self._vocab is None:
            self.build_vocab()
        return self._vocab

    def build_vocab(self):
        """Load the vocabulary from ``vocab_path`` if present, else rebuild."""
        if self.vocab_path is not None and os.path.exists(self.vocab_path):
            self._vocab = Dictionary.load(self.vocab_path)
        else:
            self.rebuild_vocab()

    def rebuild_vocab(self):
        """Build the vocabulary from the dataset, ignoring any saved file.

        Every item of the dataset is tokenized and each token is added to a
        fresh ``Dictionary``. The result is saved to ``vocab_path`` when a
        save path was configured.
        """
        vocab = Dictionary()

        # This class never assigns ``mask_builder``; it is only honoured
        # when a subclass or caller has attached one to the instance.
        mask_builder = getattr(self, 'mask_builder', None)
        if mask_builder is not None:
            vocab.add_symbol(mask_builder.mask_token)

        desc = 'build-vocab: {}'.format(self.save_path)
        pbar = tqdm(range(len(self.dataset)), desc=desc, leave=True)

        for i in pbar:
            for token in self.tokenizer(self.dataset[i]):
                vocab.add_symbol(token)

        # Publish only a complete vocabulary so a failure part-way through
        # does not leave a partial one cached in ``_vocab``.
        self._vocab = vocab

        if self.vocab_path is not None:
            self._vocab.save(self.vocab_path)
Exemplo n.º 2
0
    def build(self,
              filepath=None,
              vocab_path=None,
              threshold=-1,
              max_vocab=-1):
        """Load or construct the dictionary and attach it to this object.

        When ``vocab_path`` is truthy and exists on disk, the dictionary is
        loaded from it. Otherwise every sentence yielded by
        ``sentence_iterator(filepath)`` is lemmatized with ``get_lemma`` and
        its lemmas are added to a new ``Dictionary``, which is then
        finalized and saved as ``dict.txt`` under ``self.data_dir``.

        Args:
            filepath: Passed to ``sentence_iterator`` when building.
            vocab_path: Optional path of a previously saved dictionary.
            threshold: Forwarded to ``Dictionary.finalize`` as ``threshold``.
            max_vocab: Forwarded to ``Dictionary.finalize`` as ``nwords``.

        Sets ``self.vocab`` to the dictionary and ``self.unk`` to the value
        of its ``unk()``.
        """
        has_saved_vocab = vocab_path and os.path.exists(vocab_path)

        if not has_saved_vocab:
            print("building vocab...")
            dictionary = Dictionary()
            for line_no, sentence in enumerate(sentence_iterator(filepath)):
                # Progress report once every thousand sentences.
                if line_no % 1000 == 0:
                    progress = "working on {}kth line".format(line_no // 1000)
                    print(progress, end='\r')
                lemmas = [self.get_lemma(word) for word in sentence]
                for lemma in lemmas:
                    dictionary.add_symbol(lemma)
            dictionary.finalize(threshold=threshold, nwords=max_vocab)
            print('build done. vocab size {}'.format(len(dictionary)))
            save_target = '{}/dict.txt'.format(self.data_dir)
            dictionary.save(save_target)
        else:
            print("loading vocab from {}".format(vocab_path))
            dictionary = Dictionary.load(vocab_path)
            print('vocab size {}'.format(len(dictionary)))

        self.vocab = dictionary
        self.unk = dictionary.unk()