def __init__(self, n, token_vocab, *args, **kwargs):
    """Build a subtoken/n-gram vocabulary layered on top of *token_vocab*.

    Args:
      n: presumably the n-gram order for subtoken extraction -- TODO confirm
        against callers; only stored here, never used in this method.
      token_vocab: the token-level vocabulary this vocab decomposes; a
        character-level vocab is additionally derived from it below.
      *args, **kwargs: forwarded to the grandparent class initializer.

    Recognized keyword switches (popped so the superclass never sees them):
      recount (bool): force a fresh count even if a cached file exists.
      initialize_zero (bool): zero-fill the embedding matrix instead of
        drawing from a standard normal.

    Side effects: may read or write ``self.filename`` (load/dump of counts)
    and allocates ``self.embeddings``.
    """
    recount = kwargs.pop('recount', False)
    initialize_zero = kwargs.pop('initialize_zero', False)
    # NOTE(review): super(TokenVocab, self) deliberately(?) skips TokenVocab
    # in the MRO and calls the grandparent's __init__ -- confirm this matches
    # the enclosing class hierarchy (the class header is not visible here).
    super(TokenVocab, self).__init__(*args, **kwargs)
    self._n = n
    self._token_vocab = token_vocab
    # Raw token-frequency counts; filled by count() / load() below.
    self._token_counts = Counter()
    # Character-level vocab derived from the wrapped token vocab.
    self._subtoken_vocab = CharVocab.from_vocab(self.token_vocab)
    self._multibucket = Multibucket.from_configurable(self, embed_model=self.embed_model, name=self.name)
    if recount:
        self.count()
    else:
        # Reuse the cached count file when present; otherwise count from
        # scratch and cache the result for the next run.
        if os.path.isfile(self.filename):
            self.load()
        else:
            self.count()
            self.dump()
    self.index_vocab()
    # Embedding matrix: one row per vocabulary entry.
    embed_dims = [len(self), self.embed_size]
    if initialize_zero:
        self.embeddings = np.zeros(embed_dims)
    else:
        self.embeddings = np.random.randn(*embed_dims)
    return
# NOTE(review): the leading `elif` below is the tail of a __setattr__ override
# whose opening lines fall outside this chunk; reproduced as-is, not rewritten.
    elif self.cased != value.cased:
        # Casing mismatch between this vocab and the assigned vocab: rebuild
        # the value with this vocab's casing (forcing a recount) so the two
        # stay consistent.
        cls = value.__class__
        value = cls.from_configurable(value, cased=self.cased, recount=True)
    super(SubtokenVocab, self).__setattr__(name, value)
    return

#***************************************************************
class CharVocab(SubtokenVocab):
    # Character-level vocabulary: all behavior inherited from SubtokenVocab.
    pass

#***************************************************************
if __name__ == '__main__':
    """ """
    # Smoke test: build a CharVocab from a WordVocab and exercise both the
    # build-and-dump path and the load-from-cache path.
    from nparser import Configurable
    from nparser.vocabs import WordVocab, CharVocab
    configurable = Configurable()
    token_vocab = WordVocab.from_configurable(configurable, 1)
    token_vocab.fit_to_zipf()
    # Remove any cached char-vocab file so the first from_vocab call below
    # rebuilds and dumps it...
    if os.path.isfile('saves/defaults/chars.txt'):
        os.remove('saves/defaults/chars.txt')
    subtoken_vocab = CharVocab.from_vocab(token_vocab)
    # ...and this second, deliberately repeated call exercises loading the
    # vocab back from the file just written.
    subtoken_vocab = CharVocab.from_vocab(token_vocab)
    subtoken_vocab.token_vocab.count(configurable.valid_files)
    subtoken_vocab.index_tokens()
    subtoken_vocab.fit_to_zipf()
    print('SubtokenVocab passes', file=sys.stderr)