def index_tokens(self):
  """"""
  # Associate each token with an index into the multibucket,
  # e.g. {"apple": [12, 6, 6, 2, 8], ...} -- the index of each char in a word
  self._tok2idx = {}
  tok2idxs = {token: self.subtoken_indices(token)
              for token in self.token_vocab.counts}
  # Compute a split into n buckets based on the subtoken-sequence lengths
  with Bucketer.from_configurable(self, self.n_buckets,
                                  name='bucketer-%s' % self.name) as bucketer:
    splits = bucketer.compute_splits(len(indices) for indices in tok2idxs.values())
  # Fill the n buckets with the tokens according to their length and record
  # where each token ends up
  with self.multibucket.open(splits):
    for index, special_token in enumerate(self.token_vocab.special_tokens):
      index = index if index != self.token_vocab.UNK else self.META_UNK
      self.tok2idx[special_token] = self.multibucket.add([index])
    for token, _ in self.sorted_counts(self.token_vocab.counts):
      self.tok2idx[token] = self.multibucket.add(tok2idxs[token])
  self._idx2tok = {idx: tok for tok, idx in self.tok2idx.iteritems()}
  self._idx2tok[0] = self[self.PAD]
  return
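# --- Illustrative sketch (not part of the original code) ---------------------
# A minimal, self-contained toy version of the length-bucketing idea used
# above. Bucketer/Multibucket internals are not reproduced here; the helper
# and variable names below are assumptions made for illustration only.
# Words whose subtoken sequences fit under the same length threshold share a
# bucket and are padded to that threshold, so each bucket is one dense matrix.
def toy_bucket(tok2idxs, splits, pad=0):
  buckets = [[] for _ in splits]
  tok2loc = {}
  for token, idxs in tok2idxs.items():
    bkt = next(b for b, cap in enumerate(splits) if len(idxs) <= cap)
    tok2loc[token] = (bkt, len(buckets[bkt]))
    buckets[bkt].append(idxs + [pad] * (splits[bkt] - len(idxs)))
  return buckets, tok2loc

toy_tok2idxs = {'cat': [3, 1, 20], 'apple': [1, 16, 16, 12, 5], 'a': [1]}
toy_buckets, toy_tok2loc = toy_bucket(toy_tok2idxs, splits=[1, 3, 5])
print(toy_tok2loc['apple'])   # (2, 0): bucket 2, row 0
print(toy_buckets[1])         # [[3, 1, 20]]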
def index_tokens(self):
  """"""
  n_buckets = self.n_buckets
  # For each token, collect one subtoken-index sequence per sub-vocabulary
  tok2idxs = {token: [vocab.subtoken_indices(token) for vocab in self]
              for token in self.token_vocab.counts}
  with Bucketer.from_configurable(self, self.n_buckets,
                                  name='bucketer-%s' % self.name) as bucketer:
    splits = bucketer.compute_splits(len(indices[0]) for indices in tok2idxs.values())
    bucketer.plot()
  with self.multibucket.open(splits, depth=len(self)):
    # Special tokens get the same index in every sub-vocabulary
    for index, special_token in enumerate(self.special_tokens):
      self.tok2idx[special_token] = self.multibucket.add([[index] * len(self)])
    for token, _ in self.sorted_counts(self.token_vocab.counts):
      indices = tok2idxs[token]
      # Transpose from [vocab][position] to [position][vocab], dropping
      # positions that a shorter sub-vocabulary sequence does not cover
      sequence = [[indices[i][j]
                   for i in xrange(len(indices)) if j < len(indices[i])]
                  for j in xrange(len(indices[0]))]
      self.tok2idx[token] = self.multibucket.add(sequence)
  return
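# --- Illustrative sketch (not part of the original code) ---------------------
# A stand-alone worked example of the transpose performed above, with invented
# values. indices[i] is the subtoken-index sequence from sub-vocabulary i;
# sequence[j] gathers, for position j, one index per vocabulary long enough to
# reach that position.
indices_demo = [[7, 2, 9],   # sub-vocabulary 0
                [4, 8]]      # sub-vocabulary 1 (shorter sequence)
sequence_demo = [[indices_demo[i][j]
                  for i in range(len(indices_demo)) if j < len(indices_demo[i])]
                 for j in range(len(indices_demo[0]))]
print(sequence_demo)  # [[7, 4], [2, 8], [9]]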
def __init__(self, vocabs, *args, **kwargs):
  """"""
  # nlp_model = Parser
  nlp_model = kwargs.pop('nlp_model', None)
  super(Dataset, self).__init__(*args, **kwargs)

  self._vocabs = vocabs
  self._multibuckets = [Multibucket.from_configurable(vocab, name='%s-%s' % (self.name, vocab.name))
                        for vocab in self.vocabs]
  if nlp_model is not None:
    self._nlp_model = nlp_model.from_configurable(self, name=self.name)
  else:
    self._nlp_model = None

  # Compute bucket boundaries from the sentence lengths in the data files;
  # each split is widened by one to leave room for the prepended ROOT token
  with Bucketer.from_configurable(self, self.n_buckets, name='bucketer-%s' % self.name) as bucketer:
    splits = bucketer.compute_splits(len(sent) for sent in self.iterfiles())
    for i in xrange(len(splits)):
      splits[i] += 1
  # Index every sentence once per vocabulary and fill the buckets
  for multibucket, vocab in self.iteritems():
    multibucket.open(splits, depth=vocab.depth)
  for sent in self.iterfiles():
    for multibucket, vocab in self.iteritems():
      tokens = [line[vocab.conll_idx] for line in sent]
      idxs = [vocab.ROOT] + [vocab.index(token) for token in tokens]
      multibucket.add(idxs, tokens)
  for multibucket in self:
    multibucket.close()
  self._multibucket = Multibucket.from_dataset(self)
  return
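# --- Illustrative sketch (not part of the original code) ---------------------
# A toy demonstration of why each split above gets +1: a sentence of length L
# is stored as [ROOT] + L token indices, so a bucket sized for the longest
# sentence it holds needs one extra column. Names and values are invented.
ROOT_IDX = 1
sentences_demo = [['The', 'cat', 'sleeps'], ['Hi']]
splits_demo = [1, 3]                            # from raw sentence lengths
splits_demo = [s + 1 for s in splits_demo]      # room for the ROOT column
vocab_demo = {'The': 5, 'cat': 9, 'sleeps': 4, 'Hi': 7}
rows_demo = [[ROOT_IDX] + [vocab_demo[tok] for tok in sent] for sent in sentences_demo]
padded_demo = [row + [0] * (max(splits_demo) - len(row)) for row in rows_demo]
print(padded_demo)  # [[1, 5, 9, 4], [1, 7, 0, 0]]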