Example #1
def __init__(self, vocabs, *args, **kwargs):
  """Builds the dataset's multibuckets and indexes every sentence."""

  nlp_model = kwargs.pop('nlp_model', None)
  super(Dataset, self).__init__(*args, **kwargs)

  self._vocabs = vocabs
  self._multibuckets = [Multibucket.from_configurable(vocab, name='%s-%s' % (self.name, vocab.name))
                        for vocab in self.vocabs]

  if nlp_model is not None:
    self._nlp_model = nlp_model.from_configurable(self, name=self.name)
  else:
    self._nlp_model = None

  # Compute bucket boundaries from the sentence lengths, then widen each
  # split by one to leave room for the prepended ROOT token.
  with Bucketer.from_configurable(self, self.n_buckets, name='bucketer-%s' % self.name) as bucketer:
    splits = bucketer.compute_splits(len(sent) for sent in self.iterfiles())
    for i in range(len(splits)):
      splits[i] += 1

  # Open one multibucket per vocab, add every sentence's (indices, tokens)
  # pair, then close the buckets and merge them into a single multibucket.
  for multibucket, vocab in self.iteritems():
    multibucket.open(splits, depth=vocab.depth)
  for sent in self.iterfiles():
    for multibucket, vocab in self.iteritems():
      tokens = [line[vocab.conll_idx] for line in sent]
      idxs = [vocab.ROOT] + [vocab.index(token) for token in tokens]
      multibucket.add(idxs, tokens)
  for multibucket in self:
    multibucket.close()
  self._multibucket = Multibucket.from_dataset(self)
  return
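The Bucketer and Multibucket classes above are project-specific, but the underlying technique is generic length bucketing: pick a small set of length boundaries so that sentences of similar length share a padded batch. A minimal sketch of one way compute_splits might choose boundaries, via evenly spaced quantiles (all names here are hypothetical, not the project's API):

import numpy as np

def compute_splits(lengths, n_buckets):
    # Place bucket boundaries at evenly spaced quantiles of the lengths.
    qs = np.linspace(0, 100, n_buckets + 1)[1:]
    return sorted(set(int(np.percentile(lengths, q)) for q in qs))

lengths = [3, 5, 5, 8, 12, 12, 20, 41]
print(compute_splits(lengths, n_buckets=3))  # -> [6, 12, 41]

Each split is then widened by one in the example above so the prepended ROOT index still fits in its bucket.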
Example #2
    def __init__(self, token_vocab, *args, **kwargs):
        """Counts (or reloads) the token vocabulary and initializes embeddings."""

        # Requires `import os`, `import numpy as np`, and
        # `from collections import Counter` at module level.
        recount = kwargs.pop('recount', False)
        initialize_zero = kwargs.pop('initialize_zero', False)
        super(TokenVocab, self).__init__(*args, **kwargs)

        self._token_vocab = token_vocab
        self._token_counts = Counter()
        self._multibucket = Multibucket.from_configurable(
            self, embed_model=self.embed_model, name=self.name)
        self._tok2idx = {}

        # Recount from scratch if asked; otherwise reuse a cached count
        # file when one exists, counting and dumping it when it does not.
        if recount:
            self.count()
        else:
            if os.path.isfile(self.filename):
                self.load()
            else:
                self.count()
                self.dump()
        self.index_vocab()

        # Zero-initialize the embedding matrix, or draw it from a
        # standard normal distribution.
        embed_dims = [len(self), self.embed_size]
        if initialize_zero:
            self.embeddings = np.zeros(embed_dims)
        else:
            self.embeddings = np.random.randn(*embed_dims)
        return
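The tail of this constructor shows a common embedding-initialization choice: all zeros (useful when the learned matrix will be summed with fixed pretrained vectors) versus a standard normal draw. A standalone sketch of just that branch, with hypothetical names:

import numpy as np

def init_embeddings(vocab_size, embed_size, initialize_zero=False):
    # Mirrors the branch above: zeros, or samples from N(0, 1).
    shape = (vocab_size, embed_size)
    return np.zeros(shape) if initialize_zero else np.random.randn(*shape)

emb = init_embeddings(10000, 100)
print(emb.shape)  # (10000, 100)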
Example #3
def __init__(self, token_vocab, *args, **kwargs):
  """Builds one NgramVocab per n-gram order over the shared token vocab."""

  super(BaseVocab, self).__init__(*args, **kwargs)
  self._cased = super(BaseVocab, self).cased

  # Bypass this class's own __setattr__ so the token vocab is stored directly.
  SubtokenVocab.__setattr__(self, '_token_vocab', token_vocab)
  self._multibucket = Multibucket.from_configurable(self, embed_model=self.embed_model, name=self.name)
  self._vocabs = [NgramVocab.from_vocab(self.token_vocab, i+1, cased=self.cased)
                  for i in range(self.max_n)]
  self._special_tokens = super(BaseVocab, self).special_tokens
  self._special_tokens_set = set(self._special_tokens)
  SubtokenVocab._set_special_tokens(self)
  self._tok2idx = {}

  # Every per-order vocab must share the same underlying token vocab.
  for vocab in self:
    assert vocab.token_vocab is self.token_vocab
  return
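NgramVocab here builds one vocabulary per n-gram order, from 1 up to max_n, all over the same token vocab. As a rough illustration of that per-order construction, a minimal sketch using character n-gram counts (all names hypothetical, not the project's API):

from collections import Counter

def ngram_counts(tokens, n):
    # Count the character n-grams of one fixed order n.
    return Counter(tok[j:j+n] for tok in tokens for j in range(len(tok) - n + 1))

max_n = 3
vocabs = [ngram_counts(['cat', 'cats'], i + 1) for i in range(max_n)]
print(vocabs[1])  # bigrams: Counter({'ca': 2, 'at': 2, 'ts': 1})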