Exemplo n.º 1
0
 def data(self, size, model):
     """Yield one ``(xs, ys)`` training matrix pair per usable sentence.

     Documents are pulled from ``docstream()`` (an iterable of document
     lists) and split into sentences with ``self.get_sents``. Tokens found
     in ``self.stopwords`` are dropped, and sentences left with fewer than
     4 tokens are skipped.

     Both yielded arrays have shape ``(len(self.vocab), len(sentence))``;
     column ``i`` corresponds to the token at position ``i``.

     Args:
         size: Maximum number of documents to consume before stopping.
         model: ``'cbow'`` puts ``self.get_binvec(context)`` in ``xs`` and
             ``self.get_onehot(token)`` in ``ys``; ``'skipgram'`` swaps
             the two. Any other value yields nothing.

     Yields:
         Tuples ``(xs, ys)`` of NumPy arrays, one per retained sentence.
     """
     counter = 0
     for doclist in docstream():
         for doc in doclist:
             if counter >= size:
                 # End the generator with a plain return: since PEP 479
                 # (Python 3.7+), raising StopIteration inside a generator
                 # body is converted into RuntimeError.
                 return
             counter += 1
             for sen in self.get_sents(doc):
                 sen = [w for w in sen if w not in self.stopwords]
                 if len(sen) < 4:
                     continue
                 xs = np.zeros((len(self.vocab), len(sen)))
                 ys = np.zeros((len(self.vocab), len(sen)))
                 if model == 'cbow':
                     # Context vector is the input, centre word the target.
                     for i, word in enumerate(sen):
                         context = self.get_context(i, sen)
                         xs[:, i] = self.get_binvec(context)
                         ys[:, i] = self.get_onehot(word)
                     yield xs, ys
                 elif model == 'skipgram':
                     # Centre word is the input, context vector the target.
                     for i, word in enumerate(sen):
                         context = self.get_context(i, sen)
                         xs[:, i] = self.get_onehot(word)
                         ys[:, i] = self.get_binvec(context)
                     yield xs, ys
Exemplo n.º 2
0
 def ns_data(self, size, model):
     """Yield one ``(x, y)`` training pair per token position.

     Documents are pulled from ``docstream()`` (an iterable of document
     lists) and split into sentences with ``self.get_sents``. Tokens found
     in ``self.stopwords`` are dropped, and sentences left with fewer than
     4 tokens are skipped. Unlike a per-sentence matrix, each position of
     a retained sentence produces its own pair.

     Args:
         size: Maximum number of documents to consume before stopping.
         model: ``'cbow'`` yields ``(self.get_binvec(context),
             self.get_onehot(token))``; ``'skipgram'`` yields the pair in
             the opposite order. Any other value yields nothing.

     Yields:
         Tuples ``(x, y)`` as returned by ``self.get_binvec`` and
         ``self.get_onehot``.
     """
     counter = 0
     for doclist in docstream():
         for doc in doclist:
             if counter >= size:
                 # End the generator with a plain return: since PEP 479
                 # (Python 3.7+), raising StopIteration inside a generator
                 # body is converted into RuntimeError.
                 return
             counter += 1
             for sen in self.get_sents(doc):
                 sen = [w for w in sen if w not in self.stopwords]
                 if len(sen) < 4:
                     continue
                 if model == 'cbow':
                     # Context vector is the input, centre word the target.
                     for i, word in enumerate(sen):
                         context = self.get_context(i, sen)
                         x = self.get_binvec(context)
                         y = self.get_onehot(word)
                         yield x, y
                 elif model == 'skipgram':
                     # Centre word is the input, context vector the target.
                     for i, word in enumerate(sen):
                         context = self.get_context(i, sen)
                         x = self.get_onehot(word)
                         y = self.get_binvec(context)
                         yield x, y