예제 #1
0
    def weighted_vectorize(self, text):
        """Build one weighted-sum vector per sentence of ``text``.

        Every word of a sentence contributes a vector: ``self.model[word]``
        when the word is in ``self.model.wv.vocab``, otherwise a zero list of
        length ``self.vec_length``.  The raw weight of an in-vocabulary word
        is its entry in the result of ``self.tr.textrank(text)`` when it has
        one; in every other case the weight is ``1 / len(sen_words)``.  Raw
        weights are divided by their per-sentence sum before the word vectors
        are scaled and added together.

        Args:
            text: Input handed to ``tools.seperate_sentences`` and
                ``self.tr.textrank``.

        Returns:
            A list with one entry per sentence, each being the result of
            ``tools.vector_add_multi`` over that sentence's scaled vectors.
        """
        res = []
        sentences = tools.seperate_sentences(text)
        tr_text = self.tr.textrank(text)
        for sen in sentences:
            sen_words = tools.seperate(sen)
            vectors = []
            weights = []
            for w in sen_words:
                if w in self.model.wv.vocab:
                    vectors.append(self.model[w])
                    if w in tr_text:
                        weights.append(tr_text[w])
                    else:
                        # No entry from textrank: fall back to an equal share.
                        weights.append(1 / len(sen_words))
                else:
                    # Out-of-vocabulary word: zero vector, equal-share weight.
                    vectors.append([0] * self.vec_length)
                    weights.append(1 / len(sen_words))

            # Sum the weights once per sentence instead of once per word.
            total_weight = sum(weights)
            scaled = [
                tools.vector_multi(vec, weight / total_weight)
                for vec, weight in zip(vectors, weights)
            ]

            sen_vec = tools.vector_add_multi(scaled)
            # len() rather than truthiness: the vector may be an array type
            # whose truth value is ambiguous.
            if len(sen_vec) == 0:
                print(sen)
            res.append(sen_vec)
        return res
예제 #2
0
 def unweighted_vectorize(self, text):
     """Return one averaged word vector per sentence of ``text``.

     Each word of a sentence contributes ``self.model[word]`` when it is in
     ``self.model.wv.vocab`` and a zero list of length ``self.vec_length``
     otherwise.  The word vectors are added with ``tools.vector_add_multi``
     and the sum is scaled by ``1 / number_of_words``.

     Args:
         text: Input handed to ``tools.seperate_sentences``.

     Returns:
         A list with one entry per sentence.  For a sentence that yields no
         words the entry is whatever ``tools.vector_add_multi([])`` returns,
         left unscaled.
     """
     res = []
     sentences = tools.seperate_sentences(text)
     for line in sentences:
         word_vecs = [
             self.model[word]
             if word in self.model.wv.vocab
             else [0] * self.vec_length
             for word in tools.seperate(line)
         ]
         sen_vec = tools.vector_add_multi(word_vecs)
         # Divide by the number of words, not by the length of the summed
         # vector (its dimension).  Skip scaling for a sentence without
         # words so that no division by zero can occur.
         if word_vecs:
             sen_vec = tools.vector_multi(sen_vec, 1 / len(word_vecs))
         res.append(sen_vec)
     return res