예제 #1
0
파일: theme.py 프로젝트: orleika/secussion
 def get(self, n_clusters=3):
     """Build a Theme summary for ``self.theme``.

     Searches articles with the theme's top keywords, clusters the
     articles' sentences by TF-IDF, and picks the highest-scoring
     sentence of each cluster as a representative opinion.

     Args:
         n_clusters: number of sentence clusters / opinions to return
             (default 3, preserving the original behavior).

     Returns:
         ``namedtuple('Theme', 'keywords, opinions')`` holding the
         space-joined top keyword surfaces and one sentence per cluster.
     """
     # standardize; materialize so the keywords survive two passes
     # (a lazy iterator would be exhausted after the first use)
     keywords = list(self.trimmed_stopwords(
         self.tokenize(self.theme, pos='noun_verbs')))
     # top-3 keyword surfaces, computed once and reused for the result
     surfaces = [keyword.surface for keyword in keywords[:3]]
     # search about theme
     articles = self.search_articles(surfaces)
     # clean each article, split into candidates, keep real sentences
     docs = map(self.clean, articles)
     sentences_cand = map(self.divide, docs)
     sentences = list(chain.from_iterable(
         filter(self.is_sentence, cand) for cand in sentences_cand))
     # tfidf format: one space-joined noun-token string per sentence
     sentence_tokens = [
         ' '.join(token.surface
                  for token in self.tokenize(sentence, pos='noun'))
         for sentence in sentences
     ]
     # vectorize
     vector = TfIdf.vector(sentence_tokens)
     # clustering
     cluster = numpy.array(TfIdf.cluster(vector, clusters=n_clusters))
     # sentence indices ordered by descending TF-IDF weight
     tfidf_score_index = numpy.argsort(
         numpy.array([sum(v) for v in vector.toarray()]))[::-1]
     opinions = []
     for i in range(n_clusters):
         # this cluster's sentence indices as a set for O(1) membership
         c_members = set(numpy.where(cluster == i)[0])
         for k in tfidf_score_index:
             if k in c_members:
                 opinions.append(sentences[k])
                 break
     theme = namedtuple('Theme', 'keywords, opinions')
     return theme(' '.join(surfaces), opinions)
예제 #2
0
    def get(self, n_clusters=3):
        """Build an Opinion summary for ``self.opinion``.

        Searches articles using the stored theme keywords plus the
        opinion's own top keywords, clusters the articles' sentences by
        TF-IDF, then for each cluster takes the sentence with the lowest
        tfidf*sentiment score as a negative and the highest as a
        positive.

        Args:
            n_clusters: number of sentence clusters (default 3,
                preserving the original behavior).

        Returns:
            ``namedtuple('Opinion', 'positives, negatives')`` with one
            positive and one negative sentence per cluster.
        """
        # standardize
        keywords = self.trimmed_stopwords(
            self.tokenize(self.opinion, pos='noun_verbs'))
        # search about opinion: theme keywords + top-3 own keywords
        articles = self.search_articles(
            self.keywords + [keyword.surface for keyword in keywords][:3])
        # clean each article, split into candidates, keep real sentences
        docs = map(self.clean, articles)
        sentences_cand = map(self.divide, docs)
        sentences = list(chain.from_iterable(
            filter(self.is_sentence, cand) for cand in sentences_cand))
        # tfidf format: one space-joined noun-token string per sentence
        sentence_tokens = [
            ' '.join(token.surface
                     for token in self.tokenize(sentence, pos='noun'))
            for sentence in sentences
        ]
        # vectorize
        vector = TfIdf.vector(sentence_tokens)
        # clustering
        cluster = numpy.array(TfIdf.cluster(vector, clusters=n_clusters))
        # per-sentence TF-IDF weight
        tfidf_score = numpy.array([sum(v) for v in vector.toarray()])
        # per-sentence sentiment over sentiment-bearing tokens
        senti_score = numpy.array([
            self.senti([token.surface
                        for token in self.tokenize(sentence, pos='senti')])
            for sentence in sentences
        ])
        # ascending combined score: head = most negative, tail = most positive
        score_index = numpy.argsort(tfidf_score * senti_score)
        positives = []
        negatives = []
        for i in range(n_clusters):
            # this cluster's sentence indices as a set for O(1) membership
            c_members = set(numpy.where(cluster == i)[0])
            for k in score_index:
                if k in c_members:
                    negatives.append(sentences[k])
                    break
            for k in score_index[::-1]:
                if k in c_members:
                    positives.append(sentences[k])
                    break
        opinion = namedtuple('Opinion', 'positives, negatives')
        return opinion(positives, negatives)