def get(self):
    # standardize: extract noun/verb keywords from the theme
    keywords = self.trimmed_stopwords(
        self.tokenize(self.theme, pos='noun_verbs'))
    # search articles about the theme using the top keywords
    articles = self.search_articles(
        [keyword.surface for keyword in keywords][:3])
    # clean
    docs = map(self.clean, articles)
    # divide into sentences
    sentences_cand = map(self.divide, docs)
    sent = []
    for s in sentences_cand:
        sent.append(list(filter(self.is_sentence, s)))
    sentences = list(chain.from_iterable(sent))
    # tfidf format: one whitespace-joined string of noun tokens per sentence
    sentence_tokens = []
    for sentence in sentences:
        noun_tokens = [
            token.surface for token in self.tokenize(sentence, pos='noun')
        ]
        sentence_tokens.append(' '.join(noun_tokens))
    # vectorize
    vector = TfIdf.vector(sentence_tokens)
    # clustering
    cluster = numpy.array(TfIdf.cluster(vector, clusters=3))
    # rank sentences by summed tf-idf score (descending)
    tfidf_score_index = numpy.argsort(
        numpy.array([sum(v) for v in vector.toarray()]))[::-1]
    opinions = []
    for i in range(3):
        # pick the highest-scoring sentence in each cluster
        c_index = numpy.where(cluster == i)
        for k in tfidf_score_index:
            if k in c_index[0]:
                opinions.append(sentences[k])
                break
    theme = namedtuple('Theme', 'keywords, opinions')
    return theme(' '.join([keyword.surface for keyword in keywords][:3]),
                 opinions)
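# TfIdf.vector / TfIdf.cluster are used above but not shown in this excerpt.
# A minimal sketch of what such a helper could look like, assuming
# scikit-learn; the class below is an illustration of the vectorize-then-
# cluster step, not the project's actual implementation.
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer


class TfIdf:

    @staticmethod
    def vector(sentence_tokens):
        # sentence_tokens: one whitespace-joined string of tokens per sentence;
        # the input is already tokenized, so split on whitespace only and
        # return a sparse document-term matrix of tf-idf weights
        return TfidfVectorizer(token_pattern=r'(?u)\S+').fit_transform(
            sentence_tokens)

    @staticmethod
    def cluster(vector, clusters=3):
        # returns one cluster label per sentence row
        return KMeans(n_clusters=clusters, n_init=10,
                      random_state=0).fit_predict(vector)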
def get(self):
    # standardize: extract noun/verb keywords from the opinion
    keywords = self.trimmed_stopwords(
        self.tokenize(self.opinion, pos='noun_verbs'))
    # search articles using the stored keywords plus the top opinion keywords
    articles = self.search_articles(
        self.keywords + [keyword.surface for keyword in keywords][:3])
    # clean
    docs = map(self.clean, articles)
    # divide into sentences
    sentences_cand = map(self.divide, docs)
    sent = []
    for s in sentences_cand:
        sent.append(list(filter(self.is_sentence, s)))
    sentences = list(chain.from_iterable(sent))
    # tfidf format: one whitespace-joined string of noun tokens per sentence
    sentence_tokens = []
    for sentence in sentences:
        noun_tokens = [
            token.surface for token in self.tokenize(sentence, pos='noun')
        ]
        sentence_tokens.append(' '.join(noun_tokens))
    # vectorize
    vector = TfIdf.vector(sentence_tokens)
    # clustering
    cluster = numpy.array(TfIdf.cluster(vector, clusters=3))
    # tf-idf salience per sentence
    tfidf_score = numpy.array([sum(v) for v in vector.toarray()])
    # sentiment polarity per sentence
    senti_score = []
    for sentence in sentences:
        senti_tokens = [
            token.surface for token in self.tokenize(sentence, pos='senti')
        ]
        senti_score.append(self.senti(senti_tokens))
    senti_score = numpy.array(senti_score)
    # ascending order: most negative combined score first, most positive last
    score_index = numpy.argsort(tfidf_score * senti_score)
    positives = []
    negatives = []
    for i in range(3):
        # pick the most negative and most positive sentence in each cluster
        c_index = numpy.where(cluster == i)
        for k in score_index:
            if k in c_index[0]:
                negatives.append(sentences[k])
                break
        for k in score_index[::-1]:
            if k in c_index[0]:
                positives.append(sentences[k])
                break
    opinion = namedtuple('Opinion', 'positives, negatives')
    return opinion(positives, negatives)
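# self.senti is used above but not shown in this excerpt. A minimal sketch of
# one possible scoring function, assuming a word-polarity dictionary held in
# self.polarity_dict with values roughly in [-1.0, 1.0]; both the attribute
# name and the dictionary are assumptions for illustration, not the project's
# actual implementation.
def senti(self, senti_tokens):
    # sum the polarity of every matched token so that sentences with more
    # negative words score low and more positive ones score high; multiplied
    # by tfidf_score above, this pushes salient negative sentences to the
    # front of the argsort and salient positive ones to the back
    return sum(self.polarity_dict.get(token, 0.0) for token in senti_tokens)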