Example #1
import json
import re

from elasticsearch import Elasticsearch
from krwordrank.hangle import normalize
from krwordrank.word import KRWordRank

es = Elasticsearch()  # assumption: a default local Elasticsearch client


def wordRank():
    # Retrieve the source document text from Elasticsearch
    results = es.get(index='nkdb',
                     doc_type='nkdb',
                     id='5dc9fc5033ec463330e97e94')
    texts = json.dumps(results['_source'], ensure_ascii=False)

    # split the text by sentences
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', texts)

    # normalize the text
    texts = [normalize(text, number=True) for text in sentences]

    wordrank_extractor = KRWordRank(
        min_count=3,    # minimum frequency of a word
        max_length=10,  # maximum length of a word
        verbose=True)

    beta = 0.85  # Decaying factor beta of PageRank
    max_iter = 10

    keywords, rank, graph = wordrank_extractor.extract(texts, beta, max_iter)

    result = []
    # Build a list of {"y": weight, "label": word} entries for the top 30 keywords
    for word, r in sorted(keywords.items(), key=lambda x: x[1],
                          reverse=True)[:30]:
        result.append({"y": r, "label": word})

    return json.dumps(result, ensure_ascii=False)
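The KR-WordRank calls above can be exercised without Elasticsearch; the sketch below is a minimal sanity check that feeds the extractor a couple of placeholder sentences (not documents from the nkdb index) and reproduces the same payload shape that wordRank() returns.

from krwordrank.hangle import normalize
from krwordrank.word import KRWordRank

# Placeholder sentences; in practice this would be the sentence list built in wordRank().
sample_texts = [normalize(t, number=True) for t in [
    "키워드 추출은 문서에서 중요한 단어를 찾는다.",
    "키워드 추출 알고리즘으로 WordRank 를 사용한다.",
]]

extractor = KRWordRank(min_count=1, max_length=10, verbose=False)
keywords, rank, graph = extractor.extract(sample_texts, 0.85, 10)

# Same payload shape as wordRank(): [{"y": weight, "label": word}, ...]
payload = [{"y": r, "label": w}
           for w, r in sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:30]]
print(payload)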
Example #2
    def sentrank_keyword(self):
        if self.content is not None:
            top_sents = []
            # Seed the candidates with the title and the first sentence of the body
            if self.title:
                top_sents.append(self.title)
            sents = sent_tokenize(self.content)
            if sents:
                top_sents.append(sents[0])
            if top_sents:
                # Add the top-ranked sentences and keep the 16 highest-weighted TF-IDF terms
                top_sents += self.sentrank()
                tfidf_kv = self.tf_idf()[:16]
                top_tfidf = {k: v for k, v in tfidf_kv}
                keywords = {}
                for sent in top_sents:
                    lower = sent.strip().lower()
                    sub = re.sub(r"\d+", " ", lower)
                    tokens = word_tokenize(sub)
                    for token in tokens:
                        lemma = self.lemmatizer.lemmatize(token)
                        # Keep tokens whose lemma is among the top TF-IDF terms
                        if lemma in top_tfidf:
                            keywords[lemma] = top_tfidf[lemma]
                return sorted(keywords.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
            else:
                return []
        return []
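Examples #2, #4 and #5 appear to be methods of one extractor class that supplies self.title, self.content, self.lemmatizer and the sentrank()/tf_idf() helpers. Below is a minimal, hypothetical sketch of that surrounding class so the methods can be run in isolation; the class name and the helper bodies are placeholders inferred from the calls above, not the original implementation.

import operator
import re

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize


class KeywordExtractor:
    """Hypothetical context for the sentrank_* methods; only the interface is taken from the calls above."""

    def __init__(self, title, content):
        self.title = title
        self.content = content
        self.lemmatizer = WordNetLemmatizer()

    def sentrank(self):
        # Placeholder: the real method presumably returns the top-ranked sentences
        # of self.content; the first three sentences stand in for them here.
        return sent_tokenize(self.content)[:3]

    def tf_idf(self):
        # Placeholder: the real method presumably returns (term, score) pairs sorted
        # by descending TF-IDF weight; plain term frequency stands in for the score here.
        tokens = word_tokenize(re.sub(r"\d+", " ", self.content.lower()))
        counts = {}
        for tok in tokens:
            lemma = self.lemmatizer.lemmatize(tok)
            counts[lemma] = counts.get(lemma, 0) + 1
        return sorted(counts.items(), key=operator.itemgetter(1), reverse=True)

Pasting sentrank_keyword (and the variants in Examples #4 and #5) into such a class is enough to try them on a short title/body pair.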
Example #3
import math

import gensim
from gensim import corpora


def keyword_gensim_lda(docs, k=5, num_topics=10, num_words=5):
    # Load the stopword list (assumes SmartStoplist.txt is in the working directory)
    with open('SmartStoplist.txt') as f:
        stop_list = set(line.rstrip() for line in f)
    texts = [[
        word for word in gensim.utils.tokenize(
            document, lowercase=True, deacc=True, errors='replace')
        if word not in stop_list
    ] for document in docs]

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    lda = gensim.models.LdaModel(corpus,
                                 id2word=dictionary,
                                 num_topics=num_topics)
    gensim_topics = [
        t[1] for t in lda.show_topics(
            num_topics=num_topics, num_words=num_words, formatted=False)
    ]
    topics = [[(i[1], i[0]) for i in t] for t in gensim_topics]
    keywords = {}
    # Sum of probabilities for token in all topics
    for topic in topics:
        for t in topic:
            token = t[1]
            pr = t[0]
            if token in keywords:
                keywords[token] += pr
            else:
                keywords[token] = pr

    # Weight each token's summed probability by the log of its corpus frequency
    matrix = gensim.matutils.corpus2csc(corpus)
    for token, pr in keywords.items():
        # Look up the term id directly instead of scanning dictionary.items()
        token_index = dictionary.token2id[token]
        token_row = matrix.getrow(token_index)
        token_freq = token_row.sum(1).item()
        keywords[token] = pr * math.log(token_freq)

    # Sort keywords by highest score
    return sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:k]
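A quick, hypothetical driver for keyword_gensim_lda: it assumes SmartStoplist.txt is present in the working directory, and the three documents below are placeholders rather than real data.

if __name__ == "__main__":
    docs = [
        "Neural networks learn distributed representations of words.",
        "Topic models such as LDA infer latent topics from word co-occurrence.",
        "TF-IDF weighting scores words by how specific they are to a document.",
    ]
    # Returns the k highest-scoring (token, score) pairs aggregated over all topics.
    print(keyword_gensim_lda(docs, k=5, num_topics=3, num_words=5))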
Example #4
    def sentrank_entity(self):
        if self.content is not None:
            top_sents = []
            if self.title:
                top_sents.append(self.title)
            # first_sents = sent_tokenize(self.content)[1]
            # if first_sents:
            #     top_sents.append(first_sents)
            # print("len of topsents :{}".format(len(top_sents)))
            # if top_sents:
            top_sents += self.sentrank()
            print("len of topsents:{}".format(len(top_sents)))
            if top_sents:
                entities = []
                keywords = {}
                tfidf_kv = self.tf_idf()[:16]
                top_tfidf = {k: v for k, v in tfidf_kv}
                sent_counter = 0
                for sent in top_sents:
                    # print(sent_counter,"top sent:",sent)
                    sent_counter += 1
                    strip = sent.strip()
                    # ner.ner() is assumed to return (named entities, word tokens) for the sentence
                    ners, words = ner.ner(strip)
                    # print("ners:",ners)
                    for ne in ners:
                        if ne not in entities:
                            entities.append(ne)
                    sent_words = " ".join(words)
                    lower = sent_words.strip().lower()
                    sub = re.sub(r"\d+", " ", lower)
                    tokens = word_tokenize(sub)
                    for token in tokens:
                        lemma = self.lemmatizer.lemmatize(token)
                        if lemma in top_tfidf:
                            keywords[token] = top_tfidf[lemma]

                print("len of entities:{}".format(len(entities)))
                return entities, sorted(keywords.items(),
                                        key=operator.itemgetter(1),
                                        reverse=True)
            else:
                return [], []
        return [], []
Example #5
    def sentrank_keyword_0(self):
        if self.content is not None:
            top_sents = self.sentrank()  # sentrank(self.content, 3)
            # print("top sents:", top_sents)
            if top_sents:
                tfidf_kv = self.tf_idf()[:20]
                top_tfidf = {k: v for k, v in tfidf_kv}
                # print("top_tfidf: ", top_tfidf)
                keywords = {}
                if self.title:
                    # list.append() returns None, so do not rebind top_sents to its result
                    top_sents.append(self.title)
                for sent in top_sents:
                    no_digits = re.sub(r"\d+", " ", sent.strip())
                    tokens = word_tokenize(no_digits)
                    for token in tokens:
                        if token.lower() in top_tfidf:
                            keywords[token] = top_tfidf[token.lower()]
                return sorted(keywords.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
            else:
                return []
        return []