forked from rpinsler/info-retrieval
-
Notifications
You must be signed in to change notification settings - Fork 0
/
popular_topics.py
62 lines (52 loc) · 2.2 KB
/
popular_topics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import os
from collections import Counter
import nltk
from nltk import word_tokenize
from nltk.tag.perceptron import PerceptronTagger
import lucene
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.util import Version
from org.apache.lucene.analysis.miscellaneous import PerFieldAnalyzerWrapper
from java.util import HashMap
from search import Searcher
from index import CustomAnalyzer
class PopularTopics:
    """Rank the most frequent noun phrases among the titles of one year.

    Titles come from a ``Searcher`` constructed from ``index_dir`` and
    ``analyzer``. Each title is lower-cased, tokenized, POS-tagged and
    chunked into noun phrases; the phrases are counted per length
    (1, 2 or 3 words) and the counts are merged into a single ranking.
    """

    # Chunk grammar: optional adjectives, then a run of singular/plural
    # nouns that contains at least one singular noun (<NN>).
    NP_GRAMMAR = "NP: {<JJ>*(<NN>|<NNS>)*<NN>(<NN>|<NNS>)*}"

    # Fraction of distinct unigrams, taken from the top of the frequency
    # list, that is skipped before unigrams are ranked.
    # NOTE(review): presumably to drop overly generic single words - confirm.
    UNIGRAM_SKIP_RATIO = 0.01

    def __init__(self, index_dir, analyzer):
        """Create the searcher used to fetch titles.

        Args:
            index_dir: passed through to ``Searcher``.
            analyzer: passed through to ``Searcher``.
        """
        self.searcher = Searcher(index_dir, analyzer)

    def dict_append(self, entity, f_dist):
        """Increment the count of ``entity`` in ``f_dist`` in place.

        Args:
            entity: sequence of word strings; joined with single spaces to
                form the dictionary key.
            f_dist: dict mapping phrase -> count; mutated by this call.
        """
        key = ' '.join(entity)
        f_dist[key] = f_dist.get(key, 0) + 1

    def get_popular_topics(self, q_year, top_k):
        """Return the ``top_k`` most frequent noun phrases for ``q_year``.

        Args:
            q_year: value handed to ``Searcher.search_year``.
            top_k: maximum number of phrases to return.

        Returns:
            A list of ``(phrase, count)`` tuples sorted by descending count,
            drawn from 1-, 2- and 3-word noun phrases. Longer phrases are
            ignored.
        """
        titles = self.searcher.search_year(q_year)

        unigram_dist = {}
        bigram_dist = {}
        trigram_dist = {}
        # Phrase length -> frequency table; lengths not listed here are
        # not part of the ranking and are therefore not counted at all.
        dist_by_length = {1: unigram_dist, 2: bigram_dist, 3: trigram_dist}

        tagger = PerceptronTagger()
        chunker = nltk.RegexpParser(self.NP_GRAMMAR)

        for title in titles:
            tokens = word_tokenize(title.lower())
            if not tokens:
                # Nothing to tag or chunk in an empty title.
                continue
            # Public tagger API; the private nltk.tag._pos_tag helper has
            # changed its signature between NLTK releases.
            tagged = tagger.tag(tokens)
            for node in chunker.parse(tagged):
                # Chunked noun phrases are subtrees; other children are
                # plain (word, tag) tuples.
                if not isinstance(node, nltk.tree.Tree):
                    continue
                # Explicit tuple instead of zip(...)[0], which fails on
                # Python 3 where zip() returns an iterator.
                entity = tuple(word for word, _tag in node)
                f_dist = dist_by_length.get(len(entity))
                if f_dist is not None:
                    self.dict_append(entity, f_dist)

        # Skip the most frequent slice of unigrams, then take top_k of the
        # remainder.
        skip = int(len(unigram_dist) * self.UNIGRAM_SKIP_RATIO)
        unigram_result = Counter(unigram_dist).most_common(skip + top_k)[skip:]
        bigram_result = Counter(bigram_dist).most_common(top_k)
        trigram_result = Counter(trigram_dist).most_common(top_k)

        candidates = unigram_result + bigram_result + trigram_result
        candidates.sort(key=lambda item: item[1], reverse=True)
        return candidates[:top_k]