-
Notifications
You must be signed in to change notification settings - Fork 0
/
ngram_bhartendu.py
69 lines (60 loc) · 1.87 KB
/
ngram_bhartendu.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import nltk
from nltk.corpus import stopwords
import sys
# Python 2-only workaround: reload(sys) makes sys.setdefaultencoding reachable
# again so the process-wide default string encoding can be forced to UTF-8.
# Neither the reload builtin nor sys.setdefaultencoding exists on Python 3.
reload(sys)
sys.setdefaultencoding("utf-8")
## Here is collocation output
# The star import supplies the BigramCollocationFinder and
# TrigramCollocationFinder names that get_ngram relies on.
from nltk.collocations import *
# Association-measure helpers; not referenced in the visible part of this file.
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()
def get_ngram(filename=None, _type=None, is_stopword=None):
    """Count word n-grams in a text file and return them ordered by frequency.

    The file is tokenized with ``nltk.word_tokenize`` and the tokens are fed
    to an NLTK collocation finder that uses a window of three tokens.

    Args:
        filename: Path of the text file to read.
        _type: ``2`` selects bigrams; any other value (including the default
            ``None``) selects trigrams.
        is_stopword: When equal to ``1``, a fixed set of punctuation tokens
            is dropped before counting. Any other value keeps every token.

    Returns:
        A list of ``[ngram, count]`` lists sorted by ascending count, where
        ``ngram`` is the n-gram's tokens joined by single spaces.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(filename) as source:
        file_content = source.read()

    # word_tokenize splits punctuation into separate tokens, not just on
    # whitespace, which is why punctuation can be filtered token-by-token.
    tokens = nltk.word_tokenize(file_content)

    if is_stopword == 1:
        # Only punctuation is removed here; English stop words are kept.
        # The comparison is against the raw (case-preserved) token.
        punctuation_tokens = frozenset(['.', ':', ',', ';', '`', '``', '"'])
        filtered_tokens = [
            token for token in tokens if token not in punctuation_tokens
        ]
    else:
        filtered_tokens = tokens

    if _type == 2:
        finder = BigramCollocationFinder.from_words(
            filtered_tokens, window_size=3)
    else:
        finder = TrigramCollocationFinder.from_words(
            filtered_tokens, window_size=3)

    # A minimum frequency of 1 keeps every n-gram that occurs at all.
    finder.apply_freq_filter(1)

    # items() exists on both Python 2 and 3 (viewitems() is Python 2 only).
    by_count = sorted(finder.ngram_fd.items(), key=lambda item: item[1])

    # Flatten each (token_tuple, count) pair into ["tok1 tok2 ...", count].
    return [[' '.join(ngram), count] for ngram, count in by_count]
#get_ngram("fred-file.txt",2)