-
Notifications
You must be signed in to change notification settings - Fork 0
/
wikicount.py
99 lines (73 loc) · 2.79 KB
/
wikicount.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
# -*- coding: utf-8 -*-
import urllib2
import codecs
from scrapy.selector import Selector
from nltk.stem.lancaster import LancasterStemmer
from nltk.corpus import stopwords
import re
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk import bigrams, trigrams
import math
import json
from nltk.stem import WordNetLemmatizer
stopwords = nltk.corpus.stopwords.words('english')
tokenizer = RegexpTokenizer("[\w’]+", flags=re.UNICODE)
wnl = WordNetLemmatizer()

# Load the keywords to process, and extend NLTK's English stopword list
# with the project-specific stopwords file.
with open('keywords.txt', 'r') as f:
    keywords = [line.strip() for line in f]
with open('stopwords.txt', 'r') as f:
    stopwords.extend(line.strip() for line in f)
def freq(word, doc):
    """Return how many times *word* occurs in the token sequence *doc*."""
    return sum(1 for token in doc if token == word)
def word_count(doc):
    """Return the total number of tokens in *doc*."""
    return len(doc)
def tf(word, doc):
    """Return the term frequency of *word* in *doc*.

    Term frequency is the raw count of *word* divided by the total
    number of tokens.  Returns 0.0 for an empty document instead of
    raising ZeroDivisionError (the original divided unconditionally).
    """
    total = len(doc)
    if total == 0:
        return 0.0
    return doc.count(word) / float(total)
def calcu_tf(keyword):
    """Fetch the Wikipedia article for *keyword* and return its top 15
    terms ranked by term frequency.

    Returns None for disambiguation pages ("<keyword> may refer to:").
    Propagates urllib2.HTTPError (e.g. 404 for a missing article); the
    caller is expected to handle it.
    """
    url = "http://en.wikipedia.com/wiki/" + keyword
    content = urllib2.urlopen(url).read()
    sel = Selector(text=content)
    # All visible text from the article body.
    text = "".join(sel.xpath('.//div[@id="mw-content-text"]//text()').extract())
    # Disambiguation pages list meanings, not content -- skip them.
    if text.find(keyword + ' may refer to:') >= 0:
        return
    tokens = tokenizer.tokenize(text)
    tokens = [token.lower() for token in tokens if len(token) > 2]
    tokens = [wnl.lemmatize(token) for token in tokens if token not in stopwords]
    # Compute tf once per distinct token; the original recomputed it for
    # every occurrence (O(n^2) via list.count).  Unused locals
    # `vocabulary` and `all_tips` and the never-filled docs['tokens']
    # entry were removed.
    tf_scores = {}
    for token in set(tokens):
        tf_scores[token] = tf(token, tokens)
    tops = sorted(tf_scores.items(), key=lambda x: x[1], reverse=True)[:15]
    return [pair[0] for pair in tops]
with codecs.open('links.txt','ab+','utf-8') as f1:
with codecs.open('tf.json','ab+','utf-8') as f:
j=0
for i in f :
print i
j+=1
print j
for keyword in keywords[j:] :
print "processing : %s "% keyword
list=[]
try:
list=calcu_tf(keyword)
except urllib2.HTTPError, err:
if err.code == 404:
pass
else:
raise
f.write(json.dumps({keyword:list})+'\n')
f1.write("http://en.wikipedia.com/wiki/"+keyword+"\n")
print json.dumps({keyword:list})
#break
raw_input()