/
main.py
95 lines (72 loc) · 2.95 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
from HTMLParser import HTMLParser
from clustering import Vector, Cluster
from collections import Counter
import Queue
import clustering
import feedparser
import jieba
import jieba.posseg as pseg
import time
from db import News
import json
# Tokens that preprocess() throws away: whitespace, punctuation and common
# function words that say nothing about what an article is about.
meaninglessWords = (
    u" ", u",", u"。", u":", u"會", u"「", u"」", u"有",
    u"...", u"在", u"的", u"被", u"及", u"他", u"與", u"但",
    u"是", u"時", u"為", u"和", u"就", u"上", u"到", u"因為",
    u"向", u"於", u"他們", u"其中", u"沒有", u"指", u"將", u"才",
)
def preprocess(segList):
    """Return the tokens of ``segList`` that are not in ``meaninglessWords``.

    ``segList`` is any iterable of tokens (``run()`` passes the result of
    ``jieba.cut``).  The surviving tokens come back as a list, in the order
    they were produced.
    """
    # A part-of-speech whitelist used to be part of the condition and is
    # currently disabled:
    # w.flag in ["a", "an", "i", "j", "l", "m", "n", "nr", "ns", "nt", "nz", "tg", "t", "vg", "v", "vd", "vn", "x"] and
    kept = []
    for token in segList:
        if token in meaninglessWords:
            continue
        kept.append(token)
    return kept
def getBagOfWords(segList):
    """Count how many times each token occurs in ``segList``.

    Returns a ``collections.Counter`` mapping token -> number of occurrences.
    """
    counts = Counter()
    counts.update(segList)
    return counts
class HTMLStripper(HTMLParser):
    """HTML parser that keeps only the text between tags.

    Feed markup with ``feed()``; every text chunk the parser reports is
    appended to ``self.fed``, and ``get_data()`` returns them joined.
    """

    def __init__(self):
        # Run the real base-class constructor rather than calling reset()
        # by hand, so any parser state the base class sets up in __init__
        # is present before feed() is used.  The explicit HTMLParser.__init__
        # form is used instead of super() because Python 2's HTMLParser is
        # an old-style class.
        HTMLParser.__init__(self)
        # Text chunks collected so far, in document order.
        self.fed = []

    def handle_data(self, d):
        """Parser callback: record one chunk of text content."""
        self.fed.append(d)

    def get_data(self):
        """Return all text collected so far as a single string."""
        return ''.join(self.fed)
def stripTag(html):
    """Return the text content of ``html`` with the markup removed.

    A fresh ``HTMLStripper`` is used for every call, so no text carries
    over between calls.
    """
    stripper = HTMLStripper()
    stripper.feed(html)
    text = stripper.get_data()
    return text
def printList(l):
    """Print ``repr(l)`` with its escape sequences decoded.

    The repr is passed through the ``unicode_escape`` codec before printing,
    so escaped code points are shown as the characters they stand for.
    """
    escaped = repr(l)
    print(escaped.decode("unicode_escape"))
def run():
    """Fetch the Hong Kong news feed, cluster its entries, return them as JSON.

    Steps:
      1. Load the jieba dictionary and initialise the segmenter.
      2. Parse the RSS feed.
      3. Give every entry a bag of words (stored under
         ``entry["bag_of_words"]``) in which title tokens are counted three
         times and summary tokens once.
      4. Cluster the entries, starting from one single-vector ``Cluster``
         per entry.
      5. Build one ``News`` record per vector and serialise the records.

    The elapsed ``time.clock()`` time of the jieba, preprocess and
    clustering phases is printed to stdout.

    Returns:
        A JSON string: a list holding the ``__dict__`` of each ``News``
        record.
    """
    start_time = time.clock()
    jieba.set_dictionary('jieba/dict.txt.big')
    jieba.initialize()
    print ("jieba " + str(time.clock() - start_time))

    # The feed fetch itself is not timed (the original reset the timer here
    # but never read the value before resetting it again).
    news_rss_url = "http://hk.news.yahoo.com/rss/hong-kong"
    # Alternative feed:
    # news_rss_url = "http://hk.news.yahoo.com/rss/china"
    info = feedparser.parse(news_rss_url)

    start_time = time.clock()
    title_weight = 3  # a title token counts as much as three summary tokens
    for entry in info.entries:
        # Word count of the summary.  A feed item without a summary is
        # treated as having an empty one instead of aborting the whole run.
        summary_html = entry.get("summary", u"")
        word_list = getBagOfWords(preprocess(jieba.cut(stripTag(summary_html))))
        # Word count of the title.
        bag_of_word_of_title = getBagOfWords(preprocess(jieba.cut(stripTag(entry.title))))
        # Combine both counts, adding the title counts title_weight times.
        bag_of_word = Counter()
        for _ in range(title_weight):
            bag_of_word.update(bag_of_word_of_title)
        bag_of_word.update(word_list)
        entry["bag_of_words"] = bag_of_word
    print ("preprocess " + str(time.clock() - start_time))

    # Debugging aid: merge every entry's bag of words and print the total.
    # result = Counter()
    # for entry in info.entries:
    #     result.update(entry["bag_of_words"])
    # printList(result)

    # Cluster the entries, starting from one cluster per entry.
    start_time = time.clock()
    clusters = clustering.clustering([Cluster([Vector(entry)]) for entry in info.entries])
    print ("clustering " + str(time.clock() - start_time))

    # Flatten the clusters into News records.  The second News argument is
    # whether the vector compares equal to its cluster's centroidVector.
    newsList = []
    for (index, cluster) in enumerate(clusters):
        for vector in cluster.listOfVectors:
            news = News(index, (vector == cluster.centroidVector), vector.data["title"], vector.data["published"], vector.data["link"])
            newsList.append(news.__dict__)
    return json.dumps(newsList)
# Script entry point: run the pipeline and print the resulting JSON.
if __name__ == "__main__":
    print(run())