-
Notifications
You must be signed in to change notification settings - Fork 0
/
step1searchNseg.py
105 lines (87 loc) · 4.04 KB
/
step1searchNseg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# -*- coding: utf-8 -*-
# Jieba and BeautifulSoup 4 must be installed before running the script.
import httplib
import random
import sys
import urllib2

import jieba
import jieba.analyse
from bs4 import BeautifulSoup
# Python 2 only: make UTF-8 the interpreter-wide default codec so that
# implicit str <-> unicode conversions of the Chinese terms do not fail.
# The input term list is therefore expected to be UTF-8 encoded.
# reload(sys) is needed first because sys.setdefaultencoding is removed from
# the sys module at interpreter start-up; reloading restores it.
reload(sys)
sys.setdefaultencoding('utf-8')
# Pool of browser User-Agent strings. One of them is attached to every
# request so that the search engine is less likely to ban the script.
# Long entries are split with implicit literal concatenation inside
# parentheses; each parenthesised group is a single list element.
user_agents = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0',
    ('Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533+ '
     '(KHTML, like Gecko) Element Browser 5.0'),
    'IBM WebExplorer /v0.94',
    'Galaxy/1.0 [en] (Mac OS X 10.5.6; U; en)',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
    'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14',
    ('Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 '
     '(KHTML, like Gecko) '
     'Version/6.0 Mobile/10A5355d Safari/8536.25'),
    ('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) '
     'Chrome/28.0.1468.0 Safari/537.36'),
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; TheWorld)',
]
# Search a term against Google. Change the base URL below to use other search engines.
def search(query):
    """Fetch the search-engine result page for *query* and return its HTML.

    The term is percent-encoded, requested from Google (zh-cn interface)
    with a User-Agent picked at random from ``user_agents``, and requested
    again until a non-empty body has been read.

    Args:
        query: The search term. It is passed straight to ``urllib2.quote``,
            so it is presumably a byte string -- TODO confirm with callers.

    Returns:
        The body of the result page as a non-empty string.

    Raises:
        Only ``httplib.BadStatusLine`` is handled (by retrying); any other
        error raised by ``urllib2.urlopen`` or ``read`` propagates.
    """
    query = urllib2.quote(query)
    # url = 'http://www.baidu.com/s?wd=%s' % query  # To search from Baidu
    url = 'https://www.google.com/search?hl=zh-cn&q=%s' % query
    request = urllib2.Request(url)
    # random.choice keeps working if entries are added to or removed from
    # the pool, unlike a hard-coded index range.
    request.add_header('User-agent', random.choice(user_agents))
    html = ''
    while not html:
        try:
            print("Connecting...")
            response = urllib2.urlopen(request)
            print("Result retrieved...")
        except httplib.BadStatusLine:
            # The server answered with an unparsable status line; try again.
            continue
        try:
            html = response.read()
        finally:
            # Release the connection even if reading the body fails.
            response.close()
    return html
# Read the list of terms and their classifications, one entry per line.
# Each line consists of "Classification<TAB>Term".
# The with-block closes the file even if reading raises.
with open("drugList.txt") as infile:
    lines = infile.readlines()
# Build one bag-of-words feature dict per drug from its search-result digests.
allWordList = ['drug-name', 'drug-type']  # Every feature seen so far, in first-seen order
knownWords = set(allWordList)  # Same content as allWordList, for O(1) membership tests
drugDictList = []  # One feature dict per drug, in input order
jieba.set_dictionary('dict.txt.big')  # Read a better dictionary for text segmentation
for i, line in enumerate(lines, 1):
    fieldList = line.split("\t")
    if len(fieldList) < 2:
        # A blank or tab-less line has no term field; skip it instead of
        # aborting the whole run with an IndexError.
        if line.strip():
            print("Skipping malformed line %s: expected 'Classification<TAB>Term'" % i)
        continue
    print("\nSearching for Entity:\t%s" % i)
    drugType = fieldList[0].strip()
    queryStr = fieldList[1].strip()
    html = search(queryStr)
    soup = BeautifulSoup(html)  # Build a html object easy to parse
    drugDict = {'drug-name': queryStr, 'drug-type': drugType}  # Every drug has these two features
    # for content in soup.find_all("div", "c-abstract"):  # Parse Baidu page
    for content in soup.find_all("span", "st"):  # Digest for each search result from Google
        # Full cut for more features
        seg_list = jieba.cut(content.get_text().strip(), cut_all=True)
        # seg_list = jieba.cut_for_search(content.get_text().strip())  # Search Engine Mode, if preferred
        for seg in seg_list:
            seg = seg.strip()
            if not seg:
                continue  # Drop tokens that are empty or whitespace only
            # Count how often the token occurs for this drug
            drugDict[seg] = drugDict.get(seg, 0) + 1
            if seg not in knownWords:  # First sighting: register a new feature column
                knownWords.add(seg)
                allWordList.append(seg)
    drugDictList.append(drugDict)
# Output the drug-by-feature matrix as tab-separated text: a header row with
# every feature name, then one row per drug holding the stored value of each
# feature, or 0 when the drug lacks it. Every cell is followed by a tab, so
# each row ends with a trailing tab before the newline.
# The with-block guarantees the file is flushed and closed.
with open("drugFeature.txt", "w") as outfile:
    for word in allWordList:
        outfile.write("%s\t" % word)
    outfile.write("\n")
    for drugDict in drugDictList:
        for word in allWordList:
            outfile.write("%s\t" % drugDict.get(word, 0))
        outfile.write("\n")