# scraMU.py — scraper for University of Manchester MACE academic-staff pages
# (removed pasted-in file-viewer residue: filename banner and bare line numbers,
#  which made the file syntactically invalid)
# -*- Coding:UTF-8 -*-
import os
import requests
from pyquery import PyQuery as pq
from ConfigParser import ConfigParser
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
class MUscrapy():
def __init__(self):
conf = ConfigParser()
conf.read('conf.ini')
self.url = 'http://www.mace.manchester.ac.uk'
self.login_data = dict(conf.items('account'))
self.headers = dict(conf.items('header'))
self.ses = requests.session()
def getPage(self):
try:
res = self.ses.get(url = self.url+'/people/staff/academic-staff/', headers = self.headers)
return res.text.decode('utf-8')
except requests.exceptions.RequestException:
print 'open '+self.url+' error'
def getGeneInfo(self,html):
doc = pq(html)
staff = []
for data in doc('tr'):
person = []
for i in range(len(pq(data).find('td'))):
person.append(pq(data).find('td').eq(i).text())
person.append(pq(data).find('td').eq(0).find('a').attr('href'))
person.append(pq(data).find('td').eq(4).find('a').attr('href'))
if(len(person)!=0 and person[0]):
staff.append(person)
return staff
def mkDir(self,path):
path = path.strip()
isPathexits = os.path.exists(path)
if (not isPathexits):
os.makedirs(path)
return True
else:
print 'path'+'dir exists'
return False
def getPersonBio(self,url,name):
try:
res = self.ses.get(self.url+url+'&pg=1', headers = self.headers)
# with open('personresearch.html','w') as f:
# f.write(res.text)
info = []
doc = pq(res.text)
for item in doc('div.researchstaffprofile-section').eq(1).find('p'):
info.append(pq(item).text())
return info
except requests.exceptions.RequestException:
print 'open '+name+' detail page error'
def getPersonResearch(self,url,name):
try:
res = self.ses.get(self.url+url+'&pg=2', headers = self.headers)
# with open('personresearch.html','w') as f:
# f.write(res.text)
info = []
doc = pq(res.text)
for item in doc('div.researchstaffprofile-section').eq(0).find('li'):
info.append(pq(item).text())
return info
except requests.exceptions.RequestException:
print 'open research '+name+' page error'
def getPersonPub(self,url,name):
try:
res = self.ses.get(self.url + url + '&pg=4', headers=self.headers)
# with open('personresearch.html','w') as f:
# f.write(res.text)
doc = pq(res.text)
info = []
for item in doc('div.researchstaffprofile-section').eq(0).find('li'):
info.append(pq(item).text())
return info
except requests.exceptions.RequestException:
print 'open '+name+'publication page error'
MU = MUscrapy()
mess = MU.getGeneInfo(MU.getPage())
with open('general.txt','w') as f:
f.write('name'+' role'+' phone'+' location'+' email'+'\n')
for item in mess:
for i in range(len(item)-2):
string = str(item[i])
f.write(string+r' ')
f.write('\n')
savePath = 'professor info'
MU.mkDir(savePath)
# save information for all professors in MU engineering college
i = 0
print len(mess)
for item in mess:
i += 1
biography = MU.getPersonBio(item[5],item[0])
research = MU.getPersonResearch(item[5],item[0])
publication = MU.getPersonPub(item[5],item[0])
filename = savePath+'/'+str(item[0])+'.txt'
print 'saving ' +item[0]+' information'+ str(i)
with open(filename,'w') as f:
f.write('name' + ' role' + ' phone' + ' location' + ' email' + '\n')
for i in range(len(item)-2):
string = str(item[i])
f.write(string+r' ')
f.write('\n')
f.write('\n')
f.write('BIOGRAPHY: \n')
for part in biography:
f.write(str(part)+'\n')
f.write('\n')
f.write('RESEARCH: \n')
for part in research:
f.write(str(part)+'\n')
f.write('\n')
f.write('PUBLICATION: \n')
for part in publication:
f.write(str(part)+'\n')
f.write('\n')