-
Notifications
You must be signed in to change notification settings - Fork 0
/
computer_icourses.py
129 lines (117 loc) · 3.99 KB
/
computer_icourses.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# -*- coding:utf-8 -*-
import urllib
import urllib2
import re
import Tool
import MysqlHelper
# Contents of one database record (one row of the computer_icourses table)
class Item(object):
    """Plain record holding the scraped fields of one course.

    Every field starts as an empty string. The attribute names match the
    column names of the ``computer_icourses`` table that the crawler in this
    file creates and fills.
    """

    def __init__(self):
        # Fields filled from the course page header.
        self.title = ""
        self.short_desc = ""
        # Fields filled from the headed sections of the course page.
        self.description = ""
        self.requirement = ""
        self.pre_knowledge = ""
        self.chapter = ""
        self.reference = ""
        self.common_prob = ""
        # Teacher names, stored by the crawler as one newline-terminated
        # name per teacher.
        self.teacher = ""
        # Address the course page was fetched from.
        self.url = ""
# Crawler class
class computer_icourses(object):
    """Crawler that copies course pages from computer.icourses.cn into MySQL.

    ``start`` fetches the index page, follows every course link found on it,
    extracts the fields of each course with regular expressions and inserts
    one row per course into the ``computer_icourses`` table.
    """

    # Section heading found on a course page -> Item attribute that receives
    # the cleaned text of that section.
    _SECTION_FIELDS = {
        '课程概述': 'description',
        '证书要求': 'requirement',
        '预备知识': 'pre_knowledge',
        '授课大纲': 'chapter',
        '参考资料': 'reference',
        '常见问题': 'common_prob',
    }

    def __init__(self):
        self.tool = Tool.Tool()

    def getPage(self, url):
        """Return the body of ``url``, or None when the request fails.

        A ``urllib2.URLError`` is reported on stdout and turned into a None
        return value instead of being propagated.
        """
        try:
            response = urllib2.urlopen(urllib2.Request(url))
            try:
                return response.read()
            finally:
                # Release the connection even if reading fails.
                response.close()
        except urllib2.URLError as e:
            if hasattr(e, "reason"):
                print(u"%s %s" % (u"连接MOOC失败,错误原因", e.reason))
            return None

    def getURL(self, page):
        """Return the list of course links on the index page, or None."""
        if not page:
            return None
        pattern = re.compile('<div class="col-md-4 col-sm-6">.*?<a class="" href="(.*?)"', re.S)
        return pattern.findall(page) or None

    def getTitle(self, page):
        """Return the stripped course title, or None when it is absent."""
        if not page:
            return None
        pattern = re.compile('<h2 class="f-fl">(.*?)<span', re.S)
        match = pattern.search(page)
        return match.group(1).strip() if match else None

    def getShortDesc(self, page):
        """Return the stripped short description, or None when absent."""
        if not page:
            return None
        pattern = re.compile('<p class="f-fc6" id="j-rectxt".*?>spContent=(.*?)</p>', re.S)
        match = pattern.search(page)
        return match.group(1).strip() if match else None

    def getInfo(self, page):
        """Return a list of (heading, raw body) section pairs, or None."""
        if not page:
            return None
        pattern = re.compile('<div class="top f-f0".*?>(.*?)</div>.*?<div.*?>(.*?)</div>', re.S)
        return pattern.findall(page) or None

    def getTeacher(self, page):
        """Return the list of teacher names on the page, or None."""
        if not page:
            return None
        pattern = re.compile('<a class="u-tchcard f-cb".*?>.*?<div class="cnt f-fl">.*?<h3 class="f-fc3">(.*?)</h3>', re.S)
        return pattern.findall(page) or None

    def _clean(self, fragment):
        """Run a raw section body through the Tool helpers."""
        # NOTE(review): Tool.replace and Tool.replaceNBSP live in the project
        # Tool module; presumably they strip markup and match non-breaking
        # spaces respectively -- confirm against Tool.
        return re.sub(self.tool.replaceNBSP, " ", self.tool.replace(fragment))

    def _buildItem(self, url, page):
        """Extract every field of one course page into a new Item."""
        record = Item()
        record.url = url
        record.title = self.getTitle(page)
        record.short_desc = self.getShortDesc(page)
        # The getters return None when nothing matched; treat that as empty
        # so a page without sections or teachers still yields a row.
        for heading, body in (self.getInfo(page) or []):
            field = self._SECTION_FIELDS.get(heading)
            if field is not None:
                setattr(record, field, self._clean(body))
        record.teacher = "".join(
            name + '\n' for name in (self.getTeacher(page) or []))
        return record

    def start(self):
        """Crawl the site and rebuild the ``computer_icourses`` table.

        Returns without touching the database when the index page cannot be
        fetched or contains no course links, so that an unreachable site does
        not wipe previously stored rows. Course pages that fail to download
        are skipped.
        """
        indexPage = self.getPage('http://computer.icourses.cn/')
        course_urls = self.getURL(indexPage)
        if not course_urls:
            return

        conn = MysqlHelper.connect()
        cur = conn.cursor()
        cur.execute('drop table if exists computer_icourses')
        cur.execute('create table computer_icourses(id int(11) primary key auto_increment,title varchar(255),short_desc text,description text,requirement text,pre_knowledge text,chapter text,reference text,common_prob text,teacher text,url varchar(255))')
        sql = 'insert into computer_icourses(title,short_desc,description,requirement,pre_knowledge,chapter,reference,common_prob,teacher,url) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'

        for course_url in course_urls:
            page = self.getPage(course_url)
            if page is None:
                # getPage has already reported the failure.
                continue
            record = self._buildItem(course_url, page)
            # Order must match the column list of the insert statement.
            value = [
                record.title,
                record.short_desc,
                record.description,
                record.requirement,
                record.pre_knowledge,
                record.chapter,
                record.reference,
                record.common_prob,
                record.teacher,
                record.url,
            ]
            MysqlHelper.insert_one(cur, sql, value)
        MysqlHelper.finish(conn)
# Script entry: build the crawler and run one full crawl.
# NOTE(review): the assignment rebinds the module-level name
# `computer_icourses` from the class to an instance, and both statements run
# whenever this module is loaded (there is no __main__ guard) -- confirm that
# both are intended.
computer_icourses = computer_icourses()
computer_icourses.start()