# Imports required by the classes below. Standard-library and third-party
# imports are implied by the code itself; the project-local import paths
# (HelpTool, DbHelper, GetCookies, Entity, MovieParser, Utils, constants)
# are assumptions, since only the names appear in this file.
import datetime
import re
import sys

import requests
from lxml import etree

import constants                      # assumed module path
import Entity                         # assumed: exposes dict templates
from db_helper import DbHelper        # assumed module path
from get_cookies import GetCookies    # assumed module path
from help_tool import HelpTool        # assumed module path
from movie_parser import MovieParser  # assumed module path
from utils import Utils               # assumed module path

class personInfo:
    def __init__(self, path, failPath):
        self.path = path
        self.failPath = failPath
        self.failId = []
        self.helptool = HelpTool()
        # Instantiate the crawler helper and the database-connection utility
        self.db_helper = DbHelper()
        self.login = GetCookies()
        # self.login = Login()
        self.start_time = datetime.datetime.now()
        self.end_time = datetime.datetime.now()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'
        }

    def parsePage(self, personUrl, html, name):
        personalInfo = Entity.personalInfo.copy()
        tmp = personUrl.split('/')
        if tmp[-1] == '':
            personalInfo['pid'] = tmp[-2]
        else:
            personalInfo['pid'] = tmp[-1]
        personalInfo['name'] = name[0].strip()
        personalInfo['personUrl'] = personUrl
        try:
            personalInfo['register_time'] = html.xpath(
                '//*[@id="profile"]/div/div[2]/div[1]/div/div/text()'
            )[1].strip()[:-2]
        except Exception:
            pass
        try:
            personalInfo['location'] = html.xpath(
                '//*[@id="profile"]/div/div[2]/div[1]/div/a/text()')[0].strip()
        except Exception:
            pass
        try:
            personalInfo['introduction'] = ''.join(
                html.xpath('//*[@id="intro_display"]/text()'))
        except Exception:
            pass
        try:
            temp = html.xpath('//*[@id="friend"]/h2/span/a/text()')[0].strip()
            personalInfo['follow_num'] = re.search(r'(\d+)', temp).group()
        except Exception:
            pass
        try:
            personalInfo['follow_url'] = html.xpath(
                '//*[@id="friend"]/h2/span/a/@href')[0]
        except Exception:
            pass
        try:
            temps = html.xpath('//*[@id="movie"]/h2/span/a')
            # e.g. ['https://movie.douban.com/people/153843683/do',
            #       'https://movie.douban.com/people/153843683/wish',
            #       'https://movie.douban.com/people/153843683/collect']
            for temp in temps:
                result = temp.xpath('@href')[0]
                num = temp.xpath('text()')[0]
                num1 = re.search(r'(\d+)', num).group()
                tmp = result.split('/')
                if tmp[-1] == '':
                    url = tmp[-2]
                else:
                    url = tmp[-1]
                if url == 'do':
                    personalInfo['do'] = result
                    personalInfo['do_num'] = num1
                elif url == 'wish':
                    personalInfo['wish'] = result
                    personalInfo['wish_num'] = num1
                elif url == 'collect':
                    personalInfo['collect'] = result
                    personalInfo['collect_num'] = num1
        except Exception:
            pass
        return personalInfo

    def requestInfo(self, cookies, personUrl, peopleIndex, retryTime):
        sys.stdout.flush()
        r = requests.get(personUrl, headers=self.headers, cookies=cookies)
        r.encoding = 'utf-8'
        if r.status_code != 200:
            self.failId.append(personUrl)
            print('---------- Failed to crawl user #{} at {} ----------'.format(
                str(peopleIndex), personUrl))
            Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
            return 2
        # Log the id currently being processed
        print('Crawling personal info of user #{} at {}'.format(str(peopleIndex), personUrl))
        html = etree.HTML(r.text)
        name = html.xpath('//*[@id="profile"]/div/div[2]/div[1]/div/div/text()[1]')
        if not name:
            if retryTime >= constants.MAX_RETRY_TIMES:
                return 2
            retryTime += 1
            print('Cookies expired')
            end_time1 = datetime.datetime.now()
            print('Time elapsed before cookie expiry: {}'.format(end_time1 - self.start_time))
            cookies = self.login.getCookie()
            if not cookies:
                print('Failed to obtain cookies, exiting!')
                return 1
            return self.requestInfo(cookies, personUrl, peopleIndex, retryTime)
        else:
            personalInfo = self.parsePage(personUrl, html, name)
            # Data is valid; write it to the database
            if personalInfo:
                self.db_helper.insert_personalInfo(personalInfo)
                print('Inserted personal info for user {}!'.format(personUrl))
        return 0

    def end(self):
        # Persist the urls that failed to crawl
        self.helptool.storeFailData(self.failPath, self.failId)
        # Release resources
        self.db_helper.close_db()
        self.end_time = datetime.datetime.now()
        self.login.closeChrome()

    # Refresh cookies after every MAX_URL_TIMES urls
    def spider(self):
        peopleIndex = 1
        times = 0
        cookies = self.login.getCookie()
        with open(self.path, "r") as f:  # one url per line
            for personUrl in f:
                if personUrl[-1] == '\n':
                    personUrl = personUrl[:-1]
                if times >= constants.MAX_URL_TIMES:
                    times = 0
                    cookies = self.login.getCookie()
                    if not cookies:
                        print('Failed to obtain cookies, exiting!')
                        print(personUrl)
                        break
                sys.stdout.flush()
                flag = self.requestInfo(cookies, personUrl, peopleIndex, 1)
                if flag == 1:
                    print(personUrl)
                    break
                peopleIndex += 1
                times += 1
                Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
        self.end()
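
# Usage sketch (the file names below are hypothetical, not from the original
# source): each spider class reads one url or id per line from `path` and
# appends the urls/ids that failed to `failPath`, e.g.:
#
#     personInfo('data/person_urls.txt', 'data/person_fail.txt').spider()
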
class FollowPersonUrl:
    def __init__(self, path, failPath):
        self.path = path
        self.failPath = failPath
        self.failId = []
        self.helptool = HelpTool()
        # Instantiate the crawler helper and the database-connection utility
        self.db_helper = DbHelper()
        self.login = GetCookies()
        # self.login = Login()
        self.start_time = datetime.datetime.now()
        self.end_time = datetime.datetime.now()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'
        }

    def parsePage(self, result, personUrl):
        followPersonUrls = []
        for item in result:
            try:
                followPersonUrl = Entity.followPersonUrl.copy()
                followName = item.xpath('dd/a/text()')[0].strip()
                # Skip deactivated accounts (Douban shows "已注销")
                if followName == "[已注销]" or followName == "已注销":
                    continue
                tmp = personUrl.split('/')
                if tmp[-1] == '':
                    followPersonUrl['originalId'] = tmp[-3]
                else:
                    followPersonUrl['originalId'] = tmp[-2]
                url = item.xpath('dd/a/@href')[0].strip()
                followPersonUrl['followUrl'] = url
                tmp = url.split('/')
                if tmp[-1] == '':
                    followPersonUrl['followId'] = tmp[-2]
                else:
                    followPersonUrl['followId'] = tmp[-1]
                followPersonUrls.append(followPersonUrl)
            except Exception:
                pass
        return followPersonUrls

    def request_personfollowurl(self, cookies, personUrl, peopleIndex, retryTime):
        sys.stdout.flush()
        r = requests.get(personUrl, headers=self.headers, cookies=cookies)
        r.encoding = 'utf-8'
        if r.status_code != 200:
            self.failId.append(personUrl)
            print('---------- Failed to crawl followee links of user #{} at {} ----------'.format(
                str(peopleIndex), personUrl))
            Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
            return 2
        # Log the id currently being processed
        print('Crawling followee links of user #{} at {}'.format(str(peopleIndex), personUrl))
        html = etree.HTML(r.text)
        result = html.xpath('//dl[@class="obu"]')
        if not result:
            if retryTime >= constants.MAX_RETRY_TIMES:
                return 2
            retryTime += 1
            print('Cookies expired')
            end_time1 = datetime.datetime.now()
            print('Time elapsed before cookie expiry: {}'.format(end_time1 - self.start_time))
            cookies = self.login.getCookie()
            if not cookies:
                print('Failed to obtain cookies, exiting!')
                return 1
            return self.request_personfollowurl(cookies, personUrl, peopleIndex, retryTime)
        else:
            followPersonUrls = self.parsePage(result, personUrl)
            # Data is valid; write it to the database
            if followPersonUrls:
                self.db_helper.insert_followPersonUrl(followPersonUrls)
                print('Inserted followee links for user {}!'.format(personUrl))
        return 0

    def end(self):
        # Persist the urls that failed to crawl
        self.helptool.storeFailData(self.failPath, self.failId)
        # Release resources
        self.db_helper.close_db()
        self.end_time = datetime.datetime.now()
        self.login.closeChrome()

    def spider(self):
        peopleIndex = 1
        times = 0
        cookies = self.login.getCookie()
        with open(self.path, "r") as f:  # one url per line
            for personUrl in f:
                if personUrl[-1] == '\n':
                    personUrl = personUrl[:-1]
                if times >= constants.MAX_URL_TIMES:
                    times = 0
                    cookies = self.login.getCookie()
                    if not cookies:
                        print('Failed to obtain cookies, exiting!')
                        print(personUrl)
                        break
                sys.stdout.flush()
                flag = self.request_personfollowurl(cookies, personUrl, peopleIndex, 1)
                if flag == 1:
                    print(personUrl)
                    break
                peopleIndex += 1
                times += 1
                Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
        self.end()
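
# The "split('/') and skip the empty tail left by a trailing slash" idiom
# above recurs in every parser. A minimal helper sketch that could replace it
# (`url_tail` is a hypothetical name, not part of the original code):
def url_tail(url, offset=1):
    """Return the offset-th path segment from the end, ignoring empty segments."""
    parts = [p for p in url.strip().split('/') if p]
    return parts[-offset]

# e.g. url_tail('https://movie.douban.com/people/153843683/')     -> '153843683'
#      url_tail('https://movie.douban.com/people/153843683/do', 2) -> '153843683'
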
class Movie:
    def __init__(self, path, failPath):
        self.path = path
        self.failPath = failPath
        self.failId = []
        self.helptool = HelpTool()
        # Instantiate the crawler helper and the database-connection utility
        self.db_helper = DbHelper()
        self.login = GetCookies()
        # self.login = Login()
        self.start_time = datetime.datetime.now()
        self.end_time = datetime.datetime.now()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'
        }
        # Instantiate the movie page parser
        self.movie_parser = MovieParser()

    def request_movie(self, cookies, mid, movieIndex, retryTime):
        sys.stdout.flush()
        r = requests.get(constants.URL_PREFIX + mid, headers=self.headers, cookies=cookies)
        r.encoding = 'utf-8'
        if r.status_code != 200:
            self.failId.append(mid)
            print('---------- Failed to crawl movie #{}, id {} ----------'.format(
                str(movieIndex), mid))
            Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
            return 2
        print('Crawling movie #{}, id {}'.format(str(movieIndex), mid))
        movie = self.movie_parser.extract_movie_info(r)
        if not movie:
            if retryTime >= constants.MAX_RETRY_TIMES:
                return 2
            retryTime += 1
            print('Cookies expired')
            end_time1 = datetime.datetime.now()
            print('Time elapsed before cookie expiry: {}'.format(end_time1 - self.start_time))
            cookies = self.login.getCookie()
            if not cookies:
                print('Failed to obtain cookies, exiting!')
                return 1
            return self.request_movie(cookies, mid, movieIndex, retryTime)
        else:
            # Data is valid; write it to the database
            movie['douban_id'] = mid
            self.db_helper.insert_movie(movie)
            print('---------- Movie id ' + mid + ': crawled successfully ----------')
        return 0

    def end(self):
        # Persist the movie ids that failed to crawl
        self.helptool.storeFailData(self.failPath, self.failId)
        # Release resources
        self.db_helper.close_db()
        self.end_time = datetime.datetime.now()
        self.login.closeChrome()

    def spider(self):
        movieIndex = 1
        times = 0
        cookies = self.login.getCookie()
        with open(self.path, "r") as f:  # one id per line
            for mid in f:
                if mid[-1] == '\n':
                    mid = mid[:-1]
                if times >= constants.MAX_URL_TIMES:
                    times = 0
                    cookies = self.login.getCookie()
                    if not cookies:
                        print('Failed to obtain cookies, exiting!')
                        print(mid)
                        break
                sys.stdout.flush()
                flag = self.request_movie(cookies, mid, movieIndex, 1)
                if flag == 1:
                    print(mid)
                    break
                movieIndex += 1
                times += 1
                Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
        self.end()
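
# The `constants` module is referenced throughout but not shown here. A
# plausible minimal sketch (all values are assumptions; URL_PREFIX is implied
# by `constants.URL_PREFIX + mid` resolving to a movie subject page):
#
#     URL_PREFIX = 'https://movie.douban.com/subject/'
#     DELAY_MIN_SECOND = 1
#     DELAY_MAX_SECOND = 5
#     MAX_RETRY_TIMES = 3
#     MAX_URL_TIMES = 50
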
class DoMovie:
    def __init__(self, path, failPath):
        self.path = path
        self.failPath = failPath
        self.failId = []
        self.helptool = HelpTool()
        # Instantiate the crawler helper and the database-connection utility
        self.db_helper = DbHelper()
        self.login = GetCookies()
        # self.login = Login()
        self.start_time = datetime.datetime.now()
        self.end_time = datetime.datetime.now()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'
        }

    def parsePage(self, pid, do_url, result, html):
        ids = 1
        doComments = []
        star = 'allstar{}0 rating'
        for item in result:
            movieComment = Entity.movieComment.copy()
            movieComment['people_id'] = pid
            movieComment['comment_url'] = do_url
            # Synthesize a unique comment id from the current timestamp plus a counter
            movieComment['comment_id'] = str(
                datetime.datetime.now().timestamp()).replace('.', '') + str(ids)
            ids += 1
            tmp = item.xpath('div[@class="info"]/ul/li[@class="title"]/a/@href')
            if not tmp:
                continue
            tmp1 = tmp[0].strip().split('/')
            if tmp1[-1] == '':
                douban_id = tmp1[-2]
            else:
                douban_id = tmp1[-1]
            movieComment['douban_id'] = douban_id
            try:
                tmp = item.xpath('div[@class="info"]/ul/li[3]/span[1]/@class')[0].strip()
                tmp1 = re.search(r'(\d+)', tmp).group()
                movieComment['star'] = star.format(tmp1)
            except Exception:
                pass
            try:
                movieComment['time'] = item.xpath(
                    'div[@class="info"]/ul/li[3]/span[@class="date"]/text()')[0].strip()
            except Exception:
                pass
            try:
                movieComment['content'] = item.xpath(
                    'div[@class="info"]/ul/li[4]/span[@class="comment"]/text()')[0].strip()
            except Exception:
                pass
            # try:
            #     tmp = item.xpath('div[@class="info"]/ul/li[4]/span[@class="p1"]/text()')[0].strip()
            #     movieComment['useful_num'] = re.search(r'(\d+)', tmp).group()
            # except Exception:
            #     pass
            doComments.append(movieComment)
        nextUrl = html.xpath('//div[@class="paginator"]/span[@class="next"]/a[1]/@href')
        return doComments, nextUrl

    def requestDo(self, cookies, pid, pageIndex, base_url, nextUrl, retryTime):
        sys.stdout.flush()
        if nextUrl:
            do_url = base_url + nextUrl
        else:
            do_url = base_url
        r = requests.get(do_url, headers=self.headers, cookies=cookies)
        r.encoding = 'utf-8'
        if r.status_code != 200:
            self.failId.append(pid)
            Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
            return 2
        html = etree.HTML(r.text)
        result = html.xpath('//div[@class="item"]')
        if not result:
            if retryTime >= constants.MAX_RETRY_TIMES:
                return 2
            retryTime += 1
            print('Cookies expired')
            end_time1 = datetime.datetime.now()
            print('Time elapsed before cookie expiry: {}'.format(end_time1 - self.start_time))
            cookies = self.login.getCookie()
            if not cookies:
                print('Failed to obtain session, exiting!')
                return 1
            return self.requestDo(cookies, pid, pageIndex, base_url, nextUrl, retryTime)
        else:
            doComments, nextUrl = self.parsePage(pid, do_url, result, html)
            # Data is valid; write it to the database
            if doComments:
                self.db_helper.insert_movieComments(doComments)
                print('Inserted page {} of short comments!'.format(pageIndex))
                pageIndex += 1
            if nextUrl:
                base_url = 'https://movie.douban.com'
                Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
                return self.requestDo(cookies, pid, pageIndex, base_url, nextUrl[0], 1)
        return 0

    def end(self):
        # Persist the ids that failed to crawl
        self.helptool.storeFailData(self.failPath, self.failId)
        # Release resources
        self.db_helper.close_db()
        self.end_time = datetime.datetime.now()
        self.login.closeChrome()

    def spider(self):
        times = 0
        cookies = self.login.getCookie()
        with open(self.path, "r") as f:  # one url per line
            for doUrl in f:
                if times >= constants.MAX_URL_TIMES:
                    times = 0
                    cookies = self.login.getCookie()
                    if not cookies:
                        print('Failed to obtain session, exiting!')
                        print(doUrl)
                        break
                # Strip the trailing newline before splitting; otherwise a url
                # ending in '/' would leave '\n' as the last segment and the
                # wrong path component would be taken as the user id
                tmp = doUrl.strip().split('/')
                if tmp[-1] == '':
                    pid = tmp[-3]
                else:
                    pid = tmp[-2]
                # Log the id currently being processed
                print('Crawling movies user id {} is currently watching!'.format(pid))
                sys.stdout.flush()
                flag = self.requestDo(cookies, pid, 1, doUrl.strip(), None, 1)
                if flag == 1:
                    print(pid)
                    break
                times += 1
                Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
        self.end()
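
# Note (observation, not a change to the code above): requestDo recurses once
# per results page, so a user list with thousands of pages could approach
# Python's default recursion limit (~1000 frames). A minimal iterative sketch
# of the same fetch-parse-follow-next loop, with hypothetical names:
def paginate(fetch_page, first_url):
    """Repeatedly call fetch_page(url) -> next url (or None) until exhausted."""
    url = first_url
    while url:
        url = fetch_page(url)
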
class WishMovie:
    def __init__(self, path, failPath):
        self.path = path
        self.failPath = failPath
        self.failId = []
        self.helptool = HelpTool()
        # Instantiate the crawler helper and the database-connection utility
        self.db_helper = DbHelper()
        self.login = GetCookies()
        # self.login = Login()
        self.start_time = datetime.datetime.now()
        self.end_time = datetime.datetime.now()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'
        }

    def parsePage(self, pid, result, html):
        wishMovies = []
        for item in result:
            wishMovie = Entity.wishMovie.copy()
            wishMovie['people_id'] = pid
            tmp = item.xpath('div[@class="info"]/ul/li[@class="title"]/a/@href')
            if not tmp:
                continue
            tmp1 = tmp[0].strip().split('/')
            if tmp1[-1] == '':
                douban_id = tmp1[-2]
            else:
                douban_id = tmp1[-1]
            wishMovie['douban_id'] = douban_id
            try:
                wishMovie['time'] = item.xpath(
                    'div[@class="info"]/ul/li[3]/span[@class="date"]/text()')[0].strip()
            except Exception:
                pass
            wishMovies.append(wishMovie)
        nextUrl = html.xpath('//div[@class="paginator"]/span[@class="next"]/a[1]/@href')
        return wishMovies, nextUrl

    def requestWish(self, cookies, pid, pageIndex, base_url, nextUrl, retryTime):
        sys.stdout.flush()
        if nextUrl:
            wish_url = base_url + nextUrl
        else:
            wish_url = base_url
        r = requests.get(wish_url, headers=self.headers, cookies=cookies)
        r.encoding = 'utf-8'
        if r.status_code != 200:
            self.failId.append(pid)
            Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
            return 2
        html = etree.HTML(r.text)
        result = html.xpath('//div[@class="item"]')
        if not result:
            if retryTime >= constants.MAX_RETRY_TIMES:
                return 2
            retryTime += 1
            print('Cookies expired')
            end_time1 = datetime.datetime.now()
            print('Time elapsed before cookie expiry: {}'.format(end_time1 - self.start_time))
            cookies = self.login.getCookie()
            if not cookies:
                print('Failed to obtain session, exiting!')
                return 1
            return self.requestWish(cookies, pid, pageIndex, base_url, nextUrl, retryTime)
        else:
            wishMovies, nextUrl = self.parsePage(pid, result, html)
            # Data is valid; write it to the database
            if wishMovies:
                self.db_helper.insert_wishMovies(wishMovies)
                print('Inserted page {} of wish-list movies!'.format(pageIndex))
                pageIndex += 1
            if nextUrl:
                base_url = 'https://movie.douban.com'
                Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
                return self.requestWish(cookies, pid, pageIndex, base_url, nextUrl[0], 1)
        return 0

    def end(self):
        # Persist the ids that failed to crawl
        self.helptool.storeFailData(self.failPath, self.failId)
        # Release resources
        self.db_helper.close_db()
        self.end_time = datetime.datetime.now()
        self.login.closeChrome()

    def spider(self):
        times = 0
        cookies = self.login.getCookie()
        with open(self.path, "r") as f:  # one url per line
            for wishUrl in f:
                if times >= constants.MAX_URL_TIMES:
                    times = 0
                    cookies = self.login.getCookie()
                    if not cookies:
                        print('Failed to obtain session, exiting!')
                        print(wishUrl)
                        break
                sys.stdout.flush()
                # Strip the trailing newline before splitting (same pitfall as
                # in DoMovie.spider)
                tmp = wishUrl.strip().split('/')
                if tmp[-1] == '':
                    pid = tmp[-3]
                else:
                    pid = tmp[-2]
                # Log the id currently being processed
                print('Crawling movies user id {} wishes to watch!'.format(pid))
                flag = self.requestWish(cookies, pid, 1, wishUrl.strip(), None, 1)
                if flag == 1:
                    print(pid)
                    break
                times += 1
                Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
        self.end()
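
# The `Entity` import supplies dict templates that each parser copies and
# fills. A sketch of the wish-list template, inferred from the keys assigned
# in WishMovie.parsePage above (the exact field set in Entity is an
# assumption):
#
#     wishMovie = {'people_id': None, 'douban_id': None, 'time': None}
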
class Comment:
    def __init__(self, path, failPath):
        self.path = path
        self.failPath = failPath
        self.failId = []
        self.helptool = HelpTool()
        # Instantiate the crawler helper and the database-connection utility
        self.db_helper = DbHelper()
        self.login = GetCookies()
        # self.login = Login()
        self.start_time = datetime.datetime.now()
        self.end_time = datetime.datetime.now()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'
        }

    def parsePage(self, mid, comment_url, result, html):
        movieComments = []
        # Extract fields from the Douban page
        for item in result:
            movieComment = Entity.movieComment.copy()
            movieComment['douban_id'] = mid
            movieComment['comment_url'] = comment_url
            try:
                # Unique id of the short comment
                movieComment['comment_id'] = item.xpath(
                    'div[@class="comment"]/h3/span[@class="comment-vote"]/input/@value')[0].strip()
            except Exception:
                pass
            try:
                # Number of "useful" votes
                movieComment['useful_num'] = item.xpath(
                    'div[@class="comment"]/h3/span[@class="comment-vote"]/span/text()')[0].strip()
            except Exception:
                pass
            try:
                # Rating
                movieComment['star'] = item.xpath(
                    'div[@class="comment"]/h3/span[@class="comment-info"]/span[2]/@class')[0].strip()
            except Exception:
                pass
            try:
                # Comment time
                movieComment['time'] = item.xpath(
                    'div[@class="comment"]/h3/span[@class="comment-info"]/span[@class="comment-time "]/@title')[0]
            except Exception:
                pass
            try:
                # Comment text
                movieComment['content'] = item.xpath(
                    'div[@class="comment"]/p/span/text()')[0]
            except Exception:
                pass
            try:
                # Commenter's name (unique)
                movieComment['people'] = item.xpath(
                    'div[@class="avatar"]/a/@title')[0]
            except Exception:
                pass
            try:
                # Commenter's profile page
                url = item.xpath('div[@class="avatar"]/a/@href')[0].strip()
                tmp = url.split('/')
                if tmp[-1] == '':
                    movieComment['people_id'] = tmp[-2]
                else:
                    movieComment['people_id'] = tmp[-1]
                movieComment['people_url'] = item.xpath(
                    'div[@class="avatar"]/a/@href')[0]
            except Exception:
                pass
            movieComments.append(movieComment)
        nextUrl = html.xpath('//a[@class="next"]/@href')
        return movieComments, nextUrl

    def requestComment(self, cookies, mid, pageIndex, base_url, nextUrl, retryTime):
        # headers = {'User-Agent': random.choice(constants.USER_AGENT)}
        # Fetch the Douban page
        sys.stdout.flush()
        if nextUrl:
            comment_url = base_url + nextUrl
        else:
            comment_url = base_url
        r = requests.get(comment_url, headers=self.headers, cookies=cookies)
        r.encoding = 'utf-8'
        if r.status_code != 200:
            self.failId.append(mid)
            Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
            return 2
        html = etree.HTML(r.text)
        result = html.xpath('//div[@class="comment-item"]')
        # If no data came back, assume the session expired: refresh cookies and retry
        if not result:
            if retryTime >= constants.MAX_RETRY_TIMES:
                return 2
            retryTime += 1
            print('Session expired')
            end_time1 = datetime.datetime.now()
            print('Time elapsed before session expiry: {}'.format(end_time1 - self.start_time))
            cookies = self.login.getCookie()
            if not cookies:
                print('Failed to obtain session, exiting!')
                return 1
            return self.requestComment(cookies, mid, pageIndex, base_url, nextUrl, retryTime)
        else:
            movieComments, nextUrl = self.parsePage(mid, comment_url, result, html)
            # Data is valid; write it to the database
            if movieComments:
                self.db_helper.insert_movieComments(movieComments)
                print('Inserted page {} of short comments!'.format(pageIndex))
                pageIndex += 1
            if nextUrl:
                Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
                return self.requestComment(cookies, mid, pageIndex, base_url, nextUrl[0], 1)
        return 0

    def end(self):
        # Persist the movie ids that failed to crawl
        self.helptool.storeFailData(self.failPath, self.failId)
        # Release resources
        self.db_helper.close_db()
        self.end_time = datetime.datetime.now()
        self.login.closeChrome()

    # Use a fresh set of cookies for each movie's comments
    def spider(self):
        with open(self.path, "r") as f:  # one id per line
            for mid in f:
                cookies = self.login.getCookie()
                if not cookies:
                    print('Failed to obtain session, exiting!')
                    print(mid)
                    break
                sys.stdout.flush()
                if mid[-1] == '\n':
                    mid = mid[:-1]
                base_url = constants.URL_PREFIX + mid + "/comments"
                # Log the id currently being processed
                print('Crawling comments for movie id {}!'.format(mid))
                flag = self.requestComment(cookies, mid, 1, base_url, None, 1)
                if flag == 1:
                    print(mid)
                    break
                Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
        self.end()
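
# Hypothetical entry point (not part of the original module): each stage can
# be run independently, with the id/url files produced by earlier stages.
#
#     if __name__ == '__main__':
#         Comment('data/movie_ids.txt', 'data/comment_fail.txt').spider()
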