def requestInfo(self, cookies, personUrl, peopleIndex, retryTime):
    """Fetch one user's profile page and persist the parsed info.

    Return codes: 0 = stored ok, 1 = cookie renewal failed (caller should
    abort), 2 = HTTP failure or retries exhausted.
    """
    sys.stdout.flush()
    resp = requests.get(personUrl, headers=self.headers, cookies=cookies)
    resp.encoding = 'utf-8'
    if resp.status_code != 200:
        self.failId.append(personUrl)
        print('----------爬取第{}个用户,链接为{},爬取失败----------'.format(
            str(peopleIndex), personUrl))
        Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
        return 2
    # Progress log for the current user.
    print('正在爬取第{}个用户,链接为{}的个人信息'.format(str(peopleIndex), personUrl))
    tree = etree.HTML(resp.text)
    name = tree.xpath('//*[@id="profile"]/div/div[2]/div[1]/div/div/text()[1]')
    if name:
        personalInfo = self.parsePage(personUrl, tree, name)
        # Only non-empty parse results are written to the database.
        if personalInfo:
            self.db_helper.insert_personalInfo(personalInfo)
            print('插入链接为{}用户信息成功!'.format(personUrl))
        return 0
    # A missing profile name is treated as an expired cookie: renew and retry
    # until MAX_RETRY_TIMES is reached.
    if retryTime >= constants.MAX_RETRY_TIMES:
        return 2
    print('cookies失效')
    print('失效时间间隔:{} 秒'.format(datetime.datetime.now() - self.start_time))
    cookies = self.login.getCookie()
    if not cookies:
        print('获取cookie失败,退出程序!')
        return 1
    return self.requestInfo(cookies, personUrl, peopleIndex, retryTime + 1)
def spider(self):
    """Crawl the personal profile of every URL listed in self.path.

    Stops early when cookies cannot be renewed; always calls self.end().
    """
    peopleIndex = 1
    times = 0
    cookies = self.login.getCookie()
    with open(self.path, "r") as f:
        for personUrl in f:
            # rstrip also copes with Windows '\r\n' endings (the original
            # removed only a bare '\n'); blank lines are skipped instead of
            # being requested as an empty URL.
            personUrl = personUrl.rstrip('\r\n')
            if not personUrl:
                continue
            # Refresh the cookie every MAX_URL_TIMES requests.
            if times >= constants.MAX_URL_TIMES:
                times = 0
                cookies = self.login.getCookie()
                if not cookies:
                    print('获取cookie失败,退出程序!')
                    print(personUrl)
                    break
            sys.stdout.flush()
            flag = self.requestInfo(cookies, personUrl, peopleIndex, 1)
            if flag == 1:
                # requestInfo could not renew cookies: abort the crawl.
                print(personUrl)
                break
            peopleIndex += 1
            times += 1
            Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
    self.end()
def request_personfollowurl(self, cookies, personUrl, peopleIndex, retryTime):
    """Fetch the follow-list page of `personUrl` and persist the links.

    Return codes: 0 = stored ok, 1 = cookie renewal failed (caller should
    abort), 2 = HTTP failure or retries exhausted.
    """
    sys.stdout.flush()
    resp = requests.get(personUrl, headers=self.headers, cookies=cookies)
    resp.encoding = 'utf-8'
    if resp.status_code != 200:
        self.failId.append(personUrl)
        print('----------爬取第{}个用户,链接为{}的关注人链接,爬取失败----------'.format(
            str(peopleIndex), personUrl))
        Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
        return 2
    # Progress log for the current user.
    print('正在爬取第{}个用户,链接为{}的关注人链接信息'.format(str(peopleIndex), personUrl))
    tree = etree.HTML(resp.text)
    followBlocks = tree.xpath('//dl[@class="obu"]')
    if followBlocks:
        followPersonUrls = self.parsePage(followBlocks, personUrl)
        # Only non-empty parse results are written to the database.
        if followPersonUrls:
            self.db_helper.insert_followPersonUrl(followPersonUrls)
            print('插入链接为{}用户关注人链接成功!'.format(personUrl))
        return 0
    # An empty follow list usually means the cookie expired: renew and retry
    # until MAX_RETRY_TIMES is reached.
    if retryTime >= constants.MAX_RETRY_TIMES:
        return 2
    print('cookies失效')
    print('失效时间间隔:{} 秒'.format(datetime.datetime.now() - self.start_time))
    cookies = self.login.getCookie()
    if not cookies:
        print('获取cookie失败,退出程序!')
        return 1
    return self.request_personfollowurl(cookies, personUrl, peopleIndex, retryTime + 1)
def request_movie(self, cookies, mid, movieIndex, retryTime):
    """Fetch the detail page of movie `mid` and insert it into the database.

    Return codes: 0 = stored ok, 1 = cookie renewal failed (caller should
    abort), 2 = HTTP failure or retries exhausted.
    """
    sys.stdout.flush()
    resp = requests.get(constants.URL_PREFIX + mid,
                        headers=self.headers, cookies=cookies)
    resp.encoding = 'utf-8'
    if resp.status_code != 200:
        self.failId.append(mid)
        print('----------爬取第{}部电影信息,id为{},爬取失败----------'.format(
            str(movieIndex), mid))
        Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
        return 2
    print('正在爬取第{}部电影信息,id为{}'.format(str(movieIndex), mid))
    movie = self.movie_parser.extract_movie_info(resp)
    if movie:
        # Valid data: tag it with its Douban id and persist it.
        movie['douban_id'] = mid
        self.db_helper.insert_movie(movie)
        print('----------电影id ' + mid + ':爬取成功' + '----------')
        return 0
    # Parsing failure usually means the cookie expired: renew and retry
    # until MAX_RETRY_TIMES is reached.
    if retryTime >= constants.MAX_RETRY_TIMES:
        return 2
    print('cookies失效')
    print('失效时间间隔:{} 秒'.format(datetime.datetime.now() - self.start_time))
    cookies = self.login.getCookie()
    if not cookies:
        print('获取cookie失败,退出程序!')
        return 1
    return self.request_movie(cookies, mid, movieIndex, retryTime + 1)
def spider(self):
    """Crawl the detail page of every movie id listed in self.path.

    Stops early when cookies cannot be renewed; always calls self.end().
    """
    movieIndex = 1
    times = 0
    cookies = self.login.getCookie()
    with open(self.path, "r") as f:
        for mid in f:
            # rstrip also copes with Windows '\r\n' endings (the original
            # removed only a bare '\n'); blank lines are skipped instead of
            # producing a request for an empty id.
            mid = mid.rstrip('\r\n')
            if not mid:
                continue
            # Refresh the cookie every MAX_URL_TIMES requests.
            if times >= constants.MAX_URL_TIMES:
                times = 0
                cookies = self.login.getCookie()
                if not cookies:
                    print('获取cookie失败,退出程序!')
                    print(mid)
                    break
            sys.stdout.flush()
            flag = self.request_movie(cookies, mid, movieIndex, 1)
            if flag == 1:
                # request_movie could not renew cookies: abort the crawl.
                print(mid)
                break
            movieIndex += 1
            times += 1
            Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
    self.end()
def spider(self):
    """Crawl the "currently watching" list for every do-list URL in self.path.

    Stops early when a session cannot be renewed; always calls self.end().
    """
    times = 0
    cookies = self.login.getCookie()
    with open(self.path, "r") as f:
        for doUrl in f:
            # BUG FIX: strip the trailing newline BEFORE splitting. The
            # original split the raw line, so a URL ending in '/' followed by
            # '\n' produced tmp == [..., pid_segment, '\n'] and tmp[-2]
            # selected the wrong path segment as the pid.
            doUrl = doUrl.strip()
            if not doUrl:
                continue
            # Refresh the session every MAX_URL_TIMES requests.
            if times >= constants.MAX_URL_TIMES:
                times = 0
                cookies = self.login.getCookie()
                if not cookies:
                    print('获取session失败,退出程序!')
                    print(doUrl)
                    break
            # Extract the user id: with a trailing slash the last segment is
            # empty and the pid sits two segments from the end.
            tmp = doUrl.split('/')
            if tmp[-1] == '':
                pid = tmp[-3]
            else:
                pid = tmp[-2]
            # Progress log for the current user id.
            print('当前爬取用户id {} 正在看的电影!'.format(pid))
            sys.stdout.flush()
            flag = self.requestDo(cookies, pid, 1, doUrl, None, 1)
            if flag == 1:
                # requestDo could not renew the session: abort the crawl.
                print(pid)
                break
            times += 1
            Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
    self.end()
def detection(self, session):
    """Check whether `session` is logged in.

    Fetches self.logined_url and looks for the profile name node.
    Returns 0 when the name is present (logged in), 1 otherwise.
    """
    resp = session.get(self.logined_url, headers=self.headers3)
    Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
    if resp.status_code != 200:
        return 1
    resp.encoding = 'utf-8'
    tree = etree.HTML(resp.text)
    name = tree.xpath('//*[@id="profile"]/div/div[2]/div[1]/div/div/text()[1]')
    return 0 if name else 1
def run():
    """Crawl the Douban Top-250 list, collect movie ids, save them to a file.

    Rotates through the logged-in cookies between page fetches.
    """
    helptool = HelpTool()
    cookieList = helptool.getCookie()
    base_url = 'https://movie.douban.com/top250'
    headers = {'User-Agent': random.choice(constants.USER_AGENT)}
    doubanIds = []
    savePath = 'data/top250id.txt'
    # Index of the cookie currently in use (rotates through cookieList).
    index = 0
    # URL of the page held in `r`, so an empty page can be re-fetched.
    url = base_url
    retries = 0
    r = requests.get(
        base_url,
        headers=headers,
        cookies=cookieList[index]
    )
    r.encoding = 'utf-8'
    while True:
        html = etree.HTML(r.text)
        result = html.xpath('//div[@class="item"]')
        if not result:
            # BUG FIX: the original `continue`d without delaying or
            # re-fetching, spinning forever on the same stale response.
            # Back off, re-request the same page, and give up after
            # MAX_RETRY_TIMES consecutive empty pages.
            retries += 1
            if retries >= constants.MAX_RETRY_TIMES:
                break
            Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
            r = requests.get(url, headers=headers, cookies=cookieList[index])
            r.encoding = 'utf-8'
            continue
        retries = 0
        for item in result:
            doubanid = item.xpath('div/div[@class="hd"]/a/@href')
            if doubanid:
                # The id is the last non-empty path segment of the href.
                tmp = doubanid[0].strip().split('/')
                if tmp[-1] == '':
                    value = tmp[-2]
                else:
                    value = tmp[-1]
                doubanIds.append(value)
                print('----------电影id ' + value + ':爬取成功' + '----------')
        nextUrl = html.xpath('//span[@class="next"]/a/@href')
        if not nextUrl:
            # No "next" link: last page reached.
            break
        url = base_url + nextUrl[0]
        # Rotate to the next cookie before fetching the next page.
        index += 1
        if index >= constants.UserNum:
            index = 0
        Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
        r = requests.get(
            url,
            headers=headers,
            cookies=cookieList[index]
        )
        r.encoding = 'utf-8'
    if doubanIds:
        helptool.storeFailData(savePath, doubanIds)
def getCookies1(self):
    """Log every configured user in and collect the valid cookies.

    Returns a tuple (cookieList, cookieNum): the list of cookie dicts whose
    login succeeded and how many there are.
    """
    cookieList = []
    for seq in range(1, constants.UserNum + 1):
        account = constants.UserInfo[seq - 1]
        cookie = self.getCookie(account[0], account[1])
        # A 'dbcl2' key is only present on a successfully logged-in cookie.
        if 'dbcl2' in cookie:
            print('获取第{}个cookie信息成功!'.format(seq))
            cookieList.append(cookie)
        else:
            print('获取第{}个cookie信息失败!'.format(seq))
        Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
    return cookieList, len(cookieList)
def requestComment(self, cookies, mid, pageIndex, base_url, nextUrl, retryTime):
    """Crawl all short-comment pages for movie `mid` into the database.

    `nextUrl` is the relative link of the page to fetch (None = first page).
    Return codes: 0 = finished, 1 = session renewal failed (caller should
    abort), 2 = HTTP failure or retries exhausted.
    """
    # BUG FIX: pagination was implemented by tail recursion, which hits the
    # interpreter recursion limit on movies with many comment pages. The
    # loop below performs the identical sequence of requests and inserts.
    while True:
        sys.stdout.flush()
        if nextUrl:
            comment_url = base_url + nextUrl
        else:
            comment_url = base_url
        r = requests.get(comment_url, headers=self.headers, cookies=cookies)
        r.encoding = 'utf-8'
        if r.status_code != 200:
            self.failId.append(mid)
            Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
            return 2
        html = etree.HTML(r.text)
        result = html.xpath('//div[@class="comment-item"]')
        if not result:
            # Empty page usually means the session expired: renew and retry
            # the same page until MAX_RETRY_TIMES is reached.
            if retryTime >= constants.MAX_RETRY_TIMES:
                return 2
            retryTime += 1
            print('session失效')
            end_time1 = datetime.datetime.now()
            print('失效时间间隔:{} 秒'.format(end_time1 - self.start_time))
            cookies = self.login.getCookie()
            if not cookies:
                print('获取session失败,退出程序!')
                return 1
            continue
        movieComments, nextPage = self.parsePage(mid, comment_url, result, html)
        # Only non-empty parse results are written to the database.
        if movieComments:
            self.db_helper.insert_movieComments(movieComments)
            print('插入第{}页短评成功!'.format(pageIndex))
        pageIndex += 1
        if not nextPage:
            return 0
        nextUrl = nextPage[0]
        # Fresh retry budget for the next page (the recursion passed 1).
        retryTime = 1
        Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
def login(self, name, password):
    """Log `name` in through the web form and return a requests.Session.

    Returns the session when the login is verified by self.detection,
    otherwise None.
    """
    session = requests.Session()
    # Warm-up GET so the login form's cookies are set before the POST.
    session.get(self.login_url, headers=self.headers1)
    payload = {'name': name, 'password': password, 'remember': 'false'}
    response = session.post(self.post_url, data=payload, headers=self.headers2)
    if response.status_code == 200:
        Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
        # detection() returns 0 when the profile page shows a logged-in name.
        if self.detection(session) == 0:
            print('获取session成功')
            return session
    print('获取session失败')
    return None
def requestWish(self, cookies, pid, pageIndex, base_url, nextUrl, retryTime):
    """Crawl all "wish to watch" pages for user `pid` into the database.

    `nextUrl` is the relative link of the page to fetch (None = first page).
    Return codes: 0 = finished, 1 = cookie renewal failed (caller should
    abort), 2 = HTTP failure or retries exhausted.
    """
    # BUG FIX: pagination was implemented by tail recursion, which hits the
    # interpreter recursion limit for users with many wish pages. The loop
    # below performs the identical sequence of requests and inserts.
    while True:
        sys.stdout.flush()
        if nextUrl:
            wish_url = base_url + nextUrl
        else:
            wish_url = base_url
        r = requests.get(
            wish_url,
            headers=self.headers,
            cookies=cookies
        )
        r.encoding = 'utf-8'
        if r.status_code != 200:
            self.failId.append(pid)
            Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
            return 2
        html = etree.HTML(r.text)
        result = html.xpath('//div[@class="item"]')
        if not result:
            # Empty page usually means the cookie expired: renew and retry
            # the same page until MAX_RETRY_TIMES is reached.
            if retryTime >= constants.MAX_RETRY_TIMES:
                return 2
            retryTime += 1
            print('cookie失效')
            end_time1 = datetime.datetime.now()
            print('失效时间间隔:{} 秒'.format(end_time1 - self.start_time))
            cookies = self.login.getCookie()
            if not cookies:
                print('获取session失败,退出程序!')
                return 1
            continue
        wishMovies, nextPage = self.parsePage(pid, result, html)
        # Only non-empty parse results are written to the database.
        if wishMovies:
            self.db_helper.insert_wishMovies(wishMovies)
            print('插入第{}页想看的电影成功!'.format(pageIndex))
        pageIndex += 1
        if not nextPage:
            return 0
        # Subsequent "next" links are site-relative, so the base switches to
        # the bare domain (matches the original recursive call).
        base_url = 'https://movie.douban.com'
        nextUrl = nextPage[0]
        # Fresh retry budget for the next page (the recursion passed 1).
        retryTime = 1
        Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
def spider(self):
    """Crawl the short comments of every movie id listed in self.path.

    Stops early when a session cannot be renewed; always calls self.end().
    """
    with open(self.path, "r") as f:
        for mid in f:
            # NOTE(review): a fresh cookie is fetched for every id here,
            # unlike the sibling spiders which refresh every MAX_URL_TIMES
            # iterations — presumably intentional; confirm with the author.
            cookies = self.login.getCookie()
            if not cookies:
                print('获取session失败,退出程序!')
                print(mid)
                break
            sys.stdout.flush()
            # Drop the trailing newline left by file iteration.
            mid = mid[:-1] if mid.endswith('\n') else mid
            comments_url = constants.URL_PREFIX + mid + "/comments"
            # Progress log for the current movie id.
            print('当前爬取电影id {} 的影评!'.format(mid))
            status = self.requestComment(cookies, mid, 1, comments_url, None, 1)
            if status == 1:
                # requestComment could not renew the session: abort.
                print(mid)
                break
            Utils.delay(constants.DELAY_MIN_SECOND, constants.DELAY_MAX_SECOND)
    self.end()