def begin_spider():
    print("start spider")
    status = '000000'
    if pid_queue.empty():
        try:
            logger.info('Parent process {0}'.format(os.getpid()))
            logger.info('spider_process is started now :')
            process = Process(target=spider_process,
                              args=('spider_process', pid_queue,))
            process.start()
            # join() would block until the child process finishes; it is normally
            # used for inter-process synchronization, so it is skipped here.
            # process.join()
            time.sleep(3)
            if not pid_queue.empty():
                spider_status = 1
            else:
                spider_status = 0
        except Exception:
            spider_status = 0
            status = '999999'
            logger.debug('Failed to start the spider process!')
    else:
        spider_status = 1
        logger.info('The process already exists! Zhihu data is being crawled.')
    retn_data = {'status': status, 'spiderStatus': spider_status}
    logger.info('Return status: {0}, spider status: {1}'.format(status, spider_status))
    return jsonify(retn_data)
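# The worker function spider_process and the shared pid_queue used by
# begin_spider are not defined in this section. Below is a minimal sketch of
# what they likely look like, inferred only from how begin_spider uses them:
# the worker publishes its pid into pid_queue so begin_spider can treat a
# non-empty queue as "a spider is already running". Names and the call to the
# crawler entry point are assumptions, not the project's actual code.
from multiprocessing import Process, Queue
import os

pid_queue = Queue()  # shared between the web handler and the spider process

def spider_process(name, queue):
    # Assumed behaviour: register this process's pid so begin_spider sees the
    # queue as non-empty and reports spiderStatus = 1.
    queue.put(os.getpid())
    # The real project would start the crawl here (e.g. its spider class's
    # run() method); that call is an assumption and is omitted in this sketch.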
def get_parse_url():
    if not queue_follow_url.empty():
        using_url = queue_follow_url.get()
    else:
        # Reload from the database the urls that were already crawled and the
        # urls that were still queued when the program last stopped.
        all_query_set = FollowingUrl.objects.all()
        for followingUrl in all_query_set:
            try:
                # Load every crawled url (check for duplicates again and
                # delete the duplicated documents from the database).
                if followingUrl.urlToken not in had_url:
                    had_url.add(followingUrl.urlToken)
                    if followingUrl.queueUrl != 'none':
                        # Re-queue the urls that were pending before the program stopped.
                        queue_follow_url.put(followingUrl.queueUrl)
                    else:
                        continue
                else:
                    logger.info("Deleting duplicated followingUrl:{0} with _id:{1}".format(
                        followingUrl._data, followingUrl.id))
                    followingUrl.delete()
                    continue
            except Exception as err:
                logger.debug('get_parse_url err :{0}'.format(err))
                logger.info('error happened in reload urls from mongodb!')
                continue
        # Reloading is done; take a value from the queue again.
        if not queue_follow_url.empty() and len(had_url) > 0:
            using_url = queue_follow_url.get()
        elif queue_follow_url.empty() and len(had_url) == 0:
            # Spider entry point.
            using_url = follow_url_into
        else:
            logger.info("Crawling is finished!")
            return
    return using_url
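# FollowingUrl is referenced throughout this section but never defined in it.
# A minimal sketch of what the mongoengine document probably looks like, based
# only on the fields used above (urlToken, queueUrl); the field options and
# defaults are assumptions.
from mongoengine import Document, StringField

class FollowingUrl(Document):
    # Url of a user whose profile has been (or will be) crawled; deduplication key.
    urlToken = StringField(required=True)
    # Url still waiting in the in-memory queue; set to 'none' once consumed.
    queueUrl = StringField(default='none')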
def change_queue_url2none(used_url):
    # Set queueUrl to 'none' in the database for urls that have already been
    # taken out of the queue.
    # objects() returns every document matching the query.
    query_set = FollowingUrl.objects(queueUrl=used_url)
    for updateFollowingUrl in query_set:
        try:
            updateFollowingUrl.queueUrl = 'none'
            updateFollowingUrl.save()
        except Exception as err:
            logger.debug('change_queue_url2none err :{0}'.format(err))
            continue
def parse_html_info(self, source):
    try:
        soup = BeautifulSoup(source, "html5lib")
        # On the profile page the personal information sits in the div with
        # id="data", inside its data-state attribute, as a JSON string.
        data_div = soup.find('div', attrs={'id': 'data'})
        if data_div is not None:
            data = data_div.attrs['data-state']
            # Parse the embedded user information into data_json.
            data_json = json.loads(str(data))
            # All users present on the page.
            all_users_data = data_json['entities']['users']
            if all_users_data is not None and len(all_users_data) > 0:
                # Take the user name from the end of the profile url.
                url_user_name = self.url.split("/")[-1]
                user_data = all_users_data[url_user_name]
                if user_data is not None and len(user_data) > 0:
                    # Parse the user information and store it in the database.
                    self.fetch_user_info.parsed_user_info(user_data, self.url)
                    # Mark the parsed url as done in the database.
                    self.change_queue_url2none(self.url)
                    # Collect all pagination buttons on the page.
                    pages_html = soup.find_all(
                        'button',
                        attrs={'class': 'Button PaginationButton Button--plain'})
                    # The last button holds the total page count.
                    if len(pages_html) > 0:
                        total_page = int(pages_html[-1].contents[0])
                    else:
                        total_page = 1
                    # Crawl the following urls of every page asynchronously. When the
                    # global max_page is 1, parse the first page directly and save the
                    # extra request for page=1.
                    if max_page > 1:
                        self.getFollowingUrl.get_other_page_following(self.url, total_page)
                    else:
                        self.getFollowingUrl.add_following_url(all_users_data)
                else:
                    logger.info('user_data is none!')
            else:
                logger.info('all_users_data is none!')
        else:
            logger.info('data_div is none! (NoneType object has no attribute attrs)')
            self.change_queue_url2none(self.url)
    except Exception as err:
        logger.debug('parse_html_info err is : {0}'.format(err))
        self.change_queue_url2none(self.url)
def get_user_html(self):
    while self.url is not None:
        self.url = self.get_parse_url()
        if self.url is None:
            # get_parse_url returned nothing; let the while/else clause finish up.
            continue
        following_url = self.url + "/following"
        logger.info('following_url is {0}'.format(following_url))
        try:
            while True:
                # proxies_ip = get_proxies_ip()
                proxies_ip = self.getProxyIp.get_proxies_ip()
                if proxies_ip is not None:
                    logger.info("spider get proxies IP is: {0}".format(proxies_ip))
                    # get_html = requests.get(following_url, headers=header, verify=True,
                    #                         cookies=cookies2)
                    self.session.proxies = proxies_ip
                    # self.session.cookies = cookies2
                    get_html = self.session.get(following_url, timeout=MAX_TIME_OUT)
                    time.sleep(1)
                    break
                else:
                    continue
            # Status code 200: parse the page.
            if get_html is not None and get_html.ok:
                self.parse_html_info(get_html.text)
            # Status code 403 (forbidden): the crawler account is blocked, so stop.
            elif get_html.status_code == 403:
                print(get_html.text)
                logger.info('status_code is 403!!! forbidden progress return!')
                return
            # Status code 404/410: the target account is blocked or invalid;
            # mark the url as done in the database and move on.
            elif get_html.status_code == 404 or get_html.status_code == 410:
                logger.info("status_code is: {0} ! followingUrl:{1} is not valid".format(
                    get_html.status_code, following_url))
                self.change_queue_url2none(self.url)
                continue
            else:
                logger.info('get_html is:{0} or other and status_code is :{1}'.format(
                    get_html.text, get_html.status_code))
                continue
        except Exception as err:
            logger.debug('get_user_html {0} Exception is {1} '.format(following_url, err))
            continue
        # return
    else:
        logger.info("Crawling finished!")
        return
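# get_proxies_ip comes from a GetProxyIp helper that is not shown in this
# section. Since its return value is assigned to session.proxies and passed to
# grequests as proxies, it should return a requests-style proxies dict, or
# None when no proxy is available. A minimal assumed sketch with a hypothetical
# hard-coded pool:
import random

class GetProxyIp(object):
    # Hypothetical proxy pool; the real project would fetch addresses from a
    # proxy service or database rather than a static list.
    proxy_pool = ['http://127.0.0.1:8888']

    def get_proxies_ip(self):
        if not self.proxy_pool:
            return None
        proxy = random.choice(self.proxy_pool)
        return {'http': proxy, 'https': proxy}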
def get_other_page_following(self, url, total_pages):
    try:
        url_append_page = url + '/following?page='
        limit_page = max_page
        # Cap the page count to limit the time spent parsing pagination pages.
        # Ignoring the follower/followee direction, several people often follow
        # the same user, so most users that have any following relation are
        # reached anyway; only users that appear exclusively deep in long
        # following lists may be missed.
        if total_pages >= limit_page:
            total_pages = limit_page
        page_list = []
        for i in range(1, total_pages + 1):
            page_list.append(url_append_page + str(i))
        # Work out how many batches of max_parse_page urls are needed.
        if total_pages % max_parse_page == 0:
            times = int(total_pages / max_parse_page)
        else:
            times = int(total_pages / max_parse_page) + 1
        # Cut the url list into sub-lists of max_parse_page urls each.
        page_cut_list = []
        for i in range(1, times + 1):
            page_cut_list.append(page_list[max_parse_page * (i - 1):max_parse_page * i])
        print(page_cut_list)
        # Fetch a proxy ip.
        proxy_ip = self.getProxyIp.get_proxies_ip()
        # Send the requests batch by batch.
        time.sleep(2)
        for url_list in page_cut_list:
            # Pass several urls to grequests at a time.
            urls = url_list
            # rs = (grequests.get(u, headers=header, proxies=proxy_ip, cookies=cookies) for u in urls)
            rs = (grequests.get(u, headers=header, proxies=proxy_ip) for u in urls)
            respond_html = []
            for resp in grequests.map(rs, exception_handler=self.exception_handler):
                # grequests.map returns the responses of the async requests (200, 500, ...).
                # print('status:{0} url:{1}'.format(resp, resp.url))
                if resp is not None:
                    respond_html.append(resp.text)
                else:
                    continue
            # The pages could also be parsed one by one in a for loop;
            # here map() parses at most max_parse_page pages per batch.
            list(map(self.parse_page_html, respond_html))
    except Exception as err:
        logger.debug("get_other_page_following error !:{0}".format(err))
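# self.exception_handler is referenced above but not defined in this section.
# grequests calls the handler with the failed request and the raised exception,
# so a minimal assumed implementation only needs to log the failure; returning
# None leaves a None entry in the grequests.map result, which the loop above
# already skips.
def exception_handler(self, request, exception):
    logger.debug('async request {0} failed: {1}'.format(request.url, exception))
    return None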
def parse_page_html(self, page_html):
    try:
        soup_page = BeautifulSoup(page_html, 'html5lib')
        # The paginated page also embeds the user information JSON string in
        # the data-state attribute of the div with id="data".
        data = soup_page.find('div', attrs={'id': 'data'}).attrs['data-state']
        if data is not None:
            # Parse the embedded user information into data_json.
            data_json = json.loads(str(data))
            # All user data on the current page.
            all_user_data = data_json['entities']['users']
            self.add_following_url(all_user_data)
        else:
            logger.info('parse_page_html data is none!')
    except Exception as err:
        logger.debug("parse_page_html error ! {0}".format(err))
def add_following_url(follow_user__data):
    try:
        base_url = 'https://www.zhihu.com/people/'
        # Iterate over the dict's keys and values with items()
        # (.keys() and .values() could also be used separately).
        for key, value in follow_user__data.items():
            new_url = base_url + str(value['urlToken'])
            if new_url in had_url:
                continue
            elif str(value['urlToken']) == 'None':
                continue
            else:
                # Remember the url in had_url.
                had_url.add(new_url)
                # Put it into the queue; after deduplication through had_url the
                # queue only holds unique urls whose user info is not parsed yet.
                logger.info('new following url is :{0}'.format(new_url))
                queue_follow_url.put(new_url)
                # Persist both the crawled url and the queued url to mongodb
                # (already deduplicated).
                db_url = FollowingUrl()
                db_url.urlToken = new_url
                db_url.queueUrl = new_url
                db_url.save()
    except Exception as err:
        logger.debug('add_following_url has err :{0}'.format(err))
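# had_url, queue_follow_url and follow_url_into are module-level state shared
# by get_parse_url, add_following_url and load_url; they are not defined in
# this section. The sketch below only reflects how they are used above: a set
# for deduplication, a FIFO queue of pending urls, and a seed profile url. The
# concrete queue type and the seed value are assumptions (the seed is a
# placeholder, not the project's actual entry url).
from queue import Queue

had_url = set()             # every url whose profile has been crawled or queued
queue_follow_url = Queue()  # urls still waiting to be crawled
follow_url_into = 'https://www.zhihu.com/people/<seed-user>'  # assumed seed profile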
def load_url():
    logger.info("load url begin!")
    # Reload from the database the urls that were already crawled and the
    # urls that were still queued when the program last stopped.
    all_query_set = FollowingUrl.objects.all()
    for followingUrl in all_query_set:
        try:
            # Load every crawled url (check for duplicates again and
            # delete the duplicated documents from the database).
            if followingUrl.urlToken not in had_url:
                had_url.add(followingUrl.urlToken)
                if followingUrl.queueUrl != 'none':
                    # Re-queue the urls that were pending before the program stopped.
                    queue_follow_url.put(followingUrl.queueUrl)
                else:
                    continue
            else:
                logger.info("Deleting duplicated followingUrl:{0} with _id:{1}".format(
                    followingUrl._data, followingUrl.id))
                followingUrl.delete()
                continue
        except Exception as err:
            logger.debug('load url err :{0}'.format(err))
            logger.info('error happened in load urls from mongodb!')
            continue
    logger.info("load url end!")
def run(self):
    try:
        self.get_user_html()
    except Exception as err:
        logger.debug('spider get_user_html is err :{0}'.format(err))