Example #1
def begin_spider():
    print("start spider")
    status = '000000'
    if pid_queue.empty():
        try:
            logger.info('Parent process {0}'.format(os.getpid()))
            logger.info('spider_process is starting now:')
            process = Process(target=spider_process,
                              args=(
                                  'spider_process',
                                  pid_queue,
                              ))
            process.start()
            # join() makes the parent wait until the child process finishes; normally used for inter-process synchronization
            # process.join()
            time.sleep(3)
            if not pid_queue.empty():
                spider_status = 1
            else:
                spider_status = 0
        except Exception:
            spider_status = 0
            status = '999999'
            logger.debug('Failed to start the spider process!')
    else:
        spider_status = 1
        logger.info('Process already exists! Zhihu info is being crawled!')
    retn_data = {'status': status, 'spiderStatus': spider_status}
    logger.info('Return status: {0}, spider status: {1}'.format(status, spider_status))
    return jsonify(retn_data)
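Example #1 depends on module-level objects (pid_queue, spider_process, logger, and a Flask app providing jsonify) that are not shown. A minimal sketch of that assumed setup, with hypothetical names mirroring the example, might look like this:

import logging
import os
from multiprocessing import Process, Queue   # Process is used inside begin_spider above

from flask import Flask, jsonify             # begin_spider returns jsonify(...)

logger = logging.getLogger(__name__)
app = Flask(__name__)                        # assumed Flask app
pid_queue = Queue()                          # the child process reports its pid here once it is up


def spider_process(name, queue):
    # placeholder worker: put our pid on the queue so begin_spider can see the spider started
    queue.put(os.getpid())
    # ... the real crawling loop would run here ...


@app.route('/spider/start')                  # hypothetical route exposing begin_spider
def start_spider():
    return begin_spider()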
Example #2
 def get_parse_url(self):
     if not queue_follow_url.empty():
         using_url = queue_follow_url.get()
     else:
         # Load previously crawled URLs and the URLs that were still queued from the database
         all_query_set = FollowingUrl.objects.all()
         for followingUrl in all_query_set:
             try:
                 # Load every URL that has already been crawled (check for duplicates again and delete duplicates from the database)
                 if followingUrl.urlToken not in had_url:
                     had_url.add(followingUrl.urlToken)
                     if followingUrl.queueUrl != 'none':
                         # Put the URLs that were still queued when the program last stopped back onto the queue
                         queue_follow_url.put(followingUrl.queueUrl)
                     else:
                         continue
                 else:
                     logger.info("删除重复的 followingUrl:{0} 的 _id:{1}".format(
                         followingUrl._data, followingUrl.id))
                     followingUrl.delete({'_id': str(followingUrl.id)})
                     continue
             except Exception as err:
                 logger.debug('get_parse_url err :{0}'.format(err))
                 logger.info('error happened in reload urls from mongodb!')
                 continue
         # Finished loading; fetch from the queue again
         if not queue_follow_url.empty() and len(had_url) > 0:
             using_url = queue_follow_url.get()
         elif queue_follow_url.empty() and len(had_url) == 0:
             # Spider entry point
             using_url = follow_url_into
         else:
             logger.info("爬取结束了!")
             return
     return using_url
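get_parse_url (and load_url in Example #9) rely on shared module-level state and a MongoEngine document class that the snippets do not show. A hedged sketch of what they might look like follows; the field names come from the examples, everything else (database name, seed URL, queue type) is an assumption:

from queue import Queue
from mongoengine import Document, StringField, connect

connect('zhihu_spider')                        # assumed database name

queue_follow_url = Queue()                     # URLs waiting to be crawled (queue type assumed)
had_url = set()                                # every URL seen so far, for de-duplication
follow_url_into = 'https://www.zhihu.com/people/<seed-user>'   # assumed crawl entry point


class FollowingUrl(Document):
    # urlToken: a URL that has been crawled; queueUrl: the same URL while it is
    # still queued, reset to 'none' once it has been processed
    urlToken = StringField(required=True)
    queueUrl = StringField(default='none')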
Example #3
 def change_queue_url2none(self, used_url):
     # Set queueUrl to 'none' in the database for URLs that have already left the queue
     # objects() returns a list of document objects matching the query
     query_set = FollowingUrl.objects(queueUrl=used_url)
     for updateFollowingUrl in query_set:
         try:
             updateFollowingUrl.queueUrl = 'none'
             updateFollowingUrl.save()
         except Exception as err:
             logger.debug('change_queue_url2none err :{0}'.format(err))
             continue
Example #4
 def parse_html_info(self, source):
     try:
         soup = BeautifulSoup(source, "html5lib")
         # Inspecting the page shows the profile data lives in <div id="data" data-state={...}>; grab the source and parse out the user-info JSON string
         data_div = soup.find('div', attrs={'id': 'data'})
         if data_div is not None:
             data = soup.find('div', attrs={
                 'id': 'data'
             }).attrs['data-state']
             # Convert the user info extracted from the page into a JSON object (data_json)
             data_json = json.loads(str(data))
             # All users contained on the profile page
             all_users_data = data_json['entities']['users']
             if all_users_data is not None and len(all_users_data) > 0:
                 # Extract the user name from the profile URL
                 url_user_name = self.url.split("/")[-1]
                 user_data = all_users_data[url_user_name]
                 if user_data is not None and len(user_data) > 0:
                     # Parse the user info and store it in the database
                     self.fetch_user_info.parsed_user_info(
                         user_data, self.url)
                     # Mark this URL as parsed by setting its queue value to 'none'
                     self.change_queue_url2none(self.url)
                     # Find all pagination buttons on the page
                     pages_html = soup.find_all(
                         'button',
                         attrs={
                             'class':
                             'Button PaginationButton Button--plain'
                         })
                     # Get the total number of pages
                     if len(pages_html) > 0:
                         total_page = int(pages_html[-1].contents[0])
                     else:
                         total_page = 1
                     # Crawl the following URLs on every page with async I/O (if the global max page count is 1, crawl only the first page's following URLs to avoid an extra page=1 request)
                     if max_page > 1:
                         self.getFollowingUrl.get_other_page_following(
                             self.url, total_page)
                     else:
                         self.getFollowingUrl.add_following_url(
                             all_users_data)
                 else:
                     logger.info('user_data is none!')
             else:
                 logger.info('all_users_data is none!')
         else:
             logger.info(
                 'data_div is none!(NoneType object has no attribute attrs)'
             )
             self.change_queue_url2none(self.url)
     except Exception as err:
         logger.debug('parse_html_info err is : {0}'.format(err))
         self.change_queue_url2none(self.url)
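The core of parse_html_info is pulling the JSON out of the data-state attribute of <div id="data">. The fragment below is a self-contained illustration of just that step; the HTML is made up, while the BeautifulSoup and json calls are the same ones the example uses:

import json
from bs4 import BeautifulSoup

sample_html = '<div id="data" data-state=\'{"entities": {"users": {"demo": {"urlToken": "demo"}}}}\'></div>'
soup = BeautifulSoup(sample_html, 'html5lib')
data_div = soup.find('div', attrs={'id': 'data'})
if data_div is not None:
    data_json = json.loads(data_div.attrs['data-state'])
    print(data_json['entities']['users'])      # {'demo': {'urlToken': 'demo'}}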
Example #5
 def get_user_html(self):
     while self.url is not None:
         self.url = self.get_parse_url()
         if self.url is None:
             # get_parse_url() returns None when crawling is finished; fall through to the while/else branch
             continue
         following_url = self.url + "/following"
         logger.info('following_url is {0}'.format(following_url))
         try:
             while True:
                 # proxies_ip = get_proxies_ip()
                 proxies_ip = self.getProxyIp.get_proxies_ip()
                 if proxies_ip is not None:
                     logger.info(
                         "spider get proxies IP is: {0}".format(proxies_ip))
                     # get_html = requests.get(following_url, headers=header, verify=True,
                     #                         cookies=cookies2)
                     self.session.proxies = proxies_ip
                     # self.session.cookies = cookies2
                     get_html = self.session.get(following_url,
                                                 timeout=MAX_TIME_OUT)
                     time.sleep(1)
                     break
                 else:
                     continue
             # get_html status code is 200
             if get_html is not None and get_html.ok:
                 self.parse_html_info(get_html.text)
             # If the status code is 403 Forbidden, the account has been banned; stop the program for now
             elif get_html.status_code == 403:
                 print(get_html.text)
                 logger.info(
                     'status_code is 403!!! forbidden progress return!')
                 return
             # For other codes such as 404, the account is banned or invalid and cannot be crawled; set its URL to 'none' in the database
             elif get_html.status_code == 404 or get_html.status_code == 410:
                 logger.info(
                     "status_code is: {0} ! followingUrl:{1} is not valid"
                     .format(get_html.status_code, following_url))
                 self.change_queue_url2none(self.url)
                 continue
             else:
                 logger.info(
                     'get_html is:{0} or other and status_code is :{1}'.
                     format(get_html.text, get_html.status_code))
                 continue
         except Exception as err:
             logger.debug('get_user_html{0} Exception is {1} '.format(
                 following_url, err))
             continue
             # return
     else:
         logger.info("抓取结束!")
         return
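get_user_html assumes a shared requests.Session plus a proxy helper (self.getProxyIp, MAX_TIME_OUT) that the snippet does not define. requests expects proxies as a scheme-to-URL mapping; a hedged sketch of that assumed plumbing follows, where the class name, timeout, header, and proxy address are all illustrative:

import requests

MAX_TIME_OUT = 10                              # assumed request timeout in seconds


class GetProxyIp:
    def get_proxies_ip(self):
        # a real implementation would fetch a live proxy from a pool;
        # the address below is purely illustrative
        return {'http': 'http://127.0.0.1:8888',
                'https': 'http://127.0.0.1:8888'}


session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0'})   # assumed header
session.proxies = GetProxyIp().get_proxies_ip()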
 def get_other_page_following(self, url, total_pages):
     try:
         url_append_page = url + '/following?page='
         limit_page = max_page
         # Limit the time spent parsing pagination pages when there are too many (this drops some follow/followed relations, and several people may follow the same user,)
         # so most users with such relations are still reached elsewhere in the data, but information on users that only appear through a follow relation may be incomplete.
         # Process the incoming page numbers by passing in 5 URLs at a time
         if total_pages >= limit_page:
             total_pages = limit_page
         page_list = []
         for i in list(range(1, total_pages + 1)):
             page_list.append(url_append_page + (str(i)))
         # Treat the page count as a multiple of 5 to get the number of batches
         if total_pages % max_parse_page == 0:
             times = int(total_pages / max_parse_page)
         else:
             times = int(total_pages / max_parse_page) + 1
         # Cut the links into lists of 5 links each
         page_cut_list = []
         for i in list(range(1, times + 1)):
             page_cut_list.append(page_list[max_parse_page *
                                            (i - 1):max_parse_page * i])
         print(page_cut_list)
         # Get a proxy IP
         proxy_ip = self.getProxyIp.get_proxies_ip()
         # Loop over the batches and send the requests
         time.sleep(2)
         for url_list in page_cut_list:
             # Pass several URLs in on each iteration
             urls = url_list
             # rs = (grequests.get(u, headers=header, proxies=proxy_ip, cookies=cookies) for u in urls)
             rs = (grequests.get(u, headers=header, proxies=proxy_ip)
                   for u in urls)
             respond_html = []
             for resp in grequests.map(
                     rs, exception_handler=self.exception_handler):
                 # grequests.map lets us check the status codes (200, 500, ...) returned by the async requests
                 # print('status:{0} url:{1}'.format(resp, resp.url))
                 if resp is not None:
                     respond_html.append(resp.text)
                 else:
                     continue
             # 1. The responses could be parsed one by one in a for loop,
             #  2. or with map/reduce, parsing at most 5 at a time
             list(map(self.parse_page_html, respond_html))
     except Exception as err:
         logger.debug(" get_other_page_following error !:{0}".format(err))
 def parse_page_html(self, page_html):
     try:
         soup_page = BeautifulSoup(page_html, 'html5lib')
         # Parse the pagination page to get the profile data and extract the user-info JSON string
         data = soup_page.find('div', attrs={
             'id': 'data'
         }).attrs['data-state']
         if data is not None:
             # Convert the user info extracted from the page into a JSON object (data_json)
             data_json = json.loads(str(data))
             # Data set of all users on the current page
             all_user_data = data_json['entities']['users']
             self.add_following_url(all_user_data)
         else:
             logger.info('parse_page_html data is none!')
     except Exception as err:
         logger.debug("parse_page_html error ! {0}".format(err))
 def add_following_url(self, follow_user__data):
     try:
         base_url = 'https://www.zhihu.com/people/'
         #  A dict's keys and values can be iterated as below (or printed directly with .keys() and .values())
         for key, value in follow_user__data.items():
             new_url = base_url + (str(value['urlToken']))
             if new_url in had_url:
                 continue
             elif str(value['urlToken']) == 'None':
                 continue
             else:
                 # Record it in had_url
                 had_url.add(new_url)
                 # Put it on the queue; after de-duplication through had_url every queued URL is unique and its user info has not been parsed yet
                 logger.info('new following url is :{0}'.format(new_url))
                 queue_follow_url.put(new_url)
                 # Store the crawled URL and the queued URL in MongoDB (already de-duplicated)
                 db_url = FollowingUrl()
                 db_url.urlToken = new_url
                 db_url.queueUrl = new_url
                 db_url.save()
     except Exception as err:
         logger.debug('add_following_url has err :{0}'.format(err))
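get_other_page_following slices the pagination URLs into batches of max_parse_page before handing each batch to grequests. The sketch below reproduces that chunking on made-up values and shows the (request, exception) signature that grequests.map's exception_handler receives:

import grequests

max_parse_page = 5                             # assumed batch size, as in the example

page_list = ['https://www.zhihu.com/people/demo/following?page={0}'.format(i)
             for i in range(1, 13)]
# slice into lists of at most max_parse_page URLs each
page_cut_list = [page_list[i:i + max_parse_page]
                 for i in range(0, len(page_list), max_parse_page)]


def exception_handler(request, exception):
    # grequests calls this with the failed request and the exception it raised
    print('request failed: {0}'.format(exception))


for urls in page_cut_list:
    rs = (grequests.get(u, timeout=10) for u in urls)
    for resp in grequests.map(rs, exception_handler=exception_handler):
        if resp is not None:
            print(resp.status_code, resp.url)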
Example #9
 def load_url():
     logger.info("load url begin!")
     # Load previously crawled URLs and the URLs that were still queued from the database
     all_query_set = FollowingUrl.objects.all()
     for followingUrl in all_query_set:
         try:
             # Load every URL that has already been crawled (check for duplicates again and delete duplicates from the database)
             if followingUrl.urlToken not in had_url:
                 had_url.add(followingUrl.urlToken)
                 if followingUrl.queueUrl != 'none':
                     # Put the URLs that were still queued when the program last stopped back onto the queue
                     queue_follow_url.put(followingUrl.queueUrl)
                 else:
                     continue
             else:
                 logger.info("删除重复的 followingUrl:{0} 的 _id:{1}".format(
                     followingUrl._data, followingUrl.id))
                 followingUrl.delete({'_id': str(followingUrl.id)})
                 continue
         except Exception as err:
             logger.debug('load url err :{0}'.format(err))
             logger.info('error happened in load urls from mongodb!')
             continue
     logger.info("load url end!")
Example #10
 def run(self):
     try:
         self.get_user_html()
     except Exception as err:
         logger.debug('spider get_user_html error: {0}'.format(err))
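The run(self) method in Example #10 suggests the spider is implemented as a Thread (or Process) subclass, so calling start() kicks off get_user_html. A hedged sketch of that assumed wiring, with a hypothetical class name:

import threading


class ZhihuSpider(threading.Thread):           # hypothetical class name
    # get_user_html, parse_html_info and the other methods from the examples
    # above would live on this class; run() from Example #10 is its entry point
    pass


# spider = ZhihuSpider()
# spider.start()        # Thread.start() invokes run(), which calls get_user_html()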