def get_status_info(url, session, user_id, name, headers, mid=''):
    soc = SpreadOtherCache()
    print('Current repost url: ' + url)
    repost_cont = get_page(url, session, headers)

    if is_404(repost_cont):
        return None

    repost_user_id = status_parse.get_userid(repost_cont)
    repost_user_name = status_parse.get_username(repost_cont)
    soc.set_id(repost_user_id)
    soc.set_name(repost_user_name)

    so = SpreadOther()
    so.id = repost_user_id
    so.screen_name = repost_user_name
    so.upper_user_name = status_parse.get_upperusername(repost_cont, name)
    cur_user = get_userinfo.get_profile(repost_user_id, session, headers)

    try:
        so.province = cur_user.province
        so.city = cur_user.city
        so.location = cur_user.location
        so.description = cur_user.description
        so.domain_name = cur_user.domain_name
        so.blog_url = cur_user.blog_url
        so.gender = cur_user.gender
        so.headimg_url = cur_user.headimg_url
        so.followers_count = cur_user.followers_count
        so.friends_count = cur_user.friends_count
        so.status_count = cur_user.status_count
        so.verify_type = cur_user.verify_type
        so.verify_info = cur_user.verify_info
        so.register_time = cur_user.register_time

        if so.screen_name == name:
            so.id = user_id

        so.mid = status_parse.get_mid(repost_cont)
        so.status_post_time = status_parse.get_statustime(repost_cont)
        so.device = status_parse.get_statussource(repost_cont)
        # Prefer the mid passed in by the caller; fall back to parsing it from the page
        if mid:
            so.original_status_id = mid
        else:
            so.original_status_id = status_parse.get_orignalmid(repost_cont)
        so.comments_count = status_parse.get_commentcounts(repost_cont)
        so.reposts_count = status_parse.get_repostcounts(repost_cont)
        so.like_count = status_parse.get_likecounts(repost_cont)
        so.status_url = url
    except AttributeError as e:
        # todo: track down the root cause of this failure
        logging.info('Failed to parse {user_id}, details: {e}'.format(user_id=user_id, e=e))
        logging.info('Source of the repost page:\n{repost_cont}'.format(repost_cont=repost_cont))
        return None

    return SpreadOtherAndCache(so, soc)
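
# get_status_info returns its result wrapped in SpreadOtherAndCache, whose
# definition is outside this section. Judging from how it is used here and in
# the crawl routines below, it is a simple pair container. The sketch below is
# only an illustration consistent with that usage, not the project's actual
# class (hence the distinct name):
class SpreadOtherAndCacheSketch:
    def __init__(self, so, soc):
        self._so = so    # SpreadOther: full record of the reposting user
        self._soc = soc  # SpreadOtherCache: (id, name) pair used to resolve upper users

    def get_so(self):
        return self._so

    def get_soc(self):
        return self._soc
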
def get_page(url, session, headers, user_verify=True):
    """
    :param session:
    :param url:
    :param headers:
    :param user_verify: True for pages that may trigger verification (the 403
                        returned by search pages is not parsed yet); False for
                        the ajax repost urls
    :return:
    """
    print('Fetching url {url}'.format(url=url))
    try:
        page = session.get(url, headers=headers, timeout=time_out,
                           verify=False).text.encode('utf-8', 'ignore').decode('utf-8')
        gl.count += 1
        time.sleep(1)

        if user_verify:
            if is_403(page):
                logging.info('This account has been frozen')
                print('The account has been frozen')
                logging.info('Its page source: {page}'.format(page=page))
                logging.info('Crawl stopped at {curtime}; {count} pages fetched in total'.format(
                    curtime=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()), count=gl.count))
                # Nothing useful can be parsed from a frozen account's page
                return ''

            if is_404(page):
                logging.info('url {url} does not exist'.format(url=url))
                print('url {url} does not exist'.format(url=url))
                return ''

            if not is_complete(page):
                # The page came back truncated; wait and retry once
                time.sleep(30)
                try:
                    page = session.get(url, headers=headers, timeout=time_out,
                                       verify=False).text.encode('utf-8', 'ignore').decode('utf-8')
                except Exception as why:
                    print(why)
                    return ''
    except requests.exceptions.ReadTimeout:
        logging.info('Timed out while fetching {url}'.format(url=url))
        print('Timed out while fetching {url}'.format(url=url))
        time.sleep(60)  # sleep for 1 minute
        return ''
    except requests.exceptions.ConnectionError as e:
        logging.info('Connection refused by the target server; sleeping for 1 minute. Details: {e}'.format(e=e))
        print('Connection refused by the target server; sleeping for 1 minute. Details: {e}'.format(e=e))
        time.sleep(60)  # sleep for 1 minute
        return ''
    else:
        return page
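
# get_page relies on three page-validity predicates -- is_403, is_404 and
# is_complete -- that are defined elsewhere in the repo. As a rough sketch
# only (the marker strings below are assumptions about what weibo.com pages
# of this era contained, not the project's actual checks), they could look
# like this:

def _is_403_sketch(page):
    # A frozen account typically yields an empty body or a 403 notice.
    return page == '' or '403 Forbidden' in page


def _is_404_sketch(page):
    # Deleted statuses/users render Weibo's "page not found" notice.
    return '页面地址有误' in page or '该页面不存在' in page


def _is_complete_sketch(page):
    # Weibo pages are assembled from inline FM.view() script chunks; a
    # truncated response usually ends before the closing </html> tag.
    return page.rstrip().endswith('</html>')
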
def _get_reposts(url, session):
    """
    Main crawl routine: parse and save the root status, then collect its
    repost info.
    Note: 404 pages must be detected here; the profile crawler needs the
    same check.
    :param url:
    :param session:
    :return:
    """
    spread_other_caches = []
    spread_others = []
    spread_other_and_caches = []

    html = get_page(url, session, headers)

    if not basic.is_404(html):
        root_url = url
        if not status_parse.is_root(html):
            print('This status is not the root status; crawling from the root status instead')
            root_url = status_parse.get_rooturl(url, html)

        if root_url != '':
            html = get_page(root_url, session, headers)
            if basic.is_404(html):
                print('The root status no longer exists')
                return

            mid = status_parse.get_orignalmid(html)
            user_id = status_parse.get_userid(html)
            user_name = status_parse.get_username(html)
            post_time = status_parse.get_statustime(html)
            device = status_parse.get_statussource(html)
            comments_count = status_parse.get_commentcounts(html)
            reposts_count = status_parse.get_repostcounts(html)
            root_user = get_userinfo.get_profile(user_id, session, headers)
            spread_original_dao.save(root_user, mid, post_time, device,
                                     reposts_count, comments_count, root_url)

            print('Repost count: {counts}'.format(counts=reposts_count))

            if reposts_count > 0:
                base_url = 'http://weibo.com/aj/v6/mblog/info/big?ajwvr=6&id={mid}&page={currpage}'
                soc = SpreadOtherCache()
                soc.set_id(user_id)
                soc.set_name(user_name)
                spread_other_caches.append(soc)

                # Fetch the first ajax page to learn the total number of pages,
                # then walk the pages from the last one back to the first
                page = 1
                ajax_url = base_url.format(mid=mid, currpage=page)
                source = get_page(ajax_url, session, headers, False)
                print('Repost info url: ' + ajax_url)
                repost_json = json.loads(source)
                total_page = int(repost_json['data']['page']['totalpage'])
                page = total_page
                page_counter = 0

                while page > 0 and page_counter < page_max:
                    ajax_url = base_url.format(mid=mid, currpage=page)
                    # note: this fetches the ajax page directly, bypassing get_page's checks
                    repost_info = session.get(ajax_url).text
                    repost_json = json.loads(repost_info)
                    repost_html = repost_json['data']['html']
                    repost_urls = status_parse.get_reposturls(repost_html)

                    for repost_url in repost_urls:
                        repost_cont = get_statusinfo.get_status_info(
                            repost_url, session, user_id, user_name, headers)
                        if repost_cont is not None:
                            spread_other_and_caches.append(repost_cont)

                    for soac in spread_other_and_caches:
                        if soac.get_so().id != '':
                            spread_others.append(soac.get_so())
                            spread_other_caches.append(soac.get_soc())

                    print('Current page: {currpage}'.format(currpage=page))
                    page -= 1
                    page_counter += 1

                # Resolve each repost's upper user id through the cache;
                # fall back to the root user when no match is found
                for so in spread_others:
                    for i in spread_other_caches:
                        if so.upper_user_name == i.get_name():
                            so.upper_user_id = i.get_id()
                            break
                    else:
                        so.upper_user_id = user_id

                spread_other_dao.save(spread_others)
                print('{num} repost records fetched in total'.format(num=len(spread_others)))
                print('Repost crawling for this status is done')
        else:
            print('The root status of {url} has been deleted'.format(url=url))
    else:
        logging.info('{url} is a 404 page'.format(url=url))
def get_profile(user_id, session, headers):
    """
    Defaults to a personal account. Writers need one more fetch; enterprise
    accounts are redirected to their homepage, which can be parsed directly.
    After login, http://weibo.com/u/userId should locate a user's homepage;
    stability unknown, todo: test this path.
    'http://weibo.com/p/100505' + user_id + '/info?mod=pedit_more' seems to
    cover most cases, except that non-personal accounts are redirected to
    their homepage, and some (e.g. domain=100106) do not work at all.
    :param headers:
    :param session:
    :param user_id:
    :return:
    """
    user = User()
    r = get_user(user_id)

    if r:
        user.id = user_id
        user.screen_name = r[0]
        user.province = r[1]
        user.city = r[2]
        user.location = '{province} {city}'.format(province=r[1], city=r[2])
        try:
            user.description = r[3].read()
        except AttributeError:
            user.description = ''
        user.headimg_url = r[4]
        user.blog_url = r[5]
        user.domain_name = r[6]
        user.gender = r[7]
        user.followers_count = r[8]
        user.friends_count = r[9]
        user.status_count = r[10]
        user.birthday = r[11]
        user.verify_type = r[12]
        user.verify_info = r[13]
        user.register_time = r[14]

        # Replace None fields so encode() cannot fail when writing to the database
        for key in user.__dict__:
            if user.__dict__[key] is None:
                setattr(user, key, '')

        print('This user is already stored in the database')
    else:
        url = 'http://weibo.com/p/100505' + user_id + '/info?mod=pedit_more'
        html = get_page(url, session, headers)

        if not is_404(html):
            domain = get_publicinfo.get_userdomain(html)

            if domain == '100505' or domain == '103505' or domain == '100306':
                user = get_personalinfo.get_detail(html)
                if user is not None:
                    user.followers_count = get_personalinfo.get_fans(html)
                    user.friends_count = get_personalinfo.get_friends(html)
                    user.status_count = get_personalinfo.get_status(html)
                else:
                    user = User()
            else:
                # To keep the number of fetched urls low, not every
                # service-account domain is handled here
                if domain == '100106':
                    url = 'http://weibo.com/p/' + domain + user_id + '/home'
                    html = get_page(url, session, headers)
                    if html == '':
                        return user

                user.followers_count = get_enterpriseinfo.get_fans(html)
                user.friends_count = get_enterpriseinfo.get_friends(html)
                user.status_count = get_enterpriseinfo.get_status(html)
                user.description = get_enterpriseinfo.get_description(
                    html).encode('gbk', 'ignore').decode('gbk')

            user.id = user_id
            user.screen_name = get_publicinfo.get_username(html)
            user.headimg_url = get_publicinfo.get_headimg(html)
            user.verify_type = get_publicinfo.get_verifytype(html)
            user.verify_info = get_publicinfo.get_verifyreason(html, user.verify_type)

            save_user(user)

    return user
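
# A minimal usage sketch for get_profile. The cookie and user id below are
# placeholders (assumptions for illustration); in the project the logged-in
# session comes from the login module.
def _demo_get_profile():
    import requests

    demo_session = requests.Session()
    demo_headers = {
        'User-Agent': 'Mozilla/5.0',
        'Cookie': 'SUB=...placeholder...',  # a logged-in Weibo cookie is required
    }
    # hypothetical user id, for illustration only
    demo_user = get_profile('1111111111', demo_session, demo_headers)
    print(demo_user.screen_name, demo_user.followers_count)
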
def _get_reposts(url, session, weibo_mid):
    """
    Main crawl routine: parse and save the root status, then collect its
    repost info.
    Note: 404 pages must be detected here; the profile crawler needs the
    same check.
    :param url:
    :param session:
    :return:
    """
    spread_other_caches = []
    spread_others = []
    spread_other_and_caches = []

    html = get_page(url, session, headers)
    reposts_count = status_parse.get_repostcounts(html)
    weibosearch_dao.update_weibo_repost(weibo_mid, reposts_count)

    if not basic.is_404(html):
        root_url = url
        if not status_parse.is_root(html):
            print('This status is not the root status; crawling from the root status instead')
            root_url = status_parse.get_rooturl(url, html)
            if root_url != '':
                html = get_page(root_url, session, headers)
                if basic.is_404(html):
                    print('The root status no longer exists')
                    return

        mid = status_parse.get_orignalmid(html)
        user_id = status_parse.get_userid(html)
        user_name = status_parse.get_username(html)
        post_time = status_parse.get_statustime(html)
        device = status_parse.get_statussource(html)
        comments_count = status_parse.get_commentcounts(html)
        reposts_count = status_parse.get_repostcounts(html)
        root_user = get_userinfo.get_profile(user_id, session, headers)

        # Part of the code below is commented out for testing the anti-crawling mechanism
        rs = spread_original_dao.save(root_user, mid, post_time, device,
                                      reposts_count, comments_count, root_url)
        # if rs is False:
        #     print('Spread info of the root status has already been fetched')
        #     return

        print('Repost count: {counts}'.format(counts=reposts_count))

        if reposts_count > 0:
            base_url = 'http://weibo.com/aj/v6/mblog/info/big?ajwvr=6&id={mid}&page={currpage}'
            soc = SpreadOtherCache()
            soc.set_id(user_id)
            soc.set_name(user_name)
            spread_other_caches.append(soc)

            page = 1
            ajax_url = base_url.format(mid=mid, currpage=page)
            source = get_page(ajax_url, session, headers, False)
            print('Repost info url: ' + ajax_url)

            try:
                repost_json = json.loads(source)
                total_page = int(repost_json['data']['page']['totalpage'])
            except Exception as why:
                print('Failed to parse repost info as json. Details: {why}'.format(why=why))
                print('Returned repost info: {info}'.format(info=source))
            else:
                page = total_page
                page_counter = 0

                while page > 0 and page_counter < page_max:
                    ajax_url = base_url.format(mid=mid, currpage=page)
                    repost_info = get_page(ajax_url, session, headers, False)
                    try:
                        repost_json = json.loads(repost_info)
                        repost_html = repost_json['data']['html']
                    except Exception as why:
                        print('Failed to parse repost info as json. Details: {why}'.format(why=why))
                        print('Returned repost info: {info}'.format(info=repost_info))
                    else:
                        repost_urls = status_parse.get_reposturls(repost_html)
                        for repost_url in repost_urls:
                            repost_cont = get_statusinfo.get_status_info(
                                repost_url, session, user_id, user_name, headers)
                            if repost_cont is not None:
                                spread_other_and_caches.append(repost_cont)

                        for soac in spread_other_and_caches:
                            if soac.get_so().id != '':
                                spread_others.append(soac.get_so())
                                spread_other_caches.append(soac.get_soc())
                    finally:
                        print('Current page: {currpage}'.format(currpage=page))
                        page -= 1
                        page_counter += 1

                for so in spread_others:
                    if so.verify_type == '':
                        so.verify_type = 0
                    for i in spread_other_caches:
                        if so.upper_user_name == i.get_name():
                            so.upper_user_id = i.get_id()
                            break
                    else:
                        so.upper_user_id = user_id

                spread_others = list(set(spread_others))
                # The save below is commented out for studying the anti-crawling mechanism
                # spread_other_dao.save(spread_others)
                print('{num} repost records fetched in total'.format(num=len(spread_others)))
                print('Repost crawling for this status is done')
    else:
        logging.info('{url} is a 404 page'.format(url=url))
        print('The status {url} has been deleted'.format(url=url))
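
# list(set(spread_others)) above only removes duplicates if SpreadOther is
# hashable by content. The model class is defined outside this section, so the
# following is merely a sketch of the __eq__/__hash__ pair it would need
# (keying on the user id is an assumption, hence the distinct class name):
class SpreadOtherDedupSketch:
    def __init__(self, id=''):
        self.id = id

    def __eq__(self, other):
        return isinstance(other, SpreadOtherDedupSketch) and self.id == other.id

    def __hash__(self):
        return hash(self.id)
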
def _get_current_reposts(url, session, weibo_mid):
    """
    Revised main crawl routine. Weibo's rate limits are strict, so for now
    only the current status and its child reposts are crawled, not the root
    status.
    :param url:
    :param session:
    :return:
    """
    spread_other_caches = []
    spread_others = []
    spread_other_and_caches = []

    html = get_page(url, session, headers)
    reposts_count = status_parse.get_repostcounts(html)
    # Update the repost/comment counts in the weibo_search_data table
    weibosearch_dao.update_weibo_repost(weibo_mid, reposts_count)

    if not basic.is_404(html):
        root_url = url
        mid = status_parse.get_mid(html)
        user_id = status_parse.get_userid(html)
        user_name = status_parse.get_username(html)
        post_time = status_parse.get_statustime(html)
        device = status_parse.get_statussource(html)
        comments_count = status_parse.get_commentcounts(html)
        reposts_count = status_parse.get_repostcounts(html)
        root_user = get_userinfo.get_profile(user_id, session, headers)

        spread_original_dao.save(root_user, mid, post_time, device,
                                 reposts_count, comments_count, root_url)

        print('Repost count: {counts}'.format(counts=reposts_count))

        if reposts_count > 0:
            base_url = 'http://weibo.com/aj/v6/mblog/info/big?ajwvr=6&id={mid}&page={currpage}'
            soc = SpreadOtherCache()
            soc.set_id(user_id)
            soc.set_name(user_name)
            spread_other_caches.append(soc)

            page = 1
            ajax_url = base_url.format(mid=mid, currpage=page)
            source = get_page(ajax_url, session, headers, False)
            print('Repost info url: ' + ajax_url)

            try:
                repost_json = json.loads(source)
                total_page = int(repost_json['data']['page']['totalpage'])
            except Exception as why:
                print('Failed to parse repost info as json. Details: {why}'.format(why=why))
                print('Returned repost info: {info}'.format(info=source))
            else:
                page = total_page
                page_counter = 0

                while page > 0 and page_counter < page_max:
                    ajax_url = base_url.format(mid=mid, currpage=page)
                    repost_info = get_page(ajax_url, session, headers, False)
                    try:
                        repost_json = json.loads(repost_info)
                        repost_html = repost_json['data']['html']
                    except Exception as why:
                        print('Failed to parse repost info as json. Details: {why}'.format(why=why))
                        print('Returned repost info: {info}'.format(info=repost_info))
                    else:
                        repost_urls = status_parse.get_reposturls(repost_html)
                        for repost_url in repost_urls:
                            repost_cont = get_statusinfo.get_status_info(
                                repost_url, session, user_id, user_name, headers, mid)
                            if repost_cont is not None:
                                spread_other_and_caches.append(repost_cont)

                        for soac in spread_other_and_caches:
                            if soac.get_so().id != '':
                                spread_others.append(soac.get_so())
                                spread_other_caches.append(soac.get_soc())
                    finally:
                        print('Current page: {currpage}'.format(currpage=page))
                        page -= 1
                        page_counter += 1

                for so in spread_others:
                    if so.verify_type == '':
                        so.verify_type = 0
                    for i in spread_other_caches:
                        if so.upper_user_name == i.get_name():
                            so.upper_user_id = i.get_id()
                            break
                    else:
                        so.upper_user_id = user_id

                spread_others = list(set(spread_others))
                spread_other_dao.save(spread_others)
                print('{num} repost records fetched in total'.format(num=len(spread_others)))
                print('Repost crawling for this status is done')
    else:
        logging.info('{url} is a 404 page'.format(url=url))
        print('The status {url} has been deleted'.format(url=url))
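
# A minimal driver sketch showing how _get_current_reposts could be fed from a
# list of pending statuses. The seed pairs and the session are assumptions for
# illustration; in the project the urls/mids come from the search module and
# the logged-in session from the login module.
def _demo_crawl(session):
    pending = [
        # (status url, weibo mid) pairs -- hypothetical values
        ('http://weibo.com/1111111111/ExampleMid1', '4000000000000001'),
        ('http://weibo.com/2222222222/ExampleMid2', '4000000000000002'),
    ]
    for status_url, status_mid in pending:
        _get_current_reposts(status_url, session, status_mid)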