def get_current_reposts(url, weibo_mid):
    """
    Main crawling routine: crawl the current weibo and its child reposts, without tracing back to the root weibo.
    """
    crawler.info('crawling weibo whose url is {}'.format(url))
    spread_other_caches = list()
    spread_others = list()
    spread_other_and_caches = list()

    result = _get_current_source(url, weibo_mid)
    if result is None:
        weibosearch_dao.update_weibo_url(weibo_mid, 2)
        return

    reposts_count, user_id, user_name = result
    if reposts_count > 0:
        soc = SpreadOtherCache()
        soc.set_id(user_id)
        soc.set_name(user_name)
        spread_other_caches.append(soc)

        page = _get_total_page(weibo_mid)
        if page == 0:
            weibosearch_dao.update_weibo_url(weibo_mid, 2)
            return

        page_counter = 0
        _crawl_loop(page, page_counter, weibo_mid, user_id, user_name,
                    spread_other_and_caches, spread_others, spread_other_caches)
        _save_spread_other(spread_others, spread_other_caches, user_id)

    crawler.info('{num} repost records were fetched; repost crawling for this weibo is done'.format(
        num=len(spread_others)))
    weibosearch_dao.update_weibo_url(weibo_mid, 1)
def crawl_comment_page(mid):
    limit = get_max_comment_page()
    cur_page = 1
    next_url = ''
    while cur_page <= limit:
        cur_time = int(time.time() * 1000)
        if cur_page == 1:
            url = start_url.format(mid, cur_time)
        else:
            url = base_url.format(next_url, cur_time)
        html = get_page(url, user_verify=False)

        comment_datas = comment.get_comment_list(html, mid)
        if not comment_datas and cur_page == 1:
            crawler.warning('failed to crawl the comments of weibo {}, please check the reason'.format(mid))
            return
        save_comments(comment_datas)
        # Each request depends on the result of the previous one, so dispatching
        # a network task per page is not suitable here (mainly too much hassle).
        next_url = comment.get_next_url(html)

        if not next_url:
            crawler.info('comment crawling for weibo {} is done'.format(mid))
            return
        cur_page += 1
def _get_current_source(url, wb_mid):
    """
    :param url: url of the current weibo
    :param wb_mid: mid of the current weibo
    :return: repost count, weibo user id, user name
    """
    html = get_page(url)
    if not html or basic.is_404(html):
        return None

    reposts_count = parse_status.get_repostcounts(html)
    comments_count = parse_status.get_commentcounts(html)
    # update the repost and comment counts in the weibo_search_data table
    weibosearch_dao.update_repost_comment(mid=wb_mid, reposts=reposts_count, comments=comments_count)

    root_url = url
    user_id = parse_status.get_userid(html)
    user_name = parse_status.get_username(html)
    post_time = parse_status.get_statustime(html)
    device = parse_status.get_statussource(html)
    root_user = user.get_profile(user_id)
    # save the information of the root weibo
    spread_original_dao.save(root_user, wb_mid, post_time, device, reposts_count, comments_count, root_url)

    crawler.info('this weibo has {counts} reposts'.format(counts=reposts_count))
    return reposts_count, user_id, user_name
def search_keyword(keyword, keyword_id):
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < limit:
        cur_url = url.format(encode_keyword, cur_page)
        search_page = get_page(cur_url)
        if not search_page:
            crawler.warning('no weibo found for keyword {}, the source of the page is {}'.format(
                keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)
        # Check whether the weibo already exists in the database first. Since results are
        # sorted by time by default, an existing record means it has already been crawled,
        # so we can stop.
        for wb_data in search_list:
            rs = get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info('all new weibos for keyword {} in this search have been fetched'.format(keyword))
                return
            else:
                insert_weibo_data(wb_data)
                insert_keyword_wbid(keyword_id, wb_data.weibo_id)
                # A network call is used here instead of a local call for now; weigh the pros and cons of both.
                app.send_task('tasks.user.crawl_person_infos', args=(wb_data.uid,),
                              queue='user_crawler', routing_key='for_user_info')

        # check whether there is a next page
        if 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info('keyword {} has been crawled completely'.format(keyword))
            return
def search_keyword(keyword, keyword_id):
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < limit:
        cur_url = url.format(encode_keyword, cur_page)
        search_page = get_page(cur_url)
        if not search_page:
            crawler.warning('No result for keyword {}, the source page is {}'.format(keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)
        # Because the search results are sorted by time, if any result has been stored in mysql,
        # we need not crawl the same keyword in this turn
        for wb_data in search_list:
            rs = get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info('keyword {} has been crawled in this turn'.format(keyword))
                return
            else:
                insert_weibo_data(wb_data)
                insert_keyword_wbid(keyword_id, wb_data.weibo_id)
                # send task for crawling user info
                app.send_task('tasks.user.crawl_person_infos', args=(wb_data.uid,),
                              queue='user_crawler', routing_key='for_user_info')

        if 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info('keyword {} has been crawled in this turn'.format(keyword))
            return
def save_data(self, data):
    tasks = []
    score = None  # publish_time timestamp of the last successfully parsed item
    for item in data:
        try:
            dic = {}
            uuid = item.get("uuid")
            dic["uuid"] = uuid
            dic["url"] = f"https://www.infoq.cn/article/{uuid}"
            dic["title"] = item.get("article_title")
            dic["cover"] = item.get("article_cover")
            dic["summary"] = item.get("article_summary")
            author = item.get("author")
            if author:
                dic["author"] = author[0].get("nickname")
            else:
                dic["author"] = item.get("no_author", "").split(":")[-1]
            score = item.get("publish_time")
            dic["publish_time"] = datetime.datetime.utcfromtimestamp(
                score / 1000).strftime("%Y-%m-%d %H:%M:%S")
            dic["tags"] = ",".join(
                [topic.get("name") for topic in item.get("topic")])
            translate = item.get("translator")
            dic["translator"] = dic["author"]
            if translate:
                dic["translator"] = translate[0].get("nickname")
            dic["status"] = 0
            dic["update_time"] = datetime.datetime.now().strftime(
                "%Y-%m-%d %H:%M:%S")
            tasks.append(dic)
        except IndexError:
            crawler.error("failed to parse the item")
    Mongo().save_data(tasks)
    crawler.info(f"add {len(tasks)} datas to mongodb")
    return score
def search_keyword(keyword, keyword_id):
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    # crawler.info(limit)
    while cur_page < limit:
        cur_url = url.format(encode_keyword, cur_page)
        search_page = get_page(cur_url)
        # crawler.info(search_page)
        if not search_page:
            crawler.warning('No result for keyword {}, the source page is {}'.format(keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)
        # Because the search results are sorted by time, if any result has been stored in mysql,
        # we need not crawl the same keyword in this turn
        for wb_data in search_list:
            rs = get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info('keyword {} has been crawled in last turn'.format(keyword))
                # continue
                return
            else:
                insert_weibo_data(wb_data)
                insert_keyword_wbid(keyword_id, wb_data.weibo_id)
                # send task for crawling user info
                app.send_task('tasks.user.crawl_person_infos', args=(wb_data.uid,),
                              queue='user_crawler', routing_key='for_user_info')

        if 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info('keyword {} has been crawled in this turn'.format(keyword))
            return
async def get_session(self, url, _kwargs: dict = {}, source_type="text", status_code=200) -> Response:
    '''
    :param url: target url
    :param _kwargs: request kwargs such as headers, data, params; may also carry
        "method" ("get" or "post", default "get") and "timeout" (default 5s)
    '''
    kwargs = _kwargs
    if USE_PROXY:
        kwargs["proxy"] = await self.get_proxy()
    method = kwargs.pop("method", "get")
    timeout = kwargs.pop("timeout", 5)
    source = ""  # avoid an unbound name when the status code does not match
    with async_timeout.timeout(timeout):
        async with getattr(self.session, method)(url, **kwargs) as req:
            status = req.status
            if status in [status_code, 201]:
                if source_type == "text":
                    source = await req.text()
                elif source_type == "buff":
                    source = await req.read()
            crawler.info(f"get url:{url},status:{status}")
            res = Response(status=status, source=source)
            return res
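# For context, a self-contained sketch of the fetch pattern get_session relies on
# (an async_timeout guard around an aiohttp request). The example.com URL is only a
# placeholder, and `async with async_timeout.timeout(...)` is used here because newer
# async_timeout releases no longer accept the plain `with` form used above.
import asyncio
import aiohttp
import async_timeout

async def fetch_text(url: str, timeout: int = 5) -> str:
    """Fetch a page body with a hard timeout, mirroring the pattern above."""
    async with aiohttp.ClientSession() as session:
        async with async_timeout.timeout(timeout):
            async with session.get(url) as resp:
                return await resp.text()

if __name__ == "__main__":
    body = asyncio.get_event_loop().run_until_complete(fetch_text("https://example.com"))
    print(len(body))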
def excute_repost_task():
    # Analyse with the current weibo as the source weibo and do not trace back to the root.
    # If you need to trace upwards, check yourself whether this weibo is the root weibo.
    weibo_datas = wb_data.get_weibo_repost_not_crawled()
    crawler.info('{} weibos need their repost info crawled in this run'.format(len(weibo_datas)))
    for weibo_data in weibo_datas:
        app.send_task('tasks.repost.crawl_repost_page', args=(weibo_data.weibo_id, weibo_data.uid),
                      queue='repost_crawler', routing_key='repost_info')
async def run(data):
    crawler.info("Start Spider")
    async with aiohttp.connector.TCPConnector(
            limit=300, force_close=True, enable_cleanup_closed=True) as tc:
        async with aiohttp.ClientSession(connector=tc) as session:
            coros = (asyncio.ensure_future(bound_fetch(item, session)) for item in data)
            await start_branch(coros)
def excute_repost_task():
    datas = weibosearch_dao.get_crawl_urls()
    crawler.info('{len} weibos need to be crawled in total'.format(len=len(datas)))

    # dispatch the crawling tasks to the worker machines
    for data in datas:
        app.send_task('tasks.repost.get_current_reposts', args=(data['url'], data['mid']))

    crawler.info('task dispatching is done for this run')
def excute_repost_task():
    # regard the current weibo url as the original url; you can also analyse from the root url
    weibo_datas = wb_data.get_weibo_repost_not_crawled()
    crawler.info('There are {} repost urls to be crawled'.format(len(weibo_datas)))
    for weibo_data in weibo_datas:
        app.send_task('tasks.repost.crawl_repost_page', args=(weibo_data.weibo_id, weibo_data.uid),
                      queue='repost_crawler', routing_key='repost_info')
async def get_buff(item, session):
    url = item.get("cover")
    with async_timeout.timeout(60):
        async with session.get(url) as r:
            if r.status == 200:
                buff = await r.read()
                if len(buff):
                    crawler.info(f"NOW_IMAGE_URL: {url}")
                    await get_img(item, buff)
def log(*args, **kwargs):
    try:
        if f:
            crawler.info(f"{func.__name__} is run")
        return func(*args, **kwargs)
    except Exception:
        crawler.error(
            f"{func.__name__} raised an error, details: {traceback.format_exc()}"
        )
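# The `log` function above reads `func` and `f` from an enclosing scope, so it is presumably
# the inner wrapper of a decorator. A sketch of such a decorator factory is shown below; the
# name `logged` and the `f` on/off switch are assumptions, and `crawler` is the same
# module-level logger used throughout these snippets.
import functools
import traceback

def logged(f=True):
    """Hypothetical decorator factory that wraps a function with the logging above."""
    def decorate(func):
        @functools.wraps(func)
        def log(*args, **kwargs):
            try:
                if f:
                    crawler.info(f"{func.__name__} is run")
                return func(*args, **kwargs)
            except Exception:
                crawler.error(
                    f"{func.__name__} raised an error, details: {traceback.format_exc()}"
                )
        return log
    return decorate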
async def run(data):
    crawler.info("Start Spider")
    # TCPConnector keeps a connection pool and limits the total number of concurrent connections;
    # when the pool is full, a new request is admitted only after another one finishes.
    # The default limit is 100, and limit=0 means unlimited.
    # ClientSession builds connections through the TCPConnector, and the session can be shared.
    async with aiohttp.connector.TCPConnector(
            limit=300, force_close=True, enable_cleanup_closed=True) as tc:
        async with aiohttp.ClientSession(connector=tc) as session:
            coros = (asyncio.ensure_future(bound_fetch(item, session)) for item in data)
            await start_branch(coros)
def get_page(url, need_login=True):
    """
    :param url: url to be crawled
    :param need_login: True if the url requires login, otherwise False
    :return: return '' if an exception happens or status_code != 200
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0

    while count < max_retries:
        if need_login:
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning('no cookies in cookies pool, please find out the reason')
                send_email()
                os.kill(os.getppid(), signal.SIGTERM)
        try:
            if need_login:
                resp = requests.get(url, headers=headers, cookies=name_cookies[1],
                                    timeout=time_out, verify=False)
            else:
                resp = requests.get(url, headers=headers, timeout=time_out, verify=False)

            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                continue

            # slow down to avoid being banned
            time.sleep(interal)
        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('exceptions happen when crawling {}, specific infos are {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)
        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('max tries for {}, check the url in redis db2'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
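# A minimal, illustrative caller for get_page above. The weibo profile URL and the
# parse_user helper are assumptions used only to show the intended call pattern,
# not functions defined in this project.
def crawl_user_profile(uid):
    url = 'https://weibo.com/u/{}'.format(uid)   # hypothetical target url
    html = get_page(url, need_login=True)
    if not html:
        return None                              # crawl failed after max_retries
    return parse_user.get_profile(html)          # hypothetical parser call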
def excute_repost_task():
    # regard the current weibo url as the original url; you can also analyse from the root url
    weibo_datas = wb_data.get_weibo_repost_not_crawled()
    crawler.info('There are {} repost urls to be crawled'.format(len(weibo_datas)))
    for weibo_data in weibo_datas:
        app.send_task('tasks.repost.crawl_repost_page', args=(weibo_data.weibo_id, weibo_data.uid),
                      queue='repost_crawler', routing_key='repost_info')
async def run():
    '''
    Entry point.
    :return:
    '''
    data = await MotorBase().find()
    crawler.info("Start Spider")
    async with aiohttp.connector.TCPConnector(
            limit=300, force_close=True, enable_cleanup_closed=True) as tc:
        async with aiohttp.ClientSession(connector=tc) as session:
            coros = (asyncio.ensure_future(bound_fetch(item, session)) async for item in data)
            await branch(coros)
async def get_proxy(self) -> Optional[str]:
    '''
    Fetch a proxy address.
    '''
    while True:
        proxy = await proxy_helper.get_proxy(isown=1, protocol=2, site='dianping')
        if proxy:
            host = proxy[0].get('ip')
            port = proxy[0].get('port')
            ip = f"http://{host}:{port}"
            return ip
        else:
            crawler.info("no proxy available yet, waiting before retrying")
            await asyncio.sleep(5)
def _get_total_page(wb_mid):
    page = 1
    ajax_url = base_url.format(mid=wb_mid, currpage=page)
    source = get_page(ajax_url, False)

    if source == '':
        crawler.error('failed to crawl the repost url {}'.format(ajax_url))
        return 0

    crawler.info('the repost info url for this run is {}'.format(ajax_url))

    try:
        repost_json = json.loads(source)
        total_page = int(repost_json['data']['page']['totalpage'])
    except Exception as why:
        parser.error('an exception happened while parsing the repost info of {url} with json, details: {why}'.format(
            url=ajax_url, why=why))
        return 0
    else:
        return total_page
async def fetch_detail_page(self, item: dict):
    '''
    Visit the detail page and start parsing.
    :param item: seed record containing the detail url
    :return:
    '''
    detail_url = item.get("detail_url")
    kwargs = {"headers": DEFAULT_HEADRS}
    # set the status of the seed url to 1, meaning crawling has started
    condition = {'detail_url': detail_url}
    await MotorOperation().change_status(condition, col="discogs_index_data", status_code=1)

    response = await self.get_session(detail_url, kwargs)
    if response.status == 200:
        source = response.source
        # await self.more_images(source)
        try:
            await self.get_list_info(item, detail_url, source)
        except Exception:
            crawler.info(f"failed to parse: {detail_url}")
def search_keyword(row):
    cur_page = 1
    keyword = row.keyword
    if row.startTime:
        startTime = row.startTime.strftime('%Y-%m-%d')
        url = 'http://s.weibo.com/weibo/{}&scope=ori&suball=1&page={}&timescope=custom:{}'
    if row.endTime:
        endTime = row.endTime.strftime('%Y-%m-%d')
    encode_keyword = url_parse.quote(keyword)

    while cur_page < limit:
        if row.startTime and row.endTime:
            finalTime = startTime + ':' + endTime
            cur_url = url.format(encode_keyword, cur_page, finalTime)
        else:
            cur_url = url.format(encode_keyword, cur_page)

        search_page = get_page(cur_url)
        if not search_page:
            crawler.warning('no weibo found for keyword {}, the source of the page is {}'.format(
                keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)
        # Check whether the weibo already exists in the database first. Since results are
        # sorted by time by default, an existing record means it has already been crawled,
        # so we can stop.
        for wb_data in search_list:
            rs = get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info('all new weibos for keyword {} in this search have been fetched'.format(keyword))
                return
            else:
                insert_weibo_data(wb_data)
                # A network call is used here instead of a local call for now; weigh the pros and cons of both.
                app.send_task('tasks.user.crawl_person_infos', args=(wb_data.uid,),
                              queue='user_crawler', routing_key='for_user_info')

        # check whether there is a next page
        if 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info('keyword {} has been crawled completely'.format(keyword))
            return
def get_page(url, session, headers, user_verify=True):
    """
    :param user_verify: True if this page may show a verification code (the 403 on the search page
        is not parsed yet); False when crawling the repost ajax urls
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    try:
        page = session.get(url, headers=headers, timeout=time_out, verify=False).text. \
            encode('utf-8', 'ignore').decode('utf-8')
        time.sleep(interal)

        if user_verify:
            if is_403(page):
                crawler.warning('this account has been frozen')
                crawler.info('this crawl run ends at {curtime}'.format(curtime=time.strftime(
                    '%Y-%m-%d %H:%M:%S', time.localtime())))
                exit(-1)

            if is_404(page):
                crawler.warning('the url {url} does not exist'.format(url=url))
                return ''

            if not is_complete(page):
                time.sleep(excp_interal)
                try:
                    page = session.get(url, headers=headers, timeout=time_out, verify=False).text. \
                        encode('utf-8', 'ignore').decode('utf-8')
                except Exception as why:
                    crawler.error(why)
                    return ''
    except requests.exceptions.ReadTimeout:
        crawler.warning('connection to the target server timed out while crawling {url}'.format(url=url))
        time.sleep(excp_interal)
        return ''
    except requests.exceptions.ConnectionError as e:
        crawler.warning('the target server refused the connection; sleeping for one minute, details: {e}'.format(e=e))
        time.sleep(excp_interal)
        return ''
    else:
        return page
def get_all(d):
    while not d:
        crawler.info('no valid session obtained yet')
        time.sleep(60)

    datas = weibosearch_dao.get_crawl_urls()
    crawler.info('{len} weibos need to be crawled in total'.format(len=len(datas)))

    for data in datas:
        # the session is fetched inside the loop in case one crawl queue gets too long
        # or a weibo has too many reposts
        session = d.get('session')
        crawler.info('crawling weibo whose url is {url}'.format(url=data['url']))
        _get_current_reposts(data['url'], session, data['mid'])
        weibosearch_dao.update_weibo_url(data['mid'])

    crawler.info('this crawl run is done')
def get_page(url, user_verify=True):
    """
    :param url: url to be crawled
    :param user_verify: True if this page may show a verification code (ajax urls never do;
        requests for weibo pages or user info may); False when crawling the repost ajax urls
    :return: the fetched page; return '' on 404, 403 or any other exception
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0
    latest_name_cookies = None

    while count < max_retries:
        # switch to different cookies on every retry, different from the previous ones
        name_cookies = Cookies.fetch_cookies()

        if name_cookies is None:
            crawler.error('no cookies in the cookies pool, please check the accounts and the login task. The crawler exits.')
            os._exit(0)

        if name_cookies == latest_name_cookies:
            continue

        latest_name_cookies = name_cookies

        try:
            resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=time_out, verify=False)
            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                continue

            # sleep after every request to reduce the risk of the account being banned
            time.sleep(interal)

            if user_verify:
                if 'unfreeze' in resp.url or is_403(page):
                    crawler.warning('account {} has been frozen'.format(name_cookies[0]))
                    freeze_account(name_cookies[0])
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('the url {url} does not exist'.format(url=url))
                    return ''
        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('an exception happened while crawling {}, details: {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)
        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('max retries reached for {}; check the url in the failed queue in redis'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
def get_page(url, user_verify=True, need_login=True):
    """
    :param url: url to be crawled
    :param user_verify: True if this page may show a verification code (ajax urls never do;
        requests for weibo pages or user info may); False when crawling the repost ajax urls
    :param need_login: whether the page requires login; setting it properly reduces the pressure on accounts
    :return: the fetched page; return '' on 404, 403 or any other exception
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0
    latest_name_cookies = None

    while count < max_retries:
        if need_login:
            # switch to different cookies on every retry; if there is only one account,
            # reusing the same cookies is allowed
            name_cookies, cookies_count = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning('no cookies in the cookies pool, checking whether any account is available')
                rs = get_login_info()

                # log in with accounts whose status is normal; if no account is usable, stop the celery workers
                if len(rs) == 0:
                    crawler.error('no account is available, please check the health of the accounts')
                    # kill all celery-related processes
                    if 'win32' in sys.platform:
                        os.popen('taskkill /F /IM "celery*"')
                    else:
                        os.popen('pkill -f "celery"')
                else:
                    crawler.info('fetching cookies again...')
                    login.excute_login_task()
                    time.sleep(10)

            # switch to a different cookie on retry only when there is more than one cookie in total
            if cookies_count > 1 and name_cookies == latest_name_cookies:
                continue

            latest_name_cookies = name_cookies

        try:
            if need_login:
                resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=time_out, verify=False)

                if "$CONFIG['islogin'] = '******'" in resp.text:
                    crawler.warning('account {} is abnormal'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    continue
            else:
                resp = requests.get(url, headers=headers, timeout=time_out, verify=False)

            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                continue

            # sleep after every request to reduce the risk of the account being banned
            time.sleep(interal)

            if user_verify:
                if 'unfreeze' in resp.url or 'accessdeny' in resp.url or is_403(page):
                    crawler.warning('account {} has been frozen'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('the url {url} does not exist'.format(url=url))
                    return ''
        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('an exception happened while crawling {}, details: {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)
        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('max retries reached for {}; check the url in the failed queue in redis'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
def get_page(url, user_verify=True, need_login=True):
    """
    :param url: url to be crawled
    :param user_verify: if it's an ajax url, the value is False, else True
    :param need_login: True if the url requires login, otherwise False
    :return: return '' if exception happens or status_code != 200
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0

    while count < max_retries:
        if need_login:
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning('no cookies in cookies pool, please find out the reason')
                send_email()
                os.kill(os.getppid(), signal.SIGTERM)
        try:
            if need_login:
                resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=time_out, verify=False)

                if "$CONFIG['islogin'] = '******'" in resp.text:
                    crawler.warning('account {} has been banned'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    continue
            else:
                resp = requests.get(url, headers=headers, timeout=time_out, verify=False)

            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                continue

            # slow down to avoid being banned
            time.sleep(interal)

            if user_verify:
                if is_banned(resp.url) or is_403(page):
                    crawler.warning('account {} has been banned'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if 'verifybmobile' in resp.url:
                    crawler.warning('account {} has been locked, you should use your phone to unlock it'.
                                    format(name_cookies[0]))
                    freeze_account(name_cookies[0], -1)
                    Cookies.delete_cookies(name_cookies[0])
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('{url} seems to be 404'.format(url=url))
                    return ''
        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('exceptions happen when crawling {}, specific infos are {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)
        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('max tries for {}, check the url in redis db2'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
def send_personal_message(target_uid, adver_message, user_verify=True, need_login=True):
    """
    :param target_uid: uid of the user to receive the direct message
    :param adver_message: message text to send
    :param user_verify: if it's an ajax url, the value is False, else True
    :param need_login: if the url is need to login, the value is True, else False
    :return: return '' if exception happens or status_code != 200
    """
    crawler.info('the send_personal_message uid is {uid}'.format(uid=str(target_uid)))
    count = 0

    while count < max_retries:
        if need_login:
            name_cookies = Cookies.fetch_cookies()
            print(name_cookies)

            if name_cookies is None:
                crawler.warning('no cookies in cookies pool, please find out the reason')
                send_email()
                os.kill(os.getppid(), signal.SIGTERM)

            # check adver_timers
            if int(name_cookies[3]) >= int(adver_timers):
                continue
        try:
            if need_login:
                resp = requests.post('http://api.weibo.com/webim/2/direct_messages/new.json?source=' + str(name_cookies[2]),
                                     data={'text': adver_message, 'uid': str(target_uid)},
                                     cookies=name_cookies[1], headers=personal_message_headers)

                if "error" in resp.text:
                    crawler.warning('account {} has been banned, resp.text is: {}'.format(name_cookies[0], resp.text))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    continue
                else:
                    # update adver_times
                    Cookies.store_cookies(name_cookies[0], name_cookies[1], name_cookies[2], 1)
                    return None

            # NOTE: the original source keeps the page/ban verification logic from get_page here,
            # commented out; it is not executed when sending direct messages.
        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('exceptions happen when send_personal_message {}, specific infos are {}'.format(target_uid, e))
            count += 1
            time.sleep(excp_interal)
        else:
            # Urls.store_crawl_url(url, 1)
            # return page
            return None

    crawler.warning('max tries for {}, check the target_uid in redis db2'.format(target_uid))
    # Urls.store_crawl_url(url, 0)
    return ''
def _get_current_reposts(url, session, weibo_mid):
    """
    Modified main crawling routine. Since weibo rate limits are quite strict, only the current weibo
    and its child reposts are crawled; the root weibo is not.
    """
    spread_other_caches = list()
    spread_others = list()
    spread_other_and_caches = list()

    html = get_page(url, session, headers)
    reposts = status_parse.get_repostcounts(html)
    comments = status_parse.get_commentcounts(html)

    # update the repost and comment counts in the weibo_search_data table
    weibosearch_dao.update_repost_comment(mid=weibo_mid, reposts=reposts, comments=comments)

    if not basic.is_404(html):
        root_url = url
        mid = status_parse.get_mid(html)
        user_id = status_parse.get_userid(html)
        user_name = status_parse.get_username(html)
        post_time = status_parse.get_statustime(html)
        device = status_parse.get_statussource(html)
        comments_count = status_parse.get_commentcounts(html)
        reposts_count = status_parse.get_repostcounts(html)
        root_user = user.get_profile(user_id, session, headers)

        spread_original_dao.save(root_user, mid, post_time, device, reposts_count, comments_count, root_url)
        crawler.info('this weibo has {counts} reposts'.format(counts=reposts_count))

        if reposts_count > 0:
            base_url = 'http://weibo.com/aj/v6/mblog/info/big?ajwvr=6&id={mid}&page={currpage}'
            soc = SpreadOtherCache()
            soc.set_id(user_id)
            soc.set_name(user_name)
            spread_other_caches.append(soc)
            page = 1
            ajax_url = base_url.format(mid=mid, currpage=page)
            source = get_page(ajax_url, session, headers, False)

            crawler.info('the repost info url for this run is: ' + ajax_url)

            try:
                repost_json = json.loads(source)
                total_page = int(repost_json['data']['page']['totalpage'])
            except Exception as why:
                parser.error('an exception happened while parsing the repost info of {url} with json, details: {why}'.format(
                    url=ajax_url, why=why))
            else:
                page = total_page
                page_counter = 0

                while page > 0 and page_counter < page_max:
                    ajax_url = base_url.format(mid=mid, currpage=page)
                    repost_info = get_page(ajax_url, session, headers, False)
                    try:
                        repost_json = json.loads(repost_info)
                        repost_html = repost_json['data']['html']
                    except Exception as why:
                        parser.error('an exception happened while parsing the repost info of {url} with json, details: {why}'.format(
                            url=ajax_url, why=why))
                    else:
                        repost_urls = status_parse.get_reposturls(repost_html)

                        # ordering logic for the repost nodes
                        for repost_url in repost_urls:
                            repost_cont = status.get_status_info(
                                repost_url, session, user_id, user_name, headers, mid)

                            if repost_cont is not None:
                                spread_other_and_caches.append(repost_cont)

                        for soac in spread_other_and_caches:
                            if soac.get_so().id != '':
                                spread_others.append(soac.get_so())
                                spread_other_caches.append(soac.get_soc())
                    finally:
                        print('now on page {currpage}'.format(currpage=page))
                        page -= 1
                        page_counter += 1

                for so in spread_others:
                    if so.verify_type == '':
                        so.verify_type = 0

                    for i in spread_other_caches:
                        if so.upper_user_name == i.get_name():
                            so.upper_user_id = i.get_id()
                            break
                    else:
                        so.upper_user_id = user_id

                spread_others = list(set(spread_others))
                spread_other_dao.save(spread_others)
                crawler.info('{num} repost records were fetched; repost crawling for this weibo is done'.format(
                    num=len(spread_others)))
    else:
        crawler.info('{url} is a 404 page'.format(url=url))
def send_jd_seckill_task(jd_user_string, address_string, task_id, skuId, netproxy):
    """
    """
    s = requests.session()
    s.timeout = session_timeout
    s.proxies = netproxy

    jd_user_json = json.loads(jd_user_string)
    address_json = json.loads(address_string)

    cookies_encode = jd_user_json['cookies'].encode()
    cookies_decode = base64.b64decode(cookies_encode).decode()
    # cookies_dict = json.loads(cookies_decode)
    rawdata = '__jdv=122270672|direct|-|none|-|1504798597931; o2-webp=true; TrackID=1d8yuf-8hCib8xjpwDjMwOLGCD0gmGtLEjJFNZQwBIvwskJdwUNnq1kiTmBcsfXw2nATZkxctFmE3r1fN0yVk9egAz0M5KDHytNxuRLuHtOk; pinId=7iwdYGSz99W1ffsfn98I-w; pin=xtuyaowu; thor=C3888A1807C299F45E21294E559BB739649F3F90C26DB309D58688491645C60E7745B49FBD8CD722E210B31A2EE861DAF9C0782F8A06AAF23606C377C1953E40B92BA29EED15FF5F57F2A0165047E0C44F71D5CA5FF000281EC43042F0403E24E8A7B703856EC818D09300F82CB14986EF55754C61CA47D6A3F1A6ADE7E1FE0B99D7576D0BD2721B0E8F279EE5980A2B; _tp=gs6zPQLXL133eDDGdm%2Bv%2Fg%3D%3D; _pst=xtuyaowu; ceshi3.com=000; __jda=122270672.15047985979311779686273.1504798598.1504798598.1504798598.1; __jdb=122270672.3.15047985979311779686273|1.1504798598; __jdc=122270672; __jdu=15047985979311779686273'
    cookie = SimpleCookie()
    cookie.load(cookies_decode)

    # Even though SimpleCookie is dictionary-like, it internally uses a Morsel object
    # which is incompatible with requests. Manually construct a dictionary instead.
    cookies = {}
    for key, morsel in cookie.items():
        cookies[key] = morsel.value

    crawler.info('the send_jd_seckill_task jd_user is {uid}'.format(uid=str(jd_user_string)))
    celery_stask_status = 7
    try:
        # first request: fetch the usual address list
        resp = s.get('https://marathon.jd.com/async/getUsualAddressList.action?skuId=' + str(skuId),
                     headers=headers, cookies=cookies, timeout=time_out, verify=False)
        # Sample response:
        # [{"name": "冷月", "id": 138356479, "addressDetail": "广州外国语学校-凤凰大道 丰巢快递柜",
        #   "provinceId": 19, "cityId": 1601, "countyId": 50259, "townId": 51886, "mobile": "",
        #   "provinceName": "广东", "cityName": "广州市", "countyName": "南沙区",
        #   "mobileKey": "5fe7bdd8ce0aa7af84b7d1380d8141a3", "email": "", "townName": "城区",
        #   "mobileWithXing": "131****5409"},
        #  {"name": "冷月", "id": 138359040, "addressDetail": "中信香樟墅1街12号",
        #   "provinceId": 19, "cityId": 1601, "countyId": 50284, "townId": 50451, "mobile": "",
        #   "provinceName": "广东", "cityName": "广州市", "countyName": "增城区",
        #   "mobileKey": "5fe7bdd8ce0aa7af84b7d1380d8141a3", "email": "", "townName": "中新镇",
        #   "mobileWithXing": "131****5409"}]

        # todo: validate the response of the first request
        if not resp.text:
            save_task_monitor(task_id, celery_stask_status, "do not contain address")
            return None
        if '登录' in resp.text:
            save_task_monitor(task_id, celery_stask_status, "cookies are invalid")
            return None

        address_list = json.loads(resp.text)
        if len(address_list) > 0:
            address_dict = address_list[0]

            if 'addressDetail' not in address_dict:
                crawler.warning('task_id {} has been banned, resp.text is: {}'.format(task_id, resp.text))
                save_task_monitor(task_id, celery_stask_status, resp.text)
                return None

        # todo: seckill request, the parameters still need to be confirmed
        resp = s.post('https://marathon.jd.com/seckill/submitOrder.action?skuId=' + str(skuId) + '&vid= HTTP/1.1',
                      data={'orderParam.name': address_dict['name'],
                            'orderParam.addressDetail': address_dict['addressDetail'],
                            'orderParam.mobile': address_dict['mobileWithXing'],
                            'orderParam.email': address_dict['email'],
                            'orderParam.provinceId': address_dict['provinceId'],
                            'orderParam.cityId': address_dict['cityId'],
                            'orderParam.countyId': address_dict['countyId'],
                            'orderParam.townId': address_dict['townId'],
                            'orderParam.paymentType': 4,
                            'orderParam.password': '',
                            'orderParam.invoiceTitle': 4,
                            'orderParam.invoiceContent': 1,
                            'orderParam.invoiceCompanyName': '',
                            'orderParam.invoiceTaxpayerNO': '',
                            'orderParam.usualAddressId': address_dict['id'],
                            'skuId': skuId,
                            'num': 1,
                            'orderParam.provinceName': address_dict['provinceName'],
                            'orderParam.cityName': address_dict['cityName'],
                            'orderParam.countyName': address_dict['countyName'],
                            'orderParam.townName': address_dict['townName'],
                            'orderParam.codTimeType': 3,
                            'orderParam.mobileKey': address_dict['mobileKey'],
                            'eid': jd_user_json['eid'],
                            'fp': jd_user_json['fp']},
                      cookies=cookies, headers=personal_message_headers)

        # validate the seckill response
        if "//marathon.jd.com/koFail.html?reason=" in resp.text:
            crawler.warning('task_id {} has been banned, resp.text is: {}'.format(task_id, resp.text))
        else:
            celery_stask_status = 8
    except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
        print(e)
        crawler.warning('exceptions happen when task_id {}, specific infos are {}'.format(task_id, e))
        time.sleep(excp_interal)
        dbc = class_MongoDB.MongoClient(uri, class_logger.getLogger('MongoDB_Users'), 'JD')
        dbc.setUnique('Users', 'username')
        dbc.update('Users', {'username': jd_user_json['username']}, {'status': 2})

    save_task_monitor(task_id, celery_stask_status, resp.text)
    return ''
def get_page(url, user_verify=True, need_login=True):
    """
    :param url: url to be crawled
    :param user_verify: True if this page may show a verification code (ajax urls never do;
        requests for weibo pages or user info may); False when crawling the repost ajax urls
    :param need_login: whether the page requires login; setting it properly reduces the pressure on accounts
    :return: the fetched page; return '' on 404, 403 or any other exception
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0

    while count < max_retries:
        if need_login:
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning('no cookies in the cookies pool, please check whether the accounts are normal')
                other.warning('shutting down the crawler...')
                if 'win32' in sys.platform:
                    os.popen('taskkill /F /IM "celery*"')
                else:
                    os.popen('pkill -f "celery"')
        try:
            if need_login:
                resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=time_out, verify=False)

                if "$CONFIG['islogin'] = '******'" in resp.text:
                    crawler.warning('account {} is abnormal'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    continue
            else:
                resp = requests.get(url, headers=headers, timeout=time_out, verify=False)

            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                continue

            # sleep after every request to reduce the risk of the account being banned
            time.sleep(interal)

            if user_verify:
                if 'unfreeze' in resp.url or 'accessdeny' in resp.url or 'userblock' in resp.url or is_403(page):
                    crawler.warning('account {} has been frozen'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if 'verifybmobile' in resp.url:
                    crawler.warning('account {} has been locked and needs to be unlocked with a phone'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], -1)
                    Cookies.delete_cookies(name_cookies[0])
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('the url {url} does not exist'.format(url=url))
                    return ''
        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('an exception happened while crawling {}, details: {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)
        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('max retries reached for {}; check the url in the failed queue in redis'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
def get_page(url, user_verify=True, need_login=True):
    """
    :param url: url to be crawled
    :param user_verify: True if this page may show a verification code (ajax urls never do;
        requests for weibo pages or user info may); False when crawling the repost ajax urls
    :param need_login: whether the page requires login; setting it properly reduces the pressure on accounts
    :return: the fetched page; return '' on 404, 403 or any other exception
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0
    latest_name_cookies = None

    while count < max_retries:
        if need_login:
            # switch to different cookies on every retry, different from the previous ones
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning('no cookies in the cookies pool, checking whether any account is available')
                rs = get_login_info()

                if len(rs) == 0:
                    crawler.error('no account is available, please check the health of the accounts')
                    # kill all celery-related processes
                    if 'win32' in sys.platform:
                        os.popen('taskkill /F /IM "celery*"')
                    else:
                        os.popen('pkill -f "celery"')
                else:
                    # If there are usable accounts, log in with them. A local call is used here, which may not
                    # be very reasonable: if the login queue is not on this machine the call has no effect, but
                    # with a network call it is hard to guarantee the task will not land on a node in an unusual
                    # login location; a better way around the remote-login restriction is still to be found.
                    # TODO weigh using a network call against calling login.get_session() directly; the current
                    # approach is probably not ideal. Captchas on login nodes are not handled for now; for
                    # large-scale account login, the login_queue nodes should be placed where the accounts
                    # usually log in.
                    crawler.info('fetching cookies again...')
                    login.excute_login_task()
                    time.sleep(10)

            if name_cookies == latest_name_cookies:
                continue

            latest_name_cookies = name_cookies
        try:
            if need_login:
                resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=time_out, verify=False)

                if "$CONFIG['islogin'] = '******'" in resp.text:
                    crawler.warning('account {} is abnormal'.format(name_cookies[0]))
                    freeze_account(name_cookies[0])
                    Cookies.delete_cookies(name_cookies[0])
                    continue
            else:
                resp = requests.get(url, headers=headers, timeout=time_out, verify=False)

            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                continue

            # sleep after every request to reduce the risk of the account being banned
            time.sleep(interal)

            if user_verify:
                if 'unfreeze' in resp.url or 'accessdeny' in resp.url or is_403(page):
                    crawler.warning('account {} has been frozen'.format(name_cookies[0]))
                    freeze_account(name_cookies[0])
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('the url {url} does not exist'.format(url=url))
                    return ''
        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('an exception happened while crawling {}, details: {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)
        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('max retries reached for {}; check the url in the failed queue in redis'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''