import json
import logging
from datetime import datetime

from django.conf import settings
from django.contrib import messages
from django.core.paginator import Paginator, PageNotAnInteger, EmptyPage
from django.http import JsonResponse
from django.shortcuts import redirect, render
from django.urls import reverse  # on Django < 1.10: from django.core.urlresolvers import reverse

# NOTE: these functions are collected from more than one module of the
# project. Project-internal names (get_redis, logger, KIND_DETAIL,
# KIND_KEYWORD, CRAWLER_CONFIG, Wechat, Proxy) are assumed to be importable
# from the app's own modules.


def retry_crawl(self, data):
    """Re-queue a task after the crawler was blocked by anti-spider measures."""
    r = get_redis()
    retry = data.get('retry', 0)
    if data.get('kind') == KIND_DETAIL:
        # Article-detail tasks are retried up to 20 times.
        if retry >= 20:
            return
        data = {
            'kind': data['kind'],
            'url': data['url'],
            'retry': retry + 1,
        }
    elif data.get('kind') == KIND_KEYWORD:
        # Keyword-search tasks are retried up to 3 times.
        if retry >= 3:
            return
        data = {
            'kind': data['kind'],
            'word': data['word'],
            'retry': retry + 1,
        }
    else:
        # Account tasks are retried up to 3 times; both id keys are
        # forwarded unchanged, as in the original payload.
        if retry >= 3:
            return
        data = {
            'kind': data['kind'],
            'wechat_id': data['wechat_id'],
            'wechatid': data['wechatid'],
            'retry': retry + 1,
        }
    r.lpush(settings.CRAWLER_CONFIG["downloader"], json.dumps(data))

def log_antispider(self):
    """Count the anti-spider blocks seen within a one-hour window."""
    r = get_redis()
    # Set the TTL only on the first increment, so the window is a fixed
    # hour starting from the first recorded block.
    if r.incr(CRAWLER_CONFIG['antispider']) <= 1:
        r.expire(CRAWLER_CONFIG['antispider'], 3600)

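# A minimal sketch of how a downloader might use the two helpers above when
# it detects a ban. `handle_blocked` and `is_blocked` are assumed names for
# illustration only; the project's real detection hook is not shown here.
def handle_blocked(self, task, response):
    if is_blocked(response):    # hypothetical anti-spider detection
        self.log_antispider()   # bump the hourly block counter
        self.retry_crawl(task)  # re-queue the task with retry count + 1
        return True
    return False
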
def topic_add(request):
    url = request.POST.get('url', '')
    if url.startswith('http://mp.weixin.qq.com/') or url.startswith(
            'https://mp.weixin.qq.com/'):
        data = {'kind': KIND_DETAIL, 'url': url}
        r = get_redis()
        r.rpush(settings.CRAWLER_CONFIG["downloader"], json.dumps(data))
        messages.success(request, 'The link has been submitted to the crawler; check the results later.')
    else:
        messages.error(request, 'Invalid url, failed to add')
    return redirect(reverse('wechat.topic_list'))

def run(self):
    r = get_redis()
    if settings.CRAWLER_DEBUG:
        # In debug mode, start from an empty queue.
        r.delete(settings.CRAWLER_CONFIG["processor"])
    while True:
        try:
            # Block until a task arrives on the processor queue.
            rsp = r.brpop(settings.CRAWLER_CONFIG["processor"])
        except Exception as e:
            logger.exception(e)
            continue
        data = json.loads(rsp[1])
        # json.dumps() no longer accepts an `encoding` argument on Python 3;
        # ensure_ascii=False keeps Chinese text readable in the log.
        logger.info(json.dumps(data, ensure_ascii=False))
        self.process(data)

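# A minimal sketch of the producer side of the queue that run() consumes,
# assuming the downloader lpush-es JSON payloads onto the same Redis list
# (lpush + brpop gives FIFO order). The payload shape is illustrative only.
def push_result(result):
    r = get_redis()
    r.lpush(settings.CRAWLER_CONFIG["processor"],
            json.dumps(result, ensure_ascii=False))

# push_result({'kind': KIND_DETAIL, 'url': 'https://mp.weixin.qq.com/s/example'})
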
def api_topic_add(request):
    url = request.POST.get('url', '')
    logging.error(url)  # logs every submitted URL at error level (likely debug leftover)
    if url.startswith('http://mp.weixin.qq.com/') or url.startswith(
            'https://mp.weixin.qq.com/'):
        data = {'kind': KIND_DETAIL, 'url': url}
        r = get_redis()
        r.rpush(settings.CRAWLER_CONFIG["downloader"], json.dumps(data))
        return JsonResponse({
            'ret': 0,
            'message': 'Submitted successfully; the link has been handed to the crawler, check the results later'
        })
    else:
        return JsonResponse({
            'ret': 1,
            'message': 'Submission failed; url must start with http://mp.weixin.qq.com/'
        })

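# A hedged usage sketch: submitting an article link over HTTP with the
# `requests` library. The /api/topic/add/ path and host are assumptions;
# the actual URL route for api_topic_add is not shown in this section.
#
#   import requests
#   resp = requests.post('http://localhost:8000/api/topic/add/',
#                        data={'url': 'https://mp.weixin.qq.com/s/example'})
#   print(resp.json())  # {'ret': 0, 'message': '...'} on success
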
def index(request):
    context = {}
    params = request.GET.copy()
    status = params.get('status', None)
    if status is None:
        _obj_list = Wechat.objects.all().order_by('-id')
    else:
        _obj_list = Wechat.objects.filter(status=status).order_by('-id')

    paginator = Paginator(_obj_list, 50)  # show 50 accounts per page
    page = request.GET.get('page')
    try:
        _objs = paginator.page(page)
    except PageNotAnInteger:
        # If page is not an integer, deliver the first page.
        _objs = paginator.page(1)
    except EmptyPage:
        # If page is out of range (e.g. 9999), deliver the last page.
        _objs = paginator.page(paginator.num_pages)

    r = get_redis()

    # Proxy health: treat the pool as healthy if a successful download
    # proxy has been updated within the last hour.
    proxies = Proxy.objects.filter(kind=Proxy.KIND_DOWNLOAD,
                                   status=Proxy.STATUS_SUCCESS)[:1]
    if len(proxies) > 0:
        dt = datetime.now() - proxies[0].update_time
        _proxy_status = 'OK' if dt.total_seconds() < 3600 else 'abnormal'
    else:
        _proxy_status = 'abnormal'

    context.update({
        "active_nav": "wechats",
        "wechats": _objs,
        "params": params,
        "downloader": r.llen(CRAWLER_CONFIG['downloader']) or 0,
        "antispider": r.get(CRAWLER_CONFIG['antispider']) or 0,
        "proxy_status": _proxy_status,
    })
    return render(request, 'wechat/index.html', context=context)

def __init__(self):
    # Reuse a single Redis connection for the lifetime of the instance.
    self.redis = get_redis()
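
# Hedged wiring sketch: the self-methods above (__init__, run, retry_crawl,
# log_antispider) read like members of one processor class; an entry point
# might look like the following, with `Processor` as an assumed class name.
#
#   if __name__ == '__main__':
#       Processor().run()  # blocks forever, consuming the processor queue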