def retry_crawl(self, data):
    """ Retry a task that was blocked by anti-spider measures. """
    r = get_redis()
    retry = data.get('retry', 0)
    if data.get('kind') == KIND_DETAIL:
        # Article-detail tasks are cheap, so allow more retries.
        if retry >= 20:
            return
        data = {
            'kind': data['kind'],
            'url': data['url'],
            'retry': retry + 1
        }
    elif data.get('kind') == KIND_KEYWORD:
        if retry >= 3:
            return
        data = {
            'kind': data['kind'],
            'word': data['word'],
            'retry': retry + 1,
            'user_hobby_id': data['user_hobby_id'],
            'crawl_source': data['crawl_source']
        }
    else:
        if retry >= 3:
            return
        data = {
            'kind': data['kind'],
            'wechat_id': data['wechat_id'],
            'wechatid': data['wechatid'],
            'retry': retry + 1
        }
    r.lpush(settings.CRAWLER_CONFIG["downloader"], json.dumps(data))
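# The snippets in this section share a few names that are not defined here:
# the task-kind constants and a get_redis() helper. A minimal sketch of what
# they might look like (the constant values, settings names, and connection
# details are assumptions, not the project's actual definitions):
import json
import redis
from django.conf import settings

KIND_NORMAL = 1   # crawl an official account's article list (assumed value)
KIND_KEYWORD = 2  # crawl search results for a keyword (assumed value)
KIND_DETAIL = 3   # crawl a single article URL (assumed value)

def get_redis():
    # One connection per call; the real helper may reuse a pooled client.
    return redis.StrictRedis(host=getattr(settings, 'REDIS_HOST', 'localhost'),
                             port=getattr(settings, 'REDIS_PORT', 6379),
                             db=0)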
def topic_add(request):
    url = request.POST.get('url', '')
    if url.startswith(('http://mp.weixin.qq.com/', 'https://mp.weixin.qq.com/')):
        data = {'kind': KIND_DETAIL, 'url': url}
        r = get_redis()
        # rpush + brpop means user-submitted links jump ahead of lpush'd tasks.
        r.rpush(settings.CRAWLER_CONFIG["downloader"], json.dumps(data))
        messages.success(request, 'The link has been handed to the crawler; check the crawl results later.')
    else:
        messages.error(request, 'Invalid url, submission failed')
    return redirect(reverse('wechat.topic_list'))
def now_do(request, id_):
    # Queue an immediate crawl for one keyword.
    r = get_redis()
    word_record = get_object_or_404(Word, pk=id_)
    data = {
        'kind': KIND_KEYWORD,
        'word': word_record.text,
        'user_hobby_id': word_record.user_hobby_id,
        'crawl_source': word_record.crawl_source
    }
    r.lpush(settings.CRAWLER_CONFIG["downloader"], json.dumps(data))
    # Fall back to the site root if no ?next= parameter was given.
    next_page = request.GET.get('next') or '/'
    return redirect(next_page)
def run(self):
    r = get_redis()
    if settings.CRAWLER_DEBUG:
        # Start from a clean queue when debugging.
        r.delete(settings.CRAWLER_CONFIG["processor"])
    while True:
        try:
            rsp = r.brpop(settings.CRAWLER_CONFIG["processor"])
        except Exception as e:
            logger.exception(e)
            continue
        data = json.loads(rsp[1])
        logger.info(json.dumps(data, encoding="UTF-8", ensure_ascii=False))
        self.process(data)
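# The loop above blocks on the "processor" queue and dispatches each decoded
# task to self.process(). A minimal way to run it as a standalone worker,
# assuming the enclosing class is called Processor (the class name and module
# layout are assumptions):
if __name__ == '__main__':
    Processor().run()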
def api_topic_add(request):
    url = request.POST.get('url', '')
    logging.error(url)
    if url.startswith(('http://mp.weixin.qq.com/', 'https://mp.weixin.qq.com/')):
        data = {'kind': KIND_DETAIL, 'url': url}
        r = get_redis()
        r.rpush(settings.CRAWLER_CONFIG["downloader"], json.dumps(data))
        return JsonResponse({'ret': 0, 'message': 'Submitted; the link has been handed to the crawler, check the crawl results later'})
    else:
        return JsonResponse({
            'ret': 1,
            'message': 'Submission failed: the url must start with http://mp.weixin.qq.com/'
        })
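# A hypothetical client call for api_topic_add; the route path is an
# assumption, only the POST parameter name 'url' comes from the view above:
#
#   curl -X POST -d 'url=https://mp.weixin.qq.com/s?__biz=...' \
#        http://localhost:8000/wechat/api/topic/add/
#
# On success the view answers {'ret': 0, ...}; on a bad url, {'ret': 1, ...}.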
def run(self):
    r = get_redis()
    if settings.CRAWLER_DEBUG:
        r.delete(settings.CRAWLER_CONFIG["downloader"])
    while True:
        now = datetime.now()
        # Official accounts due for crawling
        wechats = Wechat.objects.filter(
            frequency__gt=0,
            next_crawl_time__lt=now,
            status=Wechat.STATUS_DEFAULT).order_by('-id')
        for item in wechats:
            data = {
                'kind': KIND_NORMAL,
                'wechat_id': item.id,
                'wechatid': item.wechatid
            }
            r.lpush(settings.CRAWLER_CONFIG["downloader"], json.dumps(data))
            # Schedule the next crawl
            item.next_crawl_time = now + timedelta(minutes=item.frequency)
            item.save()
            logging.debug(data)
        # Keywords due for crawling
        keywords = Word.objects.filter(
            frequency__gt=0, next_crawl_time__lt=now).order_by('-id')
        for item in keywords:
            data = {
                'kind': KIND_KEYWORD,
                'word': item.text,
                # Include the fields retry_crawl() expects for keyword tasks.
                'user_hobby_id': item.user_hobby_id,
                'crawl_source': item.crawl_source
            }
            r.lpush(settings.CRAWLER_CONFIG["downloader"], json.dumps(data))
            # Schedule the next crawl
            item.next_crawl_time = now + timedelta(minutes=item.frequency)
            item.save()
            logging.debug(data)
        time.sleep(1)
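# The scheduler's counterpart is a downloader worker that drains the same
# queue. Since LPUSH adds at the head and BRPOP pops from the tail, the queue
# behaves as FIFO for scheduled tasks. A hedged sketch of the consuming side
# (downloader_loop and its internals are assumptions; the real downloader's
# fetch logic is not shown in this section):
def downloader_loop():
    r = get_redis()
    while True:
        _, raw = r.brpop(settings.CRAWLER_CONFIG["downloader"])
        task = json.loads(raw)
        # ... fetch according to task['kind'], then push the result onto the
        # "processor" queue for the processing loop shown earlier.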
def api_topic_add(request): url = request.POST.get('url', '') logging.error(url) if url.startswith('http://mp.weixin.qq.com/') or url.startswith('https://mp.weixin.qq.com/') : data = { 'kind': KIND_DETAIL, 'url': url } r = get_redis() r.rpush(settings.CRAWLER_CONFIG["downloader"], json.dumps(data)) return JsonResponse({ 'ret': 0, 'message': '提交成功,链接已经提交给爬虫,稍后查看爬取结果' }) else: return JsonResponse({ 'ret': 1, 'message': '提交失败,url必须以 http://mp.weixin.qq.com/ 开头' })
def index(request):
    context = {}
    params = request.GET.copy()
    status = params.get('status', None)
    if status is None:
        _obj_list = Wechat.objects.filter().order_by('-id')
    else:
        _obj_list = Wechat.objects.filter(status=status).order_by('-id')
    paginator = Paginator(_obj_list, 50)  # Show 50 accounts per page
    page = request.GET.get('page')
    try:
        _objs = paginator.page(page)
    except PageNotAnInteger:
        # If page is not an integer, deliver first page.
        _objs = paginator.page(1)
    except EmptyPage:
        # If page is out of range (e.g. 9999), deliver last page of results.
        _objs = paginator.page(paginator.num_pages)
    r = get_redis()
    # Proxy health: a download proxy updated within the last hour counts as OK.
    proxies = Proxy.objects.filter(kind=Proxy.KIND_DOWNLOAD,
                                   status=Proxy.STATUS_SUCCESS)[:1]
    if len(proxies) > 0:
        dt = datetime.now() - proxies[0].update_time
        _proxy_status = 'OK' if dt.total_seconds() < 3600 else 'Abnormal'
    else:
        _proxy_status = 'Abnormal'
    context.update({
        "active_nav": "wechats",
        "wechats": _objs,
        "params": params,
        "downloader": r.llen(CRAWLER_CONFIG['downloader']) or 0,
        "antispider": r.get(CRAWLER_CONFIG['antispider']) or 0,
        "proxy_status": _proxy_status
    })
    return render_to_response('wechat/index.html', RequestContext(request, context))
def __init__(self):
    self.redis = get_redis()
def log_antispider(self):
    """ Count how many crawls were blocked within the last hour. """
    r = get_redis()
    # Set the 1-hour expiry only when the counter is first created.
    if r.incr(CRAWLER_CONFIG['antispider']) <= 1:
        r.expire(CRAWLER_CONFIG['antispider'], 3600)
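# One plausible consumer of this counter: back off when blocks pile up. The
# dashboard view above already reads it with r.get(); a scheduler could guard
# on it the same way (the helper name and threshold here are assumptions):
def antispider_triggered(r, threshold=10):
    count = int(r.get(CRAWLER_CONFIG['antispider']) or 0)
    return count >= threshold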