Example #1
    def retry_crawl(self, data):
        """ 如果被禁爬,重试 """
        r = get_redis()
        retry = data.get('retry', 0)

        if data.get('kind') == KIND_DETAIL:
            if retry >= 20:
                return
            data = {
                'kind': data['kind'],
                'url': data['url'],
                'retry': retry + 1
            }
        elif data.get('kind') == KIND_KEYWORD:
            if retry >= 3:
                return
            data = {
                'kind': data['kind'],
                'word': data['word'],
                'retry': retry + 1
            }
        else:
            if retry >= 3:
                return
            data = {
                'kind': data['kind'],
                'wechat_id': data['wechat_id'],
                'wechatid': data['wechatid'],
                'retry': retry + 1
            }

        r.lpush(settings.CRAWLER_CONFIG["downloader"], json.dumps(data))
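Note: every snippet in this listing calls a get_redis() helper that is never shown. A minimal sketch of what it could look like, assuming the standard redis-py client; the REDIS_URL setting name and the default URL are assumptions, not part of the project:

import redis
from django.conf import settings

_client = None

def get_redis():
    # Sketch only: lazily create and reuse a single redis-py client.
    # The project's real helper is not shown in this listing.
    global _client
    if _client is None:
        _client = redis.from_url(
            getattr(settings, 'REDIS_URL', 'redis://localhost:6379/0'))
    return _client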
Example #2
    def log_antispider(self):
        """
        Record how many times crawling was banned within the last hour
        """
        r = get_redis()
        if r.incr(CRAWLER_CONFIG['antispider']) <= 1:
            r.expire(CRAWLER_CONFIG['antispider'], 3600)
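The incr/expire pairing works because incr returns the counter's new value: only the call that creates the key (the one that returns 1) sets the one-hour TTL, so the count covers a window that starts at the first ban and resets when the key expires. A sketch of how a caller might consult the counter; the function name and threshold are assumptions:

def too_many_bans(threshold=100):
    # Sketch: read the counter maintained by log_antispider(). The key
    # expires an hour after the first ban, so a missing key reads as
    # zero recent bans.
    r = get_redis()
    count = int(r.get(CRAWLER_CONFIG['antispider']) or 0)
    return count >= threshold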
Example #3
File: views.py Project: Kpassionate/woas
def topic_add(request):
    url = request.POST.get('url', '')
    if url.startswith(('http://mp.weixin.qq.com/',
                       'https://mp.weixin.qq.com/')):
        data = {'kind': KIND_DETAIL, 'url': url}

        r = get_redis()
        r.rpush(settings.CRAWLER_CONFIG["downloader"], json.dumps(data))
        messages.success(request, 'The link has been submitted to the crawler; check back later for the result.')
    else:
        # print("错误")
        messages.error(request, 'url 错误, 添加失败')
    return redirect(reverse('wechat.topic_list'))
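One detail worth noticing: this view enqueues new links with rpush, while retry_crawl in example #1 re-enqueues failed jobs with lpush, so fresh submissions and retries enter the downloader queue from opposite ends. How that orders the work depends on whether the downloader, which is not shown here, pops with blpop or brpop.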
Example #4
    def run(self):
        r = get_redis()
        if settings.CRAWLER_DEBUG:
            r.delete(settings.CRAWLER_CONFIG["processor"])
        while True:
            try:
                rsp = r.brpop(settings.CRAWLER_CONFIG["processor"])
            except Exception as e:
                print(e)
                continue

            data = json.loads(rsp[1])
            logger.info(json.dumps(data, ensure_ascii=False))
            self.process(data)
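brpop blocks until an item arrives and returns a (key, value) pair, which is why the payload is rsp[1]. The decoded dict then goes to self.process, which is not part of this listing; a hypothetical stub, assuming the processor queue carries the same 'kind' field as the downloader payloads in example #1 (all handler names are invented for illustration):

    def process(self, data):
        # Sketch: route by the 'kind' field used by retry_crawl() above.
        kind = data.get('kind')
        if kind == KIND_DETAIL:
            self.handle_detail(data['url'])        # hypothetical handler
        elif kind == KIND_KEYWORD:
            self.handle_keyword(data['word'])      # hypothetical handler
        else:
            self.handle_wechat(data['wechat_id'])  # hypothetical handler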
Example #5
File: views.py Project: Kpassionate/woas
def api_topic_add(request):
    url = request.POST.get('url', '')
    logging.error(url)
    if url.startswith(('http://mp.weixin.qq.com/',
                       'https://mp.weixin.qq.com/')):
        data = {'kind': KIND_DETAIL, 'url': url}

        r = get_redis()
        r.rpush(settings.CRAWLER_CONFIG["downloader"], json.dumps(data))
        return JsonResponse({'ret': 0, 'message': 'Submitted: the link has been handed to the crawler, check back later for the result'})
    else:
        return JsonResponse({
            'ret': 1,
            'message': 'Submission failed: url must start with http://mp.weixin.qq.com/'
        })
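A quick way to exercise this endpoint from a script (the /api/topic/add/ route is an assumption; only the POST field name 'url' comes from the view itself):

import requests

resp = requests.post(
    'http://localhost:8000/api/topic/add/',
    data={'url': 'https://mp.weixin.qq.com/s?__biz=example'})
print(resp.json())  # {'ret': 0, 'message': '...'} on success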
Example #6
File: views.py Project: Kpassionate/woas
def index(request):
    context = {}
    params = request.GET.copy()
    status = params.get('status', None)
    if status is None:
        _obj_list = Wechat.objects.filter().order_by('-id')
    else:
        _obj_list = Wechat.objects.filter(status=status).order_by('-id')

    paginator = Paginator(_obj_list, 50)  # Show 50 items per page

    page = request.GET.get('page')
    try:
        _objs = paginator.page(page)
    except PageNotAnInteger:
        # If page is not an integer, deliver first page.
        _objs = paginator.page(1)
    except EmptyPage:
        # If page is out of range (e.g. 9999), deliver last page of results.
        _objs = paginator.page(paginator.num_pages)

    r = get_redis()
    # Check the proxy status
    proxies = Proxy.objects.filter(kind=Proxy.KIND_DOWNLOAD,
                                   status=Proxy.STATUS_SUCCESS)[:1]
    # print(proxies)
    if len(proxies) > 0:
        dt = datetime.now() - proxies[0].update_time
        _proxy_status = 'normal' if dt.total_seconds() < 3600 else 'abnormal'
    else:
        _proxy_status = 'abnormal'
    context.update({
        "active_nav": "wechats",
        "wechats": _objs,
        "params": params,
        "downloader": r.llen(CRAWLER_CONFIG['downloader']) or 0,
        "antispider": r.get(CRAWLER_CONFIG['antispider']) or 0,
        "proxy_status": _proxy_status
    })

    return render(request, 'wechat/index.html', context=context)
Example #7
    def __init__(self):
        self.redis = get_redis()