Example #1
    def retry_crawl(self, data):
        """ 如果被禁爬,重试 """
        r = get_redis()
        retry = data.get('retry', 0)

        if data.get('kind') == KIND_DETAIL:
            if retry >= 20:
                return
            data = {
                'kind': data['kind'],
                'url': data['url'],
                'retry': retry + 1
            }
        elif data.get('kind') == KIND_KEYWORD:
            if retry >= 3:
                return
            data = {
                'kind': data['kind'],
                'word': data['word'],
                'retry': retry + 1,
                'user_hobby_id': data['user_hobby_id'],
                'crawl_source': data['crawl_source']
            }
        else:
            if retry >= 3:
                return
            data = {
                'kind': data['kind'],
                'wechat_id': data['wechat_id'],
                'wechatid': data['wechatid'],
                'retry': retry + 1
            }

        r.lpush(settings.CRAWLER_CONFIG["downloader"], json.dumps(data))
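All of these examples obtain their Redis connection through get_redis(), which is not included in this listing. A minimal sketch of what such a helper could look like, assuming redis-py and hypothetical REDIS_* settings (the settings names are illustrative, not taken from the project):

import redis

from django.conf import settings

_redis_client = None


def get_redis():
    """ Return a shared redis-py client (sketch; the settings names are assumptions) """
    global _redis_client
    if _redis_client is None:
        _redis_client = redis.StrictRedis(
            host=getattr(settings, 'REDIS_HOST', 'localhost'),
            port=getattr(settings, 'REDIS_PORT', 6379),
            db=getattr(settings, 'REDIS_DB', 0))
    return _redis_client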
Example #2
    def retry_crawl(self, data):
        """ 如果被禁爬,重试 """
        r = get_redis()
        retry = data.get('retry', 0)

        if data.get('kind') == KIND_DETAIL:
            if retry >= 20:
                return
            data = {
                'kind': data['kind'],
                'url': data['url'],
                'retry': retry + 1
            }
        elif data.get('kind') == KIND_KEYWORD:
            if retry >= 3:
                return
            data = {
                'kind': data['kind'],
                'word': data['word'],
                'retry': retry + 1
            }
        else:
            if retry >= 3:
                return
            data = {
                'kind': data['kind'],
                'wechat_id': data['wechat_id'],
                'wechatid': data['wechatid'],
                'retry': retry + 1
            }

        r.lpush(settings.CRAWLER_CONFIG["downloader"], json.dumps(data))
Example #3
def topic_add(request):
    url = request.POST.get('url', '')
    if url.startswith('http://mp.weixin.qq.com/') or url.startswith(
            'https://mp.weixin.qq.com/'):
        data = {'kind': KIND_DETAIL, 'url': url}

        r = get_redis()
        r.rpush(settings.CRAWLER_CONFIG["downloader"], json.dumps(data))
        messages.success(request, '链接已经提交给爬虫,稍后查看爬取结果.')
    else:
        messages.error(request, 'url 错误, 添加失败')
    return redirect(reverse('wechat.topic_list'))
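Note that manually submitted links are enqueued with rpush here, while the scheduler and the retry path (Examples #1, #4 and #9) use lpush. Assuming the downloader worker pops with brpop, as the processor loop in Example #5 does, rpush'd items sit at the tail and come out first, so user submissions effectively jump ahead of scheduled work. A standalone illustration of that ordering against a local Redis:

import redis

r = redis.StrictRedis()
r.delete('demo:queue')
r.lpush('demo:queue', 'scheduled-1')      # lpush adds at the head
r.lpush('demo:queue', 'scheduled-2')
r.rpush('demo:queue', 'user-submitted')   # rpush adds at the tail
# brpop removes from the tail, so the rpush'd item is served first
print r.brpop('demo:queue')[1]  # user-submitted
print r.brpop('demo:queue')[1]  # scheduled-1
print r.brpop('demo:queue')[1]  # scheduled-2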
Example #4
def now_do(request, id_):
    r = get_redis()
    word_record = get_object_or_404(Word, pk=id_)
    data = {
        'kind': KIND_KEYWORD,
        'word': word_record.text,
        'user_hobby_id': word_record.user_hobby_id,
        'crawl_source': word_record.crawl_source
    }

    r.lpush(settings.CRAWLER_CONFIG["downloader"], json.dumps(data))
    next_page = request.GET.get('next')
    return redirect(next_page)
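One caveat: if the request carries no next parameter, redirect(None) will raise instead of sending the user anywhere useful. A hedged defensive variant (the fallback URL name is borrowed from Example #3 and is an assumption about where this view should land):

    next_page = request.GET.get('next') or reverse('wechat.topic_list')
    return redirect(next_page)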
Example #5
    def run(self):
        r = get_redis()
        if settings.CRAWLER_DEBUG:
            r.delete(settings.CRAWLER_CONFIG["processor"])
        while True:
            try:
                rsp = r.brpop(settings.CRAWLER_CONFIG["processor"])
            except Exception as e:
                print e
                continue

            data = json.loads(rsp[1])
            logger.info(json.dumps(data, encoding="UTF-8", ensure_ascii=False))
            self.process(data)
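As written, the loop blocks on brpop forever and relies on the bare except to survive connection errors. redis-py's brpop also accepts a timeout, which returns None when nothing arrives; a hedged variant using that, plus a guard against malformed JSON (the self.running flag is hypothetical):

    def run(self):
        r = get_redis()
        if settings.CRAWLER_DEBUG:
            r.delete(settings.CRAWLER_CONFIG["processor"])
        self.running = True  # hypothetical flag another thread could clear to stop the loop
        while self.running:
            rsp = r.brpop(settings.CRAWLER_CONFIG["processor"], timeout=5)
            if rsp is None:
                continue  # nothing queued within the timeout window
            try:
                data = json.loads(rsp[1])
            except ValueError:
                logger.warning('dropping malformed message: %r', rsp[1])
                continue
            self.process(data)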
Example #6
def topic_add(request):
    url = request.POST.get('url', '')
    if url.startswith('http://mp.weixin.qq.com/') or url.startswith('https://mp.weixin.qq.com/'):
        data = {
            'kind': KIND_DETAIL,
            'url': url
        }

        r = get_redis()
        r.rpush(settings.CRAWLER_CONFIG["downloader"], json.dumps(data))
        messages.success(request, '链接已经提交给爬虫,稍后查看爬取结果.')
    else:
        messages.error(request, 'url 错误, 添加失败')
    return redirect(reverse('wechat.topic_list'))
Example #7
    def run(self):
        r = get_redis()
        if settings.CRAWLER_DEBUG:
            r.delete(settings.CRAWLER_CONFIG["processor"])
        while True:
            try:
                rsp = r.brpop(settings.CRAWLER_CONFIG["processor"])
            except Exception as e:
                print e
                continue

            data = json.loads(rsp[1])
            logger.info(json.dumps(data, encoding="UTF-8", ensure_ascii=False))
            self.process(data)
Example #8
def api_topic_add(request):
    url = request.POST.get('url', '')
    logging.error(url)
    if url.startswith('http://mp.weixin.qq.com/') or url.startswith(
            'https://mp.weixin.qq.com/'):
        data = {'kind': KIND_DETAIL, 'url': url}

        r = get_redis()
        r.rpush(settings.CRAWLER_CONFIG["downloader"], json.dumps(data))
        return JsonResponse({'ret': 0, 'message': '提交成功,链接已经提交给爬虫,稍后查看爬取结果'})
    else:
        return JsonResponse({
            'ret': 1,
            'message': '提交失败,url必须以 http://mp.weixin.qq.com/ 开头'
        })
Example #9
    def run(self):
        r = get_redis()
        if settings.CRAWLER_DEBUG:
            r.delete(settings.CRAWLER_CONFIG["downloader"])

        while True:
            now = datetime.now()
            # Fetch the official accounts due to be crawled
            wechats = Wechat.objects.filter(
                frequency__gt=0,
                next_crawl_time__lt=now,
                status=Wechat.STATUS_DEFAULT).order_by('-id')
            for item in wechats:
                data = {
                    'kind': KIND_NORMAL,
                    'wechat_id': item.id,
                    'wechatid': item.wechatid
                }

                r.lpush(settings.CRAWLER_CONFIG["downloader"],
                        json.dumps(data))

                # Update index_rule
                item.next_crawl_time = now + timedelta(minutes=item.frequency)
                #item.next_crawl_time = now + timedelta(seconds=item.frequency)
                item.save()

                logging.debug(data)

            # Fetch the keywords due to be crawled
            keywords = Word.objects.filter(
                frequency__gt=0, next_crawl_time__lt=now).order_by('-id')
            for item in keywords:
                data = {'kind': KIND_KEYWORD, 'word': item.text}

                r.lpush(settings.CRAWLER_CONFIG["downloader"],
                        json.dumps(data))

                # Update index_rule
                item.next_crawl_time = now + timedelta(minutes=item.frequency)
                #item.next_crawl_time = now + timedelta(seconds=item.frequency)
                item.save()

                logging.debug(data)

            time.sleep(1)
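The queries above assume a few fields on the Wechat and Word models: a per-record crawl frequency in minutes, a next_crawl_time timestamp, and (for Wechat) a status flag. A minimal Django model sketch consistent with that usage; the field names come from the queries, everything else (types, max_length, default values) is an assumption:

from django.db import models


class Wechat(models.Model):
    STATUS_DEFAULT = 0  # the real constant value is an assumption

    wechatid = models.CharField(max_length=100)
    frequency = models.IntegerField(default=0)        # crawl interval in minutes; 0 disables crawling
    next_crawl_time = models.DateTimeField(null=True)
    status = models.IntegerField(default=STATUS_DEFAULT)


class Word(models.Model):
    # Example #4 also reads user_hobby_id and crawl_source from this model
    text = models.CharField(max_length=255)
    frequency = models.IntegerField(default=0)
    next_crawl_time = models.DateTimeField(null=True)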
Example #10
    def run(self):
        r = get_redis()
        if settings.CRAWLER_DEBUG:
            r.delete(settings.CRAWLER_CONFIG["downloader"])

        while True:
            now = datetime.now()
            # Fetch the official accounts due to be crawled
            wechats = Wechat.objects.filter(frequency__gt=0, next_crawl_time__lt=now, status=Wechat.STATUS_DEFAULT).order_by('-id')
            for item in wechats:
                data = {
                    'kind': KIND_NORMAL,
                    'wechat_id': item.id,
                    'wechatid': item.wechatid
                }

                r.lpush(settings.CRAWLER_CONFIG["downloader"], json.dumps(data))

                # Update index_rule
                item.next_crawl_time = now + timedelta(minutes=item.frequency)
                #item.next_crawl_time = now + timedelta(seconds=item.frequency)
                item.save()

                logging.debug(data)

            # Fetch the keywords due to be crawled
            keywords = Word.objects.filter(frequency__gt=0, next_crawl_time__lt=now).order_by('-id')
            for item in keywords:
                data = {
                    'kind': KIND_KEYWORD,
                    'word': item.text
                }

                r.lpush(settings.CRAWLER_CONFIG["downloader"], json.dumps(data))

                # Update index_rule
                item.next_crawl_time = now + timedelta(minutes=item.frequency)
                #item.next_crawl_time = now + timedelta(seconds=item.frequency)
                item.save()

                logging.debug(data)

            time.sleep(1)
Example #11
def index(request):
    context = {}
    params = request.GET.copy()
    status = params.get('status', None)
    if status is None:
        _obj_list = Wechat.objects.filter().order_by('-id')
    else:
        _obj_list = Wechat.objects.filter(status=status).order_by('-id')

    paginator = Paginator(_obj_list, 50)  # Show 50 records per page

    page = request.GET.get('page')
    try:
        _objs = paginator.page(page)
    except PageNotAnInteger:
        # If page is not an integer, deliver first page.
        _objs = paginator.page(1)
    except EmptyPage:
        # If page is out of range (e.g. 9999), deliver last page of results.
        _objs = paginator.page(paginator.num_pages)

    r = get_redis()
    # Get the proxy status
    proxies = Proxy.objects.filter(kind=Proxy.KIND_DOWNLOAD,
                                   status=Proxy.STATUS_SUCCESS)[:1]
    if len(proxies) > 0:
        dt = datetime.now() - proxies[0].update_time
        _proxy_status = '正常' if dt.total_seconds() < 3600 else '异常'
    else:
        _proxy_status = '异常'
    c = csrf(request)
    c.update({
        "active_nav": "wechats",
        "wechats": _objs,
        "params": params,
        "downloader": r.llen(CRAWLER_CONFIG['downloader']) or 0,
        "antispider": r.get(CRAWLER_CONFIG['antispider']) or 0,
        "proxy_status": _proxy_status
    })
    print c

    return render_to_response('wechat/index.html', context=c)
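The proxy check above grabs one successful download proxy and treats it as healthy if it was updated within the last hour. A hedged refactor of that check into a hypothetical helper (the order_by is an addition so the freshest proxy is the one examined):

from datetime import datetime


def proxy_is_healthy(max_age_seconds=3600):
    """ Hypothetical helper: is the freshest successful download proxy recent enough? """
    proxies = Proxy.objects.filter(
        kind=Proxy.KIND_DOWNLOAD,
        status=Proxy.STATUS_SUCCESS).order_by('-update_time')[:1]
    if not proxies:
        return False
    age = datetime.now() - proxies[0].update_time
    return age.total_seconds() < max_age_seconds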
Example #12
def api_topic_add(request):
    url = request.POST.get('url', '')
    logging.error(url)
    if url.startswith('http://mp.weixin.qq.com/') or url.startswith('https://mp.weixin.qq.com/'):
        data = {
            'kind': KIND_DETAIL,
            'url': url
        }

        r = get_redis()
        r.rpush(settings.CRAWLER_CONFIG["downloader"], json.dumps(data))
        return JsonResponse({
            'ret': 0,
            'message': '提交成功,链接已经提交给爬虫,稍后查看爬取结果'
        })
    else:
        return JsonResponse({
            'ret': 1,
            'message': '提交失败,url必须以 http://mp.weixin.qq.com/ 开头'
        })
Example #13
def index(request):
    context = {}
    params = request.GET.copy()
    status = params.get('status', None)
    if status is None:
        _obj_list = Wechat.objects.filter().order_by('-id')
    else:
        _obj_list = Wechat.objects.filter(status=status).order_by('-id')

    paginator = Paginator(_obj_list, 50)  # Show 50 records per page

    page = request.GET.get('page')
    try:
        _objs = paginator.page(page)
    except PageNotAnInteger:
        # If page is not an integer, deliver first page.
        _objs = paginator.page(1)
    except EmptyPage:
        # If page is out of range (e.g. 9999), deliver last page of results.
        _objs = paginator.page(paginator.num_pages)

    r = get_redis()
    # Get the proxy status
    proxies = Proxy.objects.filter(kind=Proxy.KIND_DOWNLOAD, status=Proxy.STATUS_SUCCESS)[:1]
    if len(proxies) > 0:
        dt = datetime.now() - proxies[0].update_time
        _proxy_status = '正常' if dt.total_seconds() < 3600 else '异常'
    else:
        _proxy_status = '异常'
    context.update({
        "active_nav": "wechats",
        "wechats": _objs,
        "params": params,
        "downloader": r.llen(CRAWLER_CONFIG['downloader']) or 0,
        "antispider": r.get(CRAWLER_CONFIG['antispider']) or 0,
        "proxy_status": _proxy_status

    })
    print context

    return render_to_response('wechat/index.html', RequestContext(request, context))
Example #14
    def __init__(self):
        self.redis = get_redis()
Example #15
    def log_antispider(self):
        """ Track how many anti-crawl blocks occurred in the last hour """
        r = get_redis()
        if r.incr(CRAWLER_CONFIG['antispider']) <= 1:
            r.expire(CRAWLER_CONFIG['antispider'], 3600)
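incr followed by a conditional expire gives a rolling one-hour ban counter: the TTL is attached only by the call that creates the key (when incr returns 1), so the count disappears an hour after the first ban in a burst, not the last. Example #11 reads it back with r.get for the dashboard. The same pattern in isolation:

r = get_redis()
key = CRAWLER_CONFIG['antispider']
if r.incr(key) <= 1:       # the first increment creates the key...
    r.expire(key, 3600)    # ...so give it a one-hour lifetime
bans_last_hour = int(r.get(key) or 0)  # what the dashboard in Example #11 reads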
Example #16
    def log_antispider(self):
        """ Track how many anti-crawl blocks occurred in the last hour """
        r = get_redis()
        if r.incr(CRAWLER_CONFIG['antispider']) <= 1:
            r.expire(CRAWLER_CONFIG['antispider'], 3600)
Example #17
    def __init__(self):
        self.redis = get_redis()
Example #18
    def log_antispider(self):
        r = get_redis()
        if r.incr(CRAWLER_CONFIG['antispider']) <= 1:
            r.expire(CRAWLER_CONFIG['antispider'], 3600)