Example #1
    def run(self):
        r = get_redis()
        if settings.CRAWLER_DEBUG:
            # In debug mode, start from an empty processor queue.
            r.delete(settings.CRAWLER_CONFIG["processor"])
        while True:
            try:
                # Block until a result arrives on the processor queue.
                rsp = r.brpop(settings.CRAWLER_CONFIG["processor"])
            except Exception as e:
                print(e)
                continue

            data = json.loads(rsp[1])
            #logger.info(json.dumps(data, encoding="UTF-8", ensure_ascii=False))
            self.process(data)
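
All of these examples rely on a shared get_redis() helper that returns a redis-py client for the crawler queues, but the helper itself is not part of the excerpts. A minimal sketch of what it could look like, assuming redis-py and Django settings (the REDIS_HOST/REDIS_PORT/REDIS_DB setting names are illustrative, not taken from the project):

    # Hypothetical helper, not shown in the excerpts: a lazily created,
    # module-level redis-py client built from assumed Django settings.
    import redis
    from django.conf import settings

    _client = None

    def get_redis():
        global _client
        if _client is None:
            _client = redis.StrictRedis(
                host=getattr(settings, "REDIS_HOST", "localhost"),
                port=getattr(settings, "REDIS_PORT", 6379),
                db=getattr(settings, "REDIS_DB", 0),
            )
        return _client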
Example #2
    def run(self):
        r = get_redis()
        if settings.CRAWLER_DEBUG:
            # In debug mode, start from an empty processor queue.
            r.delete(settings.CRAWLER_CONFIG["processor"])
        while True:
            try:
                # Block until a result arrives on the processor queue.
                rsp = r.brpop(settings.CRAWLER_CONFIG["processor"])
            except Exception as e:
                print(e)
                continue

            data = json.loads(rsp[1])
            #logger.info(json.dumps(data, encoding="UTF-8", ensure_ascii=False))
            self.process(data)
Example #3
    def run(self):
        r = get_redis()
        while True:
            now = datetime.now()
            for item in Seed.objects.filter(
                    status=Seed.STATUS_ENABLE).order_by('-weight'):
                rules = IndexRule.objects.filter(
                    seed=item,
                    status=IndexRule.STATUS_ENABLE,
                    next_crawl_time__lte=now)
                for rule in rules:
                    try:
                        detail_rule = DetailRule.objects.get(index_rule=rule)
                    except DetailRule.DoesNotExist as e:
                        print(e)
                        continue

                    base = {
                        'url': '',
                        'kind': KIND_LIST_URL,
                        "seed_id": item.pk,
                        'rule_id': rule.pk,
                        "fresh_pages": rule.fresh_pages,
                        'site_config': rule.site.get_config(),
                        'list_rules': rule.list_rules,
                        'next_url_rules': rule.next_url_rules,
                        'detail_rules': detail_rule.data,
                        'detail_exclude': detail_rule.exclude,
                        'detail_multi': detail_rule.multi,
                        'detail_multi_unique': detail_rule.multi_unique,
                        'detail_fresh_time': detail_rule.fresh_time,
                        'unique_key': item.data[0]["unique_key"]
                    }
                    for url in rule.url:
                        data = base.copy()
                        data['url'] = url
                        r.lpush(settings.CRAWLER_CONFIG["downloader"],
                                json.dumps(data))

                    # Update the index_rule's next crawl time
                    rule.next_crawl_time = now + timedelta(
                        seconds=rule.frequency)
                    rule.save()

                    logging.debug(data)

            #print r.rpop('unicrawler:urls')
            time.sleep(1)
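
The scheduler above talks to the downloader only through a Redis list: each task is serialized with json.dumps and LPUSHed, and a worker like the one in Example #1 BRPOPs it and json.loads the payload. A self-contained sketch of that round trip (the queue name "crawler:downloader" and the payload fields are illustrative, not the project's actual settings):

    # Illustrative producer/consumer round trip over a Redis list.
    import json
    import redis

    r = redis.StrictRedis()
    queue = "crawler:downloader"  # assumed; the real name comes from settings.CRAWLER_CONFIG

    # Producer side: what the scheduler does for each seed URL.
    task = {"url": "http://example.com/list", "kind": "list", "rule_id": 1}
    r.lpush(queue, json.dumps(task))

    # Consumer side: what a worker's run() loop does.
    key, raw = r.brpop(queue)
    print(json.loads(raw))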
Example #4
    def monitor_service(self):
        conf = settings.CRAWLER_CONFIG
        r = get_redis()
        now = datetime.now().replace(second=0, microsecond=0)
        # Read the length of every work queue in a single round trip.
        pipe = r.pipeline()
        result = pipe.llen(conf['downloader']).llen(conf['extractor']).llen(conf['processor']).execute()
        # Count the index rules that are currently due for crawling.
        scheduler = IndexRule.objects.filter(seed__status=Seed.STATUS_ENABLE, status=IndexRule.STATUS_ENABLE,
                                             next_crawl_time__lte=now).count()
        print(result)
        Service.objects.create(
            scheduler=scheduler,
            downloader=result[0],
            extractor=result[1],
            processor=result[2],
            create_time=now
        )
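
The timestamp here is truncated to the whole minute, which suggests monitor_service is meant to record one Service row per minute. Purely as an assumption about how it might be driven (the owning class and the use of a plain sleep loop are not shown in the excerpts), a minimal driver could be:

    # Hypothetical scheduling loop; the real project may use cron, Celery beat, etc.
    import time

    monitor = Monitor()  # assumed owner of monitor_service()
    while True:
        monitor.monitor_service()
        time.sleep(60)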
Example #5
    def run(self):
        r = get_redis()
        while True:
            now = datetime.now()
            for item in Seed.objects.filter(status=Seed.STATUS_ENABLE).order_by('-weight'):
                rules = IndexRule.objects.filter(seed=item, status=IndexRule.STATUS_ENABLE, next_crawl_time__lte=now)
                for rule in rules:
                    try:
                        detail_rule = DetailRule.objects.get(index_rule=rule)
                    except DetailRule.DoesNotExist as e:
                        print(e)
                        continue

                    base = {
                        'url': '',
                        'kind': KIND_LIST_URL,
                        "seed_id": item.pk,
                        'rule_id': rule.pk,
                        "fresh_pages": rule.fresh_pages,
                        'site_config': rule.site.get_config(),
                        'list_rules': rule.list_rules,
                        'next_url_rules': rule.next_url_rules,
                        'detail_rules': detail_rule.data,
                        'detail_exclude': detail_rule.exclude,
                        'detail_multi': detail_rule.multi,
                        'detail_multi_unique': detail_rule.multi_unique,
                        'detail_fresh_time': detail_rule.fresh_time,
                        'unique_key': item.data[0]["unique_key"]
                    }
                    for url in rule.url:
                        data = base.copy()
                        data['url'] = url
                        r.lpush(settings.CRAWLER_CONFIG["downloader"], json.dumps(data))

                    # Update the index_rule's next crawl time
                    rule.next_crawl_time = now + timedelta(seconds=rule.frequency)
                    rule.save()

                    logging.debug(data)

            #print r.rpop('unicrawler:urls')
            time.sleep(1)
Example #6
    def run(self):
        r = get_redis()
        if settings.CRAWLER_DEBUG:
            r.delete(settings.CRAWLER_CONFIG["extractor"])
        while True:
            try:
                data = r.brpop(settings.CRAWLER_CONFIG["extractor"])
            except Exception as e:
                print(e)
                continue
            #print data
            data = json.loads(data[1])
            body = data['body']
            # 1. If the page currently being parsed is a list page
            if data["kind"] == KIND_LIST_URL:
                # 1.1 First, look for detail pages
                # Check whether the detail content is already contained in the list page
                multi_rules = data['detail_multi']
                if multi_rules:
                    # 1.1.1 The details are embedded in the list page itself
                    multi_parts = self.extract(body, multi_rules, {'data': data})
                    for part in multi_parts:
                        self.get_detail(part, data)
                else:
                    # 1.1.2 The details are not in the list page; follow the list URLs to fetch them
                    detail_urls = self.extract(body, data['list_rules'], {'data': data})
                    #logger.debug('detail_urls: %s' % detail_urls)
                    for item in detail_urls:
                        item_data = {
                            "url": item,
                            'kind': KIND_DETAIL_URL,
                            'seed_id': data['seed_id'],
                            'rule_id': data['rule_id'],
                            #'fresh_pages': '',
                            #'list_rules': '',
                            #'next_url_rules': '',
                            'site_config': data['site_config'],
                            'detail_rules': data['detail_rules'],
                            'detail_exclude': data['detail_exclude'],
                            'detail_multi': data['detail_multi'],
                            'detail_multi_unique': data['detail_multi_unique'],
                            'detail_fresh_time': data['detail_fresh_time'],
                            'unique_key': data['unique_key']
                        }
                        r.lpush(settings.CRAWLER_CONFIG["downloader"], json.dumps(item_data))

                # 1.2 Then look for the next page
                next_urls = self.extract(body, data["next_url_rules"], {'data': data})
                print('next_urls: %s' % next_urls)
                for item in next_urls:
                    item_data = {
                        "url": item,
                        'kind': KIND_LIST_URL,
                        'seed_id': data['seed_id'],
                        'rule_id': data['rule_id'],
                        'fresh_pages': data['fresh_pages'] - 1,
                        'site_config': data['site_config'],
                        'list_rules': data['list_rules'],
                        'next_url_rules': data['next_url_rules'],
                        'detail_rules': data['detail_rules'],
                        'detail_exclude': data['detail_exclude'],
                        'detail_multi': data['detail_multi'],
                        'detail_multi_unique': data['detail_multi_unique'],
                        'detail_fresh_time': data['detail_fresh_time'],
                        'unique_key': data['unique_key']
                    }
                    if item_data['fresh_pages'] > 0:
                        logger.debug('list:%s' % data['url'])
                        r.lpush(settings.CRAWLER_CONFIG["downloader"], json.dumps(item_data))
            # 2. If the page currently being parsed is a detail page
            elif data["kind"] == KIND_DETAIL_URL:
                logger.debug('detail:%s' % data['url'])
                # If there is no multi-item detail, this is a single detail item
                self.get_detail(body, data)
Example #7
    def __init__(self):
        self.redis = get_redis()
Example #8
    def run(self):
        r = get_redis()
        if settings.CRAWLER_DEBUG:
            r.delete(settings.CRAWLER_CONFIG["extractor"])
        while True:
            try:
                data = r.brpop(settings.CRAWLER_CONFIG["extractor"])
            except Exception as e:
                print(e)
                continue
            #print data
            data = json.loads(data[1])
            body = data['body']
            # 1. If the page currently being parsed is a list page
            if data["kind"] == KIND_LIST_URL:
                # 1.1 First, look for detail pages
                # Check whether the detail content is already contained in the list page
                multi_rules = data['detail_multi']
                if multi_rules:
                    # 1.1.1 The details are embedded in the list page itself
                    multi_parts = self.extract(body, multi_rules,
                                               {'data': data})
                    for part in multi_parts:
                        self.get_detail(part, data)
                else:
                    # 1.1.2 The details are not in the list page; follow the list URLs to fetch them
                    detail_urls = self.extract(body, data['list_rules'],
                                               {'data': data})
                    #logger.debug('detail_urls: %s' % detail_urls)
                    for item in detail_urls:
                        item_data = {
                            "url": item,
                            'kind': KIND_DETAIL_URL,
                            'seed_id': data['seed_id'],
                            'rule_id': data['rule_id'],
                            #'fresh_pages': '',
                            #'list_rules': '',
                            #'next_url_rules': '',
                            'site_config': data['site_config'],
                            'detail_rules': data['detail_rules'],
                            'detail_exclude': data['detail_exclude'],
                            'detail_multi': data['detail_multi'],
                            'detail_multi_unique': data['detail_multi_unique'],
                            'detail_fresh_time': data['detail_fresh_time'],
                            'unique_key': data['unique_key']
                        }
                        r.lpush(settings.CRAWLER_CONFIG["downloader"],
                                json.dumps(item_data))

                # 1.2 Then look for the next page
                next_urls = self.extract(body, data["next_url_rules"],
                                         {'data': data})
                site_config = data['site_config']
                print('next_urls: %s' % next_urls)
                for item in next_urls:
                    item = checkUrlValidate(item, site_config)
                    item_data = {
                        "url": item,
                        'kind': KIND_LIST_URL,
                        'seed_id': data['seed_id'],
                        'rule_id': data['rule_id'],
                        'fresh_pages': data['fresh_pages'] - 1,
                        'site_config': data['site_config'],
                        'list_rules': data['list_rules'],
                        'next_url_rules': data['next_url_rules'],
                        'detail_rules': data['detail_rules'],
                        'detail_exclude': data['detail_exclude'],
                        'detail_multi': data['detail_multi'],
                        'detail_multi_unique': data['detail_multi_unique'],
                        'detail_fresh_time': data['detail_fresh_time'],
                        'unique_key': data['unique_key']
                    }
                    if item_data['fresh_pages'] > 0:
                        logger.debug('list:%s' % data['url'])
                        r.lpush(settings.CRAWLER_CONFIG["downloader"],
                                json.dumps(item_data))
            # 2. If the page currently being parsed is a detail page
            elif data["kind"] == KIND_DETAIL_URL:
                logger.debug('detail:%s' % data['url'])
                # If there is no multi-item detail, this is a single detail item
                self.get_detail(body, data)
Example #9
    def __init__(self):
        self.redis = get_redis()