Example #1
from acrawler import Handler, get_logger


class HorrorHandler(Handler):
    family = "MovieItem"
    logger = get_logger("horrorlog")

    async def handle_after(self, item):
        if item["genres"] and "Horror" in item["genres"]:
            self.logger.warning(f"({item['title']}) is a horror movie!!!!")
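For context, a handler like this would be attached to a crawler through middleware_config, as the later examples do. The sketch below is illustrative only: MovieCrawler and the module path "movies" are assumed names, and 500 is just an example priority.

from acrawler import Crawler


class MovieCrawler(Crawler):
    # "movies.HorrorHandler" is a hypothetical import path for the handler above;
    # the value is the handler's priority, as in the other middleware_config dicts
    middleware_config = {"movies.HorrorHandler": 500}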
Example #2
import time

from acrawler import Handler, Request, get_logger
from aioredis import Redis as _Redis  # assumed source of the _Redis type hint used below


class ProxyLogRedis(Handler):
    family = 'Request'
    logger = get_logger('validator')

    async def on_start(self):
        self.keys = self.crawler.keys

        # get redis connection from crawler
        self.redis: _Redis = self.crawler.redis

    async def handle_after(self, req: Request):
        if req.exceptions or not (await self.crawler.is_ok(req.response)):
            await self.update_proxy_to_redis(req.meta['proxy'], False, old_score=req.meta['old_score'])
        else:
            await self.update_proxy_to_redis(
                req.meta['proxy'], True, req.meta['speed'], int(time.time()), req.meta['old_score'])

    async def update_proxy_to_redis(self,
                                    proxy: str,
                                    success: bool,
                                    speed: int = None,
                                    last: int = None,
                                    old_score: int = None):
        """Update infos in three redis sorted sets."""
        if success:
            tr = self.redis.multi_exec()
            tr.zadd(self.keys['speed'], speed, proxy)
            tr.zadd(self.keys['last'], last, proxy)
            if old_score is None or old_score < 5:
                tr.zincrby(self.keys['score'], 1, proxy)
            else:
                tr.zincrby(self.keys['score'], round(5 / old_score, 2), proxy)
            await tr.execute()
            # self.logger.info('{} speed:{}'.format(proxy, speed))
        else:
            if old_score and old_score <= -4:
                await self.delete_proxy(proxy)
                # self.logger.info('delete proxy:{}'.format(proxy))
            else:
                await self.redis.zincrby(self.keys['score'], -1, proxy)
                # self.logger.info('proxy failed:{}'.format(proxy))

    async def delete_proxy(self, proxy: str):
        tr = self.redis.multi_exec()
        tr.zrem(self.keys['speed'], proxy)
        tr.zrem(self.keys['last'], proxy)
        tr.zrem(self.keys['score'], proxy)
        tr.srem(self.keys['init'], proxy)
        await tr.execute()
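This handler assumes the crawler exposes an aioredis connection as crawler.redis and a dict of Redis key names as crawler.keys, covering the three sorted sets plus the init set it touches. A minimal sketch of such a mapping, with purely illustrative key names:

# Hypothetical key names; the actual aproxy project may use different ones.
KEYS = {
    "speed": "proxies:speed",  # sorted set: proxy -> measured response speed
    "last": "proxies:last",    # sorted set: proxy -> timestamp of last success
    "score": "proxies:score",  # sorted set: proxy -> health score
    "init": "proxies:init",    # plain set of all collected proxies
}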
Example #3
from acrawler import Handler, get_logger
from prometheus_client import start_http_server, Histogram, Counter, Gauge
import asyncio

logger = get_logger("prometheus")


class PromExporter(Handler):
    family = "Task"
    priority = 100

    async def on_start(self):
        self.name = self.crawler.name
        self.port = self.crawler.config.get("PROMETHEUS_PORT", 8000)
        self.addr = self.crawler.config.get("PROMETHEUS_ADDR", "localhost")
        self.interval = self.crawler.config.get("PROMETHEUS_INTERVAL", 1)

        self.reqs = Gauge(f"{self.name}_requests_progress",
                          "Number of working requests")
        self.reqs_q = Gauge(f"{self.name}_requests_queue",
                            "Number of requests in ready queue")
        self.reqs_w = Gauge(f"{self.name}_requests_waiting",
                            "Number of requests in waiting queue")

        self.counts = {}

        self.crawler.create_task(self.start_server())
        self.crawler.create_task(self.export())

    async def start_server(self):
        logger.info(
Example #4
import re
import time

from acrawler import Crawler, ParselItem, Parser, Request, get_logger, Item
from aproxy.rules import COMMON_TASKS, TEST_TASKS
from aproxy.task import ProxyGen, ProxyItemForWeb, ProxyParseItem
import asyncio
import sys
import os

logger = get_logger('aproxy')


class ProxyCrawler(Crawler):

    config = {
        'DOWNLOAD_DELAY': 3,
        'MAX_REQUESTS_PER_HOST': 1,
        'MAX_REQUESTS': 12,
        'REDIS_ENABLE': True,
        'WEB_ENABLE': True,
        # 'LOG_TO_FILE': 'proxycrawler.log'
    }
    middleware_config = {
        'aproxy.handlers.ToRedisInit': 500,
        'aproxy.handlers.WebQuery': 2000,
        'acrawler.handlers.RequestPrepareBrowser': 1000,
    }

    parsers = [
        Parser(css_divider='table tr', item_type=ProxyParseItem),
Example #5
import json
from pathlib import Path

import cloudscraper
from yarl import URL

from acrawler import ReScheduleImmediatelyError, get_logger
from acrawler.handlers import ExpiredWatcher

logger = get_logger("cfscrape")


class CfscrapeHandler(ExpiredWatcher):
    """Bypass the cloudflare.
    """

    family = "Request"
    priority = 500
    ttl = 20

    async def custom_on_start(self):
        """Load local token and update cookies if it is possible.
        """

        self.p = Path(
            self.crawler.config.get("CFS_COOKIES_FILE",
                                    Path.home() / ".cfscookies"))
        self.proxies = self.crawler.config.get("CFS_PROXIES", None)
        self.url = URL(self.crawler.config.get("CFS_URL"))
        self.ua = self.crawler.config.get(
            "CFS_USERAGENT",
Example #6
# Scrape quotes from http://quotes.toscrape.com/
from acrawler import get_logger
from acrawler import Parser, Crawler, ParselItem, Request

logger = get_logger("quotes")


class QuoteItem(ParselItem):
    log = True
    default = {"type": "quote"}
    css = {"author": "small.author::text"}
    xpath = {
        "text": ['.//span[@class="text"]/text()', lambda s: s.strip("“")[:20]]
    }


class AuthorItem(ParselItem):
    log = True
    default = {"type": "author"}
    css = {
        "name": "h3.author-title::text",
        "born": "span.author-born-date::text"
    }


class QuoteCrawler(Crawler):

    main_page = r"quotes.toscrape.com/page/\d+"
    author_page = r"quotes.toscrape.com/author/.*"
    parsers = [
        Parser(
Example #7
from acrawler.http import BrowserRequest
from acrawler import Crawler, get_logger

logger = get_logger("pyclock")


class ClockCrawler(Crawler):

    middleware_config = {
        # you should enable this handler to support BrowserRequest
        "acrawler.handlers.RequestPrepareBrowser": 800
    }

    async def start_requests(self):
        yield BrowserRequest(url="https://pythonclock.org",
                             page_callback=self.operate_page)

    async def operate_page(self, page, response):
        logger.info(await response.text())
        logger.info(await page.text())
        assert not "countdown-amount" in (await response.text())
        assert "countdown-amount" in (await page.text())
        await page.screenshot(show=True)


if __name__ == "__main__":
    ClockCrawler().run()
Example #8
import random
import re

from acrawler import Crawler, ParselItem, Parser, Request, get_logger

logger = get_logger()
PATTERN = re.compile(
    r"\b((?:(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])\.){3}(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9]))\D*([0-9]{1,4}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])"
)

# Some of these sites only allow access from Chinese IP addresses
WEBSITES = [
    {
        "name": "xicidaili.com",
        "resource": ["http://www.xicidaili.com/nn/%s" % i for i in range(1, 6)]
        + ["http://www.xicidaili.com/wn/%s" % i for i in range(1, 6)]
        + ["http://www.xicidaili.com/wt/%s" % i for i in range(1, 6)],
        "enable": 1,
    },
    {
        "name": "kuaidaili.com",
        "resource": ["https://www.kuaidaili.com/free/inha/%s" % i for i in range(1, 6)]
        + ["https://www.kuaidaili.com/proxylist/%s" % i for i in range(1, 11)],
        "enable": 1,
    },
Example #9
import asyncio
import json
import time

from acrawler import Crawler, Handler, Item, Request, Response, get_logger
from aproxy.handlers import RequestSpeed
import aioredis
import random

logger = get_logger('validator')


class ValidatedItem(Item):
    def __init__(self, name, extra=None, **kwargs):
        super().__init__(**kwargs)
        self.content['name'] = name
        self.content['speed'] = None
        self.content['last'] = None
        self.content['score'] = 5
        # guard against the default extra=None, which dict.update() would reject
        if extra:
            self.content.update(extra)


class HTTPValidator(Crawler):
    middleware_config = {
        'aproxy.handlers.RequestSpeed': 1000,
        'aproxy.handlers.ProxyLogRedis': 800
    }
    config = {
        'REDIS_ENABLE': True,
    }