Example #1
import logging.config
import os
from logging.handlers import SocketHandler

# imports restored for runnability; AppConfig matches the other examples,
# while CustomConfig is assumed to live alongside it in scraper.config
from scraper.config import AppConfig, CustomConfig

def default_settings():
    # default all TORSCRAPER_* locations relative to the package root
    os.environ.setdefault("TORSCRAPER_ROOT", os.path.dirname(os.path.abspath(__file__)))

    root = os.environ["TORSCRAPER_ROOT"]
    os.environ.setdefault("TORSCRAPER_DATA", root + "/data/app")
    os.environ.setdefault("TORSCRAPER_STATIC", root + "/static")
    os.environ.setdefault("TORSCRAPER_CONFIG", root + "/config/app")
    os.environ.setdefault("TORSCRAPER_PLUGINS", root + "/plugin")

    logging_config = CustomConfig("logging")
    logging.config.dictConfig(logging_config.content())

    from scraper.utils.logging import ChainLoggerAdapter
    log = ChainLoggerAdapter.from_logger_name("scraper.init")

    plugin_dirs = set()
    from glob import glob
    for dir_name in AppConfig.get("plugin.source_dirs"):
        dirs = set(map(os.path.abspath, glob(dir_name)))
        if len(dirs) == 0:
            log.warning("plugin source directory '%s' matched no directories, cwd=%s",
                        dir_name, os.getenv("TORSCRAPER_ROOT"))
            continue

        plugin_dirs.update(dirs)

    import sys
    missing_plugin_dirs = plugin_dirs.difference(set(sys.path))
    log.debug("adding %s to PYTHONPATH", missing_plugin_dirs)
    sys.path.extend(missing_plugin_dirs)

    import socket

    for handler in logging.getLogger().handlers:
        try:
            if isinstance(handler, SocketHandler):
                socket.getaddrinfo(handler.host, handler.port)
        except socket.gaierror as exc:
            raise RuntimeError(f"Unable to resolve address information for {handler}.") from exc

    local_tor_path = os.getenv("TORSCRAPER_DATA") + "/torproxy/Tor"
    if os.path.exists(local_tor_path):
        # prepend the bundled Tor directory to PATH; realpath() must wrap only
        # the Tor path itself, not the whole concatenated PATH string, and
        # os.pathsep already handles the ";" (Windows) vs ":" (POSIX) separator
        os.environ["PATH"] = os.path.realpath(local_tor_path) + os.pathsep + os.getenv("PATH")
Example #2
import asyncio
import os

# default_settings() is shown in Example #1; run() is the project's
# entrypoint coroutine runner, defined elsewhere

def launch_process(arguments: dict, custom_env: dict = None):
    # update local environment
    os.environ.update(custom_env or {})

    # initialize global settings and variables
    default_settings()

    from scraper.utils.logging import ChainLoggerAdapter, logflags
    log = ChainLoggerAdapter.from_logger_name("scraper.init")
    log.debug("current environment state: %s", dict(os.environ), flags=[logflags.SENSITIVE])
    log.debug("arguments provided: %s", arguments, flags=[logflags.SENSITIVE])

    # create new event loop
    event_loop = asyncio.new_event_loop()

    # start application itself (blocking!)
    log.info("starting application")
    run(event_loop, arguments)
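
A hypothetical invocation; "api_port" mirrors the key read in main() (Example #5) and the TORSCRAPER_DATA variable comes from Example #1:

launch_process({"api_port": 40100}, custom_env={"TORSCRAPER_DATA": "/tmp/torscraper"})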
Example #3
import re
import asyncio
from os import makedirs
from pathlib import Path
from typing import Union, Any

import aiofiles
import aiohttp
from bs4 import BeautifulSoup
from yarl import URL

from scraper.utils.mixins import LoggableMixin
from scraper.utils.logging import ChainLoggerAdapter

log = ChainLoggerAdapter.from_logger_name("scraper.services.webpage")


class WebPage(LoggableMixin):
    """ Class representing a web page view. Contains all necessary information from HTTP requests and responses. """

    _css_links_regex = re.compile(r'url\([\'"]?([^\'"]+?)[\'"]?\)')

    def __init__(self, exception: Exception = None):
        super().__init__({
            "_url_real": "http_request_url",
            "_exception": "http_request_exception",
            "_request_method": "http_request_method",
            "_status": "http_response_code",
        })

        self._soup = None
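
The _css_links_regex pattern above extracts link targets from CSS url(...) references; a minimal standalone check (the sample stylesheet is illustrative):

import re

_css_links_regex = re.compile(r'url\([\'"]?([^\'"]+?)[\'"]?\)')

css = 'body { background: url("/img/bg.png") } @font-face { src: url(fonts/a.woff2) }'
print(_css_links_regex.findall(css))  # ['/img/bg.png', 'fonts/a.woff2']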
Example #4
import asyncio
from datetime import datetime
from enum import Enum
from os import getenv

from aioredis import Redis, create_redis_pool
from yarl import URL

from scraper.tor import TorSession
from scraper.config import AppConfig
from scraper.database import Connector
from scraper.utils.logging import ChainLoggerAdapter, logflags

from .router import Router

log = ChainLoggerAdapter.from_logger_name("scraper.control.container")


class Container(object):
    """ Global application container holding important application objects. """
    class ServiceType(Enum):
        ANY = "ANY"
        NON_MANAGER = "non-manager"
        MANAGER = "manager"
        CRAWLER = "crawler"
        SCRAPER = "scraper"

        def __eq__(self, o: object) -> bool:
            # is this ServiceType comparison
            if isinstance(o, self.__class__):
                # ANY type matches any other type
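                # NOTE: the excerpt ends here; a hypothetical completion,
                # consistent with the comment above, might read:
                if self is self.__class__.ANY or o is self.__class__.ANY:
                    return True
                return self.value == o.value
            return NotImplemented

        # defining __eq__ suppresses Enum's inherited __hash__; it is normally
        # restored with: __hash__ = Enum.__hash__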
Example #5
import asyncio
import signal

from scraper.utils.logging import ChainLoggerAdapter

log = ChainLoggerAdapter.from_logger_name("scraper.setup")


async def main(args: dict):
    from scraper.control import Container, ServiceManager, ServiceController, Server
    # setup instance-wide data container
    container: Container = await Container.initialize(
        service_type=args.get("service_type", Container.ServiceType.MANAGER),
        api_port=args.get("api_port"),
    )

    log.info("running version %s", container.version, extra={"torscraper_version": container.version})

    # define handler for instance shutdown
    def shutdown_handler(code, _):
        log.warning("received signal %s - shutting down...", code)
        asyncio.ensure_future(container.stop())

    # bind and activate the shutdown handler
    signal.signal(signal.SIGTERM, shutdown_handler)
    signal.signal(signal.SIGINT, shutdown_handler)

    # run the instance's API HTTP server
    log.info("setting up control server")
    server = Server(container)
    await server.start(args.get("api_port"))
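
For reference, a sketch of the asyncio-native alternative to signal.signal (a Unix-only API; container is the same object created above):

def install_shutdown_handlers(container):
    loop = asyncio.get_running_loop()
    for sig in (signal.SIGTERM, signal.SIGINT):
        # callbacks registered this way run inside the event loop thread
        loop.add_signal_handler(sig, lambda: asyncio.ensure_future(container.stop()))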
Example #6
from abc import ABCMeta, abstractmethod
from typing import Any, Union, Callable, Awaitable
from datetime import datetime, timedelta
from asyncio.tasks import Task

from yarl import URL

from scraper.control import Container
from scraper.utils.logging import ChainLoggerAdapter
from scraper.utils.exceptions import ShuttingDown
from plugin.default.database.services import Service

from .webpage import WebPage
from .queue import DateTimePriorityWrapper, DownloadEntry, ParseEntry

log = ChainLoggerAdapter.from_logger_name("scraper.services.service_base")


class ServiceBase(metaclass=ABCMeta):
    """ Base class for any custom service definition. """

    __service_type__ = Container.ServiceType.ANY

    def __init__(self,
                 container: Container,
                 service: Service,
                 download_entry_class: type = DownloadEntry,
                 parse_entry_class: type = ParseEntry):
        assert self.__service_type__ in [
            Container.ServiceType.SCRAPER,
            Container.ServiceType.CRAWLER,
Example #7
import asyncio
import os
import re
import signal
from multiprocessing import Process
from typing import Type

from plugin.default.database import Service
from scraper.services import ServiceBase
from scraper.utils import list_classes, list_modules
from scraper.utils.logging import ChainLoggerAdapter
from .container import Container

log = ChainLoggerAdapter.from_logger_name("scraper.control.service_manager")


class ServiceManager(object):
    """ Class responsible for initialization and management of all service plugins and corresponding processes. """

    _target_plugin_modules = "plugin.(\w+).services.(\w+)"

    def __init__(self, container: Container):
        self._container: Container = container
        self._services = {}
        self._processes: list[Process] = []
        self._api_port = 40100

    # ==========================================================================dd==
    #   INTERNAL METHODS
    # ==========================================================================dd==
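
A quick check of how _target_plugin_modules decomposes plugin module paths (the module name below is hypothetical):

import re

match = re.fullmatch(r"plugin.(\w+).services.(\w+)", "plugin.default.services.example")
print(match.groups())  # ('default', 'example')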
Example #8
import csv
from collections import defaultdict
from os import getenv
from pathlib import Path

from aiohttp import web
from aiohttp_apispec import docs, querystring_schema

from scraper.control import ResourceBase
from scraper.control.exceptions import *
from scraper.utils.apispec import STATIC_RESPONSES
from scraper.utils.logging import ChainLoggerAdapter
from ..utils.schemas import CountriesQuery

log = ChainLoggerAdapter.from_logger_name("default.resources.static")


class StaticResource(ResourceBase):

    # ==========================================================================dd==
    #   INTERNAL METHODS
    # ==========================================================================dd==

    def _register_routes(self):
        return [
            web.get("/public/static/file/{file_path:.+}",
                    self.file,
                    name="static-file",
                    allow_head=False),
            web.get("/public/static/json/countries/list",
                    self.countries,
Example #9
from typing import Any, Union
from datetime import datetime

import jwt
from aiohttp import web
from aiohttp_jwt import JWTMiddleware

from scraper.config import AppConfig, AuthUsersConfig
from scraper.utils.logging import ChainLoggerAdapter, logflags

from .collections import update_recursive

log = ChainLoggerAdapter.from_logger_name("scraper.utils.jwt_manager")


class JWTManager(object):
    @classmethod
    def setup_middleware(cls, app: web.Application, **kwargs):
        auth_config = AppConfig.get("control_server.auth", {})
        auth_config = update_recursive(auth_config, kwargs)

        if "secret_or_pub_key" in auth_config:
            log.debug("setting up JWT authentication middleware, config=%s",
                      auth_config,
                      flags=[logflags.SENSITIVE])
            app.middlewares.append(JWTMiddleware(**auth_config))

    @classmethod
    def create_token(cls, payload: dict[str, Any]) -> bytes:
        headers = {"iat": datetime.utcnow().timestamp()}
Example #10
from aiohttp import web
from aiohttp.web_exceptions import *
from aiohttp_apispec import docs, json_schema

from scraper.control import ResourceBase
from scraper.control.exceptions import *
from scraper.utils import JWTManager
from scraper.utils.logging import ChainLoggerAdapter

from ..utils.schemas import AccessTokenResponse, AccessTokenRequest

log = ChainLoggerAdapter.from_logger_name("default.resources.access")


class AccessResource(ResourceBase):

    # ==========================================================================dd==
    #   INTERNAL METHODS
    # ==========================================================================dd==

    def _register_routes(self):
        return [
            web.post("/public/access/token", self.token, name="create-token"),
        ]

    # ==========================================================================dd==
    #   PUBLIC METHODS
    # ==========================================================================dd==

    @docs(
        tags=["plugin.default.access", "public"],
Example #11
from aiohttp import web

# imports of web, Container and ResourceBase restored from the sibling examples
from scraper.control import Container, ResourceBase
from scraper.utils.apispec import SECURITY_SPECS, AUTH_RESPONSES, STATIC_RESPONSES
from scraper.utils.schemas import DatabaseRelationQuery
from scraper.utils.logging import ChainLoggerAdapter

from ..database import Service, ServiceUrl
from ..utils.schemas import (
    ServiceEntry,
    ServiceResponse,
    ServiceListResponse,
    ServiceUrlEntry,
    ServiceUrlUpdateRequest,
    ServiceUrlResponse,
    ServiceUrlListResponse,
)

log = ChainLoggerAdapter.from_logger_name("default.resources.services")


class ServicesResource(ResourceBase):
    __service_type__ = Container.ServiceType.MANAGER

    # ==========================================================================dd==
    #   INTERNAL METHODS
    # ==========================================================================dd==

    def _register_routes(self):
        return [
            # service endpoints
            web.get("/db/service",
                    self.service_list,
                    name="service-list",
Example #12
import asyncio
from abc import ABCMeta, abstractmethod

from aiohttp import web
from aiohttp.web_routedef import RouteDef

from scraper.config import AppConfig
from scraper.utils.logging import ChainLoggerAdapter
from .container import Container
from .server import Server

log = ChainLoggerAdapter.from_logger_name("scraper.control.resource_base")


class ResourceBase(metaclass=ABCMeta):
    __service_type__ = Container.ServiceType.ANY

    def __init__(self, server: Server = None):
        self._server = server
        if server:
            server.add_routes(self._register_routes())
            server.add_task_registrator(self._register_tasks())

    # ==========================================================================dd==
    #   PUBLIC PROPERTIES
    # ==========================================================================dd==

    @property
    def server(self) -> Server:
        return self._server
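
A minimal hypothetical subclass showing how a resource wires itself through ResourceBase (the route, handler, and empty task list below are illustrative):

class PingResource(ResourceBase):
    def _register_routes(self):
        return [web.get("/public/ping", self.ping, name="ping")]

    def _register_tasks(self):
        # no background tasks for this resource
        return []

    async def ping(self, request: web.Request) -> web.Response:
        return web.json_response({"status": "ok"})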
Example #13
from contextlib import asynccontextmanager, contextmanager
from typing import ContextManager, AsyncContextManager

from sqlalchemy import create_engine, inspect
from sqlalchemy.engine.reflection import Inspector
from sqlalchemy.orm import sessionmaker, scoped_session, Session

from scraper.utils import list_classes, list_modules
from scraper.utils.logging import ChainLoggerAdapter
from .table_base import PrimaryTableBase, MemoryTableBase

log = ChainLoggerAdapter.from_logger_name("scraper.database.connector")


class Connector(object):
    """ Database structure, connection, session and plugin manager. """

    _target_plugin_modules = "plugin.(\w+).database.(\w+)"

    def __init__(self, connection_string, **kwargs):
        # create database engine instances based on provided configuration
        self._engines = {
            "primary": create_engine(connection_string, **kwargs),
            "memory": create_engine(f"sqlite:///:memory:"),
        }
        # create session factories for each created database engine
        self._factories = {engine_type: sessionmaker(bind=engine) for engine_type, engine in self._engines.items()}
        # create new session for each database engine
        self._sessions = {engine_type: factory() for engine_type, factory in self._factories.items()}

        # load available plugin modules and initialize/update database structure
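
The contextmanager imports above suggest session-scope helpers; a hypothetical sketch of the classic commit/rollback pattern (not necessarily the class's actual API):

    @contextmanager
    def session_scope(self, engine_type: str = "primary") -> ContextManager[Session]:
        # open a short-lived session, commit on success, roll back on error
        session = self._factories[engine_type]()
        try:
            yield session
            session.commit()
        except Exception:
            session.rollback()
            raise
        finally:
            session.close()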
Example #14
from aiohttp import web
from aiohttp_apispec import setup_aiohttp_apispec, validation_middleware
from jwt.exceptions import PyJWTError, InvalidTokenError

from scraper.config import AppConfig
from scraper.control.exceptions import (
    HTTP_EXCEPTION_STATUS_MAPPING,
    CustomClientExceptionBase,
    InternalServerError,
    Unauthorized,
)
from scraper.utils import list_classes, JWTManager, list_modules
from scraper.utils.logging import ChainLoggerAdapter
from .container import Container
from .router import Router

log = ChainLoggerAdapter.from_logger_name("scraper.control.server")
access_log = ChainLoggerAdapter.from_logger_name(
    "scraper.control.server.access")


class Server(object):
    """ Class managing the application's control server. """

    _target_plugin_modules = "plugin.(\w+).resources.(\w+)"

    def __init__(self, container: Container):
        # noinspection PyTypeChecker
        self._app = web.Application(logger=log, router=Router())
        self._app.middlewares.append(self.error_middleware)
        self._app.middlewares.append(validation_middleware)
        JWTManager.setup_middleware(self._app)
Example #15
import asyncio
import pickle
from abc import ABCMeta, abstractmethod
from enum import Enum
from typing import Callable, Any

from aioredis import Channel

from scraper.control import Container
from scraper.utils.logging import ChainLoggerAdapter
from scraper.utils.logging.adapters import PreformatterAdapter

from .redis_message import RedisMessage

# noinspection PyTypeChecker
log = ChainLoggerAdapter.from_logger_name(
    "default.utils.redis_client_base")  # type: ChainLoggerAdapter
log.add_adapter(PreformatterAdapter)


class RedisClientBase(metaclass=ABCMeta):
    """ Provides extensible class base for any custom Kafka consumers. """
    def __init__(
        self,
        container: Container,
        topics_bare: list = None,
        prefix_in: str = None,
        prefix_out: str = None,
    ):
        self._container = container
        self._prefix_in = prefix_in or ""
        self._prefix_out = prefix_out or ""
Example #16
from os import path, getenv
from typing import Any

import yaml
import schema

from scraper.utils import update_recursive
from scraper.utils.logging import ChainLoggerAdapter

log = ChainLoggerAdapter.from_logger_name("scraper.config.base")


class Base(object):
    """ Singleton-accessible base class for any configuration. """

    _filename: str = None
    _contents: dict = None
    _schema: schema.Schema = None
    _get_cache = {}

    # ==========================================================================dd==
    #   INTERNAL METHODS
    # ==========================================================================dd==

    @staticmethod
    def _get_filepath(filename: str, basepath: str = None) -> str:
        basepath = basepath or getenv("TORSCRAPER_CONFIG", "")
        return basepath + f"/{filename}.yaml"

    @classmethod
    def _load_from_file(cls, filepath):
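        # NOTE: the excerpt ends at this signature; the body below is a
        # hypothetical sketch inferred only from the yaml/schema imports above
        with open(filepath, "r", encoding="utf-8") as fh:
            contents = yaml.safe_load(fh) or {}
        if cls._schema is not None:
            contents = cls._schema.validate(contents)
        return contents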