import asyncio
import logging
import logging.config
import os
from logging.handlers import SocketHandler

# NOTE: the config classes are assumed to live in scraper.config,
# matching the imports used by the other modules in this codebase.
from scraper.config import AppConfig, CustomConfig


def default_settings():
    if "TORSCRAPER_ROOT" not in os.environ:
        os.environ["TORSCRAPER_ROOT"] = os.path.dirname(os.path.abspath(__file__))
    if "TORSCRAPER_DATA" not in os.environ:
        os.environ["TORSCRAPER_DATA"] = os.getenv("TORSCRAPER_ROOT") + "/data/app"
    if "TORSCRAPER_STATIC" not in os.environ:
        os.environ["TORSCRAPER_STATIC"] = os.getenv("TORSCRAPER_ROOT") + "/static"
    if "TORSCRAPER_CONFIG" not in os.environ:
        os.environ["TORSCRAPER_CONFIG"] = os.getenv("TORSCRAPER_ROOT") + "/config/app"
    if "TORSCRAPER_PLUGINS" not in os.environ:
        os.environ["TORSCRAPER_PLUGINS"] = os.getenv("TORSCRAPER_ROOT") + "/plugin"

    # load and apply the logging configuration
    logging_config = CustomConfig("logging")
    logging.config.dictConfig(logging_config.content())

    from scraper.utils.logging import ChainLoggerAdapter
    log = ChainLoggerAdapter.from_logger_name("scraper.init")

    # resolve configured plugin source directories (glob patterns)
    plugin_dirs = set()
    from glob import glob
    for dir_name in AppConfig.get("plugin.source_dirs"):
        dirs = set(map(os.path.abspath, glob(dir_name)))
        if len(dirs) == 0:
            log.warning("plugin source directory '%s' matched no directories, cwd=%s",
                        dir_name, os.getenv("TORSCRAPER_ROOT"))
            continue
        plugin_dirs.update(dirs)

    # extend the import path with plugin directories that are not on it yet
    import sys
    missing_plugin_dirs = plugin_dirs.difference(set(sys.path))
    log.debug("adding %s to PYTHONPATH", missing_plugin_dirs)
    sys.path.extend(missing_plugin_dirs)

    # fail fast if a configured socket logging handler cannot be resolved
    for handler in logging.getLogger().handlers:
        import socket
        try:
            if isinstance(handler, SocketHandler):
                socket.getaddrinfo(handler.host, handler.port)
        except socket.gaierror as exc:
            raise RuntimeError(f"Unable to resolve address information for {handler}.") from exc

    # prepend a bundled Tor binary directory to PATH when present
    local_tor_path = os.getenv("TORSCRAPER_DATA") + "/torproxy/Tor"
    if os.path.exists(local_tor_path):
        import platform
        _sep = ";" if platform.system() == "Windows" else ":"
        os.environ["PATH"] = os.path.realpath(local_tor_path) + _sep + os.getenv("PATH")
def launch_process(arguments: dict, custom_env: dict = None):
    # update local environment
    os.environ.update(custom_env or {})

    # initialize global settings and variables
    default_settings()

    from scraper.utils.logging import ChainLoggerAdapter, logflags
    log = ChainLoggerAdapter.from_logger_name("scraper.init")
    log.debug("current environment state: %s", dict(os.environ), flags=[logflags.SENSITIVE])
    log.debug("arguments provided: %s", arguments, flags=[logflags.SENSITIVE])

    # create new event loop
    event_loop = asyncio.new_event_loop()

    # start application itself (blocking!)
    log.info("starting application")
    run(event_loop, arguments)
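# A minimal launch sketch (key names mirror how scraper.setup.main reads its
# args further below; the concrete values are illustrative only):
#
#   launch_process(
#       arguments={"api_port": 40100},
#       custom_env={"TORSCRAPER_ROOT": "/opt/torscraper"},
#   )
#
# launch_process blocks in run(), so a caller that needs several service
# processes would typically start it via multiprocessing.Process instead.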
import re
import asyncio
from os import makedirs
from pathlib import Path
from typing import Union, Any

import aiofiles
import aiohttp
from bs4 import BeautifulSoup
from yarl import URL

from scraper.utils.mixins import LoggableMixin
from scraper.utils.logging import ChainLoggerAdapter

log = ChainLoggerAdapter.from_logger_name("scraper.services.webpage")


class WebPage(LoggableMixin):
    """
    Class representing a web page view.
    Contains all necessary information from HTTP requests and responses.
    """
    _css_links_regex = re.compile(r'url\([\'"]?([^\'"]+?)[\'"]?\)')

    def __init__(self, exception: Exception = None):
        super().__init__({
            "_url_real": "http_request_url",
            "_exception": "http_request_exception",
            "_request_method": "http_request_method",
            "_status": "http_response_code",
        })
        self._soup = None
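# Illustrative use of the CSS link pattern above: it captures the target of
# CSS url(...) tokens, quoted or not.
#
#   WebPage._css_links_regex.findall('body { background: url("/img/bg.png"); }')
#   # -> ['/img/bg.png']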
import asyncio
from datetime import datetime
from enum import Enum
from os import getenv

from aioredis import Redis, create_redis_pool
from yarl import URL

from scraper.tor import TorSession
from scraper.config import AppConfig
from scraper.database import Connector
from scraper.utils.logging import ChainLoggerAdapter, logflags
from .router import Router

log = ChainLoggerAdapter.from_logger_name("scraper.control.container")


class Container(object):
    """
    Global application container holding important application objects.
    """

    class ServiceType(Enum):
        ANY = "ANY"
        NON_MANAGER = "non-manager"
        MANAGER = "manager"
        CRAWLER = "crawler"
        SCRAPER = "scraper"

        def __eq__(self, o: object) -> bool:
            # is this a ServiceType comparison?
            if isinstance(o, self.__class__):
                # ANY type matches any other type
import asyncio
import signal

from scraper.utils.logging import ChainLoggerAdapter

log = ChainLoggerAdapter.from_logger_name("scraper.setup")


async def main(args: dict):
    from scraper.control import Container, ServiceManager, ServiceController, Server

    # setup instance-wide data container
    container: Container = await Container.initialize(
        service_type=args.get("service_type", Container.ServiceType.MANAGER),
        api_port=args.get("api_port"),
    )
    log.info("running version %s", container.version,
             extra={"torscraper_version": container.version})

    # define handler for instance shutdown
    def shutdown_handler(code, _):
        log.warning("received signal %s - shutting down...", code)
        asyncio.ensure_future(container.stop())

    # bind and activate the shutdown handler
    signal.signal(signal.SIGTERM, shutdown_handler)
    signal.signal(signal.SIGINT, shutdown_handler)

    # run the instance's API HTTP server
    log.info("setting up control server")
    server = Server(container)
    await server.start(args.get("api_port"))
from abc import ABCMeta, abstractmethod
from typing import Any, Union, Callable, Awaitable
from datetime import datetime, timedelta
from asyncio.tasks import Task

from yarl import URL

from scraper.control import Container
from scraper.utils.logging import ChainLoggerAdapter
from scraper.utils.exceptions import ShuttingDown
from plugin.default.database.services import Service
from .webpage import WebPage
from .queue import DateTimePriorityWrapper, DownloadEntry, ParseEntry

log = ChainLoggerAdapter.from_logger_name("scraper.services.service_base")


class ServiceBase(metaclass=ABCMeta):
    """
    Base class for any custom service definition.
    """
    __service_type__ = Container.ServiceType.ANY

    def __init__(self, container: Container, service: Service,
                 download_entry_class: type = DownloadEntry,
                 parse_entry_class: type = ParseEntry):
        assert self.__service_type__ in [
            Container.ServiceType.SCRAPER,
            Container.ServiceType.CRAWLER,
import asyncio
import os
import re
import signal
from multiprocessing import Process
from typing import Type

from plugin.default.database import Service
from scraper.services import ServiceBase
from scraper.utils import list_classes, list_modules
from scraper.utils.logging import ChainLoggerAdapter
from .container import Container

log = ChainLoggerAdapter.from_logger_name("scraper.control.service_manager")


class ServiceManager(object):
    """
    Class responsible for initialization and management
    of all service plugins and corresponding processes.
    """
    _target_plugin_modules = r"plugin.(\w+).services.(\w+)"

    def __init__(self, container: Container):
        self._container: Container = container
        self._services = {}
        self._processes: list[Process] = []
        self._api_port = 40100

    # ==========================================================================dd==
    #   INTERNAL METHODS
    # ==========================================================================dd==
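# Illustrative match against the plugin-module pattern above: a discovered
# module path such as "plugin.default.services.example" yields the plugin
# name and the service module name.
#
#   m = re.fullmatch(ServiceManager._target_plugin_modules,
#                    "plugin.default.services.example")
#   m.groups()  # -> ('default', 'example')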
import csv
from collections import defaultdict
from os import getenv
from pathlib import Path

from aiohttp import web
from aiohttp_apispec import docs, querystring_schema

from scraper.control import ResourceBase
from scraper.control.exceptions import *
from scraper.utils.apispec import STATIC_RESPONSES
from scraper.utils.logging import ChainLoggerAdapter
from ..utils.schemas import CountriesQuery

log = ChainLoggerAdapter.from_logger_name("default.resources.static")


class StaticResource(ResourceBase):

    # ==========================================================================dd==
    #   INTERNAL METHODS
    # ==========================================================================dd==

    def _register_routes(self):
        return [
            web.get("/public/static/file/{file_path:.+}", self.file,
                    name="static-file", allow_head=False),
            web.get("/public/static/json/countries/list", self.countries,
from typing import Any, Union
from datetime import datetime

import jwt
from aiohttp import web
from aiohttp_jwt import JWTMiddleware

from scraper.config import AppConfig, AuthUsersConfig
from scraper.utils.logging import ChainLoggerAdapter, logflags
from .collections import update_recursive

log = ChainLoggerAdapter.from_logger_name("scraper.utils.jwt_manager")


class JWTManager(object):

    @classmethod
    def setup_middleware(cls, app: web.Application, **kwargs):
        auth_config = AppConfig.get("control_server.auth", {})
        auth_config = update_recursive(auth_config, kwargs)
        if "secret_or_pub_key" in auth_config:
            log.debug("setting up JWT authentication middleware, config=%s",
                      auth_config, flags=[logflags.SENSITIVE])
            app.middlewares.append(JWTMiddleware(**auth_config))

    @classmethod
    def create_token(cls, payload: dict[str, Any]) -> bytes:
        headers = {"iat": datetime.utcnow().timestamp()}
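# A minimal usage sketch (payload keys are illustrative; the signing settings
# come from the app config, which this excerpt does not show):
#
#   token = JWTManager.create_token({"sub": "admin"})  # -> bytes, per the annotation
#   headers = {"Authorization": "Bearer " + token.decode()}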
from aiohttp import web
from aiohttp.web_exceptions import *
from aiohttp_apispec import docs, json_schema

from scraper.control import ResourceBase
from scraper.control.exceptions import *
from scraper.utils import JWTManager
from scraper.utils.logging import ChainLoggerAdapter
from ..utils.schemas import AccessTokenResponse, AccessTokenRequest

log = ChainLoggerAdapter.from_logger_name("default.resources.access")


class AccessResource(ResourceBase):

    # ==========================================================================dd==
    #   INTERNAL METHODS
    # ==========================================================================dd==

    def _register_routes(self):
        return [
            web.post("/public/access/token", self.token, name="create-token"),
        ]

    # ==========================================================================dd==
    #   PUBLIC METHODS
    # ==========================================================================dd==

    @docs(
        tags=["plugin.default.access", "public"],
# NOTE: this excerpt starts mid-file; the imports for web, Container and
# ResourceBase below are assumed, matching the sibling resource modules.
from aiohttp import web

from scraper.control import Container, ResourceBase
from scraper.utils.apispec import SECURITY_SPECS, AUTH_RESPONSES, STATIC_RESPONSES
from scraper.utils.schemas import DatabaseRelationQuery
from scraper.utils.logging import ChainLoggerAdapter
from ..database import Service, ServiceUrl
from ..utils.schemas import (
    ServiceEntry,
    ServiceResponse,
    ServiceListResponse,
    ServiceUrlEntry,
    ServiceUrlUpdateRequest,
    ServiceUrlResponse,
    ServiceUrlListResponse,
)

log = ChainLoggerAdapter.from_logger_name("default.resources.services")


class ServicesResource(ResourceBase):
    __service_type__ = Container.ServiceType.MANAGER

    # ==========================================================================dd==
    #   INTERNAL METHODS
    # ==========================================================================dd==

    def _register_routes(self):
        return [
            # service endpoints
            web.get("/db/service", self.service_list, name="service-list",
import asyncio
from abc import ABCMeta, abstractmethod

from aiohttp import web
from aiohttp.web_routedef import RouteDef

from scraper.config import AppConfig
from scraper.utils.logging import ChainLoggerAdapter
from .container import Container
from .server import Server

log = ChainLoggerAdapter.from_logger_name("scraper.control.resource_base")


class ResourceBase(metaclass=ABCMeta):
    __service_type__ = Container.ServiceType.ANY

    def __init__(self, server: Server = None):
        self._server = server
        if server:
            server.add_routes(self._register_routes())
            server.add_task_registrator(self._register_tasks())

    # ==========================================================================dd==
    #   PUBLIC PROPERTIES
    # ==========================================================================dd==

    @property
    def server(self) -> Server:
        return self._server
from contextlib import asynccontextmanager, contextmanager
from typing import ContextManager, AsyncContextManager

from sqlalchemy import create_engine, inspect
from sqlalchemy.engine.reflection import Inspector
from sqlalchemy.orm import sessionmaker, scoped_session, Session

from scraper.utils import list_classes, list_modules
from scraper.utils.logging import ChainLoggerAdapter
from .table_base import PrimaryTableBase, MemoryTableBase

log = ChainLoggerAdapter.from_logger_name("scraper.database.connector")


class Connector(object):
    """
    Database structure, connection, session and plugin manager.
    """
    _target_plugin_modules = r"plugin.(\w+).database.(\w+)"

    def __init__(self, connection_string, **kwargs):
        # create database engine instances based on provided configuration
        self._engines = {
            "primary": create_engine(connection_string, **kwargs),
            "memory": create_engine("sqlite:///:memory:"),
        }
        # create session factories for each created database engine
        self._factories = {engine_type: sessionmaker(bind=engine)
                           for engine_type, engine in self._engines.items()}
        # create new session for each database engine
        self._sessions = {engine_type: factory()
                          for engine_type, factory in self._factories.items()}
        # load available plugin modules and initialize/update database structure
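# A minimal construction sketch (the connection string and engine kwargs are
# illustrative; extra kwargs are passed straight through to SQLAlchemy's
# create_engine):
#
#   connector = Connector("postgresql://scraper:secret@localhost/torscraper",
#                         pool_pre_ping=True)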
from aiohttp import web
from aiohttp_apispec import setup_aiohttp_apispec, validation_middleware
from jwt.exceptions import PyJWTError, InvalidTokenError

from scraper.config import AppConfig
from scraper.control.exceptions import (
    HTTP_EXCEPTION_STATUS_MAPPING,
    CustomClientExceptionBase,
    InternalServerError,
    Unauthorized,
)
from scraper.utils import list_classes, JWTManager, list_modules
from scraper.utils.logging import ChainLoggerAdapter
from .container import Container
from .router import Router

log = ChainLoggerAdapter.from_logger_name("scraper.control.server")
access_log = ChainLoggerAdapter.from_logger_name("scraper.control.server.access")


class Server(object):
    """
    Class managing the application's control server.
    """
    _target_plugin_modules = r"plugin.(\w+).resources.(\w+)"

    def __init__(self, container: Container):
        # noinspection PyTypeChecker
        self._app = web.Application(logger=log, router=Router())
        self._app.middlewares.append(self.error_middleware)
        self._app.middlewares.append(validation_middleware)
        JWTManager.setup_middleware(self._app)
import asyncio
import pickle
from abc import ABCMeta, abstractmethod
from enum import Enum
from typing import Callable, Any

from aioredis import Channel

from scraper.control import Container
from scraper.utils.logging import ChainLoggerAdapter
from scraper.utils.logging.adapters import PreformatterAdapter
from .redis_message import RedisMessage

# noinspection PyTypeChecker
log = ChainLoggerAdapter.from_logger_name(
    "default.utils.redis_client_base")  # type: ChainLoggerAdapter
log.add_adapter(PreformatterAdapter)


class RedisClientBase(metaclass=ABCMeta):
    """
    Provides extensible class base for any custom Redis consumers.
    """

    def __init__(
            self,
            container: Container,
            topics_bare: list = None,
            prefix_in: str = None,
            prefix_out: str = None,
    ):
        self._container = container
        self._prefix_in = prefix_in or ""
        self._prefix_out = prefix_out or ""
from os import path, getenv
from typing import Any

import yaml
import schema

from scraper.utils import update_recursive
from scraper.utils.logging import ChainLoggerAdapter

log = ChainLoggerAdapter.from_logger_name("scraper.config.base")


class Base(object):
    """
    Singleton-accessible base class for any configuration.
    """
    _filename: str = None
    _contents: dict = None
    _schema: schema.Schema = None
    _get_cache = {}

    # ==========================================================================dd==
    #   INTERNAL METHODS
    # ==========================================================================dd==

    @staticmethod
    def _get_filepath(filename: str, basepath: str = None) -> str:
        basepath = basepath or getenv("TORSCRAPER_CONFIG", "")
        return basepath + f"/{filename}.yaml"

    @classmethod
    def _load_from_file(cls, filepath):
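# Path resolution sketch: with TORSCRAPER_CONFIG set to
# /opt/torscraper/config/app (the default derived in default_settings above,
# assuming an illustrative TORSCRAPER_ROOT of /opt/torscraper):
#
#   Base._get_filepath("logging")
#   # -> "/opt/torscraper/config/app/logging.yaml"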