def test_cli_journal_client_origin_visit_status(
    swh_scheduler_cfg, swh_scheduler_cfg_path,
):
    """Run the ``journal-client`` CLI against one origin-visit-status message.

    One message is produced on the origin_visit_status topic of the first
    configured broker, the CLI is invoked with ``--stop-after-objects 1``,
    and the scheduler is then queried for the matching origin visit stats.
    """
    broker = swh_scheduler_cfg["journal"]["brokers"][0]
    scheduler = get_scheduler(**swh_scheduler_cfg["scheduler"])

    kafka_producer = Producer(
        {
            "bootstrap.servers": broker,
            "client.id": "test visit-stats producer",
            "acks": "all",
        }
    )

    status = VISIT_STATUSES_1[0]
    kafka_producer.produce(
        topic="swh.journal.objects.origin_visit_status",
        key=b"bogus-origin",
        value=value_to_kafka(status),
    )
    # Make sure the message has actually been sent before running the client.
    kafka_producer.flush()

    cli_result = invoke(
        ["journal-client", "--stop-after-objects", "1"], swh_scheduler_cfg_path,
    )

    # The command must succeed and report exactly one processed message.
    assert cli_result.exit_code == 0, cli_result.output
    assert cli_result.output == "Processed 1 message(s).\nDone.\n"

    stats = scheduler.origin_visit_stats_get([(status["origin"], status["type"])])
    assert stats
    assert len(stats) == 1
def test_register_ttypes_all(
    cli_runner, mock_pkg_resources, local_sched_config, local_sched_configfile
):
    """Register task types with no filter, with ``-p all`` and with explicit plugins.

    After each invocation, both ``list-gnu-full`` and ``list-pypi`` must be
    retrievable from the scheduler with a backoff factor of 1.
    """
    register_cmd = ["--config-file", local_sched_configfile, "task-type", "register"]
    plugin_options = (
        [],
        ["-p", "all"],
        ["-p", "lister.gnu", "-p", "lister.pypi"],
    )
    expected_types = ("list-gnu-full", "list-pypi")

    for options in plugin_options:
        result = cli_runner.invoke(cli, register_cmd + options)
        assert result.exit_code == 0, traceback.print_exception(*result.exc_info)

        scheduler = get_scheduler(**local_sched_config["scheduler"])
        for type_name in expected_types:
            description = scheduler.get_task_type(type_name)
            assert description
            assert description["type"] == type_name
            assert description["backoff_factor"] == 1
def test_register_ttypes_filter(
    mock_pkg_resources, cli_runner, local_sched_config, local_sched_configfile
):
    """Register task types for the ``lister.gnu`` plugin only.

    The command must exit successfully and the ``list-gnu-full`` task type
    must then be retrievable from the scheduler with a backoff factor of 1.
    """
    command = [
        "--config-file",
        local_sched_configfile,
        "task-type",
        "register",
        "--plugins",
        "lister.gnu",
    ]
    result = cli_runner.invoke(cli, command)
    assert result.exit_code == 0, traceback.print_exception(*result.exc_info)

    scheduler = get_scheduler(**local_sched_config["scheduler"])
    for type_name in ("list-gnu-full",):
        description = scheduler.get_task_type(type_name)
        assert description
        assert description["type"] == type_name
        assert description["backoff_factor"] == 1
def get_config(config_file="web/web"):
    """Return the shared web configuration, loading it on first use.

    The configuration is stored in the module-level ``swhweb_config`` dict.
    When that dict is still empty, the named configuration file is loaded
    (merged with ``DEFAULT_CONFIG``), the ``log_dir`` folder is prepared, and
    the ``search``, ``storage``, ``vault``, ``indexer_storage`` and
    ``scheduler`` entries are replaced by instantiated clients. ``search`` is
    set to ``None`` when it has no (truthy) configuration.

    Args:
        config_file: name of the configuration to load; a non-empty
            ``SWH_CONFIG_FILENAME`` environment variable takes precedence.

    Returns:
        The populated ``swhweb_config`` dict.
    """
    if swhweb_config:
        # Already initialised by a previous call: reuse it as-is.
        return swhweb_config

    config_file = os.environ.get("SWH_CONFIG_FILENAME") or config_file
    swhweb_config.update(config.load_named_config(config_file, DEFAULT_CONFIG))
    config.prepare_folders(swhweb_config, "log_dir")

    # Swap each service's settings for a live client, in place.
    search_settings = swhweb_config.get("search")
    swhweb_config["search"] = (
        get_search(**search_settings) if search_settings else None
    )
    swhweb_config["storage"] = get_storage(**swhweb_config["storage"])
    swhweb_config["vault"] = get_vault(**swhweb_config["vault"])
    swhweb_config["indexer_storage"] = get_indexer_storage(
        **swhweb_config["indexer_storage"]
    )
    swhweb_config["scheduler"] = get_scheduler(**swhweb_config["scheduler"])
    return swhweb_config
def get_config(config_file='web/web'):
    """Return the shared web configuration, loading it on first use.

    The configuration lives in the module-level ``swhweb_config`` dict. If it
    is still empty, the named configuration file is loaded on top of
    ``DEFAULT_CONFIG``, the ``log_dir`` folder is prepared, and the
    ``storage``, ``vault``, ``indexer_storage`` and ``scheduler`` entries are
    replaced by the objects built from their settings.

    Args:
        config_file: name of the configuration to load; a non-empty
            ``SWH_CONFIG_FILENAME`` environment variable takes precedence.

    Returns:
        The populated ``swhweb_config`` dict.
    """
    if not swhweb_config:
        env_override = os.environ.get('SWH_CONFIG_FILENAME')
        if env_override:
            config_file = env_override

        loaded = config.load_named_config(config_file, DEFAULT_CONFIG)
        swhweb_config.update(loaded)
        config.prepare_folders(swhweb_config, 'log_dir')

        # Each entry's settings are replaced by the instantiated service,
        # in this fixed order.
        factories = (
            ('storage', get_storage),
            ('vault', get_vault),
            ('indexer_storage', get_indexer_storage),
            ('scheduler', get_scheduler),
        )
        for key, factory in factories:
            swhweb_config[key] = factory(**swhweb_config[key])

    return swhweb_config
def visit_scheduler_thread(
    config: Dict,
    visit_type: str,
    command_queue: Queue[object],
    exc_queue: Queue[Tuple[str, BaseException]],
):
    """Target function for the visit sending thread.

    Initializes thread-local connections (celery app and scheduler), then
    loops forever: wait until the next iteration time while listening for a
    termination message, then send visits for ``visit_type``.

    Args:
        config: configuration dict; reads the ``scheduler`` entry (required)
            and the optional ``celery`` and ``scheduling_policy`` entries.
        visit_type: visit type handled by this thread; the scheduler must
            know a ``load-<visit_type>`` task type.
        command_queue: queue polled for commands; receiving ``TERMINATE``
            makes the function return, anything else is logged and ignored.
        exc_queue: queue on which ``(visit_type, exception)`` is put when
            anything raised inside this function (``BaseException`` included).

    Any exception, including the ``ValueError`` raised for an unknown task
    type or an incomplete policy entry, is reported through ``exc_queue``
    rather than propagated.
    """
    from swh.scheduler import get_scheduler
    from swh.scheduler.celery_backend.config import build_app

    try:
        # We need to reinitialize these connections because they're not generally
        # thread-safe
        app = build_app(config.get("celery"))
        scheduler = get_scheduler(**config["scheduler"])
        task_type = scheduler.get_task_type(f"load-{visit_type}")

        if task_type is None:
            raise ValueError(f"Unknown task type: load-{visit_type}")

        # Validate the user-provided policies before merging in the defaults.
        policy_cfg = config.get("scheduling_policy", DEFAULT_POLICY_CONFIG)
        for policies in policy_cfg.values():
            for policy in policies:
                if "weight" not in policy or "policy" not in policy:
                    raise ValueError(
                        "Each policy configuration needs at least a 'policy' "
                        "and a 'weight' entry"
                    )
        policy_cfg = {**DEFAULT_POLICY_CONFIG, **policy_cfg}

        next_iteration = time.monotonic()

        while True:
            # vary the next iteration time a little bit
            next_iteration = next_iteration + splay()
            while time.monotonic() < next_iteration:
                # Wait for next iteration to start. Listen for termination message.
                try:
                    msg = command_queue.get(block=True, timeout=1)
                except Empty:
                    continue

                if msg is TERMINATE:
                    return
                else:
                    # Logger.warn is a deprecated alias (removed in Python 3.13).
                    logger.warning(
                        "Received unexpected message %s in command queue", msg
                    )

            # The sender returns the time at which the next iteration is due.
            next_iteration = send_visits_for_visit_type(
                scheduler,
                app,
                visit_type,
                task_type,
                policy_cfg.get(visit_type, policy_cfg["default"]),
            )

    except BaseException as e:
        # Hand the failure back to the main thread instead of dying silently.
        exc_queue.put((visit_type, e))
def main():
    """Import the configured celery modules, then run the ready tasks.

    A "local" scheduler backend is instantiated and handed to
    ``run_ready_tasks`` together with the celery app. If that call raises an
    ``Exception``, the backend is rolled back and the error is re-raised.
    """
    from .config import app as celery_app

    # Import every module listed in the app's CELERY_IMPORTS setting.
    for module_name in celery_app.conf.CELERY_IMPORTS:
        __import__(module_name)

    backend = get_scheduler("local")
    try:
        run_ready_tasks(backend, celery_app)
    except Exception:
        backend.rollback()
        raise
def __init__(self):
    """Load the configuration and instantiate the services built from it.

    The configuration is read through ``config.load_from_envvar`` with
    ``DEFAULT_CONFIG`` as defaults. Its ``scheduler``, ``storage`` and
    ``storage_metadata`` entries are used to build the corresponding
    clients, and a static ``tool`` description is attached to the instance.
    """
    loaded: Dict[str, Any] = config.load_from_envvar(DEFAULT_CONFIG)
    self.config: Dict[str, Any] = loaded

    self.scheduler: SchedulerInterface = get_scheduler(**loaded["scheduler"])

    # Description of this tool, carrying the package version.
    self.tool = {
        "name": "swh-deposit",
        "version": __version__,
        "configuration": {"sword_version": "2"},
    }

    self.storage: StorageInterface = get_storage(**loaded["storage"])
    self.storage_metadata: StorageInterface = get_storage(
        **loaded["storage_metadata"]
    )
def __init__(self, **config):
    """Build the cache, scheduler, storage, SMTP client and database pool.

    Args:
        **config: configuration entries. ``cache``, ``scheduler``,
            ``storage`` and ``db`` are required; ``smtp`` (defaults to no
            arguments), ``min_pool_conns`` (defaults to 1) and
            ``max_pool_conns`` (defaults to 10) are optional.
    """
    self.config = config
    self.cache = VaultCache(**config["cache"])
    self.scheduler = get_scheduler(**config["scheduler"])
    self.storage = get_storage(**config["storage"])

    smtp_settings = config.get("smtp", {})
    self.smtp_server = smtplib.SMTP(**smtp_settings)

    dsn = config["db"]
    min_conns = config.get("min_pool_conns", 1)
    max_conns = config.get("max_pool_conns", 10)
    self._pool = psycopg2.pool.ThreadedConnectionPool(
        min_conns,
        max_conns,
        dsn,
        cursor_factory=psycopg2.extras.RealDictCursor,
    )
    self._db = None
def swh_scheduler(swh_scheduler_config):
    """Return a postgresql scheduler with one test task type per task name.

    For every name in ``TASK_NAMES`` a ``swh-test-<name>`` task type is
    created, backed by ``swh.scheduler.tests.tasks.<name>``, with a default
    interval of one day bounded between six hours and twelve days.
    """
    backend = get_scheduler("postgresql", **swh_scheduler_config)
    for name in TASK_NAMES:
        task_type = {
            "type": "swh-test-{}".format(name),
            "description": "The {} testing task".format(name),
            "backend_name": "swh.scheduler.tests.tasks.{}".format(name),
            "default_interval": timedelta(days=1),
            "min_interval": timedelta(hours=6),
            "max_interval": timedelta(days=12),
        }
        backend.create_task_type(task_type)
    return backend
def __init__(self, override_config=None):
    """Load the lister configuration and set up its services.

    The configuration is parsed from ``CONFIG_BASE_FILENAME`` together with
    ``ADDITIONAL_CONFIG``; ``cache_dir`` has ``~`` expanded and is created
    when ``cache_responses`` is enabled. The storage, the scheduler and a
    SQLAlchemy engine/session (from ``config['lister']['args']['db']``) are
    then instantiated.

    Args:
        override_config: optional mapping merged on top of the parsed
            configuration.
    """
    self.backoff = self.INITIAL_BACKOFF
    # Pass arguments to the logger instead of pre-formatting the message, so
    # the string is only built when DEBUG logging is actually enabled.
    logger.debug('Loading config from %s', self.CONFIG_BASE_FILENAME)
    self.config = self.parse_config_file(
        base_filename=self.CONFIG_BASE_FILENAME,
        additional_configs=[self.ADDITIONAL_CONFIG],
    )
    self.config['cache_dir'] = os.path.expanduser(self.config['cache_dir'])
    if self.config['cache_responses']:
        config.prepare_folders(self.config, 'cache_dir')

    # NOTE(review): overrides are applied after the cache_dir handling above,
    # so an overridden 'cache_dir' is neither expanded nor created here --
    # confirm this ordering is intended.
    if override_config:
        self.config.update(override_config)

    logger.debug('%s CONFIG=%s', self, self.config)
    self.storage = get_storage(**self.config['storage'])
    self.scheduler = get_scheduler(**self.config['scheduler'])
    self.db_engine = create_engine(self.config['lister']['args']['db'])
    self.mk_session = sessionmaker(bind=self.db_engine)
    self.db_session = self.mk_session()
def cli(ctx, config_file, database, url, no_stdout):
    """Software Heritage Scheduler tools.

    Use a local scheduler instance by default (plugged to the main scheduler
    db).
    """
    try:
        from psycopg2 import OperationalError
    except ImportError:
        # psycopg2 is not installed: define a stand-in so that the except
        # clause below stays valid.
        class OperationalError(Exception):
            pass

    from swh.core import config
    from swh.scheduler import DEFAULT_CONFIG, get_scheduler

    ctx.ensure_object(dict)
    log = logging.getLogger(__name__)

    conf = config.read(config_file, DEFAULT_CONFIG)
    if "scheduler" not in conf:
        raise ValueError("missing 'scheduler' configuration")

    # Command-line options take precedence over the configuration file;
    # a database option wins over a url option.
    scheduler_conf = conf["scheduler"]
    if database:
        scheduler_conf["cls"] = "postgresql"
        scheduler_conf["db"] = database
    elif url:
        scheduler_conf["cls"] = "remote"
        scheduler_conf["url"] = url

    try:
        log.debug("Instantiating scheduler with %s", scheduler_conf)
        instance = get_scheduler(**scheduler_conf)
    except (ValueError, OperationalError):
        # it's the subcommand to decide whether not having a proper
        # scheduler instance is a problem.
        instance = None

    ctx.obj["scheduler"] = instance
    ctx.obj["config"] = conf
def from_config(cls, scheduler: Dict[str, Any], **config: Any):
    """Build a lister from a configuration dict.

    This is basically a backwards-compatibility shim for the CLI.

    Args:
        scheduler: instantiation config for the scheduler
        config: remaining configuration for the lister. The legacy
            ``storage``, ``lister`` and ``celery`` keys are discarded; every
            other entry is forwarded to the lister constructor as a keyword
            argument.

    Returns:
        the instantiated lister
    """
    # Keys that are not used for this generation of listers.
    legacy_keys = ("storage", "lister", "celery")
    lister_kwargs = {
        key: value for key, value in config.items() if key not in legacy_keys
    }

    scheduler_instance = get_scheduler(**scheduler)
    return cls(scheduler=scheduler_instance, **lister_kwargs)
def deposit_autoconfig(deposit_config_path):
    """Enforce config for deposit classes inherited from APIConfig.

    Reads the configuration at ``deposit_config_path``; when it has a
    ``scheduler`` entry, the ``check-deposit`` and ``load-deposit`` task
    types are created in that scheduler.
    """
    cfg = read(deposit_config_path)
    if "scheduler" not in cfg:
        return

    # scheduler setup: require the check-deposit and load-deposit tasks
    scheduler = get_scheduler(**cfg["scheduler"])
    check_deposit = {
        "type": "check-deposit",
        "backend_name": "swh.deposit.loader.tasks.ChecksDepositTsk",
        "description": "Check deposit metadata/archive before loading",
        "num_retries": 3,
    }
    load_deposit = {
        "type": "load-deposit",
        "backend_name": "swh.loader.package.deposit.tasks.LoadDeposit",
        "description": "Loading deposit archive into swh archive",
        "num_retries": 3,
    }
    for definition in (check_deposit, load_deposit):
        scheduler.create_task_type(definition)
def get_global_scheduler():
    """Return the module-level scheduler, creating it on first use.

    The instance is built from ``app.config["scheduler"]`` and stored in the
    ``scheduler`` global so later calls reuse it.
    """
    global scheduler
    if scheduler:
        return scheduler
    scheduler = get_scheduler(**app.config["scheduler"])
    return scheduler
result = event["result"] status = None if isinstance(result, dict) and "status" in result: status = result["status"] if status == "success": status = "eventful" if result.get("eventful") else "uneventful" if status is None: status = "eventful" if result else "uneventful" scheduler_backend.end_task_run(uuid, timestamp=utcnow(), status=status, result=result) elif event_type == "task-failed": scheduler_backend.end_task_run(uuid, timestamp=utcnow(), status="failed") if __name__ == "__main__": url = sys.argv[1] logging.basicConfig(level=logging.DEBUG) scheduler_backend = get_scheduler("local", args={"db": "service=swh-scheduler"}) channel = get_listener(url, "celeryev.test", scheduler_backend) logger.info("Start consuming") channel.start_consuming()
def test_init_get_scheduler_deprecation_warning(
    class_name, expected_class, kwargs, mock_psycopg2
):
    """Passing the settings through ``args=`` must emit a DeprecationWarning.

    The returned object must still be an instance of ``expected_class``.
    """
    with pytest.warns(DeprecationWarning):
        scheduler = get_scheduler(class_name, args=kwargs)

    assert isinstance(scheduler, expected_class)
def test_init_get_scheduler(class_name, expected_class, kwargs, mock_psycopg2):
    """``get_scheduler`` must return the expected concrete scheduler class.

    The instance must also satisfy ``isinstance`` against
    ``SchedulerInterface``.
    """
    scheduler = get_scheduler(class_name, **kwargs)

    for required_type in (expected_class, SchedulerInterface):
        assert isinstance(scheduler, required_type)
def test_init_get_scheduler_failure():
    """An unknown scheduler class name must raise a matching ValueError."""
    expect_unknown_class = pytest.raises(ValueError, match="Unknown Scheduler class")
    with expect_unknown_class:
        get_scheduler("unknown-scheduler-storage")
def swh_sched(swh_sched_config):
    """Return a scheduler built from the ``scheduler`` entry of the config."""
    scheduler_settings = swh_sched_config["scheduler"]
    return get_scheduler(**scheduler_settings)
result = event["result"] status = None if isinstance(result, dict) and "status" in result: status = result["status"] if status == "success": status = "eventful" if result.get("eventful") else "uneventful" if status is None: status = "eventful" if result else "uneventful" scheduler_backend.end_task_run(uuid, timestamp=utcnow(), status=status, result=result) elif event_type == "task-failed": scheduler_backend.end_task_run(uuid, timestamp=utcnow(), status="failed") if __name__ == "__main__": url = sys.argv[1] logging.basicConfig(level=logging.DEBUG) scheduler_backend = get_scheduler("postgresql", args={"db": "service=swh-scheduler"}) channel = get_listener(url, "celeryev.test", scheduler_backend) logger.info("Start consuming") channel.start_consuming()