Пример #1
0
    async def create(app):
        """Construct a Canceller and start its background cancellation loops.

        Returns the new Canceller after all of its loops have been handed
        to its task manager.
        """
        canceller = Canceller(app)

        # Each (loop name, change event, loop body) triple drives one
        # run_if_changed worker wrapped in retry_long_running.
        cancel_loops = (
            ('cancel_cancelled_ready_jobs_loop',
             canceller.cancel_ready_state_changed,
             canceller.cancel_cancelled_ready_jobs_loop_body),
            ('cancel_cancelled_creating_jobs_loop',
             canceller.cancel_creating_state_changed,
             canceller.cancel_cancelled_creating_jobs_loop_body),
            ('cancel_cancelled_running_jobs_loop',
             canceller.cancel_running_state_changed,
             canceller.cancel_cancelled_running_jobs_loop_body),
        )
        for loop_name, changed_event, loop_body in cancel_loops:
            canceller.task_manager.ensure_future(
                retry_long_running(loop_name, run_if_changed,
                                   changed_event, loop_body))

        # Orphaned attempts are reaped on a fixed 60s cadence.
        canceller.task_manager.ensure_future(
            periodically_call(60, canceller.cancel_orphaned_attempts_loop_body))

        return canceller
Пример #2
0
    async def async_init(self):
        """Finish async setup: reload surviving instances and start loops."""
        log.info(f'initializing {self}')

        await super().async_init()

        # Re-register every non-removed instance belonging to this
        # collection; these survived a previous driver incarnation.
        rows = self.db.select_and_fetchall(
            'SELECT * FROM instances WHERE removed = 0 AND inst_coll = %s;',
            (self.name, ))
        async for row in rows:
            self.add_instance(Instance.from_record(self.app, self, row))

        # Instance-creation loop, woken by create_instances_state_changed.
        self.task_manager.ensure_future(
            retry_long_running('create_instances_loop', run_if_changed,
                               self.create_instances_state_changed,
                               self.create_instances_loop_body))

        # Scheduling loop, woken by scheduler_state_changed.
        self.task_manager.ensure_future(
            retry_long_running('schedule_jobs_loop', run_if_changed,
                               self.scheduler_state_changed,
                               self.schedule_jobs_loop_body))

        # Safety net: bump the scheduler every 15s even without events.
        self.task_manager.ensure_future(
            periodically_call(15, self.bump_scheduler))
Пример #3
0
    async def async_init(self):
        """Start this canceller's background loops on its task manager."""
        # One run_if_changed worker per job phase that can be cancelled.
        for loop_name, changed_event, loop_body in (
            ('cancel_cancelled_ready_jobs_loop',
             self.cancel_ready_state_changed,
             self.cancel_cancelled_ready_jobs_loop_body),
            ('cancel_cancelled_creating_jobs_loop',
             self.cancel_creating_state_changed,
             self.cancel_cancelled_creating_jobs_loop_body),
            ('cancel_cancelled_running_jobs_loop',
             self.cancel_running_state_changed,
             self.cancel_cancelled_running_jobs_loop_body),
        ):
            self.task_manager.ensure_future(
                retry_long_running(loop_name, run_if_changed,
                                   changed_event, loop_body))

        # Periodic sweep for attempts whose jobs no longer need them.
        self.task_manager.ensure_future(
            periodically_call(60, self.cancel_orphaned_attempts_loop_body))
Пример #4
0
async def on_startup(app):
    """Initialize shared clients in ``app`` and start monitoring loops."""
    db = Database()
    await db.async_init()
    app['db'] = db

    app['client_session'] = httpx.client_session()

    # One service-account credential is shared by all Google clients.
    credentials = aiogoogle.GoogleCredentials.from_file(
        '/billing-monitoring-gsa-key/key.json')

    app['bigquery_client'] = aiogoogle.GoogleBigQueryClient(
        'broad-ctsa', credentials=credentials)

    compute_client = aiogoogle.GoogleComputeClient(
        PROJECT, credentials=credentials)
    app['compute_client'] = compute_client

    query_billing_event = asyncio.Event()
    app['query_billing_event'] = query_billing_event

    # Resolve the zones served by each configured region, one GET per region.
    region_info = {
        name: await compute_client.get(f'/regions/{name}')
        for name in BATCH_GCP_REGIONS
    }
    app['zones'] = [
        url_basename(zone)
        for info in region_info.values()
        for zone in info['zones']
    ]

    task_manager = aiotools.BackgroundTaskManager()
    app['task_manager'] = task_manager

    task_manager.ensure_future(
        retry_long_running('polling_loop', polling_loop, app))
    task_manager.ensure_future(
        retry_long_running('query_billing_loop', run_if_changed_idempotent,
                           query_billing_event, query_billing_body, app))
    task_manager.ensure_future(
        periodically_call(60, monitor_disks, app))
    task_manager.ensure_future(
        periodically_call(60, monitor_instances, app))
Пример #5
0
    def __init__(
        self,
        app,
        db: Database,  # BORROWED
        inst_coll_manager: InstanceCollectionManager,
        resource_manager: CloudResourceManager,
        machine_name_prefix: str,
        config: JobPrivateInstanceManagerConfig,
        task_manager: aiotools.BackgroundTaskManager,
    ):
        """Set up a job-private-instance collection and start its loops."""
        super().__init__(
            db,
            inst_coll_manager,
            resource_manager,
            config.cloud,
            config.name,
            machine_name_prefix,
            is_pool=False,
            max_instances=config.max_instances,
            max_live_instances=config.max_live_instances,
            task_manager=task_manager,
        )
        self.app = app

        # Instance creation is driven by the batch-wide scheduler notice;
        # scheduling onto those instances by a private event.
        scheduler_notice: Notice = self.app['scheduler_state_changed']
        self.create_instances_state_changed = scheduler_notice.subscribe()
        self.scheduler_state_changed = asyncio.Event()

        self.async_worker_pool: AsyncWorkerPool = app['async_worker_pool']
        self.exceeded_shares_counter = ExceededSharesCounter()
        self.boot_disk_size_gb = config.boot_disk_size_gb

        task_manager.ensure_future(
            retry_long_running('create_instances_loop', run_if_changed,
                               self.create_instances_state_changed,
                               self.create_instances_loop_body))
        task_manager.ensure_future(
            retry_long_running('schedule_jobs_loop', run_if_changed,
                               self.scheduler_state_changed,
                               self.schedule_jobs_loop_body))
        task_manager.ensure_future(periodically_call(15, self.bump_scheduler))
Пример #6
0
async def on_startup(app):
    """Initialize the batch driver's shared state and background loops.

    Populates ``app`` with database handles, Kubernetes and cloud clients,
    scheduler/cancellation events, and the cloud driver, then schedules
    periodic monitoring tasks on the background task manager.
    """
    task_manager = aiotools.BackgroundTaskManager()
    app['task_manager'] = task_manager

    app['client_session'] = httpx.client_session()

    # In-cluster Kubernetes access, wrapped in a cache to limit API load.
    kubernetes_asyncio.config.load_incluster_config()
    app['k8s_client'] = kubernetes_asyncio.client.CoreV1Api()
    app['k8s_cache'] = K8sCache(app['k8s_client'])

    db = Database()
    await db.async_init(maxsize=50)
    app['db'] = db

    # Deployment-wide settings stored in the `globals` table.
    row = await db.select_and_fetchone('''
SELECT instance_id, internal_token, frozen FROM globals;
''')

    instance_id = row['instance_id']
    log.info(f'instance_id {instance_id}')
    app['instance_id'] = instance_id

    app['internal_token'] = row['internal_token']

    # Bearer header reused for internal requests to the batch service.
    app['batch_headers'] = {'Authorization': f'Bearer {row["internal_token"]}'}

    app['frozen'] = row['frozen']

    # Events/notices used to wake the scheduling and cancellation loops.
    scheduler_state_changed = Notice()
    app['scheduler_state_changed'] = scheduler_state_changed

    cancel_ready_state_changed = asyncio.Event()
    app['cancel_ready_state_changed'] = cancel_ready_state_changed

    cancel_creating_state_changed = asyncio.Event()
    app['cancel_creating_state_changed'] = cancel_creating_state_changed

    cancel_running_state_changed = asyncio.Event()
    app['cancel_running_state_changed'] = cancel_running_state_changed

    async_worker_pool = AsyncWorkerPool(100, queue_size=100)
    app['async_worker_pool'] = async_worker_pool

    credentials_file = '/gsa-key/key.json'
    fs = get_cloud_async_fs(credentials_file=credentials_file)
    app['file_store'] = FileStore(fs, BATCH_STORAGE_URI, instance_id)

    inst_coll_configs = await InstanceCollectionConfigs.create(db)

    # The cloud-specific driver owns instance collections and their loops.
    app['driver'] = await get_cloud_driver(app, db, MACHINE_NAME_PREFIX,
                                           DEFAULT_NAMESPACE,
                                           inst_coll_configs, credentials_file,
                                           task_manager)

    canceller = await Canceller.create(app)
    app['canceller'] = canceller

    app['check_incremental_error'] = None
    app['check_resource_aggregation_error'] = None

    # Invariant checks are expensive; run them only when explicitly enabled.
    if HAIL_SHOULD_CHECK_INVARIANTS:
        task_manager.ensure_future(
            periodically_call(10, check_incremental, app, db))
        task_manager.ensure_future(
            periodically_call(10, check_resource_aggregation, app, db))

    task_manager.ensure_future(
        periodically_call(10, monitor_billing_limits, app))
    task_manager.ensure_future(
        periodically_call(10, cancel_fast_failing_batches, app))
    task_manager.ensure_future(
        periodically_call(60, scheduling_cancelling_bump, app))
    task_manager.ensure_future(periodically_call(15, monitor_system, app))
Пример #7
0
async def on_startup(app):
    """Initialize the batch driver: clients, shared state, and loops.

    Fills ``app`` with database/Kubernetes/Google clients, scheduler and
    cancellation events, and the instance-collection machinery, then
    starts the periodic monitoring tasks.
    """
    app['task_manager'] = aiotools.BackgroundTaskManager()
    pool = concurrent.futures.ThreadPoolExecutor()
    app['blocking_pool'] = pool

    # In-cluster Kubernetes access with a short-lived (5s) cache.
    kube.config.load_incluster_config()
    k8s_client = kube.client.CoreV1Api()
    k8s_cache = K8sCache(k8s_client, refresh_time=5)
    app['k8s_cache'] = k8s_cache

    db = Database()
    await db.async_init(maxsize=50)
    app['db'] = db

    # Deployment-wide settings stored in the `globals` table.
    row = await db.select_and_fetchone('''
SELECT instance_id, internal_token FROM globals;
''')

    instance_id = row['instance_id']
    log.info(f'instance_id {instance_id}')
    app['instance_id'] = instance_id

    app['internal_token'] = row['internal_token']

    # Bearer header reused for internal requests to the batch service.
    app['batch_headers'] = {'Authorization': f'Bearer {row["internal_token"]}'}

    resources = db.select_and_fetchall('SELECT resource FROM resources;')

    app['resources'] = [record['resource'] async for record in resources]

    aiogoogle_credentials = aiogoogle.Credentials.from_file(
        '/gsa-key/key.json')
    compute_client = aiogoogle.ComputeClient(PROJECT,
                                             credentials=aiogoogle_credentials)
    app['compute_client'] = compute_client

    logging_client = aiogoogle.LoggingClient(
        credentials=aiogoogle_credentials,
        # The project-wide logging quota is 60 request/m.  The event
        # loop sleeps 15s per iteration, so the max rate is 4
        # iterations/m.  Note, the event loop could make multiple
        # logging requests per iteration, so these numbers are not
        # quite comparable.  I didn't want to consume the entire quota
        # since there will be other users of the logging API (us at
        # the web console, test deployments, etc.)
        rate_limit=RateLimit(10, 60),
    )
    app['logging_client'] = logging_client

    # Events/notices used to wake the scheduling and cancellation loops.
    scheduler_state_changed = Notice()
    app['scheduler_state_changed'] = scheduler_state_changed

    cancel_ready_state_changed = asyncio.Event()
    app['cancel_ready_state_changed'] = cancel_ready_state_changed

    cancel_creating_state_changed = asyncio.Event()
    app['cancel_creating_state_changed'] = cancel_creating_state_changed

    cancel_running_state_changed = asyncio.Event()
    app['cancel_running_state_changed'] = cancel_running_state_changed

    async_worker_pool = AsyncWorkerPool(100, queue_size=100)
    app['async_worker_pool'] = async_worker_pool

    credentials = google.oauth2.service_account.Credentials.from_service_account_file(
        '/gsa-key/key.json')
    log_store = LogStore(BATCH_BUCKET_NAME,
                         instance_id,
                         pool,
                         credentials=credentials)
    app['log_store'] = log_store

    # Instance-collection machinery; each piece is stored in `app` and
    # then async-initialized, since later pieces read earlier ones.
    zone_monitor = ZoneMonitor(app)
    app['zone_monitor'] = zone_monitor
    await zone_monitor.async_init()

    inst_coll_configs = InstanceCollectionConfigs(app)
    await inst_coll_configs.async_init()

    inst_coll_manager = InstanceCollectionManager(app, MACHINE_NAME_PREFIX)
    app['inst_coll_manager'] = inst_coll_manager
    await inst_coll_manager.async_init(inst_coll_configs)

    canceller = Canceller(app)
    app['canceller'] = canceller
    await canceller.async_init()

    gce_event_monitor = GCEEventMonitor(app, MACHINE_NAME_PREFIX)
    app['gce_event_monitor'] = gce_event_monitor
    await gce_event_monitor.async_init()

    app['check_incremental_error'] = None
    app['check_resource_aggregation_error'] = None

    # Invariant checks are expensive; run them only when explicitly enabled.
    if HAIL_SHOULD_CHECK_INVARIANTS:
        app['task_manager'].ensure_future(
            periodically_call(10, check_incremental, app, db))
        app['task_manager'].ensure_future(
            periodically_call(10, check_resource_aggregation, app, db))

    app['task_manager'].ensure_future(
        periodically_call(10, monitor_billing_limits, app))

    app['task_manager'].ensure_future(
        periodically_call(10, cancel_fast_failing_batches, app))

    app['task_manager'].ensure_future(
        periodically_call(60, scheduling_cancelling_bump, app))
Пример #8
0
    async def create(
            app,
            db: Database,  # BORROWED
            machine_name_prefix: str,
            namespace: str,
            inst_coll_configs: InstanceCollectionConfigs,
            credentials_file: str,
            task_manager: aiotools.BackgroundTaskManager,  # BORROWED
    ) -> 'GCPDriver':
        """Build a fully-wired GCPDriver and start its maintenance loops."""
        gcp_config = get_gcp_config()
        project = gcp_config.project

        compute_client = aiogoogle.GoogleComputeClient(
            project, credentials_file=credentials_file)

        activity_logs_client = aiogoogle.GoogleLoggingClient(
            credentials_file=credentials_file,
            # Stay well under the project-wide logging quota of 60
            # requests/minute: the event loop iterates at most ~4 times a
            # minute (15s sleep) but may issue several logging requests
            # per iteration, and other consumers (web console, test
            # deployments, etc.) share the same quota.
            rate_limit=RateLimit(10, 60),
        )

        zone_monitor = await ZoneMonitor.create(
            compute_client, gcp_config.regions, gcp_config.zone)
        billing_manager = await GCPBillingManager.create(db)
        inst_coll_manager = InstanceCollectionManager(
            db, machine_name_prefix, zone_monitor)
        resource_manager = GCPResourceManager(
            project, compute_client, billing_manager)

        # Bring up the job-private-instance manager and every configured
        # pool concurrently.
        pool_coros = [
            Pool.create(app, db, inst_coll_manager, resource_manager,
                        machine_name_prefix, pool_config,
                        app['async_worker_pool'], task_manager)
            for pool_config in inst_coll_configs.name_pool_config.values()
        ]
        jpim, *_ = await asyncio.gather(
            JobPrivateInstanceManager.create(
                app, db, inst_coll_manager, resource_manager,
                machine_name_prefix, inst_coll_configs.jpim_config,
                task_manager),
            *pool_coros)

        driver = GCPDriver(
            db,
            machine_name_prefix,
            compute_client,
            activity_logs_client,
            project,
            namespace,
            zone_monitor,
            inst_coll_manager,
            jpim,
            billing_manager,
        )

        # Periodic maintenance: activity-log processing, quota refresh,
        # orphaned-disk clean-up, and billing-resource refresh.
        for interval_s, callback in (
            (15, driver.process_activity_logs),
            (60, zone_monitor.update_region_quotas),
            (60, driver.delete_orphaned_disks),
            (300, billing_manager.refresh_resources),
        ):
            task_manager.ensure_future(periodically_call(interval_s, callback))

        return driver
Пример #9
0
    async def create(
            app,
            db: Database,  # BORROWED
            machine_name_prefix: str,
            namespace: str,
            inst_coll_configs: InstanceCollectionConfigs,
            credentials_file: str,
            task_manager: aiotools.BackgroundTaskManager,  # BORROWED
    ) -> 'AzureDriver':
        """Build a fully-wired AzureDriver and start its maintenance loops.

        Constructs the Azure API clients, the billing and
        instance-collection managers, and all configured pools, then
        schedules periodic clean-up tasks on ``task_manager``.
        """
        azure_config = get_azure_config()
        subscription_id = azure_config.subscription_id
        resource_group = azure_config.resource_group
        region = azure_config.region
        regions = [region]

        # Public key baked into provisioned worker VMs.
        with open(os.environ['HAIL_SSH_PUBLIC_KEY'], encoding='utf-8') as f:
            ssh_public_key = f.read()

        arm_client = aioazure.AzureResourceManagerClient(
            subscription_id, resource_group, credentials_file=credentials_file)
        compute_client = aioazure.AzureComputeClient(
            subscription_id, resource_group, credentials_file=credentials_file)
        resources_client = aioazure.AzureResourcesClient(
            subscription_id, credentials_file=credentials_file)
        network_client = aioazure.AzureNetworkClient(
            subscription_id, resource_group, credentials_file=credentials_file)
        pricing_client = aioazure.AzurePricingClient()

        region_monitor = await RegionMonitor.create(region)
        billing_manager = await AzureBillingManager.create(
            db, pricing_client, regions)
        inst_coll_manager = InstanceCollectionManager(db, machine_name_prefix,
                                                      region_monitor)
        resource_manager = AzureResourceManager(subscription_id,
                                                resource_group, ssh_public_key,
                                                arm_client, compute_client,
                                                billing_manager)

        # Start the job-private-instance manager and all configured pools
        # concurrently.
        create_pools_coros = [
            Pool.create(
                app,
                db,
                inst_coll_manager,
                resource_manager,
                machine_name_prefix,
                config,
                app['async_worker_pool'],
                task_manager,
            ) for pool_name, config in
            inst_coll_configs.name_pool_config.items()
        ]

        jpim, *_ = await asyncio.gather(
            JobPrivateInstanceManager.create(
                app,
                db,
                inst_coll_manager,
                resource_manager,
                machine_name_prefix,
                inst_coll_configs.jpim_config,
                task_manager,
            ),
            *create_pools_coros,
        )

        driver = AzureDriver(
            db,
            machine_name_prefix,
            arm_client,
            compute_client,
            resources_client,
            network_client,
            pricing_client,
            subscription_id,
            resource_group,
            namespace,
            region_monitor,
            inst_coll_manager,
            jpim,
            billing_manager,
        )

        # Periodic clean-up of leaked Azure resources and price refresh.
        task_manager.ensure_future(
            periodically_call(60, driver.delete_orphaned_nics))
        task_manager.ensure_future(
            periodically_call(60, driver.delete_orphaned_public_ips))
        task_manager.ensure_future(
            periodically_call(60, driver.delete_completed_deployments))
        task_manager.ensure_future(
            periodically_call(
                300, billing_manager.refresh_resources_from_retail_prices))

        return driver