Exemplo n.º 1
0
    async def create(
        app,
        db: Database,  # BORROWED
        inst_coll_manager: InstanceCollectionManager,
        resource_manager: CloudResourceManager,
        machine_name_prefix: str,
        config: JobPrivateInstanceManagerConfig,
        task_manager: aiotools.BackgroundTaskManager,
    ):
        jpim = JobPrivateInstanceManager(
            app, db, inst_coll_manager, resource_manager, machine_name_prefix, config, task_manager
        )

        log.info(f'initializing {jpim}')

        async for record in db.select_and_fetchall(
            '''
SELECT instances.*, instances_free_cores_mcpu.free_cores_mcpu
FROM instances
INNER JOIN instances_free_cores_mcpu
ON instances.name = instances_free_cores_mcpu.name
WHERE removed = 0 AND inst_coll = %s;
''',
            (jpim.name,),
        ):
            jpim.add_instance(Instance.from_record(app, jpim, record))

        return jpim
Exemplo n.º 2
0
    async def create(
        app,
        db: Database,  # BORROWED
        inst_coll_manager: InstanceCollectionManager,
        resource_manager: CloudResourceManager,
        machine_name_prefix: str,
        config: PoolConfig,
        async_worker_pool: AsyncWorkerPool,  # BORROWED
        task_manager: aiotools.BackgroundTaskManager,
    ) -> 'Pool':
        pool = Pool(app, db, inst_coll_manager, resource_manager,
                    machine_name_prefix, config, async_worker_pool,
                    task_manager)
        log.info(f'initializing {pool}')

        async for record in db.select_and_fetchall(
                '''
SELECT instances.*, instances_free_cores_mcpu.free_cores_mcpu
FROM instances
INNER JOIN instances_free_cores_mcpu
ON instances.name = instances_free_cores_mcpu.name
WHERE removed = 0 AND inst_coll = %s;
''',
            (pool.name, ),
        ):
            pool.add_instance(Instance.from_record(app, pool, record))

        return pool
Exemplo n.º 3
0
async def on_startup(app):
    """Populate ``app`` with the shared objects created at startup.

    In order, this stores under ``app``: a thread pool (``blocking_pool``), a
    Kubernetes cache (``k8s_cache``), the database handle (``db``), the values
    read from the ``globals`` table (``worker_type``, ``worker_cores``,
    ``worker_disk_size_gb``, ``instance_id``, ``internal_token``), the list of
    resource names (``resources``), the compute and logging clients, three
    ``asyncio.Event`` objects, the log store, the instance pool and the
    scheduler.  The instance pool and the scheduler are constructed from
    ``app`` itself, so they are created only after everything else is stored.
    """
    blocking_pool = concurrent.futures.ThreadPoolExecutor()
    app['blocking_pool'] = blocking_pool

    kube.config.load_incluster_config()
    app['k8s_cache'] = K8sCache(kube.client.CoreV1Api(), refresh_time=5)

    db = Database()
    await db.async_init(maxsize=50)
    app['db'] = db

    globals_row = await db.select_and_fetchone('''
SELECT worker_type, worker_cores, worker_disk_size_gb,
  instance_id, internal_token FROM globals;
''')

    # Worker settings are copied into app under the same names as their columns.
    for column in ('worker_type', 'worker_cores', 'worker_disk_size_gb'):
        app[column] = globals_row[column]

    instance_id = globals_row['instance_id']
    log.info(f'instance_id {instance_id}')
    app['instance_id'] = instance_id
    app['internal_token'] = globals_row['internal_token']

    resource_names = []
    async for record in db.select_and_fetchall('SELECT resource FROM resources;'):
        resource_names.append(record['resource'])
    app['resources'] = resource_names

    machine_name_prefix = f'batch-worker-{DEFAULT_NAMESPACE}-'

    # Both credential objects below are loaded from the same key file.
    key_file = '/gsa-key/key.json'

    aiogoogle_credentials = aiogoogle.Credentials.from_file(key_file)
    app['compute_client'] = aiogoogle.ComputeClient(
        PROJECT, credentials=aiogoogle_credentials)

    # The project-wide logging quota is 60 requests/minute.  The event loop
    # sleeps 15s per iteration, i.e. at most 4 iterations/minute; an
    # iteration may issue several logging requests, so the two figures are
    # not directly comparable.  The limit is kept below the full quota
    # because the logging API has other users (the web console, test
    # deployments, etc.).
    app['logging_client'] = aiogoogle.LoggingClient(
        credentials=aiogoogle_credentials,
        rate_limit=RateLimit(10, 60))

    for event_key in (
        'scheduler_state_changed',
        'cancel_ready_state_changed',
        'cancel_running_state_changed',
    ):
        app[event_key] = asyncio.Event()

    gsa_credentials = google.oauth2.service_account.Credentials.from_service_account_file(
        key_file)
    app['log_store'] = LogStore(
        BATCH_BUCKET_NAME,
        WORKER_LOGS_BUCKET_NAME,
        instance_id,
        blocking_pool,
        credentials=gsa_credentials,
    )

    # The instance pool is stored in app before its async_init runs ...
    inst_pool = InstancePool(app, machine_name_prefix)
    app['inst_pool'] = inst_pool
    await inst_pool.async_init()

    # ... whereas the scheduler is stored only after its async_init completes.
    scheduler = Scheduler(app)
    await scheduler.async_init()
    app['scheduler'] = scheduler
Exemplo n.º 4
0
Arquivo: main.py Projeto: saponas/hail
async def on_startup(app):
    """Populate ``app`` with shared state and start the periodic tasks.

    Stores, in order: the background task manager, a thread pool, the
    Kubernetes cache, the database handle, ``instance_id`` /
    ``internal_token`` / ``batch_headers`` derived from the ``globals`` table,
    the resource names, the compute and logging clients, the state-change
    notifiers, the async worker pool and the log store.  It then constructs
    and ``async_init``s the zone monitor, instance collection configs,
    instance collection manager, canceller and GCE event monitor, and finally
    schedules the periodic tasks on the task manager.
    """
    app['task_manager'] = aiotools.BackgroundTaskManager()

    blocking_pool = concurrent.futures.ThreadPoolExecutor()
    app['blocking_pool'] = blocking_pool

    kube.config.load_incluster_config()
    app['k8s_cache'] = K8sCache(kube.client.CoreV1Api(), refresh_time=5)

    db = Database()
    await db.async_init(maxsize=50)
    app['db'] = db

    globals_row = await db.select_and_fetchone('''
SELECT instance_id, internal_token FROM globals;
''')

    instance_id = globals_row['instance_id']
    log.info(f'instance_id {instance_id}')
    app['instance_id'] = instance_id

    internal_token = globals_row['internal_token']
    app['internal_token'] = internal_token
    app['batch_headers'] = {'Authorization': f'Bearer {internal_token}'}

    resource_names = []
    async for record in db.select_and_fetchall('SELECT resource FROM resources;'):
        resource_names.append(record['resource'])
    app['resources'] = resource_names

    # Both credential objects below are loaded from the same key file.
    key_file = '/gsa-key/key.json'

    aiogoogle_credentials = aiogoogle.Credentials.from_file(key_file)
    app['compute_client'] = aiogoogle.ComputeClient(
        PROJECT, credentials=aiogoogle_credentials)

    # The project-wide logging quota is 60 requests/minute.  The event loop
    # sleeps 15s per iteration, i.e. at most 4 iterations/minute; an
    # iteration may issue several logging requests, so the two figures are
    # not directly comparable.  The limit is kept below the full quota
    # because the logging API has other users (the web console, test
    # deployments, etc.).
    app['logging_client'] = aiogoogle.LoggingClient(
        credentials=aiogoogle_credentials,
        rate_limit=RateLimit(10, 60),
    )

    app['scheduler_state_changed'] = Notice()
    for event_key in (
        'cancel_ready_state_changed',
        'cancel_creating_state_changed',
        'cancel_running_state_changed',
    ):
        app[event_key] = asyncio.Event()

    app['async_worker_pool'] = AsyncWorkerPool(100, queue_size=100)

    gsa_credentials = google.oauth2.service_account.Credentials.from_service_account_file(
        key_file)
    app['log_store'] = LogStore(
        BATCH_BUCKET_NAME,
        instance_id,
        blocking_pool,
        credentials=gsa_credentials,
    )

    # Each component below is constructed from app, so it is created only
    # after the state above is in place.  Those stored in app are stored
    # before their async_init is awaited.
    zone_monitor = ZoneMonitor(app)
    app['zone_monitor'] = zone_monitor
    await zone_monitor.async_init()

    # The configs object is not stored in app; it is only handed to the
    # instance collection manager's async_init.
    inst_coll_configs = InstanceCollectionConfigs(app)
    await inst_coll_configs.async_init()

    inst_coll_manager = InstanceCollectionManager(app, MACHINE_NAME_PREFIX)
    app['inst_coll_manager'] = inst_coll_manager
    await inst_coll_manager.async_init(inst_coll_configs)

    canceller = Canceller(app)
    app['canceller'] = canceller
    await canceller.async_init()

    gce_event_monitor = GCEEventMonitor(app, MACHINE_NAME_PREFIX)
    app['gce_event_monitor'] = gce_event_monitor
    await gce_event_monitor.async_init()

    for error_key in ('check_incremental_error', 'check_resource_aggregation_error'):
        app[error_key] = None

    # Periodic tasks: the first argument to periodically_call is 10 for all
    # but the last, which uses 60.
    spawn = app['task_manager'].ensure_future

    if HAIL_SHOULD_CHECK_INVARIANTS:
        spawn(periodically_call(10, check_incremental, app, db))
        spawn(periodically_call(10, check_resource_aggregation, app, db))

    spawn(periodically_call(10, monitor_billing_limits, app))
    spawn(periodically_call(10, cancel_fast_failing_batches, app))
    spawn(periodically_call(60, scheduling_cancelling_bump, app))