Example #1
File: clients.py Project: chrisvittal/hail
def get_cloud_async_fs(credentials_file: Optional[str] = None) -> AsyncFS:
    if credentials_file is None:
        credentials_file = '/gsa-key/key.json'

    cloud = get_global_config()['cloud']

    if cloud == 'azure':
        return aioazure.AzureAsyncFS(credential_file=credentials_file)

    assert cloud == 'gcp', cloud
    return aiogoogle.GoogleStorageAsyncFS(credentials_file=credentials_file)
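
This getter dispatches on get_global_config()['cloud'] and falls back to the conventional key mount path. A minimal usage sketch (assuming an async context, and assuming the returned AsyncFS exposes awaitable read and close methods):

import asyncio

async def read_object(url: str) -> bytes:
    # get_cloud_async_fs() returns the Azure or Google implementation
    # depending on get_global_config()['cloud'].
    fs = get_cloud_async_fs()
    try:
        return await fs.read(url)  # assumed: AsyncFS.read(url) -> bytes
    finally:
        await fs.close()

# data = asyncio.run(read_object('gs://my-bucket/some/object'))  # hypothetical URL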
Example #2
def input_config(self, code, scope):
    config = {}
    config['global'] = get_global_config()
    config['token'] = self.token
    config['deploy'] = scope == 'deploy'
    config['scope'] = scope
    config['code'] = code.config()
    if self.deps:
        for d in self.deps:
            config[d.name] = d.config(scope)
    return config
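
For reference, a 'deploy'-scope call builds a dict shaped roughly like the following (the token and dependency names are purely illustrative):

# Illustrative shape only; keys beyond 'global', 'token', 'deploy',
# 'scope', and 'code' depend on self.deps.
example_config = {
    'global': get_global_config(),  # e.g. {'cloud': 'gcp', 'domain': ...}
    'token': 'abc123',              # self.token (hypothetical value)
    'deploy': True,                 # scope == 'deploy'
    'scope': 'deploy',
    'code': ...,                    # code.config()
    'some_dep': ...,                # one entry per dependency in self.deps
}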
Example #3
def get_identity_client(credentials_file: Optional[str] = None):
    if credentials_file is None:
        credentials_file = '/gsa-key/key.json'

    cloud = get_global_config()['cloud']
    if cloud == 'gcp':
        project = get_gcp_config().project
        return aiogoogle.GoogleIAmClient(project, credentials=aiogoogle.GoogleCredentials.from_file(credentials_file))

    assert cloud == 'azure'
    scopes = ['https://graph.microsoft.com/.default']
    return aioazure.AzureGraphClient(
        credentials=aioazure.AzureCredentials.from_file(credentials_file, scopes=scopes),
        scopes=scopes,
    )
Example #4
File: clients.py Project: chrisvittal/hail
def get_compute_client(credentials_file: Optional[str] = None):
    if credentials_file is None:
        credentials_file = '/gsa-key/key.json'

    cloud = get_global_config()['cloud']

    if cloud == 'azure':
        azure_config = get_azure_config()
        return aioazure.AzureComputeClient(azure_config.subscription_id,
                                           azure_config.resource_group)

    assert cloud == 'gcp', cloud
    project = get_gcp_config().project
    return aiogoogle.GoogleComputeClient(project,
                                         credentials_file=credentials_file)
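
Examples #1, #3, and #4 all repeat the same default-credentials fallback. A small helper that makes the shared pattern explicit (hypothetical; not a function in the source):

from typing import Optional

def resolve_credentials_file(credentials_file: Optional[str] = None) -> str:
    # Services mount the service-account key at this fixed path;
    # callers may override it, e.g. for local development.
    return credentials_file if credentials_file is not None else '/gsa-key/key.json'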
Example #5
async def get_cloud_driver(
    app,
    db: Database,
    machine_name_prefix: str,
    namespace: str,
    inst_coll_configs: InstanceCollectionConfigs,
    credentials_file: str,
    task_manager: aiotools.BackgroundTaskManager,
) -> CloudDriver:
    cloud = get_global_config()['cloud']

    if cloud == 'azure':
        return await AzureDriver.create(app, db, machine_name_prefix,
                                        namespace, inst_coll_configs,
                                        credentials_file, task_manager)

    assert cloud == 'gcp', cloud
    return await GCPDriver.create(app, db, machine_name_prefix, namespace,
                                  inst_coll_configs, credentials_file,
                                  task_manager)
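
Like the getters above, the driver factory handles 'azure' first and then asserts that the only remaining value is 'gcp', so a misconfigured cloud fails loudly rather than silently defaulting. The guard in isolation (a sketch, not a function in the source):

def assert_known_cloud(cloud: str) -> str:
    # The codebase supports exactly two clouds; any other value is a
    # configuration error and is surfaced eagerly.
    assert cloud in ('gcp', 'azure'), cloud
    return cloud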
Example #6
import base64
import json
import os

import kubernetes_asyncio.client
import kubernetes_asyncio.config

from auth.driver.driver import create_user
from gear import Database, transaction
from gear.clients import get_identity_client
from gear.cloud_config import get_global_config
from hailtop.utils import async_to_blocking

CLOUD = get_global_config()['cloud']
SCOPE = os.environ['HAIL_SCOPE']
DEFAULT_NAMESPACE = os.environ['HAIL_DEFAULT_NAMESPACE']


async def insert_user_if_not_exists(app, username, login_id, is_developer,
                                    is_service_account):
    db = app['db']
    k8s_client = app['k8s_client']

    @transaction(db)
    async def insert(tx):
        row = await tx.execute_and_fetchone(
            'SELECT id, state FROM users where username = %s;', (username, ))
        if row:
            if row['state'] == 'active':
                return None
            return row['id']
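
The excerpt ends here in the source listing, but the pattern is already visible: @transaction(db) runs the wrapped coroutine inside a single database transaction, passing the transaction handle in as tx. A hypothetical stand-in for that decorator (gear's real implementation may differ):

import functools

def transaction(db):
    def decorator(fn):
        @functools.wraps(fn)
        async def wrapper(*args, **kwargs):
            # Assumed Database API: an async context manager that yields a
            # transaction, committing on success and rolling back on error.
            async with db.start() as tx:
                return await fn(tx, *args, **kwargs)
        return wrapper
    return decorator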
Example #7
import json
import os

from gear.cloud_config import get_global_config

global_config = get_global_config()

CLOUD = global_config['cloud']
assert CLOUD in ('gcp', 'azure'), CLOUD

DOCKER_PREFIX = global_config['docker_prefix']
DOCKER_ROOT_IMAGE = global_config['docker_root_image']
DOMAIN = global_config['domain']
KUBERNETES_SERVER_URL = global_config['kubernetes_server_url']
DEFAULT_NAMESPACE = global_config['default_namespace']

CI_UTILS_IMAGE = os.environ['HAIL_CI_UTILS_IMAGE']
BUILDKIT_IMAGE = os.environ['HAIL_BUILDKIT_IMAGE']
STORAGE_URI = os.environ['HAIL_CI_STORAGE_URI']
DEPLOY_STEPS = tuple(json.loads(os.environ.get('HAIL_CI_DEPLOY_STEPS', '[]')))
Example #8
File: clients.py Project: chrisvittal/hail
def get_cloud_async_fs_factory() -> AsyncFSFactory:
    cloud = get_global_config()['cloud']
    if cloud == 'azure':
        return aioazure.AzureAsyncFSFactory()
    assert cloud == 'gcp', cloud
    return aiogoogle.GoogleStorageAsyncFSFactory()
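
Examples #1, #3, #4, #8, and #10 all use the same if/assert dispatch. An equivalent table-driven variant (a sketch, not the project's style):

_FS_FACTORIES = {
    'azure': aioazure.AzureAsyncFSFactory,
    'gcp': aiogoogle.GoogleStorageAsyncFSFactory,
}

def get_cloud_async_fs_factory_alt() -> AsyncFSFactory:
    # Same behavior as get_cloud_async_fs_factory above, with the
    # cloud -> factory mapping explicit; a KeyError replaces the assert.
    cloud = get_global_config()['cloud']
    return _FS_FACTORIES[cloud]()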
Example #9
def create_vm_config(
    file_store: FileStore,
    resource_rates: Dict[str, float],
    zone: str,
    machine_name: str,
    machine_type: str,
    activation_token: str,
    max_idle_time_msecs: int,
    local_ssd_data_disk: bool,
    data_disk_size_gb: int,
    boot_disk_size_gb: int,
    preemptible: bool,
    job_private: bool,
    project: str,
    instance_config: InstanceConfig,
) -> dict:
    _, cores = gcp_machine_type_to_worker_type_and_cores(machine_type)

    if local_ssd_data_disk:
        worker_data_disk = {
            'type': 'SCRATCH',
            'autoDelete': True,
            'interface': 'NVME',
            'initializeParams': {
                'diskType': f'zones/{zone}/diskTypes/local-ssd'
            },
        }
        worker_data_disk_name = 'nvme0n1'
    else:
        worker_data_disk = {
            'autoDelete': True,
            'initializeParams': {
                'diskType': f'projects/{project}/zones/{zone}/diskTypes/pd-ssd',
                'diskSizeGb': str(data_disk_size_gb),
            },
        }
        worker_data_disk_name = 'sdb'

    if job_private:
        unreserved_disk_storage_gb = data_disk_size_gb
    else:
        unreserved_disk_storage_gb = unreserved_worker_data_disk_size_gib(
            data_disk_size_gb, cores)
    assert unreserved_disk_storage_gb >= 0

    make_global_config = ['mkdir /global-config']
    global_config = get_global_config()
    for name, value in global_config.items():
        make_global_config.append(
            f'echo -n {shq(value)} > /global-config/{name}')
    make_global_config_str = '\n'.join(make_global_config)

    assert instance_config.is_valid_configuration(resource_rates.keys())

    return {
        'name': machine_name,
        'machineType': f'projects/{project}/zones/{zone}/machineTypes/{machine_type}',
        'labels': {
            'role': 'batch2-agent',
            'namespace': DEFAULT_NAMESPACE,
        },
        'disks': [
            {
                'boot': True,
                'autoDelete': True,
                'initializeParams': {
                    'sourceImage': f'projects/{project}/global/images/batch-worker-12',
                    'diskType': f'projects/{project}/zones/{zone}/diskTypes/pd-ssd',
                    'diskSizeGb': str(boot_disk_size_gb),
                },
            },
            worker_data_disk,
        ],
        'networkInterfaces': [{
            'network': 'global/networks/default',
            'networkTier': 'PREMIUM',
            'accessConfigs': [{
                'type': 'ONE_TO_ONE_NAT',
                'name': 'external-nat',
            }],
        }],
        'scheduling': {
            'automaticRestart': False,
            'onHostMaintenance': 'TERMINATE',
            'preemptible': preemptible,
        },
        'serviceAccounts': [{
            'email': f'batch2-agent@{project}.iam.gserviceaccount.com',
            'scopes': ['https://www.googleapis.com/auth/cloud-platform'],
        }],
        'metadata': {
            'items': [
                {
                    'key': 'startup-script',
                    'value': '''
#!/bin/bash
set -x

NAME=$(curl -s http://metadata.google.internal/computeMetadata/v1/instance/name -H 'Metadata-Flavor: Google')
ZONE=$(curl -s http://metadata.google.internal/computeMetadata/v1/instance/zone -H 'Metadata-Flavor: Google')

if [ -f "/started" ]; then
    echo "instance $NAME has previously been started"
    while true; do
        gcloud -q compute instances delete $NAME --zone=$ZONE
        sleep 1
    done
    exit
else
    touch /started
fi

curl -s -H "Metadata-Flavor: Google" "http://metadata.google.internal/computeMetadata/v1/instance/attributes/run_script"  >./run.sh

nohup /bin/bash run.sh >run.log 2>&1 &
    ''',
                },
                {
                    'key': 'run_script',
                    'value': rf'''
#!/bin/bash
set -x

WORKER_DATA_DISK_NAME="{worker_data_disk_name}"
UNRESERVED_WORKER_DATA_DISK_SIZE_GB="{unreserved_disk_storage_gb}"
ACCEPTABLE_QUERY_JAR_URL_PREFIX="{ACCEPTABLE_QUERY_JAR_URL_PREFIX}"

# format worker data disk
sudo mkfs.xfs -m reflink=1 -n ftype=1 /dev/$WORKER_DATA_DISK_NAME
sudo mkdir -p /mnt/disks/$WORKER_DATA_DISK_NAME
sudo mount -o prjquota /dev/$WORKER_DATA_DISK_NAME /mnt/disks/$WORKER_DATA_DISK_NAME
sudo chmod a+w /mnt/disks/$WORKER_DATA_DISK_NAME
XFS_DEVICE=$(xfs_info /mnt/disks/$WORKER_DATA_DISK_NAME | head -n 1 | awk '{{ print $1 }}' | awk  'BEGIN {{ FS = "=" }}; {{ print $2 }}')

# reconfigure docker to use local SSD
sudo service docker stop
sudo mv /var/lib/docker /mnt/disks/$WORKER_DATA_DISK_NAME/docker
sudo ln -s /mnt/disks/$WORKER_DATA_DISK_NAME/docker /var/lib/docker
sudo service docker start

# reconfigure /batch and /logs and /gcsfuse to use local SSD
sudo mkdir -p /mnt/disks/$WORKER_DATA_DISK_NAME/batch/
sudo ln -s /mnt/disks/$WORKER_DATA_DISK_NAME/batch /batch

sudo mkdir -p /mnt/disks/$WORKER_DATA_DISK_NAME/logs/
sudo ln -s /mnt/disks/$WORKER_DATA_DISK_NAME/logs /logs

sudo mkdir -p /mnt/disks/$WORKER_DATA_DISK_NAME/cloudfuse/
sudo ln -s /mnt/disks/$WORKER_DATA_DISK_NAME/cloudfuse /cloudfuse

sudo mkdir -p /etc/netns

CORES=$(nproc)
NAMESPACE=$(curl -s -H "Metadata-Flavor: Google" "http://metadata.google.internal/computeMetadata/v1/instance/attributes/namespace")
ACTIVATION_TOKEN=$(curl -s -H "Metadata-Flavor: Google" "http://metadata.google.internal/computeMetadata/v1/instance/attributes/activation_token")
IP_ADDRESS=$(curl -s -H "Metadata-Flavor: Google" "http://metadata.google.internal/computeMetadata/v1/instance/network-interfaces/0/ip")
PROJECT=$(curl -s -H "Metadata-Flavor: Google" "http://metadata.google.internal/computeMetadata/v1/project/project-id")

BATCH_LOGS_STORAGE_URI=$(curl -s -H "Metadata-Flavor: Google" "http://metadata.google.internal/computeMetadata/v1/instance/attributes/batch_logs_storage_uri")
INSTANCE_ID=$(curl -s -H "Metadata-Flavor: Google" "http://metadata.google.internal/computeMetadata/v1/instance/attributes/instance_id")
INSTANCE_CONFIG=$(curl -s -H "Metadata-Flavor: Google" "http://metadata.google.internal/computeMetadata/v1/instance/attributes/instance_config")
MAX_IDLE_TIME_MSECS=$(curl -s -H "Metadata-Flavor: Google" "http://metadata.google.internal/computeMetadata/v1/instance/attributes/max_idle_time_msecs")
NAME=$(curl -s http://metadata.google.internal/computeMetadata/v1/instance/name -H 'Metadata-Flavor: Google')
ZONE=$(curl -s http://metadata.google.internal/computeMetadata/v1/instance/zone -H 'Metadata-Flavor: Google')

BATCH_WORKER_IMAGE=$(curl -s -H "Metadata-Flavor: Google" "http://metadata.google.internal/computeMetadata/v1/instance/attributes/batch_worker_image")
DOCKER_ROOT_IMAGE=$(curl -s -H "Metadata-Flavor: Google" "http://metadata.google.internal/computeMetadata/v1/instance/attributes/docker_root_image")
DOCKER_PREFIX=$(curl -s -H "Metadata-Flavor: Google" "http://metadata.google.internal/computeMetadata/v1/instance/attributes/docker_prefix")

INTERNAL_GATEWAY_IP=$(curl -s -H "Metadata-Flavor: Google" "http://metadata.google.internal/computeMetadata/v1/instance/attributes/internal_ip")

# private job network = 172.20.0.0/16
# public job network = 172.21.0.0/16
# [all networks] Rewrite traffic coming from containers to masquerade as the host
iptables --table nat --append POSTROUTING --source 172.20.0.0/15 --jump MASQUERADE

# [public]
# Block public traffic to the metadata server
iptables --append FORWARD --source 172.21.0.0/16 --destination 169.254.169.254 --jump DROP
# But allow the internal gateway
iptables --append FORWARD --destination $INTERNAL_GATEWAY_IP --jump ACCEPT
# And this worker
iptables --append FORWARD --destination $IP_ADDRESS --jump ACCEPT
# Forbid outgoing requests to cluster-internal IP addresses
INTERNET_INTERFACE=$(ip link list | grep ens | awk -F": " '{{ print $2 }}')
iptables --append FORWARD --out-interface $INTERNET_INTERFACE ! --destination 10.128.0.0/16 --jump ACCEPT


# Setup fluentd
touch /worker.log
touch /run.log

sudo rm /etc/google-fluentd/config.d/*  # remove unused config files

sudo tee /etc/google-fluentd/config.d/syslog.conf <<EOF
<source>
@type tail
format syslog
path /var/log/syslog
pos_file /var/lib/google-fluentd/pos/syslog.pos
read_from_head true
tag syslog
</source>
EOF

sudo tee /etc/google-fluentd/config.d/worker-log.conf <<EOF
<source>
@type tail
format json
path /worker.log
pos_file /var/lib/google-fluentd/pos/worker-log.pos
read_from_head true
tag worker.log
</source>

<filter worker.log>
@type record_transformer
enable_ruby
<record>
    severity \${{ record["levelname"] }}
    timestamp \${{ record["asctime"] }}
</record>
</filter>
EOF

sudo tee /etc/google-fluentd/config.d/run-log.conf <<EOF
<source>
@type tail
format none
path /run.log
pos_file /var/lib/google-fluentd/pos/run-log.pos
read_from_head true
tag run.log
</source>
EOF

sudo cp /etc/google-fluentd/google-fluentd.conf /etc/google-fluentd/google-fluentd.conf.bak
head -n -1 /etc/google-fluentd/google-fluentd.conf.bak | sudo tee /etc/google-fluentd/google-fluentd.conf
sudo tee -a /etc/google-fluentd/google-fluentd.conf <<EOF
labels {{
"namespace": "$NAMESPACE",
"instance_id": "$INSTANCE_ID"
}}
</match>
EOF
rm /etc/google-fluentd/google-fluentd.conf.bak

sudo service google-fluentd restart

{make_global_config_str}

# retry once
docker pull $BATCH_WORKER_IMAGE || \
(echo 'pull failed, retrying' && sleep 15 && docker pull $BATCH_WORKER_IMAGE)

BATCH_WORKER_IMAGE_ID=$(docker inspect $BATCH_WORKER_IMAGE --format='{{{{.Id}}}}' | cut -d':' -f2)

# So here I go it's my shot.
docker run \
-e CLOUD=gcp \
-e CORES=$CORES \
-e NAME=$NAME \
-e NAMESPACE=$NAMESPACE \
-e ACTIVATION_TOKEN=$ACTIVATION_TOKEN \
-e IP_ADDRESS=$IP_ADDRESS \
-e BATCH_LOGS_STORAGE_URI=$BATCH_LOGS_STORAGE_URI \
-e INSTANCE_ID=$INSTANCE_ID \
-e PROJECT=$PROJECT \
-e ZONE=$ZONE \
-e DOCKER_PREFIX=$DOCKER_PREFIX \
-e DOCKER_ROOT_IMAGE=$DOCKER_ROOT_IMAGE \
-e INSTANCE_CONFIG=$INSTANCE_CONFIG \
-e MAX_IDLE_TIME_MSECS=$MAX_IDLE_TIME_MSECS \
-e BATCH_WORKER_IMAGE=$BATCH_WORKER_IMAGE \
-e BATCH_WORKER_IMAGE_ID=$BATCH_WORKER_IMAGE_ID \
-e INTERNET_INTERFACE=$INTERNET_INTERFACE \
-e UNRESERVED_WORKER_DATA_DISK_SIZE_GB=$UNRESERVED_WORKER_DATA_DISK_SIZE_GB \
-e ACCEPTABLE_QUERY_JAR_URL_PREFIX=$ACCEPTABLE_QUERY_JAR_URL_PREFIX \
-e INTERNAL_GATEWAY_IP=$INTERNAL_GATEWAY_IP \
-v /var/run/docker.sock:/var/run/docker.sock \
-v /var/run/netns:/var/run/netns:shared \
-v /usr/bin/docker:/usr/bin/docker \
-v /usr/sbin/xfs_quota:/usr/sbin/xfs_quota \
-v /batch:/batch:shared \
-v /logs:/logs \
-v /global-config:/global-config \
-v /cloudfuse:/cloudfuse:shared \
-v /etc/netns:/etc/netns \
-v /sys/fs/cgroup:/sys/fs/cgroup \
--mount type=bind,source=/mnt/disks/$WORKER_DATA_DISK_NAME,target=/host \
--mount type=bind,source=/dev,target=/dev,bind-propagation=rshared \
-p 5000:5000 \
--device /dev/fuse \
--device $XFS_DEVICE \
--device /dev \
--privileged \
--cap-add SYS_ADMIN \
--security-opt apparmor:unconfined \
--network host \
$BATCH_WORKER_IMAGE \
python3 -u -m batch.worker.worker >worker.log 2>&1

[ $? -eq 0 ] || tail -n 1000 worker.log

while true; do
    gcloud -q compute instances delete $NAME --zone=$ZONE
    sleep 1
done
''',
                },
                {
                    'key': 'shutdown-script',
                    'value': '''
set -x

INSTANCE_ID=$(curl -s -H "Metadata-Flavor: Google" "http://metadata.google.internal/computeMetadata/v1/instance/attributes/instance_id")
NAME=$(curl -s http://metadata.google.internal/computeMetadata/v1/instance/name -H 'Metadata-Flavor: Google')

journalctl -u docker.service > dockerd.log
''',
                },
                {
                    'key': 'activation_token',
                    'value': activation_token
                },
                {
                    'key': 'batch_worker_image',
                    'value': BATCH_WORKER_IMAGE
                },
                {
                    'key': 'docker_root_image',
                    'value': DOCKER_ROOT_IMAGE
                },
                {
                    'key': 'docker_prefix',
                    'value': DOCKER_PREFIX
                },
                {
                    'key': 'namespace',
                    'value': DEFAULT_NAMESPACE
                },
                {
                    'key': 'internal_ip',
                    'value': INTERNAL_GATEWAY_IP
                },
                {
                    'key': 'batch_logs_storage_uri',
                    'value': file_store.batch_logs_storage_uri
                },
                {
                    'key': 'instance_id',
                    'value': file_store.instance_id
                },
                {
                    'key': 'max_idle_time_msecs',
                    'value': max_idle_time_msecs
                },
                {
                    'key': 'instance_config',
                    'value': base64.b64encode(json.dumps(instance_config.to_dict()).encode()).decode(),
                },
            ]
        },
        'tags': {
            'items': ["batch2-agent"]
        },
    }
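
The make_global_config loop near the top of create_vm_config renders the global config into the startup script via {make_global_config_str}. A standalone illustration of what it emits, using shlex.quote in place of hail's shq helper and a hypothetical two-key config:

import shlex

sample_config = {'cloud': 'gcp', 'domain': 'example.org'}  # hypothetical values

lines = ['mkdir /global-config']
for name, value in sample_config.items():
    # shlex.quote stands in for the shq shell-quoting helper used above
    lines.append(f'echo -n {shlex.quote(value)} > /global-config/{name}')

print('\n'.join(lines))
# mkdir /global-config
# echo -n gcp > /global-config/cloud
# echo -n example.org > /global-config/domain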
Example #10
def get_flow_client(credentials_file: str) -> Flow:
    cloud = get_global_config()['cloud']
    if cloud == 'azure':
        return AzureFlow(credentials_file)
    assert cloud == 'gcp'
    return GoogleFlow(credentials_file)
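
As elsewhere, the caller supplies only a credentials path; whether it gets an AzureFlow or a GoogleFlow is decided entirely by the global config. Usage, with the conventional key path from the earlier examples:

# Returns AzureFlow or GoogleFlow depending on get_global_config()['cloud'].
flow = get_flow_client('/gsa-key/key.json')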