Пример #1
0
def __get_kubeconfig(conf):
    """Fetch AKS credentials with ``az aks get-credentials``.

    The cluster name (``<prefix>k8s``) and resource group (``<prefix>rg``)
    are both derived from ``conf['terraform_prefix']``. The command is
    issued with ``--overwrite-existing``.
    """
    prefix = conf['terraform_prefix']
    az_args = (
        'az aks get-credentials',
        f'--name {prefix}k8s',
        f'--resource-group {prefix}rg',
        '--overwrite-existing',
    )
    run(' '.join(az_args))
Пример #2
0
def __delete_compute_tf_files(root):
    """Remove the compute-phase terraform files from ``{root}/terraform_state``.

    Deletes ``k8s.tf``, ``compute_pool.tf`` and ``storage_pool.tf`` with a
    single ``rm`` invocation. Any failure reported by ``run`` is ignored on
    purpose, since the files may already be gone.
    """
    state_dir = f'{root}/terraform_state'
    ## exactly one `rm` followed by the three files; a second literal `rm`
    ## in the argument list would be treated as a file name and always fail
    cmd = f'rm {state_dir}/k8s.tf {state_dir}/compute_pool.tf {state_dir}/storage_pool.tf'
    try:
        run(cmd)
    except Exception:
        ## if already deleted, not a problem
        pass
Пример #3
0
def minio_deploy(root, conf):
    """Install or upgrade the ``minio`` helm release.

    Registers the ``minio`` chart repository, then runs ``helm upgrade
    --install`` with the values file at ``{root}/src/helm/minio/values.yaml``.
    ``conf`` is accepted but not read by this function.
    """
    values_file = f'{root}/src/helm/minio/values.yaml'
    steps = (
        ## make the chart repository known to helm
        'helm repo add minio https://helm.min.io/',
        ## install/upgrade the release with the project values
        f'helm upgrade minio minio/minio --install -f {values_file}',
    )
    for step in steps:
        run(step)
Пример #4
0
def __upload_tls_crt(root):
    """Recreate the ``tls-secret`` kubernetes TLS secret from local files.

    The certificate is read from ``{root}/secret/crt/crt.pub`` and the key
    from ``{root}/secret/crt/crt.key``. An existing secret is deleted first;
    a failing delete is ignored, while a failing create propagates.
    """
    cmd1 = 'kubectl delete secret tls-secret'
    cmd2 = 'kubectl create secret tls tls-secret '+\
            f'--cert="{root}/secret/crt/crt.pub" '+\
            f'--key="{root}/secret/crt/crt.key"'
    try:
        run(cmd1)
    except Exception:
        ## best effort: presumably the secret simply does not exist yet.
        ## `Exception` (not a bare except) so Ctrl-C / SystemExit still
        ## interrupt the deployment.
        pass
    run(cmd2)
Пример #5
0
def viewer_deploy(root, conf, interactive_mode=True):
    """Install or upgrade the ``viewer`` helm release.

    The image is ``<acr server>/ai:<conf['image_tag']>``, where the server
    name is read from ``{root}/secret/acr/server``. ``interactive_mode`` is
    forwarded to the chart as the ``interactive_mode`` value.
    """
    chart_dir = f'{root}/src/helm/viewer'
    ## get image name
    registry = run(f'cat {root}/secret/acr/server', return_stdout=True)
    image = registry + '/ai:' + conf['image_tag']
    ## apply helm chart with values
    helm_args = [
        f'helm upgrade viewer {chart_dir} --install',
        f'-f {chart_dir}/values.yaml',
        f'--set "image={image}"',
        f'--set "interactive_mode={interactive_mode}"',
    ]
    run(' '.join(helm_args))
Пример #6
0
def update_horovod_worker_src(root, conf):
    """Replace ``/app/src`` on every horovod worker pod with ``{root}/src``.

    Pods are addressed as ``horovod-0`` .. ``horovod-<n-1>``, where ``n`` is
    ``int(conf['horovod_instances'])``.
    """
    worker_count = int(conf['horovod_instances'])
    for pod in (f'horovod-{i}' for i in range(worker_count)):
        ## drop the copy currently on the pod
        run(f'kubectl exec -it {pod} -- rm -rf /app/src', os_system=True)
        ## push the local tree in its place
        run(f'kubectl cp {root}/src {pod}:/app/src', os_system=True)
Пример #7
0
def terraform_destroy(root, config):
    """Execute `terraform destroy -auto-approve` in the terraform_state/ directory.

    The variable arguments come from ``__get_base_var_str(config)`` and are
    appended directly after ``-auto-approve`` with no separator.
    NOTE(review): presumably that helper returns a string with its own
    leading space -- confirm.
    """
    ## work from terraform_state directory
    enter_state_dir = f'cd {root}/terraform_state'
    ## destroy with variables
    destroy = 'terraform destroy -auto-approve' + __get_base_var_str(config)
    ## chain both steps so the destroy only runs after a successful `cd`
    run(' && '.join((enter_state_dir, destroy)), os_system=True)
Пример #8
0
def deploy_service(root, conf):
    """Install or upgrade the ``service`` helm release.

    Reads ``conf['image_name']`` and ``conf['domain_prefix']``. The image is
    ``<acr server>/<image_name>``, with the server name read from
    ``{root}/secret/acr/server``; the ingress host is
    ``<domain_prefix>.eastus.cloudapp.azure.com``.

    ``conf['interactive_debugging_mode']`` is no longer looked up: the value
    was never passed to helm, so requiring the key only produced a spurious
    ``KeyError`` for configs that omit it.
    """
    ## build image name
    cmd1 = f'cat {root}/secret/acr/server'
    acr_server = run(cmd1, return_stdout=True)
    image_name = acr_server + '/' + conf['image_name']
    domain = str(conf['domain_prefix']) + '.eastus.cloudapp.azure.com'
    ## helm deploy
    cmd2 = f'helm upgrade --install service {root}/src/helm/service '+\
            f'--set service.image={image_name} '+\
            f'--set ingress.host={domain} '
    run(cmd2, os_system=True)
Пример #9
0
def __deploy_docker_build_env(root, conf, blocking=True):
    """Deploy the build env helm chart as release ``build``.

    When ``blocking`` is truthy, also waits for the pod labelled
    ``name=build`` to become ready and then sleeps three more seconds.
    ``conf`` is accepted but not read by this function.
    """
    release = 'build'
    ## deploy build
    run(f'helm upgrade {release} {root}/src/helm/build/ --install --set name={release} ')
    if not blocking:
        return
    ## wait until deployed
    run(f'kubectl wait --for=condition=ready pod -l name={release}')
    ## docker daemon needs a little more time
    sleep(3)
Пример #10
0
def __upload_acr_secret_to_k8s(root, config):
    """Recreate the ``acr-creds`` docker-registry secret in kubernetes.

    The server and password are taken from ``{root}/secret/acr/server`` and
    ``{root}/secret/acr/token``; the username is
    ``<config['terraform_prefix']>acr``. An existing secret is deleted first;
    a failing delete is ignored, while a failing create propagates.
    """
    cmd1 = f'kubectl delete secret acr-creds'
    try:
        run(cmd1)
    except Exception:
        ## if secret doesn't exist yet, just create a new one.
        ## `Exception` (not a bare except) so Ctrl-C / SystemExit still
        ## interrupt the deployment.
        pass
    tf_prefix = config['terraform_prefix']
    ## NOTE(review): `$(cat ...)` needs shell command substitution --
    ## confirm that run() executes this through a shell
    cmd2 = 'kubectl create secret docker-registry acr-creds '+\
        f'--docker-server=$(cat {root}/secret/acr/server) '+\
        f'--docker-username={tf_prefix}acr '+\
        f'--docker-password=$(cat {root}/secret/acr/token)'
    run(cmd2)
Пример #11
0
def deploy_horovod(root, conf):
    """Install or upgrade the ``horovod-ring`` helm release.

    Reads ``interactive_debugging_mode``, ``image_tag`` and
    ``horovod_instances`` from ``conf``. The image is
    ``<acr server>/ai:<image_tag>``, with the server name read from
    ``{root}/secret/acr/server``.
    """
    debug_mode = conf['interactive_debugging_mode']
    ## build image name
    registry = run(f'cat {root}/secret/acr/server', return_stdout=True)
    image = registry + '/ai:' + conf['image_tag']
    replica_count = int(conf['horovod_instances'])
    ## helm deploy
    helm_args = [
        f'helm upgrade --install horovod-ring {root}/src/helm/horovod-ring',
        f'--set image={image}',
        f'--set interactive_debugging_mode={debug_mode}',
        f'--set replicas={replica_count}',
    ]
    run(' '.join(helm_args), os_system=True)
Пример #12
0
def __get_acr_server(root, config):
    """Store the ACR login server name in ``{root}/secret/acr/server``.

    Runs ``az acr show`` for the registry ``<config['terraform_prefix']>acr``,
    parses the JSON output and writes its ``loginServer`` field to the file.
    """
    ## get server in JSON from az cli stdout
    show_cmd = f"az acr show -n {config['terraform_prefix']}acr -o json"
    ## parse JSON and keep only the login server
    login_server = json.loads(run(show_cmd, return_stdout=True))['loginServer']
    target = os.path.join(root, 'secret', 'acr', 'server')
    with open(target, 'w') as out:
        out.write(login_server)
Пример #13
0
def __get_acr_token(root, config):
    """Store an ACR password in ``{root}/secret/acr/token``.

    Runs ``az acr credential show`` for the registry
    ``<config['terraform_prefix']>acr``, parses the JSON output and writes
    the ``value`` of the first entry in ``passwords`` to the file.
    """
    ## get token in JSON from az cli stdout
    show_cmd = f"az acr credential show -n {config['terraform_prefix']}acr -o json"
    credentials = json.loads(run(show_cmd, return_stdout=True))
    ## keep only the first password
    first_password = credentials['passwords'][0]['value']
    target = os.path.join(root, 'secret', 'acr', 'token')
    with open(target, 'w') as out:
        out.write(first_password)
Пример #14
0
def postgres_deploy(root, conf):
    """Create the postgres secret and install or upgrade the postgres chart.

    A random secret is written to ``{root}/secret/postgres/postgres-secret``
    only if that file does not exist yet. The ``postgres`` kubernetes secret
    is then recreated from the file (a failing delete is reported and
    ignored), and the ``postgres`` helm release is installed or upgraded from
    ``{root}/src/helm/postgres``. ``conf`` is accepted but not read here.
    """
    ## update local secret
    postgres_secret_path = f'{root}/secret/postgres/postgres-secret'
    if not os.path.isfile(postgres_secret_path):
        ## only write once; the random value is generated only when needed
        postgres_secret = base64.urlsafe_b64encode(os.urandom(16)).decode()
        with open(postgres_secret_path, 'w') as f:
            f.write(postgres_secret)
    cmd1 = 'kubectl delete secret postgres'
    cmd2 = f'kubectl create secret generic postgres --from-file={postgres_secret_path}'
    try:
        run(cmd1)
    except Exception:
        ## `Exception` (not a bare except) so Ctrl-C / SystemExit still
        ## interrupt the deployment
        print('failed to delete remote postgres secret, probably because it does not exist')
    run(cmd2)
    ## deploy
    cmd3 = f'helm upgrade postgres {root}/src/helm/postgres --install'
    run(cmd3)
Пример #15
0
def __tls_crt_gen(root, config):
    """Generate a self-signed TLS certificate and key with ``openssl req``.

    The subject CN is ``<config['domain_prefix']>.eastus.cloudapp.azure.com``.
    The certificate is written to ``{root}/secret/crt/crt.pub`` and the
    unencrypted key to ``{root}/secret/crt/crt.key``.
    """
    host = str(config['domain_prefix']) + '.eastus.cloudapp.azure.com'
    ## `{host}` and not `${host}`: inside an f-string the `$` is a literal
    ## character, which would put `$<host>` into the certificate subject
    cmd1 = 'openssl req -newkey rsa:4096 -nodes -sha512 -x509 -days 3650 -nodes '+\
            f'-subj "/CN={host}" -out {root}/secret/crt/crt.pub -keyout {root}/secret/crt/crt.key'
    run(cmd1)
Пример #16
0
def __copy_phase_2_tf_files(root):
    """Copy phase 2 terraform files from src/terraform/phase-2/ to terraform_state/."""
    source_glob = f'{root}/src/terraform/phase-2/*.tf'
    destination = f'{root}/terraform_state'
    ## NOTE(review): the `*.tf` pattern presumably relies on os_system=True
    ## running the command through a shell -- confirm against run()
    run(f'cp {source_glob} {destination}', os_system=True)
Пример #17
0
def __tear_down_docker_build_env(root, conf):
    """Tear down the build env by uninstalling the ``build`` helm release.

    ``root`` and ``conf`` are accepted but not read by this function.
    """
    run('helm uninstall build')
Пример #18
0
    ## without this, ssh cannot resolve full host names
    write_ssh_aliases(args.replicas)

    ## starting sshd
    os.system('service ssh start')

    if args.interactive_debugging_mode:
        interactive_debugging_mode()
        pass

    if args.is_head_node:
        wait_for_dns(args.replicas)
        ## construct cmd
        cmd = f'horovodrun -np {args.replicas} -H '
        for idx in range(args.replicas):
            if idx > 0:
                cmd += ','
                pass
            cmd += f'horovod-{idx}.horovod:1'
            pass
        cmd += ' xvfb-run python /app/src/python/ai/ai_runner.py'
        ## execute
        run(cmd, os_system=True)
        pass

    ## check for master
    pod_name = os.environ.get('POD_NAME')
    print(f'POD_NAME: {pod_name}')
    interactive_debugging_mode()
    pass
Пример #19
0
def __build(root, conf):
    """Run a remote docker build in the ``build`` pod and push the image.

    The registry server and token are read from ``{root}/secret/acr/``. The
    image is ``<server>/<conf['image_name']>`` and the login user is
    ``<conf['terraform_prefix']>acr``. ``{root}/docker`` and ``{root}/src``
    are copied to the pod as the build context.
    """
    ## load secrets
    registry = run(f'cat {root}/secret/acr/server', return_stdout=True)
    registry_token = run(f'cat {root}/secret/acr/token', return_stdout=True)
    ## setup build environment
    run('kubectl exec build -- mkdir -p /build')
    run(f'kubectl cp {root}/docker build:/build/docker && kubectl cp {root}/src build:/build/docker/src')
    ## build
    image = registry + '/' + conf['image_name']
    registry_user = conf['terraform_prefix'] + 'acr'
    run(f'kubectl exec -it build -- sh -c "cd /build/docker && docker build -t {image} ."', os_system=True)
    ## push
    run(f'kubectl exec -it build -- docker login {registry} --username {registry_user} --password {registry_token}')
    run(f'kubectl exec -it build -- docker push {image}', os_system=True)
Пример #20
0
def __install_nvidia_drivers(root):
    """Apply the ``src/k8s/azure-cuda-daemon.yaml`` manifest with kubectl."""
    manifest = f'{root}/src/k8s/azure-cuda-daemon.yaml'
    run(f'kubectl apply -f {manifest}', os_system=True)