def __get_kubeconfig(conf):
    """Fetch credentials for the AKS cluster through the Azure CLI.

    The cluster name and resource group are both derived from
    ``conf['terraform_prefix']`` by appending ``k8s`` and ``rg``.
    The command is handed to ``run`` (defined elsewhere in this file).
    """
    prefix = conf['terraform_prefix']
    ## both names share the terraform prefix
    rg_name = f'{prefix}rg'
    aks_name = f'{prefix}k8s'
    run(
        f'az aks get-credentials --name {aks_name} '
        f'--resource-group {rg_name} --overwrite-existing'
    )
def __delete_compute_tf_files(root):
    """Remove the compute-related terraform files from ``terraform_state/``.

    Deletes ``k8s.tf``, ``compute_pool.tf`` and ``storage_pool.tf`` under
    ``{root}/terraform_state`` with a single ``rm`` invocation. Any failure
    reported by ``run`` is ignored on purpose, because the files may already
    have been deleted.
    """
    state_dir = f'{root}/terraform_state'
    ## one `rm` with three file operands -- the word `rm` must not be
    ## repeated inside the command, or it is treated as a file name
    cmd = (
        f'rm {state_dir}/k8s.tf {state_dir}/compute_pool.tf '
        f'{state_dir}/storage_pool.tf'
    )
    try:
        run(cmd)
    except Exception:
        ## if already deleted, not a problem
        pass
def minio_deploy(root, conf):
    """Register the MinIO helm repository and install/upgrade ``minio``.

    The release is configured from ``{root}/src/helm/minio/values.yaml``.
    ``conf`` is accepted but not read by this function.
    """
    values_file = f'{root}/src/helm/minio/values.yaml'
    commands = (
        ## make the chart repository known to helm
        'helm repo add minio https://helm.min.io/',
        ## install (or upgrade) the release with the project's values
        f'helm upgrade minio minio/minio --install -f {values_file}',
    )
    for command in commands:
        run(command)
def __upload_tls_crt(root):
    """Re-create the ``tls-secret`` Kubernetes TLS secret.

    Any existing secret is deleted first (best effort), then a new one is
    created from ``{root}/secret/crt/crt.pub`` and
    ``{root}/secret/crt/crt.key``.
    """
    try:
        run('kubectl delete secret tls-secret')
    except Exception:
        ## the delete is best effort (e.g. the secret may not exist yet);
        ## only ordinary errors are ignored, so Ctrl-C still interrupts
        pass

    crt_dir = f'{root}/secret/crt'
    run(
        'kubectl create secret tls tls-secret '
        f'--cert="{crt_dir}/crt.pub" '
        f'--key="{crt_dir}/crt.key"'
    )
def viewer_deploy(root, conf, interactive_mode=True):
    """Install/upgrade the ``viewer`` helm release.

    The image name is the content of ``{root}/secret/acr/server`` followed by
    ``/ai:`` and ``conf['image_tag']``. ``interactive_mode`` is forwarded to
    the chart as the ``interactive_mode`` value.
    """
    ## registry host, as read back from the secret file through `run`
    acr_server = run(f'cat {root}/secret/acr/server', return_stdout=True)
    image_name = acr_server + '/ai:' + conf['image_tag']

    chart_dir = f'{root}/src/helm/viewer'
    helm_args = [
        f'helm upgrade viewer {chart_dir} --install',
        f'-f {chart_dir}/values.yaml',
        f'--set "image={image_name}"',
        f'--set "interactive_mode={interactive_mode}"',
    ]
    run(' '.join(helm_args))
def update_horovod_worker_src(root, conf):
    """Replace ``/app/src`` on every horovod worker pod with the local src.

    Pods are addressed as ``horovod-0`` .. ``horovod-(N-1)`` where ``N`` is
    ``int(conf['horovod_instances'])``.
    """
    worker_count = int(conf['horovod_instances'])
    pod_names = (f'horovod-{index}' for index in range(worker_count))
    for pod in pod_names:
        ## drop the stale copy first, then push the local tree
        run(f'kubectl exec -it {pod} -- rm -rf /app/src', os_system=True)
        run(f'kubectl cp {root}/src {pod}:/app/src', os_system=True)
def terraform_destroy(root, config):
    """Execute ``terraform destroy -auto-approve`` in ``terraform_state/``.

    The variable arguments come from ``__get_base_var_str(config)`` and are
    appended to the terraform command verbatim.
    """
    ## appended without a separator -- presumably the helper's result starts
    ## with a space; verify against __get_base_var_str
    var_args = __get_base_var_str(config)
    steps = [
        f'cd {root}/terraform_state',
        'terraform destroy -auto-approve' + var_args,
    ]
    ## chain both steps so terraform runs from the state directory
    run(' && '.join(steps), os_system=True)
def deploy_service(root, conf):
    """Install/upgrade the ``service`` helm release.

    Reads ``conf['image_name']`` and ``conf['domain_prefix']``. The image is
    ``<acr server>/<image_name>`` where the server is the content of
    ``{root}/secret/acr/server``; the ingress host is
    ``<domain_prefix>.eastus.cloudapp.azure.com``.
    """
    ## build image name
    acr_server = run(f'cat {root}/secret/acr/server', return_stdout=True)
    image_name = acr_server + '/' + conf['image_name']
    domain = str(conf['domain_prefix']) + '.eastus.cloudapp.azure.com'

    ## helm deploy
    cmd = (
        f'helm upgrade --install service {root}/src/helm/service '
        f'--set service.image={image_name} '
        f'--set ingress.host={domain} '
    )
    run(cmd, os_system=True)
def __deploy_docker_build_env(root, conf, blocking=True):
    """Install/upgrade the ``build`` helm release from ``src/helm/build/``.

    When ``blocking`` is true, waits for the pod labelled ``name=build`` to
    become ready and then sleeps three more seconds. ``conf`` is not read.
    """
    release = 'build'
    run(
        f'helm upgrade {release} {root}/src/helm/build/ --install '
        f'--set name={release} '
    )
    if not blocking:
        return

    ## wait until deployed
    run(f'kubectl wait --for=condition=ready pod -l name={release}')
    ## docker daemon needs a little more time
    sleep(3)
def __upload_acr_secret_to_k8s(root, config):
    """Re-create the ``acr-creds`` docker-registry secret in Kubernetes.

    The server and password are substituted by the shell from
    ``{root}/secret/acr/server`` and ``{root}/secret/acr/token``; the user
    name is ``config['terraform_prefix']`` followed by ``acr``.
    """
    try:
        run('kubectl delete secret acr-creds')
    except Exception:
        ## if secret doesn't exist yet, just create a new one; only ordinary
        ## errors are ignored, so Ctrl-C still interrupts
        pass

    tf_prefix = config['terraform_prefix']
    ## NOTE(review): the $(cat ...) substitutions need `run` to go through a
    ## shell -- confirm against its implementation
    cmd = (
        'kubectl create secret docker-registry acr-creds '
        f'--docker-server=$(cat {root}/secret/acr/server) '
        f'--docker-username={tf_prefix}acr '
        f'--docker-password=$(cat {root}/secret/acr/token)'
    )
    run(cmd)
def deploy_horovod(root, conf):
    """Install/upgrade the ``horovod-ring`` helm release.

    Reads ``interactive_debugging_mode``, ``image_tag`` and
    ``horovod_instances`` from ``conf``. The image is
    ``<acr server>/ai:<image_tag>`` where the server is the content of
    ``{root}/secret/acr/server``.
    """
    debug_mode = conf['interactive_debugging_mode']

    ## registry host, as read back from the secret file through `run`
    acr_server = run(f'cat {root}/secret/acr/server', return_stdout=True)
    image_name = acr_server + '/ai:' + conf['image_tag']
    replica_count = int(conf['horovod_instances'])

    helm_args = [
        f'helm upgrade --install horovod-ring {root}/src/helm/horovod-ring',
        f'--set image={image_name}',
        f'--set interactive_debugging_mode={debug_mode}',
        f'--set replicas={replica_count}',
    ]
    run(' '.join(helm_args), os_system=True)
def __get_acr_server(root, config):
    """Query the container registry's login server and store it on disk.

    Runs ``az acr show`` for the registry named
    ``<terraform_prefix>acr`` and writes the ``loginServer`` field of the
    JSON reply to ``{root}/secret/acr/server``.
    """
    registry = '{}acr'.format(config['terraform_prefix'])
    reply = run(f'az acr show -n {registry} -o json', return_stdout=True)

    ## only the login server is kept from the JSON document
    login_server = json.loads(reply)['loginServer']
    target = os.path.join(root, 'secret', 'acr', 'server')
    with open(target, 'w') as handle:
        handle.write(login_server)
def __get_acr_token(root, config):
    """Query the container registry's first password and store it on disk.

    Runs ``az acr credential show`` for the registry named
    ``<terraform_prefix>acr`` and writes ``passwords[0].value`` from the JSON
    reply to ``{root}/secret/acr/token``.
    """
    registry = '{}acr'.format(config['terraform_prefix'])
    reply = run(
        f'az acr credential show -n {registry} -o json',
        return_stdout=True,
    )

    ## the first listed password is used as the token
    first_password = json.loads(reply)['passwords'][0]
    target = os.path.join(root, 'secret', 'acr', 'token')
    with open(target, 'w') as handle:
        handle.write(first_password['value'])
def postgres_deploy(root, conf):
    """Create the postgres secret and install/upgrade the postgres chart.

    A random secret (16 bytes from ``os.urandom``, urlsafe-base64 encoded) is
    written to ``{root}/secret/postgres/postgres-secret`` only if that file
    does not exist yet. The remote ``postgres`` Kubernetes secret is then
    re-created from the file and the helm release ``postgres`` is
    installed/upgraded. ``conf`` is not read.
    """
    postgres_secret_path = f'{root}/secret/postgres/postgres-secret'
    if not os.path.isfile(postgres_secret_path):
        ## only write once, so an existing local secret is never replaced;
        ## the random value is generated only when it is actually needed
        postgres_secret = base64.urlsafe_b64encode(os.urandom(16)).decode()
        with open(postgres_secret_path, 'w') as f:
            f.write(postgres_secret)

    ## re-create the remote secret from the local file
    try:
        run('kubectl delete secret postgres')
    except Exception:
        ## best effort: only ordinary errors are ignored, Ctrl-C still works
        print('failed to delete remote postgres secret, probably because it does not exist')
    run(f'kubectl create secret generic postgres --from-file={postgres_secret_path}')

    ## deploy
    run(f'helm upgrade postgres {root}/src/helm/postgres --install')
def __tls_crt_gen(root, config):
    """Generate a self-signed TLS certificate and key with openssl.

    The certificate's common name is
    ``<domain_prefix>.eastus.cloudapp.azure.com``. A new 4096-bit RSA key is
    created without a passphrase (``-nodes``); the certificate is valid for
    3650 days. Output goes to ``{root}/secret/crt/crt.pub`` (certificate) and
    ``{root}/secret/crt/crt.key`` (key).
    """
    host = str(config['domain_prefix']) + '.eastus.cloudapp.azure.com'
    ## the CN must be interpolated as {host}: writing ${host} inside an
    ## f-string leaves a literal `$` in front of the host name
    cmd = (
        'openssl req -newkey rsa:4096 -nodes -sha512 -x509 -days 3650 '
        f'-subj "/CN={host}" '
        f'-out {root}/secret/crt/crt.pub -keyout {root}/secret/crt/crt.key'
    )
    run(cmd)
def __copy_phase_2_tf_files(root):
    """Copy every ``*.tf`` file of terraform phase 2 into ``terraform_state/``.

    Source is ``{root}/src/terraform/phase-2/``; destination is
    ``{root}/terraform_state``.
    """
    source_glob = f'{root}/src/terraform/phase-2/*.tf'
    destination = f'{root}/terraform_state'
    run(f'cp {source_glob} {destination}', os_system=True)
def __tear_down_docker_build_env(root, conf):
    """Uninstall the ``build`` helm release.

    Neither ``root`` nor ``conf`` is read by this function.
    """
    release = 'build'
    run('helm uninstall ' + release)
## without this, ssh cannot resolve full host names write_ssh_aliases(args.replicas) ## starting sshd os.system('service ssh start') if args.interactive_debugging_mode: interactive_debugging_mode() pass if args.is_head_node: wait_for_dns(args.replicas) ## construct cmd cmd = f'horovodrun -np {args.replicas} -H ' for idx in range(args.replicas): if idx > 0: cmd += ',' pass cmd += f'horovod-{idx}.horovod:1' pass cmd += ' xvfb-run python /app/src/python/ai/ai_runner.py' ## execute run(cmd, os_system=True) pass ## check for master pod_name = os.environ.get('POD_NAME') print(f'POD_NAME: {pod_name}') interactive_debugging_mode() pass
def __build(root, conf):
    """Run a remote docker build inside the ``build`` pod and push the image.

    Copies ``{root}/docker`` and ``{root}/src`` into the pod, builds the image
    ``<acr server>/<conf['image_name']>`` there, logs in to the registry as
    ``<conf['terraform_prefix']>acr`` and pushes the image. Server and token
    are the contents of ``{root}/secret/acr/server`` and
    ``{root}/secret/acr/token``.
    """
    ## load secrets
    secret_dir = f'{root}/secret/acr'
    acr_server = run(f'cat {secret_dir}/server', return_stdout=True)
    acr_token = run(f'cat {secret_dir}/token', return_stdout=True)

    ## setup build environment: src ends up inside the docker context
    run('kubectl exec build -- mkdir -p /build')
    run(
        f'kubectl cp {root}/docker build:/build/docker && '
        f'kubectl cp {root}/src build:/build/docker/src'
    )

    image_name = acr_server + '/' + conf['image_name']
    acr_name = conf['terraform_prefix'] + 'acr'
    in_pod = 'kubectl exec -it build -- '

    ## build
    run(
        in_pod + f'sh -c "cd /build/docker && docker build -t {image_name} ."',
        os_system=True,
    )

    ## push
    ## NOTE(review): the token is passed on the command line -- confirm this
    ## is acceptable for the environment
    run(in_pod + f'docker login {acr_server} --username {acr_name} --password {acr_token}')
    run(in_pod + f'docker push {image_name}', os_system=True)
def __install_nvidia_drivers(root):
    """Apply the ``azure-cuda-daemon.yaml`` manifest with ``kubectl apply``.

    The manifest is read from ``{root}/src/k8s/``.
    """
    manifest = f'{root}/src/k8s/azure-cuda-daemon.yaml'
    run(f'kubectl apply -f {manifest}', os_system=True)