def gh_summ(  #pylint: disable=unused-argument
    train_steps: 'Integer' = 2019300,
    project: str = 'YOUR_PROJECT_HERE',
    github_token: str = 'YOUR_GITHUB_TOKEN_HERE',
    working_dir: 'GCSPath' = 'gs://YOUR_GCS_DIR_HERE',
    checkpoint_dir: 'GCSPath' = 'gs://aju-dev-demos-codelabs/kubecon/model_output_tbase.bak2019000/',
    deploy_webapp: str = 'true',
    data_dir: 'GCSPath' = 'gs://aju-dev-demos-codelabs/kubecon/t2t_data_gh_all/'
):
    """Assemble the copy -> train -> serve -> (optional) webapp pipeline graph.

    Args:
        train_steps: Forwarded to the training step as ``train_steps``.
        project: Not referenced in the body (hence the pylint suppression).
        github_token: Passed to the webapp container as ``--github_token``.
        working_dir: Root under which the per-run ``model_output`` path is built.
        checkpoint_dir: Forwarded to the copy step as ``checkpoint_dir``.
        deploy_webapp: Forwarded to the training step as ``deploy_webapp``.
        data_dir: Forwarded to both the copy and the training steps.
    """
    # Per-run strings that the steps below share.
    run_model_dir = '%s/%s/model_output' % (working_dir, dsl.RUN_ID_PLACEHOLDER)
    served_model_name = 'ghsumm-%s' % (dsl.RUN_ID_PLACEHOLDER,)

    copydata = copydata_op(
        data_dir=data_dir,
        checkpoint_dir=checkpoint_dir,
        model_dir=run_model_dir,
        action=COPY_ACTION,
    )

    # Training reads its model directory from the copy step's output.
    train = train_op(
        data_dir=data_dir,
        model_dir=copydata.outputs['copy_output_path'],
        action=TRAIN_ACTION,
        train_steps=train_steps,
        deploy_webapp=deploy_webapp,
    )

    serve_args = [
        "--model_name", served_model_name,
        "--model_path", train.outputs['train_output_path'],
    ]
    serve = dsl.ContainerOp(
        name='serve',
        image='gcr.io/google-samples/ml-pipeline-kubeflow-tfserve:v5',
        arguments=serve_args,
    )

    # One GPU, preemptible node pool, up to 10 retries for the training step.
    gpu_train = train.set_gpu_limit(1)
    preemptible_train = gpu_train.apply(gcp.use_preemptible_nodepool())
    preemptible_train.set_retry(10)

    # The webapp step is only created when training reports 'true'
    # on its 'launch_server' output.
    with dsl.Condition(train.outputs['launch_server'] == 'true'):
        webapp_args = [
            "--model_name", served_model_name,
            "--github_token", github_token,
        ]
        webapp = dsl.ContainerOp(
            name='webapp',
            image='gcr.io/google-samples/ml-pipeline-webapp-launcher:v7ap',
            arguments=webapp_args,
        )
        webapp.after(serve)
def gh_summ(  #pylint: disable=unused-argument
    train_steps=2019300,
    project='YOUR_PROJECT_HERE',
    github_token='YOUR_GITHUB_TOKEN_HERE',
    working_dir='YOUR_GCS_DIR_HERE',
    checkpoint_dir='gs://aju-dev-demos-codelabs/kubecon/model_output_tbase.bak2019000',
    deploy_webapp='true',
    data_dir='gs://aju-dev-demos-codelabs/kubecon/t2t_data_gh_all/'):
    """Assemble the copy/train/serve pipeline graph with metadata logging.

    Steps created, in order: data copy, dataset metadata log, training,
    model metadata log, serving, and a webapp step that is conditional on
    the training step's output being ``'true'``.

    Args:
        train_steps: Forwarded to the training step as ``train_steps``.
        project: Not referenced in the body (hence the pylint suppression).
        github_token: Passed to the webapp container as ``--github_token``.
        working_dir: Root under which the per-run ``model_output`` path is built.
        checkpoint_dir: Forwarded to the copy and training steps.
        deploy_webapp: Forwarded to the training step as ``deploy_webapp``.
        data_dir: Forwarded to the copy and training steps and logged as the
            dataset URI.
    """
    # Per-run strings that several steps share.
    run_name = '{{workflow.name}}'
    run_model_dir = '%s/%s/model_output' % (working_dir, run_name)
    export_dir = '%s/%s/model_output/export' % (working_dir, run_name)
    served_model_name = 'ghsumm-%s' % (run_name, )

    copydata = copydata_op(
        working_dir=working_dir,
        data_dir=data_dir,
        checkpoint_dir=checkpoint_dir,
        model_dir=run_model_dir,
        action=COPY_ACTION)
    copydata = copydata.apply(gcp.use_gcp_secret('user-gcp-sa'))

    log_dataset = metadata_log_op(
        log_type=DATASET,
        workspace_name=WORKSPACE_NAME,
        run_name=run_name,
        data_uri=data_dir)

    train = train_op(
        working_dir=working_dir,
        data_dir=data_dir,
        checkpoint_dir=checkpoint_dir,
        model_dir=run_model_dir,
        action=TRAIN_ACTION,
        train_steps=train_steps,
        deploy_webapp=deploy_webapp)
    train = train.apply(gcp.use_gcp_secret('user-gcp-sa'))

    log_model = metadata_log_op(
        log_type=MODEL,
        workspace_name=WORKSPACE_NAME,
        run_name=run_name,
        model_uri=run_model_dir)

    serve = dsl.ContainerOp(
        name='serve',
        image='gcr.io/google-samples/ml-pipeline-kubeflow-tfserve',
        arguments=[
            "--model_name", served_model_name,
            "--model_path", export_dir,
        ])

    # Explicit ordering between the steps.
    log_dataset.after(copydata)
    train.after(copydata)
    log_model.after(train)
    serve.after(train)

    # Training resources: 4 GPUs, preemptible node pool, up to 5 retries,
    # and a 48G memory limit.
    gpu_train = train.set_gpu_limit(4)
    preemptible_train = gpu_train.apply(gcp.use_preemptible_nodepool())
    preemptible_train.set_retry(5)
    train.set_memory_limit('48G')

    # The webapp step is only created when the training output is 'true'.
    with dsl.Condition(train.output == 'true'):
        webapp = dsl.ContainerOp(
            name='webapp',
            image='gcr.io/google-samples/ml-pipeline-webapp-launcher:v2ap',
            arguments=[
                "--model_name", served_model_name,
                "--github_token", github_token,
            ])
        webapp.after(serve)
def flipcoin():
    """Create one FlipCoinOp step with preemptible-pool, GPU and retry settings."""
    coin_op = FlipCoinOp()
    # Apply the preemptible node pool modifier first, then request one
    # 'nvidia' GPU, then allow up to 5 retries.
    on_preemptible = coin_op.apply(gcp.use_preemptible_nodepool())
    with_gpu = on_preemptible.set_gpu_limit(1, 'nvidia')
    flip = with_gpu.set_retry(5)
def baseline_repro_pipeline(
    data_bucket: str = 'voxsrc-2020-voxceleb-v4',
    test_list: str = 'vox1_full.txt',
    # @note test_utterances_list is in the same format as train_list, but for
    #       the test data. Whereas test_list contains utterance pairs for
    #       evaluation
    test_utterances_list: str = 'vox1_full_utterances.txt',
    train_list: str = 'vox2_full.txt',
    test_path: str = 'vox1_full.tar.gz',
    train_path: str = 'vox2_full.tar.gz',
    checkpoint_bucket: str = 'voxsrc-2020-checkpoints',
    batch_size: int = 750,
    max_epoch: int = 21,
    n_speakers: int = 2,
    test_interval: int = 3,
    feature_extraction_threads: int = 16,
    data_loader_threads: int = 7,
    # @note This run ID contains "full" pre-extracted features for vox1 and vox2
    reuse_run_with_id: str = "milo_webster-19rvuxfu",
    gaussian_noise_std: float = .9,
):
    """Assemble the feature-extraction -> training pipeline graph.

    Args:
        data_bucket: Forwarded to both the feature-extraction and training steps.
        test_list: Evaluation utterance-pair list; forwarded to training.
        test_utterances_list: Per-utterance list for the test data; forwarded
            to feature extraction.
        train_list: Forwarded to both feature extraction and training.
        test_path: Forwarded to feature extraction as ``test_path``.
        train_path: Forwarded to feature extraction as ``train_path``.
        checkpoint_bucket: Forwarded to training as ``checkpoint_bucket``.
        batch_size: Forwarded to training.
        max_epoch: Forwarded to training.
        n_speakers: Forwarded to training.
        test_interval: Forwarded to training.
        feature_extraction_threads: Forwarded to feature extraction as
            ``num_threads``.
        data_loader_threads: Forwarded to training as ``n_data_loader_thread``.
        reuse_run_with_id: Forwarded to feature extraction; when falsy, the
            feature-extraction step gets larger CPU requests/limits.
        gaussian_noise_std: Forwarded to training.

    Raises:
        RuntimeError: If ``WANDB_API_KEY`` is not set in the environment.
    """
    # set prod_hw=True to enable production hardware (preemptible V100).
    # Encountered odd issues when node resource constraints aren't known at
    # "compile time" of kf pipeline file
    prod_hw = True

    run_id = '{{workflow.uid}}'

    # Feature extraction takes the per-utterance list. Previously test_list
    # (the evaluation pairs file) was passed here and test_utterances_list
    # was left unused.
    feature_extraction_task = feature_extraction_op(
        data_bucket=data_bucket,
        test_utterances_list=test_utterances_list,
        train_list=train_list,
        test_path=test_path,
        train_path=train_path,
        run_id=run_id,
        num_threads=feature_extraction_threads,
        reuse_run_with_id=reuse_run_with_id)

    # default feature extractor to high-perf pool if not in pass-through mode
    # if in pass-through mode, there's no reason to use a beefy node
    # NOTE(review): if this function is compiled as a KFP pipeline, this
    # argument is presumably a placeholder object that is always truthy, so
    # the branch may never run -- confirm against how the pipeline is built.
    if not reuse_run_with_id:
        feature_extraction_task.set_cpu_request("9").set_cpu_limit("16")

    # Training consumes the feature tarballs produced by feature extraction.
    train_task = train_op(
        data_bucket=data_bucket,
        test_list=test_list,
        train_list=train_list,
        test_path=feature_extraction_task.outputs['test_feats_tar_path'],
        train_path=feature_extraction_task.outputs['train_feats_tar_path'],
        batch_size=batch_size,
        max_epoch=max_epoch,
        checkpoint_bucket=checkpoint_bucket,
        run_id=run_id,
        n_speakers=n_speakers,
        test_interval=test_interval,
        gaussian_noise_std=gaussian_noise_std,
        n_data_loader_thread=data_loader_threads,
    )

    # Mount the shared-memory volume at /dev/shm for the training step.
    train_task.add_pvolumes({'/dev/shm': ipc_shared_mem_volume})
    train_task.after(feature_extraction_task)

    # add Weights & Biases credentials
    if "WANDB_API_KEY" not in os.environ:
        # Raising a bare string is a TypeError in Python 3; raise a real
        # exception so the intended message reaches the user.
        raise RuntimeError('Error: No WandB API key set in environment')
    train_task.add_env_variable(
        k8s_client.V1EnvVar(name='WANDB_API_KEY',
                            value=os.environ["WANDB_API_KEY"]))

    # @note These resource requests autoscale an autoscalable node pool from
    #       0->1 that matches the corresponding config. Autoscaled nodes will be
    #       deactivated on GCP after 10 minutes of inactivity
    if prod_hw:
        # require training to run on a preemptible node pool
        train_task \
            .apply(gcp.use_preemptible_nodepool(hard_constraint=True)) \
            .set_retry(5)
        # require training to run on a node with an nvidia-tesla-v100 GPU
        train_task \
            .set_gpu_limit(1) \
            .add_node_selector_constraint('cloud.google.com/gke-accelerator',
                                          'nvidia-tesla-v100')