예제 #1
0
def get_container(train_op,
                  train_env,
                  train_num_gpus,
                  drive='coco-headset-vol-1'):
    """Attach resources, tolerations and volumes to a training op.

    Args:
        train_op: the training ContainerOp to configure (mutated in place).
        train_env: environment mapping forwarded to ``add_env``.
        train_num_gpus: GPU count; also selects the p3 instance size.
        drive: persistent volume claim name mounted at ``/data/``.
    """
    container = train_op.container
    container.set_memory_request('56Gi')
    container.set_memory_limit('56Gi')
    container.set_cpu_request('7.5')
    container.set_cpu_limit('7.5')
    container.set_gpu_limit(str(train_num_gpus))
    container.add_volume_mount(
        V1VolumeMount(name='tensorboard', mount_path='/shared/tensorboard'))
    container.add_volume_mount(
        V1VolumeMount(name='data', mount_path='/data/'))
    container.add_volume_mount(
        V1VolumeMount(name='shm', mount_path='/dev/shm'))

    task = add_env(add_ssh_volume(train_op), train_env)
    task.add_toleration(
        V1Toleration(key='nvidia.com/gpu',
                     operator='Exists',
                     effect='NoSchedule'))
    # Instance size scales with GPU count (p3.2xlarge has 1 GPU, p3.8xlarge 4, ...).
    task.add_node_selector_constraint('beta.kubernetes.io/instance-type',
                                      f'p3.{2 * train_num_gpus}xlarge')
    task.add_volume(
        V1Volume(name='tensorboard',
                 persistent_volume_claim=V1PersistentVolumeClaimVolumeSource(
                     'tensorboard-research-kf')))
    task.add_volume(
        V1Volume(name='data',
                 persistent_volume_claim=V1PersistentVolumeClaimVolumeSource(
                     drive)))
    # /dev/shm is backed by a memory emptyDir (a host-path variant was left disabled).
    task.add_volume(
        V1Volume(name='shm',
                 empty_dir=V1EmptyDirVolumeSource(medium='Memory')))
예제 #2
0
def use_preemptible_nodepool(toleration: V1Toleration = V1Toleration(
    effect='NoSchedule', key='preemptible', operator='Equal', value='true'),
                             hard_constraint: bool = False):
    """Returns a task modifier that targets GKE preemptible node pools.

    Args:
      toleration (V1Toleration): toleration added to the pod; the default
          matches the ``preemptible`` taint.
      hard_constraint (bool): when True, scheduling on preemptible nodepools
          is required rather than merely preferred. (Default: False)
    """
    def _set_preemptible(task):
        task.add_toleration(toleration)
        preemptible_requirement = V1NodeSelectorRequirement(
            key='cloud.google.com/gke-preemptible',
            operator='In',
            values=['true'])
        term = V1NodeSelectorTerm(
            match_expressions=[preemptible_requirement])
        if hard_constraint:
            # Hard constraint: pods may only land on preemptible nodes.
            node_affinity = V1NodeAffinity(
                required_during_scheduling_ignored_during_execution=
                V1NodeSelector(node_selector_terms=[term]))
        else:
            # Soft constraint: prefer preemptible nodes with a mid weight.
            node_affinity = V1NodeAffinity(
                preferred_during_scheduling_ignored_during_execution=[
                    V1PreferredSchedulingTerm(preference=term, weight=50)
                ])
        task.add_affinity(affinity=V1Affinity(node_affinity=node_affinity))
        return task

    return _set_preemptible
예제 #3
0
def use_preemptible_nodepool(toleration: V1Toleration = V1Toleration(
    effect='NoSchedule', key='preemptible', operator='Equal', value='true')):
    """Returns a task modifier that pins a container op to GKE preemptible nodes.

    The modifier tolerates the preemptible taint and adds the matching
    node-selector constraint.
    """
    def _apply(task):
        # Tolerate the taint, then require the preemptible node label.
        task.add_toleration(toleration)
        task.add_node_selector_constraint('cloud.google.com/gke-preemptible',
                                          'true')
        return task

    return _apply
예제 #4
0
def tolerations():
    """A pipeline whose single op carries a GPU toleration."""
    gpu_toleration = V1Toleration(effect='NoSchedule',
                                  key='gpu',
                                  operator='Equal',
                                  value='run')
    op1 = dsl.ContainerOp(
        name='download',
        image='busybox',
        command=['sh', '-c'],
        arguments=['sleep 10; wget localhost:5678 -O /tmp/results.txt'],
        file_outputs={'downloaded': '/tmp/results.txt'})
    op1.add_toleration(gpu_toleration)
예제 #5
0
파일: gcp.py 프로젝트: rpatil524/pipelines
def add_gpu_toleration(toleration: V1Toleration = V1Toleration(
    effect='NoSchedule', key='nvidia.com/gpu', operator='Equal', value='true')):
    """An operator that configures the GKE GPU nodes in a container op.

    Args:
      toleration: toleration to pods, default is the nvidia.com/gpu label.

    Returns:
      A task modifier suitable for ``ContainerOp.apply``.
    """

    def _set_toleration(task):
        task.add_toleration(toleration)
        # Return the task so the modifier composes with ``.apply()`` chaining
        # (apply returns the modifier's result), matching the other
        # preemptible/toleration modifiers, which all return the task.
        return task

    return _set_toleration
예제 #6
0
def train_eval_epic(owner,
                    project,
                    experiment,
                    model,
                    git_rev,
                    pretrained_s3,
                    mode,
                    train_additional_args='',
                    eval_additional_args=''):
    """Build the EPIC training op and attach resources, volumes and scheduling."""
    train_env = {}
    train_num_gpus = 1

    train_op = components.load_component_from_file('components/train.yaml')(
        owner=owner,
        project=project,
        experiment=experiment,
        model=model,
        git_rev=git_rev,
        pretrained_s3=pretrained_s3,
        mode=mode,
        additional_args=train_additional_args)

    container = train_op.container
    container.set_memory_request('56Gi')
    container.set_memory_limit('56Gi')
    container.set_cpu_request('7.5')
    container.set_cpu_limit('7.5')
    container.set_gpu_limit(str(train_num_gpus))
    container.add_volume_mount(
        V1VolumeMount(name='tensorboard', mount_path='/shared/tensorboard'))
    container.add_volume_mount(
        V1VolumeMount(name='data', mount_path='/data/'))
    container.add_volume_mount(
        V1VolumeMount(name='shm', mount_path='/dev/shm'))

    train_task = add_env(add_ssh_volume(train_op), train_env)
    train_task.add_toleration(
        V1Toleration(key='nvidia.com/gpu',
                     operator='Exists',
                     effect='NoSchedule'))
    # Instance size scales with GPU count (p3.2xlarge has 1 GPU, p3.8xlarge 4, ...).
    train_task.add_node_selector_constraint('beta.kubernetes.io/instance-type',
                                            f'p3.{2 * train_num_gpus}xlarge')
    train_task.add_volume(
        V1Volume(name='tensorboard',
                 persistent_volume_claim=V1PersistentVolumeClaimVolumeSource(
                     'tensorboard-research-kf')))
    train_task.add_volume(
        V1Volume(name='data',
                 persistent_volume_claim=V1PersistentVolumeClaimVolumeSource(
                     'dataset-epic-kitchen')))
    # /dev/shm is backed by a memory emptyDir (a host-path variant was left disabled).
    train_task.add_volume(
        V1Volume(name='shm',
                 empty_dir=V1EmptyDirVolumeSource(medium='Memory')))
예제 #7
0
    def test_tolerations(self):
        """Verify an op's toleration survives the op-to-template conversion."""
        toleration = V1Toleration(effect='NoSchedule',
                                  key='gpu',
                                  operator='Equal',
                                  value='run')
        op1 = dsl.ContainerOp(
            name='download',
            image='busybox',
            command=['sh', '-c'],
            arguments=['sleep 10; wget localhost:5678 -O /tmp/results.txt'],
            file_outputs={'downloaded': '/tmp/results.txt'})
        op1.add_toleration(toleration)

        self._test_op_to_template_yaml(op1, file_base_name='tolerations')
예제 #8
0
    def tolerations(preemptible: bool = True) -> Optional[List[V1Toleration]]:
        """Creates tolerations for the pod spec.

    Args:
    preemptible: tolerate preemptible vm instances

    Returns:
    list of tolerations (empty when preemptible is False)
    """
        if preemptible:
            preemptible_toleration = V1Toleration(
                key=k.NODE_SELECTOR_PREEMPTIBLE,
                operator='Equal',
                value='true',
                effect='NoSchedule')
            return [preemptible_toleration]
        return []
예제 #9
0
    def start_fat_pod(self, node_taint_key, node_taint_value):
        """Create the oversized ("fat") pod in this object's namespace.

        The pod tolerates the given node taint, requests/limits
        FAT_POD_CPU and FAT_POD_MEMORY, never restarts, and only echoes.
        """
        core_api = kubernetes.client.CoreV1Api(_build_client())

        taint_toleration = V1Toleration(key=node_taint_key,
                                        operator="Equal",
                                        value=node_taint_value,
                                        effect="NoSchedule")
        # Requests equal limits, so the pod gets a guaranteed-size footprint.
        fat_resources = V1ResourceRequirements(
            limits={'cpu': FAT_POD_CPU, 'memory': FAT_POD_MEMORY},
            requests={'cpu': FAT_POD_CPU, 'memory': FAT_POD_MEMORY})
        fat_container = V1Container(name=FAT_POD_NAME,
                                    image=FAT_POD_IMAGE,
                                    resources=fat_resources,
                                    command=["echo"],
                                    args=["I am a fat :("])
        metadata = V1ObjectMeta(
            name=FAT_POD_NAME,
            annotations={"sidecar.istio.io/inject": "false"})
        spec = V1PodSpec(restart_policy='Never',
                         priority=0,
                         tolerations=[taint_toleration],
                         containers=[fat_container])
        pod = V1Pod(api_version='v1',
                    kind='Pod',
                    metadata=metadata,
                    spec=spec)

        core_api.create_namespaced_pod(self._namespace, pod)
예제 #10
0
# Load custom components
#######################################

###################
# Train Op: object-detection training component definition.
comp_train_fname = op.join('components', 'od_train', 'component.yaml')
train_component = components.load_component(filename=comp_train_fname)

###################
# Export Op: model-export component definition.
comp_export_fname = op.join('components', 'od_export', 'component.yaml')
export_component = components.load_component(filename=comp_export_fname)

########################################
# Tolerations for ML node taints: the mlUseOnly taint and the nvidia.com/gpu taint.
ml_tol = V1Toleration(effect='NoSchedule', key='mlUseOnly', operator='Equal', value='true')
ml_tol2 = V1Toleration(effect='NoSchedule', key='nvidia.com/gpu', operator='Equal', value='present')

@dsl.pipeline(name='OD API training/export',
              description='A pipeline to train/export an instance segmentation model.')
def divot_detect_pipeline(
        pipeline_config_path,
        model_dir,
        eval_dir,
        inference_output_directory,
        checkpoint_every_n=5000,
        num_train_steps=200000,
        sample_1_of_n_eval_examples=10,
        inference_input_type='encoded_image_string_tensor',
        eval_checkpoint_metric='loss',
        metric_objective_type='min'):
예제 #11
0
#######################################

###################
# Train Op: object-detection training component definition.
comp_train_fname = op.join('components', 'od_train', 'component.yaml')
train_component = components.load_component(filename=comp_train_fname)

###################
# Export Op: model-export component definition.
comp_export_fname = op.join('components', 'od_export', 'component.yaml')
export_component = components.load_component(filename=comp_export_fname)

########################################
# Toleration for the ML node taint (mlUseOnly).
ml_tol = V1Toleration(effect='NoSchedule',
                      key='mlUseOnly',
                      operator='Equal',
                      value='true')


@dsl.pipeline(
    name='OD API training/export',
    description='A pipeline to train/export an instance segmentation model.')
def divot_detect_pipeline(pipeline_config_path,
                          model_dir,
                          eval_dir,
                          inference_output_directory,
                          num_train_steps=200000,
                          sample_1_of_n_eval_examples=10,
                          inference_input_type='encoded_image_string_tensor',
                          eval_checkpoint_metric='loss',
                          metric_objective_type='min'):