示例#1
0
def run_test(args):
    """Run a TFJob end-to-end test and record the outcome as a JUnit test case.

    Renders the Jinja2 job spec at ``args.spec`` with ``args.image_tag``,
    appends a short random suffix to the job name, submits the job with
    ``tf_job_client.create_tf_job`` and waits for it with
    ``tf_job_client.wait_for_job``.  The test case is marked as failed when
    the job finishes in a state other than ``succeeded`` or when
    ``util.TimeoutError`` is raised.  If ``args.junit_path`` is set, the
    result is always written there, even when an exception propagates.

    Args:
        args: Parsed command-line arguments.  The attributes read here are
            ``project``, ``cluster``, ``zone``, ``spec``, ``image_tag`` and
            ``junit_path``.

    Raises:
        ValueError: If ``args.image_tag`` is empty.
    """
    gcs_client = storage.Client(project=args.project)
    project = args.project
    cluster_name = args.cluster
    zone = args.zone
    util.configure_kubectl(project, zone, cluster_name)
    util.load_kube_config()

    api_client = k8s_client.ApiClient()

    t = test_util.TestCase()
    t.class_name = "tfjob_test"
    t.name = os.path.basename(args.spec)

    loader = jinja2.FileSystemLoader(os.path.dirname(args.spec))

    if not args.image_tag:
        raise ValueError("--image_tag must be provided.")

    logging.info("Loading spec from %s with image_tag=%s", args.spec,
                 args.image_tag)
    spec_contents = jinja2.Environment(loader=loader).get_template(
        os.path.basename(args.spec)).render(image_tag=args.image_tag)

    # NOTE(review): this used to be ``yaml.load(spec_contents)``.  Calling
    # yaml.load without an explicit Loader is rejected by PyYAML >= 6 and can
    # construct arbitrary Python objects on older versions.  safe_load is used
    # on the assumption that the job spec is plain YAML without Python tags.
    spec = yaml.safe_load(spec_contents)

    # Make the job name unique.
    spec["metadata"]["name"] += "-" + uuid.uuid4().hex[0:4]

    # Seed these from the submitted spec so the timeout handler below can
    # always build its message, even if the timeout happens before the create
    # call has returned the server-side values.
    name = spec["metadata"]["name"]
    namespace = spec["metadata"].get("namespace")

    # Started outside the try block so the finally clause never depends on
    # anything bound inside it.
    start = time.time()
    try:
        api_response = tf_job_client.create_tf_job(api_client, spec)
        namespace = api_response["metadata"]["namespace"]
        name = api_response["metadata"]["name"]

        logging.info("Created job %s in namespaces %s", name, namespace)
        results = tf_job_client.wait_for_job(
            api_client,
            namespace,
            name,
            status_callback=tf_job_client.log_status)

        if results["status"]["state"] != "succeeded":
            t.failure = "Job {0} in namespace {1} in state {2}".format(
                name, namespace, results["status"]["state"])

        # TODO(jlewi):
        #  Here are some validation checks to run:
        #  1. Check tensorboard is created if its part of the job spec.
        #  2. Check that all resources are garbage collected.
        # TODO(jlewi): Add an option to add chaos and randomly kill various resources?
        # TODO(jlewi): Are there other generic validation checks we should
        # run.
    except util.TimeoutError:
        t.failure = "Timeout waiting for {0} in namespace {1} to finish.".format(
            name, namespace)
    finally:
        t.time = time.time() - start
        if args.junit_path:
            test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
示例#2
0
def main(argv=None):
    """Launch a Kubeflow TFJob for a container image and wait for it.

    The command line has the form ``<launcher flags> -- <container command>``.
    Launcher flags are parsed here; everything after the first ``--`` is
    passed to ``_generate_train_yaml`` as the container command.  The function
    fetches GKE credentials with ``gcloud``, creates the TFJob, optionally
    writes ``/mlpipeline-ui-metadata.json``, waits for the job, logs replica
    failures, deletes the job and writes the output directory to
    ``/output.txt``.

    Args:
        argv: Unused.  Arguments are always read from ``sys.argv``.
    """
    parser = argparse.ArgumentParser(description='Kubeflow TFJob launcher')
    parser.add_argument(
        '--container-image',
        type=str,
        help=
        '''Container image to run using KubeFlow TFJob. The command line should be added after --.'''
    )
    parser.add_argument('--workers', type=int, default=0)
    parser.add_argument('--pss', type=int, default=0)
    parser.add_argument(
        '--cluster',
        type=str,
        help='GKE cluster set up for kubeflow. If set, zone must be provided. '
        + 'If not set, assuming this runs in a GKE container and current ' +
        'cluster is used.')
    parser.add_argument('--zone',
                        type=str,
                        help='zone of the kubeflow cluster.')
    parser.add_argument('--kfversion',
                        type=str,
                        default='v1alpha2',
                        help='The version of the deployed kubeflow. ' +
                        'If not set, the default version is v1alpha2')
    parser.add_argument('--tfjob-ns',
                        type=str,
                        default='default',
                        help='The namespace where the tfjob is submitted. ' +
                        'If not set, the default namespace is default')
    parser.add_argument(
        '--tfjob-timeout-minutes',
        type=int,
        default=10,
        help='Time in minutes to wait for the TFJob to complete')
    parser.add_argument('--output-dir', type=str)
    parser.add_argument('--ui-metadata-type', type=str, default='tensorboard')
    import sys
    all_args = sys.argv[1:]
    # Split launcher flags from the container command at the first ``--``.
    # Without a separator every argument is a launcher flag and the command
    # is empty; previously this raised ValueError, which also broke --help.
    if '--' in all_args:
        separator_idx = all_args.index('--')
        launcher_args = all_args[:separator_idx]
        remaining_args = all_args[separator_idx + 1:]
    else:
        launcher_args = all_args
        remaining_args = []

    args = parser.parse_args(launcher_args)

    logging.getLogger().setLevel(logging.INFO)
    args_dict = vars(args)
    if args.cluster and args.zone:
        cluster = args_dict.pop('cluster')
        zone = args_dict.pop('zone')
    else:
        # Get cluster name and zone from metadata
        metadata_server = "http://metadata/computeMetadata/v1/instance/"
        metadata_flavor = {'Metadata-Flavor': 'Google'}
        cluster = requests.get(metadata_server + "attributes/cluster-name",
                               headers=metadata_flavor).text
        # Only the last path component of the zone response is kept.
        zone = requests.get(metadata_server + "zone",
                            headers=metadata_flavor).text.split('/')[-1]

    logging.info('Getting credentials for GKE cluster %s.', cluster)
    subprocess.call([
        'gcloud', 'container', 'clusters', 'get-credentials', cluster,
        '--zone', zone
    ])

    workers = args_dict.pop('workers')
    pss = args_dict.pop('pss')
    kf_version = args_dict.pop('kfversion')
    tfjob_ns = args_dict.pop('tfjob_ns')
    tfjob_timeout_minutes = args_dict.pop('tfjob_timeout_minutes')
    # Fall back to the TRAINER_IMAGE_NAME environment variable when no
    # --container-image flag was given.
    trainer_image = args.container_image or os.environ['TRAINER_IMAGE_NAME']
    command = remaining_args
    logging.info('Generating training template.')
    template_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                 'train.template.yaml')
    content_yaml = _generate_train_yaml(template_file, tfjob_ns, workers, pss,
                                        trainer_image, command)

    logging.info('Start training.')
    # Set up handler for k8s clients
    config.load_incluster_config()
    api_client = k8s_client.ApiClient()
    create_response = tf_job_client.create_tf_job(api_client,
                                                  content_yaml,
                                                  version=kf_version)
    job_name = create_response['metadata']['name']

    if args.output_dir:
        # Create metadata.json file for visualization.
        metadata = {
            'outputs': [{
                'type': args.ui_metadata_type,
                'source': args.output_dir,
            }]
        }
        with open('/mlpipeline-ui-metadata.json', 'w') as f:
            json.dump(metadata, f)

    timeout = datetime.timedelta(minutes=tfjob_timeout_minutes)
    wait_response = tf_job_client.wait_for_job(api_client,
                                               tfjob_ns,
                                               job_name,
                                               kf_version,
                                               timeout=timeout)
    succ = True
    # TODO: update this failure checking after tf-operator has the condition
    # checking function.
    replica_statuses = wait_response['status']['tfReplicaStatuses']
    for replica, failure_message in (
            ('Worker', 'Training failed since workers failed.'),
            ('PS', 'Training failed since PSs failed.'),
            ('MASTER', 'Training failed since MASTER failed.')):
        if replica in replica_statuses and 'Failed' in replica_statuses[
                replica]:
            logging.error(failure_message)
            succ = False

    # TODO: remove this after kubeflow fixes the wait_for_job issue
    # because the wait_for_job returns when the worker finishes but the master
    # might not be complete yet.
    # The membership test is repeated on every pass so a response without a
    # MASTER entry ends the loop instead of raising KeyError.
    while ('MASTER' in wait_response['status']['tfReplicaStatuses']
           and 'active' in
           wait_response['status']['tfReplicaStatuses']['MASTER']):
        # Wait for master to finish
        time.sleep(2)
        wait_response = tf_job_client.wait_for_job(api_client,
                                                   tfjob_ns,
                                                   job_name,
                                                   kf_version,
                                                   timeout=timeout)

    if succ:
        logging.info('Training success.')

    tf_job_client.delete_tf_job(api_client,
                                tfjob_ns,
                                job_name,
                                version=kf_version)
    with open('/output.txt', 'w') as f:
        # --output-dir is optional; write an empty file instead of failing
        # with TypeError when it was not supplied.
        f.write(args.output_dir or '')
示例#3
0
def main(argv=None):
    """Launch the ML trainer as a Kubeflow TFJob and wait for it to finish.

    Parses the command line, fetches GKE credentials with ``gcloud``, writes
    ``/mlpipeline-ui-metadata.json`` pointing at the working directory, and
    forwards every remaining non-``None`` argument to the training module via
    ``_generate_train_yaml``.  It then creates the TFJob, waits for it, logs
    replica failures, deletes the job and writes ``--job-dir`` to
    ``/output.txt``.

    Args:
        argv: Unused.  Arguments are read from ``sys.argv`` by
            ``parser.parse_args()``.
    """
    parser = argparse.ArgumentParser(description='ML Trainer')
    parser.add_argument('--working-dir',
                        help='Training job working directory.',
                        required=True)
    parser.add_argument('--train-files-dir',
                        help='Path to training data',
                        required=True)
    parser.add_argument('--train-files-prefix',
                        help='The prefix of the training input files.',
                        required=True)

    parser.add_argument(
        '--tf-transform-dir',
        help='Tf-transform directory with model from preprocessing step',
        required=True)

    parser.add_argument('--output-dir',
                        help="""\
      Directory under which which the serving model (under /serving_model_dir)\
      and the tf-mode-analysis model (under /eval_model_dir) will be written\
      """,
                        required=True)

    parser.add_argument('--eval-files-dir',
                        help='Path to evaluation data',
                        required=True)
    parser.add_argument('--eval-files-prefix',
                        help='The prefix of the eval input files.',
                        required=True)

    # Training arguments
    parser.add_argument(
        '--job-dir',
        help='GCS location to write checkpoints and export models',
        required=True)

    # Argument to turn on all logging
    parser.add_argument(
        '--verbosity',
        choices=['DEBUG', 'ERROR', 'FATAL', 'INFO', 'WARN'],
        default='INFO',
    )
    # Experiment arguments
    parser.add_argument('--train-steps',
                        help='Count of steps to run the training job for',
                        required=True,
                        type=int)
    parser.add_argument(
        '--eval-steps',
        help='Number of steps to run evalution for at each checkpoint',
        default=100,
        type=int)
    parser.add_argument('--workers', type=int, default=0)
    parser.add_argument('--pss', type=int, default=0)
    parser.add_argument(
        '--cluster',
        type=str,
        help='GKE cluster set up for kubeflow. If set, zone must be provided. '
        + 'If not set, assuming this runs in a GKE container and current ' +
        'cluster is used.')
    parser.add_argument('--zone',
                        type=str,
                        help='zone of the kubeflow cluster.')
    parser.add_argument('--kfversion',
                        type=str,
                        default='v1alpha2',
                        help='The version of the deployed kubeflow. ' +
                        'If not set, the default version is v1alpha2')
    parser.add_argument('--tfjob-ns',
                        type=str,
                        default='kubeflow',
                        help='The namespace where the tfjob is submitted. ' +
                        'If not set, the namespace is kubeflow')
    parser.add_argument(
        '--tfjob-timeout-minutes',
        type=int,
        default=10,
        help='Time in minutes to wait for the TFJob to complete')
    args = parser.parse_args()

    logging.getLogger().setLevel(logging.INFO)
    args_dict = vars(args)
    # Always remove the launcher-only flags.  Everything left in args_dict is
    # forwarded to the training module below, so popping them only when both
    # were set leaked a lone --cluster or --zone into the trainer arguments.
    cluster = args_dict.pop('cluster')
    zone = args_dict.pop('zone')
    if not (cluster and zone):
        # Get cluster name and zone from metadata
        metadata_server = "http://metadata/computeMetadata/v1/instance/"
        metadata_flavor = {'Metadata-Flavor': 'Google'}
        cluster = requests.get(metadata_server + "attributes/cluster-name",
                               headers=metadata_flavor).text
        # Only the last path component of the zone response is kept.
        zone = requests.get(metadata_server + "zone",
                            headers=metadata_flavor).text.split('/')[-1]

    logging.info('Getting credentials for GKE cluster %s.', cluster)
    subprocess.call([
        'gcloud', 'container', 'clusters', 'get-credentials', cluster,
        '--zone', zone
    ])

    # Create metadata.json file for visualization.
    tb_dir = args_dict.pop(
        'working_dir')  # don't pass this arg to the training module
    metadata = {
        'outputs': [{
            'type': 'tensorboard',
            'source': tb_dir,
        }]
    }
    with file_io.FileIO('/mlpipeline-ui-metadata.json', 'w') as f:
        json.dump(metadata, f)

    workers = args_dict.pop('workers')
    pss = args_dict.pop('pss')
    kf_version = args_dict.pop('kfversion')
    tfjob_ns = args_dict.pop('tfjob_ns')
    tfjob_timeout_minutes = args_dict.pop('tfjob_timeout_minutes')
    # Rebuild "--flag-name=value" strings for the arguments that remain.
    args_list = [
        '--%s=%s' % (k.replace('_', '-'), v)
        for k, v in six.iteritems(args_dict) if v is not None
    ]
    logging.info('Generating training template.')
    template_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                 'train.template.yaml')
    content_yaml = _generate_train_yaml(template_file, tfjob_ns, workers, pss,
                                        args_list)

    logging.info('Start training.')
    # Set up handler for k8s clients
    config.load_incluster_config()
    api_client = k8s_client.ApiClient()
    create_response = tf_job_client.create_tf_job(api_client,
                                                  content_yaml,
                                                  version=kf_version)
    job_name = create_response['metadata']['name']

    timeout = datetime.timedelta(minutes=tfjob_timeout_minutes)
    wait_response = tf_job_client.wait_for_job(api_client,
                                               tfjob_ns,
                                               job_name,
                                               kf_version,
                                               timeout=timeout)
    succ = True
    # TODO: update this failure checking after tf-operator has the condition
    # checking function.
    replica_statuses = wait_response['status']['tfReplicaStatuses']
    for replica, failure_message in (
            ('Worker', 'Training failed since workers failed.'),
            ('PS', 'Training failed since PSs failed.'),
            ('MASTER', 'Training failed since MASTER failed.')):
        if replica in replica_statuses and 'Failed' in replica_statuses[
                replica]:
            logging.error(failure_message)
            succ = False

    # TODO: remove this after kubeflow fixes the wait_for_job issue
    # because the wait_for_job returns when the worker finishes but the master
    # might not be complete yet.
    # The membership test is repeated on every pass so a response without a
    # MASTER entry ends the loop instead of raising KeyError.
    while ('MASTER' in wait_response['status']['tfReplicaStatuses']
           and 'active' in
           wait_response['status']['tfReplicaStatuses']['MASTER']):
        # Wait for master to finish
        time.sleep(2)
        wait_response = tf_job_client.wait_for_job(api_client,
                                                   tfjob_ns,
                                                   job_name,
                                                   kf_version,
                                                   timeout=timeout)

    if succ:
        logging.info('Training success.')

    tf_job_client.delete_tf_job(api_client,
                                tfjob_ns,
                                job_name,
                                version=kf_version)
    with open('/output.txt', 'w') as f:
        f.write(args.job_dir)
示例#4
0
def main(argv=None):
  """Launch the ML trainer as a Kubeflow TFJob and wait for it to finish.

  Parses the command line, writes ``/mlpipeline-ui-metadata.json`` pointing at
  the working directory, and forwards every remaining non-``None`` argument to
  the training module via ``_generate_train_yaml``.  It then creates the
  TFJob using the in-cluster Kubernetes config, waits for it, logs replica
  failures, deletes the job and writes ``--job-dir`` to ``/output.txt``.

  Args:
    argv: Unused.  Arguments are read from ``sys.argv`` by
      ``parser.parse_args()``.
  """
  parser = argparse.ArgumentParser(description='ML Trainer')
  parser.add_argument(
      '--working-dir',
      help='Training job working directory.',
      required=True)
  parser.add_argument(
      '--train-files-dir',
      help='Path to training data',
      required=True)
  parser.add_argument(
      '--train-files-prefix',
      help='The prefix of the training input files.',
      required=True)

  parser.add_argument(
      '--tf-transform-dir',
      help='Tf-transform directory with model from preprocessing step',
      required=True)

  parser.add_argument(
      '--output-dir',
      help="""\
      Directory under which which the serving model (under /serving_model_dir)\
      and the tf-mode-analysis model (under /eval_model_dir) will be written\
      """,
      required=True)

  parser.add_argument(
      '--eval-files-dir',
      help='Path to evaluation data',
      required=True
  )
  parser.add_argument(
      '--eval-files-prefix',
      help='The prefix of the eval input files.',
      required=True)

  # Training arguments
  parser.add_argument(
      '--job-dir',
      help='GCS location to write checkpoints and export models',
      required=True)

  # Argument to turn on all logging
  parser.add_argument(
      '--verbosity',
      choices=['DEBUG', 'ERROR', 'FATAL', 'INFO', 'WARN'],
      default='INFO',
  )
  # Experiment arguments
  parser.add_argument(
      '--train-steps',
      help='Count of steps to run the training job for',
      required=True,
      type=int)
  parser.add_argument(
      '--eval-steps',
      help='Number of steps to run evalution for at each checkpoint',
      default=100,
      type=int)
  parser.add_argument('--workers', type=int, default=0)
  parser.add_argument('--pss', type=int, default=0)
  parser.add_argument('--cluster', type=str,
                      help='GKE cluster set up for kubeflow. If set, zone must be provided. ' +
                           'If not set, assuming this runs in a GKE container and current ' +
                           'cluster is used.')
  parser.add_argument('--zone', type=str, help='zone of the kubeflow cluster.')
  parser.add_argument('--kfversion', type=str,
                      default='v1beta1',
                      help='The version of the deployed kubeflow. ' +
                           'If not set, the default version is v1beta1')
  parser.add_argument('--tfjob-ns', type=str,
                      default='kubeflow',
                      help='The namespace where the tfjob is submitted. ' +
                           'If not set, the namespace is kubeflow')
  parser.add_argument('--tfjob-timeout-minutes', type=int,
                      default=20,
                      help='Time in minutes to wait for the TFJob to complete')
  args = parser.parse_args()

  logging.getLogger().setLevel(logging.INFO)
  args_dict = vars(args)
  # Always drop the launcher-only flags.  Everything left in args_dict is
  # forwarded to the training module below, so popping them only when both
  # were set leaked a lone --cluster or --zone into the trainer arguments.
  #
  # NOTE: the `gcloud container clusters get-credentials` step is disabled in
  # this version, so the cluster name and zone are not used any further.  The
  # metadata-server lookup that used to fill them in when the flags were
  # missing has been removed because its result was discarded.
  args_dict.pop('cluster')
  args_dict.pop('zone')

  # Create metadata.json file for visualization.
  tb_dir = args_dict.pop('working_dir')  # don't pass this arg to the training module
  metadata = {
    'outputs': [{
      'type': 'tensorboard',
      'source': tb_dir,
    }]
  }
  with file_io.FileIO('/mlpipeline-ui-metadata.json', 'w') as f:
    json.dump(metadata, f)

  workers = args_dict.pop('workers')
  pss = args_dict.pop('pss')
  kf_version = args_dict.pop('kfversion')
  tfjob_ns = args_dict.pop('tfjob_ns')
  tfjob_timeout_minutes = args_dict.pop('tfjob_timeout_minutes')
  # Rebuild "--flag-name=value" strings for the arguments that remain.
  args_list = ['--%s=%s' % (k.replace('_', '-'), v)
               for k, v in six.iteritems(args_dict) if v is not None]
  logging.info('Generating training template.')
  template_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'train.template.yaml')
  content_yaml = _generate_train_yaml(template_file, tfjob_ns, workers, pss, args_list)

  logging.info('Start training.')
  # Set up handler for k8s clients
  config.load_incluster_config()
  api_client = k8s_client.ApiClient()
  create_response = tf_job_client.create_tf_job(api_client, content_yaml, version=kf_version)
  job_name = create_response['metadata']['name']

  wait_response = tf_job_client.wait_for_job(
      api_client, tfjob_ns, job_name, kf_version,
      timeout=datetime.timedelta(minutes=tfjob_timeout_minutes))
  succ = True

  # TODO: update this failure checking after tf-operator has the condition checking function.
  replica_statuses = wait_response['status']['replicaStatuses']
  for replica, failure_message in (
      ('Worker', 'Training failed since workers failed.'),
      ('PS', 'Training failed since PSs failed.'),
      ('Master', 'Training failed since Master failed.')):
    if replica in replica_statuses and 'Failed' in replica_statuses[replica]:
      logging.error(failure_message)
      succ = False

  # NOTE: an earlier workaround that kept calling wait_for_job until the
  # Master replica was no longer 'active' is disabled in this version.

  if succ:
    logging.info('Training success.')

  tf_job_client.delete_tf_job(api_client, tfjob_ns, job_name, version=kf_version)
  with open('/output.txt', 'w') as f:
    f.write(args.job_dir)