示例#1
0
def repackage_to_staging(output_path):
  """Package the locally installed code and copy the tarball to staging.

  Args:
    output_path: Base location; the tarball is placed at
      <output_path>/staging/image_classification.tar.gz.

  Returns:
    The path of the staged tarball.
  """

  import google.datalab.ml as datalab_ml

  module_dir = os.path.dirname(__file__)
  # This module sits at [package_root]/mltoolbox/image/classification, so the
  # package root is three directories up.
  root_dir = os.path.join(module_dir, '../../../')
  # A setup.py is deployed next to this module for repackaging.
  setup_file = os.path.join(module_dir, 'setup.py')
  tarball_url = os.path.join(output_path, 'staging', 'image_classification.tar.gz')
  datalab_ml.package_and_copy(root_dir, setup_file, tarball_url)
  return tarball_url
示例#2
0
def repackage_to_staging(output_path):
    """Rebuild the package from its installed location and stage the tarball.

    Args:
        output_path: Base location; 'staging/image_classification.tar.gz'
            is appended to it to form the destination.

    Returns:
        The destination path that was handed to package_and_copy.
    """

    import google.datalab.ml as datalab_ml

    this_dir = os.path.dirname(__file__)

    # __file__ lives under [package_root]/mltoolbox/image/classification,
    # which puts the package root three levels above this directory.
    pkg_root = os.path.join(this_dir, '../../../')

    # setup.py ships in the same directory so the package can be rebuilt.
    setup_script = os.path.join(this_dir, 'setup.py')

    destination = os.path.join(
        output_path, 'staging', 'image_classification.tar.gz')
    datalab_ml.package_and_copy(pkg_root, setup_script, destination)
    return destination
示例#3
0
def _package_to_staging(staging_package_url):
    """Build this package from its installed location and copy it to GCS.

    Args:
      staging_package_url: GCS path; the tarball is placed under it at
        'staging/trainer.tar.gz'.

    Returns:
      The full path of the tarball that was built and copied.
    """
    import google.datalab.ml as datalab_ml

    here = os.path.dirname(__file__)

    # __file__ is [package_root]/mltoolbox/_structured_data/this_file, so the
    # package root is two directories above this one.
    root_dir = os.path.abspath(os.path.join(here, '../../'))
    setup_file = os.path.abspath(os.path.join(here, 'master_setup.py'))
    destination = os.path.join(staging_package_url, 'staging', 'trainer.tar.gz')

    print('Building package and uploading to %s' % destination)
    datalab_ml.package_and_copy(root_dir, setup_file, destination)

    return destination
示例#4
0
def _train(args, cell):
    """Run trainer.task locally or submit it as a cloud training job.

    Args:
      args: dict of parsed command arguments. Keys read here: 'cloud',
        'output_dir', 'output_dir_from_analysis_step' and 'package'.
      cell: cell body, parsed into a config dict. 'training_data' and
        'evaluation_data' are required, 'cloud' is required when
        args['cloud'] is set, and 'model_args' is optional.

    Raises:
      ValueError: if a data entry is neither a dict with a recognized key nor
        a google.datalab.ml.CsvDataSet, or if a cloud run is requested with
        an output_dir that does not start with "gs://".
    """
    commands = google.datalab.utils.commands
    environment = commands.notebook_environment()
    config = commands.parse_config(cell, environment)

    required = ['training_data', 'evaluation_data']
    if args['cloud']:
        required.append('cloud')
    commands.validate_config(config,
                             required_keys=required,
                             optional_keys=['model_args'])

    job_args = [
        '--job-dir',
        _abs_path(args['output_dir']),
        '--output-dir-from-analysis-step',
        _abs_path(args['output_dir_from_analysis_step']),
    ]

    def _add_data_args(source, flag, out):
        """Append the command-line arguments for one data source to out."""
        if isinstance(source, dict):
            if 'csv_file_pattern' in source:
                out.extend([flag, _abs_path(source['csv_file_pattern'])])
                # Raw CSV input needs the transform step; add the flag once.
                if '--run-transforms' not in out:
                    out.append('--run-transforms')
                return
            if 'transformed_file_pattern' in source:
                out.extend(
                    [flag, _abs_path(source['transformed_file_pattern'])])
                return
            raise ValueError(
                'Invalid training_data dict. '
                'Requires either "csv_file_pattern" or "transformed_file_pattern".'
            )

        if isinstance(source, google.datalab.ml.CsvDataSet):
            for input_file in source.input_files:
                out.append(flag + '=' + _abs_path(input_file))
            return

        raise ValueError(
            'Invalid training data. Requires either a dict, or '
            'a google.datalab.ml.CsvDataSet')

    _add_data_args(config['training_data'], '--train-data-paths', job_args)
    _add_data_args(config['evaluation_data'], '--eval-data-paths', job_args)

    # TODO(brandondutra) document that any model_args that are file paths must
    # be given as an absolute path
    if 'model_args' in config:
        for name, value in six.iteritems(config['model_args']):
            job_args += ['--' + name, str(value)]

    scratch_dir = None
    try:
        if args['package']:
            # A user-supplied package is unpacked into a throwaway directory.
            scratch_dir = tempfile.mkdtemp()
            code_dir = os.path.join(scratch_dir, 'package')
            _archive.extract_archive(args['package'], code_dir)
        else:
            code_dir = MLTOOLBOX_CODE_PATH

        if not args['cloud']:
            local_cmd = ['python', '-m', 'trainer.task'] + job_args
            _shell_process.run_and_monitor(local_cmd,
                                           os.getpid(),
                                           cwd=code_dir)
        else:
            cloud_settings = config['cloud']
            if not args['output_dir'].startswith('gs://'):
                raise ValueError(
                    'Cloud training requires a GCS (starting with "gs://") output_dir.'
                )

            tarball = os.path.join(args['output_dir'], 'staging',
                                   'trainer.tar.gz')
            setup_file = os.path.join(code_dir, 'setup.py')
            datalab_ml.package_and_copy(code_dir, setup_file, tarball)

            request = {
                'package_uris': [tarball],
                'python_module': 'trainer.task',
                'job_dir': args['output_dir'],
                'args': job_args,
            }
            # Entries from the cell's cloud config override the defaults above.
            request.update(cloud_settings)
            submitted = datalab_ml.Job.submit_training(
                request, cloud_settings.get('job_id', None))
            _show_job_link(submitted)
    finally:
        # Only set when a package was extracted.
        if scratch_dir:
            shutil.rmtree(scratch_dir)
示例#5
0
def _train(args, cell):
  if args['cloud_config'] and not args['cloud']:
    raise ValueError('"cloud_config" is provided but no "--cloud". '
                     'Do you want local run or cloud run?')

  job_args = ['--job-dir', _abs_path(args['output']),
              '--analysis', _abs_path(args['analysis'])]

  def _process_train_eval_data(data, arg_name, job_args):
    if isinstance(data, dict):
      if 'csv' in data:
        job_args.append(arg_name + '=' + _abs_path(data['csv']))
        if '--transform' not in job_args:
          job_args.append('--transform')
      elif 'transformed' in data:
        job_args.append(arg_name + '=' + _abs_path(data['transformed']))
      else:
        raise ValueError('Invalid training_data dict. '
                         'Requires either "csv" or "transformed".')
    elif isinstance(data, google.datalab.ml.CsvDataSet):
      for file_name in data.input_files:
        job_args.append(arg_name + '=' + _abs_path(file_name))
    else:
      raise ValueError('Invalid training data. Requires either a dict, or '
                       'a google.datalab.ml.CsvDataSet')

  _process_train_eval_data(args['training_data'], '--train', job_args)
  _process_train_eval_data(args['evaluation_data'], '--eval', job_args)

  # TODO(brandondutra) document that any model_args that are file paths must
  # be given as an absolute path
  if args['model_args']:
    for k, v in six.iteritems(args['model_args']):
      job_args.extend(['--' + k, str(v)])

  try:
    tmpdir = None
    if args['package']:
      tmpdir = tempfile.mkdtemp()
      code_path = os.path.join(tmpdir, 'package')
      _archive.extract_archive(args['package'], code_path)
    else:
      code_path = MLTOOLBOX_CODE_PATH

    if args['cloud']:
      cloud_config = args['cloud_config']
      if not args['output'].startswith('gs://'):
        raise ValueError('Cloud training requires a GCS (starting with "gs://") output.')

      staging_tarball = os.path.join(args['output'], 'staging', 'trainer.tar.gz')
      datalab_ml.package_and_copy(code_path,
                                  os.path.join(code_path, 'setup.py'),
                                  staging_tarball)
      job_request = {
          'package_uris': [staging_tarball],
          'python_module': 'trainer.task',
          'job_dir': args['output'],
          'args': job_args,
      }
      job_request.update(cloud_config)
      job_id = cloud_config.get('job_id', None)
      job = datalab_ml.Job.submit_training(job_request, job_id)
      _show_job_link(job)
    else:
      cmd_args = ['python', '-m', 'trainer.task'] + job_args
      _shell_process.run_and_monitor(cmd_args, os.getpid(), cwd=code_path)
  finally:
    if tmpdir:
      shutil.rmtree(tmpdir)