Example #1
def train(job_id, bucket_name, region, config_files, dataset, scale_tier,
          master_type, worker_type, worker_count, parameter_server_type,
          parameter_server_count):
    account = ServiceAccount()
    account.validate_region(region)

    if bucket_name is None:
        bucket_name = 'luminoth-{}'.format(account.client_id)
        click.echo(
            'Bucket name not specified. Using "{}".'.format(bucket_name))

    # Creates bucket for logs and models if it doesn't exist
    bucket = account.get_bucket(bucket_name)

    if not job_id:
        job_id = 'train_{}'.format(datetime.now().strftime("%Y%m%d_%H%M%S"))

    # Define path in bucket to store job's config, logs, etc.
    base_path = 'lumi_{}'.format(job_id)

    package_path = build_package(bucket, base_path)
    job_dir = 'gs://{}/{}'.format(bucket_name, base_path)

    override_params = [
        'train.job_dir={}'.format(job_dir),
    ]

    if dataset:
        # Check if absolute or relative dataset path
        if not dataset.startswith('gs://'):
            dataset = 'gs://{}'.format(dataset)
        override_params.append('dataset.dir={}'.format(dataset))

    config = get_config(config_files, override_params=override_params)

    # Upload final config file to the job bucket
    config_path = '{}/{}'.format(base_path, DEFAULT_CONFIG_FILENAME)
    upload_data(bucket, config_path, dump_config(config))

    args = ['--config', '{}/{}'.format(job_dir, DEFAULT_CONFIG_FILENAME)]

    cloudml = account.cloud_service('ml')

    training_inputs = {
        'scaleTier': scale_tier,
        'packageUris': ['gs://{}/{}'.format(bucket_name, package_path)],
        'pythonModule': 'luminoth.train',
        'args': args,
        'region': region,
        'jobDir': job_dir,
        'runtimeVersion': RUNTIME_VERSION
    }

    if scale_tier == 'CUSTOM':
        training_inputs['masterType'] = master_type
        if worker_count > 0:
            training_inputs['workerCount'] = worker_count
            training_inputs['workerType'] = worker_type

        if parameter_server_count > 0:
            training_inputs['parameterServerCount'] = parameter_server_count
            training_inputs['parameterServerType'] = parameter_server_type

    job_spec = {'jobId': job_id, 'trainingInput': training_inputs}

    jobrequest = cloudml.projects().jobs().create(
        body=job_spec, parent='projects/{}'.format(account.project_id))

    try:
        click.echo('Submitting training job.')
        res = jobrequest.execute()
        click.echo('Job submitted successfully.')
        click.echo('state = {}, createTime = {}'.format(
            res.get('state'), res.get('createTime')))
        click.echo('\nJob id: {}'.format(job_id))

        save_run(config, environment='gcloud', extra_config=job_spec)

    except Exception as err:
        click.echo('There was an error creating the training job. '
                   'Check the details: \n{}'.format(err._get_reason()))
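
A small follow-up sketch: once the job is submitted, its state can be polled
through the same discovery client. This helper is an illustrative assumption,
not part of Luminoth; `cloudml`, `account` and `job_id` refer to the names in
Example #1, and `projects.jobs.get` is the Cloud ML Engine v1 API method.

import time

def wait_for_job(cloudml, project_id, job_id, interval=30):
    # Poll the job until it reaches a terminal state.
    name = 'projects/{}/jobs/{}'.format(project_id, job_id)
    while True:
        res = cloudml.projects().jobs().get(name=name).execute()
        state = res.get('state')
        click.echo('Job {} is {}'.format(job_id, state))
        if state in ('SUCCEEDED', 'FAILED', 'CANCELLED'):
            return res
        time.sleep(interval)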
Example #2
def train(job_id, service_account_json, bucket_name, region, config_files,
          dataset, scale_tier, master_type, worker_type, worker_count,
          parameter_server_type, parameter_server_count):

    project_id = get_project_id(service_account_json)
    if project_id is None:
        raise ValueError(
            'Missing "project_id" in service_account_json "{}"'.format(
                service_account_json))

    if bucket_name is None:
        client_id = get_client_id(service_account_json)
        bucket_name = 'luminoth-{}'.format(client_id)
        click.echo(
            'Bucket name not specified. Using "{}".'.format(bucket_name))

    credentials = get_credentials(service_account_json)
    validate_region(region, project_id, credentials)

    # Creates bucket for logs and models if it doesn't exist
    bucket = get_bucket(service_account_json, bucket_name)

    if not job_id:
        job_id = 'train_{}'.format(datetime.now().strftime("%Y%m%d_%H%M%S"))

    # Define path in bucket to store job's config, logs, etc.
    base_path = 'lumi_{}'.format(job_id)

    package_path = build_package(bucket, base_path)
    job_dir = 'gs://{}/{}/'.format(bucket_name, base_path)

    override_params = [
        'train.job_dir={}'.format(job_dir),
    ]

    if dataset:
        # Check if absolute or relative dataset path
        if not dataset.startswith('gs://'):
            dataset = 'gs://{}'.format(dataset)
        override_params.append('dataset.dir={}'.format(dataset))

    config = get_config(config_files, override_params=override_params)
    # TODO: validate the config before submitting the job

    # Upload final config file to the job bucket
    config_path = os.path.join(base_path, DEFAULT_CONFIG_FILENAME)
    upload_data(bucket, config_path, dump_config(config))

    args = ['--config', os.path.join(job_dir, DEFAULT_CONFIG_FILENAME)]

    cloudml = cloud_service(credentials, 'ml')

    training_inputs = {
        'scaleTier': scale_tier,
        'packageUris': [
            'gs://{}/{}'.format(bucket_name, package_path)
        ],
        'pythonModule': 'luminoth.train',
        'args': args,
        'region': region,
        'jobDir': job_dir,
        'runtimeVersion': RUNTIME_VERSION,
    }

    if scale_tier == 'CUSTOM':
        training_inputs['masterType'] = master_type
        if worker_count > 0:
            training_inputs['workerCount'] = worker_count
            training_inputs['workerType'] = worker_type

        if parameter_server_count > 0:
            training_inputs['parameterServerCount'] = parameter_server_count
            training_inputs['parameterServerType'] = parameter_server_type

    job_spec = {
        'jobId': job_id,
        'trainingInput': training_inputs
    }

    jobrequest = cloudml.projects().jobs().create(
        body=job_spec, parent='projects/{}'.format(project_id))

    try:
        click.echo('Submitting training job.')
        res = jobrequest.execute()
        click.echo('Job {} submitted successfully.'.format(job_id))
        click.echo('state = {}, createTime = {}'.format(
            res.get('state'), res.get('createTime')))

        save_run(config, environment='gcloud', extra_config=job_spec)

    except Exception as err:
        click.echo(
            'There was an error creating the training job. '
            'Check the details: \n{}'.format(err._get_reason())
        )
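
Example #2 threads an explicit service-account JSON key file through the
helpers instead of wrapping state in a `ServiceAccount` object. A plausible
reading of two of those helpers, shown only as an illustrative sketch (the
real implementations live inside Luminoth's Google Cloud tooling):

import json

from google.oauth2 import service_account

def get_project_id(service_account_json):
    # The key file is plain JSON carrying a `project_id` field.
    with open(service_account_json) as f:
        return json.load(f).get('project_id')

def get_credentials(service_account_json):
    # Build OAuth2 credentials straight from the key file.
    return service_account.Credentials.from_service_account_file(
        service_account_json)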
Example #3
def train(
    job_id,
    resume_job_id,
    bucket_name,
    region,
    config_files,
    dataset,
    scale_tier,
    master_type,
    worker_type,
    worker_count,
    parameter_server_type,
    parameter_server_count,
):
    account = ServiceAccount()
    account.validate_region(region)

    if bucket_name is None:
        bucket_name = "luminoth-{}".format(account.client_id)
        click.echo(
            'Bucket name not specified. Using "{}".'.format(bucket_name))

    # Creates bucket for logs and models if it doesn't exist
    bucket = account.get_bucket(bucket_name)

    if not job_id:
        job_id = "train_{}".format(datetime.now().strftime("%Y%m%d_%H%M%S"))

    # Path in bucket to store job's config, logs, etc.
    # If we are resuming a previous job, then we will use the same path
    # that job used, so Luminoth will load the checkpoint from there.
    base_path = "lumi_{}".format(resume_job_id if resume_job_id else job_id)

    package_path = build_package(bucket, base_path)
    job_dir = "gs://{}/{}".format(bucket_name, base_path)

    override_params = [
        "train.job_dir={}".format(job_dir),
    ]

    if dataset:
        # Check if absolute or relative dataset path
        if not dataset.startswith("gs://"):
            dataset = "gs://{}".format(dataset)
        override_params.append("dataset.dir={}".format(dataset))

    # Even if we are resuming job, we will use a new config. Thus, we will
    # overwrite the config in the old job's dir if it existed.
    config = get_config(config_files, override_params=override_params)

    # Upload final config file to the job bucket
    config_path = "{}/{}".format(base_path, DEFAULT_CONFIG_FILENAME)
    upload_data(bucket, config_path, dump_config(config))

    args = ["--config", "{}/{}".format(job_dir, DEFAULT_CONFIG_FILENAME)]

    cloudml = account.cloud_service("ml")

    training_inputs = {
        "scaleTier": scale_tier,
        "packageUris": ["gs://{}/{}".format(bucket_name, package_path)],
        "pythonModule": "luminoth.train",
        "args": args,
        "region": region,
        "jobDir": job_dir,
        "runtimeVersion": RUNTIME_VERSION,
        "pythonVersion": PYTHON_VERSION,
    }

    if scale_tier == "CUSTOM":
        training_inputs["masterType"] = master_type
        if worker_count > 0:
            training_inputs["workerCount"] = worker_count
            training_inputs["workerType"] = worker_type

        if parameter_server_count > 0:
            training_inputs["parameterServerCount"] = parameter_server_count
            training_inputs["parameterServerType"] = parameter_server_type

    job_spec = {"jobId": job_id, "trainingInput": training_inputs}

    jobrequest = cloudml.projects().jobs().create(
        body=job_spec, parent="projects/{}".format(account.project_id))

    try:
        click.echo("Submitting training job.")
        res = jobrequest.execute()
        click.echo("Job submitted successfully.")
        click.echo("state = {}, createTime = {}".format(
            res.get("state"), res.get("createTime")))
        if resume_job_id:
            click.echo(
                "\nNote: this job is resuming job {}.\n".format(resume_job_id))
        click.echo("Job id: {}".format(job_id))
        click.echo("Job directory: {}".format(job_dir))

        save_run(config, environment="gcloud", extra_config=job_spec)

    except Exception as err:
        click.echo("There was an error creating the training job. "
                   "Check the details: \n{}".format(err._get_reason()))
Example #4
def train(job_id, service_account_json, bucket_name, region, config_files,
          dataset, scale_tier, master_type, worker_type, worker_count,
          parameter_server_type, parameter_server_count):

    project_id = get_project_id(service_account_json)
    if project_id is None:
        raise ValueError(
            'Missing "project_id" in service_account_json "{}"'.format(
                service_account_json))

    if bucket_name is None:
        client_id = get_client_id(service_account_json)
        bucket_name = 'luminoth-{}'.format(client_id)
        click.echo(
            'Bucket name not specified. Using "{}".'.format(bucket_name))

    credentials = get_credentials(service_account_json)
    validate_region(region, project_id, credentials)

    # Creates bucket for logs and models if it doesn't exist
    bucket = get_bucket(service_account_json, bucket_name)

    if not job_id:
        job_id = 'train_{}'.format(datetime.now().strftime("%Y%m%d_%H%M%S"))

    # Define path in bucket to store job's config, logs, etc.
    base_path = 'lumi_{}'.format(job_id)

    package_path = build_package(bucket, base_path)

    # Check if absolute or relative dataset path
    if not dataset.startswith('gs://'):
        dataset = 'gs://{}'.format(dataset)

    override_params = [
        'dataset.dir={}'.format(dataset),
    ]

    custom_config = load_config(config_files)
    model_class = get_model(custom_config.model.type)
    config = get_model_config(
        model_class.base_config,
        custom_config,
        override_params,
    )
    # TODO: validate the config before submitting the job

    # Upload final config file to the job bucket
    config_path = os.path.join(base_path, DEFAULT_CONFIG_FILENAME)
    upload_data(bucket, config_path, dump_config(config))

    args = ['--config', 'gs://{}/{}'.format(bucket_name, config_path)]

    cloudml = cloud_service(credentials, 'ml')

    training_inputs = {
        'scaleTier': scale_tier,
        'packageUris': ['gs://{}/{}'.format(bucket_name, package_path)],
        'pythonModule': 'luminoth.train',
        'args': args,
        'region': region,
        'jobDir': 'gs://{}/{}/'.format(bucket_name, base_path),
        'runtimeVersion': RUNTIME_VERSION
    }

    if scale_tier == 'CUSTOM':
        training_inputs['masterType'] = master_type
        if worker_count > 0:
            training_inputs['workerCount'] = worker_count
            training_inputs['workerType'] = worker_type
        if parameter_server_count > 0:
            training_inputs['parameterServerCount'] = parameter_server_count
            training_inputs['parameterServerType'] = parameter_server_type

    job_spec = {'jobId': job_id, 'trainingInput': training_inputs}

    jobrequest = cloudml.projects().jobs().create(
        body=job_spec, parent='projects/{}'.format(project_id))

    try:
        click.echo('Submitting training job.')
        res = jobrequest.execute()
        click.echo('Job {} submitted successfully.'.format(job_id))
        click.echo('state = {}, createTime = {}'.format(
            res.get('state'), res.get('createTime')))

        save_run(config, environment='gcloud', extra_config=job_spec)

    except Exception as err:
        click.echo('There was an error creating the training job. '
                   'Check the details: \n{}'.format(err._get_reason()))
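
When `scale_tier` is 'CUSTOM', Cloud ML Engine expects explicit machine types
for the master and, when the counts are non-zero, for workers and parameter
servers. An illustrative CUSTOM submission against Example #4's signature
(the machine-type names are legacy Cloud ML Engine identifiers; every other
value is a placeholder):

train(
    job_id='my_custom_job',
    service_account_json='key.json',
    bucket_name='my-luminoth-bucket',
    region='us-central1',
    config_files=('config.yml',),
    dataset='my-bucket/datasets/tfrecords',
    scale_tier='CUSTOM',
    master_type='standard_gpu',
    worker_type='standard_gpu',
    worker_count=2,
    parameter_server_type='standard',
    parameter_server_count=1,
)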