示例#1
0
def info(job_name_or_id):
    """
    Log a summary table for a single job.

    The argument is first looked up as a normalized job name; if that lookup
    raises FloydException it is retried with the raw value.
    """
    try:
        job = ExperimentClient().get(normalize_job_name(job_name_or_id))
    except FloydException:
        job = ExperimentClient().get(job_name_or_id)

    instance_id = get_module_task_instance_id(job.task_instances)
    if instance_id:
        instance = TaskInstanceClient().get(instance_id)
    else:
        instance = None

    job_name = normalize_job_name(job.name)
    # The output name is only reported when a task instance was found.
    output_name = job_name + '/output' if instance else None

    rows = [
        ["Job name", job_name],
        ["Output name", output_name],
        ["Created", job.created_pretty],
        ["Status", job.state],
        ["Duration(s)", job.duration_rounded],
        ["Instance", job.instance_type_trimmed],
        ["Description", job.description],
    ]
    if instance and instance.mode in ['jupyter', 'serving']:
        rows += [["Mode", instance.mode], ["Url", job.service_url]]
    if job.tensorboard_url:
        rows.append(["Tensorboard", job.tensorboard_url])

    floyd_logger.info(tabulate(rows))
示例#2
0
def delete(id, yes):
    """
    Delete a single project run together with its module.

    Args:
        id: Identifier passed to ``ExperimentClient().get`` and ``.delete``.
        yes: When truthy, skip the interactive confirmation prompt.

    Runs in the "queued" or "running" state are not deleted; a message asks
    the user to stop them first. When the prompt is shown and declined,
    ``click.confirm`` aborts the command (``abort=True``).
    """
    experiment = ExperimentClient().get(id)
    # A run may have no module task instance; only query the service when an
    # id is actually available.
    task_instance_id = get_module_task_instance_id(experiment.task_instances)
    task_instance = TaskInstanceClient().get(
        task_instance_id) if task_instance_id else None

    if experiment.state in ["queued", "running"]:
        floyd_logger.info(
            "Experiment in {} state cannot be deleted. Stop it first".format(
                experiment.state))
        return

    if not yes:
        click.confirm('Delete Run: {}?'.format(experiment.name),
                      abort=True,
                      default=False)

    # Without a task instance there is no module to clean up.
    if task_instance and task_instance.module_id:
        ModuleClient().delete(task_instance.module_id)

    if ExperimentClient().delete(id):
        floyd_logger.info("Experiment deleted")
    else:
        floyd_logger.error("Failed to delete experiment")
示例#3
0
def clone(id):
    """
    Download the code of a job.

    The archive fetched is the module resource attached to the job's task
    instance, i.e. the files uploaded when the job was started.
    """
    try:
        job = ExperimentClient().get(normalize_job_name(id, use_config=False))
    except FloydException:
        job = ExperimentClient().get(id)

    instance_id = get_module_task_instance_id(job.task_instances)
    instance = None
    if instance_id:
        instance = TaskInstanceClient().get(instance_id)
    if not instance:
        sys.exit(
            "Cannot clone this version of the job. Try a different version.")

    # sys.exit above guarantees a task instance from this point on.
    code_module = ModuleClient().get(instance.module_id)
    code_url = "{}/api/v1/resources/{}?content=true&download=true".format(
        floyd.floyd_host, code_module.resource_id)
    ExperimentClient().download_tar(
        url=code_url,
        untar=True,
        delete_after_untar=True,
    )
示例#4
0
def delete(names, yes):
    """
    Delete every job listed in ``names``.

    Each deletion is confirmed interactively unless ``yes`` is set; declined
    jobs are skipped. The process exits with status 1 if any lookup returned
    nothing or any deletion failed.
    """
    had_failure = False
    for job_name in names:
        try:
            job = ExperimentClient().get(normalize_job_name(job_name))
        except FloydException:
            job = ExperimentClient().get(job_name)

        if not job:
            had_failure = True
            continue

        if not yes:
            confirmed = click.confirm("Delete Job: {}?".format(job.name),
                                      abort=False,
                                      default=False)
            if not confirmed:
                floyd_logger.info("Job {}: Skipped.".format(job.name))
                continue

        if ExperimentClient().delete(job.id):
            floyd_logger.info("Job %s Deleted", job.name)
        else:
            had_failure = True

    if had_failure:
        sys.exit(1)
示例#5
0
def output(id, url, download):
    """
    Show, download, or open the output of a run.

    Args:
        id: Identifier passed to ``ExperimentClient().get``.
        url: When truthy, only log the output URL.
        download: When truthy (and ``url`` is not), download the output
            archive instead of opening the browser.

    An error is logged when the run has no module task instance or that
    instance has no "output" entry.
    """
    experiment = ExperimentClient().get(id)
    # Only look up the task instance when the run actually has one, so a
    # missing instance is reported instead of querying with a missing id.
    task_instance_id = get_module_task_instance_id(experiment.task_instances)
    task_instance = TaskInstanceClient().get(
        task_instance_id) if task_instance_id else None

    if not task_instance or "output" not in task_instance.output_ids:
        floyd_logger.error("Output directory not available")
        return

    resource = ResourceClient().get(task_instance.output_ids["output"])
    output_dir_url = "{}/viewer/{}".format(floyd.floyd_host, resource.uri)
    if url:
        floyd_logger.info(output_dir_url)
    elif download:
        # NOTE(review): the flag is appended with "&" although the viewer URL
        # built above contains no "?" - confirm against the server.
        output_dir_url = "{}&download=true".format(output_dir_url)
        ExperimentClient().download_tar(url=output_dir_url,
                                        untar=True,
                                        delete_after_untar=True)
    else:
        floyd_logger.info("Opening output directory in your browser ...")
        webbrowser.open(output_dir_url)
示例#6
0
def delete(ids, yes):
    """
    Delete every run listed in ``ids``.

    Each deletion is confirmed interactively unless ``yes`` is set; declined
    runs are skipped. The process exits with status 1 if any lookup returned
    nothing or any deletion failed.
    """
    any_failed = False
    for run_id in ids:
        run = ExperimentClient().get(run_id)
        if not run:
            any_failed = True
            continue

        if not yes:
            confirmed = click.confirm("Delete Job: {}?".format(run.name),
                                      abort=False,
                                      default=False)
            if not confirmed:
                floyd_logger.info("Job {}: Skipped.".format(run.name))
                continue

        if ExperimentClient().delete(run.id):
            floyd_logger.info("Job %s Deleted", run.name)
        else:
            any_failed = True

    if any_failed:
        sys.exit(1)
示例#7
0
def status(id):
    """
    Print the status of one run, or of every run when no id is given.
    """
    if not id:
        print_experiments(ExperimentClient().get_all())
        return

    print_experiments([ExperimentClient().get(id)])
示例#8
0
def stop(id):
    """
    Request shutdown of a run that is still queued or running.

    Runs in any other state are left untouched and a message is logged.
    """
    run = ExperimentClient().get(id)
    stoppable = run.state in ["queued", "running"]
    if not stoppable:
        floyd_logger.info("Job in {} state cannot be stopped".format(run.state))
        return

    if not ExperimentClient().stop(run.id):
        floyd_logger.error("Failed to stop job")
        return

    floyd_logger.info("Experiment shutdown request submitted. Check status to confirm shutdown")
示例#9
0
def get_output(id, path, untar, delete_after_untar):
    """
    - Download all files in a dataset or from a Job output

    Eg: alice/projects/mnist/1/files, alice/projects/mnist/1/output or
    alice/dataset/mnist-data/1/

    Using /output will download the files that are saved at the end of
    the job.

    - Download a directory from a dataset or from Job output

    Specify the path to a directory and download all its files and
    subdirectories.

    Eg: --path models/checkpoint1
    """
    source = get_data_object(id, use_data_config=False)
    if not source:
        if "output" in id:
            floyd_logger.info(
                "Note: You cannot clone the output of a running job. You need "
                "to wait for it to finish.")
        sys.exit()

    if not path:
        # No sub-path requested: fetch the whole resource.
        data_url = "{}/api/v1/resources/{}?content=true&download=true".format(
            floyd.floyd_host, source.resource_id)
    else:
        # A sub-directory was requested. The artifact type is derived from
        # the id itself (foo/projects/bar/ or foo/datasets/bar/).
        if "/datasets/" in id:
            artifact_type, artifact_id = "data", source.id
        else:
            artifact_type = "files"
            try:
                job = ExperimentClient().get(
                    normalize_job_name(id, use_config=False))
            except FloydException:
                job = ExperimentClient().get(id)
            artifact_id = job.id

        url_template = "{}/api/v1/download/artifacts/{}/{}?is_dir=true&path={}"
        data_url = url_template.format(
            floyd.floyd_host, artifact_type, artifact_id, path)

    # The archive is only deleted when it was also unpacked.
    remove_archive = untar and delete_after_untar
    DataClient().download_tar(
        url=data_url,
        untar=untar,
        delete_after_untar=remove_archive,
    )
示例#10
0
def output(id, url):
    """
    Show the files page of a job.

    With ``url`` set the page address is only logged; otherwise it is opened
    in the default web browser.
    """
    try:
        job = ExperimentClient().get(normalize_job_name(id))
    except FloydException:
        job = ExperimentClient().get(id)

    files_url = "%s/%s/files" % (floyd.floyd_web_host, job.name)
    if not url:
        floyd_logger.info("Opening output path in your browser ...")
        webbrowser.open(files_url)
        return

    floyd_logger.info(files_url)
示例#11
0
def clone(id):
    """
    Download the code of an experiment.

    Exits with a message when the experiment has no usable task instance.
    """
    run = ExperimentClient().get(id)

    instance_id = get_module_task_instance_id(run.task_instances)
    instance = None
    if instance_id:
        instance = TaskInstanceClient().get(instance_id)
    if not instance:
        sys.exit("Cannot clone this version of the job. Try a different version.")

    # sys.exit above guarantees a task instance from this point on.
    code_module = ModuleClient().get(instance.module_id)
    code_url = "{}/api/v1/resources/{}?content=true&download=true".format(
        floyd.floyd_host, code_module.resource_id)
    ExperimentClient().download_tar(
        url=code_url,
        untar=True,
        delete_after_untar=True,
    )
示例#12
0
def status(id):
    """
    Print the status of one run, or of every run when no id is given.

    A given id is first tried as a normalized job name and, if that lookup
    raises FloydException, retried with the raw value.
    """
    if not id:
        print_experiments(ExperimentClient().get_all())
        return

    try:
        run = ExperimentClient().get(normalize_job_name(id))
    except FloydException:
        run = ExperimentClient().get(id)
    print_experiments([run])
示例#13
0
def logs(id, url, tail, sleep_duration=1):
    """
    Show the logs of a run.

    With ``url`` set only the log URL is logged. With ``tail`` set the log is
    re-fetched every ``sleep_duration`` seconds and any new content is logged;
    that loop never returns on its own.
    """
    run = ExperimentClient().get(id)
    instance = TaskInstanceClient().get(
        get_module_task_instance_id(run.task_instances))
    log_url = "{}/api/v1/resources/{}?content=true".format(
        floyd.floyd_host, instance.log_id)

    if url:
        floyd_logger.info(log_url)
        return

    if not tail:
        contents = get_url_contents(log_url)
        if contents.strip():
            floyd_logger.info(contents)
        else:
            floyd_logger.info("Launching job now. Try after a few seconds.")
        return

    floyd_logger.info("Launching job ...")
    offset = 0
    while True:
        # Re-fetch the whole log and emit only what lies past the last offset.
        contents = get_url_contents(log_url)
        fresh = contents[offset:]
        if fresh.strip():
            floyd_logger.info(fresh)
        offset = len(contents)
        sleep(sleep_duration)
示例#14
0
def info(id):
    """
    Log a summary table for a single run.

    Mode and URL rows are added only while the run is in the "running" state
    and its task instance is in 'jupyter' or 'serving' mode.
    """
    run = ExperimentClient().get(id)
    instance_id = get_module_task_instance_id(run.task_instances)
    if instance_id:
        instance = TaskInstanceClient().get(instance_id)
    else:
        instance = None

    interactive = (run.state == "running" and instance
                   and instance.mode in ['jupyter', 'serving'])
    mode = instance.mode if interactive else None
    url = get_task_url(instance.id) if interactive else None

    rows = [
        ["Run ID", run.id],
        ["Name", run.name],
        ["Created", run.created_pretty],
        ["Status", run.state],
        ["Duration(s)", run.duration_rounded],
        ["Output ID", instance.id if instance else None],
        ["Instance", run.instance_type_trimmed],
        ["Version", run.description],
    ]
    if mode:
        rows.append(["Mode", mode])
    if url:
        rows.append(["Url", url])

    floyd_logger.info(tabulate(rows))
示例#15
0
def status(id):
    """
    Print the status of every job, or of one job when a name is given.

    A given name is first normalized; if that lookup raises FloydException it
    is retried with the raw value.
    """
    if not id:
        print_experiments(ExperimentClient().get_all())
        return

    try:
        job = ExperimentClient().get(normalize_job_name(id))
    except FloydException:
        job = ExperimentClient().get(id)
    print_experiments([job])
示例#16
0
def output(id, url):
    """
    Show the output page of a run.

    With ``url`` set the page address is only logged; otherwise it is opened
    in the default web browser.
    """
    try:
        job = ExperimentClient().get(normalize_job_name(id))
    except FloydException:
        job = ExperimentClient().get(id)

    page_url = "%s/%s/output" % (floyd.floyd_web_host, job.name)
    if not url:
        floyd_logger.info("Opening output path in your browser ...")
        webbrowser.open(page_url)
        return

    floyd_logger.info(page_url)
示例#17
0
def restart(ctx, job_name, data, open_notebook, env, message, gpu, cpu, gpup,
            cpup, command):
    """
    Restart a given job as a new job.

    Args:
        ctx: Not used by this function.
        job_name: Name (normalized first) or raw identifier of the job to
            restart.
        data: Dataset references, resolved through ``process_data_ids``.
        open_notebook: Forwarded to ``show_new_job_info``.
        env: Optional environment; validated against the instance type.
        message: Optional message stored under ``parameters['message']``.
        gpu, cpu, gpup, cpup: Instance-type selectors, checked in the order
            gpup, cpup, gpu, cpu. When none is set the original job's
            instance type is reused.
        command: Sequence of command parts, joined with spaces.

    Exits with status 1 when the environment or datasets fail validation, or
    when the restart request returns nothing.
    """
    parameters = {}

    expt_client = ExperimentClient()

    try:
        job = expt_client.get(normalize_job_name(job_name))
    except FloydException:
        job = expt_client.get(job_name)

    if gpup:
        instance_type = G1P_INSTANCE_TYPE
    elif cpup:
        instance_type = C1P_INSTANCE_TYPE
    elif gpu:
        instance_type = G1_INSTANCE_TYPE
    elif cpu:
        instance_type = C1_INSTANCE_TYPE
    else:
        instance_type = job.instance_type

    # The original job's instance type may be unset; only send an override
    # when there is a value to send.
    if instance_type is not None:
        parameters['instance_type'] = instance_type

    if env is not None:
        if not validate_env(env, instance_type):
            sys.exit(1)
        parameters['env'] = env

    success, data_ids = process_data_ids(data)
    if not success:
        sys.exit(1)
    # Forward the resolved datasets. They used to be validated and then
    # dropped, so the ``data`` argument had no effect on the restarted job.
    if data_ids:
        parameters['data_ids'] = data_ids

    if message:
        parameters['message'] = message

    if command:
        parameters['command'] = ' '.join(command)

    floyd_logger.info('Restarting job %s...', job_name)

    new_job_info = expt_client.restart(job.id, parameters=parameters)
    if not new_job_info:
        floyd_logger.error("Failed to restart job")
        sys.exit(1)

    floyd_logger.info('New job created:')
    table_output = [["JOB NAME"], [new_job_info['name']]]
    floyd_logger.info('\n' + tabulate(table_output, headers="firstrow") + '\n')

    show_new_job_info(expt_client, new_job_info['name'], new_job_info,
                      job.mode, open_notebook)
示例#18
0
def get_log_id(job_id):
    """
    Poll the job once per second until it reports an instance log id.

    Returns that log id. A "Waiting for logs" message is logged once, the
    first time the id is found to be missing.
    """
    announced_wait = False
    while True:
        try:
            job = ExperimentClient().get(normalize_job_name(job_id))
        except FloydException:
            job = ExperimentClient().get(job_id)

        log_id = job.instance_log_id
        if log_id:
            return log_id

        if not announced_wait:
            floyd_logger.info("Waiting for logs ...\n")
            announced_wait = True

        sleep(1)
示例#19
0
def logs(id, url, tail, follow, sleep_duration=1):
    """
    Show the logs of a run, waiting until a log resource exists.

    ``follow`` is treated as a synonym for ``tail``. With ``url`` set only the
    log URL is logged. When tailing, the log is re-fetched every
    ``sleep_duration`` seconds and any new content is logged; that loop never
    returns on its own.
    """
    keep_following = tail or follow

    # Poll once per second until the job exposes a log id.
    announced_wait = False
    while True:
        try:
            job = ExperimentClient().get(normalize_job_name(id))
        except FloydException:
            job = ExperimentClient().get(id)

        log_id = job.instance_log_id
        if log_id:
            break
        if not announced_wait:
            floyd_logger.info("Waiting for logs ...\n")
            announced_wait = True

        sleep(1)

    log_url = "{}/api/v1/resources/{}?content=true".format(
        floyd.floyd_host, log_id)
    if url:
        floyd_logger.info(log_url)
        return

    if not keep_following:
        contents = ResourceClient().get_content(log_id)
        if contents.strip():
            floyd_logger.info(contents)
        else:
            floyd_logger.info("Launching job now. Try after a few seconds.")
        return

    floyd_logger.info("Launching job ...")
    offset = 0
    while True:
        # Re-fetch the whole log and emit only what lies past the last offset.
        contents = ResourceClient().get_content(log_id)
        fresh = contents[offset:]
        if fresh.strip():
            floyd_logger.info(fresh)
        offset = len(contents)
        sleep(sleep_duration)
示例#20
0
def clone(id, path):
    """
    - Download all files from a job

    Eg: alice/projects/mnist/1/

    Note: This will download the files that were originally uploaded at
    the start of the job.

    - Download files in a specific path from a job

    Specify the path to a directory and download all its files and
    subdirectories.

    Eg: --path models/checkpoint1
    """
    try:
        job = ExperimentClient().get(normalize_job_name(id, use_config=False))
    except FloydException:
        job = ExperimentClient().get(id)

    instance_id = get_module_task_instance_id(job.task_instances)
    instance = None
    if instance_id:
        instance = TaskInstanceClient().get(instance_id)
    if not instance:
        sys.exit(
            "Cannot clone this version of the job. Try a different version.")

    # The module is fetched unconditionally, although only the full-code
    # download below reads it.
    code_module = ModuleClient().get(instance.module_id)

    if not path:
        # Whole code archive.
        code_url = "{}/api/v1/resources/{}?content=true&download=true".format(
            floyd.floyd_host, code_module.resource_id)
    else:
        # Only the requested directory of the code.
        code_url = "{}/api/v1/download/artifacts/code/{}?is_dir=true&path={}".format(
            floyd.floyd_host, job.id, path)

    ExperimentClient().download_tar(
        url=code_url,
        untar=True,
        delete_after_untar=True,
    )
示例#21
0
def stop(id):
    """
    Request shutdown of a job that is queued, queue-scheduled, or running.

    Exits with status 1 when the job is in any other state or when the stop
    request fails.
    """
    try:
        job = ExperimentClient().get(normalize_job_name(id))
    except FloydException:
        job = ExperimentClient().get(id)

    stoppable_states = ["queued", "queue_scheduled", "running"]
    if job.state not in stoppable_states:
        floyd_logger.info("Job in {} state cannot be stopped".format(job.state))
        sys.exit(1)

    if ExperimentClient().stop(job.id):
        floyd_logger.info(
            "Experiment shutdown request submitted. Check status to confirm shutdown"
        )
        return

    floyd_logger.error("Failed to stop job")
    sys.exit(1)
示例#22
0
def logs(id, url, tail, sleep_duration=1):
    """
    Show the logs of a run.

    Returns early with a message while the job is queued, and exits with
    status 1 when the job has no log id yet. With ``url`` set only the log URL
    is logged. With ``tail`` set the log is re-fetched every
    ``sleep_duration`` seconds and any new content is logged; that loop never
    returns on its own.
    """
    try:
        job = ExperimentClient().get(normalize_job_name(id))
    except FloydException:
        job = ExperimentClient().get(id)

    if job.state == 'queued':
        floyd_logger.info("Job is currently in a queue")
        return

    log_id = job.instance_log_id
    if not log_id:
        floyd_logger.info("Job not started yet, no log to show.")
        sys.exit(1)

    log_url = "{}/api/v1/resources/{}?content=true".format(
        floyd.floyd_host, log_id)
    if url:
        floyd_logger.info(log_url)
        return

    if not tail:
        contents = ResourceClient().get_content(log_id)
        if contents.strip():
            floyd_logger.info(contents)
        else:
            floyd_logger.info("Launching job now. Try after a few seconds.")
        return

    floyd_logger.info("Launching job ...")
    offset = 0
    while True:
        # Re-fetch the whole log and emit only what lies past the last offset.
        contents = ResourceClient().get_content(log_id)
        fresh = contents[offset:]
        if fresh.strip():
            floyd_logger.info(fresh)
        offset = len(contents)
        sleep(sleep_duration)
示例#23
0
def output(id, url):
    """
    Show the output URL of a run, or open it in the default browser.

    Args:
        id: Identifier passed to ``ExperimentClient().get``.
        url: When truthy, only log the output URL instead of opening it.

    An error is logged when the run has no module task instance or that
    instance has no "output" entry.
    """
    experiment = ExperimentClient().get(id)
    # Only look up the task instance when the run actually has one, so a
    # missing instance is reported instead of querying with a missing id.
    task_instance_id = get_module_task_instance_id(experiment.task_instances)
    task_instance = TaskInstanceClient().get(
        task_instance_id) if task_instance_id else None

    if not task_instance or "output" not in task_instance.output_ids:
        floyd_logger.error("Output directory not available")
        return

    output_dir_url = "{}/api/v1/resources/{}?content=true".format(
        floyd.floyd_host, task_instance.output_ids["output"])
    if url:
        floyd_logger.info(output_dir_url)
    else:
        floyd_logger.info("Opening output directory in your browser ...")
        webbrowser.open(output_dir_url)
示例#24
0
"""View experiments by job."""

import streamlit as st
import numpy as np
import pandas as pd
import subprocess

from floyd.client.experiment import ExperimentClient
from floyd.client.data import DataClient
from pathlib import Path
from torch import tensor

from metalearn import plotting

# Shared API clients, created once when the module is imported.
experiment_client = ExperimentClient()
data_client = DataClient()

# "floyd_cache" directory under the current user's home directory.
cache_dir = Path.home() / "floyd_cache"

# Passed as ``limit`` when listing experiments.
EXPERIMENT_LIMIT = 10000
# Experiment state value that marks an experiment as kept by get_experiments.
SUCCESS_STATE = "success"
# NOTE(review): presumably the metrics CSV produced by each job; not used in
# the visible part of this file - confirm.
METRICS_FILE = "rnn_metalearn_controller_experiment.csv"


@st.cache
def get_experiments():
    """Return a dict mapping name to experiment for every successful experiment."""
    successful = {}
    for exp in experiment_client.get_all(limit=EXPERIMENT_LIMIT):
        if exp.state == SUCCESS_STATE:
            successful[exp.name] = exp
    return successful
示例#25
0
def run(ctx, gpu, env, data, mode, command):
    """
    Run a command on Floyd. Floyd will upload contents of the
    current directory and run your command remotely.
    This command will generate a run id for reference.

    The steps are: create a module, create an experiment request that refers
    to it, store the new predecessor ids and the incremented version in the
    experiment config, log a table with the run id, and - for any mode other
    than 'default' - wait for the experiment to become retrievable and log
    the notebook or service URL.

    Args:
        ctx: Not used by this function.
        gpu: Truthy selects the GPU instance type and docker image.
        env: Passed to ``get_docker_image``.
        data: Passed as ``data_id`` of the experiment request.
        mode: Run mode; 'jupyter' and 'serve' get special handling below.
        command: Sequence of command parts, joined with spaces.
    """
    command_str = ' '.join(command)
    experiment_config = ExperimentConfigManager.get_config()
    access_token = AuthConfigManager.get_access_token()
    version = experiment_config.version
    # Experiment name has the form "<username>/<project name>:<version>".
    experiment_name = "{}/{}:{}".format(access_token.username,
                                        experiment_config.name,
                                        version)

    # Create module
    module = Module(name=experiment_name,
                    description=version,
                    command=command_str,
                    mode=get_mode_parameter(mode),
                    family_id=experiment_config.family_id,
                    default_container=get_docker_image(env, gpu),
                    version=version)
    module_id = ModuleClient().create(module)
    floyd_logger.debug("Created module with id : {}".format(module_id))

    # Create experiment request
    instance_type = GPU_INSTANCE_TYPE if gpu else CPU_INSTANCE_TYPE
    experiment_request = ExperimentRequest(name=experiment_name,
                                           description=version,
                                           module_id=module_id,
                                           data_id=data,
                                           predecessor=experiment_config.experiment_predecessor,
                                           family_id=experiment_config.family_id,
                                           version=version,
                                           instance_type=instance_type)
    experiment_id = ExperimentClient().create(experiment_request)
    floyd_logger.debug("Created experiment : {}".format(experiment_id))

    # Update expt config including predecessor
    experiment_config.increment_version()
    experiment_config.set_module_predecessor(module_id)
    experiment_config.set_experiment_predecessor(experiment_id)
    ExperimentConfigManager.set_config(experiment_config)

    table_output = [["RUN ID", "NAME", "VERSION"],
                    [experiment_id, experiment_name, version]]
    floyd_logger.info(tabulate(table_output, headers="firstrow"))
    floyd_logger.info("")

    if mode != 'default':
        # NOTE(review): this loop retries once per second on any Exception
        # and has no upper bound - confirm that waiting forever is intended.
        while True:
            # Wait for the experiment to become available
            try:
                experiment = ExperimentClient().get(experiment_id)
                break
            except Exception:
                floyd_logger.debug("Experiment not available yet: {}".format(experiment_id))
                sleep(1)
                continue

        # Print the path to jupyter notebook
        if mode == 'jupyter':
            jupyter_url = get_task_url(get_module_task_instance_id(experiment.task_instances))
            floyd_logger.info("Waiting for Jupyter notebook to become available ...")
            if wait_for_url(jupyter_url):
                floyd_logger.info("\nPath to jupyter notebook: {}".format(jupyter_url))
            else:
                floyd_logger.info("Problem starting the notebook. View logs for more information")

        # Print the path to serving endpoint
        if mode == 'serve':
            floyd_logger.info("Path to service endpoint: {}".format(
                get_task_url(get_module_task_instance_id(experiment.task_instances))))

    # Always finish by telling the user how to view the logs of this run.
    floyd_logger.info("""
To view logs enter:
    floyd logs {}
        """.format(experiment_id))
示例#26
0
def run(ctx, gpu, env, message, data, mode, open_notebook, tensorboard, gpup,
        cpup, gpu2, cpu2, command):
    """
    Run a command on Floyd. Floyd will upload contents of the
    current directory and run your command remotely.
    This command will generate a run id for reference.

    The steps are: validate the environment, project, datasets and command,
    create a module, create an experiment request that refers to it, and
    hand the result to ``show_new_job_info``.

    Args:
        ctx: Not used by this function.
        gpu, gpup, cpup, gpu2, cpu2: Instance-type selectors, checked in the
            order gpu2, cpu2, gpup, cpup, gpu; C1 is used when none is set.
        env: Sequence of environments; exactly one is used (see below).
        message: Optional job description.
        data: Dataset references, resolved through ``process_data_ids``.
        mode: Run mode; a command cannot be combined with 'jupyter' or
            'serve'.
        open_notebook: Forwarded to ``get_command_line`` and
            ``show_new_job_info``.
        tensorboard: Forwarded as ``enable_tensorboard`` of the module.
        command: Sequence of command parts, joined with spaces.

    Exit codes used on failure: 1 (several environments, or unknown
    project), 2 (dataset resolution failed), 3 (invalid environment, or a
    command combined with jupyter/serve mode), 4 (module creation rejected).
    """
    # Error early if more than one --env is passed.  Then get the first/only
    # --env out of the list so all other operations work normally (they don't
    # expect an iterable). For details on this approach, see the comment above
    # the --env click option
    if len(env) > 1:
        floyd_logger.error(
            "You passed more than one environment: {}. Please specify a single environment."
            .format(env))
        sys.exit(1)
    # NOTE(review): this indexing assumes env is never empty - presumably the
    # click option supplies a default; confirm.
    env = env[0]
    experiment_config = ExperimentConfigManager.get_config()
    access_token = AuthConfigManager.get_access_token()
    # Fall back to the logged-in user when the config carries no namespace.
    namespace = experiment_config.namespace or access_token.username

    if not ProjectClient().exists(experiment_config.name, namespace=namespace):
        floyd_logger.error(
            'Invalid project id, please run '
            '"floyd init PROJECT_NAME" before scheduling a job.')
        sys.exit(1)

    experiment_name = "{}/{}".format(namespace, experiment_config.name)

    success, data_ids = process_data_ids(data)
    if not success:
        sys.exit(2)

    # Create module
    # With at most one dataset the input gets the default name 'input';
    # with several, no default name is supplied to get_data_name.
    default_name = 'input' if len(data_ids) <= 1 else None
    module_inputs = [{
        'name': get_data_name(data_str, default_name),
        'type': 'dir'
    } for data_str in data_ids]

    if gpu2:
        instance_type = G2_INSTANCE_TYPE
    elif cpu2:
        instance_type = C2_INSTANCE_TYPE
    elif gpup:
        instance_type = G1P_INSTANCE_TYPE
    elif cpup:
        instance_type = C1P_INSTANCE_TYPE
    elif gpu:
        instance_type = G1_INSTANCE_TYPE
    else:
        instance_type = C1_INSTANCE_TYPE

    if not validate_env(env, instance_type):
        sys.exit(3)

    command_str = ' '.join(command)
    if command_str and mode in ('jupyter', 'serve'):
        floyd_logger.error(
            'Command argument "%s" cannot be used with mode: %s.\nSee http://docs.floydhub.com/guides/run_a_job/#mode for more information about run modes.',
            command_str, mode)  # noqa
        sys.exit(3)

    module = Module(name=experiment_name,
                    description=message or '',
                    command=command_str,
                    mode=get_mode_parameter(mode),
                    enable_tensorboard=tensorboard,
                    family_id=experiment_config.family_id,
                    inputs=module_inputs,
                    env=env,
                    arch=INSTANCE_ARCH_MAP[instance_type])

    try:
        module_id = ModuleClient().create(module)
    except BadRequestException as e:
        # NOTE(review): relies on BadRequestException exposing a ``message``
        # attribute - confirm against its definition.
        if 'Project not found, ID' in e.message:
            floyd_logger.error(
                'ERROR: Please run "floyd init PROJECT_NAME" before scheduling a job.'
            )
        else:
            floyd_logger.error('ERROR: %s', e.message)
        sys.exit(4)
    floyd_logger.debug("Created module with id : %s", module_id)

    # Create experiment request
    # Get the actual command entered in the command line
    full_command = get_command_line(instance_type, env, message, data, mode,
                                    open_notebook, tensorboard, command_str)
    experiment_request = ExperimentRequest(
        name=experiment_name,
        description=message,
        full_command=full_command,
        module_id=module_id,
        env=env,
        data_ids=data_ids,
        family_id=experiment_config.family_id,
        instance_type=instance_type)
    expt_client = ExperimentClient()
    expt_info = expt_client.create(experiment_request)
    floyd_logger.debug("Created job : %s", expt_info['id'])

    job_name = expt_info['name']
    show_new_job_info(expt_client, job_name, expt_info, mode, open_notebook)
示例#27
0
文件: run.py 项目: rmdort/floyd-cli
def run(ctx, gpu, env, message, data, mode, open, tensorboard, command):
    """
    Run a command on Floyd. Floyd will upload contents of the
    current directory and run your command remotely.
    This command will generate a run id for reference.
    """
    # NOTE(review): the docstring above is presumably surfaced as CLI help
    # text -- confirm before rewording it. Maintainer notes live in comments.
    #
    # Flow: validate the project, the attached data and the environment,
    # create a module, create an experiment for it, then either wait for the
    # service url ('jupyter' / 'serve' modes) or print how to view the logs.
    # Validation failures are logged and the function returns early; a
    # rejected module creation exits the process with status 1.
    experiment_config = ExperimentConfigManager.get_config()
    if not ProjectClient().exists(experiment_config.family_id):
        floyd_logger.error(
            'Invalid project id, please run '
            '"floyd init PROJECT_NAME" before scheduling a job.')
        return

    access_token = AuthConfigManager.get_access_token()
    experiment_name = "{}/{}".format(access_token.username,
                                     experiment_config.name)

    # Create module
    if len(data) > 5:
        floyd_logger.error("Cannot attach more than 5 datasets to an job")
        return

    # Get the data entity from the server to:
    # 1. Confirm that the data id or uri exists and has the right permissions
    # 2. If uri is used, get the id of the dataset
    data_ids = []
    for data_name_or_id in data:
        path = None
        if ':' in data_name_or_id:
            parts = data_name_or_id.split(':')
            if len(parts) != 2:
                # More than one ':' used to blow up with an unpacking
                # ValueError; report it like the other validation failures.
                floyd_logger.error(
                    "Invalid data argument: {}. Use at most one ':' to "
                    "separate the name or id from the path".format(
                        data_name_or_id))
                return
            data_name_or_id, path = parts
        data_obj = DataClient().get(data_name_or_id)
        if not data_obj:
            floyd_logger.error(
                "Data not found for name or id: {}".format(data_name_or_id))
            return
        data_ids.append(
            "{}:{}".format(data_obj.id, path) if path else data_obj.id)

    # A default input name is only supplied when at most one dataset is
    # attached.
    default_name = 'input' if len(data_ids) <= 1 else None
    module_inputs = [{
        'name': get_data_name(data_str, default_name),
        'type': 'dir'
    } for data_str in data_ids]

    if gpu:
        arch = 'gpu'
        instance_type = GPU_INSTANCE_TYPE
    else:
        arch = 'cpu'
        instance_type = CPU_INSTANCE_TYPE

    # The requested environment must be listed for the chosen architecture.
    env_map = EnvClient().get_all()
    envs = env_map.get(arch)
    if envs:
        if env not in envs:
            floyd_logger.error(
                "{} is not in the list of supported environments: {}".format(
                    env, ', '.join(envs.keys())))
            return
    else:
        floyd_logger.error("{} is not a supported architecture".format(arch))
        return

    command_str = ' '.join(command)
    module = Module(name=experiment_name,
                    description=message or '',
                    command=command_str,
                    mode=get_mode_parameter(mode),
                    enable_tensorboard=tensorboard,
                    family_id=experiment_config.family_id,
                    inputs=module_inputs,
                    env=env,
                    arch=arch)

    from floyd.exceptions import BadRequestException
    try:
        module_id = ModuleClient().create(module)
    except BadRequestException as e:
        if 'Project not found, ID' in e.message:
            floyd_logger.error(
                'ERROR: Please run "floyd init PROJECT_NAME" before scheduling a job.'
            )
        else:
            floyd_logger.error('ERROR: %s', e.message)
        sys.exit(1)
    floyd_logger.debug("Created module with id : {}".format(module_id))

    # Create experiment request
    # Get the actual command entered in the command line
    full_command = get_command_line(gpu, env, message, data, mode, open,
                                    tensorboard, command)
    experiment_request = ExperimentRequest(
        name=experiment_name,
        description=message,
        full_command=full_command,
        module_id=module_id,
        data_ids=data_ids,
        family_id=experiment_config.family_id,
        instance_type=instance_type)
    expt_cli = ExperimentClient()
    expt_info = expt_cli.create(experiment_request)
    floyd_logger.debug("Created job : {}".format(expt_info['id']))

    table_output = [["JOB NAME"], [expt_info['name']]]
    floyd_logger.info(tabulate(table_output, headers="firstrow"))
    floyd_logger.info("")

    if mode in ['jupyter', 'serve']:
        while True:
            # Wait for the experiment / task instances to become available
            try:
                experiment = expt_cli.get(expt_info['id'])
                if experiment.task_instances:
                    break
            except Exception as e:
                # Deliberately best effort: any failure here is treated as
                # "not ready yet" and the fetch is retried after the sleep.
                floyd_logger.debug("Error while fetching job %s: %s",
                                   expt_info['id'], e)

            floyd_logger.debug("Job not available yet: {}".format(
                expt_info['id']))
            sleep(3)

        # Print the path to jupyter notebook
        if mode == 'jupyter':
            jupyter_url = experiment.service_url
            print(
                "Setting up your instance and waiting for Jupyter notebook to become available ...",
                end='')
            notebook_ready = wait_for_url(jupyter_url,
                                          sleep_duration_seconds=2,
                                          iterations=900)
            # The url is reported whether or not the notebook answered.
            floyd_logger.info(
                "\nPath to jupyter notebook: {}".format(jupyter_url))
            if notebook_ready:
                if open:
                    webbrowser.open(jupyter_url)
            else:
                floyd_logger.info(
                    "Notebook is still loading. View logs to track progress")
                floyd_logger.info("   floyd logs {}".format(expt_info['name']))

        # Print the path to serving endpoint
        if mode == 'serve':
            floyd_logger.info("Path to service endpoint: {}".format(
                experiment.service_url))

        # Timeouts shorter than four hours get an explanatory notice.
        if experiment.timeout_seconds < 4 * 60 * 60:
            floyd_logger.info(
                "\nYour job timeout is currently set to {} seconds".format(
                    experiment.timeout_seconds))
            floyd_logger.info(
                "This is because you are in a trial account. Paid users will have longer timeouts. "
                "See https://www.floydhub.com/pricing for details")

    else:
        floyd_logger.info("To view logs enter:")
        floyd_logger.info("   floyd logs {}".format(expt_info['name']))
示例#28
0
文件: run.py 项目: longhuei/floyd-cli
def run(ctx, cpu, gpu, env, message, data, mode, open_notebook, follow,
        tensorboard, gpup, cpup, gpu2, cpu2, max_runtime, task, command):
    """
    Run a command on Floyd. Floyd will upload contents of the
    current directory and run your command remotely.
    This command will generate a run id for reference.
    """
    # NOTE(review): the docstring above is presumably surfaced as CLI help
    # text -- confirm before rewording it. Maintainer notes live in comments.
    #
    # Exit statuses used below:
    #   1 - more than one --env, or the project does not exist
    #   2 - the data arguments could not be processed
    #   3 - the environment failed validation, or a command was combined
    #       with the 'jupyter' / 'serve' mode
    #   4 - the module was rejected with a BadRequestException

    # cli_default is used for any option that has default value
    cli_default = {'description': '', 'command': ''}
    # Error early if more than one --env is passed.  Then get the first/only
    # --env out of the list so all other operations work normally (they don't
    # expect an iterable). For details on this approach, see the comment above
    # the --env click option
    if not env:
        cli_default['env'] = DEFAULT_ENV
        env = None
    elif len(env) > 1:
        floyd_logger.error(
            "You passed more than one environment: {}. Please specify a single environment."
            .format(env))
        sys.exit(1)
    else:
        env = env[0]

    if not mode:
        cli_default['mode'] = 'command'

    experiment_config = ExperimentConfigManager.get_config()
    access_token = AuthConfigManager.get_access_token()
    # The configured namespace wins; the logged-in username is the fallback.
    namespace = experiment_config.namespace or access_token.username

    if not ProjectClient().exists(experiment_config.name, namespace=namespace):
        floyd_logger.error(
            'Invalid project id, please run '
            '"floyd init PROJECT_NAME" before scheduling a job.')
        sys.exit(1)

    experiment_name = "{}/{}".format(namespace, experiment_config.name)

    success, data_ids = process_data_ids(data)
    if not success:
        sys.exit(2)

    # Create module
    # A default input name is only supplied when at most one dataset is
    # attached.
    default_name = 'input' if len(data_ids) <= 1 else None
    module_inputs = [{
        'name': get_data_name(data_str, default_name),
        'type': 'dir'
    } for data_str in data_ids]

    # When several instance flags are set, the first match in this order
    # wins. No flag at all leaves instance_type as None and records the
    # fallback in cli_default instead.
    instance_type = None
    if gpu2:
        instance_type = G2_INSTANCE_TYPE
    elif cpu2:
        instance_type = C2_INSTANCE_TYPE
    elif gpup:
        instance_type = G1P_INSTANCE_TYPE
    elif cpup:
        instance_type = C1P_INSTANCE_TYPE
    elif gpu:
        instance_type = G1_INSTANCE_TYPE
    elif cpu:
        instance_type = C1_INSTANCE_TYPE

    if not instance_type:
        cli_default['instance_type'] = C1_INSTANCE_TYPE

    yaml_config = read_yaml_config()
    arch = INSTANCE_ARCH_MAP[resolve_final_instance_type(
        instance_type, yaml_config, task, cli_default)]
    # cli_default['env'] is only read when no --env was given, which is
    # exactly the case in which it was populated above.
    if not validate_env(env or cli_default['env'], arch):
        sys.exit(3)

    command_str = ' '.join(command)
    if command_str and mode in ('jupyter', 'serve'):
        floyd_logger.error(
            'Command argument "%s" cannot be used with mode: %s.\nSee http://docs.floydhub.com/guides/run_a_job/#mode for more information about run modes.',
            command_str, mode)  # noqa
        sys.exit(3)
    if command_str == '':
        # set to none so it won't override floyd config
        command_str = None

    module = Module(name=experiment_name,
                    description=message or '',
                    command=command_str,
                    mode=mode,
                    enable_tensorboard=tensorboard,
                    family_id=experiment_config.family_id,
                    inputs=module_inputs,
                    env=env,
                    instance_type=instance_type,
                    yaml_config=yaml_config,
                    task=task)

    try:
        module_id = ModuleClient().create(module, cli_default)
    except BadRequestException as e:
        if 'Project not found, ID' in e.message:
            floyd_logger.error(
                'ERROR: Please run "floyd init PROJECT_NAME" before scheduling a job.'
            )
        else:
            floyd_logger.error('ERROR: %s', e.message)
        sys.exit(4)
    floyd_logger.debug("Created module with id : %s", module_id)

    # Create experiment request
    # Get the actual command entered in the command line
    if max_runtime:
        max_runtime = int(max_runtime)
    full_command = get_command_line(instance_type, env, message, data, mode,
                                    open_notebook, tensorboard, command_str)
    experiment_request = ExperimentRequest(
        name=experiment_name,
        description=message,
        full_command=full_command,
        module_id=module_id,
        max_runtime=max_runtime,
        env=env,
        data_ids=data_ids,
        family_id=experiment_config.family_id,
        instance_type=instance_type,
        yaml_config=yaml_config,
        task=task)
    expt_client = ExperimentClient()
    expt_info = expt_client.create(experiment_request, cli_default)
    floyd_logger.debug("Created job : %s", expt_info['id'])

    job_name = expt_info['name']
    if not follow:
        show_new_job_info(expt_client, job_name, expt_info, mode,
                          open_notebook)
    else:
        # If the user specified --follow, we assume they're only interested in
        # log output and not in anything that would be displayed by
        # show_new_job_info.
        floyd_logger.info("Opening logs ...")
        instance_log_id = get_log_id(job_name)
        follow_logs(instance_log_id)
示例#29
0
def run(ctx, gpu, env, message, data, mode, open, tensorboard, gpup, cpup,
        command):
    """
    Run a command on Floyd. Floyd will upload contents of the
    current directory and run your command remotely.
    This command will generate a run id for reference.
    """
    # NOTE(review): the docstring above is presumably surfaced as CLI help
    # text -- confirm before rewording it. Maintainer notes live in comments.
    #
    # Exit statuses: 1 unknown project, 2 data arguments rejected,
    # 3 environment rejected or command combined with 'jupyter' / 'serve',
    # 4 module creation refused with a BadRequestException.
    config = ExperimentConfigManager.get_config()
    project_exists = ProjectClient().exists(config.family_id)
    if not project_exists:
        floyd_logger.error(
            'Invalid project id, please run '
            '"floyd init PROJECT_NAME" before scheduling a job.')
        sys.exit(1)

    token = AuthConfigManager.get_access_token()
    experiment_name = "{}/{}".format(token.username, config.name)

    ok, data_ids = process_data_ids(data)
    if not ok:
        sys.exit(2)

    # Describe the module inputs; a default name is only offered when there
    # is at most one dataset.
    fallback_name = None if len(data_ids) > 1 else 'input'
    module_inputs = []
    for data_ref in data_ids:
        module_inputs.append({
            'name': get_data_name(data_ref, fallback_name),
            'type': 'dir'
        })

    # First matching flag wins; with no flag the C1 instance is used.
    instance_type = (G1P_INSTANCE_TYPE if gpup else
                     C1P_INSTANCE_TYPE if cpup else
                     G1_INSTANCE_TYPE if gpu else
                     C1_INSTANCE_TYPE)

    env_is_valid = validate_env(env, instance_type)
    if not env_is_valid:
        sys.exit(3)

    command_str = ' '.join(command)
    if command_str:
        if mode in ('jupyter', 'serve'):
            floyd_logger.error(
                'Command argument "%s" cannot be used with mode: %s.\nSee http://docs.floydhub.com/guides/run_a_job/#mode for more information about run modes.',
                command_str, mode)
            sys.exit(3)

    mode_parameter = get_mode_parameter(mode)
    arch = INSTANCE_ARCH_MAP[instance_type]
    module = Module(
        name=experiment_name,
        description=message or '',
        command=command_str,
        mode=mode_parameter,
        enable_tensorboard=tensorboard,
        family_id=config.family_id,
        inputs=module_inputs,
        env=env,
        arch=arch,
    )

    try:
        module_id = ModuleClient().create(module)
    except BadRequestException as err:
        if 'Project not found, ID' not in err.message:
            floyd_logger.error('ERROR: %s', err.message)
        else:
            floyd_logger.error(
                'ERROR: Please run "floyd init PROJECT_NAME" before scheduling a job.'
            )
        sys.exit(4)
    floyd_logger.debug("Created module with id : %s", module_id)

    # Build the experiment request around the command line as it was typed.
    request_fields = {
        'name': experiment_name,
        'description': message,
        'full_command': get_command_line(instance_type, env, message, data,
                                         mode, open, tensorboard,
                                         command_str),
        'module_id': module_id,
        'data_ids': data_ids,
        'family_id': config.family_id,
        'instance_type': instance_type,
    }
    expt_client = ExperimentClient()
    expt_info = expt_client.create(ExperimentRequest(**request_fields))
    floyd_logger.debug("Created job : %s", expt_info['id'])

    # Show the job name in a one-column table framed by blank lines.
    job_name = normalize_job_name(expt_info['name'])
    floyd_logger.info("")
    floyd_logger.info(
        tabulate([["JOB NAME"], [job_name]], headers="firstrow"))
    floyd_logger.info("")
    show_new_job_info(expt_client, job_name, expt_info, mode)
示例#30
0
文件: run.py 项目: longhuei/floyd-cli
def restart(ctx, job_name, data, open_notebook, env, message, gpu, cpu, gpup,
            cpup, command):
    """
    Restart a given job as a new job.
    """
    # NOTE(review): the docstring above is presumably surfaced as CLI help
    # text -- confirm before rewording it. Maintainer notes live in comments.
    #
    # Only the options the caller actually supplied are collected into
    # `parameters` and sent along with the restart request. Every failure
    # path exits the process with status 1.

    # Error early if more than one --env is passed. Then get the first/only
    # --env out of the list so all other operations work normally (they don't
    # expect an iterable). For details on this approach, see the comment above
    # the --env click option
    if env and len(env) > 1:
        floyd_logger.error(
            "You passed more than one environment: {}. Please specify a single environment."
            .format(env))
        sys.exit(1)
    # An empty (or missing) --env means "no override" rather than a crash on
    # env[0].
    env = env[0] if env else None

    parameters = {}

    expt_client = ExperimentClient()

    # Try the normalized job name first and fall back to the raw value.
    try:
        job = expt_client.get(normalize_job_name(job_name))
    except FloydException:
        job = expt_client.get(job_name)

    # First matching flag wins; without any flag the original job's instance
    # type is reused.
    if gpup:
        instance_type = G1P_INSTANCE_TYPE
    elif cpup:
        instance_type = C1P_INSTANCE_TYPE
    elif gpu:
        instance_type = G1_INSTANCE_TYPE
    elif cpu:
        instance_type = C1_INSTANCE_TYPE
    else:
        instance_type = job.instance_type

    if instance_type is not None:
        parameters['instance_type'] = instance_type

    if env is not None:
        # NOTE(review): if instance_type is still None here this lookup is
        # keyed on None -- confirm INSTANCE_ARCH_MAP / job.instance_type make
        # that impossible.
        arch = INSTANCE_ARCH_MAP[instance_type]
        if not validate_env(env, arch):
            sys.exit(1)
        parameters['env'] = env

    success, data_ids = process_data_ids(data)
    if not success:
        sys.exit(1)
    if data_ids:
        parameters['data_ids'] = data_ids

    if message:
        parameters['description'] = message

    if command:
        parameters['command'] = ' '.join(command)

    floyd_logger.info('Restarting job %s...', job_name)

    new_job_info = expt_client.restart(job.id, parameters=parameters)
    if not new_job_info:
        floyd_logger.error("Failed to restart job")
        sys.exit(1)

    show_new_job_info(expt_client, new_job_info['name'], new_job_info,
                      job.mode, open_notebook)