Example #1
def main(config_dir, output_dir):
    config, msg = validate_trials_config(config_dir)
    if config is None:
        write_status(output_dir, False, msg)
        return 1

    if not config['models']:
        write_status(output_dir, True, 'Nothing run')
        return 0

    for model in sorted(config['models']):
        cmd_id = 0
        for success, msg, processed_command in parse_commands(model, config):
            if not success:
                write_status(output_dir, False, msg)
                return 1
            print(f'Running command for {model}: {processed_command}')
            success, msg = eval_command(model, processed_command, config,
                                        config_dir, output_dir, cmd_id)
            if not success:
                write_status(output_dir, False, msg)
                return 1
            cmd_id += 1

    write_status(output_dir, True, 'success')
    return 0
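
All of these entry points report their outcome through a write_status helper that is defined elsewhere. For reference, a minimal sketch, assuming it simply records a success flag and a message as JSON in the output directory (the status.json file name is an assumption):

import json
import os

def write_status(output_dir, success, message):
    # hypothetical sketch: persist the outcome as JSON so a driver
    # script can pick it up; the 'status.json' name is assumed
    with open(os.path.join(output_dir, 'status.json'), 'w') as f:
        json.dump({'success': success, 'message': message}, f)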
Example #2
def main(config_dir,
         experiment_mode,
         model_name,
         input_idx,
         params_file,
         out_file,
         trial_run=False,
         trial_run_outfile=None):
    if 'DTR_MODEL_NAME' in os.environ:
        model_name = os.environ['DTR_MODEL_NAME']
    config, msg = validate_trials_config(config_dir)
    if config is None:
        print(msg)
        return 1

    use_dtr = (experiment_mode == 'dtr')

    i = int(input_idx)
    # trial_run arrives as the string 'True' when passed on the command line
    is_trial = (trial_run == 'True')

    if config['set_seed']:
        torch.manual_seed(config['seed'] + i)
        random.seed(config['seed'] + i)

    cwd = os.getcwd()

    # handle specific params, esp. for DTR
    specific_params = read_json(cwd, params_file)
    if 'DTR_MEMORY_BUDGET' in os.environ:
        specific_params['memory_budget'] = float(
            os.environ['DTR_MEMORY_BUDGET'])

    assert 'batch_size' in specific_params
    if use_dtr:
        assert 'memory_budget' in specific_params
        if specific_params['memory_budget'] > 0:
            print(f'Setting budget to {int(specific_params["memory_budget"])}')
            torch.set_memory_budget(int(specific_params['memory_budget']))
    if is_trial:
        timing_loop(model_name, i, config, use_dtr, specific_params, None,
                    True, trial_run_outfile)
        return

    with open(out_file, 'a', newline='') as csvfile:
        writer = create_csv_writer(csvfile, specific_params)
        timing_loop(model_name,
                    i,
                    config,
                    use_dtr,
                    specific_params,
                    writer,
                    memory_budget=specific_params.get('memory_budget', -1))
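
create_csv_writer is another helper defined outside this snippet. A plausible sketch, assuming it builds a csv.DictWriter whose columns are the timing fields plus the swept parameters (the 'time' and 'memory' column names are assumptions):

import csv

def create_csv_writer(csvfile, specific_params):
    # hypothetical sketch: one column per timing field plus one per
    # specific parameter, with the header row written up front
    fieldnames = ['time', 'memory'] + sorted(specific_params.keys())
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    return writer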
Example #3
def main(data_dir, config_dir, output_dir):
    try:
        config, msg = validate_trials_config(config_dir)
        if config is None:
            write_status(output_dir, False, msg)
            return 1

        summary = {}

        baseline_dict = {}

        for model in sorted(config['models']):
            summary[model] = []
            baseline_dict[model] = {}
            # any parse errors would already have stopped the run script,
            # so the status values from parse_commands are ignored here
            cmd_id = 0
            for _, _, exp_config in parse_commands(model, config):
                for specific_params in unfold_settings(exp_config):
                    batch_size = specific_params['batch_size']
                    if specific_params['type'] == 'baseline':
                        baseline_dict[model][batch_size] = {
                            'type': 'baseline',
                            'specific_params': specific_params,
                            'cmd_id': cmd_id
                        }

                    # match this setting against the recorded baseline
                    # for the same batch size, if one exists
                    baseline_params = None
                    if (batch_size in baseline_dict[model]
                            and specific_params['type'] != 'baseline'):
                        baseline_params = baseline_dict[model][batch_size]

                    stats, msg = parse_data_file(
                        exp_config['type'],
                        model,
                        config,
                        specific_params,
                        data_dir,
                        cmd_id,
                        baseline_params=baseline_params)
                    if stats is None:
                        write_status(output_dir, False, msg)
                        return 1
                    stats['command_id'] = cmd_id
                    summary[model].append(stats)
                cmd_id += 1
        write_json(output_dir, 'data.json', summary)
        write_status(output_dir, True, 'success')
    except Exception as e:
        write_status(output_dir, False, render_exception(e))
        return 1
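
unfold_settings is what expands one experiment config into the concrete specific_params dicts iterated above. A minimal sketch, assuming every list-valued field of exp_config is a sweep axis and scalar fields (like 'type') are carried through unchanged:

import itertools

def unfold_settings(exp_config):
    # hypothetical sketch: treat every list-valued field as a sweep axis
    # and yield one flat dict per point of the cartesian product
    keys = sorted(exp_config.keys())
    axes = [exp_config[k] if isinstance(exp_config[k], list) else [exp_config[k]]
            for k in keys]
    for combo in itertools.product(*axes):
        yield dict(zip(keys, combo))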
Example #4
def main(data_dir, config_dir, output_dir):
    try:
        config, msg = validate_trials_config(config_dir)
        if config is None:
            write_status(output_dir, False, msg)
            return 1

        all_data = sort_data(data_dir)
        most_recent = all_data[-1]
        success, msg = render_graph(config, most_recent, output_dir)
        write_status(output_dir, success, msg)
    except Exception as e:
        write_status(output_dir, False, 'Exception encountered: ' + render_exception(e))
        return 1
    finally:
        plt.close()
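
sort_data appears here and in the next example; all these callers rely on is that all_data[-1] is the most recent run. A sketch under the assumption that runs are stored as JSON files with sortable (e.g. timestamped) names in data_dir:

import glob
import json
import os

def sort_data(data_dir):
    # hypothetical sketch: load every JSON data file in chronological
    # order, relying on sortable (e.g. timestamped) file names
    paths = sorted(glob.glob(os.path.join(data_dir, '*.json')))
    data = []
    for path in paths:
        with open(path) as f:
            data.append(json.load(f))
    return data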
Example #5
def main(data_dir, config_dir, output_dir):
    try:
        config, msg = validate_trials_config(config_dir)
        if config is None:
            write_status(output_dir, False, msg)
            return 1

        all_data = sort_data(data_dir)
        most_recent = all_data[-1]

        summary = summarize(config, most_recent)
        write_summary(output_dir, 'Pareto Curve Trial', summary)
        write_status(output_dir, True, 'success')

    except Exception as e:
        write_status(output_dir, False, 'Exception encountered: ' + render_exception(e))
        return 1
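
write_summary is also unshown; a sketch assuming it appends a titled block to a plain-text summary in the output directory (the summary.txt name is an assumption):

import os

def write_summary(output_dir, title, summary):
    # hypothetical sketch: store the rendered summary under a title so
    # a log reader can pick it up; the file name is assumed
    with open(os.path.join(output_dir, 'summary.txt'), 'a') as f:
        f.write(f'{title}\n{summary}\n')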
Example #6
def main(config_dir,
         experiment_mode,
         model_name,
         input_idx,
         params_file,
         out_file,
         trial_run=False,
         trial_run_outfile=None):
    config, msg = validate_trials_config(config_dir)
    if config is None:
        print(msg)
        return 1

    use_dtr = (experiment_mode == 'dtr')
    i = int(input_idx)
    is_trial = (trial_run == 'True')

    cwd = os.getcwd()
    specific_params = read_json(cwd, params_file)

    dry_run = config["dry_run"]
    n_reps = config["n_reps"]

    # TODO: this is very messy and we should make the setup nicer
    extra_params = specific_params.get('extra_params', {})
    retry_on_error = extra_params.get('retry_on_error', False)
    attempt_timeout = extra_params.get('attempt_timeout', 15)
    max_retries = extra_params.get('max_retries', 10)

    num_retries = 0
    if not retry_on_error:
        results_queue = LocalQueue()
        heartbeat_queue = LocalQueue()
        run_measurements(config, specific_params, i, model_name, dry_run,
                         n_reps, use_dtr, results_queue, heartbeat_queue)
    else:
        results_queue = mp.Queue()
        heartbeat_queue = mp.Queue()
        remaining_reps = n_reps
        for attempt in range(max_retries):
            proc = mp.Process(target=run_measurements,
                              args=(config, specific_params, i, model_name,
                                    dry_run, remaining_reps, use_dtr,
                                    results_queue, heartbeat_queue))
            proc.start()
            num_heartbeats = dry_run + remaining_reps
            last_timeout = attempt_timeout

            # TODO: clean this up
            encountered_error = False
            for b in range(num_heartbeats):
                try:
                    (success,
                     last_time) = heartbeat_queue.get(block=True,
                                                      timeout=last_timeout)
                    if not success:
                        print("Error in attempt")
                        encountered_error = True
                        break
                    last_timeout = math.ceil(2 * last_time)
                except Exception:
                    # a heartbeat timeout (queue.Empty) lands here
                    print("Attempt timed out")
                    encountered_error = True
                    break
            if not encountered_error:
                break

            if proc.is_alive():
                proc.terminate()
            num_retries += 1

            successful_trials = (b + 1) - dry_run
            if successful_trials > 0:
                remaining_reps -= successful_trials

        if num_retries == max_retries:
            raise RuntimeError("Used the max number of retries")

    report_results(model_name, i, config, specific_params, num_retries,
                   out_file, use_dtr, is_trial, trial_run_outfile,
                   results_queue)
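
LocalQueue lets the non-retry path share code with the multiprocessing path: run_measurements and report_results only need put/get. Since queue.Queue already matches the put/get(block, timeout) signatures of mp.Queue, a sketch can be a thin alias:

import queue

class LocalQueue(queue.Queue):
    # hypothetical sketch: an in-process queue exposing the same
    # put/get interface as multiprocessing.Queue, for the non-retry path
    pass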