Exemplo n.º 1
0
def split_key_value_store(
        input_store: KeyValueStore,
        *,
        num_parts: int,
        random_seed: Optional[int] = None) -> Tuple[KeyValueStore, ...]:
    """
    Splits *input_store* into *num_parts* pieces of nearly equal size.

    Some of the resulting key-value stores may be empty.

    A single job is scheduled under the locator ``input_store.locator / "split"``.
    Each returned store points at ``<slice_index>.zip`` inside the directory for
    that locator and depends on the scheduled job.

    :param input_store: the key-value store to split; the split job depends on it.
    :param num_parts: how many output stores to produce; must be positive.
    :param random_seed: if given (including ``0``), forwarded to the split job
        as its ``random_seed`` parameter.
    :return: a tuple of *num_parts* zip-backed key-value stores.
    :raises RuntimeError: if *num_parts* is not positive.
    """
    if num_parts <= 0:
        raise RuntimeError("Number of parts must be positive")

    split_locator = input_store.locator / "split"
    split_output_dir = directory_for(split_locator)
    param_args = {
        "input": input_store.input_parameters(),
        "num_slices": num_parts,
        "output_dir": split_output_dir,
    }
    # Compare against None rather than relying on truthiness: 0 is a valid seed
    # and must not be silently dropped.
    if random_seed is not None:
        param_args["random_seed"] = random_seed
    split_job = run_python_on_parameters(
        split_locator,
        split_entry_point,
        Parameters.from_mapping(param_args),
        depends_on=input_store,
    )
    return tuple(
        ZipKeyValueStore(
            path=split_output_dir / f"{slice_index}.zip",
            depends_on=split_job,
            locator=split_locator / str(slice_index),
        ) for slice_index in range(num_parts))
Exemplo n.º 2
0
def downsample(input_store: KeyValueStore,
               *,
               limit: int,
               output_locator: Optional[Locator] = None) -> KeyValueStore:
    """
    Convenience function to run `vistautils.scripts.downsample_key_value_store` as a Pegasus Job.

    When *output_locator* is not supplied, the job is placed under
    ``input_store.locator / "downsampled-<limit>"``. The result is written to
    ``downsampled.zip`` in the directory for that locator, and the job is always
    given a ``random_seed`` of 0.
    """
    target_locator = output_locator
    if not target_locator:
        target_locator = input_store.locator / f"downsampled-{limit}"

    zip_path = directory_for(target_locator) / "downsampled.zip"

    job_parameters = Parameters.from_mapping({
        "input": input_store.input_parameters(),
        "output_zip_path": zip_path,
        "num_to_sample": limit,
        "random_seed": 0,
    })
    job = run_python_on_parameters(
        target_locator,
        downsample_key_value_store,
        job_parameters,
        depends_on=input_store,
    )

    # The resulting store depends on both the input's dependencies and the new job.
    upstream = [input_store.depends_on, job]
    return ZipKeyValueStore(path=zip_path,
                            locator=target_locator,
                            depends_on=upstream)
Exemplo n.º 3
0
def test_not_clearing_ckpts(monkeypatch, tmp_path):
    """
    Check that a pre-existing ``___ckpt`` file is still present after the
    workflow description is written when the interactive prompt is answered "n".
    """
    initialize_vista_pegasus_wrapper(
        Parameters.from_mapping({
            "workflow_name": "Test",
            "workflow_created": "Testing",
            "workflow_log_dir": str(tmp_path / "log"),
            "workflow_directory": str(tmp_path / "working"),
            "site": "saga",
            "namespace": "test",
            "partition": "scavenge",
            "home_dir": str(tmp_path),
        }))

    job_locator = Locator(_parse_parts("jobs/multiply"))
    output_path = tmp_path / "multiplied_nums.txt"
    input_path = tmp_path / "raw_nums.txt"
    job_params = Parameters.from_mapping({
        "input_file": input_path,
        "output_file": output_path,
        "x": 4
    })

    # Create the checkpoint marker and the output file before the job is requested.
    ckpt_marker = directory_for(job_locator) / "___ckpt"
    ckpt_marker.touch()
    output_path.touch()

    run_python_on_parameters(job_locator,
                             multiply_by_x_main,
                             job_params,
                             depends_on=[])

    # Any call to input() made while writing the workflow is answered with "n".
    monkeypatch.setattr("builtins.input", lambda _: "n")
    write_workflow_description()

    assert ckpt_marker.exists()
Exemplo n.º 4
0
def join_to_key_value_zip(key_value_zips_to_join: Iterable[ZipKeyValueStore],
                          *, output_locator: Locator) -> ZipKeyValueStore:
    """
    Schedule a job running `join_key_value_stores` over the given zip stores.

    The joined output is written to ``joined.zip`` in the directory for
    *output_locator*; the returned store depends on the scheduled job.
    """
    # Materialize once: the inputs are used both for their paths and as dependencies.
    sources = tuple(key_value_zips_to_join)
    joined_path = directory_for(output_locator) / "joined.zip"

    source_paths = [source.path for source in sources]
    job_parameters = Parameters.from_mapping({
        "input_store_list_file": source_paths,
        "output": {
            "type": "zip",
            "path": joined_path
        },
    })

    job = run_python_on_parameters(
        output_locator,
        join_key_value_stores,
        job_parameters,
        depends_on=sources,
    )
    return ZipKeyValueStore(
        path=joined_path,
        locator=output_locator,
        depends_on=job,
    )
Exemplo n.º 5
0
def gaze_ablation_runner_entry_point(params: Parameters) -> None:
    """
    Schedule one experiment job for every gaze ablation parameter combination
    within the configured ranges, then write the workflow description.
    """
    initialize_vista_pegasus_wrapper(params)

    # Settings shared by every experiment live in the "gaze_ablation" namespace, e.g.:
    #
    #     include_image_links: true
    #     sort_learner_descriptions_by_length: True
    #     num_pretty_descriptions: 5
    shared_params = params.namespace("gaze_ablation")

    # Bounds on the number of objects in a scene.
    fewest_objects = params.integer("min_num_objects", default=1)
    most_objects = params.integer("max_num_objects", default=7)

    # Evenly spaced accuracies in [0, 1]; the default of 11 gives steps of 0.1.
    accuracy_steps = params.integer("num_increments", default=11)
    accuracies = np.linspace(0, 1, accuracy_steps)

    # Bounds on the number of noise instances.
    fewest_noise = params.integer("min_num_noise", default=0)
    most_noise = params.integer("max_num_noise", default=0)

    # Bounds on the number of instances in the whole curriculum.
    fewest_instances = params.integer("min_instances", default=10)
    most_instances = params.integer("max_instances", default=20)

    # All upper bounds are inclusive.
    noise_counts = range(fewest_noise, most_noise + 1)
    instance_counts = range(fewest_instances, most_instances + 1)
    object_counts = range(fewest_objects, most_objects + 1)

    for noise_count in noise_counts:
        for instance_count in instance_counts:
            for object_count in object_counts:
                for p_given in accuracies:
                    for p_not_given in accuracies:
                        # Run each configuration both with and without gaze.
                        for gaze_enabled in [True, False]:
                            # The name doubles as the job name and as the
                            # directory in which results are stored.
                            name_string = EXPERIMENT_NAME_FORMAT.format(
                                num_instances=instance_count,
                                num_noise_instances=noise_count,
                                num_objects_in_instance=object_count,
                                prob_given=p_given,
                                prob_not_given=p_not_given,
                                add_gaze=gaze_enabled,
                            )
                            locator = Locator(name_string.split("-"))

                            # Start from the shared parameters plus the fixed
                            # ones, then layer the per-experiment overrides.
                            with_fixed = shared_params.unify(FIXED_PARAMETERS)
                            overrides = {
                                "experiment": name_string,
                                "experiment_group_dir": directory_for(locator),
                                "hypothesis_log_dir":
                                directory_for(locator) / "hypotheses",
                                "learner_logging_path": directory_for(locator),
                                "log_learner_state": True,
                                "resume_from_latest_logged_state": True,
                                "pursuit-curriculum-params": {
                                    "num_instances": instance_count,
                                    "num_noise_instances": noise_count,
                                    "num_objects_in_instance": object_count,
                                    "add_gaze": gaze_enabled,
                                    "prob_given": float(p_given),
                                    "prob_not_given": float(p_not_given),
                                },
                            }
                            job_params = with_fixed.unify(overrides)

                            run_python_on_parameters(
                                locator,
                                log_experiment_script,
                                job_params,
                                depends_on=[],
                            )

    write_workflow_description()
Exemplo n.º 6
0
def main(params: Parameters):
    """
    Run (or schedule) the selected M13 experiments.

    Each ``include_*`` flag selects one parameter file from
    ``<adam_root>/parameters/experiments/m13``. When ``use_pegasus`` is false
    the experiments are run directly, one after another; otherwise each becomes
    a workflow job and the workflow description is written at the end.
    """
    adam_root = params.existing_directory("adam_root")
    experiments_dir = adam_root / "parameters" / "experiments" / "m13"
    use_pegasus = params.boolean("use_pegasus", default=False)
    if use_pegasus:
        initialize_vista_pegasus_wrapper(params)

    # (flag name, default when the flag is absent, parameter file name)
    selectable_experiments = (
        ("include_objects", True, "objects.params"),
        ("include_imprecise_size", True, "imprecise_size.params"),
        ("include_imprecise_temporal", True, "imprecise_temporal.params"),
        ("include_subtle_verb", True, "subtle_verb.params"),
        ("include_object_restrictions", True, "object_restrictions.params"),
        (
            "include_functionally_defined_objects",
            True,
            "functionally_defined_objects.params",
        ),
        ("include_relations", True, "relations.params"),
        ("include_generics", True, "generics.params"),
        (
            "include_verbs_with_dynamic_prepositions",
            True,
            "events_with_dynamic_prepositions.params",
        ),
        ("include_m9_complete", False, "m9_complete.params"),
        ("include_m13_complete", False, "m13_complete.params"),
        ("include_m13_shuffled", False, "m13_shuffled.params"),
        # This activates a special "debug" curriculum, which is meant to be
        # edited in the code by a developer to do fine-grained debugging.
        ("include_debug", False, "debug.params"),
    )

    param_files: List[Path] = [
        experiments_dir / file_name
        for flag, enabled_by_default, file_name in selectable_experiments
        if params.boolean(flag, default=enabled_by_default)
    ]

    # Fail fast on the first missing param file instead of making the user
    # wait for the error.
    first_missing = next(
        (candidate for candidate in param_files if not candidate.exists()), None
    )
    if first_missing is not None:
        raise RuntimeError(f"Expected param file {first_missing} does not exist")

    for param_file in param_files:
        logging.info("Running %s", param_file)
        experiment_params = YAMLParametersLoader().load(param_file)

        if not use_pegasus:
            log_experiment_entry_point(experiment_params)
            continue

        experiment_name = Locator(experiment_params.string("experiment"))
        pegasus_overrides = {
            "experiment_group_dir": directory_for(experiment_name) / "output",
            "hypothesis_log_dir": directory_for(experiment_name) / "hypotheses",
            # State pickles will go under experiment_name/learner_state
            "learner_logging_path": directory_for(experiment_name),
            "log_learner_state": True,
            "resume_from_latest_logged_state": True,
            "log_hypothesis_every_n_steps": params.integer(
                "save_state_every_n_steps"
            ),
            "debug_learner_pickling": params.boolean(
                "debug_learner_pickling", default=False
            ),
        }
        run_python_on_parameters(
            experiment_name,
            log_experiment_script,
            experiment_params.unify(pegasus_overrides),
            depends_on=[],
        )

    if use_pegasus:
        write_workflow_description()
Exemplo n.º 7
0
def test_dax_with_python_into_container_jobs(tmp_path):
    """
    Build a workflow whose Python jobs run inside a Docker container next to a
    MongoDB service container, checking the generated scripts and the error
    cases along the way.
    """
    image_name = "pegasus_wrapper_container_demo"
    image_tag = "0.2"
    image_ref = f"{image_name}:{image_tag}"
    docker_tar = Path(f"{tmp_path}/docker/tar.tar")
    docker_build_dir = tmp_path
    scratch_tar = f"/scratch/dockermount/pegasus_wrapper_tmp/{docker_tar.name}"

    # Generating parameters for initializing a workflow
    # We recommend making workflow directory, site, and partition parameters
    # in a research workflow
    workflow_params = Parameters.from_mapping({
        "workflow_name": "Test",
        "workflow_created": "Testing",
        "workflow_log_dir": str(tmp_path / "log"),
        "workflow_directory": str(tmp_path / "working"),
        "site": "saga",
        "namespace": "test",
        "home_dir": str(tmp_path),
        "partition": "scavenge",
    })

    saga31_request = SlurmResourceRequest.from_parameters(
        Parameters.from_mapping({
            "run_on_single_node": "saga31",
            "partition": "gaia"
        }))

    # Source input and outputs for the sample jobs
    input_file = tmp_path / "raw_nums.txt"
    add_y_output_file_nas = tmp_path / "nums_y.txt"
    sorted_output_file_nas = tmp_path / "sorted.txt"

    rng = Random()
    rng.seed(0)
    nums = [int(rng.random() * 100) for _ in range(25)]

    # Base job locator and the location of the scripts inside the container
    job_locator = Locator(("jobs", ))
    docker_python_root = Path("/home/app/")

    # Write a list of numbers out to be able to run the workflow
    with input_file.open("w") as numbers_out:
        numbers_out.writelines(f"{num}\n" for num in nums)

    initialize_vista_pegasus_wrapper(workflow_params)

    # Job that builds the Docker image and saves it to a tar file.
    build_container_locator = job_locator / "build_docker"
    build_commands = [
        "mkdir -p /scratch/dockermount/pegasus_wrapper_tmp",
        f"cd {docker_build_dir}",
        f"docker build . -t {image_ref}",
        f"docker save -o {scratch_tar} {image_ref}",
        f"cp {scratch_tar} {docker_tar.absolute()}",
        f"chmod go+r {docker_tar.absolute()}",
    ]
    build_container = run_bash(
        build_container_locator,
        command=build_commands,
        depends_on=[],
        resource_request=saga31_request,
    )
    assert (directory_for(build_container_locator) / "script.sh").exists()

    python36 = add_container(
        image_ref,
        "docker",
        str(docker_tar.absolute()),
        image_site="saga",
        bypass_staging=True,
    )

    job_profile = PegasusProfile(namespace="pegasus",
                                 key="transfer.bypass.input.staging",
                                 value="True")

    mongo4_4 = add_container("mongo:4.4",
                             "docker",
                             "path/to/tar.tar",
                             image_site="saga",
                             bypass_staging=True)

    # Stopping the service before it has been started is expected to raise.
    with pytest.raises(RuntimeError):
        _ = stop_docker_as_service(mongo4_4,
                                   depends_on=[],
                                   resource_request=saga31_request)

    start_mongo = start_docker_as_service(
        mongo4_4,
        depends_on=[build_container],
        docker_args="-v /scratch/mongo/data/db:/data/db",
        resource_request=saga31_request,
    )
    mongo4_4_dir = directory_for(Locator(("containers", mongo4_4.name)))
    for service_script in ("start.sh", "stop.sh"):
        assert (mongo4_4_dir / service_script).exists()

    add_y_locator = job_locator / "add"

    def request_add_y(input_paths):
        # Fresh argument lists are built on every call, as in separate call sites.
        return run_python_on_args(
            add_y_locator,
            docker_python_root / "add_y.py",
            set_args=f"{input_file} {add_y_output_file_nas} --y 10",
            depends_on=[build_container],
            job_profiles=[job_profile],
            resource_request=saga31_request,
            container=python36,
            input_file_paths=input_paths,
            output_file_paths=[add_y_output_file_nas],
        )

    add_y_job = request_add_y([input_file])
    assert (directory_for(add_y_locator) / "___run.sh").exists()

    # The same job requested with the input path listed twice is expected to raise.
    with pytest.raises(RuntimeError):
        _ = request_add_y([input_file, input_file])

    sort_job_locator = job_locator / "sort"

    def request_sort(output_paths):
        return run_python_on_parameters(
            sort_job_locator,
            sort_nums_main,
            {
                "input_file": add_y_output_file_nas,
                "output_file": sorted_output_file_nas
            },
            depends_on=[add_y_job],
            container=python36,
            job_profiles=[job_profile],
            resource_request=saga31_request,
            input_file_paths=add_y_output_file_nas,
            output_file_paths=output_paths,
        )

    sort_job = request_sort(sorted_output_file_nas)
    # Requesting the identical job again must compare equal to the first result.
    assert sort_job == request_sort(sorted_output_file_nas)
    sort_job_dir = directory_for(sort_job_locator)
    assert (sort_job_dir / "___run.sh").exists()
    assert (sort_job_dir / "____params.params").exists()

    # The same job requested with the output path listed twice is expected to raise.
    with pytest.raises(RuntimeError):
        _ = request_sort([sorted_output_file_nas, sorted_output_file_nas])

    celebration_bash_locator = job_locator / "celebrate"

    def request_celebration():
        return run_bash(
            celebration_bash_locator,
            'echo "Jobs Runs Successfully"',
            depends_on=[sort_job],
            job_profiles=[job_profile],
        )

    celebration_bash = request_celebration()
    assert celebration_bash == request_celebration()
    assert (directory_for(celebration_bash_locator) / "script.sh").exists()

    _ = stop_docker_as_service(mongo4_4,
                               depends_on=[start_mongo, sort_job],
                               resource_request=saga31_request)

    # Generate the Pegasus DAX file & a Submit Script
    dax_file_one = write_workflow_description(tmp_path)
    assert dax_file_one.exists()

    assert (tmp_path / "submit.sh").exists()
Exemplo n.º 8
0
def test_dax_with_job_on_saga_with_dict_as_params(tmp_path):
    """
    Build a multiply -> sort -> add workflow whose job parameters are passed as
    plain dicts, then check the generated scripts, DAX files, submit scripts
    and catalog files.
    """
    workflow_params = Parameters.from_mapping({
        "workflow_name": "Test",
        "workflow_created": "Testing",
        "workflow_log_dir": str(tmp_path / "log"),
        "workflow_directory": str(tmp_path / "working"),
        "site": "saga",
        "namespace": "test",
        "partition": "gaia",
        "experiment_name": "fred",
        "home_dir": str(tmp_path),
    })
    slurm_params = Parameters.from_mapping({
        "partition": "gaia",
        "num_cpus": 1,
        "num_gpus": 0,
        "memory": "4G"
    })

    multiply_input_file = tmp_path / "raw_nums.txt"
    rng = Random()
    rng.seed(0)
    nums = immutableset(int(rng.random() * 100) for _ in range(25))
    multiply_output_file = tmp_path / "multiplied_nums.txt"
    sorted_output_file = tmp_path / "sorted_nums.txt"
    add_output_file = tmp_path / "add_nums.txt"
    with multiply_input_file.open("w") as numbers_out:
        numbers_out.writelines(f"{num}\n" for num in nums)

    # Job parameters are deliberately plain dicts rather than Parameters objects.
    multiply_params = {
        "input_file": multiply_input_file,
        "output_file": multiply_output_file,
        "x": 4,
    }
    sort_params = {
        "input_file": multiply_output_file,
        "output_file": sorted_output_file
    }
    add_args = f"{sorted_output_file} {add_output_file} --y 10"

    job_profile = PegasusProfile(namespace="pegasus",
                                 key="transfer.bypass.input.staging",
                                 value="True")
    resources = SlurmResourceRequest.from_parameters(slurm_params)
    initialize_vista_pegasus_wrapper(workflow_params)

    multiply_job_name = Locator(_parse_parts("jobs/multiply"))
    multiply_job = run_python_on_parameters(
        multiply_job_name,
        multiply_by_x_main,
        multiply_params,
        depends_on=[],
        job_profiles=[job_profile],
    )
    multiply_artifact = ValueArtifact(
        multiply_output_file,
        depends_on=multiply_job,
        locator=Locator("multiply"),
    )
    multiply_dir = directory_for(multiply_job_name)
    assert (multiply_dir / "___run.sh").exists()
    assert (multiply_dir / "____params.params").exists()

    sort_job_name = Locator(_parse_parts("jobs/sort"))
    sort_dir = directory_for(sort_job_name)
    sort_artifact = run_python_on_parameters(
        sort_job_name,
        sort_nums_main,
        sort_params,
        depends_on=[multiply_artifact],
        resource_request=resources,
        category="add",
    )
    assert (sort_dir / "___run.sh").exists()
    assert (sort_dir / "____params.params").exists()

    add_job_name = Locator(_parse_parts("jobs/add"))
    add_dir = directory_for(add_job_name)
    run_python_on_args(add_job_name,
                       "add_job_main.py",
                       add_args,
                       depends_on=[sort_artifact])
    assert (add_dir / "___run.sh").exists()

    # Write the description twice: once with an explicit argument, once without.
    dax_file_one = write_workflow_description(tmp_path)
    dax_file_two = write_workflow_description()

    assert dax_file_one.exists()
    assert dax_file_two.exists()

    submit_script_one = tmp_path / "submit_script_one.sh"
    submit_script_two = tmp_path / "submit_script_two.sh"
    build_submit_script(submit_script_one, str(dax_file_one),
                        experiment_directory())
    build_submit_script(submit_script_two, str(dax_file_two),
                        experiment_directory())

    assert submit_script_one.exists()
    assert submit_script_two.exists()

    # The catalogs and the properties file must be in the workflow directory.
    for expected_file in ("sites.yml", "replicas.yml", "transformations.yml",
                          "pegasus.properties"):
        expected_path = (
            workflow_params.existing_directory("workflow_directory") /
            expected_file)
        assert expected_path.exists()
Exemplo n.º 9
0
def test_dax_with_job_in_container(tmp_path):
    """
    Build a multiply -> sort workflow whose jobs run in a registered Docker
    container, and check that an unsupported container type is rejected.
    """
    workflow_params = Parameters.from_mapping({
        "workflow_name": "Test",
        "workflow_created": "Testing",
        "workflow_log_dir": str(tmp_path / "log"),
        "workflow_directory": str(tmp_path / "working"),
        "site": "saga",
        "namespace": "test",
        "partition": "gaia",
        "experiment_name": "fred",
        "home_dir": str(tmp_path),
    })

    slurm_params = Parameters.from_mapping({
        "partition": "gaia",
        "num_cpus": 1,
        "num_gpus": 0,
        "memory": "4G"
    })

    multiply_input_file = tmp_path / "raw_nums.txt"
    rng = Random()
    rng.seed(0)
    nums = immutableset(int(rng.random() * 100) for _ in range(25))
    multiply_output_file = tmp_path / "multiplied_nums.txt"
    sorted_output_file = tmp_path / "sorted_nums.txt"

    with multiply_input_file.open("w") as numbers_out:
        numbers_out.writelines(f"{num}\n" for num in nums)

    multiply_params = Parameters.from_mapping({
        "input_file": multiply_input_file,
        "output_file": multiply_output_file,
        "x": 4
    })
    sort_params = Parameters.from_mapping({
        "input_file": multiply_output_file,
        "output_file": sorted_output_file
    })

    resources = SlurmResourceRequest.from_parameters(slurm_params)
    initialize_vista_pegasus_wrapper(workflow_params)

    # Register the container that both jobs will run in.
    example_docker = add_container("example_container", "docker",
                                   tmp_path / "docker.img")

    # A container type of "invalid" is expected to be rejected.
    with pytest.raises(ValueError):
        _ = add_container("fake_container", "invalid",
                          tmp_path / "invalid_docker.img")

    multiply_job_name = Locator(_parse_parts("jobs/multiply"))
    multiply_job = run_python_on_parameters(
        multiply_job_name,
        multiply_by_x_main,
        multiply_params,
        depends_on=[],
        container=example_docker,
    )
    multiply_artifact = ValueArtifact(
        multiply_output_file,
        depends_on=multiply_job,
        locator=Locator("multiply"),
    )
    multiply_dir = directory_for(multiply_job_name)
    assert (multiply_dir / "___run.sh").exists()
    assert (multiply_dir / "____params.params").exists()

    sort_job_name = Locator(_parse_parts("jobs/sort"))
    sort_dir = directory_for(sort_job_name)
    run_python_on_parameters(
        sort_job_name,
        sort_nums_main,
        sort_params,
        depends_on=[multiply_artifact],
        resource_request=resources,
        container=example_docker,
    )
    assert (sort_dir / "___run.sh").exists()
    assert (sort_dir / "____params.params").exists()

    dax_file_one = write_workflow_description()
    assert dax_file_one.exists()

    # The catalogs and the properties file must be in the workflow directory.
    for expected_file in ("sites.yml", "replicas.yml", "transformations.yml",
                          "pegasus.properties"):
        expected_path = (
            workflow_params.existing_directory("workflow_directory") /
            expected_file)
        assert expected_path.exists()
Exemplo n.º 10
0
def test_dax_with_checkpointed_jobs_on_saga(tmp_path):
    """
    Build a multiply -> sort workflow where the multiply job already has a
    ``___ckpt`` file, and check that the catalogs are written and the replica
    catalog is non-empty.
    """
    workflow_params = Parameters.from_mapping({
        "workflow_name": "Test",
        "workflow_created": "Testing",
        "workflow_log_dir": str(tmp_path / "log"),
        "workflow_directory": str(tmp_path / "working"),
        "site": "saga",
        "namespace": "test",
        "partition": "gaia",
        "home_dir": str(tmp_path),
    })
    slurm_params = Parameters.from_mapping({
        "partition": "gaia",
        "num_cpus": 1,
        "num_gpus": 0,
        "memory": "4G"
    })
    resources = SlurmResourceRequest.from_parameters(slurm_params)
    initialize_vista_pegasus_wrapper(workflow_params)

    multiply_job_name = Locator(_parse_parts("jobs/multiply"))
    multiply_output_file = tmp_path / "multiplied_nums.txt"
    multiply_input_file = tmp_path / "raw_nums.txt"
    multiply_params = Parameters.from_mapping({
        "input_file": multiply_input_file,
        "output_file": multiply_output_file,
        "x": 4
    })

    # Create checkpointed file so that when trying to create the job again,
    # Pegasus just adds the file to the Replica Catalog
    ckpt_marker = directory_for(multiply_job_name) / "___ckpt"
    ckpt_marker.touch()
    multiply_output_file.touch()

    assert ckpt_marker.exists()
    assert multiply_output_file.exists()

    multiply_job = run_python_on_parameters(multiply_job_name,
                                            multiply_by_x_main,
                                            multiply_params,
                                            depends_on=[])
    multiply_artifact = ValueArtifact(
        multiply_output_file,
        depends_on=multiply_job,
        locator=Locator("multiply"),
    )

    sort_job_name = Locator(_parse_parts("jobs/sort"))
    sorted_output_file = tmp_path / "sorted_nums.txt"
    sort_params = Parameters.from_mapping({
        "input_file": multiply_output_file,
        "output_file": sorted_output_file
    })
    run_python_on_parameters(
        sort_job_name,
        sort_nums_main,
        sort_params,
        depends_on=[multiply_artifact],
        resource_request=resources,
    )

    write_workflow_description()

    # The catalogs and the properties file must be in the workflow directory.
    workflow_files = {}
    for expected_file in ("sites.yml", "replicas.yml", "transformations.yml",
                          "pegasus.properties"):
        workflow_files[expected_file] = (
            workflow_params.existing_directory("workflow_directory") /
            expected_file)
        assert workflow_files[expected_file].exists()

    # Make sure the Replica Catalog is not empty
    assert workflow_files["replicas.yml"].stat().st_size > 0
Exemplo n.º 11
0
def integrated_experiment_entry_point(params: Parameters) -> None:
    """
    Describe the integrated learners experiment as a workflow.

    For every combination of *add_noise* and *shuffled* (each True and False),
    a curriculum generation job running `generate_curriculum_script` is created,
    followed by an experiment job running `log_experiment_script` which depends
    on that curriculum job. The workflow description is written once all jobs
    have been created.

    Values read from *params*:

    * ``integrated_learners_experiment``: namespace of baseline parameters
      shared by every job.
    * ``pursuit_resource_request``: namespace used to build the Slurm resource
      request for experiments where any learner is a pursuit learner.
    * ``include_attributes`` / ``include_relations``: booleans, default True.
    * ``num_pursuit_learners_active``: integer, default 8; the job limit for the
      ``"pursuit"`` category.
    * ``curriculum_repository_path``: directory shared by the curriculum
      generation jobs and the experiment jobs.
    * ``object_learner.learner_type``, ``attribute_learner.learner_type``,
      ``relation_learner.learner_type``: learner selections, each defaulting to
      ``"pursuit"``.
    """
    initialize_vista_pegasus_wrapper(params)

    baseline_parameters = params.namespace("integrated_learners_experiment")
    pursuit_resource_request_params = params.namespace(
        "pursuit_resource_request")

    # This code is commented out but may be used in the near future to add language ablation
    # capabilities to this curriculum.

    # get the minimum and maximum accuracy of the language with the situation
    # min_language_accuracy = params.floating_point("min_language_accuracy", default=0.1)
    # max_language_accuracy = params.floating_point("max_language_accuracy", default=0.5)
    # num_language_accuracy_increment = params.integer(
    #    "num_language_accuracy_increment", default=5
    # )
    # values_for_accuracy = np.linspace(
    #    min_language_accuracy, max_language_accuracy, num_language_accuracy_increment
    # )

    # Whether attributes and relations should be included in the curriculum.
    include_attributes = params.boolean("include_attributes", default=True)
    include_relations = params.boolean("include_relations", default=True)

    # Limit the "pursuit" category, which is the category the pursuit
    # experiment jobs below are submitted under.
    limit_jobs_for_category(
        "pursuit", params.integer("num_pursuit_learners_active", default=8))

    curriculum_repository_path = params.creatable_directory(
        "curriculum_repository_path")

    # The learner selection does not vary between curricula, so it is read (and
    # validated) once, before any job is created.
    object_learner_type = params.string(
        "object_learner.learner_type",
        valid_options=["pursuit", "subset", "pbv"],
        default="pursuit",
    )
    attribute_learner_type = params.string(
        "attribute_learner.learner_type",
        valid_options=["none", "pursuit", "subset"],
        default="pursuit",
    )
    relation_learner_type = params.string(
        "relation_learner.learner_type",
        valid_options=["none", "pursuit", "subset"],
        default="pursuit",
    )
    uses_pursuit = "pursuit" in (
        object_learner_type,
        attribute_learner_type,
        relation_learner_type,
    )
    experiment_category = "pursuit" if uses_pursuit else "subset"

    # Jobs to build the curricula which our learners use. Each entry is
    # (curriculum name, curriculum generation job, curriculum parameters).
    curriculum_dependencies = []
    for add_noise in (True, False):
        for shuffle in (True, False):
            curriculum_name = CURRICULUM_NAME_FORMAT.format(
                noise=add_noise,
                shuffled=shuffle,
                relations=include_relations,
                attributes=include_attributes,
            )
            curriculum_params = Parameters.from_mapping(
                CURRICULUM_PARAMS).unify({
                    "add_noise": add_noise,
                    "shuffled": shuffle,
                    "include_attributes": include_attributes,
                    "include_relations": include_relations,
                })
            curriculum_job = run_python_on_parameters(
                Locator(curriculum_name.split("-")),
                generate_curriculum_script,
                baseline_parameters.unify({
                    "train_curriculum": curriculum_params.as_mapping()
                }).unify(FIXED_PARAMETERS).unify(
                    {"curriculum_repository_path": curriculum_repository_path}),
                depends_on=[],
            )
            curriculum_dependencies.append(
                (curriculum_name, curriculum_job, curriculum_params))

    # Jobs to run the experiment on each curriculum.
    for curriculum_name, curriculum_job, curriculum_params in curriculum_dependencies:
        experiment_name_string = EXPERIMENT_NAME_FORMAT.format(
            # "-" separates locator components below, so it is replaced inside
            # the curriculum name to keep that name a single component.
            curriculum_name=curriculum_name.replace("-", "+"),
            object_learner=object_learner_type,
            attribute_learner=attribute_learner_type,
            relation_learner=relation_learner_type,
        )
        experiment_name = Locator(experiment_name_string.split("-"))
        experiment_dir = directory_for(experiment_name)

        # Note that the input parameters should include the root params and
        # anything else we want.
        experiment_params = baseline_parameters.unify(FIXED_PARAMETERS).unify({
            "experiment": experiment_name_string,
            "experiment_group_dir": experiment_dir,
            "hypothesis_log_dir": experiment_dir / "hypotheses",
            "learner_logging_path": experiment_dir,
            "log_learner_state": True,
            "resume_from_latest_logged_state": True,
            "load_from_curriculum_repository": curriculum_repository_path,
            "train_curriculum": curriculum_params,
        })

        run_python_on_parameters(
            experiment_name,
            log_experiment_script,
            experiment_params,
            depends_on=[curriculum_job],
            # Only experiments involving a pursuit learner get the dedicated
            # resource request.
            resource_request=SlurmResourceRequest.from_parameters(
                pursuit_resource_request_params) if uses_pursuit else None,
            category=experiment_category,
            use_pypy=True,
        )

    write_workflow_description()
def object_language_ablation_runner_entry_point(params: Parameters) -> None:
    """
    Describe the object language ablation experiments as a workflow.

    One `log_experiment_script` job is created for every combination of

    * a number of objects between ``min_num_objects`` (default 1) and
      ``max_num_objects`` (default 7), inclusive;
    * a language accuracy taken from ``num_language_accuracy_increment``
      (default 5) evenly spaced values between ``min_language_accuracy``
      (default 0.1) and ``max_language_accuracy`` (default 0.5);
    * a learner type and learner parameter setting from
      `LEARNER_VALUES_TO_PARAMS`.

    Jobs in the ``"pursuit"`` category are limited to
    ``num_pursuit_learners_active`` (default 8). The workflow description is
    written once every job has been created.
    """
    initialize_vista_pegasus_wrapper(params)

    baseline_parameters = params.namespace("object_language_ablation")
    pursuit_resource_request_params = params.namespace(
        "pursuit_resource_request")

    # Bounds on the number of objects in a scene.
    fewest_objects = params.integer("min_num_objects", default=1)
    most_objects = params.integer("max_num_objects", default=7)

    # Range of accuracies of the language with respect to the situation.
    lowest_accuracy = params.floating_point("min_language_accuracy",
                                            default=0.1)
    highest_accuracy = params.floating_point("max_language_accuracy",
                                             default=0.5)
    accuracy_steps = params.integer("num_language_accuracy_increment",
                                    default=5)
    accuracy_values = np.linspace(lowest_accuracy, highest_accuracy,
                                  accuracy_steps)

    max_pursuit_jobs = params.integer("num_pursuit_learners_active", default=8)
    limit_jobs_for_category("pursuit", max_pursuit_jobs)

    # Lazily enumerate every experiment configuration, in the same order nested
    # loops over these four sequences would visit them.
    configurations = (
        (object_count, accuracy, learner, setting_name, setting)
        for object_count in range(fewest_objects, most_objects + 1)
        for accuracy in accuracy_values
        for learner in LEARNER_VALUES_TO_PARAMS
        for setting_name, setting in LEARNER_VALUES_TO_PARAMS[learner]
    )

    for object_count, accuracy, learner, setting_name, setting in configurations:
        name_string = EXPERIMENT_NAME_FORMAT.format(
            num_objects=object_count,
            language_accuracy=accuracy,
            learner_type=learner,
            learner_params=setting_name,
        )
        experiment_locator = Locator(name_string.split("-"))

        # The job's parameters are the root parameters plus the
        # experiment-specific overrides below.
        shared_params = baseline_parameters.unify(FIXED_PARAMETERS)
        overrides = {
            "experiment": name_string,
            "experiment_group_dir": directory_for(experiment_locator),
            "hypothesis_log_dir":
            directory_for(experiment_locator) / "hypotheses",
            "learner_logging_path": directory_for(experiment_locator),
            "log_learner_state": True,
            "resume_from_latest_logged_state": True,
            "train_curriculum": {
                "accurate_language_percentage": float(accuracy)
            },
            "object_learner_type": learner,
            "object_learner": setting,
            # We subtract one because the target object is a given
            "num_noise_objects": object_count - 1,
        }
        job_params = shared_params.unify(overrides)

        # Only pursuit learners get the dedicated resource request.
        if learner == "pursuit":
            resources = SlurmResourceRequest.from_parameters(
                pursuit_resource_request_params)
        else:
            resources = None

        run_python_on_parameters(
            experiment_locator,
            log_experiment_script,
            job_params,
            depends_on=[],
            resource_request=resources,
            category=learner,
        )

    write_workflow_description()
Exemplo n.º 13
0
def explicit_train_dev_test_split(
    corpus: KeyValueStore,
    *,
    train_ids: ValueArtifact[Path],
    dev_ids: ValueArtifact[Path],
    test_ids: ValueArtifact[Path],
    output_locator: Locator,
    exhaustive: bool = True,
    downsample_to: Optional[int] = None,
) -> DataSplit:
    """
    Split *corpus* into train, dev, and test key-value stores using explicit key lists.

    *train_ids*, *dev_ids*, and *test_ids* are artifacts whose values are passed
    to the split job as the key files for the respective sets. The sets are
    written to ``train.zip``, ``dev.zip``, and ``test.zip`` in the directories
    for the ``train``, ``dev``, and ``test`` locators under *output_locator*.

    *exhaustive* is forwarded to the split job as ``must_be_exhaustive``. When it
    is True the split is expected to raise an exception if a document is not
    assigned to any of the three sets, which helps catch accidental omissions
    in the key lists.

    If *downsample_to* is given, each of the three stores is passed through
    `downsample` with that limit, which is useful for quicker debugging.

    See `DataSplit` for the output description.
    """
    split_names = ("train", "dev", "test")
    key_artifacts = {"train": train_ids, "dev": dev_ids, "test": test_ids}

    locators = {name: output_locator / name for name in split_names}
    zip_paths = {
        name: directory_for(locators[name]) / f"{name}.zip"
        for name in split_names
    }

    corpus_input = corpus.input_parameters()
    explicit_split = {
        name: {
            "keys_file": artifact.value,
            "output_file": zip_paths[name]
        }
        for name, artifact in key_artifacts.items()
    }
    explicit_split["must_be_exhaustive"] = exhaustive

    split_job = run_python_on_parameters(
        output_locator,
        split_entry_point,
        parameters={
            "input": corpus_input,
            "explicit_split": explicit_split,
        },
        depends_on=[corpus],
    )

    # Every resulting store shares this one dependency list.
    deps = [
        corpus.depends_on,
        split_job,
        *(artifact.depends_on for artifact in key_artifacts.values()),
    ]

    stores = {
        name: ZipKeyValueStore(zip_paths[name],
                               locator=locators[name],
                               depends_on=deps)
        for name in split_names
    }
    if downsample_to is not None:
        stores = {
            name: downsample(store, limit=downsample_to)
            for name, store in stores.items()
        }
    return DataSplit(**stores)