def split_key_value_store(
    input_store: KeyValueStore,
    *,
    num_parts: int,
    random_seed: Optional[int] = None,
) -> Tuple[KeyValueStore, ...]:
    """
    Splits *input_store* into *num_parts* pieces of nearly equal size.

    Some of the resulting key-value stores may be empty.

    The split itself is scheduled as a single job located at
    ``input_store.locator / "split"``.
    One `ZipKeyValueStore` is returned per slice; slice *i* is expected at ``<i>.zip``
    inside the split job's directory and depends on the split job.

    If *random_seed* is given (including ``0``), it is forwarded to the split job as
    ``random_seed``; if it is *None*, the job parameters contain no seed at all.

    Raises a `RuntimeError` if *num_parts* is not positive.
    """
    if num_parts <= 0:
        raise RuntimeError("Number of parts must be positive")

    split_locator = input_store.locator / "split"
    split_output_dir = directory_for(split_locator)
    param_args = {
        "input": input_store.input_parameters(),
        "num_slices": num_parts,
        "output_dir": split_output_dir,
    }
    # Compare against None explicitly: 0 is a perfectly valid seed, but it is falsy,
    # so a plain truthiness check would silently drop it.
    if random_seed is not None:
        param_args["random_seed"] = random_seed

    split_job = run_python_on_parameters(
        split_locator,
        split_entry_point,
        Parameters.from_mapping(param_args),
        depends_on=input_store,
    )
    return tuple(
        ZipKeyValueStore(
            path=split_output_dir / f"{slice_index}.zip",
            depends_on=split_job,
            locator=split_locator / str(slice_index),
        )
        for slice_index in range(num_parts)
    )
def downsample(
    input_store: KeyValueStore,
    *,
    limit: int,
    output_locator: Optional[Locator] = None,
    random_seed: int = 0,
) -> KeyValueStore:
    """
    Convenience function to run `vistautils.scripts.downsample_key_value_store`
    as a Pegasus Job.

    *limit* is passed to the job as ``num_to_sample``.

    If *output_locator* is not given, the job is placed at
    ``input_store.locator / f"downsampled-{limit}"``.

    *random_seed* is passed to the job as ``random_seed``.  It defaults to ``0``,
    the value this function previously hard-coded, so existing callers are unaffected.

    The returned `ZipKeyValueStore` points at ``downsampled.zip`` in the job's directory
    and depends on both the downsampling job and whatever *input_store* depended on.
    """
    if not output_locator:
        output_locator = input_store.locator / f"downsampled-{limit}"
    output_zip_path = directory_for(output_locator) / "downsampled.zip"

    downsample_job = run_python_on_parameters(
        output_locator,
        downsample_key_value_store,
        Parameters.from_mapping(
            {
                "input": input_store.input_parameters(),
                "output_zip_path": output_zip_path,
                "num_to_sample": limit,
                "random_seed": random_seed,
            }
        ),
        depends_on=input_store,
    )
    return ZipKeyValueStore(
        path=output_zip_path,
        locator=output_locator,
        depends_on=[input_store.depends_on, downsample_job],
    )
def test_not_clearing_ckpts(monkeypatch, tmp_path):
    """
    A job's checkpoint file must survive writing the workflow description
    when the patched `input` prompt is answered with "n".
    """
    initialize_vista_pegasus_wrapper(
        Parameters.from_mapping(
            {
                "workflow_name": "Test",
                "workflow_created": "Testing",
                "workflow_log_dir": str(tmp_path / "log"),
                "workflow_directory": str(tmp_path / "working"),
                "site": "saga",
                "namespace": "test",
                "partition": "scavenge",
                "home_dir": str(tmp_path),
            }
        )
    )

    job_locator = Locator(_parse_parts("jobs/multiply"))
    output_path = tmp_path / "multiplied_nums.txt"
    job_params = Parameters.from_mapping(
        {
            "input_file": tmp_path / "raw_nums.txt",
            "output_file": output_path,
            "x": 4,
        }
    )

    # Leave a checkpoint marker and an output file behind before scheduling the job.
    checkpoint_path = directory_for(job_locator) / "___ckpt"
    checkpoint_path.touch()
    output_path.touch()

    run_python_on_parameters(
        job_locator, multiply_by_x_main, job_params, depends_on=[]
    )

    # Every call to input() now answers "n".
    # NOTE(review): presumably the wrapper asks whether to clear checkpoints -- confirm.
    monkeypatch.setattr("builtins.input", lambda _: "n")
    write_workflow_description()

    assert checkpoint_path.exists()
def join_to_key_value_zip(
    key_value_zips_to_join: Iterable[ZipKeyValueStore], *, output_locator: Locator
) -> ZipKeyValueStore:
    """
    Schedules a job which joins *key_value_zips_to_join* into a single zip key-value store.

    The job is located at *output_locator* and depends on every store being joined.
    The returned `ZipKeyValueStore` points at ``joined.zip`` in that job's directory
    and depends on the join job.
    """
    # Materialize the input once: it is walked for its paths and also handed over
    # as the dependency collection, and an arbitrary iterable may be single-use.
    stores_to_join = tuple(key_value_zips_to_join)
    joined_zip_path = directory_for(output_locator) / "joined.zip"

    input_paths = [store.path for store in stores_to_join]
    join_parameters = Parameters.from_mapping(
        {
            "input_store_list_file": input_paths,
            "output": {"type": "zip", "path": joined_zip_path},
        }
    )
    join_job = run_python_on_parameters(
        output_locator,
        join_key_value_stores,
        join_parameters,
        depends_on=stores_to_join,
    )

    return ZipKeyValueStore(
        path=joined_zip_path, locator=output_locator, depends_on=join_job
    )
def gaze_ablation_runner_entry_point(params: Parameters) -> None:
    """
    Schedule one experiment job for every gaze ablation setting within the given ranges.

    The sweep covers, from outermost to innermost: the number of noise instances,
    the number of instances in the curriculum, the number of objects per instance,
    *prob_given*, *prob_not_given*, and whether gaze is added.
    The workflow description is written once all jobs have been scheduled.
    """
    initialize_vista_pegasus_wrapper(params)

    # The baseline experiment parameters for gaze ablation -- these are things
    # common to all of the experiments, like:
    #
    #     include_image_links: true
    #     sort_learner_descriptions_by_length: True
    #     num_pretty_descriptions: 5
    shared_settings = params.namespace("gaze_ablation")

    # Inclusive bounds on the number of objects in a scene.
    fewest_objects = params.integer("min_num_objects", default=1)
    most_objects = params.integer("max_num_objects", default=7)

    # The number of different accuracies to try; the default increments by 0.1.
    accuracy_step_count = params.integer("num_increments", default=11)
    accuracies = np.linspace(0, 1, accuracy_step_count)

    # Inclusive bounds on the number of noise instances to be included.
    fewest_noise_instances = params.integer("min_num_noise", default=0)
    most_noise_instances = params.integer("max_num_noise", default=0)

    # Inclusive bounds on the number of instances in the entire curriculum.
    fewest_instances = params.integer("min_instances", default=10)
    most_instances = params.integer("max_instances", default=20)

    # Every combination of settings, enumerated lazily in the same order
    # as the equivalent nest of for-loops (leftmost clause varies slowest).
    all_settings = (
        (noise_count, instance_count, object_count, given, not_given, with_gaze)
        for noise_count in range(fewest_noise_instances, most_noise_instances + 1)
        for instance_count in range(fewest_instances, most_instances + 1)
        for object_count in range(fewest_objects, most_objects + 1)
        for given in accuracies
        for not_given in accuracies
        # both perceiving and ignoring gaze
        for with_gaze in [True, False]
    )

    for (
        noise_count,
        instance_count,
        object_count,
        given,
        not_given,
        with_gaze,
    ) in all_settings:
        # The experiment name is used both as a job name and to
        # choose a directory in which to store the experiment results.
        experiment_name_string = EXPERIMENT_NAME_FORMAT.format(
            num_instances=instance_count,
            num_noise_instances=noise_count,
            num_objects_in_instance=object_count,
            prob_given=given,
            prob_not_given=not_given,
            add_gaze=with_gaze,
        )
        experiment_name = Locator(experiment_name_string.split("-"))

        curriculum_settings = {
            "num_instances": instance_count,
            "num_noise_instances": noise_count,
            "num_objects_in_instance": object_count,
            "add_gaze": with_gaze,
            "prob_given": float(given),
            "prob_not_given": float(not_given),
        }

        # Note that the input parameters should include the root params and
        # anything else we want.
        experiment_settings = shared_settings.unify(FIXED_PARAMETERS).unify(
            {
                "experiment": experiment_name_string,
                "experiment_group_dir": directory_for(experiment_name),
                "hypothesis_log_dir": directory_for(experiment_name) / "hypotheses",
                "learner_logging_path": directory_for(experiment_name),
                "log_learner_state": True,
                "resume_from_latest_logged_state": True,
                "pursuit-curriculum-params": curriculum_settings,
            }
        )

        run_python_on_parameters(
            experiment_name,
            log_experiment_script,
            experiment_settings,
            depends_on=[],
        )

    write_workflow_description()
def main(params: Parameters):
    """
    Run the selected m13 experiments, either directly or as Pegasus workflow jobs.

    Each experiment is described by a parameter file under
    ``<adam_root>/parameters/experiments/m13``.  Which files take part is controlled
    by the ``include_*`` boolean parameters.

    If ``use_pegasus`` is false (the default), each experiment is run right away via
    `log_experiment_entry_point`.  Otherwise a job is scheduled per experiment
    and the workflow description is written at the end.

    Raises a `RuntimeError` if any selected parameter file does not exist.
    """
    adam_root = params.existing_directory("adam_root")
    m13_experiments_dir = adam_root / "parameters" / "experiments" / "m13"
    use_pegasus = params.boolean("use_pegasus", default=False)
    if use_pegasus:
        initialize_vista_pegasus_wrapper(params)

    # (controlling parameter, its default, parameter file name), in the order
    # in which the experiments are to be run.
    selectable_experiments = (
        ("include_objects", True, "objects.params"),
        ("include_imprecise_size", True, "imprecise_size.params"),
        ("include_imprecise_temporal", True, "imprecise_temporal.params"),
        ("include_subtle_verb", True, "subtle_verb.params"),
        ("include_object_restrictions", True, "object_restrictions.params"),
        (
            "include_functionally_defined_objects",
            True,
            "functionally_defined_objects.params",
        ),
        ("include_relations", True, "relations.params"),
        ("include_generics", True, "generics.params"),
        (
            "include_verbs_with_dynamic_prepositions",
            True,
            "events_with_dynamic_prepositions.params",
        ),
        ("include_m9_complete", False, "m9_complete.params"),
        ("include_m13_complete", False, "m13_complete.params"),
        ("include_m13_shuffled", False, "m13_shuffled.params"),
        # This activates a special "debug" curriculum,
        # which is meant to be edited in the code by a developer
        # to do fine-grained debugging.
        ("include_debug", False, "debug.params"),
    )
    param_files: List[Path] = [
        m13_experiments_dir / file_name
        for flag_name, flag_default, file_name in selectable_experiments
        if params.boolean(flag_name, default=flag_default)
    ]

    # If any of the param files don't exist, bail out earlier instead of making the user
    # wait for the error.
    for candidate in param_files:
        if candidate.exists():
            continue
        raise RuntimeError(f"Expected param file {candidate} does not exist")

    for param_file in param_files:
        logging.info("Running %s", param_file)
        experiment_params = YAMLParametersLoader().load(param_file)

        if use_pegasus:
            experiment_name = Locator(experiment_params.string("experiment"))
            pegasus_overrides = {
                "experiment_group_dir": directory_for(experiment_name) / "output",
                "hypothesis_log_dir": directory_for(experiment_name) / "hypotheses",
                # State pickles will go under experiment_name/learner_state
                "learner_logging_path": directory_for(experiment_name),
                "log_learner_state": True,
                "resume_from_latest_logged_state": True,
                "log_hypothesis_every_n_steps": params.integer(
                    "save_state_every_n_steps"
                ),
                "debug_learner_pickling": params.boolean(
                    "debug_learner_pickling", default=False
                ),
            }
            run_python_on_parameters(
                experiment_name,
                log_experiment_script,
                experiment_params.unify(pegasus_overrides),
                depends_on=[],
            )
        else:
            log_experiment_entry_point(experiment_params)

    if use_pegasus:
        write_workflow_description()
def test_dax_with_python_into_container_jobs(tmp_path):
    """
    Builds a workflow whose Python jobs run inside a Docker container.

    Along the way this checks that the expected scripts get written to each job directory,
    that re-scheduling an identical job hands back an equal result, and that a
    `RuntimeError` is raised for: stopping a Docker service which was never started,
    and re-scheduling an existing job with different input/output file paths.
    """
    image_tar = Path(f"{tmp_path}/docker/tar.tar")
    build_dir = tmp_path
    image_name = "pegasus_wrapper_container_demo"
    image_tag = "0.2"
    image_reference = f"{image_name}:{image_tag}"

    # Generating parameters for initializing a workflow
    # We recommend making workflow directory, site, and partition parameters
    # in a research workflow
    workflow_params = Parameters.from_mapping(
        {
            "workflow_name": "Test",
            "workflow_created": "Testing",
            "workflow_log_dir": str(tmp_path / "log"),
            "workflow_directory": str(tmp_path / "working"),
            "site": "saga",
            "namespace": "test",
            "home_dir": str(tmp_path),
            "partition": "scavenge",
        }
    )
    node_request = SlurmResourceRequest.from_parameters(
        Parameters.from_mapping({"run_on_single_node": "saga31", "partition": "gaia"})
    )

    # Our source input for the sample jobs
    numbers_file = tmp_path / "raw_nums.txt"
    add_y_output = tmp_path / "nums_y.txt"
    sorted_output = tmp_path / "sorted.txt"

    rng = Random(0)
    numbers = [int(rng.random() * 100) for _ in range(25)]

    # Base Job Locator
    jobs_root = Locator(("jobs",))
    container_python_root = Path("/home/app/")

    # Write a list of numbers out to be able to run the workflow
    with numbers_file.open("w") as numbers_out:
        numbers_out.writelines(f"{number}\n" for number in numbers)

    initialize_vista_pegasus_wrapper(workflow_params)

    build_locator = jobs_root / "build_docker"
    scratch_tar = f"/scratch/dockermount/pegasus_wrapper_tmp/{image_tar.name}"
    build_job = run_bash(
        build_locator,
        command=[
            "mkdir -p /scratch/dockermount/pegasus_wrapper_tmp",
            f"cd {build_dir}",
            f"docker build . -t {image_reference}",
            f"docker save -o {scratch_tar} {image_reference}",
            f"cp {scratch_tar} {image_tar.absolute()}",
            f"chmod go+r {image_tar.absolute()}",
        ],
        depends_on=[],
        resource_request=node_request,
    )
    assert (directory_for(build_locator) / "script.sh").exists()

    python_container = add_container(
        image_reference,
        "docker",
        str(image_tar.absolute()),
        image_site="saga",
        bypass_staging=True,
    )
    bypass_staging_profile = PegasusProfile(
        namespace="pegasus", key="transfer.bypass.input.staging", value="True"
    )
    mongo_container = add_container(
        "mongo:4.4",
        "docker",
        "path/to/tar.tar",
        image_site="saga",
        bypass_staging=True,
    )

    # The service has not been started yet, so stopping it has to fail.
    with pytest.raises(RuntimeError):
        _ = stop_docker_as_service(
            mongo_container, depends_on=[], resource_request=node_request
        )

    mongo_start_job = start_docker_as_service(
        mongo_container,
        depends_on=[build_job],
        docker_args="-v /scratch/mongo/data/db:/data/db",
        resource_request=node_request,
    )
    mongo_dir = directory_for(Locator(("containers", mongo_container.name)))
    assert (mongo_dir / "start.sh").exists()
    assert (mongo_dir / "stop.sh").exists()

    add_locator = jobs_root / "add"

    def schedule_add_y(input_paths):
        # Every argument except the declared input files is the same on each call.
        return run_python_on_args(
            add_locator,
            container_python_root / "add_y.py",
            set_args=f"{numbers_file} {add_y_output} --y 10",
            depends_on=[build_job],
            job_profiles=[bypass_staging_profile],
            resource_request=node_request,
            container=python_container,
            input_file_paths=input_paths,
            output_file_paths=[add_y_output],
        )

    add_job = schedule_add_y([numbers_file])
    assert (directory_for(add_locator) / "___run.sh").exists()

    # Same job, but declaring a different list of input files.
    with pytest.raises(RuntimeError):
        _ = schedule_add_y([numbers_file, numbers_file])

    sort_locator = jobs_root / "sort"

    def schedule_sort(output_paths):
        # Every argument except the declared output files is the same on each call.
        return run_python_on_parameters(
            sort_locator,
            sort_nums_main,
            {"input_file": add_y_output, "output_file": sorted_output},
            depends_on=[add_job],
            container=python_container,
            job_profiles=[bypass_staging_profile],
            resource_request=node_request,
            input_file_paths=add_y_output,
            output_file_paths=output_paths,
        )

    sort_job = schedule_sort(sorted_output)
    # Scheduling the identical job again yields an equal result.
    assert sort_job == schedule_sort(sorted_output)

    sort_dir = directory_for(sort_locator)
    assert (sort_dir / "___run.sh").exists()
    assert (sort_dir / "____params.params").exists()

    # Same job, but declaring a different list of output files.
    with pytest.raises(RuntimeError):
        _ = schedule_sort([sorted_output, sorted_output])

    celebrate_locator = jobs_root / "celebrate"

    def schedule_celebration():
        return run_bash(
            celebrate_locator,
            'echo "Jobs Runs Successfully"',
            depends_on=[sort_job],
            job_profiles=[bypass_staging_profile],
        )

    celebrate_job = schedule_celebration()
    assert celebrate_job == schedule_celebration()
    assert (directory_for(celebrate_locator) / "script.sh").exists()

    _ = stop_docker_as_service(
        mongo_container,
        depends_on=[mongo_start_job, sort_job],
        resource_request=node_request,
    )

    # Generate the Pegasus DAX file & a Submit Script
    dax_file = write_workflow_description(tmp_path)
    assert dax_file.exists()

    submit_script = tmp_path / "submit.sh"
    assert submit_script.exists()
def test_dax_with_job_on_saga_with_dict_as_params(tmp_path):
    """
    Builds a multiply -> sort -> add workflow whose job parameters are plain dicts
    rather than `Parameters` objects.

    Checks that each job directory receives its run script (and parameter file where
    applicable), that the workflow description and submit scripts get written,
    and that the catalogs and properties file exist in the workflow directory.
    """
    workflow_params = Parameters.from_mapping(
        {
            "workflow_name": "Test",
            "workflow_created": "Testing",
            "workflow_log_dir": str(tmp_path / "log"),
            "workflow_directory": str(tmp_path / "working"),
            "site": "saga",
            "namespace": "test",
            "partition": "gaia",
            "experiment_name": "fred",
            "home_dir": str(tmp_path),
        }
    )
    slurm_params = Parameters.from_mapping(
        {"partition": "gaia", "num_cpus": 1, "num_gpus": 0, "memory": "4G"}
    )

    raw_numbers_file = tmp_path / "raw_nums.txt"
    rng = Random(0)
    numbers = immutableset(int(rng.random() * 100) for _ in range(25))

    multiplied_file = tmp_path / "multiplied_nums.txt"
    sorted_file = tmp_path / "sorted_nums.txt"
    added_file = tmp_path / "add_nums.txt"

    with raw_numbers_file.open("w") as numbers_out:
        numbers_out.writelines(f"{number}\n" for number in numbers)

    # Deliberately plain dicts: this is what the test is about.
    multiply_params = {
        "input_file": raw_numbers_file,
        "output_file": multiplied_file,
        "x": 4,
    }
    sort_params = {"input_file": multiplied_file, "output_file": sorted_file}
    add_args = f"{sorted_file} {added_file} --y 10"

    bypass_staging_profile = PegasusProfile(
        namespace="pegasus", key="transfer.bypass.input.staging", value="True"
    )
    resources = SlurmResourceRequest.from_parameters(slurm_params)

    initialize_vista_pegasus_wrapper(workflow_params)

    multiply_locator = Locator(_parse_parts("jobs/multiply"))
    multiply_job = run_python_on_parameters(
        multiply_locator,
        multiply_by_x_main,
        multiply_params,
        depends_on=[],
        job_profiles=[bypass_staging_profile],
    )
    multiply_artifact = ValueArtifact(
        multiplied_file, depends_on=multiply_job, locator=Locator("multiply")
    )
    multiply_dir = directory_for(multiply_locator)
    for expected_name in ("___run.sh", "____params.params"):
        assert (multiply_dir / expected_name).exists()

    sort_locator = Locator(_parse_parts("jobs/sort"))
    sort_dir = directory_for(sort_locator)
    sort_artifact = run_python_on_parameters(
        sort_locator,
        sort_nums_main,
        sort_params,
        depends_on=[multiply_artifact],
        resource_request=resources,
        category="add",
    )
    for expected_name in ("___run.sh", "____params.params"):
        assert (sort_dir / expected_name).exists()

    add_locator = Locator(_parse_parts("jobs/add"))
    add_dir = directory_for(add_locator)
    run_python_on_args(
        add_locator, "add_job_main.py", add_args, depends_on=[sort_artifact]
    )
    assert (add_dir / "___run.sh").exists()

    # Write the description twice: once to an explicit directory, once to the default.
    explicit_dax = write_workflow_description(tmp_path)
    default_dax = write_workflow_description()
    assert explicit_dax.exists()
    assert default_dax.exists()

    first_submit_script = tmp_path / "submit_script_one.sh"
    second_submit_script = tmp_path / "submit_script_two.sh"
    build_submit_script(first_submit_script, str(explicit_dax), experiment_directory())
    build_submit_script(second_submit_script, str(default_dax), experiment_directory())
    assert first_submit_script.exists()
    assert second_submit_script.exists()

    workflow_dir = workflow_params.existing_directory("workflow_directory")
    for catalog_name in (
        "sites.yml",
        "replicas.yml",
        "transformations.yml",
        "pegasus.properties",
    ):
        assert (workflow_dir / catalog_name).exists()
def test_dax_with_job_in_container(tmp_path):
    """
    Builds a multiply -> sort workflow in which both jobs run in a registered container.

    Also checks that registering a container with the type "invalid" raises a
    `ValueError`, and that the catalogs and properties file end up in the
    workflow directory.
    """
    workflow_params = Parameters.from_mapping(
        {
            "workflow_name": "Test",
            "workflow_created": "Testing",
            "workflow_log_dir": str(tmp_path / "log"),
            "workflow_directory": str(tmp_path / "working"),
            "site": "saga",
            "namespace": "test",
            "partition": "gaia",
            "experiment_name": "fred",
            "home_dir": str(tmp_path),
        }
    )
    slurm_params = Parameters.from_mapping(
        {"partition": "gaia", "num_cpus": 1, "num_gpus": 0, "memory": "4G"}
    )

    raw_numbers_file = tmp_path / "raw_nums.txt"
    rng = Random(0)
    numbers = immutableset(int(rng.random() * 100) for _ in range(25))

    multiplied_file = tmp_path / "multiplied_nums.txt"
    sorted_file = tmp_path / "sorted_nums.txt"

    with raw_numbers_file.open("w") as numbers_out:
        numbers_out.writelines(f"{number}\n" for number in numbers)

    multiply_params = Parameters.from_mapping(
        {"input_file": raw_numbers_file, "output_file": multiplied_file, "x": 4}
    )
    sort_params = Parameters.from_mapping(
        {"input_file": multiplied_file, "output_file": sorted_file}
    )
    resources = SlurmResourceRequest.from_parameters(slurm_params)

    initialize_vista_pegasus_wrapper(workflow_params)

    # Add Container
    container = add_container("example_container", "docker", tmp_path / "docker.img")

    with pytest.raises(ValueError):
        _ = add_container("fake_container", "invalid", tmp_path / "invalid_docker.img")

    multiply_locator = Locator(_parse_parts("jobs/multiply"))
    multiply_job = run_python_on_parameters(
        multiply_locator,
        multiply_by_x_main,
        multiply_params,
        depends_on=[],
        container=container,
    )
    multiply_artifact = ValueArtifact(
        multiplied_file, depends_on=multiply_job, locator=Locator("multiply")
    )
    multiply_dir = directory_for(multiply_locator)
    for expected_name in ("___run.sh", "____params.params"):
        assert (multiply_dir / expected_name).exists()

    sort_locator = Locator(_parse_parts("jobs/sort"))
    sort_dir = directory_for(sort_locator)
    run_python_on_parameters(
        sort_locator,
        sort_nums_main,
        sort_params,
        depends_on=[multiply_artifact],
        resource_request=resources,
        container=container,
    )
    for expected_name in ("___run.sh", "____params.params"):
        assert (sort_dir / expected_name).exists()

    dax_file = write_workflow_description()
    assert dax_file.exists()

    workflow_dir = workflow_params.existing_directory("workflow_directory")
    for catalog_name in (
        "sites.yml",
        "replicas.yml",
        "transformations.yml",
        "pegasus.properties",
    ):
        assert (workflow_dir / catalog_name).exists()
def test_dax_with_checkpointed_jobs_on_saga(tmp_path):
    """
    Builds a multiply -> sort workflow in which the multiply job already has a
    checkpoint file, then checks that the catalogs and properties file are written
    and that the Replica Catalog is not empty.
    """
    workflow_params = Parameters.from_mapping(
        {
            "workflow_name": "Test",
            "workflow_created": "Testing",
            "workflow_log_dir": str(tmp_path / "log"),
            "workflow_directory": str(tmp_path / "working"),
            "site": "saga",
            "namespace": "test",
            "partition": "gaia",
            "home_dir": str(tmp_path),
        }
    )
    resources = SlurmResourceRequest.from_parameters(
        Parameters.from_mapping(
            {"partition": "gaia", "num_cpus": 1, "num_gpus": 0, "memory": "4G"}
        )
    )
    initialize_vista_pegasus_wrapper(workflow_params)

    multiply_locator = Locator(_parse_parts("jobs/multiply"))
    multiplied_file = tmp_path / "multiplied_nums.txt"
    multiply_params = Parameters.from_mapping(
        {
            "input_file": tmp_path / "raw_nums.txt",
            "output_file": multiplied_file,
            "x": 4,
        }
    )

    # Create checkpointed file so that when trying to create the job again,
    # Pegasus just adds the file to the Replica Catalog
    checkpoint_file = directory_for(multiply_locator) / "___ckpt"
    checkpoint_file.touch()
    multiplied_file.touch()
    assert checkpoint_file.exists()
    assert multiplied_file.exists()

    multiply_job = run_python_on_parameters(
        multiply_locator, multiply_by_x_main, multiply_params, depends_on=[]
    )
    multiply_artifact = ValueArtifact(
        multiplied_file, depends_on=multiply_job, locator=Locator("multiply")
    )

    sort_locator = Locator(_parse_parts("jobs/sort"))
    sort_params = Parameters.from_mapping(
        {"input_file": multiplied_file, "output_file": tmp_path / "sorted_nums.txt"}
    )
    run_python_on_parameters(
        sort_locator,
        sort_nums_main,
        sort_params,
        depends_on=[multiply_artifact],
        resource_request=resources,
    )

    write_workflow_description()

    workflow_dir = workflow_params.existing_directory("workflow_directory")
    for catalog_name in (
        "sites.yml",
        "replicas.yml",
        "transformations.yml",
        "pegasus.properties",
    ):
        assert (workflow_dir / catalog_name).exists()

    # Make sure the Replica Catalog is not empty
    assert (workflow_dir / "replicas.yml").stat().st_size > 0
def integrated_experiment_entry_point(params: Parameters) -> None:
    """
    Schedule the integrated learner experiments as a workflow.

    For every combination of noise (on/off) and shuffling (on/off), one job is scheduled
    to generate the curriculum and a second job, depending on the first, to run the
    experiment on it.  The workflow description is written once all jobs are scheduled.

    The learner types are read from ``object_learner.learner_type``,
    ``attribute_learner.learner_type`` and ``relation_learner.learner_type``
    (each defaulting to ``"pursuit"``).  If any of them is ``"pursuit"``, the experiment
    job is put in the ``"pursuit"`` category and given the resource request described by
    the ``pursuit_resource_request`` namespace; otherwise it is put in the ``"subset"``
    category with no explicit resource request.  At most ``num_pursuit_learners_active``
    (default 8) jobs of the ``"pursuit"`` category are allowed at once.
    """
    initialize_vista_pegasus_wrapper(params)

    baseline_parameters = params.namespace("integrated_learners_experiment")
    pursuit_resource_request_params = params.namespace("pursuit_resource_request")

    # This code is commented out but may be used in the near future to add language ablation
    # Capabilities to this curriculum.

    # get the minimum and maximum accuracy of the language with the situation
    # min_language_accuracy = params.floating_point("min_language_accuracy", default=0.1)
    # max_language_accuracy = params.floating_point("max_language_accuracy", default=0.5)
    # num_language_accuracy_increment = params.integer(
    #    "num_language_accuracy_increment", default=5
    # )
    # values_for_accuracy = np.linspace(
    #    min_language_accuracy, max_language_accuracy, num_language_accuracy_increment
    # )

    # Get if attributes or relations should be included
    include_attributes = params.boolean("include_attributes", default=True)
    include_relations = params.boolean("include_relations", default=True)

    # The category limited here must be the same one the experiment jobs are tagged
    # with below ("pursuit"); a limit on any other name would never apply to them.
    limit_jobs_for_category(
        "pursuit", params.integer("num_pursuit_learners_active", default=8)
    )

    curriculum_repository_path = params.creatable_directory(
        "curriculum_repository_path"
    )

    # Job to build desired curriculum(s) which our learners use.
    # Each entry is (curriculum name, curriculum-building job, curriculum parameters).
    curriculum_dependencies = []
    for add_noise in (True, False):
        for shuffle in (True, False):
            curriculum_name = CURRICULUM_NAME_FORMAT.format(
                noise=add_noise,
                shuffled=shuffle,
                relations=include_relations,
                attributes=include_attributes,
            )
            curriculum_params = Parameters.from_mapping(CURRICULUM_PARAMS).unify(
                {
                    "add_noise": add_noise,
                    "shuffled": shuffle,
                    "include_attributes": include_attributes,
                    "include_relations": include_relations,
                }
            )
            curriculum_job = run_python_on_parameters(
                Locator(curriculum_name.split("-")),
                generate_curriculum_script,
                baseline_parameters.unify(
                    {"train_curriculum": curriculum_params.as_mapping()}
                )
                .unify(FIXED_PARAMETERS)
                .unify({"curriculum_repository_path": curriculum_repository_path}),
                depends_on=[],
            )
            curriculum_dependencies.append(
                (curriculum_name, curriculum_job, curriculum_params)
            )

    # The learner types do not vary between curricula, so look them up only once.
    object_learner_type = params.string(
        "object_learner.learner_type",
        valid_options=["pursuit", "subset", "pbv"],
        default="pursuit",
    )
    attribute_learner_type = params.string(
        "attribute_learner.learner_type",
        valid_options=["none", "pursuit", "subset"],
        default="pursuit",
    )
    relation_learner_type = params.string(
        "relation_learner.learner_type",
        valid_options=["none", "pursuit", "subset"],
        default="pursuit",
    )
    uses_pursuit = "pursuit" in (
        object_learner_type,
        attribute_learner_type,
        relation_learner_type,
    )

    # jobs to build experiment
    for (curriculum_str, curriculum_dep, curr_params) in curriculum_dependencies:
        experiment_name_string = EXPERIMENT_NAME_FORMAT.format(
            # "-" separates the parts of the experiment's locator below,
            # so it must not occur inside the curriculum's name.
            curriculum_name=curriculum_str.replace("-", "+"),
            object_learner=object_learner_type,
            attribute_learner=attribute_learner_type,
            relation_learner=relation_learner_type,
        )
        experiment_name = Locator(experiment_name_string.split("-"))

        # Note that the input parameters should include the root params and
        # anything else we want.
        experiment_params = baseline_parameters.unify(FIXED_PARAMETERS).unify(
            {
                "experiment": experiment_name_string,
                "experiment_group_dir": directory_for(experiment_name),
                "hypothesis_log_dir": directory_for(experiment_name) / "hypotheses",
                "learner_logging_path": directory_for(experiment_name),
                "log_learner_state": True,
                "resume_from_latest_logged_state": True,
                "load_from_curriculum_repository": curriculum_repository_path,
                "train_curriculum": curr_params,
            }
        )

        run_python_on_parameters(
            experiment_name,
            log_experiment_script,
            experiment_params,
            depends_on=[curriculum_dep],
            resource_request=SlurmResourceRequest.from_parameters(
                pursuit_resource_request_params
            )
            if uses_pursuit
            else None,
            category="pursuit" if uses_pursuit else "subset",
            use_pypy=True,
        )

    write_workflow_description()
def object_language_ablation_runner_entry_point(params: Parameters) -> None:
    """
    Schedule one experiment job for every object language ablation setting
    within the given ranges.

    The sweep covers the number of objects in a scene, the language accuracy,
    and every learner configuration listed in ``LEARNER_VALUES_TO_PARAMS``.
    Each job is put in the category named after its learner type; only ``"pursuit"``
    jobs get the resource request from the ``pursuit_resource_request`` namespace.
    The workflow description is written once all jobs have been scheduled.
    """
    initialize_vista_pegasus_wrapper(params)

    shared_settings = params.namespace("object_language_ablation")
    pursuit_request_settings = params.namespace("pursuit_resource_request")

    # Inclusive bounds on the number of objects in a scene.
    fewest_objects = params.integer("min_num_objects", default=1)
    most_objects = params.integer("max_num_objects", default=7)

    # Evenly spaced accuracies of the language with the situation, endpoints included.
    lowest_accuracy = params.floating_point("min_language_accuracy", default=0.1)
    highest_accuracy = params.floating_point("max_language_accuracy", default=0.5)
    accuracy_step_count = params.integer("num_language_accuracy_increment", default=5)
    accuracies = np.linspace(lowest_accuracy, highest_accuracy, accuracy_step_count)

    limit_jobs_for_category(
        "pursuit", params.integer("num_pursuit_learners_active", default=8)
    )

    for object_count in range(fewest_objects, most_objects + 1):
        for accuracy in accuracies:
            for learner_type in LEARNER_VALUES_TO_PARAMS:
                for params_label, learner_settings in LEARNER_VALUES_TO_PARAMS[
                    learner_type
                ]:
                    experiment_name_string = EXPERIMENT_NAME_FORMAT.format(
                        num_objects=object_count,
                        language_accuracy=accuracy,
                        learner_type=learner_type,
                        learner_params=params_label,
                    )
                    experiment_name = Locator(experiment_name_string.split("-"))

                    # Note that the input parameters should include the root params and
                    # anything else we want.
                    overrides = {
                        "experiment": experiment_name_string,
                        "experiment_group_dir": directory_for(experiment_name),
                        "hypothesis_log_dir": directory_for(experiment_name)
                        / "hypotheses",
                        "learner_logging_path": directory_for(experiment_name),
                        "log_learner_state": True,
                        "resume_from_latest_logged_state": True,
                        "train_curriculum": {
                            "accurate_language_percentage": float(accuracy)
                        },
                        "object_learner_type": learner_type,
                        "object_learner": learner_settings,
                        # We subtract one because the target object is a given
                        "num_noise_objects": object_count - 1,
                    }
                    experiment_settings = shared_settings.unify(
                        FIXED_PARAMETERS
                    ).unify(overrides)

                    if learner_type == "pursuit":
                        resource_request = SlurmResourceRequest.from_parameters(
                            pursuit_request_settings
                        )
                    else:
                        resource_request = None

                    run_python_on_parameters(
                        experiment_name,
                        log_experiment_script,
                        experiment_settings,
                        depends_on=[],
                        resource_request=resource_request,
                        category=learner_type,
                    )

    write_workflow_description()
def explicit_train_dev_test_split(
    corpus: KeyValueStore,
    *,
    train_ids: ValueArtifact[Path],
    dev_ids: ValueArtifact[Path],
    test_ids: ValueArtifact[Path],
    output_locator: Locator,
    exhaustive: bool = True,
    downsample_to: Optional[int] = None,
) -> DataSplit:
    """
    Explicit implementation for handling a train/dev/test split over a `KeyValueStore`.

    The split is done by lists of keys supplied explicitly by the caller:
    *train_ids*, *dev_ids* and *test_ids* each wrap the path of a keys file.

    If *exhaustive* is True then an exception will be thrown if a document does not
    get assigned to one of the three sets.
    This is to help prevent accidental omissions in the key lists.

    *downsample_to* is an optional integer to reduce the size of each part of the split
    for quicker debugging.  The reduction is carried out by `downsample`,
    which runs `vistautils.scripts.downsample_key_value_store`.

    The splitting job is located at *output_locator*; the three parts are located at
    its ``train``, ``dev`` and ``test`` children.

    See `DataSplit` for the output description.
    """
    train_locator = output_locator / "train"
    dev_locator = output_locator / "dev"
    test_locator = output_locator / "test"

    train_zip = directory_for(train_locator) / "train.zip"
    dev_zip = directory_for(dev_locator) / "dev.zip"
    test_zip = directory_for(test_locator) / "test.zip"

    split_job = run_python_on_parameters(
        output_locator,
        split_entry_point,
        parameters={
            "input": corpus.input_parameters(),
            "explicit_split": {
                "train": {"keys_file": train_ids.value, "output_file": train_zip},
                "dev": {"keys_file": dev_ids.value, "output_file": dev_zip},
                "test": {"keys_file": test_ids.value, "output_file": test_zip},
                "must_be_exhaustive": exhaustive,
            },
        },
        # The job reads the three keys files as well as the corpus, so it has to wait
        # for whatever produces them; otherwise it could start before they exist.
        depends_on=[corpus, train_ids, dev_ids, test_ids],
    )

    deps = [
        corpus.depends_on,
        split_job,
        train_ids.depends_on,
        dev_ids.depends_on,
        test_ids.depends_on,
    ]
    train_store = ZipKeyValueStore(train_zip, locator=train_locator, depends_on=deps)
    dev_store = ZipKeyValueStore(dev_zip, locator=dev_locator, depends_on=deps)
    test_store = ZipKeyValueStore(test_zip, locator=test_locator, depends_on=deps)

    if downsample_to is None:
        return DataSplit(train=train_store, dev=dev_store, test=test_store)

    return DataSplit(
        train=downsample(train_store, limit=downsample_to),
        dev=downsample(dev_store, limit=downsample_to),
        test=downsample(test_store, limit=downsample_to),
    )