def main(params: Parameters):
    # List of the six ACE corpus /adj/ folders (one for each type: bc, bn, cts, nw, un, wl)
    corpus_paths = params.arbitrary_list("corpus_paths")
    # Path to the project config file template (JSON file)
    json_template_path = params.existing_file("json_template_path")
    # Path to the cached_annotation_ser directory
    annotation_ser_path = params.existing_directory("annotation_ser_path")
    # Path to the cached_xmi directory
    cached_xmi_path = params.existing_directory("cached_xmi_path")
    # Path to the target corpus (narrowed ACE corpus)
    cached_ace_data_path = params.creatable_directory("cached_ace_data_path")
    # List of users (strings)
    user_list = params.arbitrary_list("user_list")
    # List of event types (formatted as "EVENT_TYPE.SUBTYPE" strings)
    event_list = params.arbitrary_list("event_list")
    # Output directory the configured projects are moved to (use an empty directory)
    output_dir_path = params.creatable_directory("output_dir_path")

    flatten_ace_data(corpus_paths, cached_ace_data_path)
    complete_map = get_complete_project_to_doc_mapping(cached_ace_data_path)

    for user in user_list:
        for event_type in event_list:
            # "All" expands to one project per event type in the corpus.
            events = complete_map if event_type == "All" else [event_type]
            for event in events:
                configure_and_generate_project(
                    json_template_path=json_template_path,
                    event_name=event,
                    user_name=user,
                    event_doc_map=complete_map,
                    cached_ser_path=annotation_ser_path,
                    cached_xmi_path=cached_xmi_path,
                    output_dir_path=output_dir_path,
                )
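
# Hypothetical usage sketch: this entry point would typically be driven by a
# parameter file. The keys below mirror the params read in main() above, but
# every concrete path and value here is an illustrative placeholder, not a
# value from the original project.
if __name__ == "__main__":
    main(
        Parameters.from_mapping(
            {
                "corpus_paths": ["/data/ace/bc/adj/", "/data/ace/bn/adj/"],
                "json_template_path": "templates/project_template.json",
                "annotation_ser_path": "cache/cached_annotation_ser",
                "cached_xmi_path": "cache/cached_xmi",
                "cached_ace_data_path": "cache/ace_data",
                "user_list": ["annotator1", "annotator2"],
                "event_list": ["CONFLICT.ATTACK"],
                "output_dir_path": "output/projects",
            }
        )
    )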
def main(params: Parameters):
    curriculum_repository_path = params.creatable_directory(
        CURRICULUM_REPOSITORY_PATH_PARAMETER
    )
    language_mode = params.enum(
        LANGUAGE_MODE_PARAMETER, LanguageMode, default=LanguageMode.ENGLISH
    )
    train_curriculum, test_curriculum = curriculum_from_params(
        params, language_mode=language_mode
    )
    strict_curriculum = ExperimentCurriculum(
        evaluate_curriculum(train_curriculum), evaluate_curriculum(test_curriculum)
    )
    write_experiment_curriculum(
        curriculum_repository_path,
        params,
        language_mode,
        strict_curriculum,
        ignored_parameters=immutableset(
            IGNORED_PARAMETERS.union(
                {CURRICULUM_REPOSITORY_PATH_PARAMETER, LANGUAGE_MODE_PARAMETER}
            )
        ),
    )
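
# Scripts in this family are conventionally launched from a parameter file.
# A minimal sketch, assuming vistautils' parameters_only_entry_point helper
# is available to this module:
if __name__ == "__main__":
    from vistautils.parameters_only_entrypoint import parameters_only_entry_point

    parameters_only_entry_point(main)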
def from_parameters(params: Parameters) -> "WorkflowBuilder": wb = WorkflowBuilder( name=params.string("workflow_name", default="Workflow"), created_by=params.string("workflow_created", default="Default Constructor"), workflow_directory=params.creatable_directory( "workflow_directory"), default_site=params.string("site"), conda_script_generator=CondaJobScriptGenerator.from_parameters( params), docker_script_generator=DockerJobScriptGenerator.from_parameters( params), namespace=params.string("namespace"), default_resource_request=ResourceRequest.from_parameters(params), data_configuration=params.string("data_configuration", default="sharedfs"), experiment_name=params.string("experiment_name", default=""), ) if params.boolean("include_nas", default=True): add_local_nas_to_sites( wb._sites_catalog, params # pylint: disable=protected-access ) if params.boolean("include_saga", default=True): add_saga_cluster_to_sites( wb._sites_catalog, params # pylint: disable=protected-access ) configure_saga_properities( wb._properties, params # pylint: disable=protected-access ) return wb
def main(params: Parameters):
    # create_cas_from_apf(TEST_APF_PATH, TEST_SGM_PATH, OUTPUT_DIR_PATH)
    corpus_paths = params.arbitrary_list("corpus_paths")
    output_xmi_dir_path = params.creatable_directory("cached_xmi_path")
    type_system_path = params.existing_file("type_system_path")
    cas_xmi_template_path = params.existing_file("cas_xmi_template_path")

    # Load the typesystem
    with type_system_path.open("rb") as file:
        typesystem = load_typesystem(file)

    # Load the XMI template
    with cas_xmi_template_path.open("rb") as cas_xmi_file:
        cas_template = load_cas_from_xmi(cas_xmi_file, typesystem=typesystem)

    for ace_corpus_path in corpus_paths:
        print("Processing apf files from: " + ace_corpus_path)
        start_time = time.perf_counter()
        for filename in os.listdir(ace_corpus_path):
            if filename.endswith(".apf.xml"):
                print("Processing " + filename)
                create_cas_from_apf(
                    apf_filename=filename,
                    apf_path=ace_corpus_path + filename,
                    source_sgm_path=ace_corpus_path + filename.replace(".apf.xml", ".sgm"),
                    output_dir_path=output_xmi_dir_path,
                    typesystem=typesystem,
                    cas_template=cas_template,
                )
        elapsed_time = time.perf_counter() - start_time
        print(f"Processing completed. Time elapsed: {elapsed_time:0.4f} seconds")
def main(params: Parameters) -> None:
    root_output_directory = params.creatable_directory("output_directory")
    curriculum_string = params.string(
        "curriculum", valid_options=STR_TO_CURRICULUM.keys(), default="phase1"
    )
    language_mode = params.enum(
        "language_mode", LanguageMode, default=LanguageMode.ENGLISH
    )
    language_string = str(language_mode).split(".")[-1].lower()
    num_samples = params.optional_positive_integer("num_samples")
    num_noise_objects = params.optional_positive_integer("num_noise_objects")

    phase1_curriculum_dir = root_output_directory / language_string / curriculum_string
    phase1_curriculum_dir.mkdir(parents=True, exist_ok=True)
    # We lazily instantiate the curriculum so we don't need to worry
    # about any of them we don't actually use.
    curriculum_to_render = STR_TO_CURRICULUM[curriculum_string](
        num_samples, num_noise_objects, phase2_language_generator(language_mode)
    )

    sort_by_utterance_length_flag = params.boolean("sort_by_utterance", default=False)
    if sort_by_utterance_length_flag:
        random_seed = params.integer("random_seed", default=1)
        CurriculumToHtmlDumper().dump_to_html_as_sorted_by_utterance_length(
            curriculum_to_render,
            output_directory=phase1_curriculum_dir,
            title="GAILA Phase 1 Curriculum Sorted by Utterance Length",
            curriculum_string=curriculum_string,
            random_seed=random_seed,
        )
    else:
        CurriculumToHtmlDumper().dump_to_html(
            curriculum_to_render,
            output_directory=phase1_curriculum_dir,
            title="GAILA Phase 1 Curriculum",
        )
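
# A hypothetical parameter mapping for this renderer; the keys mirror the
# params read above, while the concrete values are illustrative only.
EXAMPLE_RENDER_PARAMS = Parameters.from_mapping(
    {
        "output_directory": "renders",
        "curriculum": "phase1",
        "language_mode": "ENGLISH",
        "num_samples": 5,
        "sort_by_utterance": True,
        "random_seed": 42,
    }
)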
def from_parameters(params: Parameters) -> "SlurmPythonRunner": return SlurmPythonRunner( conda_config=CondaConfiguration.from_parameters(params), spack_config=SpackConfiguration.from_parameters(params), log_base_directory=params.creatable_directory( "log_directory").absolute(), )
def create_logger(params: Parameters) -> "LearningProgressHtmlLogger":
    output_dir = params.creatable_directory("experiment_group_dir")
    experiment_name = params.string("experiment")
    include_links_to_images = params.optional_boolean("include_image_links")
    num_pretty_descriptions = params.positive_integer(
        "num_pretty_descriptions", default=3
    )
    sort_by_length = params.boolean(
        "sort_learner_descriptions_by_length", default=False
    )

    logging_dir = output_dir / experiment_name
    logging_dir.mkdir(parents=True, exist_ok=True)
    output_html_path = str(logging_dir / "index.html")

    if include_links_to_images is None:
        include_links_to_images = False

    logging.info("Experiment will be logged to %s", output_html_path)

    with open(output_html_path, "w") as outfile:
        html_dumper = CurriculumToHtmlDumper()
        outfile.write(f"<head>\n\t<style>{CSS}\n\t</style>\n</head>")
        outfile.write(f"\n<body>\n\t<h1>{experiment_name}</h1>")
        # A JavaScript function to allow toggling perception information
        outfile.write(
            """
            <script>
            function myFunction(id) {
                var x = document.getElementById(id);
                if (x.style.display === "none") {
                    x.style.display = "block";
                } else {
                    x.style.display = "none";
                }
            }
            </script>
            """
        )

    return LearningProgressHtmlLogger(
        outfile_dir=output_html_path,
        html_dumper=html_dumper,
        include_links_to_images=include_links_to_images,
        num_pretty_descriptions=num_pretty_descriptions,
        sort_by_length=sort_by_length,
    )
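
# Sketch of intended use, assuming create_logger is exposed as a factory on
# LearningProgressHtmlLogger and that the logger offers pre_observer/post_observer
# companion methods (assumptions from the surrounding codebase, not shown here):
logger = LearningProgressHtmlLogger.create_logger(params)
pre_observer = logger.pre_observer()
post_observer = logger.post_observer()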
def from_parameters(params: Parameters) -> "WorkflowBuilder": workflow_directory = params.creatable_directory("workflow_directory") replica_catalog = workflow_directory / "rc.dat" if replica_catalog.exists(): replica_catalog.unlink() replica_catalog.touch(mode=0o744) return WorkflowBuilder( name=params.string("workflow_name", default="Workflow"), created_by=params.string("workflow_created", default="Default Constructor"), workflow_directory=workflow_directory, default_site=params.string("site"), conda_script_generator=CondaJobScriptGenerator.from_parameters( params), namespace=params.string("namespace"), default_resource_request=ResourceRequest.from_parameters(params), replica_catalog=replica_catalog, )
def _split_into_even_slices(
    input_source: KeyValueSource[str, bytes], params: Parameters
):
    output_directory = params.creatable_directory("output_dir")
    slices = params.positive_integer("num_slices")
    random_seed = params.optional_positive_integer("random_seed")
    slice_paths = [output_directory / "{!s}.zip".format(i) for i in range(slices)]
    CharSink.to_file(output_directory / "_slices.txt").write(
        "\n".join(str(x) for x in slice_paths)
    )
    output_sinks = [
        KeyValueSink.zip_bytes_sink(slice_path) for slice_path in slice_paths
    ]
    # this is the magic incantation for handling variable-length lists of context managers
    with ExitStack() as exit_stack:
        for output_sink in output_sinks:
            exit_stack.enter_context(output_sink)
        # sort to guarantee a deterministic iteration order
        input_keys = sorted(input_source.keys())  # type: ignore
        if random_seed:
            random.seed(random_seed)
            random.shuffle(input_keys)
        for (i, k) in enumerate(input_keys):
            output_sinks[i % slices].put(k, input_source[k])
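
# A minimal sketch of consuming the slices written above, assuming vistautils
# offers the matching KeyValueSource.zip_bytes_source constructor and that the
# source supports keys() and item access as used in the splitter itself.
from pathlib import Path
from typing import Dict


def _read_all_slices(output_directory: Path) -> Dict[str, bytes]:
    merged: Dict[str, bytes] = {}
    slices_index = output_directory / "_slices.txt"
    for line in slices_index.read_text().splitlines():
        with KeyValueSource.zip_bytes_source(Path(line)) as slice_source:
            for key in slice_source.keys():
                merged[key] = slice_source[key]
    return merged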
def example_workflow(params: Parameters):  # pragma: no cover
    """
    An example script to generate a container workflow for submission to Pegasus.
    """
    tmp_path = params.creatable_directory("example_root_dir")
    docker_tar = params.creatable_file("docker_tar")
    docker_build_dir = params.existing_directory("docker_build_dir")
    docker_image_name = params.string(
        "docker_image_name", default="pegasus_wrapper_container_demo"
    )
    docker_image_tag = params.string("docker_image_tag", default="0.2")
    mongo_db_tar = params.string(
        "mongo_db_tar", default="/nas/gaia/shared/cluster/docker/mongo-4.4.tar"
    )
    mongo_db_data = "/scratch/dockermount/pegasus_wrapper_tmp/data"
    mongo_db_config = "/scratch/dockermount/pegasus_wrapper_tmp/config"

    # Generate parameters for initializing a workflow.
    # We recommend making the workflow directory, site, and partition parameters
    # in a research workflow.
    workflow_params = Parameters.from_mapping(
        {
            "workflow_name": "Test",
            "workflow_created": "Testing",
            "workflow_log_dir": str(tmp_path / "log"),
            "workflow_directory": str(tmp_path / "working"),
            "site": "saga",
            "namespace": "test",
            "home_dir": str(tmp_path),
            "partition": "scavenge",
        }
    )

    saga31_request = SlurmResourceRequest.from_parameters(
        Parameters.from_mapping({"run_on_single_node": "saga31", "partition": "gaia"})
    )

    workflow_params = workflow_params.unify(params)

    # Our source input for the sample jobs
    input_file = tmp_path / "raw_nums.txt"
    add_y_output_file_nas = tmp_path / "nums_y.txt"
    sorted_output_file_nas = tmp_path / "sorted.txt"
    random = Random()
    random.seed(0)
    nums = [int(random.random() * 100) for _ in range(0, 25)]

    # Base job locator
    job_locator = Locator(("jobs",))
    docker_python_root = Path("/home/app/")

    job_profile = PegasusProfile(
        namespace="pegasus", key="transfer.bypass.input.staging", value="True"
    )

    # Write a list of numbers out to be able to run the workflow
    with input_file.open("w") as mult_file:
        mult_file.writelines(f"{num}\n" for num in nums)

    initialize_vista_pegasus_wrapper(workflow_params)

    build_container = run_bash(
        job_locator / "build_docker",
        command=[
            "mkdir -p /scratch/dockermount/pegasus_wrapper_tmp",
            f"cd {docker_build_dir}",
            f"docker build . -t {docker_image_name}:{docker_image_tag}",
            f"docker save -o /scratch/dockermount/pegasus_wrapper_tmp/{docker_tar.name} {docker_image_name}:{docker_image_tag}",
            f"cp /scratch/dockermount/pegasus_wrapper_tmp/{docker_tar.name} {docker_tar.absolute()}",
            f"chmod go+r {docker_tar.absolute()}",
            f"docker load --input {mongo_db_tar}",
            f"mkdir -p {mongo_db_data}",
            f"mkdir -p {mongo_db_config}",
        ],
        depends_on=[],
        resource_request=saga31_request,
    )

    python36 = add_container(
        f"{docker_image_name}:{docker_image_tag}",
        "docker",
        str(docker_tar.absolute()),
        image_site="saga",
        bypass_staging=True,
    )
    mongo4_4 = add_container(
        "mongo:4.4", "docker", mongo_db_tar, image_site="saga", bypass_staging=True
    )

    start_mongo = start_docker_as_service(
        mongo4_4,
        depends_on=[build_container],
        # Note: the second mount was missing its host:container separator in the
        # original ("{mongo_db_config}/etc/custom"); a colon is assumed here.
        mounts=[f"{mongo_db_data}:/data/db", f"{mongo_db_config}:/etc/custom"],
        docker_args="-p 27017:27017",
        resource_request=saga31_request,
    )

    add_y_job = run_python_on_args(
        job_locator / "add",
        docker_python_root / "add_y.py",
        set_args=f"{input_file} {add_y_output_file_nas} --y 10",
        depends_on=[build_container],
        job_profiles=[job_profile],
        resource_request=saga31_request,
        container=python36,
        input_file_paths=[input_file],
        output_file_paths=[add_y_output_file_nas],
    )

    sort_job = run_python_on_parameters(
        job_locator / "sort",
        sort_nums_in_file,
        {"input_file": add_y_output_file_nas, "output_file": sorted_output_file_nas},
        depends_on=[add_y_job],
        container=python36,
        job_profiles=[job_profile],
        resource_request=saga31_request,
        input_file_paths=add_y_output_file_nas,
        output_file_paths=sorted_output_file_nas,
    )

    _ = stop_docker_as_service(
        mongo4_4, depends_on=[start_mongo, sort_job], resource_request=saga31_request
    )

    # Generate the Pegasus DAX file & a submit script
    write_workflow_description(tmp_path)
def example_workflow(params: Parameters):
    """
    An example script to generate a workflow for submission to Pegasus.
    """
    tmp_path = params.creatable_directory("example_root_dir")

    # Generate parameters for initializing a workflow.
    # We recommend making the workflow directory, site, and partition parameters
    # in a research workflow.
    workflow_params = Parameters.from_mapping(
        {
            "workflow_name": "Test",
            "workflow_created": "Testing",
            "workflow_log_dir": str(tmp_path / "log"),
            "workflow_directory": str(tmp_path / "working"),
            "site": "saga",
            "namespace": "test",
        }
    )
    workflow_params = workflow_params.unify(params)

    # Our source input for the sample jobs
    multiply_input_file = tmp_path / "raw_nums.txt"
    random = Random()
    random.seed(0)
    nums = [int(random.random() * 100) for _ in range(0, 25)]
    multiply_output_file = tmp_path / "multiplied_nums.txt"
    sorted_output_file = tmp_path / "sorted_nums.txt"

    # Base job locator
    job_locator = Locator(("jobs",))

    # Write a list of numbers out to be able to run the workflow
    with multiply_input_file.open("w") as mult_file:
        mult_file.writelines(f"{num}\n" for num in nums)

    initialize_vista_pegasus_wrapper(workflow_params)

    multiply_artifact = ValueArtifact(
        multiply_output_file,
        depends_on=run_python_on_parameters(
            job_locator / "multiply",
            multiply_by_x,
            {
                "input_file": multiply_input_file,
                "output_file": multiply_output_file,
                "x": 4,
                "logfile": str(tmp_path / "multiply_log.txt"),
            },
            depends_on=[],
        ),
        locator=Locator("multiply"),
    )
    run_python_on_parameters(
        job_locator / "sort",
        sort_nums_in_file,
        {"input_file": multiply_output_file, "output_file": sorted_output_file},
        depends_on=[multiply_artifact],
        # To use a different resource for some task, pass a request like:
        # resource_request=SlurmResourceRequest.from_parameters(slurm_params),
    )

    # Generate the Pegasus DAX file
    dax_file = write_workflow_description(tmp_path)

    submit_script = tmp_path / "submit_script.sh"
    # Our attempt at an easy submit file. It MAY NOT be accurate for more
    # complicated workflows, but it does work for this simple example.
    # See https://github.com/isi-vista/vista-pegasus-wrapper/issues/27
    build_submit_script(
        submit_script,
        str(dax_file),
        experiment_directory(),  # pylint:disable=protected-access
    )
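
# For context, a plausible sketch of the multiply_by_x job scheduled above.
# The real implementation lives elsewhere in the repo; this version simply
# honors the parameter names passed in the mapping ("input_file",
# "output_file", "x") and is illustrative only.
def multiply_by_x_sketch(params: Parameters) -> None:
    input_file = params.existing_file("input_file")
    output_file = params.creatable_file("output_file")
    x = params.integer("x")
    # Multiply each input number by x and write the results, one per line.
    products = [int(line) * x for line in input_file.read_text().splitlines()]
    output_file.write_text("".join(f"{num}\n" for num in products))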
def example_workflow(params: Parameters):
    """
    An example script to generate a workflow for submission to Pegasus.
    """
    tmp_path = params.creatable_directory("example_root_dir")

    # Generate parameters for initializing a workflow.
    # We recommend making the workflow directory, site, and partition parameters
    # in a research workflow.
    workflow_params = Parameters.from_mapping(
        {
            "workflow_name": "Test",
            "workflow_created": "Testing",
            "workflow_log_dir": str(tmp_path / "log"),
            "workflow_directory": str(tmp_path / "working"),
            "site": "saga",
            "namespace": "test",
            "home_dir": str(tmp_path),
            "partition": "scavenge",
        }
    )
    workflow_params = workflow_params.unify(params)

    # Our source input for the sample jobs
    multiply_input_file = tmp_path / "raw_nums.txt"
    random = Random()
    random.seed(0)
    nums = [int(random.random() * 100) for _ in range(0, 25)]
    multiply_output_file = tmp_path / "multiplied_nums.txt"
    sorted_output_file = tmp_path / "sorted_nums.txt"
    add_output_file = tmp_path / "add_nums.txt"

    # Base job locator
    job_locator = Locator(("jobs",))

    # Write a list of numbers out to be able to run the workflow
    with multiply_input_file.open("w") as mult_file:
        mult_file.writelines(f"{num}\n" for num in nums)

    initialize_vista_pegasus_wrapper(workflow_params)

    multiply_artifact = ValueArtifact(
        multiply_output_file,
        depends_on=run_python_on_parameters(
            job_locator / "multiply",
            multiply_by_x,
            {
                "input_file": multiply_input_file,
                "output_file": multiply_output_file,
                "x": 4,
                "logfile": str(tmp_path / "multiply_log.txt"),
            },
            depends_on=[],
        ),
        locator=Locator("multiply"),
    )

    # You can also just track the dependency node itself to pass to a future job
    # if you don't need the value portion of an artifact.
    mul_dep = run_python_on_parameters(
        job_locator / "sort",
        sort_nums_in_file,
        {"input_file": multiply_output_file, "output_file": sorted_output_file},
        depends_on=[multiply_artifact],
        # To use a different resource for some task, pass a request like:
        # resource_request=SlurmResourceRequest.from_parameters(slurm_params),
    )

    run_python_on_args(
        job_locator / "add",
        add_y,
        set_args=f"{sorted_output_file} {add_output_file} --y 10",
        depends_on=[mul_dep],
        category="add",  # Can be used as a custom category for job limits
    )

    # If you want to limit the number of active jobs in a category, use:
    # limit_jobs_for_category("scavenge", 1)

    # Generate the Pegasus DAX file & a submit script
    write_workflow_description(tmp_path)
def integrated_experiment_entry_point(params: Parameters) -> None:
    initialize_vista_pegasus_wrapper(params)

    baseline_parameters = params.namespace("integrated_learners_experiment")
    pursuit_resource_request_params = params.namespace("pursuit_resource_request")

    # This code is commented out but may be used in the near future to add
    # language-ablation capabilities to this curriculum.
    # Get the minimum and maximum accuracy of the language with the situation:
    # min_language_accuracy = params.floating_point("min_language_accuracy", default=0.1)
    # max_language_accuracy = params.floating_point("max_language_accuracy", default=0.5)
    # num_language_accuracy_increment = params.integer(
    #     "num_language_accuracy_increment", default=5
    # )
    # values_for_accuracy = np.linspace(
    #     min_language_accuracy, max_language_accuracy, num_language_accuracy_increment
    # )

    # Get whether attributes or relations should be included
    include_attributes = params.boolean("include_attributes", default=True)
    include_relations = params.boolean("include_relations", default=True)

    limit_jobs_for_category(
        "pursuit_job_limit", params.integer("num_pursuit_learners_active", default=8)
    )

    curriculum_repository_path = params.creatable_directory("curriculum_repository_path")

    # Job to build the desired curriculum(s) which our learners use
    curriculum_dependencies = immutableset(
        (
            CURRICULUM_NAME_FORMAT.format(
                noise=add_noise,
                shuffled=shuffle,
                relations=include_relations,
                attributes=include_attributes,
            ),
            run_python_on_parameters(
                Locator(
                    CURRICULUM_NAME_FORMAT.format(
                        noise=add_noise,
                        shuffled=shuffle,
                        relations=include_relations,
                        attributes=include_attributes,
                    ).split("-")
                ),
                generate_curriculum_script,
                baseline_parameters.unify(
                    {
                        "train_curriculum": Parameters.from_mapping(CURRICULUM_PARAMS)
                        .unify(
                            {
                                "add_noise": add_noise,
                                "shuffled": shuffle,
                                "include_attributes": include_attributes,
                                "include_relations": include_relations,
                            }
                        )
                        .as_mapping()
                    }
                )
                .unify(FIXED_PARAMETERS)
                .unify({"curriculum_repository_path": curriculum_repository_path}),
                depends_on=[],
            ),
            Parameters.from_mapping(CURRICULUM_PARAMS).unify(
                {
                    "add_noise": add_noise,
                    "shuffled": shuffle,
                    "include_attributes": include_attributes,
                    "include_relations": include_relations,
                }
            ),
        )
        for add_noise in (True, False)
        for shuffle in (True, False)
    )

    # Jobs to build the experiments
    for (curriculum_str, curriculum_dep, curr_params) in curriculum_dependencies:
        object_learner_type = params.string(
            "object_learner.learner_type",
            valid_options=["pursuit", "subset", "pbv"],
            default="pursuit",
        )
        # The original read "attribute_learner.learner__type" (double underscore);
        # the single-underscore key is assumed here to match its siblings.
        attribute_learner_type = params.string(
            "attribute_learner.learner_type",
            valid_options=["none", "pursuit", "subset"],
            default="pursuit",
        )
        relation_learner_type = params.string(
            "relation_learner.learner_type",
            valid_options=["none", "pursuit", "subset"],
            default="pursuit",
        )

        experiment_name_string = EXPERIMENT_NAME_FORMAT.format(
            curriculum_name=curriculum_str.replace("-", "+"),
            object_learner=object_learner_type,
            attribute_learner=attribute_learner_type,
            relation_learner=relation_learner_type,
        )
        experiment_name = Locator(experiment_name_string.split("-"))

        # Note that the input parameters should include the root params and
        # anything else we want.
        experiment_params = baseline_parameters.unify(FIXED_PARAMETERS).unify(
            {
                "experiment": experiment_name_string,
                "experiment_group_dir": directory_for(experiment_name),
                "hypothesis_log_dir": directory_for(experiment_name) / "hypotheses",
                "learner_logging_path": directory_for(experiment_name),
                "log_learner_state": True,
                "resume_from_latest_logged_state": True,
                "load_from_curriculum_repository": curriculum_repository_path,
                "train_curriculum": curr_params,
            }
        )

        uses_pursuit = "pursuit" in (
            object_learner_type,
            attribute_learner_type,
            relation_learner_type,
        )
        run_python_on_parameters(
            experiment_name,
            log_experiment_script,
            experiment_params,
            depends_on=[curriculum_dep],
            resource_request=SlurmResourceRequest.from_parameters(
                pursuit_resource_request_params
            )
            if uses_pursuit
            else None,
            category="pursuit" if uses_pursuit else "subset",
            use_pypy=True,
        )

    write_workflow_description()
def create_gaze_ablation_entry_point(params: Parameters) -> None:
    """This function creates all possible gaze ablation param files within a given range"""
    # get the parameters directory, which must be non-null
    parameters_dir = params.creatable_directory("parameters_directory")
    if not parameters_dir:
        raise RuntimeError(
            "Must specify a directory where you wish to write your param files"
        )
    # get the minimum and maximum number of objects in a scene
    min_num_objects = params.integer("min_num_objects", default=1)
    max_num_objects = params.integer("max_num_objects", default=7)
    # this gets the number of different accuracies to try; default = increment by 0.1
    num_accuracy_increments = params.integer("num_increments", default=11)
    values_for_accuracy = np.linspace(0, 1, num_accuracy_increments)
    # the number of noise instances to be included
    min_num_noise_instances = params.integer("min_num_noise", default=0)
    max_num_noise_instances = params.integer("max_num_noise", default=0)
    # get the number of instances in the entire curriculum
    min_num_instances_in_curriculum = params.integer("min_instances", default=10)
    max_num_instances_in_curriculum = params.integer("max_instances", default=20)

    # all possible numbers of noise instances
    for num_noise_instances in range(
        min_num_noise_instances, max_num_noise_instances + 1
    ):
        # all possible numbers of instances in the curriculum
        for num_instances in range(
            min_num_instances_in_curriculum, max_num_instances_in_curriculum + 1
        ):
            # all possible numbers of objects in an instance
            for num_objects_in_instance in range(min_num_objects, max_num_objects + 1):
                # all possible accuracies
                for prob_given in values_for_accuracy:
                    for prob_not_given in values_for_accuracy:
                        # both ignoring and perceiving gaze
                        for add_gaze in [True, False]:
                            # add the required arguments to create a unique filename
                            file_name = FILE_NAME_STRING.format(
                                num_instances=num_instances,
                                num_noise_instances=num_noise_instances,
                                num_objects_in_instance=num_objects_in_instance,
                                prob_given=prob_given,
                                prob_not_given=prob_not_given,
                                add_gaze=add_gaze,
                            )
                            # format the arguments in the parameter file and write them out
                            param_file_string = PARAM_FILE_STRING.format(
                                experiment=file_name,
                                num_instances=num_instances,
                                num_noise_instances=num_noise_instances,
                                num_objects_in_instance=num_objects_in_instance,
                                add_gaze=add_gaze,
                                prob_given=prob_given,
                                prob_not_given=prob_not_given,
                            )
                            with open(f"{parameters_dir}/{file_name}", "a") as f:
                                f.write(param_file_string)
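
# FILE_NAME_STRING and PARAM_FILE_STRING are module-level templates defined
# elsewhere; the sketch below shows a hypothetical shape consistent with the
# format fields filled in above (the real templates' wording is not preserved).
FILE_NAME_STRING_EXAMPLE = (
    "gaze_ablation_{num_instances}-instances_{num_noise_instances}-noise_"
    "{num_objects_in_instance}-objects_{prob_given}-{prob_not_given}_{add_gaze}"
)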