def test_protocol_graph_execution(calculation_backend, compute_resources):

    if calculation_backend is not None:
        calculation_backend.start()

    protocol_a = DummyInputOutputProtocol("protocol_a")
    protocol_a.input_value = 1
    protocol_b = DummyInputOutputProtocol("protocol_b")
    protocol_b.input_value = ProtocolPath("output_value", protocol_a.id)

    protocol_graph = ProtocolGraph()
    protocol_graph.add_protocols(protocol_a, protocol_b)

    with tempfile.TemporaryDirectory() as directory:

        results = protocol_graph.execute(
            directory, calculation_backend, compute_resources
        )

        final_result = results[protocol_b.id]

        if calculation_backend is not None:
            final_result = final_result.result()

        with open(final_result[1]) as file:
            results_b = json.load(file, cls=TypedJSONDecoder)

        assert results_b[".output_value"] == protocol_a.input_value

        if compute_resources is not None:
            assert protocol_b.output_value == protocol_a.input_value

    if calculation_backend is not None:
        calculation_backend.stop()

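
# For reference, a minimal sketch of the ``DummyInputOutputProtocol`` helper
# which the graph tests rely on. This is an illustrative assumption rather
# than the project's actual definition (the import paths below may also vary
# between evaluator versions): all the tests require is a protocol which
# copies its single input to its single output when executed.
from openff.evaluator.attributes import UNDEFINED
from openff.evaluator.workflow import Protocol, workflow_protocol
from openff.evaluator.workflow.attributes import InputAttribute, OutputAttribute


@workflow_protocol()
class DummyInputOutputProtocol(Protocol):
    """A dummy protocol which echoes its input as its output."""

    input_value = InputAttribute(
        docstring="A dummy input value.", type_hint=int, default_value=UNDEFINED
    )
    output_value = OutputAttribute(docstring="A dummy output value.", type_hint=int)

    def _execute(self, directory, available_resources):
        # Copy the input straight through to the output.
        self.output_value = self.input_value
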
def test_protocol_group_resume():
    """A test that protocol groups can recover after being killed
    (e.g. by a worker being killed due to hitting a wallclock limit).
    """

    compute_resources = ComputeResources()

    # Fake a protocol group which executes the first
    # two protocols and then 'gets killed'.
    protocol_a = DummyInputOutputProtocol("protocol_a")
    protocol_a.input_value = 1
    protocol_b = DummyInputOutputProtocol("protocol_b")
    protocol_b.input_value = ProtocolPath("output_value", protocol_a.id)

    protocol_group_a = ProtocolGroup("group_a")
    protocol_group_a.add_protocols(protocol_a, protocol_b)

    protocol_graph = ProtocolGraph()
    protocol_graph.add_protocols(protocol_group_a)
    protocol_graph.execute("graph_a", compute_resources=compute_resources)

    # Remove the output file so it appears that the protocol group had not
    # completed.
    os.unlink(
        os.path.join(
            "graph_a", protocol_group_a.id, f"{protocol_group_a.id}_output.json"
        )
    )

    # Build the 'full' group including the last two protocols which
    # 'had not been executed' before the group was 'killed'.
    protocol_a = DummyInputOutputProtocol("protocol_a")
    protocol_a.input_value = 1
    protocol_b = DummyInputOutputProtocol("protocol_b")
    protocol_b.input_value = ProtocolPath("output_value", protocol_a.id)
    protocol_c = DummyInputOutputProtocol("protocol_c")
    protocol_c.input_value = ProtocolPath("output_value", protocol_b.id)
    protocol_d = DummyInputOutputProtocol("protocol_d")
    protocol_d.input_value = ProtocolPath("output_value", protocol_c.id)

    protocol_group_a = ProtocolGroup("group_a")
    protocol_group_a.add_protocols(protocol_a, protocol_b, protocol_c, protocol_d)

    protocol_graph = ProtocolGraph()
    protocol_graph.add_protocols(protocol_group_a)
    protocol_graph.execute("graph_a", compute_resources=compute_resources)

    assert all(x != UNDEFINED for x in protocol_group_a.outputs.values())

def test_protocol_graph_simple(protocols_a, protocols_b):

    # Make sure that the graph can merge simple protocols
    # when they are added one after the other.
    protocol_graph = ProtocolGraph()
    protocol_graph.add_protocols(*protocols_a)

    dependants_graph = protocol_graph._build_dependants_graph(
        protocol_graph.protocols, False, apply_reduction=True
    )

    assert len(protocol_graph.protocols) == len(protocols_a)
    assert len(dependants_graph) == len(protocols_a)

    n_root_protocols = len(protocol_graph.root_protocols)

    protocol_graph.add_protocols(*protocols_b)

    dependants_graph = protocol_graph._build_dependants_graph(
        protocol_graph.protocols, False, apply_reduction=False
    )

    assert len(protocol_graph.protocols) == len(protocols_a)
    assert len(dependants_graph) == len(protocols_a)
    assert len(protocol_graph.root_protocols) == n_root_protocols

    # Currently the graph shouldn't merge protocols which are all
    # added in a single call.
    protocol_graph = ProtocolGraph()
    protocol_graph.add_protocols(*protocols_a, *protocols_b)

    dependants_graph = protocol_graph._build_dependants_graph(
        protocol_graph.protocols, False, apply_reduction=False
    )

    assert len(protocol_graph.protocols) == len(protocols_a) + len(protocols_b)
    assert len(dependants_graph) == len(protocols_a) + len(protocols_b)
    assert len(protocol_graph.root_protocols) == 2 * n_root_protocols

def test_protocol_group_merging():
    def build_protocols(prefix):

        #     .-------------------.
        #     |          / i - j -|- b
        # a - | g - h - |         |
        #     |          \ k - l -|- c
        #     .-------------------.
        protocol_a = DummyInputOutputProtocol(prefix + "protocol_a")
        protocol_a.input_value = 1

        fork_protocols = build_fork(prefix)
        fork_protocols[0].input_value = ProtocolPath("output_value", protocol_a.id)

        protocol_group = ProtocolGroup(prefix + "protocol_group")
        protocol_group.add_protocols(*fork_protocols)

        protocol_b = DummyInputOutputProtocol(prefix + "protocol_b")
        protocol_b.input_value = ProtocolPath(
            "output_value", protocol_group.id, "protocol_j"
        )
        protocol_c = DummyInputOutputProtocol(prefix + "protocol_c")
        protocol_c.input_value = ProtocolPath(
            "output_value", protocol_group.id, "protocol_l"
        )

        return [protocol_a, protocol_group, protocol_b, protocol_c]

    protocols_a = build_protocols("a_")
    protocols_b = build_protocols("b_")

    protocol_graph = ProtocolGraph()
    protocol_graph.add_protocols(*protocols_a)
    protocol_graph.add_protocols(*protocols_b)

    assert len(protocol_graph.protocols) == len(protocols_a)
    assert "a_protocol_group" in protocol_graph.protocols

    original_protocol_group = protocols_a[1]
    merged_protocol_group = protocol_graph.protocols["a_protocol_group"]

    assert original_protocol_group.schema.json() == merged_protocol_group.schema.json()

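
# For reference, a sketch of the ``build_fork`` helper assumed above. The real
# helper is defined elsewhere in this test module; this version is inferred
# from the ASCII diagram (a ``g - h`` chain which forks into an ``i - j`` and
# a ``k - l`` branch) and may name or wire the protocols slightly differently.
def build_fork(prefix):
    #          / i - j
    # g - h - |
    #          \ k - l
    protocol_g = DummyInputOutputProtocol(prefix + "protocol_g")
    protocol_h = DummyInputOutputProtocol(prefix + "protocol_h")
    protocol_h.input_value = ProtocolPath("output_value", protocol_g.id)

    # The upper branch of the fork.
    protocol_i = DummyInputOutputProtocol(prefix + "protocol_i")
    protocol_i.input_value = ProtocolPath("output_value", protocol_h.id)
    protocol_j = DummyInputOutputProtocol(prefix + "protocol_j")
    protocol_j.input_value = ProtocolPath("output_value", protocol_i.id)

    # The lower branch of the fork.
    protocol_k = DummyInputOutputProtocol(prefix + "protocol_k")
    protocol_k.input_value = ProtocolPath("output_value", protocol_h.id)
    protocol_l = DummyInputOutputProtocol(prefix + "protocol_l")
    protocol_l.input_value = ProtocolPath("output_value", protocol_k.id)

    return [protocol_g, protocol_h, protocol_i, protocol_j, protocol_k, protocol_l]
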
class WorkflowGraph:
    """A hierarchical structure for storing and submitting the workflows
    which will estimate a set of physical properties.
    """

    @property
    def protocols(self):
        """dict of str and Protocol: The protocols in this graph."""
        return self._protocol_graph.protocols

    @property
    def root_protocols(self):
        """list of str: The ids of the protocols in the group which do not
        take input from the other grouped protocols."""
        return self._protocol_graph.root_protocols

    def __init__(self):

        super(WorkflowGraph, self).__init__()

        self._workflows_to_execute = {}
        self._protocol_graph = ProtocolGraph()

    def add_workflows(self, *workflows):
        """Insert a set of workflows into the workflow graph.

        Parameters
        ----------
        workflows: Workflow
            The workflows to insert.
        """

        workflow_uuids = [x.uuid for x in workflows]

        if len(set(workflow_uuids)) != len(workflow_uuids):
            raise ValueError("A number of workflows have the same uuid.")

        existing_uuids = [x for x in workflow_uuids if x in self._workflows_to_execute]

        if len(existing_uuids) > 0:
            raise ValueError(
                f"Workflows with the uuids {existing_uuids} are already in the graph."
            )

        original_protocols = []

        for workflow in workflows:
            original_protocols.extend(workflow.protocols.values())
            self._workflows_to_execute[workflow.uuid] = workflow

        # Add the workflow protocols to the graph.
        merged_protocol_ids = self._protocol_graph.add_protocols(
            *original_protocols, allow_external_dependencies=False
        )

        # Update the workflows to use the possibly merged protocols.
        for original_id, new_id in merged_protocol_ids.items():

            original_protocol = original_id
            new_protocol = new_id

            for workflow in workflows:

                if (
                    retrieve_uuid(
                        original_protocol
                        if isinstance(original_protocol, str)
                        else original_protocol.id
                    )
                    != workflow.uuid
                ):
                    continue

                if original_protocol in workflow.protocols:
                    # Only retrieve the actual protocol if it isn't nested
                    # in a group.
                    original_protocol = workflow.protocols[original_id]
                    new_protocol = self._protocol_graph.protocols[new_id]

                workflow.replace_protocol(original_protocol, new_protocol, True)

    def execute(
        self, root_directory="", calculation_backend=None, compute_resources=None
    ):
        """Executes the workflow graph.

        Parameters
        ----------
        root_directory: str
            The directory to execute the graph in.
        calculation_backend: CalculationBackend, optional.
            The backend to execute the graph on. This parameter is mutually
            exclusive with `compute_resources`.
        compute_resources: ComputeResources, optional.
            The compute resources to run using. If None and no
            `calculation_backend` is specified, the workflow will be executed
            on a single CPU thread. This parameter is mutually exclusive with
            `calculation_backend`.

        Returns
        -------
        list of WorkflowResult or list of Future of WorkflowResult:
            The results of executing the graph. If a `calculation_backend`
            is specified, these results will be wrapped in a `Future`.
        """

        if calculation_backend is None and compute_resources is None:
            compute_resources = ComputeResources(number_of_threads=1)

        protocol_outputs = self._protocol_graph.execute(
            root_directory, calculation_backend, compute_resources
        )

        value_futures = []

        for workflow_id in self._workflows_to_execute:

            workflow = self._workflows_to_execute[workflow_id]
            data_futures = []

            # Make sure we keep track of all of the futures which we
            # will use to populate things such as a final property value
            # or gradient keys.
            if workflow.final_value_source != UNDEFINED:

                protocol_id = workflow.final_value_source.start_protocol
                data_futures.append(protocol_outputs[protocol_id])

            if workflow.gradients_sources != UNDEFINED:

                for gradient_source in workflow.gradients_sources:

                    protocol_id = gradient_source.start_protocol
                    data_futures.append(protocol_outputs[protocol_id])

            if workflow.outputs_to_store != UNDEFINED:

                for output_label, output_to_store in workflow.outputs_to_store.items():

                    for attribute_name in output_to_store.get_attributes(
                        StorageAttribute
                    ):

                        attribute_value = getattr(output_to_store, attribute_name)

                        if not isinstance(attribute_value, ProtocolPath):
                            continue

                        data_futures.append(
                            protocol_outputs[attribute_value.start_protocol]
                        )

            if len(data_futures) == 0:
                data_futures = [*protocol_outputs.values()]

            if calculation_backend is None:

                value_futures.append(
                    WorkflowGraph._gather_results(
                        root_directory,
                        workflow.uuid,
                        workflow.final_value_source,
                        workflow.gradients_sources,
                        workflow.outputs_to_store,
                        *data_futures,
                    )
                )

            else:

                value_futures.append(
                    calculation_backend.submit_task(
                        WorkflowGraph._gather_results,
                        root_directory,
                        workflow.uuid,
                        workflow.final_value_source,
                        workflow.gradients_sources,
                        workflow.outputs_to_store,
                        *data_futures,
                    )
                )

        return value_futures

    @staticmethod
    def _gather_results(
        directory,
        workflow_id,
        value_reference,
        gradient_sources,
        outputs_to_store,
        *protocol_result_paths,
        **_,
    ):
        """Gather the data associated with the workflows in this graph.

        Parameters
        ----------
        directory: str
            The directory to store any working files in.
        workflow_id: str
            The id of the workflow associated with this result.
        value_reference: ProtocolPath, optional
            A reference to which property in the output dictionary is the
            actual value.
        gradient_sources: list of ProtocolPath
            A list of references to those entries in the output dictionaries
            which correspond to parameter gradients.
        outputs_to_store: dict of str and WorkflowOutputToStore
            A dictionary of references to data which should be stored on the
            storage backend.
        protocol_result_paths: tuple of str and str
            The id of each executed protocol, paired with the file path to
            its JSON serialized output.

        Returns
        -------
        WorkflowResult
            The result of attempting to estimate this property from a
            workflow graph. Any failures are recorded on the returned
            object's `exceptions` list.
        """

        return_object = WorkflowResult()
        return_object.workflow_id = workflow_id

        try:

            results_by_id = {}

            for protocol_id, protocol_result_path in protocol_result_paths:

                with open(protocol_result_path, "r") as file:
                    protocol_results = json.load(file, cls=TypedJSONDecoder)

                # Make sure none of the protocols failed and we actually
                # have a value and uncertainty.
                if isinstance(protocol_results, EvaluatorException):

                    return_object.exceptions.append(protocol_results)
                    return return_object

                # Store the protocol results in a dictionary, with keys of
                # the path to the original protocol output.
                for protocol_path, output_value in protocol_results.items():

                    protocol_path = ProtocolPath.from_string(protocol_path)

                    if (
                        protocol_path.start_protocol is None
                        or protocol_path.start_protocol != protocol_id
                    ):
                        protocol_path.prepend_protocol_id(protocol_id)

                    results_by_id[protocol_path] = output_value

            if value_reference is not None:
                return_object.value = results_by_id[value_reference]

            for gradient_source in gradient_sources:
                return_object.gradients.append(results_by_id[gradient_source])

            return_object.data_to_store = []

            for output_to_store in outputs_to_store.values():

                unique_id = str(uuid.uuid4()).replace("-", "")

                data_object_path = path.join(directory, f"data_{unique_id}.json")
                data_directory = path.join(directory, f"data_{unique_id}")

                WorkflowGraph._store_output_data(
                    data_object_path,
                    data_directory,
                    output_to_store,
                    results_by_id,
                )

                return_object.data_to_store.append((data_object_path, data_directory))

        except Exception as e:
            return_object.exceptions.append(EvaluatorException.from_exception(e))

        return return_object

    @staticmethod
    def _store_output_data(
        data_object_path,
        data_directory,
        output_to_store,
        results_by_id,
    ):
        """Collects all of the simulation data to store, and saves it into a
        directory whose path will be passed to the storage backend to
        process.

        Parameters
        ----------
        data_object_path: str
            The file path to serialize the data object to.
        data_directory: str
            The path of the directory to store ancillary data in.
        output_to_store: BaseStoredData
            An object which contains `ProtocolPath`s pointing to the data
            to store.
        results_by_id: dict of ProtocolPath and any
            The results of the protocols which formed the property
            estimation workflow.
        """

        makedirs(data_directory, exist_ok=True)

        for attribute_name in output_to_store.get_attributes(StorageAttribute):

            attribute = getattr(output_to_store.__class__, attribute_name)
            attribute_value = getattr(output_to_store, attribute_name)

            if not isinstance(attribute_value, ProtocolPath):
                continue

            attribute_value = results_by_id[attribute_value]

            # Copy any files referenced by the output into the data
            # directory, and store only their base names.
            if issubclass(attribute.type_hint, FilePath):
                file_copy(attribute_value, data_directory)
                attribute_value = path.basename(attribute_value)

            setattr(output_to_store, attribute_name, attribute_value)

        with open(data_object_path, "w") as file:
            json.dump(output_to_store, file, cls=TypedJSONEncoder)
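
# A minimal usage sketch for ``WorkflowGraph``. The ``build_workflows`` helper
# is hypothetical (any callable returning fully defined ``Workflow`` objects
# would do), and the ``DaskLocalCluster`` import path may differ between
# evaluator versions.
if __name__ == "__main__":

    from openff.evaluator.backends.dask import DaskLocalCluster

    workflow_graph = WorkflowGraph()
    workflow_graph.add_workflows(*build_workflows())  # hypothetical helper

    calculation_backend = DaskLocalCluster()
    calculation_backend.start()

    try:
        # When a backend is supplied, ``execute`` returns one future per
        # workflow; block on each to retrieve the ``WorkflowResult``.
        result_futures = workflow_graph.execute(
            root_directory="working-data",
            calculation_backend=calculation_backend,
        )
        results = [future.result() for future in result_futures]
    finally:
        calculation_backend.stop()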