def install_arcadia(operations, **kwargs):
    graph = wctx.graph_mode()
    send_event_starting_tasks = {}
    send_event_done_tasks = {}

    for node in wctx.nodes:
        for instance in node.instances:
            send_event_starting_tasks[instance.id] = \
                instance.send_event('Starting to run operation')
            send_event_done_tasks[instance.id] = \
                instance.send_event('Done running operation')

    for node in wctx.nodes:
        for instance in node.instances:
            sequence = graph.sequence()
            inst_kwargs = {'id': actx.test_component(instance)}
            sequence.add(
                send_event_starting_tasks[instance.id],
                instance.execute_operation('create_and_configure',
                                           kwargs=inst_kwargs),
                send_event_done_tasks[instance.id])

    for node in wctx.nodes:
        sequence = graph.sequence()
        for instance in node.instances:
            for relationship in instance.relationships:
                rel_kwargs = {'id': actx.test_relationship(relationship)}
                sequence.add(
                    relationship.execute_source_operation(
                        'preconfigure', kwargs=rel_kwargs))

    for node in wctx.nodes:
        for instance in node.instances:
            for rel in instance.relationships:
                instance_starting_task = \
                    send_event_starting_tasks.get(instance.id)
                target_done_task = send_event_done_tasks.get(rel.target_id)
                if instance_starting_task and target_done_task:
                    graph.add_dependency(instance_starting_task,
                                         target_done_task)

    graph.execute()

    try:
        actx.client.generate_service_graph(actx.service_graph)
        actx.client.install_service_graph()
    except NotImplementedError:
        message = ('cancel service graph deployment: failed to generate or '
                   'install graph, due to some missing functionality')
        wctx.logger.error(message)
        raise api.ExecutionCancelled(message)
    except ARCADIAServerRequestError as ex:
        message = ('cancel service graph deployment: arcadia server '
                   'responded with an error message: {0}'.format(ex.message))
        wctx.logger.error(message)
        raise api.ExecutionCancelled(message)
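# A minimal sketch of the graph-mode pattern used above, assuming a Cloudify
# workflow context and the cloudify.workflows api module; the interface name
# 'test.op' is illustrative. Each instance gets its own sequence so its event
# task runs before its operation, and graph.execute() blocks, raising
# api.ExecutionCancelled if the execution is cancelled.
from cloudify.decorators import workflow


@workflow
def tiny_install(ctx, **kwargs):
    graph = ctx.graph_mode()
    for node in ctx.nodes:
        for instance in node.instances:
            sequence = graph.sequence()
            sequence.add(
                instance.send_event('Starting'),
                instance.execute_operation('test.op'),
                instance.send_event('Done'))
    graph.execute()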
def sleep_with_cancel_support(ctx, use_legacy_cancel, **kwargs):
    node_instance = get_instance(ctx)
    node_instance.execute_operation('test_interface.operation', kwargs={
        'key': 'before-sleep',
        'value': None
    })
    node_instance.set_state('asleep')

    is_cancelled = False
    for i in range(10):
        if api.has_cancel_request():
            is_cancelled = True
            break
        time.sleep(1)

    if is_cancelled:
        if use_legacy_cancel:
            return api.EXECUTION_CANCELLED_RESULT
        else:
            raise api.ExecutionCancelled()

    node_instance.execute_operation('test_interface.operation', kwargs={
        'key': 'after-sleep',
        'value': None
    })
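# The two cancellation styles above differ only in how they signal the
# engine: a legacy workflow returns api.EXECUTION_CANCELLED_RESULT, while a
# newer one raises api.ExecutionCancelled. A hedged, caller-side sketch of
# handling both (workflows are normally invoked by the engine, not directly):
try:
    result = sleep_with_cancel_support(ctx, use_legacy_cancel=True)
    if result == api.EXECUTION_CANCELLED_RESULT:
        ctx.logger.info('cancelled (legacy result style)')
except api.ExecutionCancelled:
    ctx.logger.info('cancelled (exception style)')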
def get(self, retry_on_failure=True):
    """Get the task result.

    Will block until the task execution ends.

    :return: The task result
    """
    done = threading.Event()
    api.cancel_callbacks.add(done.set)
    self.on_result(lambda _result: done.set())
    done.wait()
    api.cancel_callbacks.discard(done.set)

    if api.has_cancel_request():
        if self._result is self._NOT_SET:
            self.result = api.ExecutionCancelled()
        raise self.result

    ctx = self.task.workflow_context
    if not ctx.internal.graph_mode:
        ctx.internal.task_graph.remove_task(self.task)
    if self.task.get_state() in (TASK_FAILED, TASK_RESCHEDULED):
        handler_result = self.task.handle_task_terminated()
        if handler_result.retried_task and retry_on_failure:
            handler_result.retried_task.apply_async()
            return handler_result.retried_task.async_result.get()
        else:
            raise self.result
    return self._result
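# A self-contained sketch of the wake-up mechanism in get(): one Event is set
# either by the result callback or by a registered cancel callback, so the
# blocked waiter also wakes on cancellation instead of sleeping through it.
# All names below are illustrative stand-ins for the real
# api.cancel_callbacks machinery.
import threading

cancel_callbacks = set()


def request_cancel():
    # iterate over a copy so callbacks may mutate the set
    for callback in frozenset(cancel_callbacks):
        callback()


done = threading.Event()
cancel_callbacks.add(done.set)            # a cancel now wakes the waiter
threading.Timer(0.1, request_cancel).start()
done.wait()                               # returns on result *or* cancel
cancel_callbacks.discard(done.set)
print('woken by result or cancel')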
def _wait_for_sent_tasks(ctx, graph):
    """Wait for tasks that are in the SENT state to return"""
    # Log the state of every task that ran alongside the failed one.
    for task in graph.tasks_iter():
        ctx.logger.debug(
            'Parallel task to failed task: {0}. State: {1}'.format(
                task.id, task.get_state()))
    try:
        deadline = time.time() + ctx.wait_after_fail
    except AttributeError:
        deadline = time.time() + 1800
    while deadline > time.time():
        try:
            cancelled = api.has_cancel_request()
        except AttributeError:
            cancelled = graph._is_execution_cancelled()
        if cancelled:
            raise api.ExecutionCancelled()
        try:
            finished_tasks = graph._finished_tasks()
        except AttributeError:
            finished_tasks = graph._terminated_tasks()
        for task in finished_tasks:
            try:
                graph._handle_terminated_task(task)
            except RuntimeError:
                ctx.logger.error('Unhandled Failed task: {0}'.format(task))
        if not any(task.get_state() == tasks.TASK_SENT
                   for task in graph.tasks_iter()):
            break
        else:
            time.sleep(0.1)
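# The try/except AttributeError blocks above are version-compatibility
# shims: newer managers expose api.has_cancel_request() and
# graph._finished_tasks(), while older ones only have
# graph._is_execution_cancelled() and graph._terminated_tasks(). The pattern
# in isolation (the helper name is hypothetical):
def _finished_tasks_compat(graph):
    try:
        return graph._finished_tasks()      # newer API
    except AttributeError:
        return graph._terminated_tasks()    # older API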
def preconfig_rship_source(**kwargs):
    try:
        api_r = ARCADIARelationshipAPI(client=actx.client)
        api_r.preconfig_src_relationship(
            _instance=actx.relationships[kwargs.get('id')])
    except ARCADIAServerRequestError as error:
        ctx.logger.error(error.message)
        raise api.ExecutionCancelled(error.message)
def create_policy(**kwargs):
    try:
        api_policy = ARCADIAPolicyAPI(client=actx.client)
        api_policy.init_policy(_instance=actx.components[kwargs.get('id')])
    except ARCADIAServerRequestError as error:
        ctx.logger.error(error.message)
        raise api.ExecutionCancelled(error.message)
def run_jobs(**kwargs):  # pylint: disable=W0613
    """Workflow to execute long running batch operations"""
    success = True
    root_nodes, job_instances_map = build_graph(ctx.nodes)
    monitor = Monitor(job_instances_map, ctx.logger)
    new_exec_nodes = root_nodes
    jobs_result_list = []

    # Monitoring and next executions loop
    while new_exec_nodes or (monitor.is_something_executing()
                             and not api.has_cancel_request()):
        # perform new executions
        jobs_result_list = []
        for new_node in new_exec_nodes:
            monitor.add_node(new_node)
            if new_node.is_job:
                jobs_result_list += new_node.launch_all_instances()
        wait_jobs_to_finish(jobs_result_list)

        # Monitor the infrastructure
        monitor.update_status()
        exec_nodes_finished = []
        new_exec_nodes = []
        for node_name, exec_node in monitor.get_executions_iterator():
            if exec_node.check_status():
                if exec_node.completed:
                    exec_node.clean_all_instances()
                    exec_nodes_finished.append(node_name)
                    new_nodes_to_execute = exec_node.get_children_ready()
                    for new_node in new_nodes_to_execute:
                        new_exec_nodes.append(new_node)
            else:
                # Something went wrong in the node, cancel execution
                cancel_all(monitor.get_executions_iterator())
                return

        # remove finished nodes
        for node_name in exec_nodes_finished:
            monitor.finish_node(node_name)
    wait_jobs_to_finish(jobs_result_list)

    if monitor.is_something_executing():
        ctx.logger.info("Cancelling jobs...")
        cancel_all(monitor.get_executions_iterator())
        success = False

    deleted_reservations = []
    for instance_name in job_instances_map:
        instance = job_instances_map[instance_name]
        if instance.reservation and \
                instance.reservation not in deleted_reservations:
            instance.delete_reservation()
            deleted_reservations.append(instance.reservation)

    if not success:
        raise api.ExecutionCancelled()

    ctx.logger.info(
        "------------------Workflow Finished-----------------------")
    return
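# The control flow of run_jobs(), reduced to a sketch: the cancel request is
# checked once per loop iteration, so a cancel stops scheduling new work but
# still lets the cleanup after the loop (cancel_all, reservation deletion)
# run. The monitor methods named here are illustrative, not the plugin's API.
def monitor_loop(pending, monitor):
    while pending or (monitor.is_something_executing()
                      and not api.has_cancel_request()):
        for node in pending:
            monitor.add_node(node)
        monitor.update_status()
        pending = monitor.ready_children()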
def create_serv_graph(**kwargs):
    try:
        api_srv = ARCADIAServiceGraphAPI(client=actx.client)
        api_srv.init_service_graph(
            _instance=actx.components[kwargs.get('id')])
        actx.service_graph = actx.components[kwargs.get('id')]
    except ARCADIAServerRequestError as error:
        ctx.logger.error(error.message)
        raise api.ExecutionCancelled(error.message)
def execute(self):
    """
    Start executing the graph based on tasks and dependencies between
    them. Calling this method will block until one of the following
    occurs:
        1. all tasks terminated
        2. a task failed
        3. an unhandled exception is raised
        4. the execution is cancelled

    Note: This method will raise an api.ExecutionCancelled error if the
    execution has been cancelled. When catching errors raised from this
    method, make sure to re-raise the error if it's
    api.ExecutionCancelled in order to allow the execution to be set in
    cancelled mode properly.

    Also note that for the time being, if such a cancelling event
    occurs, the method might return even while there's some operations
    still being executed.
    """
    while True:
        if self._is_execution_cancelled():
            raise api.ExecutionCancelled()

        self._check_dump_request()

        # handle all terminated tasks
        # it is important this happens before handling executable tasks,
        # so tasks can be made executable and then executed in this
        # iteration (otherwise, it would be the next one)
        for task in self._terminated_tasks():
            self._handle_terminated_task(task)

        # handle all executable tasks
        for task in self._executable_tasks():
            self._handle_executable_task(task)

        # no more tasks to process, time to move on
        if len(self.graph.node) == 0:
            return
        # sleep some and do it all over again
        else:
            time.sleep(0.1)
def _is_finished(self):
    if api.has_cancel_request():
        self._error = api.ExecutionCancelled()
        return True

    if not self._tasks:
        return True

    if self._error:
        if not self._waiting_for:
            return True
        deadline = self._error_time + self.ctx.wait_after_fail
        if time.time() > deadline:
            return True
        else:
            self._wake_after_fail = threading.Timer(
                deadline - time.time(),
                self._tasks_wait.set)
            self._wake_after_fail.daemon = True
            self._wake_after_fail.start()
    return False
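# A self-contained sketch of the "wake after grace period" pattern above: a
# daemon Timer sets an Event once wait_after_fail seconds have passed since
# the error, so a loop blocked on that event re-checks _is_finished() instead
# of polling. Names here are illustrative, not part of any real API.
import threading
import time

tasks_wait = threading.Event()
error_time = time.time()
wait_after_fail = 2              # grace period after a failure, in seconds

deadline = error_time + wait_after_fail
wake_after_fail = threading.Timer(deadline - time.time(), tasks_wait.set)
wake_after_fail.daemon = True    # don't keep the process alive for the timer
wake_after_fail.start()

tasks_wait.wait()                # returns early if anything else sets it
print('grace period over')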
def _check_execution_cancelled():
    if api.has_cancel_request():
        raise api.ExecutionCancelled()
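# Hypothetical usage of the guard: calling it at iteration boundaries lets a
# long-running workflow step abort promptly once a cancel request arrives,
# rather than only noticing it at the end. `instances` and the interface
# name are illustrative.
for instance in instances:
    _check_execution_cancelled()     # raises api.ExecutionCancelled
    instance.execute_operation('maintenance.run')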
def cancel_all(executions):
    """Cancel all pending or running jobs"""
    for _, exec_node in executions:
        exec_node.cancel_all_instances()
    raise api.ExecutionCancelled()
def execute(self):
    """
    Start executing the graph based on tasks and dependencies between
    them. Calling this method will block until one of the following
    occurs:
        1. all tasks terminated
        2. a task failed
        3. an unhandled exception is raised
        4. the execution is cancelled

    Note: This method will raise an api.ExecutionCancelled error if the
    execution has been cancelled. When catching errors raised from this
    method, make sure to re-raise the error if it's
    api.ExecutionCancelled in order to allow the execution to be set in
    cancelled mode properly.

    Also note that for the time being, if such a cancelling event
    occurs, the method might return even while there's some operations
    still being executed.
    """
    # clear the error, in case the tasks graph has been reused
    self._error = None
    while self._error is None:
        if self._is_execution_cancelled():
            raise api.ExecutionCancelled()

        # handle all terminated tasks
        # it is important this happens before handling executable tasks,
        # so tasks can be made executable and then executed in this
        # iteration (otherwise, it would be the next one)
        for task in self._terminated_tasks():
            self._handle_terminated_task(task)

        # if there was an error when handling terminated tasks, don't
        # continue on to sending new tasks in handle_executable
        if self._error:
            break

        # handle all executable tasks
        for task in self._executable_tasks():
            self._handle_executable_task(task)

        # no more tasks to process, time to move on
        if len(self.graph.node) == 0:
            if self._error:
                raise self._error
            return
        # sleep some and do it all over again
        else:
            time.sleep(0.1)

    # if we got here, we had an error in a task, and we're just waiting
    # for other tasks to return, but not sending new tasks
    deadline = time.time() + self.ctx.wait_after_fail
    while deadline > time.time():
        if self._is_execution_cancelled():
            raise api.ExecutionCancelled()
        for task in self._terminated_tasks():
            self._handle_terminated_task(task)
        if not any(self._sent_tasks()):
            break
        else:
            time.sleep(0.1)
    raise self._error
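# The re-raise convention from the docstring, as a caller-side sketch: task
# failures may be handled locally, but api.ExecutionCancelled must always be
# propagated so the engine can set the execution to cancelled properly.
try:
    graph.execute()
except api.ExecutionCancelled:
    raise                    # never swallow a cancellation
except Exception as err:
    ctx.logger.error('graph execution failed: {0}'.format(err))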
def _run_scale_settings(ctx, scale_settings, scalable_entity_properties,
                        scale_transaction_field=None,
                        scale_transaction_value=None,
                        ignore_failure=False,
                        ignore_rollback_failure=True,
                        instances_remove_ids=None,
                        node_sequence=None):
    modification = ctx.deployment.start_modification(scale_settings)
    graph = ctx.graph_mode()
    try:
        ctx.logger.info('Deployment modification started. '
                        '[modification_id={0}]'.format(modification.id))
        if len(set(modification.added.node_instances)):
            ctx.logger.info('Added: {}'.format(
                repr([
                    node_instance._node_instance.id
                    for node_instance in modification.added.node_instances
                    if node_instance.modification == 'added'
                ])))
            added_and_related = set(modification.added.node_instances)
            added = set(i for i in added_and_related
                        if i.modification == 'added')
            related = added_and_related - added
            try:
                for node_instance in added:
                    properties_updates = scalable_entity_properties.get(
                        node_instance._node_instance.node_id, {})
                    # save properties updates
                    properties = {}
                    if properties_updates:
                        # pop one dict for runtime properties
                        properties.update(properties_updates.pop())
                    # save transaction list
                    if scale_transaction_field:
                        # save original set of instances in scale up.
                        if scale_transaction_value:
                            properties.update({
                                scale_transaction_field:
                                    scale_transaction_value
                            })
                        else:
                            properties.update(
                                {scale_transaction_field: modification.id})
                    # check properties to update
                    if properties:
                        ctx.logger.debug(
                            "{}: Updating {} runtime properties by {}".format(
                                node_instance._node_instance.node_id,
                                node_instance._node_instance.id,
                                repr(properties)))
                        _update_runtime_properties(
                            ctx, node_instance._node_instance.id, properties)
                if node_sequence:
                    subgraph_func = lifecycle.install_node_instance_subgraph
                    _process_node_instances(
                        ctx=ctx,
                        graph=graph,
                        node_instances=added,
                        ignore_failure=ignore_failure,
                        node_instance_subgraph_func=subgraph_func,
                        node_sequence=node_sequence)
                else:
                    lifecycle.install_node_instances(graph=graph,
                                                     node_instances=added,
                                                     related_nodes=related)
            except Exception as ex:
                ctx.logger.error(
                    'Scale out failed, scaling back in. {}'.format(repr(ex)))
                _uninstall_instances(ctx=ctx,
                                     graph=graph,
                                     removed=added,
                                     related=related,
                                     ignore_failure=ignore_rollback_failure,
                                     node_sequence=node_sequence)
                raise ex

        if len(set(modification.removed.node_instances)):
            ctx.logger.info('Removed: {}'.format(
                repr([
                    node_instance._node_instance.id
                    for node_instance in modification.removed.node_instances
                    if node_instance.modification == 'removed'
                ])))
            removed_and_related = set(modification.removed.node_instances)
            removed = set(i for i in removed_and_related
                          if i.modification == 'removed')
            ctx.logger.info('Proposed: {}'.format(repr(instances_remove_ids)))
            if instances_remove_ids:
                for instance in removed:
                    if instance._node_instance.id not in instances_remove_ids:
                        raise Exception(
                            "Instance {} not in proposed list {}.".format(
                                repr(instance._node_instance.id),
                                repr(instances_remove_ids)))
            related = removed_and_related - removed
            _uninstall_instances(ctx=ctx,
                                 graph=graph,
                                 removed=removed,
                                 ignore_failure=ignore_failure,
                                 related=related,
                                 node_sequence=node_sequence)
    except Exception as ex:
        ctx.logger.warn('Rolling back deployment modification. '
                        '[modification_id={0}]: {1}'.format(
                            modification.id, repr(ex)))
        try:
            deadline = time.time() + ctx.wait_after_fail
        except AttributeError:
            deadline = time.time() + 1800
        while deadline > time.time():
            if graph._is_execution_cancelled():
                raise api.ExecutionCancelled()
            for task in graph._terminated_tasks():
                graph._handle_terminated_task(task)
            if not any(task.get_state() == tasks.TASK_SENT
                       for task in graph.tasks_iter()):
                break
            else:
                time.sleep(0.1)
        modification.rollback()
        raise ex
    else:
        modification.finish()
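# The deployment-modification lifecycle used above, reduced to its contract:
# finish() on success, rollback() on any failure. A sketch only;
# `do_scale_work` and `scale_settings` are illustrative placeholders.
modification = ctx.deployment.start_modification(scale_settings)
try:
    do_scale_work(modification)
except Exception:
    modification.rollback()
    raise
else:
    modification.finish()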