def start_all_cores(
        self, executable_targets, app_id, txrx, sync_state_changes):
    """ Send the sync signal that starts the application and check that\
        the cores have reacted to it.

    :param executable_targets: the mapping between cores and binaries
    :param app_id: the app id that is being used by the simulation
    :param sync_state_changes: the number of runs that have been done\
        between setup and end
    :param txrx: the python interface to the spinnaker machine
    :return: None
    :raise ExecutableFailedToStartException: if the cores counted as\
        RUNNING plus those counted in the following sync state do not\
        add up to the total number of processors
    """
    total_processors = executable_targets.total_processors
    all_core_subsets = executable_targets.all_core_subsets

    # the signal to send alternates with the parity of the run counter
    if sync_state_changes % 2 == 0:
        sync_state = SCPSignal.SYNC0
    else:
        sync_state = SCPSignal.SYNC1

    logger.info("Starting application")
    txrx.send_signal(app_id, sync_state)

    # local bookkeeping only: the parity now identifies the sync state
    # that follows the one just signalled
    sync_state_changes += 1

    # check all apps have gone into run state
    logger.info("Checking that the application has started")
    processors_running = txrx.get_core_state_count(
        app_id, CPUState.RUNNING)
    if processors_running >= total_processors:
        return

    # some cores are not running; count those already sitting in the
    # following sync state, which this code treats as "finished"
    if sync_state_changes % 2 == 0:
        sync_state = CPUState.SYNC0
    else:
        sync_state = CPUState.SYNC1
    processors_finished = txrx.get_core_state_count(app_id, sync_state)

    if processors_running + processors_finished >= total_processors:
        # Logger.warn is a deprecated alias; use Logger.warning
        logger.warning("some processors finished between signal "
                       "transmissions. Could be a sign of an error")
    else:
        unsuccessful_cores = helpful_functions.get_cores_not_in_state(
            all_core_subsets, CPUState.RUNNING, txrx)
        break_down = helpful_functions.get_core_status_string(
            unsuccessful_cores)
        raise exceptions.ExecutableFailedToStartException(
            "Only {} of {} processors started:{}"
            .format(processors_running, total_processors, break_down))
def wait_for_cores_to_be_ready(
        self, executable_targets, app_id, txrx, no_sync_state_changes):
    """ Block until no core is left in C_MAIN, then verify that every\
        core has reached the expected sync state.

    :param executable_targets: the mapping between cores and binaries
    :param app_id: the app id that is being used by the simulation
    :param no_sync_state_changes: the number of runs that have been done\
        between setup and end
    :param txrx: the python interface to the spinnaker machine
    :return: None
    :raise ExecutableFailedToStartException: if some cores are found\
        outside the expected sync state
    """
    expected_count = executable_targets.total_processors
    core_subsets = executable_targets.all_core_subsets

    # poll every 0.1s until the count of cores in C_MAIN drops to zero
    while txrx.get_core_state_count(app_id, CPUState.C_MAIN) != 0:
        time.sleep(0.1)

    # the expected sync state alternates with the parity of the counter
    sync_state = (
        CPUState.SYNC0 if no_sync_state_changes % 2 == 0
        else CPUState.SYNC1)

    ready_count = txrx.get_core_state_count(app_id, sync_state)
    if ready_count == expected_count:
        return

    # the count disagreed; look at the individual cores before failing
    not_ready = helpful_functions.get_cores_not_in_state(
        core_subsets, sync_state, txrx)
    if len(not_ready) == 0:
        return

    status_text = helpful_functions.get_core_status_string(not_ready)
    message = (
        "Only {} processors out of {} have successfully reached "
        "{}:{}".format(
            ready_count, expected_count, sync_state.name, status_text))
    raise exceptions.ExecutableFailedToStartException(message)
def start_all_cores(executable_targets, app_id, txrx, sync_state_changes):
    """ Send the sync signal that starts the application and check that\
        the cores have reacted to it.

    :param executable_targets: the mapping between cores and binaries
    :param app_id: the app id that is being used by the simulation
    :param sync_state_changes: the number of runs that have been done\
        between setup and end
    :param txrx: the python interface to the spinnaker machine
    :return: None
    :raise ExecutableFailedToStartException: if, after the signal, some\
        cores are found to be neither RUNNING nor PAUSED
    """
    total_processors = executable_targets.total_processors
    all_core_subsets = executable_targets.all_core_subsets

    # the signal to send alternates with the parity of the run counter
    if sync_state_changes % 2 == 0:
        sync_state = SCPSignal.SYNC0
    else:
        sync_state = SCPSignal.SYNC1

    logger.info("Starting application ({})".format(sync_state))
    txrx.send_signal(app_id, sync_state)

    # local bookkeeping only; the incremented value is not used again
    # in this function and is not returned
    sync_state_changes += 1

    # check all apps have gone into run state
    logger.info("Checking that the application has started")
    processors_running = txrx.get_core_state_count(app_id, CPUState.RUNNING)
    if processors_running >= total_processors:
        return

    # cores already PAUSED are counted as having finished their run
    processors_finished = txrx.get_core_state_count(
        app_id, CPUState.PAUSED)
    if processors_running + processors_finished >= total_processors:
        # Logger.warn is a deprecated alias; use Logger.warning
        logger.warning("some processors finished between signal "
                       "transmissions. Could be a sign of an error")
        return

    unsuccessful_cores = helpful_functions.get_cores_not_in_state(
        all_core_subsets, {CPUState.RUNNING, CPUState.PAUSED}, txrx)

    # Last chance to get out of error state: the per-core check may find
    # nothing wrong even though the counts disagreed
    if len(unsuccessful_cores) > 0:
        break_down = helpful_functions.get_core_status_string(
            unsuccessful_cores)
        raise exceptions.ExecutableFailedToStartException(
            "Only {} of {} processors started:{}".format(
                processors_running, total_processors, break_down),
            helpful_functions.get_core_subsets(unsuccessful_cores))
def wait_for_cores_to_be_ready(executable_targets, app_id, txrx,
                               no_sync_state_changes):
    """ Block until no core is left in C_MAIN, then verify that every\
        core has reached the expected sync state.

    :param executable_targets: the mapping between cores and binaries
    :param app_id: the app id that is being used by the simulation
    :param no_sync_state_changes: the number of runs that have been done\
        between setup and end
    :param txrx: the python interface to the spinnaker machine
    :return: None
    :raise ExecutableFailedToStartException: if some cores are found\
        outside the expected sync state
    """
    expected_count = executable_targets.total_processors
    core_subsets = executable_targets.all_core_subsets

    # poll every 0.1s until the count of cores in C_MAIN drops to zero
    while txrx.get_core_state_count(app_id, CPUState.C_MAIN) != 0:
        time.sleep(0.1)

    # the expected sync state alternates with the parity of the counter
    sync_state = (
        CPUState.SYNC0 if no_sync_state_changes % 2 == 0
        else CPUState.SYNC1)

    ready_count = txrx.get_core_state_count(app_id, sync_state)
    if ready_count == expected_count:
        return

    # the count disagreed; look at the individual cores before failing
    not_ready = helpful_functions.get_cores_not_in_state(
        core_subsets, sync_state, txrx)
    if len(not_ready) == 0:
        return

    status_text = helpful_functions.get_core_status_string(not_ready)
    message = (
        "Only {} processors out of {} have successfully reached "
        "{}:{}".format(
            ready_count, expected_count, sync_state.name, status_text))
    raise exceptions.ExecutableFailedToStartException(
        message, helpful_functions.get_core_subsets(not_ready))
def __call__(self, txrx, app_id, all_core_subsets):
    """ Repeatedly send the "update provenance region and exit" command\
        to every core that is not yet FINISHED, until all are FINISHED.

    Note that the loop has no timeout: it only ends once the FINISHED
    count reported by the transceiver equals ``len(all_core_subsets)``.

    :param txrx: the python interface to the spinnaker machine
    :param app_id: the app id that is being used by the simulation
    :param all_core_subsets: the cores to check; its length is used as\
        the total number of processors
    :return: None
    """
    processors_completed = txrx.get_core_state_count(
        app_id, CPUState.FINISHED)
    total_processors = len(all_core_subsets)
    left_to_do_cores = total_processors - processors_completed

    progress_bar = ProgressBar(
        left_to_do_cores,
        "Forcing error cores to generate provenance data")

    # the command payload is the same for every core, so build it once
    data = struct.pack(
        "<I", constants.SDP_RUNNING_MESSAGE_CODES.
        SDP_UPDATE_PROVENCE_REGION_AND_EXIT.value)

    # keep prodding until every core reports the FINISHED state
    while processors_completed != total_processors:
        unsuccessful_cores = helpful_functions.get_cores_not_in_state(
            all_core_subsets, CPUState.FINISHED, txrx)

        for (x, y, p) in unsuccessful_cores:
            txrx.send_sdp_message(
                SDPMessage(SDPHeader(
                    flags=SDPFlag.REPLY_NOT_EXPECTED,
                    destination_cpu=p,
                    destination_chip_x=x,
                    destination_port=(constants.SDP_PORTS.
                                      RUNNING_COMMAND_SDP_PORT.value),
                    destination_chip_y=y), data=data))

        processors_completed = txrx.get_core_state_count(
            app_id, CPUState.FINISHED)

        # report only the cores that finished since the last report;
        # the baseline must move forward or the same cores would be
        # counted again on every iteration and the bar would overshoot
        left_over_now = total_processors - processors_completed
        to_update = left_to_do_cores - left_over_now
        if to_update > 0:
            progress_bar.update(to_update)
            left_to_do_cores = left_over_now
    progress_bar.end()
def __call__(
        self, placements, txrx, no_sync_changes, app_id,
        executable_targets, graph_mapper):
    """ Push the new runtime to every core, then switch every core back\
        to the sync state selected by the parity of no_sync_changes.

    Both phases loop, without a timeout, until the state count reported
    by the transceiver equals the total number of processors.

    :param placements: used to find the subvertex placed on each core
    :param txrx: the python interface to the spinnaker machine
    :param no_sync_changes: the number of sync state changes so far;\
        returned unchanged
    :param app_id: the app id that is being used by the simulation
    :param executable_targets: the mapping between cores and binaries
    :param graph_mapper: used to find the vertex of each subvertex
    :return: a dict with the single key 'no_sync_changes'
    """

    def _send_command(chip_x, chip_y, core, payload):
        # every command goes to the running-command SDP port, no reply
        header = SDPHeader(
            flags=SDPFlag.REPLY_NOT_EXPECTED,
            destination_cpu=core,
            destination_chip_x=chip_x,
            destination_port=(
                constants.SDP_PORTS.RUNNING_COMMAND_SDP_PORT.value),
            destination_chip_y=chip_y)
        txrx.send_sdp_message(SDPMessage(header, data=payload))

    # phase 1: cores move to CPU_STATE_12 once they have the new runtime
    acknowledged = txrx.get_core_state_count(
        app_id, CPUState.CPU_STATE_12)
    total_processors = executable_targets.total_processors
    all_core_subsets = executable_targets.all_core_subsets

    while acknowledged != total_processors:
        pending = helpful_functions.get_cores_not_in_state(
            all_core_subsets, CPUState.CPU_STATE_12, txrx)
        for (x, y, p) in pending:
            subvertex = placements.get_subvertex_on_processor(x, y, p)
            vertex = graph_mapper.get_vertex_from_subvertex(subvertex)

            # a vertex without a step count is flagged as running
            # forever, with the step count itself sent as zero
            steps = vertex.no_machine_time_steps
            if steps is None:
                infinite_run, steps = 1, 0
            else:
                infinite_run = 0

            payload = struct.pack(
                "<III",
                constants.SDP_RUNNING_MESSAGE_CODES.SDP_NEW_RUNTIME_ID_CODE
                .value, steps, infinite_run)
            _send_command(x, y, p, payload)
        acknowledged = txrx.get_core_state_count(
            app_id, CPUState.CPU_STATE_12)

    # phase 2: reset the state to the old state so that it can be used
    # by the application runner code
    sync_state = (
        CPUState.SYNC0 if no_sync_changes % 2 == 0 else CPUState.SYNC1)

    in_sync = txrx.get_core_state_count(app_id, sync_state)
    while in_sync != total_processors:
        pending = helpful_functions.get_cores_not_in_state(
            all_core_subsets, sync_state, txrx)
        for (x, y, p) in pending:
            payload = struct.pack(
                "<II",
                constants.SDP_RUNNING_MESSAGE_CODES.SDP_SWITCH_STATE.value,
                sync_state.value)
            _send_command(x, y, p, payload)
        in_sync = txrx.get_core_state_count(app_id, sync_state)

    return {'no_sync_changes': no_sync_changes}
def wait_for_execution_to_complete(
        self, executable_targets, app_id, runtime, time_scaling, txrx,
        buffer_manager, no_sync_state_changes):
    """ Sleep for the expected duration of the run, then poll until no\
        core is RUNNING and check that every core reached the sync state\
        expected after the run.

    :param executable_targets: the mapping between cores and binaries
    :param app_id: the app id that is being used by the simulation
    :param runtime: combined with time_scaling to work out how long to\
        sleep before the first poll
    :param time_scaling: multiplier applied to runtime
    :param txrx: the python interface to the spinnaker machine
    :param buffer_manager: not used by this method
    :param no_sync_state_changes: the number of runs that have been done\
        between setup and end
    :return: None
    :raise ExecutableFailedToStopException: if any core reports a run\
        time exception, or if not every core ends in the expected sync\
        state
    """
    expected_count = executable_targets.total_processors
    core_subsets = executable_targets.all_core_subsets

    time_to_wait = ((runtime * time_scaling) / 1000.0) + 1.0
    logger.info(
        "Application started - waiting {} seconds for it to stop".format(
            time_to_wait))
    time.sleep(time_to_wait)

    still_running = expected_count
    while still_running != 0:
        # a run time exception on any core aborts the wait
        rte_count = txrx.get_core_state_count(
            app_id, CPUState.RUN_TIME_EXCEPTION)
        if rte_count > 0:
            failed_cores = helpful_functions.get_cores_in_state(
                core_subsets, CPUState.RUN_TIME_EXCEPTION, txrx)
            status_text = helpful_functions.get_core_status_string(
                failed_cores)
            raise exceptions.ExecutableFailedToStopException(
                "{} cores have gone into a run time error state:{}".format(
                    rte_count, status_text))

        still_running = txrx.get_core_state_count(
            app_id, CPUState.RUNNING)
        if still_running > 0:
            logger.info(
                "Simulation still not finished or failed - "
                "waiting a bit longer...")
            time.sleep(0.5)

    # an odd counter means the cores should end up in SYNC0
    sync_state = (
        CPUState.SYNC0 if no_sync_state_changes % 2 == 1
        else CPUState.SYNC1)

    exited_count = txrx.get_core_state_count(app_id, sync_state)
    if exited_count < expected_count:
        stragglers = helpful_functions.get_cores_not_in_state(
            core_subsets, sync_state, txrx)
        status_text = helpful_functions.get_core_status_string(stragglers)
        raise exceptions.ExecutableFailedToStopException(
            "{} of {} processors failed to exit successfully:{}".format(
                expected_count - exited_count, expected_count,
                status_text))

    logger.info("Application has run to completion")
def wait_for_execution_to_complete(
        self, executable_targets, app_id, runtime, time_scaling, txrx,
        time_threshold):
    """ Sleep for the expected duration of the run, then poll until no\
        core is RUNNING (or the wait has overrun) and check that every\
        core ended up PAUSED.

    :param executable_targets: the mapping between cores and binaries
    :param app_id: the app id that is being used by the simulation
    :param runtime: combined with time_scaling to work out how long to\
        sleep before the first poll
    :param time_scaling: multiplier applied to runtime
    :param txrx: the python interface to the spinnaker machine
    :param time_threshold: passed to ``self._has_overrun`` together with\
        the time polling started; presumably the longest the polling may\
        last - confirm against ``_has_overrun``
    :return: None
    :raise ExecutableFailedToStopException: if any core is in the\
        RUN_TIME_EXCEPTION or WATCHDOG state, if cores are still RUNNING\
        when polling stops, or if some cores did not end up PAUSED
    """
    total_processors = executable_targets.total_processors
    all_core_subsets = executable_targets.all_core_subsets

    time_to_wait = ((runtime * time_scaling) / 1000.0) + 0.1
    logger.info(
        "Application started - waiting {} seconds for it to stop".format(
            time_to_wait))
    time.sleep(time_to_wait)

    processors_not_finished = total_processors
    start_time = time.time()
    retries = 0
    while (processors_not_finished != 0 and
            not self._has_overrun(start_time, time_threshold)):
        try:
            processors_rte = txrx.get_core_state_count(
                app_id, CPUState.RUN_TIME_EXCEPTION)
            processors_wdog = txrx.get_core_state_count(
                app_id, CPUState.WATCHDOG)
            if processors_rte > 0 or processors_wdog > 0:
                error_cores = helpful_functions.get_cores_in_state(
                    all_core_subsets,
                    {CPUState.RUN_TIME_EXCEPTION, CPUState.WATCHDOG}, txrx)
                break_down = helpful_functions.get_core_status_string(
                    error_cores)

                # report cores in either error state, not just RTE
                raise exceptions.ExecutableFailedToStopException(
                    "{} cores have gone into an error state:"
                    "{}".format(
                        processors_rte + processors_wdog, break_down),
                    helpful_functions.get_core_subsets(error_cores), True)

            processors_not_finished = txrx.get_core_state_count(
                app_id, CPUState.RUNNING)
            if processors_not_finished > 0:
                logger.info("Simulation still not finished or failed - "
                            "waiting a bit longer...")
                time.sleep(0.5)
        except exceptions.ExecutableFailedToStopException:
            # a detected core failure is a definite result, not a
            # problem reading the state, so it must not be retried
            raise
        except Exception:
            # failures while reading the state are retried; the counter
            # is cumulative over the whole wait and never reset
            retries += 1
            if retries >= 10:
                logger.error("Error getting state")
                raise
            logger.info("Error getting state - retrying...")
            time.sleep(0.5)

    # polling stopped with cores still counted as running
    if processors_not_finished != 0:
        running_cores = helpful_functions.get_cores_in_state(
            all_core_subsets, CPUState.RUNNING, txrx)
        if len(running_cores) > 0:
            raise exceptions.ExecutableFailedToStopException(
                "Simulation did not finish within the time allocated. "
                "Please try increasing the machine time step and / "
                "or time scale factor in your simulation.",
                helpful_functions.get_core_subsets(running_cores), False)

    processors_exited = txrx.get_core_state_count(app_id, CPUState.PAUSED)
    if processors_exited < total_processors:
        unsuccessful_cores = helpful_functions.get_cores_not_in_state(
            all_core_subsets, CPUState.PAUSED, txrx)

        # Last chance to get out of the error state: the per-core check
        # may find nothing wrong even though the counts disagreed
        if len(unsuccessful_cores) > 0:
            break_down = helpful_functions.get_core_status_string(
                unsuccessful_cores)
            raise exceptions.ExecutableFailedToStopException(
                "{} of {} processors failed to exit successfully:"
                "{}".format(total_processors - processors_exited,
                            total_processors, break_down),
                helpful_functions.get_core_subsets(unsuccessful_cores),
                True)

    logger.info("Application has run to completion")