def handle_event_register_prog(event):
    """
    handle 'register_prog' event on the client

    Stores the received program data under a directory named after the
    checksum, verifies and builds it, then records it in the client state.
    event data must include 'name', 'checksum' and 'data'

    :event: event to handle
    :returns: event result data if event successfully handled (empty dict)
    :raises: ValueError: if program data does not match checksum
    """
    # same lookup order as before: 'data', then 'name', then 'checksum'
    prog_source, prog_name, prog_checksum = (
        event.data[key] for key in ('data', 'name', 'checksum'))

    # snapshot what is needed from the shared client state, then release it
    with client.client_access() as state:
        progs_root = state.user_progs_dir
        client_hardware = state.hardware

    # each program lives in its own directory: <user_progs_dir>/<checksum>
    build_dir = os.path.join(progs_root, prog_checksum)
    source_path = os.path.join(build_dir, 'data.json')
    os.mkdir(build_dir)
    with open(source_path, 'w') as source_file:
        source_file.write(prog_source)

    program = user_prog.UserProg(
        prog_name, prog_checksum, source_path, client_hardware,
        build_dir=build_dir)
    program.verify_checksum()

    # build options come from the parsed cmdline args held in client state
    with client.client_access() as state:
        build_options = {
            'cuda_bin': state.args.bin,
            'include_path': state.args.include,
        }
    program.build(**build_options)

    # publish the built program so later events can look it up by checksum
    with client.client_access() as state:
        state.user_programs[prog_checksum] = program

    LOG.info('Registered program: %s', program)
    return {}
def run_server(args, tmpdir):
    """
    entrypoint for server

    Creates the server state and remote event system, serves the API from a
    daemon thread and runs the server worker in the calling thread.

    :args: parsed cmdline args
    :tmpdir: temporary directory
    :returns: 0 on success
    """
    # create server state
    server.create_state(args, tmpdir)
    # init remote event system
    remote_event.create_remote_events()

    # start api server; daemon thread so it never blocks interpreter exit
    api_thread = threading.Thread(
        target=server.APP.run,
        kwargs={'debug': False, 'host': args.host, 'port': args.port},
        daemon=True,
    )
    api_thread.start()

    # start server worker
    worker = server_worker.ServerWorker()
    LOG.info('starting server worker')
    try:
        worker.run()
    except queue.Empty:
        # the worker surfaced queue.Empty: tell the clients to shut down
        server_util.shutdown_all_clients()
    return 0
def check_gpus(args, tmpdir):
    """
    check for CUDA capable GPUs

    :args: parsed cmdline args (scan is skipped when args.no_gpu is set)
    :tmpdir: temporary directory
    :returns: dict with 'num_gpus' and a 'gpu_info' list (one dict per GPU)
    """
    if args.no_gpu:
        LOG.warning("Not scanning available gpus, running programs will fail")
        return {'num_gpus': 0, 'gpu_info': []}

    LOG.info('Checking CUDA build system')
    program = setup_cuda_detect(args, tmpdir)

    # fields copied verbatim from the GPUProps struct, in output order
    copied_fields = (
        'gpu_index',
        'comp_level_major',
        'comp_level_minor',
        'sm_count',
        'max_sm_threads',
        'max_sm_blocks',
        'max_block_size',
        'max_total_threads',
        'max_total_blocks',
    )

    num_gpus = program.get_num_gpus()
    gpus = []
    for gpu_index in range(num_gpus):
        props = GPUProps()
        # the detection program fills the struct in place through a pointer
        program.get_gpu_data(gpu_index, ctypes.byref(props))
        gpu_info = {field: getattr(props, field) for field in copied_fields}
        # name is stored as bytes in the struct
        gpu_info['name'] = props.name.decode()
        gpu_info['reasonable_block_size'] = get_reasonable_block_size(gpu_info)
        gpus.append(gpu_info)

    return {'num_gpus': num_gpus, 'gpu_info': gpus}
def unregister_client(self, client_uuid):
    """
    unregister a client with the server

    Drops both the hardware record and the client entry for the uuid;
    unknown uuids are ignored.

    :client_uuid: client uuid
    :returns: removed client object, or None if the uuid was not registered
    """
    self.all_clients_hardware.pop(client_uuid, None)
    removed = self.clients.pop(client_uuid, None)
    if not removed:
        # nothing was registered under this uuid: nothing to report
        return removed
    LOG.info('Deleted client: %s', removed)
    return removed
def __exit__(self, etype, value, trace):
    """
    Remove the tempdir on a clean exit, unless preserve is set

    The directory is kept (and its path logged) when the block raised or
    when self.preserve is truthy. Exceptions are never suppressed.

    :etype: exception type
    :value: exception value
    :trace: exception traceback
    """
    if not etype and not self.preserve:
        shutil.rmtree(self.tempdir)
        return
    LOG.info('Preserving tempdir: %s', self.tempdir)
def run_iteration(self, global_state_enc):
    """
    update global state and run one iteration of the user program

    :global_state_enc: encoded global state; element [1] holds the values
    :returns: whatever py_mod.run_iteration returns for this iteration
    """
    LOG.info("Running iteration")
    # FIXME move into py_mod
    from array import array as double_array
    # store the state as an array of C doubles (typecode 'd')
    self.global_state = double_array('d', global_state_enc[1])
    return self.py_mod.run_iteration(
        self.global_params,
        self.data_count,
        self.global_state,
        self.pinned_memory,
        self.dataset,
    )
def load_data(self, dataset_enc):
    """
    load dataset and pin it for GPU use

    :dataset_enc: encoded data as [data_count, records]
    """
    LOG.info('in load_data')
    # NOTE: only the header is logged; the record itself is not printed
    LOG.info('first line of data:')

    raw_records = dataset_enc[1]
    self.dataset = self.py_mod.to_array(raw_records)
    self.data_count = dataset_enc[0]
    LOG.info('converted to array')

    pinned = self.prog.pin_gpu_memory(self.dataset)
    self.pinned_memory = pinned
    LOG.info('data count %i', self.data_count)
    LOG.info('pinned data')
def shutdown_all_clients(max_wait=5, wait_interval=0.2):
    """
    shut down all clients because the server is terminating

    Sends '/shutdown' to every client, then polls the server state until no
    clients remain registered or the wait budget is used up.

    :max_wait: max time to wait for clients to shutdown before returning
    :wait_interval: poll interval to check if all clients have terminated
    """
    LOG.info('Instructing all clients to shutdown')
    with server.state_access() as s:
        s.get_all('/shutdown')

    # poll max_wait / wait_interval times, sleeping before each check
    for _ in range(int(max_wait / wait_interval)):
        time.sleep(wait_interval)
        # hold the state only long enough to read the client count
        with server.state_access() as s:
            client_count = len(s.clients)
        if client_count == 0:
            LOG.info('All clients terminated')
            break
    else:
        # for-else: reached only when the loop was never broken out of
        # Logger.warn is a deprecated alias; use warning like check_gpus does
        LOG.warning('Not all clients terminated, shutting down anyway')
def handle_event_register_prog(event):
    """
    handle 'register_prog' event on the server

    Stores the program data, builds it for the server, forwards the
    registration to all clients and waits (up to 10 min) for them to answer
    before recording the program in the server state.
    event data must include 'name', 'checksum' and 'data'

    :event: event to handle
    :returns: event result data if event successfully handled
              (the program properties)
    :raises: Exception: if error occurs handling event
    """
    data = event.data['data']
    name = event.data['name']
    checksum = event.data['checksum']

    # set once the combined callback for all client posts has fired
    wakeup_ev = threading.Event()

    def multi_callback_func(event_props):
        wakeup_ev.set()

    def callback_func(client, event_props):
        if event_props['status'] != events.EventStatus.SUCCESS.value:
            raise ValueError('{}: failed to register program'.format(client))
        client.registered_progs.append(checksum)

    with server.state_access() as s:
        user_progs_dir = s.user_progs_dir
        all_hardware = s.all_clients_hardware

    # each program lives in its own directory: <user_progs_dir>/<checksum>
    prog_dir = os.path.join(user_progs_dir, checksum)
    data_file = os.path.join(prog_dir, 'data.json')
    os.mkdir(prog_dir)
    with open(data_file, 'w') as fp:
        fp.write(data)

    program = user_prog.UserProg(
        name, checksum, data_file, all_hardware, build_dir=prog_dir)
    program.build_for_server()

    # forward a copy so the incoming event data is left untouched
    post_data = event.data.copy()
    post_data['send_remote_event'] = True
    with server.state_access() as s:
        s.post_all('/programs', post_data, callback_func=callback_func,
                   multi_callback_func=multi_callback_func)

    # NOTE: timeout for registering program on all nodes set to 10 min
    # Event.wait returns False on timeout; report it instead of treating a
    # timeout exactly like success (registration still proceeds best-effort)
    if not wakeup_ev.wait(timeout=600):
        LOG.warning(
            'Timed out waiting for all clients to register program: %s',
            program)

    LOG.info('Registered user program: %s', program)
    with server.state_access() as s:
        s.registered_progs[checksum] = program
    return program.properties
def handle_event_init_runtime(event):
    """
    handle 'init_runtime' event

    Creates a new runtime for an already registered program and loads the
    global params and dataset into it.
    event data must include:
        - 'runtime_id',
        - 'dataset_enc',
        - 'checksum',
        - 'global_params_enc'

    :event: event to handle
    :returns: event result (empty dict)
    """
    # same lookup order as the list above
    required = ('runtime_id', 'dataset_enc', 'checksum', 'global_params_enc')
    runtime_id, dataset_enc, prog_checksum, global_params_enc = [
        event.data[field] for field in required]

    # only the program lookup needs the shared client state
    with client.client_access() as state:
        program = state.user_programs[prog_checksum]

    runtime = program.get_new_program_runtime(runtime_id)
    runtime.prepare_datastructures(global_params_enc)
    runtime.load_data(dataset_enc)

    LOG.info('Loaded client program instance')
    return {}
def partition_data(self, data):
    """
    load dataset and partition among clients

    Every client gets up to split_size items from the split generator, or
    fewer if the rough GPU memory budget or the data runs out first. Each
    partition is stored in self.client_datasets as [item_count, items].

    :data: data
    """
    client_uuids = list(self.hardware.keys())
    client_count = len(client_uuids)
    LOG.debug("data size %i", sys.getsizeof(data))
    # FIXME this is a really rough estimate as the final calculation is done
    # after casting to double
    data_generator = self.py_mod.split_data(data)
    LOG.info(self.global_params)
    split_size = self.global_params[0] // client_count + 1
    LOG.debug("split size %i", split_size)

    exhausted = object()  # sentinel: the generator has no more items
    for client_uuid in client_uuids:
        LOG.info("Splitting data")
        LOG.info("global_params %s", self.global_params)
        # FIXME use hardware scan to discover GPU mem size
        # currently rounded slightly down to avoid overflowing in loop
        # 8G gpu ram size would be 8589934592
        # the params size is subtracted from the budget up front
        mem_budget = 8500000000 - sys.getsizeof(self.global_params)

        chunks = []
        while len(chunks) < split_size and mem_budget > 0:
            chunk = next(data_generator, exhausted)
            if chunk is exhausted:
                break
            mem_budget -= sys.getsizeof(chunk)
            chunks.append(chunk)

        self.client_datasets[client_uuid] = [len(chunks), chunks]

    self._initialize_global_state()
self.user_progs_dir = os.path.join(self.tmpdir, 'user_progs_server') os.mkdir(self.user_progs_dir) def register_client(self, hardware, client_ip, client_port): """ register a client with the server :hardware: hardware info dict :client_ip: addr of client :client_port: port number for client :returns: client uuid """ client_uuid = util.hex_uuid() url = 'http://{}:{}'.format(client_ip, client_port) self.clients[client_uuid] = ClientState(client_uuid, hardware, url) <<<<<<< HEAD LOG.info('Registered client: %s', self.clients[client_uuid]) return client_uuid def get_all(self, endpoint, params=None, expect_json=True): ======= with remote_event.remote_events_access() as r: r.register_client(client_uuid) self.all_clients_hardware[client_uuid] = hardware LOG.info('Registered client: %s', self.clients[client_uuid]) return client_uuid def unregister_client(self, client_uuid): """ unregister a client with the server :client_uuid: client uuid :returns: client object
client.create_client(args, tmpdir, hardware) >>>>>>> ef9b13b186c1a356f50a36e78ad91a3ccff76392 # automatically find available port client_port = util.get_free_port() # start client api server call = functools.partial( client.APP.run, debug=False, host='0.0.0.0', port=client_port) thread = threading.Thread(target=call) thread.daemon = True thread.start() # register with server with client.client_access() as c: c.register(client_port) # start client worker worker = client_worker.ClientWorker() LOG.info('starting client worker') try: worker.run() except queue.Empty: with client.client_access() as c: c.shutdown() return 0 def run_server(args, tmpdir): """ entrypoint for server :args: parsed cmdline args :tmpdir: temporary directory :returns: 0 on success """
def client_item(client_id): """ GET,DELETE /clients/<client_id>: query clients :client_id: client uuid :returns: flask response """ if request.method == 'GET': with server.state_access() as state: client = state.clients.get(client_id) return (respond_json(client.properties) if client else respond_error(404)) elif request.method == 'DELETE': with server.state_access() as state: <<<<<<< HEAD res = state.clients.pop(client_id, None) LOG.info('Deleted client: %s', res) return Response("ok") if res is not None else respond_error(404) ======= client = state.unregister_client(client_id) return Response("ok") if client is not None else respond_error(404) @APP.route('/programs', methods=['GET', 'POST']) def programs(): """ GET,POST /programs: register or list programs :returns: flask response """ if request.method == 'POST': event_data = request.get_json() if not all(n in event_data for n in ('name', 'data', 'checksum')):
def handle_event_run_program(event):
    """
    handle 'run_program' event

    Runs a registered program across all clients in three stages:
    runtime init (one post per client with its data partition), iterations
    (repeated until runtime.done) and cleanup.
    event data must include 'dataset_enc', 'checksum' and
    'global_params_enc'

    :event: event to handle
    :returns: program result: dict with 'end_aggregate' and
              'end_global_state'
    :raises: Exception: if error occurs or invalid request
    """
    runtime_id = util.hex_uuid()
    dataset_enc = event.data['dataset_enc']
    prog_checksum = event.data['checksum']
    global_params_enc = event.data['global_params_enc']

    init_path = os.path.join('/runtimes', prog_checksum, runtime_id)
    iterate_path = os.path.join(init_path, 'iterate')
    cleanup_path = os.path.join(init_path, 'cleanup')

    # set by the multi callback once every client answered the current stage
    wakeup_ev = threading.Event()

    def multi_callback_wakeup(event_props):
        wakeup_ev.set()

    def wait_for_clients(timeout, stage):
        # Event.wait returns False on timeout; report it rather than carry
        # on silently, then re-arm the event for the next stage
        if not wakeup_ev.wait(timeout=timeout):
            LOG.warning(
                'Timed out after %is waiting for clients during %s',
                timeout, stage)
        wakeup_ev.clear()

    def runtime_init_callback(client, event_props):
        if event_props['status'] != events.EventStatus.SUCCESS.value:
            raise ValueError('{}: error on prog runtime init'.format(client))

    with server.state_access() as s:
        program = s.registered_progs[prog_checksum]
    if not program.ready:
        raise ValueError('cannot run program, not ready')

    runtime = program.get_new_server_runtime(runtime_id)
    runtime.prepare_datastructures(global_params_enc)
    runtime.partition_data(dataset_enc)

    # stage 1: send each client its own partition
    runtime_init_remote_event_ids = []
    partitions = runtime.dataset_partitions_encoded
    for client_uuid, client_dataset_enc in partitions.items():
        data = {
            'runtime_id': runtime_id,
            'checksum': prog_checksum,
            'dataset_enc': client_dataset_enc,
            'global_params_enc': global_params_enc,
            'send_remote_event': True,
        }
        with server.state_access() as s:
            c = s.clients[client_uuid]
            res = c.post(init_path, data, callback_func=runtime_init_callback)
        runtime_init_remote_event_ids.append(res['event_id'])

    # one combined callback wakes this handler once all init events are done
    with remote_event.remote_events_access() as r:
        r.register_multi_callback(
            runtime_init_remote_event_ids, multi_callback_wakeup)
    wait_for_clients(300, 'runtime init')
    LOG.info('Runtime initialized for user program: %s', program)

    # client callbacks all aggregate into the same runtime object
    aggregation_lock = threading.Lock()

    def run_iteration_callback(client, event_props):
        if event_props['status'] != events.EventStatus.SUCCESS.value:
            raise ValueError('{}: error running prog iteration'.format(client))
        with aggregation_lock:
            runtime.aggregate(event_props['result']['aggregation_result_enc'])

    # stage 2: iterate until the runtime reports it is done
    runtime.reset_aggregation_result()
    iteration_count = 0
    while True:
        post_data = {
            'runtime_id': runtime_id,
            'checksum': prog_checksum,
            'global_state_enc': runtime.global_state_encoded,
            'send_remote_event': True,
        }
        with server.state_access() as s:
            s.post_all(
                iterate_path, post_data,
                callback_func=run_iteration_callback,
                multi_callback_func=multi_callback_wakeup)
        wait_for_clients(600, 'iteration')
        runtime.update_global_state()
        runtime.reset_aggregation_result()
        LOG.debug('Completed iteration for user program: %s', program)
        iteration_count += 1
        if runtime.done:
            break

    # stage 3: let the clients release their runtime
    LOG.info('Cleaning up...')

    def runtime_cleanup_callback(client, event_props):
        if event_props['status'] != events.EventStatus.SUCCESS.value:
            raise ValueError(
                '{}: error on prog runtime clean up'.format(client))

    post_data = {
        'runtime_id': runtime_id,
        'checksum': prog_checksum,
        'send_remote_event': True,
    }
    with server.state_access() as s:
        s.post_all(
            cleanup_path, post_data,
            callback_func=runtime_cleanup_callback,
            multi_callback_func=multi_callback_wakeup)
    wait_for_clients(60, 'cleanup')

    LOG.info('Finished running user program: %s %i', program, iteration_count)
    return {
        'end_aggregate': runtime.top_level_aggregate_encoded,
        'end_global_state': runtime.global_state_encoded,
    }