def stopTask(self, request, context):
    """Stop the running task named by ``request.id``.

    The request is logged and recorded as a ``'stop_task'`` user action
    before the task is looked up in ``self._task_dict``.

    Args:
        request: Incoming message; only its ``id`` attribute is read.
        context: Call context supplied by the caller (not used here).

    Returns:
        An ``mxserver_pb2.TaskState`` carrying the task id plus a state
        code / description picked from ``TASK_STATE_CODES`` and
        ``TASK_STATES``:
        codes[1] / states[2] when the id is unknown,
        codes[0] / states[3] when the task was terminated,
        codes[1] / states[4] when terminating it raised.
    """
    task_id = request.id

    def _reply(code_index, desc_index):
        # Every outcome answers with the same message shape; only the
        # indices into the two lookup tables differ.
        return mxserver_pb2.TaskState(
            task_id=task_id,
            state_code=TASK_STATE_CODES[code_index],
            state_desc=TASK_STATES[desc_index])

    self._logger.info(
        'mxnet_service has received a new request to stop the task with id:%s'
        % task_id)
    self._record_user_action(task_id, 'stop_task')

    worker = self._task_dict.get(task_id)
    if worker is None:
        self._logger.warn(
            'mxnet_service can not find a task with id: %s' % task_id)
        return _reply(1, 2)

    try:
        worker.terminate()
        self._logger.warn(
            'mxnet_service has terminated the task with id: %s' % task_id)
        # A terminated task must no longer be tracked.
        del self._task_dict[task_id]
        return _reply(0, 3)
    except BaseException as err:
        self._logger.warn(
            'mxnet_service can not terminate the task with id: %s! Because %s'
            % (task_id, exception_msg(err)))
        return _reply(1, 4)
def query_gpu():
    """Answer a request for local GPU information.

    Returns:
        A response wrapping whatever ``gpu_monitor.query_gpu()`` produced,
        with its ``Content-Type`` header set to ``application/json``.
        If querying or building the response raises, the error is logged
        and ``jsonify([])`` is returned instead.
    """
    mxserver_flask_logger.info(
        'The mxserver_flask_server receives a request to query local GPU infos'
    )
    try:
        gpu_info = gpu_monitor.query_gpu()
        reply = make_response(gpu_info)
        reply.headers['Content-Type'] = 'application/json'
    except BaseException as err:
        mxserver_flask_logger.error(
            'The mxserver_flask_server fails to query local GPU infos! Error message: %s'
            % exception_msg(err))
        return jsonify([])
    return reply
return task_state def __task_state_2_json(task_state): state_id = task_state.id state_code = task_state.state_code state_desc = task_state.state_desc return '{"task_id": "%s", "state_code": "%s", "state_desc": "%s"}' % ( state_id, state_code, state_desc) if __name__ == '__main__': try: if ZkRegister.use_zk(): mxserver_flask_logger.info( 'The mxserver flask server is trying to register to ZooKeeper') zk_register = ZkRegister() zk_register.register_flask_to_zk() if ZkRegister.use_zk(): mxserver_flask_logger.info( 'The mxserver flask server has registered to ZooKeeper') except BaseException as e: mxserver_flask_logger.error( 'The mxserver flask server can not register to ZooKeeper! System exists! ' 'Error message: \n%s' % exception_msg(e)) sys.exit('Failed to register to ZooKeeper') mxserver_flask_logger.info('The mxserver flask server has been started') app.run(host=mxserver_flask_config['host'], port=mxserver_flask_config['port'])
# -*- coding: utf-8 -*-
# @Author: Terence Wu
# @Time: 26/02/18 11:37 AM
#
# Manual check of the stop-task HTTP endpoint: POST the canned stop request
# to STOP_TEST_URL and log the status code and body of the reply.
from requests import post
from test_resources import STOP_TEST_URL, STOP_REQUEST_JSON
from util.logger_generator import get_logger
from util.exception_handler import exception_msg

if __name__ == '__main__':
    log = get_logger('test_stop_request')
    log.info('Begin to test API for deep learning training')
    log.info('Begin to send request to url: %s' % STOP_TEST_URL)
    try:
        reply = post(url=STOP_TEST_URL, json=STOP_REQUEST_JSON)
        log.info('Receive a response')
        log.info("Response's status code: %s" % reply.status_code)
        log.info("Response's content: %s" % reply.content)
    except BaseException as err:
        # Any failure (connection problems included) is only reported.
        log.error('Fail! Error message: %s\n' % exception_msg(err))
def run(self):
    """Run one task end to end: prepare data, build an executor, execute.

    Steps, each reported through ``self._update_task_state``:

    1. Insert an empty progress record for this task into
       ``self._task_progress_recorder``.
    2. Parse ``self._task_desc`` into the training flag, the executor type
       and the executor keyword arguments; fetch the data configuration.
    3. Build the data iterators with ``DataManager``.
    4. Create the executor with ``Executor.create_executor``.
    5. Call ``executor.execute()``.

    A failure in step 3, 4 or 5 is logged, the matching failure state is
    recorded and the method returns without raising.
    """
    self._task_progress_recorder.insert_one({
        'task_id': self._process_id,
        'task_progresses': []
    })
    for_training, exec_type, executor_params_dict = parse_task_desc(
        self._task_desc)
    executor_params_dict['task_id'] = self._process_id
    data_config = get_data_config(self._task_desc)

    # --- Step 3: data preparation -------------------------------------
    self._update_task_state('TASK_BEGIN_PREPARE_DATA')
    try:
        data_manager = DataManager(for_training=for_training,
                                   target=exec_type,
                                   data_config=data_config)
        data_iters = data_manager.prepare_data()
        self._update_task_state('TASK_PREPARE_DATA_DONE')
    except BaseException as e:
        self._update_task_state('TASK_BEGIN_PREPARE_DATA_FAILED')
        excep_msg = exception_msg(e)
        _logger.error(
            'Task_%s\'s DataIter instances creation failed! Because %s' %
            (self._process_id, excep_msg))
        return

    if for_training:
        # Training takes a train iterator and an optional validation
        # iterator; a single-element result means "no validation data".
        executor_params_dict['train_iter'] = data_iters[0]
        executor_params_dict['val_iter'] = (
            None if len(data_iters) == 1 else data_iters[1])
    else:
        executor_params_dict['data_batch_list'] = data_iters

    # --- Step 4: executor creation ------------------------------------
    try:
        executor = Executor.create_executor(for_training=for_training,
                                            exec_type=exec_type,
                                            **executor_params_dict)
        self._update_task_state('TASK_EXECUTOR_CREATION_DONE')
        # Success is reported at INFO level, like the other success
        # messages in this method (it was previously logged as an error).
        _logger.info('Task_%s\'s Executor instances creation done!' %
                     self._process_id)
    except BaseException as e:
        self._update_task_state('TASK_EXECUTOR_CREATION_FAILED')
        _logger.error(
            'Task_%s\'s Executor instances creation failed! Because %s' %
            (self._process_id, exception_msg(e)))
        return

    # --- Step 5: execution --------------------------------------------
    try:
        self._update_task_state('TASK_BEGIN_RUNNING')
        _logger.info('Task_%s running is starting now' % self._process_id)
        executor.execute()
        self._update_task_state('TASK_DONE_SUCCESSFULLY')
        _logger.info('Task_%s running is done successfully' %
                     self._process_id)
    except BaseException as e:
        self._update_task_state('TASK_TERMINATED_BY_INTERNAL_ERROR')
        excep_msg = exception_msg(e)
        _logger.error(
            'Task_%s has been terminated by server internal error! Because %s'
            % (self._process_id, excep_msg))
# Add rcnn package to sys.path sys.path.append(mxserver_mxnet_config['rcnn-path']) print sys.path if __name__ == '__main__': main_logger = get_logger('mxserver_worker_logger') try: if ZkRegister.use_zk(): main_logger.info('The mxserver worker is trying to register to ZooKeeper') zk_register = ZkRegister() zk_register.register_worker_to_zk() if ZkRegister.use_zk(): main_logger.info('The mxserver worker has registered to ZooKeeper') except BaseException as e: main_logger.error('The mxserver worker can not register to ZooKeeper! System exists! Error message: \n%s' % exception_msg(e)) sys.exit('Failed to register to ZooKeeper') task_queue = Queue(int(mxserver_task_queue_config['queue-max-size'])) try: executor_process_manager = ExecutorProcessManager(task_queue=task_queue) executor_process_manager.start() server = grpc.server(futures.ThreadPoolExecutor(max_workers=int(mxserver_rpc_config['max-thread-num']))) mxserver_pb2_grpc.add_MXNetServiceServicer_to_server(MXNetService(task_queue), server) uri = mxserver_rpc_config['host'] + ':' + str(mxserver_rpc_config['port']) server.add_insecure_port(uri) server.start() main_logger.info('The mxserver worker has been started at: %s, waiting for request.' % uri) try: