def test_compare_tensor_value(self, app_client):
    """Compare a tensor's value against its previous step and check the fixture."""
    node_name = 'Default/args0'
    with self._debugger_client.get_thread_instance():
        check_state(app_client)
        # advance two steps so a previous value exists to compare against
        get_request_result(app_client, 'control', {'mode': 'continue', 'steps': 2})
        check_state(app_client)
        get_request_result(app_client=app_client, url='tensor-history',
                           body_data={'name': node_name})
        res = get_request_result(app_client=app_client, url='poll-data',
                                 body_data={'pos': 0}, method='get')
        assert res.get('receive_tensor', {}).get('node_name') == node_name
        # fetch the comparison for output slot 0 and validate it
        body_data = {
            'name': node_name + ':0',
            'detail': 'data',
            'shape': quote('[:, :]'),
            'tolerance': 1
        }
        send_and_compare_result(app_client, 'tensor-comparisons', body_data,
                                'compare_tensors.json', method='get')
        send_terminate_cmd(app_client)
def test_pause(self, app_client):
    """Pause a training session started with an unbounded step count."""
    expect_reply = {
        'metadata': {
            'state': 'sending',
            'enable_recheck': False
        }
    }
    with self._debugger_client.get_thread_instance():
        check_state(app_client)
        # launch the run with steps=-1 so it keeps executing
        res = get_request_result(app_client, 'control',
                                 {'mode': 'continue', 'steps': -1})
        assert res == expect_reply
        # wait until the backend reports 'running', then request a pause
        check_state(app_client, 'running')
        res = get_request_result(app_client, 'control', {'mode': 'pause'})
        assert res == expect_reply
        send_terminate_cmd(app_client)
def test_update_watchpoint(self, app_client):
    """Unwatch one leaf node of a watchpoint and verify via the search API."""
    watch_point_id = 1
    leaf_node_name = 'Default/optimizer-Momentum/Parameter[18]_7/moments.fc3.bias'
    with self._debugger_client.get_thread_instance():
        check_state(app_client)
        condition = {
            'id': 'tensor_too_large',
            'params': [{'name': 'max_gt', 'value': 1.0}]
        }
        create_watchpoint(app_client, condition, watch_point_id)
        # remove the leaf node from the watch list (mode=1)
        update_data = {
            'watch_point_id': watch_point_id,
            'watch_nodes': [leaf_node_name],
            'mode': 1
        }
        get_request_result(app_client, 'update-watchpoint', update_data)
        # the search result should now show the node as unwatched
        search_data = {
            'name': leaf_node_name,
            'watch_point_id': watch_point_id
        }
        send_and_compare_result(app_client, 'search', search_data,
                                'search_unwatched_leaf_node.json', method='get')
        send_terminate_cmd(app_client)
def test_create_and_delete_watchpoint(self, app_client):
    """Create six watchpoints, delete the fourth, and check the list."""
    with self._debugger_client.get_thread_instance():
        check_state(app_client)
        conditions = [
            {'id': 'tensor_too_large', 'params': [{'name': 'max_gt', 'value': 1.0}]},
            {'id': 'tensor_too_small', 'params': [{'name': 'max_lt', 'value': -1.0}]},
            {'id': 'tensor_too_large', 'params': [{'name': 'min_gt', 'value': 1e+32}]},
            {'id': 'tensor_too_small', 'params': [{'name': 'min_lt', 'value': -1e+32}]},
            {'id': 'tensor_too_large', 'params': [{'name': 'mean_gt', 'value': 0}]},
            {'id': 'tensor_too_small', 'params': [{'name': 'mean_lt', 'value': 0}]}
        ]
        # watchpoint ids are assigned sequentially starting at 1
        for watch_point_id, condition in enumerate(conditions, start=1):
            create_watchpoint(app_client, condition, watch_point_id)
        # remove only the fourth watchpoint
        get_request_result(app_client, 'delete-watchpoint', {'watch_point_id': 4})
        # the remaining watchpoints should match the stored fixture
        body_data = {'mode': 'watchpoint'}
        expect_file = 'create_and_delete_watchpoint.json'
        if self.save_results:
            send_and_save_result(app_client, 'retrieve', body_data, expect_file)
        send_and_compare_result(app_client, 'retrieve', body_data, expect_file)
        send_terminate_cmd(app_client)
def test_recheck(self, app_client):
    """Recheck is rejected while disabled and accepted once re-enabled."""
    with self._debugger_client.get_thread_instance():
        create_watchpoint_and_wait(app_client)
        # nothing changed yet, so recheck must answer HTTP 400
        get_request_result(app_client, 'recheck', {}, method='post',
                           expect_code=400)
        # creating another watchpoint makes a recheck legal again
        condition = {'id': 'tensor_too_large',
                     'params': [{'name': 'max_gt', 'value': 1.0}]}
        create_watchpoint(app_client, condition, 2)
        res = get_request_result(app_client, 'recheck', {}, method='post')
        assert res['metadata']['enable_recheck'] is False
        send_terminate_cmd(app_client)
def test_compare_tensor_value(self, app_client):
    """Compare a tensor's value across steps (rank-aware) against the fixture."""
    node_name = 'Default/args0'
    with self._debugger_client.get_thread_instance():
        check_state(app_client)
        # advance two steps so a previous value exists to compare against
        get_request_result(app_client, 'control', {'mode': 'continue', 'steps': 2})
        check_state(app_client)
        get_request_result(
            app_client=app_client, url='tensor-history',
            body_data={'name': node_name, 'rank_id': 0})
        res = get_request_result(
            app_client=app_client, url='poll-data',
            body_data={'pos': 0}, method='get')
        assert res.get('receive_tensor', {}).get('node_name') == node_name, 'Node name unmatched.'
        # request the comparison for output slot 0 of rank 0
        url = 'tensor-comparisons'
        body_data = {
            'name': node_name + ':0',
            'detail': 'data',
            'shape': quote('[:, :]'),
            'tolerance': 1,
            'rank_id': 0}
        get_request_result(app_client, url, body_data, method='GET')
        # sleep 0.01 second to wait the tensor update.
        time.sleep(0.01)
        res = get_request_result(
            app_client=app_client, url='poll-data',
            body_data={'pos': 0}, method='get')
        assert res.get('receive_tensor', {}).get('node_name') == node_name, 'Node name unmatched.'
        expect_file = 'compare_tensors.json'
        if self.save_results:
            send_and_save_result(app_client, url, body_data, expect_file, method='get')
        send_and_compare_result(app_client, url, body_data, expect_file, method='get')
        send_terminate_cmd(app_client)
def test_retrieve_tensor_graph(self, app_client, body_data, expect_file):
    """Retrieve a tensor graph and compare it with the stored fixture."""
    url = 'tensor-graphs'
    with self._debugger_client.get_thread_instance():
        create_watchpoint_and_wait(app_client)
        get_request_result(app_client, url, body_data, method='GET')
        # sleep 0.01 second to wait the tensor update.
        time.sleep(0.01)
        # the poll channel should announce the tensor we asked for
        res = get_request_result(
            app_client=app_client, url='poll-data',
            body_data={'pos': 0}, method='get')
        assert res.get('receive_tensor', {}).get('tensor_name') == body_data.get('tensor_name')
        if self.save_results:
            send_and_save_result(app_client, url, body_data, expect_file, method='GET')
        send_and_compare_result(app_client, url, body_data, expect_file, method='GET')
        send_terminate_cmd(app_client)
def test_continue_on_gpu(self, app_client, params, expect_file):
    """Run a continue command on a GPU backend with recommended watchpoints on."""
    gpu_debugger_client = MockDebuggerClient(backend='GPU', graph_num=2)
    original_value = settings.ENABLE_RECOMMENDED_WATCHPOINTS
    settings.ENABLE_RECOMMENDED_WATCHPOINTS = True
    try:
        with gpu_debugger_client.get_thread_instance():
            check_state(app_client)
            # issue the continue command extended with the case parameters
            body_data = {'mode': 'continue', **params}
            res = get_request_result(app_client, 'control', body_data)
            assert res == {
                'metadata': {
                    'state': 'sending',
                    'enable_recheck': False
                }
            }
            # once the backend is waiting again, compare the full metadata
            check_state(app_client)
            send_and_compare_result(app_client, 'retrieve', {'mode': 'all'},
                                    expect_file)
            send_terminate_cmd(app_client)
    finally:
        # always restore the global setting, even on failure
        settings.ENABLE_RECOMMENDED_WATCHPOINTS = original_value
def test_retrieve_tensor_value(self, app_client):
    """Retrieve a node's tensor history, then a slice of its value."""
    node_name = 'Default/TransData-op99'
    with self._debugger_client.get_thread_instance():
        check_state(app_client)
        # the first history query replies with an empty placeholder
        url = 'tensor-history'
        body_data = {'name': node_name}
        send_and_compare_result(app_client, url, body_data,
                                'retrieve_empty_tensor_history.json')
        # poll-data announces once the real history has arrived
        res = get_request_result(app_client=app_client, url='poll-data',
                                 body_data={'pos': 0}, method='get')
        assert res.get('receive_tensor', {}).get('node_name') == node_name
        send_and_compare_result(app_client, url, body_data,
                                'retrieve_full_tensor_history.json')
        # finally fetch a slice of the tensor value itself
        value_query = {
            'name': node_name + ':0',
            'detail': 'data',
            'shape': quote('[1, 1:3]')
        }
        send_and_compare_result(app_client, 'tensors', value_query,
                                'retrieve_tensor_value.json', method='get')
        send_terminate_cmd(app_client)
def test_next_node_on_gpu(self, app_client):
    """Continue to a specific node on a GPU backend and check the metadata."""
    gpu_debugger_client = MockDebuggerClient(backend='GPU')
    with gpu_debugger_client.get_thread_instance():
        check_state(app_client)
        # ask the backend to run until the named node is reached
        control_data = {
            'mode': 'continue',
            'level': 'node',
            'name': 'Default/TransData-op99'
        }
        res = get_request_result(app_client, 'control', control_data)
        assert res == {
            'metadata': {
                'state': 'sending',
                'enable_recheck': False
            }
        }
        # once stopped, the full metadata should match the fixture
        check_state(app_client)
        send_and_compare_result(app_client, 'retrieve', {'mode': 'all'},
                                'retrieve_next_node_on_gpu.json')
        send_terminate_cmd(app_client)
def test_create_watchpoint(self, app_client, filter_condition, expect_id):
    """Create a watchpoint over multiple graphs and verify the assigned id."""
    with self._debugger_client.get_thread_instance():
        check_state(app_client)
        # the server assigns ids sequentially; compare with the expected one
        reply = get_request_result(app_client, 'create-watchpoint',
                                   filter_condition)
        assert reply.get('id') == expect_id
        send_terminate_cmd(app_client)
def test_recheck(self, app_client, url, body_data, enable_recheck):
    """Send a watchpoint request and check the resulting enable_recheck flag."""
    with self._debugger_client.get_thread_instance():
        create_watchpoint_and_wait(app_client)
        # the parametrized request may or may not enable a recheck
        reply = get_request_result(app_client, url, body_data, method='post')
        assert reply['metadata']['enable_recheck'] is enable_recheck
        send_terminate_cmd(app_client)
def test_retrieve_tensor_value(self, app_client):
    """Fetch a slice of a tensor value (rank-aware) and check the fixture."""
    node_name = 'Default/TransData-op99'
    with self._debugger_client.get_thread_instance():
        check_state(app_client)
        # request the tensor history first so the value gets loaded
        get_request_result(app_client, 'tensor-history',
                           {'name': node_name, 'rank_id': 0}, method='post')
        get_request_result(app_client=app_client, url='poll-data',
                           body_data={'pos': 0}, method='get')
        url = 'tensors'
        body_data = {
            'name': node_name + ':0',
            'detail': 'data',
            'shape': quote('[1, 1:3]')
        }
        get_request_result(app_client, url, body_data, method='GET')
        # sleep 0.01 second to wait the tensor update.
        time.sleep(0.01)
        res = get_request_result(
            app_client=app_client, url='poll-data',
            body_data={'pos': 0}, method='get')
        assert res.get('receive_tensor', {}).get('node_name') == node_name, 'Node name unmatched.'
        expect_file = 'retrieve_tensor_value.json'
        if self.save_results:
            send_and_save_result(app_client, url, body_data, expect_file, method='get')
        send_and_compare_result(app_client, url, body_data, expect_file, method='get')
        send_terminate_cmd(app_client)
def test_recheck_state(self, app_client, url, body_data, enable_recheck):
    """Send one or more update requests and check the enable_recheck flag."""
    with self._debugger_client.get_thread_instance():
        create_watchpoint_and_wait(app_client)
        # accept either a single payload or a list of payloads
        payloads = body_data if isinstance(body_data, list) else [body_data]
        for payload in payloads:
            reply = get_request_result(app_client, url, payload, method='post')
            assert reply['metadata']['enable_recheck'] is enable_recheck
        send_terminate_cmd(app_client)
def create_watchpoint(app_client, condition, expect_id):
    """Create a watchpoint over a fixed node set and assert the assigned id."""
    watch_nodes = [
        'Default/optimizer-Momentum/Parameter[18]_7',
        'Default/optimizer-Momentum/Parameter[18]_7/moments.fc3.bias',
        'Default/optimizer-Momentum/Parameter[18]_7/moments.fc1.bias',
        'Default/TransData-op99'
    ]
    reply = get_request_result(
        app_client, 'create-watchpoint',
        {'condition': condition, 'watch_nodes': watch_nodes})
    assert reply.get('id') == expect_id
def create_watchpoint_and_wait(app_client):
    """Create a default watchpoint, run two steps, and wait for the hit."""
    check_state(app_client)
    default_condition = {
        'id': 'tensor_too_large',
        'params': [{'name': 'max_gt', 'value': 1.0}]
    }
    create_watchpoint(app_client, condition=default_condition, expect_id=1)
    # run two steps so the watchpoint can be hit
    reply = get_request_result(app_client, 'control',
                               {'mode': 'continue', 'steps': 2})
    assert reply == {'metadata': {'state': 'sending', 'enable_recheck': False}}
    # block until the server has received the watchpoint hit
    check_state(app_client)
def test_retrieve_tensor_history(self, app_client):
    """Retrieve a node's tensor history before and after the values load."""
    node_name = 'Default/TransData-op99'
    with self._debugger_client.get_thread_instance():
        check_state(app_client)
        url = 'tensor-history'
        body_data = {'name': node_name, 'rank_id': 0}
        # the first query replies with an empty history placeholder
        expect_file = 'retrieve_empty_tensor_history.json'
        if self.save_results:
            send_and_save_result(app_client, url, body_data, expect_file)
        send_and_compare_result(app_client, url, body_data, expect_file)
        # poll-data announces once the full history is ready
        res = get_request_result(
            app_client=app_client, url='poll-data',
            body_data={'pos': 0}, method='get')
        assert res.get('receive_tensor', {}).get('node_name') == node_name, 'Node name unmatched.'
        expect_file = 'retrieve_full_tensor_history.json'
        if self.save_results:
            send_and_save_result(app_client, url, body_data, expect_file)
        send_and_compare_result(app_client, url, body_data, expect_file)
        send_terminate_cmd(app_client)