def main():
    """Run four watchpoint scenarios against a sync-mode AlexNet dump.

    The scenarios run in order on a single ``DbgServices`` backend:

    1. add a MIN_LT watchpoint on the conv3 output and expect exactly one hit;
    2. remove it and expect no hits;
    3. add a MIN_LT watchpoint with a -1000.0 threshold, expect no hits,
       then remove it;
    4. add a CHANGE_TOO_LARGE watchpoint on ``fc3.bias`` and expect exactly
       one hit at iteration 3.

    Unexpected results are reported with ``print``; nothing is returned.
    """
    # NOTES:
    # -> watch_condition=6 is MIN_LT
    # -> watch_condition=18 is CHANGE_TOO_LARGE
    min_lt = 6
    change_too_large = 18

    conv3_output = ("Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/"
                    "Conv2D-op308")
    fc3_bias = ("Default/network-WithLossCell/_backbone-AlexNet/fc3-Dense/"
                "Parameter[6]_11/fc3.bias")

    def node_attrs(is_parameter):
        # A fresh dict per call, so every watchpoint gets its own mapping.
        return {"device_id": [0],
                "root_graph_id": [0],
                "is_parameter": is_parameter}

    backend = d.DbgServices(
        dump_file_path="/home/jtzanaka/dumps/sync_trans_false/032421/alexnet")
    backend.initialize(net_name="Network Name goes here!", is_sync_mode=True)

    # test 1: watchpoint set and hit (watch_condition=6)
    hit_threshold = d.Parameter(name="param", disabled=False, value=0.0)
    backend.add_watchpoint(
        watchpoint_id=1,
        watch_condition=min_lt,
        check_node_list={conv3_output: node_attrs(False)},
        parameter_list=[hit_threshold])
    hits_after_add = backend.check_watchpoints(iteration=2)
    if len(hits_after_add) != 1:
        print("ERROR -> test 1: watchpoint set but not hit just once")
    print_watchpoint_hits(hits_after_add, 1)

    # test 2: watchpoint remove and ensure it's not hit
    backend.remove_watchpoint(watchpoint_id=1)
    hits_after_remove = backend.check_watchpoints(iteration=2)
    if hits_after_remove:
        print("ERROR -> test 2: watchpoint removed but hit")

    # test 3: watchpoint set and not hit, then remove
    miss_threshold = d.Parameter(name="param", disabled=False, value=-1000.0)
    backend.add_watchpoint(
        watchpoint_id=2,
        watch_condition=min_lt,
        check_node_list={conv3_output: node_attrs(False)},
        parameter_list=[miss_threshold])
    hits_low_threshold = backend.check_watchpoints(iteration=2)
    if hits_low_threshold:
        print("ERROR -> test 3: watchpoint set but not supposed to be hit")
    backend.remove_watchpoint(watchpoint_id=2)

    # test 4: weight change watchpoint set and hit
    update_ratio = d.Parameter(
        name="abs_mean_update_ratio_gt", disabled=False, value=0.0)
    epsilon = d.Parameter(name="epsilon", disabled=True, value=0.0)
    backend.add_watchpoint(
        watchpoint_id=3,
        watch_condition=change_too_large,
        check_node_list={fc3_bias: node_attrs(True)},
        parameter_list=[update_ratio, epsilon])
    hits_weight_change = backend.check_watchpoints(iteration=3)
    if len(hits_weight_change) != 1:
        print("ERROR -> test 4: watchpoint weight change set but not hit just once")
    print_watchpoint_hits(hits_weight_change, 4)
def main():
    """Run three watchpoint scenarios against an async-mode AlexNet dump.

    The scenarios run in order on a single ``DbgServices`` backend:

    1. add a MIN_LT watchpoint on the conv3 output and expect exactly one hit;
    2. remove it and expect no hits;
    3. add a MIN_LT watchpoint with a -1000.0 threshold, expect no hits,
       then remove it.

    Unexpected results are reported with ``print``; nothing is returned.
    """
    # NOTES:
    # -> watch_condition=6 is MIN_LT
    # -> watch_condition=18 is CHANGE_TOO_LARGE
    min_lt = 6

    conv3_output = ("Default/network-TrainOneStepCell/network-WithLossCell/"
                    "_backbone-AlexNet/conv3-Conv2d/Conv2D-op169")

    def node_attrs():
        # A fresh dict per call, so every watchpoint gets its own mapping.
        return {"device_id": [0],
                "root_graph_id": [1],
                "is_parameter": False}

    backend = d.DbgServices(
        dump_file_path="/opt/nvme2n1/j00455527/dumps/async_sink_true/032421")
    backend.initialize(net_name="alexnet", is_sync_mode=False)

    # test 1: watchpoint set and hit (watch_condition=6)
    hit_threshold = d.Parameter(name="param", disabled=False, value=0.0)
    backend.add_watchpoint(
        watchpoint_id=1,
        watch_condition=min_lt,
        check_node_list={conv3_output: node_attrs()},
        parameter_list=[hit_threshold])
    hits_after_add = backend.check_watchpoints(iteration=2)
    if len(hits_after_add) != 1:
        print("ERROR -> test 1: watchpoint set but not hit just once")
    print_watchpoint_hits(hits_after_add, 1)

    # test 2: watchpoint remove and ensure it's not hit
    backend.remove_watchpoint(watchpoint_id=1)
    hits_after_remove = backend.check_watchpoints(iteration=2)
    if hits_after_remove:
        print("ERROR -> test 2: watchpoint removed but hit")

    # test 3: watchpoint set and not hit, then remove
    miss_threshold = d.Parameter(name="param", disabled=False, value=-1000.0)
    backend.add_watchpoint(
        watchpoint_id=2,
        watch_condition=min_lt,
        check_node_list={conv3_output: node_attrs()},
        parameter_list=[miss_threshold])
    hits_low_threshold = backend.check_watchpoints(iteration=2)
    if hits_low_threshold:
        print("ERROR -> test 3: watchpoint set but not supposed to be hit")
    backend.remove_watchpoint(watchpoint_id=2)
def main():
    """Read three tensors from a sync-mode AlexNet dump and print them.

    The tensors requested, all at iteration 2 on device 0 / root graph 0:
    a parameter (``conv2.bias``), an operator output at slot 0, and an
    operator output at slot 1. The results are passed to
    ``print_read_tensors``; nothing is returned.
    """
    backend = d.DbgServices(
        dump_file_path="/opt/nvme2n1/j00455527/dumps/sync_trans_false/032421/alexnet")
    backend.initialize(net_name="Network Name goes here!", is_sync_mode=True)

    # (node_name, slot, is_parameter) for each tensor to read, in request order.
    requests = (
        # parameter
        ("Default/network-WithLossCell/_backbone-AlexNet/conv2-Conv2d/conv2.bias",
         0, True),
        # output tensor with zero slot
        ("Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op168",
         0, False),
        # output tensor with non-zero slot
        ("Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op346",
         1, False),
    )

    tensor_info = [
        d.TensorInfo(node_name=node, slot=slot_index, iteration=2,
                     device_id=0, root_graph_id=0, is_parameter=param_flag)
        for node, slot_index, param_flag in requests
    ]

    tensor_data = backend.read_tensors(tensor_info)
    print_read_tensors(tensor_info, tensor_data)
def test_sync_trans_read_tensors():
    """Read three tensors from the sync-mode GPU dump and compare the output.

    Requests a parameter (``conv2.bias``), an operator output at slot 0 and
    an operator output at slot 1, all at iteration 2 on device 0 / root
    graph 0. The results are handed to ``print_read_tensors`` and the test
    then asserts on ``compare_actual_with_expected(test_name)``.
    """
    backend = d.DbgServices(
        dump_file_path="../data/dump/gpu_dumps/sync_trans_true/alexnet")
    backend.initialize(net_name="Network Name goes here!", is_sync_mode=True)

    # (node_name, slot, is_parameter) for each tensor to read, in request order.
    requests = (
        # parameter
        ("Default/network-WithLossCell/_backbone-AlexNet/conv2-Conv2d/conv2.bias",
         0, True),
        # output tensor with zero slot
        ("Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op308",
         0, False),
        # output tensor with non-zero slot
        ("Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op300",
         1, False),
    )

    tensor_info = [
        d.TensorInfo(node_name=node, slot=slot_index, iteration=2,
                     device_id=0, root_graph_id=0, is_parameter=param_flag)
        for node, slot_index, param_flag in requests
    ]

    tensor_data = backend.read_tensors(tensor_info)
    print_read_tensors(tensor_info, tensor_data)
    assert compare_actual_with_expected(test_name)
def main():
    """Read two operator outputs from an async-mode AlexNet dump and print them.

    Both tensors are requested at iteration 2 on device 0 / root graph 1:
    one at slot 0 and one at slot 1. The results are passed to
    ``print_read_tensors``; nothing is returned.
    """
    backend = d.DbgServices(
        dump_file_path="/opt/nvme2n1/j00455527/dumps/async_sink_true/032421")
    backend.initialize(net_name="alexnet", is_sync_mode=False)

    # (node_name, slot) for each tensor to read, in request order.
    requests = (
        # output tensor with zero slot
        ("Default/network-TrainOneStepCell/network-WithLossCell/_backbone-AlexNet/"
         "conv3-Conv2d/Conv2D-op169", 0),
        # output tensor with non-zero slot
        ("Default/network-TrainOneStepCell/network-WithLossCell/_backbone-AlexNet/"
         "ReLUV2-op348", 1),
    )

    tensor_info = [
        d.TensorInfo(node_name=node, slot=slot_index, iteration=2,
                     device_id=0, root_graph_id=1, is_parameter=False)
        for node, slot_index in requests
    ]

    tensor_data = backend.read_tensors(tensor_info)
    print_read_tensors(tensor_info, tensor_data)
def test_sync_trans_false_watchpoints():
    """Run four watchpoint scenarios on the sync-mode GPU dump and compare output.

    Results are written to ``test_name + ".expected"`` when
    ``GENERATE_GOLDEN`` is truthy, otherwise to ``test_name + ".actual"``.
    The scenarios run in order on a single ``DbgServices`` backend:

    1. add a MIN_LT watchpoint on the conv3 output and expect exactly one hit;
    2. remove it and expect no hits;
    3. add a MIN_LT watchpoint with a -1000.0 threshold, expect no hits,
       then remove it;
    4. add a CHANGE_TOO_LARGE watchpoint on ``fc3.bias`` and expect exactly
       one hit at iteration 3.

    Unexpected results are recorded as ``ERROR`` lines in the output file.
    Finally the test asserts on ``compare_actual_with_expected(test_name)``.
    """
    output_suffix = ".expected" if GENERATE_GOLDEN else ".actual"

    conv3_output = ("Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/"
                    "Conv2D-op308")
    fc3_bias = ("Default/network-WithLossCell/_backbone-AlexNet/fc3-Dense/"
                "Parameter[6]_11/fc3.bias")

    # The context manager guarantees the output file is closed even when a
    # debugger call raises part-way through; the original only closed it on
    # the success path.
    with open(test_name + output_suffix, "w") as f_write:
        debugger_backend = d.DbgServices(
            dump_file_path="../data/dump/gpu_dumps/sync_trans_false/alexnet")
        _ = debugger_backend.initialize(net_name="Alexnet", is_sync_mode=True)

        # NOTES:
        # -> watch_condition=6 is MIN_LT
        # -> watch_condition=18 is CHANGE_TOO_LARGE

        # test 1: watchpoint set and hit (watch_condition=6)
        param1 = d.Parameter(name="param", disabled=False, value=0.0)
        _ = debugger_backend.add_watchpoint(
            watchpoint_id=1,
            watch_condition=6,
            check_node_list={conv3_output: {"device_id": [0],
                                            "root_graph_id": [0],
                                            "is_parameter": False}},
            parameter_list=[param1])
        watchpoint_hits_test_1 = debugger_backend.check_watchpoints(iteration=2)
        if len(watchpoint_hits_test_1) != 1:
            f_write.write(
                "ERROR -> test 1: watchpoint set but not hit just once\n")
        print_watchpoint_hits(watchpoint_hits_test_1, 1, f_write)

        # test 2: watchpoint remove and ensure it's not hit
        _ = debugger_backend.remove_watchpoint(watchpoint_id=1)
        watchpoint_hits_test_2 = debugger_backend.check_watchpoints(iteration=2)
        if watchpoint_hits_test_2:
            f_write.write("ERROR -> test 2: watchpoint removed but hit\n")

        # test 3: watchpoint set and not hit, then remove
        param2 = d.Parameter(name="param", disabled=False, value=-1000.0)
        _ = debugger_backend.add_watchpoint(
            watchpoint_id=2,
            watch_condition=6,
            check_node_list={conv3_output: {"device_id": [0],
                                            "root_graph_id": [0],
                                            "is_parameter": False}},
            parameter_list=[param2])
        watchpoint_hits_test_3 = debugger_backend.check_watchpoints(iteration=2)
        if watchpoint_hits_test_3:
            f_write.write(
                "ERROR -> test 3: watchpoint set but not supposed to be hit\n")
        _ = debugger_backend.remove_watchpoint(watchpoint_id=2)

        # test 4: weight change watchpoint set and hit
        param_abs_mean_update_ratio_gt = d.Parameter(
            name="abs_mean_update_ratio_gt", disabled=False, value=0.0)
        param_epsilon = d.Parameter(name="epsilon", disabled=True, value=0.0)
        _ = debugger_backend.add_watchpoint(
            watchpoint_id=3,
            watch_condition=18,
            check_node_list={fc3_bias: {"device_id": [0],
                                        "root_graph_id": [0],
                                        "is_parameter": True}},
            parameter_list=[param_abs_mean_update_ratio_gt, param_epsilon])
        watchpoint_hits_test_4 = debugger_backend.check_watchpoints(iteration=3)
        if len(watchpoint_hits_test_4) != 1:
            f_write.write(
                "ERROR -> test 4: watchpoint weight change set but not hit just once\n")
        print_watchpoint_hits(watchpoint_hits_test_4, 4, f_write)

    # The file is closed (and flushed) here, before its contents are compared.
    assert compare_actual_with_expected(test_name)