predict_net_proto = caffe2_pb2.NetDef() with open(PREDICT_NET, "r") as f: predict_net_proto.ParseFromString(f.read()) test_model.net = test_model.net.AppendNet(core.Net(predict_net_proto)) # Add an accuracy feature to the model for convenient reporting during testing accuracy = brew.accuracy(test_model, ['softmax', 'label'], 'accuracy') # ### Run Testing # # At this point, our model is initialized as the saved model from Part 1. We can now run the testing loop and check the accuracy. # In[6]: # Run the param init net to put the trained model info into the workspace workspace.RunNetOnce(test_model.param_init_net) workspace.CreateNet(test_model.net, overwrite=True) # Stat keeper avg_accuracy = 0.0 # Number of test iterations to run here, since the full test set is 10k images and the # batch size is 1, we will run 10000 test batches to cover the entire test set test_iters = 10000 # Main testing loop for i in range(test_iters): workspace.RunNet(test_model.net) acc = workspace.FetchBlob('accuracy') avg_accuracy += acc if (i % 500 == 0) and (i > 0):
def bmuf_process(filestore_dir, process_id, shared_results, nesterov=False):
    """Run one shard of a two-shard BMUF job and publish the fetched blobs.

    A small FC model is parallelized with ``Parallelize_GPU_BMUF`` over two
    GPUs (``[0, 1]`` for process 0, ``[2, 3]`` otherwise).  Momentum,
    parameter and gradient blobs are fetched before and after the global
    parameter-update net and stored as a dict in
    ``shared_results[process_id]``.

    Returns early, without touching ``shared_results``, when there is no GPU
    support or fewer than four CUDA devices.

    Args:
        filestore_dir: Path passed to the ``FileStoreHandlerCreate`` operator.
        process_id: Shard id of this process; also selects the GPU pair.
        shared_results: Mapping that receives this process's result dict.
        nesterov: Forwarded to ``Parallelize_GPU_BMUF``.
    """
    # We need to import caffe2 in every process to initialize CUDA independently.
    from caffe2.python import core, cnn, data_parallel_model, workspace, dyndep
    from caffe2.proto import caffe2_pb2
    dyndep.InitOpsLibrary("@/caffe2/caffe2/distributed:file_store_handler_ops")

    if not workspace.has_gpu_support:
        log.info('No GPU support test is Ignored.')
        return
    if workspace.NumCudaDevices() < 4:
        log.info('Not enough GPU support, test IGNORED')
        return

    model = cnn.CNNModelHelper(order="NHWC", name="test")
    gpu_ids = [2, 3] if process_id != 0 else [0, 1]

    def _model_build_fun(model, loss_scale):
        fc = model.FC(
            "data", "fc", 16, 1,
            ("ConstantFill", {}), ("ConstantFill", {}),
        )
        flat = model.FlattenToVec(fc, "fc_fl")
        activated = model.Sigmoid(flat, "sigm")
        distance = model.SquaredL2Distance([activated, "label"], "sq")
        loss = model.AveragedLoss(distance, "loss")
        return [model.Scale(loss, scale=loss_scale)]

    def _input_builder_fun(model):
        return None

    def _param_update_fun(model):
        iteration = model.Iter("ITER")
        learning_rate = model.net.LearningRate(
            [iteration], "LR", base_lr=(-0.1), policy="fixed",
        )
        one = model.param_init_net.ConstantFill(
            [], "ONE", shape=[1], value=1.0,
        )
        for param in model.GetParams():
            model.WeightedSum(
                [param, one, model.param_to_grad[param], learning_rate],
                param,
            )

    def _generate_data(gpu_devices, process_id):
        np.random.seed(26 + process_id * 10)

        # Each run has same input, independent of number of gpus
        batch_size = 64
        for _ in range(10):
            full_data = np.random.rand(batch_size, 16)
            full_labels = np.round(full_data[:, 0])
            batch_per_device = batch_size // len(gpu_devices)

            for slot, gpu in enumerate(gpu_devices):
                lo = slot * batch_per_device
                hi = lo + batch_per_device
                data = full_data[lo:hi, :].astype(np.float32)
                labels = full_labels[lo:hi].astype(np.float32)
                device = core.DeviceOption(caffe2_pb2.CUDA, gpu)
                with core.DeviceScope(device):
                    workspace.FeedBlob("gpu_{}/data".format(gpu), data)
                    workspace.FeedBlob("gpu_{}/label".format(gpu), labels)

    _generate_data(gpu_ids, process_id)

    store_op = core.CreateOperator(
        "FileStoreHandlerCreate", [], ["store_handler"], path=filestore_dir)
    workspace.RunOperatorOnce(store_op)
    rendezvous = {
        "kv_handler": "store_handler",
        "shard_id": process_id,
        "num_shards": 2,
        "engine": "GLOO",
        "exit_nets": None,
    }

    data_parallel_model.Parallelize_GPU_BMUF(
        model,
        _input_builder_fun,
        _model_build_fun,
        _param_update_fun,
        devices=gpu_ids,
        rendezvous=rendezvous,
        nesterov=nesterov,
    )

    data_parallel_model.RunInitNet(model)

    def _gpu_pid(gpu_id, pid):
        # Map a local GPU index to the device index owned by process `pid`.
        return gpu_id + 2 if pid == 1 else gpu_id

    def _fetch(local_gpu, blob):
        # Fetch `blob` from this process's local GPU number `local_gpu`.
        return workspace.FetchBlob(
            "gpu_{}/{}".format(_gpu_pid(local_gpu, process_id), blob))

    np.testing.assert_equal(
        _fetch(0, "fc_w_v"),
        np.zeros(16).astype(np.float32).reshape(1, 16)
    )

    # Run the algorithm for one iteration to have non-zero params.
    data_parallel_model.RunNet(model, 1)

    # Save iteration momentum and post local update params
    results = {
        'v_b_': _fetch(0, "fc_b_v"),
        'v_w_': _fetch(0, "fc_w_v"),
    }

    workspace.RunNetOnce(model.net)

    results['b_0_'] = _fetch(0, "fc_b")
    results['w_0_'] = _fetch(0, "fc_w")
    results['b_1_'] = _fetch(1, "fc_b")
    results['w_1_'] = _fetch(1, "fc_w")

    # Compute block gradients.
    results['b_g_'] = _fetch(0, "fc_b_g")
    results['w_g_'] = _fetch(0, "fc_w_g")

    workspace.RunNetOnce(model._global_model_param_updates_net)

    #  g_b = (b_0_ + b_1_) / 2 - b_g_
    #  g_w = (w_0_ + w_1_) / 2 - w_g_
    results['v_b'] = _fetch(0, "fc_b_v")
    results['v_w'] = _fetch(0, "fc_w_v")
    results['w_g'] = _fetch(0, "fc_w_g")
    results['b_g'] = _fetch(0, "fc_b_g")
    results['w_0'] = _fetch(0, "fc_w")
    results['b_0'] = _fetch(0, "fc_b")
    results['w_1'] = _fetch(1, "fc_w")
    results['b_1'] = _fetch(1, "fc_b")

    shared_results[process_id] = results
def test_lstm_with_recurrent_attention(
    self,
    encoder_output_length,
    encoder_output_dim,
    decoder_input_length,
    decoder_state_dim,
    batch_size,
    lstm_mem_optim,
    attention_mem_optim,
    gc,
    dc,
):
    """Check the recurrent-attention LSTM operator against its reference.

    Builds ``recurrent.LSTMWithAttention`` under device scope ``gc``, takes
    the last operator of the resulting net, feeds random inputs, compares its
    first six outputs with ``lstm_with_recurrent_attention_reference`` and
    then runs a gradient check for every input except
    ``decoder_input_lengths``.
    """
    with core.DeviceScope(gc):
        model = CNNModelHelper(name="external")
        external_inputs = model.net.AddExternalInputs(
            "encoder_outputs",
            "decoder_inputs",
            "decoder_input_lengths",
            "initial_decoder_hidden_state",
            "initial_decoder_cell_state",
            "initial_attention_weighted_encoder_context",
        )
        (
            encoder_outputs,
            decoder_inputs,
            decoder_input_lengths,
            initial_decoder_hidden_state,
            initial_decoder_cell_state,
            initial_attention_weighted_encoder_context,
        ) = external_inputs
        recurrent.LSTMWithAttention(
            model=model,
            decoder_inputs=decoder_inputs,
            decoder_input_lengths=decoder_input_lengths,
            initial_decoder_hidden_state=initial_decoder_hidden_state,
            initial_decoder_cell_state=initial_decoder_cell_state,
            initial_attention_weighted_encoder_context=(
                initial_attention_weighted_encoder_context),
            encoder_output_dim=encoder_output_dim,
            encoder_outputs=encoder_outputs,
            decoder_input_dim=decoder_state_dim,
            decoder_state_dim=decoder_state_dim,
            scope='external/LSTMWithAttention',
            attention_type=AttentionType.Recurrent,
            lstm_memory_optimization=lstm_mem_optim,
            attention_memory_optimization=attention_mem_optim)
        # The operator under test is the last one appended to the net.
        op = model.net._net.op[-1]
    workspace.RunNetOnce(model.param_init_net)

    def feed_normal(blob, *shape):
        # Feed `blob` with float32 standard-normal noise of the given shape.
        workspace.FeedBlob(
            blob, np.random.randn(*shape).astype(np.float32))

    # This is original decoder_inputs after linear layer
    feed_normal(
        op.input[0],
        decoder_input_length, batch_size, decoder_state_dim * 4,
    )
    feed_normal(
        "external/LSTMWithAttention/encoder_outputs_transposed",
        batch_size, encoder_output_dim, encoder_output_length,
    )
    feed_normal(
        "external/LSTMWithAttention/weighted_encoder_outputs",
        encoder_output_length, batch_size, encoder_output_dim,
    )
    lengths = np.random.randint(
        0, decoder_input_length + 1, size=(batch_size, ))
    workspace.FeedBlob(decoder_input_lengths, lengths.astype(np.int32))
    feed_normal(initial_decoder_hidden_state,
                1, batch_size, decoder_state_dim)
    feed_normal(initial_decoder_cell_state,
                1, batch_size, decoder_state_dim)
    feed_normal(initial_attention_weighted_encoder_context,
                1, batch_size, encoder_output_dim)

    inputs = [workspace.FetchBlob(blob_name) for blob_name in op.input]
    self.assertReferenceChecks(
        device_option=gc,
        op=op,
        inputs=inputs,
        reference=lstm_with_recurrent_attention_reference,
        grad_reference=None,
        output_to_grad=None,
        outputs_to_check=range(6),
    )

    # Gradient-check every input position except the integer lengths blob.
    for position, input_name in enumerate(op.input):
        if input_name == "decoder_input_lengths":
            continue
        self.assertGradientChecks(
            device_option=gc,
            op=op,
            inputs=inputs,
            outputs_to_check=position,
            outputs_with_grads=[0, 4],
            threshold=0.01,
            stepsize=0.001,
        )
def run_model(self, V, gpu_devices):
    """Train a Gather + FC model on ``gpu_devices`` and check gradient sizes.

    Three CPU blobs (``self.vecs``) are copied to each GPU and gathered with
    a shared ``indices`` blob.  After every one of the ten iterations, the
    gradients of the first two gathered blobs on every device must have as
    many rows as ``gpu_0/indices`` has entries.

    Args:
        V: First dimension of the two parameter blobs.
        gpu_devices: GPU ids to parallelize over.
    """

    def input_builder_fun(model):
        return None

    def model_build_fun(model, loss_scale):
        gpu_vecs = []
        for num, vec in enumerate(self.vecs):
            gpu_vec = model.param_init_net.CopyCPUToGPU(
                vec, 'gpuvec_{}'.format(num),
            )
            # The third blob is the FC input, not a trainable parameter.
            if num != 2:
                model.params.append(gpu_vec)
            gpu_vecs.append(gpu_vec)

        gathered = [
            model.net.Gather(
                [gpu_vec, 'indices'],
                ['gpu_vec_gathered_{}'.format(num)],
            )
            for num, gpu_vec in enumerate(gpu_vecs)
        ]
        assert len(gathered) == 3

        weights, bias, fc_input = gathered
        fc = model.net.FC([fc_input, weights, bias], ['fc'])
        _, loss = model.net.SoftmaxWithLoss(
            [fc, 'label'],
            ['ce_loss', 'avg_loss'],
            only_loss=True,
        )
        loss = model.Scale(loss, scale=loss_scale)
        model.net.Print(loss, [], limit=10)
        return [loss]

    def param_update_fun(model):
        ONE = model.param_init_net.ConstantFill(
            [], "ONE", shape=[1], value=1.0,
        )
        LR = model.CopyCPUToGPU(self.LR, "LR")
        for param in model.GetParams():
            param_grad = model.param_to_grad[param]
            if isinstance(param_grad, core.GradientSlice):
                model.net.ScatterWeightedSum(
                    [
                        param,
                        ONE,
                        param_grad.indices,
                        param_grad.values,
                        ONE,
                    ],
                    param,
                )
            else:
                model.WeightedSum([param, ONE, param_grad, LR], param)

    workspace.ResetWorkspace()
    model = cnn.CNNModelHelper(
        order="NHWC",
        name="sparse_test{}".format(gpu_devices),
    )
    batch_size = 32
    batch_per_device = batch_size // len(gpu_devices)

    with core.NameScope("cpu"):
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
            self.ITER = model.Iter("ITER")
            self.LR = model.net.LearningRate(
                [self.ITER], "LR", base_lr=(-0.1), policy="fixed",
            )
            # self.vecs consists of 3 big blobs on which we call Gather:
            # 1) FC weights, shape=(V, 16)
            # 2) FC bias, shape=(V)
            # 3) FC input, shape=(batch_per_device, 16)
            self.vecs = []
            for num in range(2):
                self.vecs.append(
                    model.param_init_net.UniformFill(
                        [], "vec_{}".format(num), shape=[V, 16]))
            self.vecs.append(
                model.param_init_net.UniformFill(
                    [], "vec_2", shape=[batch_per_device, 16]))
            self.ONE_CPU = model.param_init_net.ConstantFill(
                [], "ONE_CPU", shape=[1], value=1.0,
            )

    data_parallel_model.Parallelize_GPU(
        model,
        input_builder_fun=input_builder_fun,
        forward_pass_builder_fun=model_build_fun,
        param_update_builder_fun=param_update_fun,
        devices=gpu_devices,
    )

    # Update the vecs
    with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
        for num, vec in enumerate(self.vecs[:-1]):
            model.CopyGPUToCPU("gpu_0/gpuvec_{}".format(num), vec)

    # Each run has same input, independent of number of gpus
    for iteration in range(10):
        np.random.seed(2603)
        full_indices = np.random.permutation(V)[:batch_size].reshape(
            batch_size)
        full_labels = full_indices[:] % batch_per_device

        for slot, gpu in enumerate(gpu_devices):
            lo = slot * batch_per_device
            hi = lo + batch_per_device
            indices = full_indices[lo:hi].astype(np.int32)
            labels = full_labels[lo:hi].astype(np.int32)

            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, gpu)):
                workspace.FeedBlob("gpu_{}/indices".format(gpu), indices)
                workspace.FeedBlob("gpu_{}/label".format(gpu), labels)

        if iteration == 0:
            workspace.RunNetOnce(model.param_init_net)
            # Force vecs to be same on all runs
            orig_vecs = [
                np.random.rand(V, 16).astype(np.float32),
                np.random.rand(V).astype(np.float32),
                np.random.rand(V, 16).astype(np.float32),
            ]
            for vec, orig_vec in zip(self.vecs, orig_vecs):
                workspace.FeedBlob(vec, orig_vec)
            for gpu in gpu_devices:
                gpu_option = core.DeviceOption(caffe2_pb2.CUDA, gpu)
                for num, orig_vec in enumerate(orig_vecs):
                    workspace.FeedBlob(
                        "gpu_{}/gpuvec_{}".format(gpu, num),
                        orig_vec,
                        device_option=gpu_option,
                    )
            workspace.CreateNet(model.net)

        workspace.RunNet(model.net.Proto().name)

        idx = workspace.FetchBlob('gpu_0/indices')
        for gpu in gpu_devices:
            for num in range(2):
                grad_slice = workspace.FetchBlob(
                    'gpu_{}/gpu_vec_gathered_{}_grad'.format(gpu, num))
                assert len(idx) == len(grad_slice), (
                    'Number of indices {} is not same as number of gradient '
                    'slices {}. This might lead to illegal memory access'.format(
                        len(idx), len(grad_slice)))
def RunWarmup(model):
    """Run the model's net for its warm-up iterations, then broadcast.

    Executes ``model.net`` ``model._warmup_iterations`` times and afterwards
    runs ``model._warmup_broadcast`` once.
    """
    main_net = model.net
    warmup_iterations = model._warmup_iterations
    workspace.RunNet(main_net, warmup_iterations)

    broadcast_net = model._warmup_broadcast
    workspace.RunNetOnce(broadcast_net)
def run_model(self, devices, gpu):
    """Train an LSTM data-parallel model for ten steps (helper for test_equiv).

    Args:
        devices: Device ids to parallelize over.
        gpu: When false, ``Parallelize`` is asked to use CPU devices.

    Returns:
        The fetched ``<device_prefix>_0/partest/i2h_w`` blob after training.
    """

    def input_builder_fun(model):
        return None

    def model_build_fun(model, loss_scale):
        state_shape = [1, self.batch_per_device, self.hidden_dim]
        workspace.FeedBlob(
            core.ScopedBlobReference("seq_lengths"),
            np.array([self.T] * self.batch_per_device, dtype=np.int32))
        model.param_init_net.ConstantFill(
            [], "hidden_init", value=0.0, shape=state_shape)
        model.param_init_net.ConstantFill(
            [], "cell_init", value=0.0, shape=state_shape)

        output, _hidden, _unused, _state = rnn_cell.LSTM(
            model=model,
            input_blob="data",
            seq_lengths="seq_lengths",
            initial_states=("hidden_init", "cell_init"),
            dim_in=self.input_dim,
            dim_out=self.hidden_dim,
            scope="partest",
        )

        # A silly loss function
        distance = model.Sub([output, "target"], "dist")
        loss = model.AveragedLoss(distance, "loss")
        return [model.Scale(loss, "loss_scaled", scale=loss_scale)]

    def param_update_fun(model):
        iteration = model.Iter("ITER")
        learning_rate = model.net.LearningRate(
            [iteration], "LR", base_lr=(-0.1), policy="fixed",
        )
        one = model.param_init_net.ConstantFill(
            [], "ONE", shape=[1], value=1.0,
        )
        for param in model.GetParams():
            model.WeightedSum(
                [param, one, model.param_to_grad[param], learning_rate],
                param,
            )

        assert len(
            model.GetParams()) == len(model.params) // len(model._devices)

    workspace.ResetWorkspace()
    model = cnn.CNNModelHelper(name="recurrent_test{}".format(devices), )

    self.T = 8
    self.batch_size = 64
    self.input_dim = 8
    self.hidden_dim = 31
    self.batch_per_device = self.batch_size // len(devices)

    data_parallel_model.Parallelize(
        model,
        input_builder_fun=input_builder_fun,
        forward_pass_builder_fun=model_build_fun,
        param_update_builder_fun=param_update_fun,
        devices=devices,
        optimize_gradient_memory=True,
        cpu_device=not gpu,
    )

    # Turn every *Fill initializer into a ConstantFill so that
    # everything is deterministic
    for init_op in model.param_init_net.Proto().op:
        if init_op.type.endswith('Fill'):
            init_op.type = 'ConstantFill'

    # Each run has same input, independent of number of gpus
    np.random.seed(20150210)
    for step in range(10):
        full_data = np.random.rand(
            self.T, self.batch_size, self.input_dim)
        full_target = np.random.rand(
            self.T, self.batch_size, self.hidden_dim)

        for slot, device_id in enumerate(devices):
            lo = slot * self.batch_per_device
            hi = lo + self.batch_per_device
            data = full_data[:, lo:hi, :].astype(np.float32)
            targets = full_target[:, lo:hi, :].astype(np.float32)

            device = core.DeviceOption(model._device_type, device_id)
            with core.DeviceScope(device):
                workspace.FeedBlob(
                    "{}_{}/data".format(model._device_prefix, device_id),
                    data)
                workspace.FeedBlob(
                    "{}_{}/target".format(model._device_prefix, device_id),
                    targets)

        if step == 0:
            workspace.RunNetOnce(model.param_init_net)
            workspace.CreateNet(model.net)

        workspace.RunNet(model.net.Proto().name)

    weights_name = "{}_0/partest/i2h_w".format(model._device_prefix)
    return workspace.FetchBlob(weights_name)
def run_model(self, V, gpu_devices, cpu_indices):
    """Train a sparse Gather model on ``gpu_devices`` for ten iterations.

    Args:
        V: Number of rows of the ``vecs`` embedding blob.
        gpu_devices: GPU ids to parallelize over.
        cpu_indices: When true the Gather runs on CPU and ``vecs`` itself is
            the parameter; otherwise ``vecs`` is copied to each GPU.

    Returns:
        ``[vecs, fc_w]`` — the fetched embedding blob (``self.vecs`` or
        ``gpu_0/gpuvecs``) and ``gpu_0/fc_w``.
    """

    def input_builder_fun(model):
        return None

    def model_build_fun(model, loss_scale):
        if not cpu_indices:
            gpu_vecs = model.param_init_net.CopyCPUToGPU(
                self.vecs, "gpuvecs",
            )
            model.params.append(gpu_vecs)
            gathered = model.net.Gather([gpu_vecs, 'indices'], 'gathered')
        else:
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                gathered_cpu = model.net.Gather(
                    [self.vecs, 'indices'], 'gathered_cpu')
            gathered = model.CopyCPUToGPU(gathered_cpu, "gathered")

        flattened = model.Flatten(gathered, "flattened")
        fc = model.FC(
            flattened, "fc", 16 * 16, 1,
            ("ConstantFill", {}), ("ConstantFill", {}),
        )
        flat_fc = model.FlattenToVec(fc, "fc_fl")
        activated = model.Sigmoid(flat_fc, "sigm")
        distance = model.SquaredL2Distance([activated, "label"], "sq")
        loss = model.AveragedLoss(distance, "loss")
        return [model.Scale(loss, scale=loss_scale)]

    def param_update_fun(model):
        ONE = model.param_init_net.ConstantFill(
            [], "ONE", shape=[1], value=1.0,
        )
        LR = model.CopyCPUToGPU(self.LR, "LR")
        for param in model.GetParams():
            param_grad = model.param_to_grad[param]
            if isinstance(param_grad, core.GradientSlice):
                param_momentum = model.param_init_net.ConstantFill(
                    [param],
                    param + '_momentum',
                    value=0.0,
                )
                model.net.SparseMomentumSGDUpdate(
                    [
                        param_grad.values,
                        param_momentum,
                        LR,
                        param,
                        param_grad.indices,
                    ],
                    [param_grad.values, param_momentum, param],
                    momentum=0.1,
                    nesterov=0,
                )
            else:
                model.WeightedSum([param, ONE, param_grad, LR], param)

    workspace.ResetWorkspace()
    model = cnn.CNNModelHelper(
        order="NHWC",
        name="sparse_test{}".format(gpu_devices),
    )

    with core.NameScope("cpu"):
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
            self.ITER = model.Iter("ITER")
            self.LR = model.net.LearningRate(
                [self.ITER], "LR", base_lr=(-0.1), policy="fixed",
            )
            self.vecs = model.param_init_net.UniformFill(
                [], "vecs", shape=[V, 16])
            if cpu_indices:
                model.params.append(self.vecs)
            self.ONE_CPU = model.param_init_net.ConstantFill(
                [], "ONE_CPU", shape=[1], value=1.0,
            )

    data_parallel_model.Parallelize_GPU(
        model,
        input_builder_fun=input_builder_fun,
        forward_pass_builder_fun=model_build_fun,
        param_update_builder_fun=param_update_fun,
        devices=gpu_devices,
    )

    # Update the vecs
    if not cpu_indices:
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
            model.CopyGPUToCPU("gpu_0/gpuvecs", self.vecs)
    else:
        with core.NameScope("cpu"):
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                for param in model.GetParams():
                    param_grad = model.param_to_grad[param]
                    model.ScatterWeightedSum(
                        [
                            param,
                            self.ONE_CPU,
                            param_grad.indices,
                            param_grad.values,
                            self.LR
                        ],
                        self.vecs,
                    )

    np.random.seed(2603)

    # Each run has same input, independent of number of gpus
    batch_size = 64
    for iteration in range(10):
        full_indices = np.random.permutation(V)[:batch_size * 16].reshape(
            batch_size, 16)
        full_labels = full_indices[:, 0] % 2
        batch_per_device = batch_size // len(gpu_devices)

        for slot, gpu in enumerate(gpu_devices):
            lo = slot * batch_per_device
            hi = lo + batch_per_device
            indices = full_indices[lo:hi, :].astype(np.int32)
            labels = full_labels[lo:hi].astype(np.float32)

            # Indices live on CPU only when the Gather itself runs on CPU.
            if cpu_indices:
                device_for_indices = core.DeviceOption(caffe2_pb2.CPU)
            else:
                device_for_indices = core.DeviceOption(caffe2_pb2.CUDA, gpu)

            with core.DeviceScope(device_for_indices):
                workspace.FeedBlob("gpu_{}/indices".format(gpu), indices)

            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, gpu)):
                workspace.FeedBlob("gpu_{}/label".format(gpu), labels)

        if iteration == 0:
            workspace.RunNetOnce(model.param_init_net)
            # Force vecs to be same on all runs
            orig_vecs = np.random.rand(V, 16).astype(np.float32)
            workspace.FeedBlob(self.vecs, orig_vecs)
            if not cpu_indices:
                for gpu in gpu_devices:
                    workspace.FeedBlob(
                        "gpu_{}/gpuvecs".format(gpu),
                        orig_vecs,
                        device_option=core.DeviceOption(
                            caffe2_pb2.CUDA, gpu),
                    )
            workspace.CreateNet(model.net)

        workspace.RunNet(model.net.Proto().name)
        if len(gpu_devices) == 2 and not cpu_indices:
            flat_idx = list(workspace.FetchBlob("gpu_0/indices").flatten())
            total = len(flat_idx)
            unique = len(set(flat_idx))
            assert total == unique, "We cannot have duplicate indices"

    # Sanity check to see the vecs were updated
    updated_vecs = workspace.FetchBlob(self.vecs)
    self.assertFalse(np.allclose(updated_vecs, orig_vecs))

    vecs_blob = self.vecs if cpu_indices else "gpu_0/gpuvecs"
    return [
        workspace.FetchBlob(vecs_blob),
        workspace.FetchBlob("gpu_0/fc_w"),
    ]
def test_imageinput(self, size_tuple, means, stds, gc, dc):
    """Run the ImageInput operator on a generated LMDB and compare outputs.

    ``create_test`` writes two images into a temporary LMDB directory and
    returns the expected images.  For every device option in ``dc`` the
    ``ImageInput`` operator is run in a fresh temporary workspace; labels must
    equal the image index and no expected pixel may exceed the produced pixel
    by more than 1.

    The temporary directory is removed even when an assertion or operator
    run fails, so failing runs do not leak LMDB directories.

    Args:
        size_tuple: ``(width, height, minsize, crop)``.
        means: Per-channel means (converted to ``float``).
        stds: Per-channel standard deviations (converted to ``float``).
        gc: Unused here; part of the test signature.
        dc: Iterable of device options to run the operator on.
    """
    # TODO: Does not test on GPU and does not test use_gpu_transform
    # WARNING: Using ModelHelper automatically does NHWC to NCHW
    # transformation if needed.
    width, height, minsize, crop = size_tuple
    means = [float(m) for m in means]
    stds = [float(s) for s in stds]
    out_dir = tempfile.mkdtemp()
    try:
        count_images = 2  # One with bounding box and one without
        expected_images = create_test(
            out_dir,
            width=width,
            height=height,
            default_bound=(3, 5, height - 3, width - 5),
            minsize=minsize,
            crop=crop,
            means=means,
            stds=stds,
            count=count_images,
        )
        for device_option in dc:
            with hu.temp_workspace():
                reader_net = core.Net('reader')
                reader_net.CreateDB(
                    [], 'DB', db=out_dir, db_type="lmdb")
                workspace.RunNetOnce(reader_net)

                # NOTE(review): device_type == 1 presumably denotes CUDA —
                # confirm against the DeviceOption proto.
                use_gpu_transform = (device_option.device_type == 1)
                imageop = core.CreateOperator(
                    'ImageInput',
                    ['DB'],
                    ["data", "label"],
                    batch_size=count_images,
                    color=3,
                    minsize=minsize,
                    crop=crop,
                    is_test=True,
                    bounding_ymin=3,
                    bounding_xmin=5,
                    bounding_height=height - 3,
                    bounding_width=width - 5,
                    mean_per_channel=means,
                    std_per_channel=stds,
                    use_gpu_transform=use_gpu_transform,
                )
                imageop.device_option.CopyFrom(device_option)

                main_net = core.Net('main')
                main_net.Proto().op.extend([imageop])
                workspace.RunNetOnce(main_net)

                labels = workspace.FetchBlob('label')
                result = workspace.FetchBlob('data').astype(np.int32)

                # If we don't use_gpu_transform, the output is in NHWC
                # Our reference output is CHW so we swap
                if device_option.device_type != 1:
                    expected = [
                        img.swapaxes(0, 1).swapaxes(1, 2)
                        for img in expected_images
                    ]
                else:
                    expected = expected_images

                for i in range(count_images):
                    self.assertEqual(labels[i], i)
                    # NOTE(review): only positive differences are counted;
                    # results larger than expected are not flagged.
                    self.assertEqual(
                        (expected[i] - result[i] > 1).sum(), 0)
    finally:
        # Always clean up the LMDB directory, also on failure.
        shutil.rmtree(out_dir)
def _prepare_rnn(
    t, n, dim_in, create_rnn, outputs_with_grads,
    forget_bias, memory_optim=False,
    forward_only=False, drop_states=False, T=None,
    two_d_initial_states=None, dim_out=None
):
    """Build an RNN via ``create_rnn`` and feed random initial states.

    One ``hidden_init_<k>`` / ``cell_init_<k>`` pair is registered and fed
    per entry of ``dim_out`` (default ``[dim_in]``).  States are fed as
    ``(n, d)`` arrays when ``two_d_initial_states`` is truthy, otherwise as
    ``(1, n, d)``; when the flag is ``None`` it is drawn at random.

    Returns:
        ``(outputs, model.net, states + [input_blob])`` where ``outputs`` is
        whatever ``create_rnn`` returned.
    """
    if dim_out is None:
        dim_out = [dim_in]
    print("Dims: ", t, n, dim_in, dim_out)

    model = ModelHelper(name='external')

    if two_d_initial_states is None:
        two_d_initial_states = np.random.randint(2)

    def generate_input_state(n, d):
        shape = (n, d) if two_d_initial_states else (1, n, d)
        return np.random.randn(*shape).astype(np.float32)

    states = []
    for layer_id, d in enumerate(dim_out):
        hidden, cell = model.net.AddExternalInputs(
            "hidden_init_{}".format(layer_id),
            "cell_init_{}".format(layer_id),
        )
        states += [hidden, cell]
        for state_blob in (hidden, cell):
            workspace.FeedBlob(
                state_blob,
                generate_input_state(n, d).astype(np.float32))

    # Due to convoluted RNN scoping logic we make sure that things
    # work from a namescope
    with scope.NameScope("test_name_scope"):
        input_blob, seq_lengths = model.net.AddScopedExternalInputs(
            'input_blob', 'seq_lengths')

        outputs = create_rnn(
            model, input_blob, seq_lengths, states,
            dim_in=dim_in,
            dim_out=dim_out,
            scope="external/recurrent",
            outputs_with_grads=outputs_with_grads,
            memory_optimization=memory_optim,
            forget_bias=forget_bias,
            forward_only=forward_only,
            drop_states=drop_states,
            static_rnn_unroll_size=T,
        )

    workspace.RunNetOnce(model.param_init_net)

    lengths = np.random.randint(1, t + 1, size=(n, ))
    workspace.FeedBlob(seq_lengths, lengths.astype(np.int32))
    return outputs, model.net, states + [input_blob]
def caffe2_yellowfin(self, zero_debias, grad_coef, n_dim, n_iter, gpu):
    """Run the YellowFin optimizer for ``n_iter`` steps and collect statistics.

    The gradient fed to the optimizer is ``grad_coef * iteration`` on an
    ``n_dim``-sized parameter ``w`` initialised to zero.

    Returns:
        Dict keyed by iteration index (``i > 0`` only); each value is a dict
        with the keys ``h_max``, ``h_min``, ``var``, ``dist``, ``lr``, ``mu``.
    """
    alpha = 1.0
    mu = 0.0
    beta = 0.999
    curv_win_width = 20
    epsilon = 1e-6

    net = core.Net("net")
    param_init_net = core.Net("param_init_net")
    workspace.ResetWorkspace()

    # The iteration counter and its mutex are pinned to the CPU.
    with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
        iteration = param_init_net.ConstantFill(
            [],
            "iteration",
            shape=[1],
            value=0,
            dtype=core.DataType.INT64)
        iter_mutex = param_init_net.CreateMutex([], ["iteration_mutex"])
        net.AtomicIter([iter_mutex, iteration], [iteration])
    pre_grad = param_init_net.ConstantFill(
        [],
        "pre_grad",
        shape=[n_dim],
        value=grad_coef
    )
    if gpu:
        iteration = net.CopyCPUToGPU(
            [iteration],
            "iteration_cpu"
        )
    iteration_float = net.Cast([iteration], "iteration_float")
    grad = net.Mul([pre_grad, iteration_float], "grad", broadcast=True)
    w = param_init_net.ConstantFill([], "w", shape=[n_dim], value=0.0)

    # a hack to create an object with __dict__
    param_info = lambda: None
    param_info.blob = w
    param_info.grad = grad

    yellowfin = optimizer.YellowFinOptimizer(
        alpha=alpha,
        mu=mu,
        beta=beta,
        curv_win_width=curv_win_width,
        epsilon=epsilon,
        zero_debias=zero_debias
    )
    yellowfin._run(net, param_init_net, param_info)

    workspace.RunNetOnce(param_init_net)
    workspace.CreateNet(net, overwrite=True)

    def debias(value, step):
        # Apply self.deb with the fixed beta / zero_debias of this run.
        return self.deb(value, beta, step, zero_debias)

    caffe2_res = {}
    for i in range(n_iter):
        workspace.RunNet(net)
        step = i + 1

        scalars_memory_blob = workspace.FetchBlob("w_scalars_memory")
        g_norm2_avg = scalars_memory_blob[1]
        g_norm2_min_avg = scalars_memory_blob[2]
        g_norm2_max_avg = scalars_memory_blob[3]
        distance_avg = scalars_memory_blob[4]
        g_avg_blob = workspace.FetchBlob("w_g_avg")
        res_lr = workspace.FetchBlob("w_lr_avg")[0]
        res_mu = workspace.FetchBlob("w_mu_avg")[0]

        g_deb = debias(g_avg_blob, step)
        variance = max(
            debias(g_norm2_avg, step) - g_deb.dot(g_deb),
            epsilon
        )
        if i <= 0:
            continue
        caffe2_res[i] = {
            'h_max': np.exp(debias(g_norm2_max_avg, step)),
            'h_min': np.exp(debias(g_norm2_min_avg, step)),
            'var': variance,
            'dist': debias(distance_avg, step),
            'lr': res_lr,
            'mu': res_mu
        }
    return caffe2_res
def run_test(
        size_tuple, means, stds, label_type, num_labels, is_test,
        scale_jitter_type, color_jitter, color_lighting, dc, validator,
        output1=None, output2_size=None):
    """Run ImageInput on a generated LMDB for every device and validate.

    ``create_test`` writes two images into a temporary LMDB directory.  For
    each device option in ``dc`` the ``ImageInput`` operator is executed in a
    fresh temporary workspace and ``validator(expected_images, device_option,
    count_images)`` is called to check the produced blobs.

    The temporary directory is removed even when the validator or an operator
    run raises, so failing runs do not leak LMDB directories.

    Args:
        size_tuple: ``(width, height, minsize, crop)``.
        means: Per-channel means (converted to ``float``).
        stds: Per-channel standard deviations (converted to ``float``).
        label_type: Forwarded to ``create_test`` and the operator.
        num_labels: Forwarded to ``create_test`` and the operator.
        is_test: Forwarded to the operator.
        scale_jitter_type: Forwarded to the operator.
        color_jitter: Forwarded to the operator.
        color_lighting: Forwarded to the operator.
        dc: Iterable of device options to run the operator on.
        validator: Callable invoked after each run (see above).
        output1: When truthy, an extra ``output1`` blob of size 1 is requested.
        output2_size: When truthy, an extra ``output2`` blob of this size is
            requested.
    """
    # TODO: Does not test on GPU and does not test use_gpu_transform
    # WARNING: Using ModelHelper automatically does NHWC to NCHW
    # transformation if needed.
    width, height, minsize, crop = size_tuple
    means = [float(m) for m in means]
    stds = [float(s) for s in stds]
    out_dir = tempfile.mkdtemp()
    try:
        count_images = 2  # One with bounding box and one without
        expected_images = create_test(
            out_dir,
            width=width,
            height=height,
            default_bound=(3, 5, height - 3, width - 5),
            minsize=minsize,
            crop=crop,
            means=means,
            stds=stds,
            count=count_images,
            label_type=label_type,
            num_labels=num_labels,
            output1=output1,
            output2_size=output2_size,
        )
        for device_option in dc:
            with hu.temp_workspace():
                reader_net = core.Net('reader')
                reader_net.CreateDB(
                    [], 'DB', db=out_dir, db_type="lmdb")
                workspace.RunNetOnce(reader_net)

                # Optional extra outputs are appended after data/label.
                outputs = ['data', 'label']
                output_sizes = []
                if output1:
                    outputs.append('output1')
                    output_sizes.append(1)
                if output2_size:
                    outputs.append('output2')
                    output_sizes.append(output2_size)

                # NOTE(review): device_type == 1 presumably denotes CUDA —
                # confirm against the DeviceOption proto.
                imageop = core.CreateOperator(
                    'ImageInput',
                    ['DB'],
                    outputs,
                    batch_size=count_images,
                    color=3,
                    minsize=minsize,
                    crop=crop,
                    is_test=is_test,
                    bounding_ymin=3,
                    bounding_xmin=5,
                    bounding_height=height - 3,
                    bounding_width=width - 5,
                    mean_per_channel=means,
                    std_per_channel=stds,
                    use_gpu_transform=(device_option.device_type == 1),
                    label_type=label_type,
                    num_labels=num_labels,
                    output_sizes=output_sizes,
                    scale_jitter_type=scale_jitter_type,
                    color_jitter=color_jitter,
                    color_lighting=color_lighting,
                )
                imageop.device_option.CopyFrom(device_option)

                main_net = core.Net('main')
                main_net.Proto().op.extend([imageop])
                workspace.RunNetOnce(main_net)
                validator(expected_images, device_option, count_images)
    finally:
        # Always clean up the LMDB directory, also on failure.
        shutil.rmtree(out_dir)
def run_conv_or_fc(
    test_case,
    init_net,
    net,
    X,
    W,
    b,
    op_type,
    engine,
    order,
    gc,
    outputs,
    scale=None,
    zero_point=None,
    x_scale=None,
    x_zero_point=None,
):
    """Run a Conv/FC net and append its ``Y`` output(s) to ``outputs``.

    The net is first run through ``test_case.ws`` (once when ``engine`` is
    the empty string, twice otherwise).  For a non-empty ``engine`` it is
    then also created once in the global workspace and run twice.  Every run
    appends an ``Output`` namedtuple with ``Y``, ``op_type``, ``engine`` and,
    when ``order`` is truthy (Conv), ``order``.

    Quantization-parameter blobs ``quant_param`` / ``X_quant_param`` are
    created only when both the corresponding scale and zero point are given.
    """
    # Conv results carry an extra `order` field; FC results do not.
    field_names = ["Y", "op_type", "engine"]
    if order:
        field_names.append("order")
    Output = collections.namedtuple("Output", field_names)

    def record(Y):
        # Append one result tuple for the fetched output Y.
        if order:
            result = Output(Y=Y, op_type=op_type, engine=engine, order=order)
        else:
            result = Output(Y=Y, op_type=op_type, engine=engine)
        outputs.append(result)

    def create_quant_params(in_test_workspace):
        # Create the optional Int8 quant-param blobs, guarded per blob when
        # they have to live in test_case.ws.
        candidates = (
            ("quant_param", scale, zero_point),
            ("X_quant_param", x_scale, x_zero_point),
        )
        for blob_name, blob_scale, blob_zero_point in candidates:
            if blob_scale is None or blob_zero_point is None:
                continue
            if in_test_workspace:
                with workspace.WorkspaceGuard(test_case.ws):
                    dnnlowp_pybind11.CreateInt8QuantParamsBlob(
                        blob_name, float(blob_scale), int(blob_zero_point)
                    )
            else:
                dnnlowp_pybind11.CreateInt8QuantParamsBlob(
                    blob_name, float(blob_scale), int(blob_zero_point)
                )

    # We run DNNLOWP ops multiple times to test their first runs that
    # do caching so exercises different code paths from the subsequent
    # runs

    # self.ws.run re-creates operator every time so this test covers
    # cases when we have multiple nets sharing the same workspace
    test_ws = test_case.ws
    test_ws.create_blob("X").feed(X, device_option=gc)
    test_ws.create_blob("W").feed(W, device_option=gc)
    test_ws.create_blob("b").feed(b, device_option=gc)
    create_quant_params(in_test_workspace=True)

    if init_net:
        test_case.ws.run(init_net)
    run_count = 2 if engine != "" else 1
    for _ in range(run_count):
        test_case.ws.run(net)
        record(test_case.ws.blobs["Y"].fetch())

    if engine == "":
        return

    # workspace.CreateNet + workspace.RunNet reuses the same operator
    workspace.FeedBlob("X", X)
    workspace.FeedBlob("W", W)
    workspace.FeedBlob("b", b)
    create_quant_params(in_test_workspace=False)

    if init_net:
        workspace.RunNetOnce(init_net)
    workspace.CreateNet(net)
    for _ in range(2):
        workspace.RunNet(net)
        record(workspace.FetchBlob("Y"))
def test_convolution_grouped_sum_relu_fusion(self, stride, pad, kernel, size,
                                             input_channels, output_channels,
                                             batch_size, use_bias, group,
                                             gc, dc):
    """Compare Conv + Conv + Sum + Relu on ``dc[0]`` with the optimized net.

    The four operators are first run one by one on ``dc[0]`` to obtain the
    reference ``S0``.  Copies of the same operators, retargeted to ``dc[1]``,
    are then put into a net that is passed through ``optimizeForMKLDNN`` and
    executed; its last output must match ``S0`` within ``atol=rtol=0.01``.

    All work happens in the ``_device_check_`` workspace; the previously
    active workspace is restored even when the comparison fails.
    """
    conv_inputs = ["X0", "w0", "b0"] if use_bias else ["X0", "w0"]
    conv_s0_inputs = ["SX0", "Sw0", "Sb0"] if use_bias else ["SX0", "Sw0"]
    conv_S0 = core.CreateOperator(
        "Conv",
        conv_s0_inputs,
        ["S0"],
        stride=stride,
        pad=pad,
        kernel=kernel,
        group=group,
        device_option=dc[0]
    )
    conv = core.CreateOperator(
        "Conv",
        conv_inputs,
        ["Y0"],
        stride=stride,
        pad=pad,
        kernel=kernel,
        group=group,
        device_option=dc[0]
    )
    # Named sum_op so the builtin sum() is not shadowed.
    sum_op = core.CreateOperator(
        "Sum",
        ["S0", "Y0"],
        ["S0"],
        device_option=dc[0]
    )
    relu = core.CreateOperator(
        "Relu",
        ["S0"],
        ["S0"],
        device_option=dc[0]
    )

    # Random inputs centred on zero; the draw order is kept stable.
    SX = np.random.rand(
        batch_size, input_channels * group, size, size).astype(np.float32) - 0.5
    Sw = np.random.rand(
        output_channels * group, input_channels, kernel, kernel) \
        .astype(np.float32) - 0.5
    Sb = np.random.rand(output_channels * group).astype(np.float32) - 0.5
    X = np.random.rand(
        batch_size, input_channels * group, size, size).astype(np.float32) - 0.5
    w = np.random.rand(
        output_channels * group, input_channels, kernel, kernel) \
        .astype(np.float32) - 0.5
    b = np.random.rand(output_channels * group).astype(np.float32) - 0.5

    blobs = [
        ('SX0', SX), ('Sw0', Sw), ('Sb0', Sb),
        ('X0', X), ('w0', w), ('b0', b),
    ]

    old_ws_name = workspace.CurrentWorkspace()
    workspace.SwitchWorkspace("_device_check_", True)
    try:
        # Reference run: operators executed individually on dc[0].
        for blob_name, value in blobs:
            workspace.FeedBlob(blob_name, value, dc[0])
        for op in (conv_S0, conv, sum_op, relu):
            workspace.RunOperatorOnce(op)
        S0 = workspace.FetchBlob('S0')

        workspace.ResetWorkspace()

        # Same operators retargeted to dc[1] and collected into one net.
        old_net = caffe2_pb2.NetDef()
        for op in (conv_S0, conv, sum_op, relu):
            op_on_dc1 = caffe2_pb2.OperatorDef()
            op_on_dc1.CopyFrom(op)
            op_on_dc1.device_option.CopyFrom(dc[1])
            old_net.op.extend([op_on_dc1])

        for blob_name, value in blobs:
            workspace.FeedBlob(blob_name, value, dc[1])

        net = core.Net("net")
        net.Proto().CopyFrom(old_net)
        optimizeForMKLDNN(net)
        workspace.RunNetOnce(net.Proto())
        # The output tensor name will be changed by optimization
        # sometimes when applying conv sum fusion
        S2 = workspace.FetchBlob(net.Proto().op[-1].output[0])
        if not np.allclose(S0, S2, atol=0.01, rtol=0.01):
            max_abs_diff = np.max(np.abs(S2 - S0))
            print(S2.flatten())
            print(S0.flatten())
            print(max_abs_diff)
            self.fail(
                "optimized net output differs from reference "
                "(max abs diff {})".format(max_abs_diff))
    finally:
        # Restore the caller's workspace on success and on failure.
        workspace.SwitchWorkspace(old_ws_name)
def test_convolution_sum_fusion(self, stride, pad, kernel, size,
                                input_channels, output_channels,
                                batch_size, use_bias, group, sum_add, gc, dc):
    """Check Conv + Sum fusion, both hand-written and produced by the optimizer.

    Steps:

    1. Reference: ``MaxPool``, ``Conv`` and a ``sum_add`` operator are run
       one by one on ``dc[0]``.
    2. Manual fusion: ``MaxPool`` followed by ``ConvFusion``
       (``fusion_type=2``) on ``dc[1]`` must match the reference.
    3. Automatic fusion: the reference operators, re-targeted to ``dc[1]``,
       are passed through ``optimizeForMKLDNN`` in several orders. Two
       orders are expected to collapse to two ops ending in ``ConvFusion``;
       three orders are expected to stay unfused. In every case the
       numerical result must still match the reference.

    All comparisons use ``atol=rtol=0.01``. The previously active workspace
    is restored even when a check fails.

    ``gc`` is accepted but not used. The arguments are presumably injected
    by a test-generation decorator that is not part of this block --
    TODO confirm.
    """

    def random_tensor(*shape):
        # float32 samples from np.random.rand shifted down by 0.5.
        return np.random.rand(*shape).astype(np.float32) - 0.5

    def retarget(op, device_option):
        # Copy of `op` bound to `device_option`; `op` itself is untouched.
        clone = caffe2_pb2.OperatorDef()
        clone.CopyFrom(op)
        clone.device_option.CopyFrom(device_option)
        return clone

    def feed(blobs, device_option):
        # Feed every (name, value) pair to the current workspace.
        for blob_name, value in blobs:
            workspace.FeedBlob(blob_name, value, device_option)

    def optimize(ops):
        # Wrap `ops` in a fresh net and run the MKL-DNN optimizer on it.
        net_def = caffe2_pb2.NetDef()
        net_def.op.extend(ops)
        optimized = core.Net("net")
        optimized.Proto().CopyFrom(net_def)
        optimizeForMKLDNN(optimized)
        return optimized

    def check_close(expected, actual):
        # Print both tensors and the max abs difference before failing.
        if not np.allclose(expected, actual, atol=0.01, rtol=0.01):
            print(actual.flatten())
            print(expected.flatten())
            print(np.max(np.abs(actual - expected)))
            self.fail("fused output differs from the unfused reference")

    pool_S0 = core.CreateOperator(
        "MaxPool", ["SX0"], ["S0"],
        stride=2, pad=0, kernel=2,
        device_option=dc[0])
    conv = core.CreateOperator(
        "Conv",
        ["X0", "w0", "b0"] if use_bias else ["X0", "w0"],
        ["Y0"],
        stride=stride, pad=pad, kernel=kernel, group=group,
        device_option=dc[0])
    # Named sum_op so the builtin `sum` is not shadowed.
    sum_op = core.CreateOperator(
        sum_add, ["S0", "Y0"], ["S0"], device_option=dc[0])

    # Manual fusion for Conv + Sum
    pool_S1 = core.CreateOperator(
        "MaxPool", ["SX1"], ["S1"],
        stride=2, pad=0, kernel=2, group=group,
        device_option=dc[1])
    conv_fusion = core.CreateOperator(
        "ConvFusion",
        ["X1", "w1", "b1", "S1"] if use_bias else ["X1", "w1", "S1"],
        ["S1"],
        stride=stride, pad=pad, kernel=kernel, group=group,
        fusion_type=2,
        device_option=dc[1])

    # The pooled tensor has to end up with the same spatial size as the
    # convolution output, hence twice that size before the 2x2 pooling.
    pool_input_size = int(
        math.ceil(float(size + 2 * pad - kernel + 1) / stride)) * 2
    SX = random_tensor(
        batch_size, output_channels * group, pool_input_size, pool_input_size)
    X = random_tensor(batch_size, input_channels * group, size, size)
    w = random_tensor(output_channels * group, input_channels, kernel, kernel)
    b = random_tensor(output_channels * group)
    unfused_inputs = [('SX0', SX), ('X0', X), ('w0', w), ('b0', b)]

    old_ws_name = workspace.CurrentWorkspace()
    workspace.SwitchWorkspace("_device_check_", True)
    try:
        # Reference result, operator by operator on dc[0].
        feed(unfused_inputs, dc[0])
        for op in (pool_S0, conv, sum_op):
            workspace.RunOperatorOnce(op)
        S0 = workspace.FetchBlob('S0')

        # Hand-written fused graph on dc[1].
        workspace.ResetWorkspace()
        feed([('SX1', SX), ('X1', X), ('w1', w), ('b1', b)], dc[1])
        workspace.RunOperatorOnce(pool_S1)
        workspace.RunOperatorOnce(conv_fusion)
        check_close(S0, workspace.FetchBlob('S1'))

        # Auto fusion for Conv + Sum
        pool_S0_old = retarget(pool_S0, dc[1])
        conv_old = retarget(conv, dc[1])
        sum_old = retarget(sum_op, dc[1])

        # Conv + Sum should be fused case: [PreNode, Conv, Sum]
        workspace.ResetWorkspace()
        feed(unfused_inputs, dc[1])
        net = optimize([pool_S0_old, conv_old, sum_old])
        self.assertEqual(len(net.Proto().op), 2)
        self.assertEqual(net.Proto().op[1].type, "ConvFusion")
        workspace.RunNetOnce(net.Proto())
        # The output tensor name will be changed by optimization
        # sometimes when applying conv sum fusion
        check_close(
            S0, workspace.FetchBlob(net.Proto().op[-1].output[0]))

        # Conv + Sum should be fused case: [Conv, PreNode, Sum]
        workspace.ResetWorkspace()
        feed(unfused_inputs, dc[1])
        net = optimize([conv_old, pool_S0_old, sum_old])
        self.assertEqual(len(net.Proto().op), 2)
        self.assertEqual(net.Proto().op[1].type, "ConvFusion")
        workspace.RunNetOnce(net.Proto())
        # The output tensor name will be changed by optimization
        # sometimes when applying conv sum fusion
        check_close(
            S0, workspace.FetchBlob(net.Proto().op[-1].output[0]))

        # Conv + Sum should not be fused case: [Conv, midOp, preNode, Sum]
        # Conv output is used by midOp
        dropout = core.CreateOperator(
            "Dropout", ["Y0"], ["Y_dropout"],
            ratio=0.5, is_test=True,
            device_option=dc[1])
        workspace.ResetWorkspace()
        feed(unfused_inputs, dc[1])
        net = optimize([conv_old, dropout, pool_S0_old, sum_old])
        self.assertEqual(len(net.Proto().op), 4)
        workspace.RunNetOnce(net.Proto())
        check_close(
            S0, workspace.FetchBlob(net.Proto().op[-1].output[0]))

        # Conv + Sum should not be fused case: [Conv, preNode, Sum, midOp]
        # preNode output is used by midOp
        sum1 = core.CreateOperator(
            sum_add, ["S0", "Y0"], ["S3"], device_option=dc[1])
        dropout = core.CreateOperator(
            "Dropout", ["S0"], ["Y_dropout"],
            ratio=0.5, is_test=True,
            device_option=dc[1])
        workspace.ResetWorkspace()
        feed(unfused_inputs, dc[1])
        net = optimize([conv_old, pool_S0_old, sum1, dropout])
        print("net={}\n".format(net.Proto()))
        self.assertEqual(len(net.Proto().op), 4)
        workspace.RunNetOnce(net.Proto())
        # The sum is the second-to-last op here; dropout comes after it.
        check_close(
            S0, workspace.FetchBlob(net.Proto().op[-2].output[0]))

        # Conv + Sum should not be fused case: [Conv, midOp, preNode, Sum]
        # midOp output has the same name with that of the Conv input
        relu_0 = core.CreateOperator(
            "Relu", ["X0"], ["X1"], device_option=dc[0])
        conv = core.CreateOperator(
            "Conv",
            ["X1", "w0", "b0"] if use_bias else ["X1", "w0"],
            ["Y0"],
            stride=1, pad=0, kernel=1,
            device_option=dc[0])
        relu_1 = core.CreateOperator(
            "Relu", ["X1"], ["X1"], device_option=dc[0])
        pool = core.CreateOperator(
            "MaxPool", ["X1"], ["S0"],
            stride=1, pad=0, kernel=1,
            device_option=dc[0])
        sum_op = core.CreateOperator(
            "Sum", ["S0", "Y0"], ["S0"], device_option=dc[0])
        chain = [relu_0, conv, relu_1, pool, sum_op]

        X = random_tensor(batch_size, input_channels, size, size)
        w = random_tensor(input_channels, input_channels, 1, 1)
        b = random_tensor(input_channels)
        chain_inputs = [('X0', X), ('w0', w), ('b0', b)]

        # NOTE(review): this last case deliberately keeps the original
        # behaviour of switching back to the caller's workspace first and
        # resetting it -- confirm whether running it in the scratch
        # workspace was intended instead.
        workspace.SwitchWorkspace(old_ws_name)
        workspace.ResetWorkspace()
        feed(chain_inputs, dc[0])
        for op in chain:
            workspace.RunOperatorOnce(op)
        S0 = workspace.FetchBlob('S0')

        workspace.ResetWorkspace()
        feed(chain_inputs, dc[1])
        net = optimize([retarget(op, dc[1]) for op in chain])
        self.assertEqual(len(net.Proto().op), 5)
        workspace.RunNetOnce(net.Proto())
        check_close(
            S0, workspace.FetchBlob(net.Proto().op[-1].output[0]))
    finally:
        # Restore the caller's workspace on success and on failure alike.
        workspace.SwitchWorkspace(old_ws_name)
def test_sparse_lengths_sum(self, num_rows, blocksize, weighted, seed,
                            empty_indices, fp16):
    """Compare SparseLengths(Weighted)Sum on fused 8-bit rowwise data.

    Random ``num_rows x blocksize`` data (float16 when ``fp16`` is set,
    float32 otherwise) is quantized to the fused 8-bit rowwise format and
    dequantized again. The plain operator on the dequantized blob is the
    reference; the ``...Fused8BitRowwise`` operator on the quantized blob
    must agree with it (3 decimals for fp16, numpy's default otherwise).
    ``weighted`` selects the weighted variants and ``empty_indices`` feeds
    all-zero lengths with no indices. The dequantized data is additionally
    checked against the input with ``compare_rowwise``.
    """
    net = core.Net("bench")
    np.random.seed(seed)

    float_type = np.float16 if fp16 else np.float32
    input_data = np.random.rand(num_rows, blocksize).astype(float_type)

    if not empty_indices:
        drawn = np.random.randint(len(input_data))
        # the number of indices per sample
        per_sample = np.clip(drawn // 2, 1, 10)
        sample_count = drawn // per_sample
        lengths = np.ones([sample_count], dtype=np.int32) * per_sample
        # readjust num_indices when per_sample doesn't divide the draw
        num_indices = sample_count * per_sample
    else:
        lengths = np.zeros(num_rows, dtype=np.int32)
        num_indices = 0

    indices = np.random.randint(
        low=0, high=len(input_data), size=[num_indices], dtype=np.int32)
    weights = np.random.uniform(size=[len(indices)]).astype(np.float32)

    # Pick the quantize / dequantize pair matching the input precision.
    if fp16:
        quantize = net.HalfFloatToFused8BitRowwiseQuantized
        dequantize = net.Fused8BitRowwiseQuantizedToHalfFloat
    else:
        quantize = net.FloatToFused8BitRowwiseQuantized
        dequantize = net.Fused8BitRowwiseQuantizedToFloat
    quantized_data = quantize("input_data", "quantized_data")
    dequantized_data = dequantize(quantized_data, "dequantized_data")

    if weighted:
        sparse_inputs = ["weights", "indices", "lengths"]
        net.SparseLengthsWeightedSum(
            [dequantized_data] + sparse_inputs, "sum_reference")
        net.SparseLengthsWeightedSumFused8BitRowwise(
            [quantized_data] + sparse_inputs, "sum_quantized")
    else:
        sparse_inputs = ["indices", "lengths"]
        net.SparseLengthsSum(
            [dequantized_data] + sparse_inputs, "sum_reference")
        net.SparseLengthsSumFused8BitRowwise(
            [quantized_data] + sparse_inputs, "sum_quantized")

    for blob_name, value in (
        ("input_data", input_data),
        ("weights", weights),
        ("indices", indices),
        ("lengths", lengths),
    ):
        workspace.FeedBlob(blob_name, value)

    workspace.GlobalInit(["caffe2", "--caffe2_log_level=0"])
    workspace.CreateNet(net)
    workspace.RunNetOnce(net)

    dequantized_data = workspace.FetchBlob("dequantized_data")
    # The input blob must come back unchanged.
    np.testing.assert_array_almost_equal(
        input_data, workspace.FetchBlob("input_data"))
    compare_rowwise(input_data, dequantized_data, fp16)

    sum_reference = workspace.FetchBlob("sum_reference")
    sum_quantized = workspace.FetchBlob("sum_quantized")
    # 6 is numpy's default precision for assert_array_almost_equal.
    np.testing.assert_array_almost_equal(
        sum_reference, sum_quantized, decimal=3 if fp16 else 6)
def Skip_test_SLS_NonQuantized_fp16(self):
    """Compare SparseLengthsSum across reference, fake-fp16 and Onnxifi nets.

    Three nets consume the same blobs ``D`` (data), ``I`` (indices) and
    ``L`` (lengths):

    * a plain ``SparseLengthsSum`` reference net,
    * a ``SparseLengthsSumFakeFP16AccFP16`` net, which must match the
      reference with ``atol=rtol=1e-3``,
    * a ``SparseLengthsSum`` net lowered by ``onnxifi_caffe2_net``, which
      must contain exactly one ``Onnxifi`` op and match the fake-fp16
      result under ``np.allclose`` defaults.

    On a mismatch the inputs and outputs are dumped through
    ``print_test_debug_info`` and an ``AssertionError`` is raised.
    """
    N = 20000
    DIM = 64
    data = (4 * np.random.random_sample((N, DIM)) + 1).astype(np.float32)
    indices = (np.random.randint(0, N, size=12)).astype(np.int64)
    lengths = np.asarray([4, 4, 4]).astype(np.int32)
    workspace.FeedBlob("D", data)

    ref_c2_net = core.Net("test_ref_c2")
    ref_c2_net.SparseLengthsSum(["D", "I", "L"], "ref_out")
    ref_c2_net.Proto().external_input.extend(["D", "I", "L"])
    ref_c2_net.Proto().external_output.extend(["ref_out"])

    fp16_c2_net = core.Net("test_fp16_c2")
    fp16_c2_net.SparseLengthsSumFakeFP16AccFP16(["D", "I", "L"], "fp16_out")

    pred_net = caffe2_pb2.NetDef()
    pred_net.name = "pred"
    pred_net.external_input.extend(["D", "I", "L"])
    pred_net.external_output.append("glow_out")
    pred_net.op.add().CopyFrom(
        core.CreateOperator("SparseLengthsSum", ["D", "I", "L"], ["glow_out"]))

    # An empty dict is passed as the second argument, as in the original.
    onnxified_net = onnxifi_caffe2_net(
        pred_net,
        {},
        max_batch_size=3,
        max_seq_size=16,
        debug=True,
        adjust_batch=False,
        use_onnx=False,
    )

    num_onnxified_ops = sum(op.type == "Onnxifi" for op in onnxified_net.op)
    print(onnxified_net)
    np.testing.assert_equal(num_onnxified_ops, 1)

    # I and L are fed only after the net has been lowered.
    workspace.FeedBlob("I", indices)
    workspace.FeedBlob("L", lengths)

    workspace.RunNetOnce(ref_c2_net)
    ref_c2_out = workspace.FetchBlob("ref_out")

    workspace.RunNetOnce(fp16_c2_net)
    fp16_c2_out = workspace.FetchBlob("fp16_out")

    np.testing.assert_allclose(fp16_c2_out, ref_c2_out, atol=1e-3, rtol=1e-3)

    workspace.RunNetOnce(onnxified_net)
    fp16_glow_out = workspace.FetchBlob("glow_out")

    if not np.allclose(fp16_glow_out, fp16_c2_out):
        diff = np.abs(fp16_glow_out - fp16_c2_out)
        print_test_debug_info(
            "sls",
            {
                "indices": indices,
                "data": data,
                "lengths": lengths,
                "Y_c2": fp16_c2_out,
                "Y_glow": fp16_glow_out,
                "diff": diff,
                "rowwise_diff": diff[:, 0],
            },
        )
        # Raised explicitly: a bare `assert 0` is removed under `python -O`
        # and would let the mismatch pass silently.
        raise AssertionError(
            "Onnxifi SparseLengthsSum output differs from the fake-fp16 "
            "Caffe2 output")
def test_net_conversion_and_append_net(self):
    """Smoke-test appending device-converted nets inside a parallel model.

    A side model of three chained FC layers is built first. While the main
    model's forward pass is constructed under ``Parallelize_CPU`` on four
    devices, both nets of the side model are converted with
    ``ConvertNetForDevice`` and appended. The test passes when the
    resulting nets can be created and run without raising.
    """
    other = model_helper.ModelHelper()
    fc_chain = (
        ("other_fc1", 3 * 227 * 227),
        ("other_fc2", 10),
        ("other_fc3", 10),
    )
    upstream = "data"
    for layer_name, in_dim in fc_chain:
        upstream = brew.fc(
            other, upstream, layer_name, dim_in=in_dim, dim_out=10)

    def build_inputs(model):
        for blob_name, blob_shape in (("data", [4, 227, 227, 3]),
                                      ("label", [4])):
            model.net.UniformFill([], [blob_name], shape=blob_shape)

    def build_forward(model, loss_scale):
        model.NHWC2NCHW("data", "data_nchw")
        model.Conv(
            "data_nchw", 'conv1', 3, 64,
            weight_init=("MSRAFill", {}),
            kernel=7, stride=2, pad=3, no_bias=0)
        model.SpatialBN(
            'conv1', 'conv1_spatbn_relu', 64, epsilon=1e-3, is_test=False)
        model.Relu('conv1_spatbn_relu', 'conv1_spatbn_relu')
        model.MaxPool('conv1_spatbn_relu', 'pool1', kernel=3, stride=2)
        model.FC('pool1', 'fc', dim_in=(64 * 56 * 56), dim_out=10)

        # Append the net and param_init_net of the other model
        converted_net = data_parallel_model.ConvertNetForDevice(other.net)
        model.net.AppendNet(converted_net)
        converted_init = data_parallel_model.ConvertNetForDevice(
            other.param_init_net)
        model.param_init_net.AppendNet(converted_init)

        model.Sigmoid('fc', 'fc_sigm')
        model.Softmax('fc_sigm', 'softmax')
        return [model.AveragedLoss('softmax', 'loss')]

    def build_optimizer(model):
        optimizer.build_sgd(model, 0.1, policy="fixed", momentum=0.9)

    model = cnn.CNNModelHelper(order="NCHW", name="test")
    data_parallel_model.Parallelize_CPU(
        model,
        input_builder_fun=build_inputs,
        forward_pass_builder_fun=build_forward,
        optimizer_builder_fun=build_optimizer,
        devices=range(4))

    # Just create and run net and confirm no exception is thrown
    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)
    workspace.RunNet(model.net)
def assertReferenceChecks(
    self,
    device_option,
    op,
    inputs,
    reference,
    input_device_options=None,
    threshold=1e-4,
    output_to_grad=None,
    grad_reference=None,
    atol=None,
    outputs_to_check=None,
):
    """
    Run `op` and compare its outputs with a Python reference implementation.

    A copy of `op` is bound to `device_option`, fed with `inputs` and run
    in a temporary workspace. `reference(*inputs)` must return a tuple or
    list of arrays; each entry is compared with the corresponding op
    output. String/object outputs must match exactly, everything else is
    compared with `rtol=threshold` and `atol=atol` (`threshold` is used
    when `atol` is None). When shape inference succeeded, the inferred
    shapes and types are checked against the fetched outputs as well.

    Args:
        device_option: device the op copy is run on.
        op: operator definition to test; it is deep-copied, not modified.
        inputs: values for `op.input`, in order.
        reference: callable implementing the same computation.
        input_device_options: optional map from input name to the device
            option used when feeding that input.
        threshold: relative tolerance, and absolute one if `atol` is None.
        output_to_grad, grad_reference: when both are truthy, the gradient
            is checked through `_assertGradReferenceChecks`.
        atol: absolute tolerance.
        outputs_to_check: indices into `op.output` to compare; all outputs
            when empty or None.

    Returns:
        The list of fetched outputs that were compared.

    Raises:
        RuntimeError: if `reference` does not return a tuple/list, or if
            shape inference fails while the environment variable
            CAFFE2_ASSERT_SHAPEINFERENCE is '1'.

    Usage example:

        @given(X=hu.tensor(), inplace=st.booleans(), **hu.gcs)
        def test_softsign(self, X, inplace, gc, dc):
            op = core.CreateOperator(
                "Softsign", ["X"], ["X" if inplace else "Y"])

            def softsign(X):
                return (X / (1 + np.abs(X)),)

            self.assertReferenceChecks(gc, op, [X], softsign)
    """
    device_overrides = (
        {} if input_device_options is None else input_device_options)
    # Resolve the absolute tolerance once instead of inside the loop.
    abs_tolerance = threshold if atol is None else atol

    op = copy.deepcopy(op)
    op.device_option.CopyFrom(device_option)

    with temp_workspace():
        for blob_name, blob_value in zip(op.input, inputs):
            feed_device = device_overrides.get(blob_name, device_option)
            workspace.FeedBlob(
                blob_name, blob_value, device_option=feed_device)
            print("Input", blob_name, feed_device)

        net = core.Net("opnet")
        net.Proto().op.extend([op])

        test_shape_inference = False
        try:
            (shapes, types) = workspace.InferShapesAndTypes([net])
            test_shape_inference = True
        except RuntimeError as e:
            # Temporarily catch runtime errors when inferring shape
            # and type info
            logging.warning(str(e))
            if os.getenv('CAFFE2_ASSERT_SHAPEINFERENCE') == '1':
                raise e

        workspace.RunNetOnce(net)
        reference_outputs = reference(*inputs)
        if not isinstance(reference_outputs, (tuple, list)):
            raise RuntimeError(
                "You are providing a wrong reference implementation. A "
                "proper one should return a tuple/list of numpy arrays.")

        if not outputs_to_check:
            self.assertEqual(len(reference_outputs), len(op.output))
            outputs_to_check = list(range(len(op.output)))

        outs = []
        for output_index, expected in zip(outputs_to_check, reference_outputs):
            output_blob_name = op.output[output_index]
            output = workspace.FetchBlob(output_blob_name)
            if output.dtype.kind in ('S', 'O'):
                # Byte-string and object arrays are compared exactly.
                np.testing.assert_array_equal(output, expected)
            else:
                np.testing.assert_allclose(
                    output,
                    expected,
                    atol=abs_tolerance,
                    rtol=threshold,
                    err_msg=(
                        'Output {0} is not matching the reference'.format(
                            output_blob_name,
                        )),
                )
            if test_shape_inference:
                self._assertInferTensorChecks(
                    output_blob_name, shapes, types, output)
            outs.append(output)

        if grad_reference and output_to_grad:
            with core.DeviceScope(device_option):
                self._assertGradReferenceChecks(
                    op, inputs, reference_outputs,
                    output_to_grad, grad_reference)

        return outs
def run_model(self, devices, gpu):
    '''
    Helper function for test_equiv.

    Builds a one-layer FC model parallelized over `devices`, trains it for
    ten iterations on data generated from a fixed seed and returns the
    `fc_w` blob of device 0. Every iteration also feeds `sync_num` on
    device 0 and asserts that all devices see the same value after the
    net has run (AddBlobSync check). `gpu` selects GPU devices; when it is
    false the model runs on CPU devices with a shared model.
    '''
    def build_inputs(model):
        return None

    def build_forward(model, loss_scale):
        fc_out = model.FC(
            "data", "fc", 16, 1,
            ("ConstantFill", {}), ("ConstantFill", {}))
        flat = model.FlattenToVec(fc_out, "fc_fl")
        activated = model.Sigmoid(flat, "sigm")
        distance = model.SquaredL2Distance([activated, "label"], "sq")
        averaged = model.AveragedLoss(distance, "loss")
        scaled = model.Scale(averaged, scale=loss_scale)

        # For testing explicit sync
        model.param_init_net.UniformFill([], ["sync_num"], shape=[1])
        return [scaled]

    def build_optimizer(model):
        return optimizer.build_sgd(
            model,
            0.1,
            policy="fixed",
            max_gradient_norm=5.0,
            allow_lr_injection=True,
        )

    workspace.ResetWorkspace()
    model = cnn.CNNModelHelper(order="NHWC", name="test{}".format(devices))
    use_cpu = not gpu
    data_parallel_model.Parallelize(
        model,
        input_builder_fun=build_inputs,
        forward_pass_builder_fun=build_forward,
        optimizer_builder_fun=build_optimizer,
        devices=devices,
        cpu_device=use_cpu,
        shared_model=use_cpu,
    )
    data_parallel_model.AddBlobSync(model, ["sync_num"])

    np.random.seed(2603)

    # Each run has same input, independent of number of gpus
    batch_size = 64
    for step in range(10):
        full_data = np.random.rand(batch_size, 16)
        full_labels = np.round(full_data[:, 0])
        shard_size = batch_size // len(devices)

        # Give every device its own contiguous slice of the batch.
        for position, device_id in enumerate(devices):
            shard = slice(position * shard_size, (position + 1) * shard_size)
            device_data = full_data[shard, :].astype(np.float32)
            device_labels = full_labels[shard].astype(np.float32)
            scope = core.DeviceOption(model._device_type, device_id)
            with core.DeviceScope(scope):
                workspace.FeedBlob(
                    "{}_{}/data".format(model._device_prefix, device_id),
                    device_data)
                workspace.FeedBlob(
                    "{}_{}/label".format(model._device_prefix, device_id),
                    device_labels)

        if step == 0:
            # Nets are initialised once, after the first batch is in place.
            workspace.RunNetOnce(model.param_init_net)
            workspace.CreateNet(model.net)

        expected_sync = step * 2
        workspace.FeedBlob(
            model._device_prefix + "_0/sync_num",
            np.array([expected_sync]).astype(np.float32),
            device_option=core.DeviceOption(model._device_type, 0))
        workspace.RunNet(model.net.Proto().name)

        # Test AddBlobSync
        for device_id in model._devices:
            synced = workspace.FetchBlob(
                "{}_{}/sync_num".format(model._device_prefix, device_id))[0]
            self.assertTrue(abs(synced - expected_sync) < 0.01)

    return workspace.FetchBlob("{}_0/fc_w".format(model._device_prefix))
def export_actor(
    cls,
    trainer,
    state_normalization_parameters,
    action_feature_ids,
    min_action_range_tensor_serving,
    max_action_range_tensor_serving,
    model_on_gpu=False,
):
    """Export caffe2 preprocessor net and pytorch actor forward pass as one
    caffe2 net.

    The resulting net reads sparse float features from the
    ``input/float_features.*`` blobs, densifies and normalizes the state,
    runs the actor net and writes the rescaled actions to the
    ``output/float_features.*`` blobs.

    :param trainer DDPGTrainer
    :param state_normalization_parameters state NormalizationParameters
    :param action_feature_ids ids written to the output keys blob; fed as
        an int64 array and tiled once per example
    :param min_action_range_tensor_serving pytorch tensor that specifies
        min action value for each dimension
    :param max_action_range_tensor_serving pytorch tensor that specifies
        max action value for each dimension
    :param model_on_gpu boolean indicating if the model is a GPU model or
        CPU model
    :return DDPGPredictor built from the combined net, the torch init net
        and the collected parameter blob names
    """
    model = model_helper.ModelHelper(name="predictor")
    net = model.net
    C2.set_model(model)
    # Names of the blobs handed to the DDPGPredictor as parameters.
    parameters: List[str] = []

    # Placeholder input blobs, fed with single zero entries.
    workspace.FeedBlob("input/float_features.lengths", np.zeros(1, dtype=np.int32))
    workspace.FeedBlob("input/float_features.keys", np.zeros(1, dtype=np.int64))
    workspace.FeedBlob("input/float_features.values", np.zeros(1, dtype=np.float32))

    input_feature_lengths = "input_feature_lengths"
    input_feature_keys = "input_feature_keys"
    input_feature_values = "input_feature_values"

    # Copy the external inputs into working blobs.
    C2.net().Copy(["input/float_features.lengths"], [input_feature_lengths])
    C2.net().Copy(["input/float_features.keys"], [input_feature_keys])
    C2.net().Copy(["input/float_features.values"], [input_feature_values])

    # Sparse -> dense conversion followed by normalization of the state.
    preprocessor = PreprocessorNet()
    sparse_to_dense_processor = Caffe2SparseToDenseProcessor()
    sorted_features, _ = sort_features_by_normalization(
        state_normalization_parameters)
    state_dense_matrix, new_parameters = sparse_to_dense_processor(
        sorted_features,
        StackedAssociativeArray(input_feature_lengths, input_feature_keys,
                                input_feature_values),
    )
    parameters.extend(new_parameters)
    state_normalized_dense_matrix, new_parameters = preprocessor.normalize_dense_matrix(
        state_dense_matrix,
        sorted_features,
        state_normalization_parameters,
        "state_norm",
        False,
    )
    parameters.extend(new_parameters)

    torch_init_net, torch_predict_net, new_parameters, actor_input_blob, actor_output_blob, min_action_training_blob, max_action_training_blob, min_action_serving_blob, max_action_serving_blob = DDPGPredictor.generate_train_net(
        trainer,
        model,
        min_action_range_tensor_serving,
        max_action_range_tensor_serving,
        model_on_gpu,
    )
    parameters.extend(new_parameters)
    # Wire the normalized state into the actor's input blob.
    net.Copy([state_normalized_dense_matrix], [actor_input_blob])

    # Both init nets are run before the predict net is appended.
    workspace.RunNetOnce(model.param_init_net)
    workspace.RunNetOnce(torch_init_net)

    net.AppendNet(torch_predict_net)

    # Scale actors actions from the training range to the serving range:
    # (a - train_min) / (train_max - train_min) * (serve_max - serve_min)
    # + serve_min
    prev_range = C2.Sub(max_action_training_blob, min_action_training_blob)
    new_range = C2.Sub(max_action_serving_blob, min_action_serving_blob)
    subtract_prev_min = C2.Sub(actor_output_blob, min_action_training_blob)
    div_by_prev_range = C2.Div(subtract_prev_min, prev_range)
    scaled_for_serving_actions = C2.Add(
        C2.Mul(div_by_prev_range, new_range), min_action_serving_blob)

    # Output lengths: a constant equal to the actor's last-layer
    # out_features, shaped like the flattened ArgMax of the actor output.
    output_lengths = "output/float_features.lengths"
    workspace.FeedBlob(output_lengths, np.zeros(1, dtype=np.int32))
    C2.net().ConstantFill(
        [C2.FlattenToVec(C2.ArgMax(actor_output_blob))],
        [output_lengths],
        value=trainer.actor.layers[-1].out_features,
        dtype=caffe2_pb2.TensorProto.INT32,
    )

    action_feature_ids_blob = C2.NextBlob("action_feature_ids")
    workspace.FeedBlob(action_feature_ids_blob,
                       np.array(action_feature_ids, dtype=np.int64))
    parameters.append(action_feature_ids_blob)

    # Output keys: the action feature ids tiled by the size of the input
    # lengths blob.
    output_keys = "output/float_features.keys"
    workspace.FeedBlob(output_keys, np.zeros(1, dtype=np.int64))
    num_examples, _ = C2.Reshape(C2.Size("input/float_features.lengths"),
                                 shape=[1])
    C2.net().Tile([action_feature_ids_blob, num_examples], [output_keys],
                  axis=1)

    # Output values: the rescaled actions flattened to a vector.
    output_values = "output/float_features.values"
    workspace.FeedBlob(output_values, np.zeros(1, dtype=np.float32))
    C2.net().FlattenToVec([scaled_for_serving_actions], [output_values])

    workspace.CreateNet(net)
    return DDPGPredictor(net, torch_init_net, parameters)
def test_parallelize_gpu_bmuf(self):
    """Verify the BMUF global update for a model on GPUs 0 and 1.

    After initialisation the momentum blobs must be zero. One iteration is
    run, the local parameters are captured, and the global update net is
    executed. The test then checks the momentum update
    (v = 0.5 * v_prev + block_gradient), that all parameter copies agree
    with the global blob, and that the parameters equal the previous
    global blob plus the new momentum.
    """
    fetch = workspace.FetchBlob
    model = cnn.CNNModelHelper(order="NHWC", name="test")
    gpu_ids = [0, 1]

    def build_inputs(model):
        return None

    self._generate_data(gpu_ids)

    data_parallel_model.Parallelize_GPU_BMUF(
        model,
        build_inputs,
        self._model_build_fun,
        self._param_update_fun,
        devices=gpu_ids,
    )

    data_parallel_model.RunInitNet(model)

    # Check initial momentum params are zeros
    grouped_names = list(viewkeys(model._device_grouped_blobs))
    self.assertEqual(grouped_names, ['fc_w', 'fc_b'])
    self.assertEqual(fetch('gpu_0/fc_b_v'), 0)
    np.testing.assert_equal(
        fetch('gpu_0/fc_w_v'),
        np.zeros(16).astype(np.float32).reshape(1, 16))

    # Run the algorithm for one iteration to have non-zero params.
    data_parallel_model.RunNet(model, 1)

    # Save iteration momentum and post local update params
    prev_v_b = fetch('gpu_0/fc_b_v')
    prev_v_w = fetch('gpu_0/fc_w_v')

    workspace.RunNetOnce(model.net)

    local_b_0 = fetch('gpu_0/fc_b')
    local_w_0 = fetch('gpu_0/fc_w')
    local_b_1 = fetch('gpu_1/fc_b')
    local_w_1 = fetch('gpu_1/fc_w')

    # Compute block gradients.
    prev_b_g = fetch('gpu_0/fc_b_g')
    prev_w_g = fetch('gpu_0/fc_w_g')
    workspace.RunNetOnce(model._global_model_param_updates_net)

    block_grad_b = (local_b_0 + local_b_1) / 2 - prev_b_g
    block_grad_w = (local_w_0 + local_w_1) / 2 - prev_w_g

    # State after the global update.
    v_b = fetch('gpu_0/fc_b_v')
    v_w = fetch('gpu_0/fc_w_v')
    w_g = fetch('gpu_0/fc_w_g')
    b_g = fetch('gpu_0/fc_b_g')
    w_0 = fetch('gpu_0/fc_w')
    b_0 = fetch('gpu_0/fc_b')
    w_1 = fetch('gpu_1/fc_w')
    b_1 = fetch('gpu_1/fc_b')

    # Check momentum update step
    np.testing.assert_equal(v_b, 0.5 * prev_v_b + block_grad_b)
    np.testing.assert_equal(v_w, 0.5 * prev_v_w + block_grad_w)

    # Every device copy must equal the global blob.
    for global_blob, device_blob in (
        (w_g, w_0),
        (w_g, w_1),
        (b_g, b_0),
        (b_g, b_1),
    ):
        np.testing.assert_equal(global_blob, device_blob)

    # Check params update step
    np.testing.assert_equal(w_0, prev_w_g + v_w)
    np.testing.assert_equal(b_0, prev_b_g + v_b)
def export(
    cls,
    trainer,
    state_normalization_parameters,
    action_normalization_parameters,
    int_features=False,
    model_on_gpu=False,
):
    """Export caffe2 preprocessor net and pytorch DQN forward pass as one
    caffe2 net.

    The resulting net reads sparse features from ``input/float_features.*``
    (and ``input/int_features.*`` when ``int_features`` is set), normalizes
    state and action features, runs the Q-network and writes two action
    indices per example (max-Q and softmax-sampled) to the
    ``output/int_single_categorical_features.*`` blobs.

    :param trainer ParametricDQNTrainer
    :param state_normalization_parameters state NormalizationParameters
    :param action_normalization_parameters action NormalizationParameters
    :param int_features boolean indicating if int features blob will be present
    :param model_on_gpu boolean indicating if the model is a GPU model or CPU model
    :return ParametricDQNPredictor built from the combined net and the
        collected parameter blob names
    """

    # Convert the pytorch Q-network into a caffe2 init/predict net pair.
    input_dim = trainer.num_features
    buffer = PytorchCaffe2Converter.pytorch_net_to_buffer(
        trainer.q_network, input_dim, model_on_gpu
    )
    qnet_input_blob, qnet_output_blob, caffe2_netdef = PytorchCaffe2Converter.buffer_to_caffe2_netdef(
        buffer
    )
    torch_workspace = caffe2_netdef.workspace

    # Copy every blob of the converter's workspace into the global one.
    parameters = torch_workspace.Blobs()
    for blob_str in parameters:
        workspace.FeedBlob(blob_str, torch_workspace.FetchBlob(blob_str))

    torch_init_net = core.Net(caffe2_netdef.init_net)
    torch_predict_net = core.Net(caffe2_netdef.predict_net)

    # ensure state and action IDs have no intersection
    # NOTE(review): this is an `assert`, so the check disappears under
    # `python -O` -- confirm that is acceptable for input validation.
    assert (
        len(
            set(state_normalization_parameters.keys())
            & set(action_normalization_parameters.keys())
        )
        == 0
    )

    model = model_helper.ModelHelper(name="predictor")
    net = model.net
    C2.set_model(model)

    # Placeholder input blobs, fed with single zero entries.
    workspace.FeedBlob("input/float_features.lengths", np.zeros(1, dtype=np.int32))
    workspace.FeedBlob("input/float_features.keys", np.zeros(1, dtype=np.int64))
    workspace.FeedBlob("input/float_features.values", np.zeros(1, dtype=np.float32))

    input_feature_lengths = "input_feature_lengths"
    input_feature_keys = "input_feature_keys"
    input_feature_values = "input_feature_values"

    if int_features:
        # Int features are cast to float and merged with the float
        # features into one lengths/keys/values triple.
        workspace.FeedBlob(
            "input/int_features.lengths", np.zeros(1, dtype=np.int32)
        )
        workspace.FeedBlob("input/int_features.keys", np.zeros(1, dtype=np.int64))
        workspace.FeedBlob("input/int_features.values", np.zeros(1, dtype=np.int32))
        C2.net().Cast(
            ["input/int_features.values"],
            ["input/int_features.values_float"],
            dtype=caffe2_pb2.TensorProto.FLOAT,
        )
        C2.net().MergeMultiScalarFeatureTensors(
            [
                "input/float_features.lengths",
                "input/float_features.keys",
                "input/float_features.values",
                "input/int_features.lengths",
                "input/int_features.keys",
                "input/int_features.values_float",
            ],
            [input_feature_lengths, input_feature_keys, input_feature_values],
        )
    else:
        # Float features only: copy them into the working blobs.
        C2.net().Copy(["input/float_features.lengths"], [input_feature_lengths])
        C2.net().Copy(["input/float_features.keys"], [input_feature_keys])
        C2.net().Copy(["input/float_features.values"], [input_feature_values])

    # State and action features are normalized from the same sparse input,
    # each with its own normalization parameters.
    preprocessor = PreprocessorNet(clip_anomalies=True)
    state_normalized_dense_matrix, new_parameters = preprocessor.normalize_sparse_matrix(
        input_feature_lengths,
        input_feature_keys,
        input_feature_values,
        state_normalization_parameters,
        "state_norm",
        False,
        False,
    )
    parameters.extend(new_parameters)
    action_normalized_dense_matrix, new_parameters = preprocessor.normalize_sparse_matrix(
        input_feature_lengths,
        input_feature_keys,
        input_feature_values,
        action_normalization_parameters,
        "action_norm",
        False,
        False,
    )
    parameters.extend(new_parameters)
    # Concatenate state and action along axis 1 to form the Q-network input.
    state_action_normalized = "state_action_normalized"
    state_action_normalized_dim = "state_action_normalized_dim"
    net.Concat(
        [state_normalized_dense_matrix, action_normalized_dense_matrix],
        [state_action_normalized, state_action_normalized_dim],
        axis=1,
    )

    net.Copy([state_action_normalized], [qnet_input_blob])

    # Both init nets are run before the predict net is appended.
    workspace.RunNetOnce(model.param_init_net)
    workspace.RunNetOnce(torch_init_net)

    net.AppendNet(torch_predict_net)

    # q_values is not used below; the Q-values are read back through
    # flat_q_values_key instead (presumably written by _forward_pass --
    # TODO confirm).
    new_parameters, q_values = RLPredictor._forward_pass(
        model, trainer, state_action_normalized, ["Q"], qnet_output_blob
    )
    parameters.extend(new_parameters)

    flat_q_values_key = (
        "output/string_weighted_multi_categorical_features.values.values"
    )
    num_examples, _ = C2.Reshape(C2.Size(flat_q_values_key), shape=[1])
    q_value_blob, _ = C2.Reshape(flat_q_values_key, shape=[1, -1])

    # Get 1 x n (number of examples) action index tensor under the max_q policy
    max_q_act_idxs = "max_q_policy_actions"
    C2.net().FlattenToVec([C2.ArgMax(q_value_blob)], [max_q_act_idxs])
    max_q_act_blob = C2.Tile(max_q_act_idxs, num_examples, axis=0)

    # Get 1 x n (number of examples) action index tensor under the softmax policy
    temperature = C2.NextBlob("temperature")
    parameters.append(temperature)
    workspace.FeedBlob(
        temperature, np.array([trainer.rl_temperature], dtype=np.float32)
    )
    # Q-values are divided by the temperature before the softmax.
    tempered_q_values = C2.Div(q_value_blob, temperature, broadcast=1)
    softmax_values = C2.Softmax(tempered_q_values)
    softmax_act_idxs_nested = "softmax_act_idxs_nested"
    C2.net().WeightedSample([softmax_values], [softmax_act_idxs_nested])
    softmax_act_blob = C2.Tile(
        C2.FlattenToVec(softmax_act_idxs_nested), num_examples, axis=0
    )

    # Concat action idx vecs to get 2 x n tensor [[a_maxq, ..], [a_softmax, ..]]
    # transpose & flatten to get [a_maxq, a_softmax, a_maxq, a_softmax, ...]
    max_q_act_blob = C2.Cast(max_q_act_blob, to=caffe2_pb2.TensorProto.INT64)
    softmax_act_blob = C2.Cast(softmax_act_blob, to=caffe2_pb2.TensorProto.INT64)
    max_q_act_blob_nested, _ = C2.Reshape(max_q_act_blob, shape=[1, -1])
    softmax_act_blob_nested, _ = C2.Reshape(softmax_act_blob, shape=[1, -1])
    # Append writes its result back into max_q_act_blob_nested.
    C2.net().Append(
        [max_q_act_blob_nested, softmax_act_blob_nested], [max_q_act_blob_nested]
    )
    transposed_action_idxs = C2.Transpose(max_q_act_blob_nested)
    flat_transposed_action_idxs = C2.FlattenToVec(transposed_action_idxs)
    output_values = "output/int_single_categorical_features.values"
    workspace.FeedBlob(output_values, np.zeros(1, dtype=np.int64))
    C2.net().Copy([flat_transposed_action_idxs], [output_values])

    # Output lengths: the constant 2 (two action indices), shaped like the
    # flat Q-values blob.
    output_lengths = "output/int_single_categorical_features.lengths"
    workspace.FeedBlob(output_lengths, np.zeros(1, dtype=np.int32))
    C2.net().ConstantFill(
        [flat_q_values_key],
        [output_lengths],
        value=2,
        dtype=caffe2_pb2.TensorProto.INT32,
    )

    # Output keys: the pair [0, 1] tiled num_examples times and flattened.
    output_keys = "output/int_single_categorical_features.keys"
    workspace.FeedBlob(output_keys, np.zeros(1, dtype=np.int64))
    output_keys_tensor, _ = C2.Concat(
        C2.ConstantFill(shape=[1, 1], value=0, dtype=caffe2_pb2.TensorProto.INT64),
        C2.ConstantFill(shape=[1, 1], value=1, dtype=caffe2_pb2.TensorProto.INT64),
        axis=0,
    )
    output_key_tile = C2.Tile(output_keys_tensor, num_examples, axis=0)
    C2.net().FlattenToVec([output_key_tile], [output_keys])
    workspace.CreateNet(net)
    return ParametricDQNPredictor(net, parameters, int_features)
def GetLoss(new_value):
    """Return the summed outputs of the net for a given input value.

    Writes `new_value` into the blob named by `input_to_check`, runs
    `net_copy` once and adds up the blobs listed in `outputs_with_grad`.
    Those three names come from the enclosing scope.
    """
    workspace.blobs[input_to_check] = new_value
    workspace.RunNetOnce(net_copy)
    total = 0
    for blob_name in outputs_with_grad:
        # Plain addition, never in-place, so no fetched blob is mutated.
        total = total + workspace.blobs[blob_name]
    return total
def test_dataset_ops(self):
    """
    1. Defining the schema of our dataset.

    This example schema could represent, for example, a search query log.
    """
    schema = Struct(
        # fixed size vector, which will be stored as a matrix when batched
        ('dense', Scalar((np.float32, 3))),
        # could represent a feature map from feature ID to float value
        ('floats', Map(Scalar(np.int32), Scalar(np.float32))),
        # could represent a multi-valued categorical feature map
        ('int_lists', Map(
            Scalar(np.int32),
            List(Scalar(np.int64)),
        )),
        # could represent a multi-valued, weighted categorical feature map
        ('id_score_pairs', Map(
            Scalar(np.int32),
            Map(Scalar(np.int64), Scalar(np.float32),
                keys_name='ids', values_name='scores'),
        )),
        # additional scalar information
        ('metadata', Struct(
            ('user_id', Scalar(np.int64)),
            ('user_embed', Scalar((np.float32, 2))),
            ('query', Scalar(str)),
        )),
    )
    """
    This is what the flattened fields for this schema look like, along
    with its type. Each one of these fields will be stored, read and
    writen as a tensor.
    """
    expected_fields = [
        ('dense', (np.float32, 3)),
        ('floats:lengths', np.int32),
        ('floats:values:keys', np.int32),
        ('floats:values:values', np.float32),
        ('int_lists:lengths', np.int32),
        ('int_lists:values:keys', np.int32),
        ('int_lists:values:values:lengths', np.int32),
        ('int_lists:values:values:values', np.int64),
        ('id_score_pairs:lengths', np.int32),
        ('id_score_pairs:values:keys', np.int32),
        ('id_score_pairs:values:values:lengths', np.int32),
        ('id_score_pairs:values:values:values:ids', np.int64),
        ('id_score_pairs:values:values:values:scores', np.float32),
        ('metadata:user_id', np.int64),
        ('metadata:user_embed', (np.float32, 2)),
        ('metadata:query', str),
    ]
    field_names = schema.field_names()
    field_types = schema.field_types()
    # zip() stops at the shortest input, so a schema that lost fields would
    # otherwise pass silently; pin the lengths before comparing pairwise.
    self.assertEqual(len(expected_fields), len(field_names))
    self.assertEqual(len(expected_fields), len(field_types))
    zipped = zip(expected_fields, field_names, field_types)
    for (ref_name, ref_type), name, dtype in zipped:
        self.assertEqual(ref_name, name)
        self.assertEqual(np.dtype(ref_type), dtype)
    """
    2. The contents of our dataset.

    Contents as defined below could represent, for example, a log of
    search queries along with dense, sparse features and metadata.
    The datset below has 3 top-level entries.
    """
    contents_raw = [
        # dense
        [[1.1, 1.2, 1.3], [2.1, 2.2, 2.3], [3.1, 3.2, 3.3]],
        # floats
        [1, 2, 3],  # len
        [11, 21, 22, 31, 32, 33],  # key
        [1.1, 2.1, 2.2, 3.1, 3.2, 3.3],  # value
        # int lists
        [2, 0, 1],  # len
        [11, 12, 31],  # key
        [2, 4, 3],  # value:len
        [111, 112, 121, 122, 123, 124, 311, 312, 313],  # value:value
        # id score pairs
        [1, 2, 2],  # len
        [11, 21, 22, 31, 32],  # key
        [1, 1, 2, 2, 3],  # value:len
        [111, 211, 221, 222, 311, 312, 321, 322, 323],  # value:ids
        [11.1, 21.1, 22.1, 22.2, 31.1, 31.2, 32.1, 32.2, 32.3],  # val:score
        # metadata
        [123, 234, 456],  # user_id
        [[0.2, 0.8], [0.5, 0.5], [0.7, 0.3]],  # user_embed
        ['dog posts', 'friends who like to', 'posts about ca'],  # query
    ]
    # convert the above content to ndarrays, checking against the schema
    contents = from_blob_list(schema, contents_raw)
    """
    3. Creating and appending to the dataset.

    We first create an empty dataset with the given schema.
    Then, a Writer is used to append these entries to the dataset.
    """
    ds = dataset.Dataset(schema)
    net = core.Net('init')
    ds.init_empty(net)

    content_blobs = NewRecord(net, contents)
    FeedRecord(content_blobs, contents)
    writer = ds.writer(init_net=net)
    writer.write_record(net, content_blobs)
    workspace.RunNetOnce(net)
    """
    4. Iterating through the dataset contents.

    If we were to iterate through the top level entries of our dataset,
    this is what we should expect to see:
    """
    entries_raw = [
        (
            [[1.1, 1.2, 1.3]],  # dense
            [1],
            [11],
            [1.1],  # floats
            [2],
            [11, 12],
            [2, 4],
            [111, 112, 121, 122, 123, 124],  # intlst
            [1],
            [11],
            [1],
            [111],
            [11.1],  # id score pairs
            [123],
            [[0.2, 0.8]],
            ['dog posts'],  # metadata
        ),
        (
            [[2.1, 2.2, 2.3]],  # dense
            [2],
            [21, 22],
            [2.1, 2.2],  # floats
            [0],
            [],
            [],
            [],  # int list
            [2],
            [21, 22],
            [1, 2],
            [211, 221, 222],
            [21.1, 22.1, 22.2],
            [234],
            [[0.5, 0.5]],
            ['friends who like to'],  # metadata
        ),
        (
            [[3.1, 3.2, 3.3]],  # dense
            [3],
            [31, 32, 33],
            [3.1, 3.2, 3.3],  # floats
            [1],
            [31],
            [3],
            [311, 312, 313],  # int lst
            [2],
            [31, 32],
            [2, 3],
            [311, 312, 321, 322, 323],
            [31.1, 31.2, 32.1, 32.2, 32.3],  # id score list
            [456],
            [[0.7, 0.3]],
            ['posts about ca'],  # metadata
        ),
        # after the end of the dataset, we will keep getting empty vectors
        ([], ) * 16,
        ([], ) * 16,
    ]
    entries = [from_blob_list(schema, e) for e in entries_raw]
    """
    Let's go ahead and create the reading nets.
    We will run `read` net multiple times and assert that we are reading the
    entries the way we stated above.
    """
    read_init_net = core.Net('read_init')
    read_next_net = core.Net('read_next')
    reader = ds.reader(read_init_net)
    _, batch = reader.read_record(read_next_net)

    workspace.RunNetOnce(read_init_net)
    workspace.CreateNet(read_next_net)

    for entry in entries:
        workspace.RunNet(str(read_next_net))
        actual = FetchRecord(batch)
        _assert_records_equal(actual, entry)
    """
    5. Reading/writing in a single plan

    If all of operations on the data are expressible as Caffe2 operators,
    we don't need to load the data to python, iterating through the dataset
    in a single Plan.

    Where we will process the dataset a little and store it in a second
    dataset. We can reuse the same Reader since it supports reset.
    """
    reset_net = core.Net('reset_net')
    reader.reset(reset_net)
    read_step, batch = reader.execution_step()
    """ We will add the line number * 1000 to the feature ids. """
    process_net = core.Net('process')
    line_no = Const(process_net, 0, dtype=np.int32)
    const_one = Const(process_net, 1000, dtype=np.int32)
    process_net.Add([line_no, const_one], [line_no])
    field = batch.floats.keys.get()
    process_net.Print(field, [])
    process_net.Add([field, line_no], field, broadcast=1, axis=0)
    """ Lets create a second dataset and append to it. """
    ds2 = dataset.Dataset(schema, name='dataset2')
    ds2.init_empty(reset_net)
    writer = ds2.writer(reset_net)
    writer.write_record(process_net, batch)
    # commit is not necessary for DatasetWriter but will add it for
    # generality of the example
    commit_net = core.Net('commit')
    writer.commit(commit_net)
    """ Time to create and run a plan which will do the processing """
    plan = core.Plan('process')
    plan.AddStep(core.execution_step('reset', reset_net))
    plan.AddStep(read_step.AddNet(process_net))
    plan.AddStep(core.execution_step('commit', commit_net))
    workspace.RunPlan(plan)
    """
    Now we should have dataset2 populated.
    """
    ds2_data = FetchRecord(ds2.content())
    field = ds2_data.floats.keys
    # Undo the per-line offset added by process_net so the record can be
    # compared against the original contents.
    field.set(blob=field.get() - [1000, 2000, 2000, 3000, 3000, 3000])
    _assert_records_equal(contents, ds2_data)
    """
    6. Slicing a dataset

    You can create a new schema from pieces of another schema and reuse
    the same data.
    """
    subschema = Struct(('top_level', schema.int_lists.values))
    int_list_contents = contents.int_lists.values.field_names()
    self.assertEqual(len(subschema.field_names()), len(int_list_contents))
    """
    7. Random Access a dataset

    """
    read_init_net = core.Net('read_init')
    read_next_net = core.Net('read_next')

    idx = np.array([2, 1, 0])
    indices_blob = Const(read_init_net, idx, name='indices')
    reader = ds.random_reader(read_init_net, indices_blob)
    reader.computeoffset(read_init_net)

    _, batch = reader.read_record(read_next_net)

    workspace.CreateNet(read_init_net)
    workspace.RunNetOnce(read_init_net)

    workspace.CreateNet(read_next_net)
    read_next_net_name = str(read_next_net)

    for i in range(len(entries)):
        # The first len(idx) reads follow the index blob; later reads are
        # past the end and map to the trailing empty entries.
        k = idx[i] if i in idx else i
        entry = entries[k]
        workspace.RunNet(read_next_net_name)
        actual = FetchRecord(batch)
        _assert_records_equal(actual, entry)
    """
    8. Sort and shuffle a dataset

    This sort the dataset using the score of a certain column,
    and then shuffle within each chunk of size batch_size * shuffle_size
    before shuffling the chunks.

    """
    read_init_net = core.Net('read_init')
    read_next_net = core.Net('read_next')

    reader = ds.random_reader(read_init_net)
    reader.sort_and_shuffle(read_init_net, 'int_lists:lengths', 1, 2)
    reader.computeoffset(read_init_net)

    _, batch = reader.read_record(read_next_net)

    workspace.CreateNet(read_init_net)
    workspace.RunNetOnce(read_init_net)

    workspace.CreateNet(read_next_net)

    expected_idx = np.array([2, 1, 0])
    for i in range(len(entries)):
        k = expected_idx[i] if i in expected_idx else i
        entry = entries[k]
        workspace.RunNet(str(read_next_net))
        actual = FetchRecord(batch)
        _assert_records_equal(actual, entry)
def testPartialClone(self):
    """Exercise ``Net.ClonePartial`` on several sub-graphs of one small net.

    Builds a net of Concat/Sum ops (plus gradient ops and a few in-place
    Sums), clones different input/output slices of it under distinct
    prefixes, and checks each clone's op count, external inputs/outputs and
    the complete set of blobs it touches.
    """
    param_net = core.Net('params')
    p1 = param_net.ConstantFill([], ['p1'])
    workspace.CreateNet(param_net)
    workspace.RunNetOnce(param_net)

    orig_net = core.Net('original')
    a1 = orig_net.AddExternalInput('a1')
    a2 = orig_net.AddExternalInput('a2')
    b1, b2 = orig_net.Concat([a1, a2], ['b1', 'b2'], axis=0)
    c1 = orig_net.Sum([b1, p1], ['c1'])
    c2 = orig_net.Sum([b2], ['c2'])
    d = orig_net.Sum([c1, c2], ['d'])

    # test that gradient ops are ignored when partial-cloning
    orig_net.AddGradientOperators([d])

    # test some in-place ops
    k = orig_net.Sum([p1], ['k'])
    e = orig_net.Sum([d], ['e'])
    e = orig_net.Sum([e, k], [e])
    e = orig_net.Sum([e], [e])
    f = orig_net.Sum(e, ['f'])

    def net_assert(net, num_ops, inputs, outputs, internals):
        # Structural checks on the cloned net's proto.
        self.assertEqual(len(net.Proto().op), num_ops)
        self.assertEqual(set(net.Proto().external_input), inputs)
        self.assertEqual(set(net.Proto().external_output), outputs)
        # Every blob the clone mentions must be accounted for.
        seen_blobs = set(net.Proto().external_input)
        seen_blobs.update(net.Proto().external_output)
        for cloned_op in net.Proto().op:
            seen_blobs.update(cloned_op.input)
            seen_blobs.update(cloned_op.output)
        self.assertEqual(seen_blobs, inputs | outputs | internals)
        # create net to make sure its valid
        for blob_name in inputs:
            workspace.FeedBlob(blob_name, np.array([]))
        workspace.CreateNet(net)

    clone_f1, (d_f1, ) = orig_net.ClonePartial(
        'f1', {a1: 'a11', a2: 'a22'}, [d])
    net_assert(
        clone_f1, 4,
        {'p1', 'a11', 'a22'},
        {'f1/d'},
        {'f1/b1', 'f1/b2', 'f1/c1', 'f1/c2', 'p1'})
    self.assertTrue(isinstance(d_f1, core.BlobReference))
    self.assertEqual(d_f1.Net(), clone_f1)
    self.assertEqual(str(d_f1), 'f1/d')

    clone_f2, (d_f2, ) = orig_net.ClonePartial('f2', [b1, b2], [d])
    net_assert(
        clone_f2, 3,
        {'p1', 'b1', 'b2'},
        {'f2/d'},
        {'f2/c1', 'f2/c2', 'p1'})
    self.assertEqual(str(d_f2), 'f2/d')

    clone_f3, (c1_f3, ) = orig_net.ClonePartial('f3', [b1], [c1])
    net_assert(clone_f3, 1, {'p1', 'b1'}, {'f3/c1'}, {'p1'})
    self.assertEqual(str(c1_f3), 'f3/c1')

    clone_f4, (c1_f4, c2_f4) = orig_net.ClonePartial(
        'f4', [b1, b2], [c1, c2])
    net_assert(
        clone_f4, 2,
        {'p1', 'b1', 'b2'},
        {'f4/c1', 'f4/c2'},
        {'p1'})
    self.assertEqual(str(c1_f4), 'f4/c1')
    self.assertEqual(str(c2_f4), 'f4/c2')

    with self.assertRaises(AssertionError):
        orig_net.ClonePartial('f4', [a1, a2, c2], [d])

    clone_f5, (e_f5, ) = orig_net.ClonePartial('f5', [d], [e])
    net_assert(clone_f5, 4, {'p1', 'd'}, {'f5/e'}, {'f5/k', 'p1'})
    self.assertEqual(str(e_f5), 'f5/e')

    clone_f7, (e_f7, f_f7) = orig_net.ClonePartial('f7', [d], [e, f])
    net_assert(
        clone_f7, 5,
        {'p1', 'd'},
        {'f7/e', 'f7/f'},
        {'p1', 'f7/k'})
    self.assertEqual(str(e_f7), 'f7/e')
    self.assertEqual(str(f_f7), 'f7/f')

    param_net._CheckLookupTables()
    orig_net._CheckLookupTables()
# Params that will be optimized during training print "\n***************** OPT PARAM INFO *******************" print my_model.GetOptimizationParamInfo() #exit() ################################################################################## # Run the training # Initialization. train_dataset = UCF11_Dataset(TRAIN_DICTIONARY) for image, label in train_dataset.read(batch_size=1): workspace.FeedBlob("data", image) workspace.FeedBlob("label", label) break workspace.RunNetOnce(my_model.param_init_net) workspace.CreateNet(my_model.net, overwrite=True) # Main loop. batch_size = 20 print_freq = 1 losses = [] for epoch in range(5): for index, (image, label) in enumerate(train_dataset.read(batch_size)): workspace.FeedBlob("data", image) workspace.FeedBlob("label", label) workspace.RunNet(my_model.net) accuracy = float(workspace.FetchBlob("accuracy")) loss = workspace.FetchBlob("loss").mean() losses.append(loss) if index % print_freq == 0:
def lstm(self, create_lstm, t, n, d, ref, outputs_with_grads, memory_optim,
         forget_bias=0.0):
    """Build an LSTM with ``create_lstm`` and check it against ``ref``.

    The last operator added to the model's net is taken as the op under
    test. Random inputs are fed into the workspace, the op is compared with
    the reference implementation on CPU, and gradient checks are run.

    Args:
        create_lstm: callable that adds the LSTM to the model; invoked with
            the model, the input blobs, the initial states, ``d`` twice,
            and the keyword arguments shown in the body.
        t: leading dimension of the fed input; ``seq_lengths`` values are
            drawn from ``[1, t]``.
        n: second dimension of the fed input and number of sequence lengths.
        d: state size; the fed input's last dimension is ``d * 4``.
        ref: reference function passed to ``assertReferenceChecks``.
        outputs_with_grads: forwarded to ``create_lstm`` and to the
            gradient checker.
        memory_optim: forwarded to ``create_lstm`` as
            ``memory_optimization``.
        forget_bias: forwarded to ``create_lstm``.
    """
    model = CNNModelHelper(name='external')
    input_blob, seq_lengths, hidden_init, cell_init = (
        model.net.AddExternalInputs(
            'input_blob', 'seq_lengths', 'hidden_init', 'cell_init'))

    create_lstm(
        model, input_blob, seq_lengths, (hidden_init, cell_init),
        d, d, scope="external/recurrent",
        outputs_with_grads=outputs_with_grads,
        memory_optimization=memory_optim,
        forget_bias=forget_bias,
    )

    # The op under test is whichever operator create_lstm appended last.
    op = model.net._net.op[-1]

    workspace.RunNetOnce(model.param_init_net)
    # Feed data under the name the op actually reads as its first input.
    input_blob = op.input[0]

    def generate_random_state(n, d):
        # Randomly pick a 1-, 2- or 3-"ndim" layout for the initial state.
        # Index the single drawn element explicitly: calling int() on a
        # 1-element array is deprecated since NumPy 1.25. The random draws
        # themselves are unchanged.
        ndim = int(np.random.choice(3, 1)[0]) + 1
        if ndim == 1:
            return np.random.randn(1, n, d).astype(np.float32)
        random_state = np.random.randn(n, d).astype(np.float32)
        if ndim == 3:
            random_state = random_state.reshape([1, n, d])
        return random_state

    workspace.FeedBlob(
        str(input_blob),
        np.random.randn(t, n, d * 4).astype(np.float32))
    workspace.FeedBlob("hidden_init", generate_random_state(n, d))
    workspace.FeedBlob("cell_init", generate_random_state(n, d))
    workspace.FeedBlob(
        "seq_lengths",
        np.random.randint(1, t + 1, size=(n, )).astype(np.int32))
    inputs = [workspace.FetchBlob(name) for name in op.input]

    self.assertReferenceChecks(
        hu.cpu_do,
        op,
        inputs,
        ref,
        outputs_to_check=range(4),
    )

    # Checking for input, gates_t_w and gates_t_b gradients
    # NOTE(review): the index is passed as `outputs_to_check`, yet the
    # comment above speaks of inputs -- confirm against the checker's API.
    for param in range(5):
        self.assertGradientChecks(
            device_option=hu.cpu_do,
            op=op,
            inputs=inputs,
            outputs_to_check=param,
            outputs_with_grads=outputs_with_grads,
            threshold=0.01,
            stepsize=0.005,
        )
def train_resnet50(args):
    """Build a data-parallel ResNet-50 training model and run the training loop.

    Reads ``args.train_data`` (LMDB path), ``args.batch_size``,
    ``args.epochs_size`` and ``args.num_epochs``. Also relies on the names
    ``weight_decay``, ``base_learning_rate``, ``gpus`` and ``accuracy``,
    which are not defined in this function and must be provided by the
    enclosing module.

    Prints per-iteration throughput and the training accuracy after each
    epoch.
    """
    # Model building functions
    def create_resnet50_model_ops(model, loss_scale=1.0):
        # Creates a residual network
        [softmax, loss] = resnet.create_resnet50(
            model,
            "data",
            num_input_channels=3,
            num_labels=1000,
            label="label",
        )
        prefix = model.net.Proto().name
        loss = model.net.Scale(loss, prefix + "_loss", scale=loss_scale)
        brew.accuracy(model, [softmax, "label"], prefix + "_accuracy")
        return [loss]

    # Create ModelHelper object
    train_arg_scope = {
        'order': 'NCHW',
        'use_gpu_engine': True,
    }
    train_model = model_helper.ModelHelper(
        name="resnet50", arg_scope=train_arg_scope)

    reader = train_model.CreateDB(
        "train_reader", db=args.train_data, db_type="lmdb")

    def add_image_input_ops(model):
        # utilize the ImageInput operator to prep the images
        data, _ = brew.image_input(
            model,
            reader,
            ["data", "label"],
            batch_size=args.batch_size,
            # mean: to remove color values that are common
            mean=128.,
            # std is going to be modified randomly to influence the mean
            # subtraction
            std=128.,
            # scale to rescale each image to a common size
            scale=256,
            # crop to the square each image to exact dimensions
            crop=224,
            # not running in test mode
            is_test=False,
            # mirroring of the images will occur randomly
            mirror=1,
            use_caffe_datum=False,
        )
        # prevent back-propagation: optional performance improvement; may
        # not be observable at small scale
        model.net.StopGradient(data, data)

    def add_parameter_update_ops(model):
        brew.add_weight_decay(model, weight_decay)
        # Named `iteration` rather than `iter` to avoid shadowing the builtin.
        iteration = brew.iter(model, "iter")
        lr = model.net.LearningRate(
            [iteration],
            "lr",
            base_lr=base_learning_rate,
            policy="step",
            stepsize=int(10 * args.epochs_size / args.batch_size),
            gamma=0.1,
        )
        for param in model.GetParams():
            param_grad = model.param_to_grad[param]
            param_momentum = model.param_init_net.ConstantFill(
                [param], param + '_momentum', value=0.0)

            # Update param_grad and param_momentum in place
            model.net.MomentumSGDUpdate(
                [param_grad, param_momentum, lr, param],
                [param_grad, param_momentum, param],
                # almost 100% but with room to grow
                momentum=0.9,
                # netsterov is a defenseman for the Montreal Canadiens, but
                # Nesterov Momentum works slightly better than standard
                # momentum
                nesterov=1,
            )

    dpm.Parallelize_GPU(
        train_model,
        input_builder_fun=add_image_input_ops,
        forward_pass_builder_fun=create_resnet50_model_ops,
        param_update_builder_fun=add_parameter_update_ops,
        devices=gpus,
        optimize_gradient_memory=True,
        cpu_device=True,
    )

    workspace.RunNetOnce(train_model.param_init_net)
    workspace.CreateNet(train_model.net)
    #print(train_model.net.Proto())

    # Both values are loop-invariant, so compute them once up front.
    num_iters = int(args.epochs_size / args.batch_size)
    progress_fmt = (
        "Finished iteration {:>" + str(len(str(num_iters))) + "}/{}" +
        " (epoch {:>" + str(len(str(args.num_epochs))) + "}/{})" +
        " ({:.2f} images/sec)")

    for epoch in range(args.num_epochs):
        for iteration in range(num_iters):
            t1 = time.time()
            workspace.RunNet(train_model.net.Proto().name)
            t2 = time.time()
            dt = t2 - t1

            print(progress_fmt.format(
                iteration + 1, num_iters, epoch + 1, args.num_epochs,
                args.batch_size / dt))

        # Get the average accuracy for the training model
        train_accuracy = accuracy(train_model)
        print("Train accuracy: {:.3f}".format(train_accuracy))
def test_stateful_convolution_forward_only(
    self,
    sequence_length,
    conv_window,
    batch_size,
    state_size,
):
    '''
    This unit test demonstrates another way of using RecurrentNetwork.

    Imagine that you want to compute a convolution over a sequence,
    but the sequence elements are not given to you from the beginning,
    so you have to loop over the sequence and compute the convolution
    for each element separately. This situation can occur during the
    inference/generation step of a neural network.

    First of all, you have to provide the actual input via recurrent
    states, since the input of RecurrentNetwork should be known in
    advance. Here, we use `fake_inputs` as the input, and it is used by
    the op to extract the batch size and sequence length. The actual
    input sequence is stored in the recurrent state `input_state`. At
    every step we generate a new element via input_state_t (in this
    example, input_state_t is generated at random, but in a real
    situation it can be created using the convolution output from the
    previous step).

    A few important differences from the regular RecurrentNetwork
    use case:

    1. input_state_t_prev is not only a single previous element of the
       input_state sequence. It is the last conv_window elements
       including (!) the current one - input_state_t. We specify that
       using the `link_window` argument of RecurrentNetwork. We need
       that many elements to compute a single convolution step. Also,
       note that `link_window` specifies how many elements to link
       starting at the `timestep` + `link_offset` position.

    2. The first few steps might require additional zero padding from
       the left, since not enough elements of the input_state sequence
       are available yet. So the initial_state for input_state contains
       several elements (exactly as many pads as we need for the first
       step). Because of that, all offsetting over the input_state
       sequence is shifted by the length of initial_input_state: see
       the `link_offset` and `alias_offset` arguments of
       RecurrentNetwork.

    In this test, we assert that we get the same result if we apply the
    convolution over all elements simultaneously, since the whole
    input_state sequence was generated at the end.
    '''
    model = CNNModelHelper(name='model')
    fake_inputs = model.param_init_net.UniformFill(
        [],
        'fake_inputs',
        min=-1.0,
        max=1.0,
        shape=[sequence_length, batch_size, state_size],
    )
    initial_input_state = model.param_init_net.ConstantFill(
        [],
        'initial_input_state',
        value=0.0,
        shape=[conv_window - 1, batch_size, state_size],
    )
    initial_output_state = model.param_init_net.ConstantFill(
        [],
        'initial_output_state',
        value=0.0,
        shape=[1, batch_size, state_size],
    )
    step_model = CNNModelHelper(name='step_model', param_model=model)
    (
        fake_input_t,
        timestep,
        input_state_t_prev,
    ) = step_model.net.AddExternalInputs(
        'fake_input_t',
        'timestep',
        'input_state_t_prev',
    )
    conv_filter = step_model.param_init_net.XavierFill(
        [],
        'conv_filter',
        shape=[state_size, 1, conv_window, state_size],
    )
    conv_bias = step_model.param_init_net.ConstantFill(
        [],
        'conv_bias',
        shape=[state_size],
        value=0.0,
    )
    step_model.params.extend([conv_filter, conv_bias])
    input_state_t = step_model.net.UniformFill(
        [],
        'input_state_t',
        min=-1.0,
        max=1.0,
        shape=[1, batch_size, state_size],
    )
    output_state_t = self._convolution_1d(
        model=step_model,
        inputs=input_state_t_prev,
        conv_window=conv_window,
        conv_filter=conv_filter,
        conv_bias=conv_bias,
        output_name='output_state_t',
        left_pad=False,
    )
    initial_recurrent_states = [initial_input_state, initial_output_state]
    all_inputs = (
        [fake_inputs] + step_model.params + initial_recurrent_states
    )
    all_outputs = ['input_state_all', 'output_state_all']
    recurrent_states = ['input_state', 'output_state']
    # Operator arguments are built as concrete lists: on Python 3, map()
    # returns a one-shot iterator, not a list.
    input_state_all, output_state_all, _ = model.net.RecurrentNetwork(
        all_inputs,
        all_outputs + ['step_workspaces'],
        param=[all_inputs.index(p) for p in step_model.params],
        alias_src=recurrent_states,
        alias_dst=all_outputs,
        alias_offset=[conv_window - 1, 1],
        recurrent_states=recurrent_states,
        initial_recurrent_state_ids=[
            all_inputs.index(state) for state in initial_recurrent_states
        ],
        link_internal=[
            str(blob)
            for blob in (input_state_t_prev, input_state_t, output_state_t)
        ],
        link_external=['input_state', 'input_state', 'output_state'],
        link_offset=[0, conv_window - 1, 1],
        link_window=[conv_window, 1, 1],
        backward_link_internal=[],
        backward_link_external=[],
        backward_link_offset=[],
        step_net=str(step_model.net.Proto()),
        backward_step_net='',
        timestep='timestep' if timestep is None else str(timestep),
        outputs_with_grads=[],
    )

    # Reference result: one convolution over the whole generated sequence.
    output_states_2 = self._convolution_1d(
        model=model,
        inputs=input_state_all,
        conv_window=conv_window,
        conv_filter=conv_filter,
        conv_bias=conv_bias,
        output_name='output_states_2',
        left_pad=True,
    )

    workspace.RunNetOnce(model.param_init_net)
    workspace.RunNetOnce(model.net)

    np.testing.assert_almost_equal(
        workspace.FetchBlob(output_state_all),
        workspace.FetchBlob(output_states_2),
    )
# - The instantiated net's Run() function is called # # Before we do anything, we should clear any earlier workspace variables with `ResetWorkspace()`. # # Then there are two ways to run a net from Python. We will do the first option in the example below. # # 1. Call `workspace.RunNetOnce()`, which instantiates, runs and immediately destructs the network # 2. Call `workspace.CreateNet()` to create the C++ net object owned by the workspace, then call `workspace.RunNet()`, passing the name of the network to it # # # In[23]: workspace.ResetWorkspace() print("Current blobs in the workspace: {}".format(workspace.Blobs())) workspace.RunNetOnce(net) print("Blobs in the workspace after execution: {}".format(workspace.Blobs())) # Let's dump the contents of the blobs for name in workspace.Blobs(): print("{}:\n{}".format(name, workspace.FetchBlob(name))) # Now let's try the second way to create the net, and run it. First, clear the variables with `ResetWorkspace()`. Then create the net with the workspace's `net` object that we created earlier using `CreateNet(net_object)`. Finally, run the net with `RunNet(net_name)`. # In[24]: workspace.ResetWorkspace() print("Current blobs in the workspace: {}".format(workspace.Blobs())) workspace.CreateNet(net) workspace.RunNet(net.Proto().name) print("Blobs in the workspace after execution: {}".format(workspace.Blobs())) for name in workspace.Blobs():