def build_embedding_decoder( model, decoder_layer_configs, inputs, input_lengths, encoder_lengths, encoder_outputs, weighted_encoder_outputs, final_encoder_hidden_states, final_encoder_cell_states, encoder_units_per_layer, vocab_size, embeddings, embedding_size, attention_type, forward_only, num_gpus=0, scope=None, ): with core.NameScope(scope or ''): if num_gpus == 0: embedded_decoder_inputs = model.net.Gather( [embeddings, inputs], ['embedded_decoder_inputs'], ) else: with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)): embedded_decoder_inputs_cpu = model.net.Gather( [embeddings, inputs], ['embedded_decoder_inputs_cpu'], ) embedded_decoder_inputs = model.CopyCPUToGPU( embedded_decoder_inputs_cpu, 'embedded_decoder_inputs', ) decoder_cells = [] decoder_units_per_layer = [] for i, layer_config in enumerate(decoder_layer_configs): num_units = layer_config['num_units'] decoder_units_per_layer.append(num_units) if i == 0: input_size = embedding_size else: input_size = decoder_cells[-1].get_output_dim() cell = rnn_cell.LSTMCell( name=get_layer_scope(scope, 'decoder', i), forward_only=forward_only, input_size=input_size, hidden_size=num_units, forget_bias=0.0, memory_optimization=False, ) dropout_keep_prob = layer_config.get('dropout_keep_prob', None) if dropout_keep_prob is not None: dropout_ratio = 1.0 - dropout_keep_prob cell = rnn_cell.DropoutCell( internal_cell=cell, dropout_ratio=dropout_ratio, forward_only=forward_only, is_test=False, name=get_layer_scope(scope, 'decoder_dropout', i), ) decoder_cells.append(cell) states = build_initial_rnn_decoder_states( model=model, encoder_units_per_layer=encoder_units_per_layer, decoder_units_per_layer=decoder_units_per_layer, final_encoder_hidden_states=final_encoder_hidden_states, final_encoder_cell_states=final_encoder_cell_states, use_attention=(attention_type != 'none'), ) attention_decoder = LSTMWithAttentionDecoder( encoder_outputs=encoder_outputs, encoder_output_dim=encoder_units_per_layer[-1], encoder_lengths=encoder_lengths, vocab_size=vocab_size, attention_type=attention_type, embedding_size=embedding_size, decoder_num_units=decoder_units_per_layer[-1], decoder_cells=decoder_cells, weighted_encoder_outputs=weighted_encoder_outputs, ) decoder_outputs, _ = attention_decoder.apply_over_sequence( model=model, inputs=embedded_decoder_inputs, seq_lengths=input_lengths, initial_states=states, ) # we do softmax over the whole sequence # (max_length in the batch * batch_size) x decoder embedding size # -1 because we don't know max_length yet decoder_outputs_flattened, _ = model.net.Reshape( [decoder_outputs], [ 'decoder_outputs_flattened', 'decoder_outputs_and_contexts_combination_old_shape', ], shape=[-1, attention_decoder.get_output_dim()], ) decoder_outputs = decoder_outputs_flattened decoder_output_dim = attention_decoder.get_output_dim() return (decoder_outputs, decoder_output_dim)
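# A minimal sketch of the gather-on-CPU-then-copy pattern the decoder above
# uses when num_gpus > 0; blob names here are illustrative, not taken from
# the decoder itself:
from caffe2.proto import caffe2_pb2
from caffe2.python import core, model_helper

sketch_model = model_helper.ModelHelper(name="embedding_gather_sketch")
with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
    # The embedding table stays on CPU; Gather runs there.
    embedded_cpu = sketch_model.net.Gather(
        ['embeddings', 'token_ids'], ['embedded_cpu'],
    )
# The copy op inherits whatever (GPU) device scope surrounds the caller.
embedded_gpu = sketch_model.net.CopyCPUToGPU(embedded_cpu, 'embedded_gpu')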
def test_multiple_optimizers(self): from caffe2.python import brew, core, optimizer from caffe2.python.model_helper import ModelHelper model = ModelHelper(name="test") fc1 = brew.fc(model, 'data', 'fc1', 100, 50) fc2 = brew.fc(model, fc1, 'fc2', 50, 25) pred = brew.fc(model, fc2, 'fc3', 25, 10) (softmax, loss) = model.SoftmaxWithLoss( [pred, 'label'], ['softmax', 'loss'], ) model.AddGradientOperators([loss]) param_to_device = optimizer._get_param_to_device(model) def infer_blob_device(blob_name): return optimizer.get_param_device(blob_name, "{}_grad".format(blob_name), param_to_device) sgd_1 = optimizer.SgdOptimizer(base_learning_rate=0.1) sgd_2 = optimizer.SgdOptimizer(base_learning_rate=0.2) adagrad = optimizer.AdagradOptimizer() # Check that the same optimizer instance shares one learning rate blob. with core.DeviceScope(infer_blob_device("fc1_w")): sgd_1(model.net, model.param_init_net, "fc1_w", "fc1_w_grad") with core.DeviceScope(infer_blob_device("fc1_b")): sgd_1(model.net, model.param_init_net, "fc1_b", "fc1_b_grad") fc1_lr_blobs = [] for op in model.net.Proto().op: if op.type == 'WeightedSum' and (op.input[0] == 'fc1_w' or op.input[0] == 'fc1_b'): fc1_lr_blobs.append(op.input[3]) self.assertEqual(fc1_lr_blobs[0], fc1_lr_blobs[1]) # Check that a different instance of the same optimizer has a different lr. with core.DeviceScope(infer_blob_device("fc2_w")): sgd_2(model.net, model.param_init_net, "fc2_w", "fc2_w_grad") with core.DeviceScope(infer_blob_device("fc2_b")): sgd_2(model.net, model.param_init_net, "fc2_b", "fc2_b_grad") fc2_lr_blobs = [] for op in model.net.Proto().op: if op.type == 'WeightedSum' and (op.input[0] == 'fc2_w' or op.input[0] == 'fc2_b'): self.assertTrue(op.input[3] not in fc1_lr_blobs) fc2_lr_blobs.append(op.input[3]) self.assertEqual(fc2_lr_blobs[0], fc2_lr_blobs[1]) # Check the different optimizer type case. with core.DeviceScope(infer_blob_device("fc3_w")): adagrad(model.net, model.param_init_net, "fc3_w", "fc3_w_grad") with core.DeviceScope(infer_blob_device("fc3_b")): adagrad(model.net, model.param_init_net, "fc3_b", "fc3_b_grad") fc3_lr_blobs = [] for op in model.net.Proto().op: if op.type == 'Adagrad' and (op.input[0] == 'fc3_w' or op.input[0] == 'fc3_b'): self.assertTrue(op.input[3] not in fc2_lr_blobs) self.assertTrue(op.input[3] not in fc1_lr_blobs) fc3_lr_blobs.append(op.input[3]) self.assertEqual(fc3_lr_blobs[0], fc3_lr_blobs[1])
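# For the common single-optimizer case, optimizer.build_sgd attaches SGD to
# every parameter with a shared learning-rate blob per device; a minimal
# sketch (layer sizes are illustrative):
from caffe2.python import brew, optimizer
from caffe2.python.model_helper import ModelHelper

single_opt_model = ModelHelper(name="single_optimizer")
pred = brew.fc(single_opt_model, 'data', 'fc1', 100, 10)
softmax, loss = single_opt_model.SoftmaxWithLoss(
    [pred, 'label'], ['softmax', 'loss'],
)
single_opt_model.AddGradientOperators([loss])
optimizer.build_sgd(single_opt_model, base_learning_rate=0.1, policy="fixed")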
def _build( model, optimizer, weights_only=False, use_param_info_optim=True, max_gradient_norm=None, allow_lr_injection=False, ): param_to_device = _get_param_to_device(model) # Validate there are no duplicate params model.Validate() params = [] for param_info in model.GetOptimizationParamInfo(): if weights_only and param_info.blob not in model.weights: continue params.append(param_info) lr_multiplier = None if max_gradient_norm is not None: lr_multiplier = _calc_norm_ratio( model, params, 'norm_clipped_grad_update', param_to_device, max_gradient_norm, ) if allow_lr_injection: if not model.net.BlobIsDefined(_LEARNING_RATE_INJECTION): lr_injection = model.param_init_net.ConstantFill( [], _LEARNING_RATE_INJECTION, shape=[1], value=1.0, ) else: lr_injection = _LEARNING_RATE_INJECTION if lr_multiplier is None: lr_multiplier = lr_injection else: lr_multiplier = model.net.Mul( [lr_multiplier, lr_injection], 'lr_multiplier', broadcast=1, ) optimizer.add_lr_multiplier(lr_multiplier) for param_info in params: param_name = str(param_info.blob) device = get_param_device(param_name, param_info.grad, param_to_device) with core.DeviceScope(device): if param_info.optimizer and use_param_info_optim: param_info.optimizer(model.net, model.param_init_net, param_info) else: optimizer(model.net, model.param_init_net, param_info) return optimizer
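# _build is normally reached through helpers such as optimizer.build_sgd.
# When allow_lr_injection=True, a scalar multiplier blob is created and can be
# re-fed at runtime to rescale the learning rate. A sketch, assuming
# `train_model` is a ModelHelper whose loss and gradients are already built
# (as in the snippet above), and assuming the injection blob resolves to
# 'lr_injection' (check _LEARNING_RATE_INJECTION in your copy of optimizer.py):
import numpy as np
from caffe2.python import optimizer, workspace

optimizer.build_sgd(
    train_model,
    base_learning_rate=0.1,
    policy="fixed",
    max_gradient_norm=5.0,
    allow_lr_injection=True,
)
workspace.RunNetOnce(train_model.param_init_net)
workspace.CreateNet(train_model.net)
# Halve the effective learning rate for the following iterations.
workspace.FeedBlob('lr_injection', np.array([0.5], dtype=np.float32))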
def run_model(self, devices, gpu): ''' Helper function for test_equiv ''' def input_builder_fun(model): return None def model_build_fun(model, loss_scale): fc = model.FC("data", "fc", 16, 1, ("ConstantFill", {}), ("ConstantFill", {})) fc_fl = model.FlattenToVec(fc, "fc_fl") sigm = model.Sigmoid(fc_fl, "sigm") sq = model.SquaredL2Distance([sigm, "label"], "sq") loss = model.AveragedLoss(sq, "loss") loss = model.Scale(loss, scale=loss_scale) # For testing explicit sync model.param_init_net.UniformFill([], ["sync_num"], shape=[1]) return [loss] def add_optimizer(model): return optimizer.build_sgd( model, 0.1, policy="fixed", max_gradient_norm=5.0, allow_lr_injection=True, ) workspace.ResetWorkspace() model = cnn.CNNModelHelper( order="NHWC", name="test{}".format(devices), ) data_parallel_model.Parallelize( model, input_builder_fun=input_builder_fun, forward_pass_builder_fun=model_build_fun, optimizer_builder_fun=add_optimizer, devices=devices, cpu_device=not gpu, shared_model=not gpu, combine_spatial_bn=not gpu, ) data_parallel_model.AddBlobSync(model, ["sync_num"]) # Light test for LR names lr_names = data_parallel_model.GetLearningRateBlobNames(model) self.assertGreater(len(lr_names), 0) np.random.seed(2603) # Each run has same input, independent of number of gpus batch_size = 64 for i in range(0, 10): full_data = np.random.rand(batch_size, 16) full_labels = np.round(full_data[:, 0]) batch_per_device = batch_size // len(devices) for (j, g) in enumerate(devices): st = j * batch_per_device en = st + batch_per_device data = full_data[st:en, :].astype(np.float32) labels = full_labels[st:en].astype(np.float32) with core.DeviceScope(core.DeviceOption(model._device_type, g)): workspace.FeedBlob( "{}_{}/data".format(model._device_prefix, g), data) workspace.FeedBlob( "{}_{}/label".format(model._device_prefix, g), labels) if i == 0: workspace.RunNetOnce(model.param_init_net) workspace.CreateNet(model.net) workspace.FeedBlob(model._device_prefix + "_0/sync_num", np.array([i * 2]).astype(np.float32), device_option=core.DeviceOption( model._device_type, 0)) workspace.RunNet(model.net.Proto().name) # Test AddBlobSync for j in model._devices: sync = workspace.FetchBlob(model._device_prefix + "_{}/sync_num".format(j))[0] self.assertTrue(abs(sync - i * 2) < 0.01) return workspace.FetchBlob("{}_0/fc_w".format(model._device_prefix))
def run_model(self, V, gpu_devices): def input_builder_fun(model): return None def model_build_fun(model, loss_scale): gpu_vecs_gathered = [] gpu_vecs = [] for num, vec in enumerate(self.vecs): gpu_vec = model.param_init_net.CopyCPUToGPU( vec, 'gpuvec_{}'.format(num), ) if num != 2: model.params.append(gpu_vec) gpu_vecs.append(gpu_vec) for num, gpu_vec in enumerate(gpu_vecs): gpu_vec_gathered = model.net.Gather( [gpu_vec, 'indices'], ['gpu_vec_gathered_{}'.format(num)]) gpu_vecs_gathered.append(gpu_vec_gathered) assert len(gpu_vecs_gathered) == 3 fc = model.net.FC( [ gpu_vecs_gathered[2], gpu_vecs_gathered[0], gpu_vecs_gathered[1], ], ['fc'], ) _, loss = model.net.SoftmaxWithLoss( [fc, 'label'], ['ce_loss', 'avg_loss'], only_loss=True, ) loss = model.Scale(loss, scale=loss_scale) model.net.Print(loss, [], limit=10) return [loss] def param_update_fun(model): ONE = model.param_init_net.ConstantFill( [], "ONE", shape=[1], value=1.0, ) LR = model.CopyCPUToGPU(self.LR, "LR") for param in model.GetParams(): param_grad = model.param_to_grad[param] if not isinstance(param_grad, core.GradientSlice): model.WeightedSum([param, ONE, param_grad, LR], param) else: model.net.ScatterWeightedSum( [ param, ONE, param_grad.indices, param_grad.values, ONE, ], param, ) workspace.ResetWorkspace() model = cnn.CNNModelHelper( order="NHWC", name="sparse_test{}".format(gpu_devices), ) batch_size = 32 batch_per_device = batch_size // len(gpu_devices) with core.NameScope("cpu"): with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)): self.ITER = model.Iter("ITER") self.LR = model.net.LearningRate( [self.ITER], "LR", base_lr=(-0.1), policy="fixed", ) ''' self.vecs consists of 3 big blobs on which we call Gather: 1) FC weights, shape=(V, 16) 2) FC bias, shape=(V) 3) FC input, shape=(batch_per_device, 16) ''' self.vecs = [ model.param_init_net.UniformFill([], "vec_{}".format(num), shape=[V, 16]) for num in range(2) ] self.vecs.append( model.param_init_net.UniformFill( [], "vec_2", shape=[batch_per_device, 16])) self.ONE_CPU = model.param_init_net.ConstantFill( [], "ONE_CPU", shape=[1], value=1.0, ) data_parallel_model.Parallelize_GPU( model, input_builder_fun=input_builder_fun, forward_pass_builder_fun=model_build_fun, param_update_builder_fun=param_update_fun, devices=gpu_devices, ) # Update the vecs with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)): for num, vec in enumerate(self.vecs[:-1]): model.CopyGPUToCPU("gpu_0/gpuvec_{}".format(num), vec) # Each run has same input, independent of number of gpus for i in range(0, 10): np.random.seed(2603) full_indices = np.random.permutation(V)[:batch_size].reshape( batch_size) full_labels = full_indices[:] % batch_per_device for (j, g) in enumerate(gpu_devices): st = j * batch_per_device en = st + batch_per_device indices = full_indices[st:en].astype(np.int32) labels = full_labels[st:en].astype(np.int32) with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, g)): workspace.FeedBlob("gpu_{}/indices".format(g), indices) workspace.FeedBlob("gpu_{}/label".format(g), labels) if i == 0: workspace.RunNetOnce(model.param_init_net) # Force vecs to be same on all runs orig_vecs = [ np.random.rand(V, 16).astype(np.float32), np.random.rand(V).astype(np.float32), np.random.rand(V, 16).astype(np.float32), ] for vec, orig_vec in zip(self.vecs, orig_vecs): workspace.FeedBlob(vec, orig_vec) for g in gpu_devices: for num, orig_vec in enumerate(orig_vecs): workspace.FeedBlob( "gpu_{}/gpuvec_{}".format(g, num), orig_vec, device_option=core.DeviceOption( caffe2_pb2.CUDA, g), ) 
workspace.CreateNet(model.net) workspace.RunNet(model.net.Proto().name) idx = workspace.FetchBlob('gpu_0/indices') grad_slices = [ workspace.FetchBlob('gpu_{}/gpu_vec_gathered_{}_grad'.format( g, num)) for g in gpu_devices for num in range(2) ] for grad_slice in grad_slices: # print (len(idx), len(grad_slice)) assert len(idx) == len(grad_slice), ( 'Number of indices {} is not the same as the number of gradient ' 'slices {}. This might lead to illegal memory access.'. format(len(idx), len(grad_slice)))
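# Gradients that flow through Gather arrive as core.GradientSlice objects
# carrying .indices and .values; a sketch of the two update paths exercised
# by the test above (here the slice is weighted by LR, a common variant):
from caffe2.python import core

def apply_sgd_updates(model, ONE, LR):
    for param in model.GetParams():
        grad = model.param_to_grad[param]
        if isinstance(grad, core.GradientSlice):
            # Sparse update: only the gathered rows are touched.
            model.net.ScatterWeightedSum(
                [param, ONE, grad.indices, grad.values, LR], param,
            )
        else:
            # Dense update.
            model.WeightedSum([param, ONE, grad, LR], param)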
def _AllReduceBlobsDistributed( blob_names, devices, model, net, rendezvous, max_concurrent_distributed_ops, ): num_workers = model.net.Proto().num_workers assert num_workers > 1, "Please specify more than 1 worker" all_reduce_engine = rendezvous['engine'] master_device_opt = core.DeviceOption(model._device_type, devices[0]) reducing_device_opt = master_device_opt context = CollectivesConcurrencyControl( "allreduce", max_concurrent_distributed_ops, model.param_init_net, rendezvous ) nccl_control_blob = None for blob_name in blob_names: master_blob = model._device_grouped_blobs[blob_name][devices[0]] blobs_group = list(viewvalues(model._device_grouped_blobs[blob_name])) assert master_blob in blobs_group # Remark: NCCLReduce does not support in-place modifications # so we need a temporary blob reduced_blob = str(master_blob) + "_red" def allreduce(blobs): with core.DeviceScope(reducing_device_opt): comm_world, control_input = \ context.get_control_and_context(blobs[0]) net.Allreduce( inputs=[comm_world] + blobs, outputs=blobs, name=blob_name, engine=all_reduce_engine, control_input=control_input, status_blob="allreduce_{}_status".format(blob_name), ) if rendezvous['engine'] == 'GLOO': # With Gloo cross GPU and cross machine allreduce # can be executed in a single operation allreduce(blobs_group) else: # Step 1: sum blobs from local GPUs to master GPU with core.DeviceScope(master_device_opt): model.ConstantFill(master_blob, reduced_blob, value=0.0) # Temp fix since NCCLReduce does not work net.NCCLAllreduce( blobs_group, blobs_group, control_input=nccl_control_blob, ) nccl_control_blob = blobs_group[0] net.Copy(master_blob, reduced_blob) # Step 2: allreduce between all hosts, between master GPUs allreduce([reduced_blob]) with core.DeviceScope(master_device_opt): net.Copy(reduced_blob, master_blob) # Step 3: broadcast locally _Broadcast(devices, model, net, blob_name)
def test_device_scope_check(self): with self.assertRaises(AssertionError): with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)): data_parallel_model.Parallelize_GPU(None, None, None)
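# Parallelize_GPU does its own device placement, so it asserts (as tested
# above) when invoked inside a CUDA DeviceScope. The intended call site sits
# outside any device scope; `build_forward` and `build_updates` below are
# hypothetical builder functions:
from caffe2.python import cnn, data_parallel_model

par_model = cnn.CNNModelHelper(order="NHWC", name="parallelize_usage")
data_parallel_model.Parallelize_GPU(
    par_model,
    input_builder_fun=lambda model: None,
    forward_pass_builder_fun=build_forward,   # must return a list of loss blobs
    param_update_builder_fun=build_updates,   # adds per-device update ops
    devices=[0, 1],
)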
def run_model(self, devices, gpu): ''' Helper function for test_equiv ''' def input_builder_fun(model): return None def model_build_fun(model, loss_scale): fc = model.FC("data", "fc", 16, 1, ("ConstantFill", {}), ("ConstantFill", {})) fc_fl = model.FlattenToVec(fc, "fc_fl") sigm = model.Sigmoid(fc_fl, "sigm") sq = model.SquaredL2Distance([sigm, "label"], "sq") loss = model.AveragedLoss(sq, "loss") loss = model.Scale(loss, scale=loss_scale) return [loss] def add_optimizer(model): optimizer.build_sgd(model, 0.1, policy="fixed") workspace.ResetWorkspace() model = cnn.CNNModelHelper( order="NHWC", name="test{}".format(devices), ) data_parallel_model.Parallelize( model, input_builder_fun=input_builder_fun, forward_pass_builder_fun=model_build_fun, optimizer_builder_fun=add_optimizer, devices=devices, cpu_device=not gpu, ) np.random.seed(2603) # Each run has same input, independent of number of gpus batch_size = 64 for i in range(0, 10): full_data = np.random.rand(batch_size, 16) full_labels = np.round(full_data[:, 0]) batch_per_device = batch_size // len(devices) for (j, g) in enumerate(devices): st = j * batch_per_device en = st + batch_per_device data = full_data[st:en, :].astype(np.float32) labels = full_labels[st:en].astype(np.float32) with core.DeviceScope(core.DeviceOption(model._device_type, g)): workspace.FeedBlob( "{}_{}/data".format(model._device_prefix, g), data) workspace.FeedBlob( "{}_{}/label".format(model._device_prefix, g), labels) if i == 0: workspace.RunNetOnce(model.param_init_net) workspace.CreateNet(model.net) workspace.RunNet(model.net.Proto().name) return workspace.FetchBlob("{}_0/fc_w".format(model._device_prefix))
def normalize_dense_matrix( self, input_matrix: str, features: List[int], normalization_parameters: Dict[int, NormalizationParameters], blobname_prefix: str, split_expensive_feature_groups: bool, ) -> Tuple[str, List[str]]: """ Normalizes inputs according to parameters. Expects a dense matrix whose ith column corresponds to feature i. Note that the Caffe2 BatchBoxCox operator isn't implemented on CUDA GPU so we need to use a CPU context. :param input_matrix: Input matrix to normalize. :param features: Array that maps feature ids to column indices. :param normalization_parameters: Mapping from feature names to NormalizationParameters. :param blobname_prefix: Prefix for input blobs to norm_net. :param num_output_features: The number of features in an output processed datapoint. If set to None, this function will compute it. """ with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)): feature_starts = self._get_type_boundaries( features, normalization_parameters) normalized_input_blobs = [] parameters: List[str] = [] for i, feature_type in enumerate(FEATURE_TYPES): start_index = feature_starts[i] if (i + 1) == len(FEATURE_TYPES): end_index = len(normalization_parameters) else: end_index = feature_starts[i + 1] if start_index == end_index: continue # No features of this type slices = [] split_feature_group, split_intervals = self._should_split_feature_group( split_expensive_feature_groups, start_index, end_index, feature_type) if split_feature_group: for j in range(len(split_intervals) - 1): slice_blob = self._get_input_blob_indexed( blobname_prefix, feature_type, j) C2.net().Slice( [input_matrix], [slice_blob], starts=[0, split_intervals[j]], ends=[-1, split_intervals[j + 1]], ) slices.append((slice_blob, split_intervals[j], split_intervals[j + 1])) else: sliced_input_features = self._get_input_blob( blobname_prefix, feature_type) C2.net().Slice( [input_matrix], [sliced_input_features], starts=[0, start_index], ends=[-1, end_index], ) slices.append( (sliced_input_features, start_index, end_index)) for (slice_blob, start, end) in slices: normalized_input_blob, blob_parameters = self.preprocess_blob( slice_blob, [ normalization_parameters[x] for x in features[start:end] ], ) logger.info( "Processed split ({}, {}) for feature type {}".format( start, end, feature_type)) parameters.extend(blob_parameters) normalized_input_blobs.append(normalized_input_blob) for i, inp in enumerate(normalized_input_blobs): logger.info("input# {}: {}".format(i, inp)) concatenated_input_blob, concatenated_input_blob_dim = C2.Concat( *normalized_input_blobs, axis=1) return concatenated_input_blob, parameters
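# BatchBoxCox has no CUDA implementation, which is why the normalization
# above pins its ops to CPU and slices feature columns out of the dense
# matrix. A standalone sketch of that Slice pattern (blob names and column
# ranges are illustrative):
from caffe2.proto import caffe2_pb2
from caffe2.python import core

norm_net = core.Net("normalize_sketch")
with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
    # Keep every row (ends=-1) and take feature columns [3, 7).
    norm_net.Slice(
        ['input_matrix'], ['continuous_features'],
        starts=[0, 3], ends=[-1, 7],
    )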
def main(argv_new): """Main entrypoint""" parser = argparse.ArgumentParser( description="Train a RL net to play in openAI GYM.") parser.add_argument("-x", "--number-steps-total", type=int, help="total number of training steps", default=1000000) parser.add_argument("-w", "--number-steps-timeout", type=int, help="number of steps before time out", default=-1) parser.add_argument("-i", "--number-iterations", type=int, help="total number of iterations", default=1000) parser.add_argument("-y", "--learn-every-n-iterations", type=int, help="training every n numbers of game iterations", default=2) parser.add_argument("-z", "--learn-batch-num-every-iteration", type=int, help="batch number for learning each time", default=100) parser.add_argument("-b", "--batch-size", type=int, help="batch size for training", default=128) parser.add_argument("-s", "--save-iteration", type=int, help="saving checkpoint every n number of iterations", default=-1) parser.add_argument("-p", "--path", help="path of the checkpoint file", default=MODEL_PATH) parser.add_argument("-q", "--maxq-learning", help="max q over actions instead of current", action="store_true", default=True) parser.add_argument("-c", "--constraint", help="constrained actions", action="store_true", default=False) parser.add_argument("-t", "--test", help="test (no learning and minimal epsilon)", action="store_true", default=False) parser.add_argument("-u", "--upload", help="upload after finishing training/testing", action="store_true", default=False) parser.add_argument("-v", "--verbosity", action="count", help="increase output verbosity", default=0) parser.add_argument("-g", "--gymenv", help="specify gym env for training", default="CartPole-v0") parser.add_argument("-r", "--render", help="render training", action="store_true", default=False) parser.add_argument("-a", "--model-id", help="specify training model unique id", default="new") parser.add_argument("-m", "--model-type", help="specify training model type:\ DQN or ACTORCRITIC", default="DQN") parser.add_argument("-o", "--optimizer", help="specify optimizer for training", default="SGD") parser.add_argument("-l", "--learning-rate", type=float, help="specify learning rate for training", default=0.01) parser.add_argument("-d", "--discount-gamma", type=float, help="specify discounted factor gamma for RL", default=0.9) parser.add_argument("--gpu", action="store_true", help="If set, training is going to use GPU 0", default=False) args = parser.parse_args(argv_new) print("args:", args) workspace.GlobalInit(['caffe2', '--caffe2_log_level=2']) workspace.ResetWorkspace() device = core.DeviceOption(caffe2_pb2.CUDA if args.gpu else caffe2_pb2.CPU, 0) with core.DeviceScope(device): Run(args)
def Test(args): if args.gpus is not None: gpus = [int(x) for x in args.gpus.split(',')] num_gpus = len(gpus) else: gpus = range(args.num_gpus) num_gpus = args.num_gpus if num_gpus > 0: total_batch_size = args.batch_size * num_gpus log.info("Running on GPUs: {}".format(gpus)) log.info("total_batch_size: {}".format(total_batch_size)) else: total_batch_size = args.batch_size log.info("Running on CPU") log.info("total_batch_size: {}".format(total_batch_size)) # Model building functions def create_model_ops(model, loss_scale): return model_builder.build_model( model=model, model_name=args.model_name, model_depth=args.model_depth, num_labels=args.num_labels, num_channels=args.num_channels, crop_size=args.crop_size, clip_length=( args.clip_length_of if args.input_type == 1 else args.clip_length_rgb ), loss_scale=loss_scale, is_test=1, pred_layer_name=args.pred_layer_name, ) test_model = cnn.CNNModelHelper( order="NCHW", name="video_model_test", use_cudnn=(True if args.use_cudnn == 1 else False), cudnn_exhaustive_search=True, ) test_reader, number_of_examples = model_builder.create_data_reader( test_model, name="test_reader", input_data=args.test_data, ) if args.num_iter <= 0: num_iter = int(number_of_examples / total_batch_size) else: num_iter = args.num_iter def test_input_fn(model): model_helper.AddVideoInput( test_model, test_reader, batch_size=args.batch_size, clip_per_video=args.clip_per_video, decode_type=1, length_rgb=args.clip_length_rgb, sampling_rate_rgb=args.sampling_rate_rgb, scale_h=args.scale_h, scale_w=args.scale_w, crop_size=args.crop_size, num_decode_threads=4, num_of_class=args.num_labels, random_mirror=False, random_crop=False, input_type=args.input_type, length_of=args.clip_length_of, sampling_rate_of=args.sampling_rate_of, frame_gap_of=args.frame_gap_of, do_flow_aggregation=args.do_flow_aggregation, flow_data_type=args.flow_data_type, get_rgb=(args.input_type == 0), get_optical_flow=(args.input_type == 1), get_video_id=args.get_video_id, use_local_file=args.use_local_file, ) if num_gpus > 0: data_parallel_model.Parallelize_GPU( test_model, input_builder_fun=test_input_fn, forward_pass_builder_fun=create_model_ops, param_update_builder_fun=None, devices=gpus ) else: test_model._device_type = caffe2_pb2.CPU test_model._devices = [0] device_opt = core.DeviceOption(test_model._device_type, 0) with core.DeviceScope(device_opt): # Because our loaded models are named with "gpu_x", keep the naming for now. # TODO: Save model using `data_parallel_model.ExtractPredictorNet` # to extract the model for "gpu_0". 
It also renames # the input and output blobs by stripping the "gpu_x/" prefix with core.NameScope("{}_{}".format("gpu", 0)): test_input_fn(test_model) create_model_ops(test_model, 1.0) workspace.RunNetOnce(test_model.param_init_net) workspace.CreateNet(test_model.net) if args.db_type == 'minidb': if num_gpus > 0: model_helper.LoadModel(args.load_model_path, args.db_type) data_parallel_model.FinalizeAfterCheckpoint(test_model) else: with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)): model_helper.LoadModel(args.load_model_path, args.db_type) elif args.db_type == 'pickle': if num_gpus > 0: model_loader.LoadModelFromPickleFile( test_model, args.load_model_path, use_gpu=True, root_gpu_id=gpus[0] ) data_parallel_model.FinalizeAfterCheckpoint(test_model) else: model_loader.LoadModelFromPickleFile( test_model, args.load_model_path, use_gpu=False ) else: log.warning("Unsupported db_type: {}".format(args.db_type)) # metric counters for classification clip_acc = 0 video_top1 = 0 video_topk = 0 video_count = 0 clip_count = 0 for i in range(num_iter): workspace.RunNet(test_model.net.Proto().name) num_devices = 1 # default for cpu if num_gpus > 0: num_devices = num_gpus for g in range(num_devices): # get labels label = workspace.FetchBlob( "gpu_{}".format(g) + '/label' ) # get predictions predicts = workspace.FetchBlob("gpu_{}".format(g) + '/softmax') assert predicts.shape[0] == args.batch_size * args.clip_per_video for j in range(args.batch_size): # get label for one video sample_label = label[j * args.clip_per_video] # get clip accuracy for k in range(args.clip_per_video): c1, _ = metric.accuracy_metric( predicts[j * args.clip_per_video + k, :], label[j * args.clip_per_video + k]) clip_acc = clip_acc + c1 # get all clip predictions for one video all_clips = predicts[ j * args.clip_per_video:(j + 1) * args.clip_per_video, :] # aggregate predictions into one video_pred = PredictionAggregation(all_clips, args.aggregation) c1, ck = metric.accuracy_metric( video_pred, sample_label, args.top_k) video_top1 = video_top1 + c1 video_topk = video_topk + ck video_count = video_count + args.batch_size clip_count = clip_count + label.shape[0] if i > 0 and i % args.display_iter == 0: log.info('Iter {}/{}: clip: {}, top1: {}, top 5: {}'.format( i, num_iter, clip_acc / clip_count, video_top1 / video_count, video_topk / video_count)) log.info("Test accuracy: clip: {}, top 1: {}, top{}: {}".format( clip_acc / clip_count, video_top1 / video_count, args.top_k, video_topk / video_count )) if num_gpus > 0: flops, params = model_helper.GetFlopsAndParams(test_model, gpus[0]) else: flops, params = model_helper.GetFlopsAndParams(test_model) log.info('FLOPs: {}, params: {}'.format(flops, params))
def testEqualToCudnn(self): with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType)): T = 8 batch_size = 4 input_dim = 8 hidden_dim = 31 workspace.FeedBlob("seq_lengths", np.array([T] * batch_size, dtype=np.int32)) workspace.FeedBlob( "target", np.zeros([T, batch_size, hidden_dim], dtype=np.float32)) workspace.FeedBlob( "hidden_init", np.zeros([1, batch_size, hidden_dim], dtype=np.float32)) workspace.FeedBlob( "cell_init", np.zeros([1, batch_size, hidden_dim], dtype=np.float32)) own_model = model_helper.ModelHelper(name="own_lstm") input_shape = [T, batch_size, input_dim] cudnn_model = model_helper.ModelHelper(name="cudnn_lstm") input_blob = cudnn_model.param_init_net.UniformFill( [], "input", shape=input_shape) workspace.FeedBlob( "CUDNN/hidden_init_cudnn", np.zeros([1, batch_size, hidden_dim], dtype=np.float32)) workspace.FeedBlob( "CUDNN/cell_init_cudnn", np.zeros([1, batch_size, hidden_dim], dtype=np.float32)) cudnn_output, cudnn_last_hidden, cudnn_last_state, param_extract = rnn_cell.cudnn_LSTM( model=cudnn_model, input_blob=input_blob, initial_states=("hidden_init_cudnn", "cell_init_cudnn"), dim_in=input_dim, dim_out=hidden_dim, scope="CUDNN", return_params=True, ) cudnn_loss = cudnn_model.AveragedLoss( cudnn_model.SquaredL2Distance([cudnn_output, "target"], "CUDNN/dist"), "CUDNN/loss") own_output, own_last_hidden, _, own_last_state, own_params = rnn_cell.LSTM( model=own_model, input_blob=input_blob, seq_lengths="seq_lengths", initial_states=("hidden_init", "cell_init"), dim_in=input_dim, dim_out=hidden_dim, scope="OWN", return_params=True, ) own_loss = own_model.AveragedLoss( own_model.SquaredL2Distance([own_output, "target"], "OWN/dist"), "OWN/loss") # Add gradients cudnn_model.AddGradientOperators([cudnn_loss]) own_model.AddGradientOperators([own_loss]) # Add parameter updates LR = cudnn_model.param_init_net.ConstantFill([], shape=[1], value=0.01) ONE = cudnn_model.param_init_net.ConstantFill([], shape=[1], value=1.0) for param in cudnn_model.GetParams(): cudnn_model.WeightedSum( [param, ONE, cudnn_model.param_to_grad[param], LR], param) for param in own_model.GetParams(): own_model.WeightedSum( [param, ONE, own_model.param_to_grad[param], LR], param) # Copy states over own_model.net.Copy(own_last_hidden, "hidden_init") own_model.net.Copy(own_last_state, "cell_init") cudnn_model.net.Copy(cudnn_last_hidden, "CUDNN/hidden_init_cudnn") cudnn_model.net.Copy(cudnn_last_state, "CUDNN/cell_init_cudnn") workspace.RunNetOnce(cudnn_model.param_init_net) workspace.CreateNet(cudnn_model.net) ## ## CUDNN LSTM MODEL EXECUTION ## # Get initial values from CuDNN LSTM so we can feed them # to our own. 
(param_extract_net, param_extract_mapping) = param_extract workspace.RunNetOnce(param_extract_net) cudnn_lstm_params = { input_type: {k: workspace.FetchBlob(v[0]) for k, v in viewitems(pars)} for input_type, pars in viewitems(param_extract_mapping) } # Run the model 3 times, so that some parameter updates are done workspace.RunNet(cudnn_model.net.Proto().name, 3) ## ## OWN LSTM MODEL EXECUTION ## # Map the cuDNN parameters to our own workspace.RunNetOnce(own_model.param_init_net) rnn_cell.InitFromLSTMParams(own_params, cudnn_lstm_params) # Run the model 3 times, so that some parameter updates are done workspace.CreateNet(own_model.net) workspace.RunNet(own_model.net.Proto().name, 3) ## ## COMPARE RESULTS ## # Then compare that final results after 3 runs are equal own_output_data = workspace.FetchBlob(own_output) own_last_hidden = workspace.FetchBlob(own_last_hidden) own_loss = workspace.FetchBlob(own_loss) cudnn_output_data = workspace.FetchBlob(cudnn_output) cudnn_last_hidden = workspace.FetchBlob(cudnn_last_hidden) cudnn_loss = workspace.FetchBlob(cudnn_loss) self.assertTrue(np.allclose(own_output_data, cudnn_output_data)) self.assertTrue(np.allclose(own_last_hidden, cudnn_last_hidden)) self.assertTrue(np.allclose(own_loss, cudnn_loss))
def lstm_with_attention( self, create_lstm_with_attention, encoder_output_length, encoder_output_dim, decoder_input_length, decoder_state_dim, batch_size, ref, gc, ): model = CNNModelHelper(name='external') with core.DeviceScope(gc): ( encoder_outputs, decoder_inputs, decoder_input_lengths, initial_decoder_hidden_state, initial_decoder_cell_state, initial_attention_weighted_encoder_context, ) = model.net.AddExternalInputs( 'encoder_outputs', 'decoder_inputs', 'decoder_input_lengths', 'initial_decoder_hidden_state', 'initial_decoder_cell_state', 'initial_attention_weighted_encoder_context', ) create_lstm_with_attention( model=model, decoder_inputs=decoder_inputs, decoder_input_lengths=decoder_input_lengths, initial_decoder_hidden_state=initial_decoder_hidden_state, initial_decoder_cell_state=initial_decoder_cell_state, initial_attention_weighted_encoder_context=( initial_attention_weighted_encoder_context), encoder_output_dim=encoder_output_dim, encoder_outputs=encoder_outputs, decoder_input_dim=decoder_state_dim, decoder_state_dim=decoder_state_dim, scope='external/LSTMWithAttention', ) op = model.net._net.op[-1] workspace.RunNetOnce(model.param_init_net) # This is original decoder_inputs after linear layer decoder_input_blob = op.input[0] workspace.FeedBlob( decoder_input_blob, np.random.randn( decoder_input_length, batch_size, decoder_state_dim * 4, ).astype(np.float32)) workspace.FeedBlob( 'external/LSTMWithAttention/encoder_outputs_transposed', np.random.randn( batch_size, encoder_output_dim, encoder_output_length, ).astype(np.float32), ) workspace.FeedBlob( 'external/LSTMWithAttention/weighted_encoder_outputs', np.random.randn( encoder_output_length, batch_size, encoder_output_dim, ).astype(np.float32), ) workspace.FeedBlob( decoder_input_lengths, np.random.randint(0, decoder_input_length + 1, size=(batch_size, )).astype(np.int32)) workspace.FeedBlob( initial_decoder_hidden_state, np.random.randn(1, batch_size, decoder_state_dim).astype(np.float32)) workspace.FeedBlob( initial_decoder_cell_state, np.random.randn(1, batch_size, decoder_state_dim).astype(np.float32)) workspace.FeedBlob( initial_attention_weighted_encoder_context, np.random.randn(1, batch_size, encoder_output_dim).astype(np.float32)) inputs = [workspace.FetchBlob(name) for name in op.input] self.assertReferenceChecks( device_option=gc, op=op, inputs=inputs, reference=ref, grad_reference=None, output_to_grad=None, outputs_to_check=range(6), ) gradients_to_check = [ index for (index, input_name) in enumerate(op.input) if input_name != 'decoder_input_lengths' ] for param in gradients_to_check: self.assertGradientChecks( device_option=gc, op=op, inputs=inputs, outputs_to_check=param, outputs_with_grads=[0, 4], threshold=0.01, stepsize=0.001, )
def run_model(self, gpu_devices): ''' Helper function for test_equiv ''' def input_builder_fun(model): return None def model_build_fun(model, loss_scale): fc = model.FC("data", "fc", 16, 1, ("ConstantFill", {}), ("ConstantFill", {})) fc_fl = model.FlattenToVec(fc, "fc_fl") sigm = model.Sigmoid(fc_fl, "sigm") sq = model.SquaredL2Distance([sigm, "label"], "sq") loss = model.AveragedLoss(sq, "loss") loss = model.Scale(loss, scale=loss_scale) return [loss] def param_update_fun(model): ITER = model.Iter("ITER") LR = model.net.LearningRate( [ITER], "LR", base_lr=(-0.1), policy="fixed", ) ONE = model.param_init_net.ConstantFill( [], "ONE", shape=[1], value=1.0, ) for param in model.GetParams(): grad = model.param_to_grad[param] model.WeightedSum([param, ONE, grad, LR], param) workspace.ResetWorkspace() model = cnn.CNNModelHelper( order="NHWC", name="test{}".format(gpu_devices), ) data_parallel_model.Parallelize_GPU( model, input_builder_fun=input_builder_fun, forward_pass_builder_fun=model_build_fun, param_update_builder_fun=param_update_fun, devices=gpu_devices, ) np.random.seed(2603) # Each run has same input, independent of number of gpus batch_size = 64 for i in range(0, 10): full_data = np.random.rand(batch_size, 16) full_labels = np.round(full_data[:, 0]) batch_per_device = batch_size // len(gpu_devices) for (j, g) in enumerate(gpu_devices): st = j * batch_per_device en = st + batch_per_device data = full_data[st:en, :].astype(np.float32) labels = full_labels[st:en].astype(np.float32) with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, g)): workspace.FeedBlob("gpu_{}/data".format(g), data) workspace.FeedBlob("gpu_{}/label".format(g), labels) if i == 0: workspace.RunNetOnce(model.param_init_net) workspace.CreateNet(model.net) print(i, workspace.FetchBlob("gpu_0/fc_w").flatten()[:5]) workspace.RunNet(model.net.Proto().name) return workspace.FetchBlob("gpu_0/fc_w")
def Parallelize_GPU_BMUF( model_helper_obj, input_builder_fun, forward_pass_builder_fun, param_update_builder_fun, block_learning_rate=1.0, block_momentum=None, devices=None, rendezvous=None, net_type='dag', master_gpu=None, use_nccl=False, optimize_gradient_memory=False, reset_momentum_sgd=False, warmup_iterations=None, max_concurrent_distributed_ops=4, ): ''' Function to create model that run on many GPUs and creates a net for parameter_updates that can be run independently for number of iterations then followed by another net that runs once to compute the final parameter updates according to block wise model update filtering rule described in : Scalable Training of Deep Learning Machines by Incremental Block Training with Intra-block Parallel Optimization and Blockwise Model-Update Filtering (ICASSP 2016). ''' assert isinstance(model_helper_obj, model_helper.ModelHelper) if devices is None: devices = list(range(0, workspace.NumCudaDevices())) if master_gpu is None: master_gpu = devices[0] model_helper_obj._devices = devices model_helper_obj._rendezvous = rendezvous model_helper_obj._device_type = caffe2_pb2.CUDA model_helper_obj._device_prefix = 'gpu' master_gpu_opt = core.DeviceOption(caffe2_pb2.CUDA, master_gpu) num_shards = rendezvous['num_shards'] if rendezvous else 1 num_workers = len(devices) * num_shards num_worker_threads = 4 * len(devices) if rendezvous: num_worker_threads += 8 loss_scale = 1.0 / num_workers if block_momentum is None: block_momentum = 1.0 - 1.0 / num_workers max_concurrent_distributed_ops = min( max_concurrent_distributed_ops, num_worker_threads - 1 ) model_helper_obj.net.Proto().num_workers = num_worker_threads model_helper_obj.net.Proto().type = net_type # A net for initializing global model parameters. Its called once in the # same step as net parameters initialization. model_helper_obj._global_model_init_net = core.Net('global_model_init') model_helper_obj._global_model_init_net.Proto().type = net_type model_helper_obj._global_model_init_net.Proto().num_workers = \ num_worker_threads # A net for computing final parameter updates. Its will run once after # running net (local models updates) for `num_local_iterations` times. 
model_helper_obj._global_model_param_updates_net = core.Net('global_model') model_helper_obj._global_model_param_updates_net.Proto().type = net_type model_helper_obj._global_model_param_updates_net.Proto().num_workers = \ num_worker_threads def _v(param): return "{}_v".format(param) def _g(param): return "{}_g".format(param) # Keep track of params that were in the model before: they are not # data parallel, so we need to handle them separately non_datapar_params = copy.copy(model_helper_obj.params) model_helper_obj._losses_by_gpu = {} def _InitializeModels(gpu_id): input_builder_fun(model_helper_obj) loss = forward_pass_builder_fun(model_helper_obj, loss_scale) model_helper_obj._losses_by_gpu[gpu_id] = loss _ForEachGPU(devices, _InitializeModels, scoped=True) model_helper_obj._device_grouped_blobs =\ _GroupByDevice(model_helper_obj, devices, model_helper_obj.params, non_datapar_params) model_helper_obj._param_names =\ model_helper_obj._device_grouped_blobs.keys() _AddGradientOperators( devices, model_helper_obj, model_helper_obj._losses_by_gpu ) _InferBlobDevice(model_helper_obj) def _InitializeParamUpdate(gpu_id): param_update_builder_fun(model_helper_obj) _ForEachGPU(devices, _InitializeParamUpdate, scoped=True) model_parameter_names = list( viewkeys(model_helper_obj._device_grouped_blobs) ) if warmup_iterations is not None: model_helper_obj._warmup_iterations = warmup_iterations # A net for broadcasting gpu-0 (master shard) parameters after # running net for `warmup_iterartions`. model_helper_obj._warmup_broadcast = core.Net('warmup-broadcast') model_helper_obj._warmup_broadcast.Proto().type = net_type model_helper_obj._warmup_broadcast.Proto().num_workers = \ num_worker_threads _SyncAllParams( devices, model_helper_obj, model_helper_obj.param_init_net, model_helper_obj._warmup_broadcast, rendezvous, model_parameter_names, max_concurrent_distributed_ops ) for param_name in model_helper_obj._device_grouped_blobs.keys(): param = model_helper_obj._device_grouped_blobs[param_name][master_gpu] with core.DeviceScope(master_gpu_opt): model_helper_obj._warmup_broadcast.Copy(param, _g(param)) # (Step-0) Initialize momentum parameters on master GPU. for param_name in viewkeys(model_helper_obj._device_grouped_blobs): param = model_helper_obj._device_grouped_blobs[param_name][master_gpu] with core.DeviceScope(master_gpu_opt): model_helper_obj._global_model_init_net.ConstantFill( param, _v(param), value=0.0 ) model_helper_obj._global_model_init_net.Copy(param, _g(param)) # (Step-1) Update models for num_local_iterations. # (Step-2) Comute post-local-updates average of the params. # Sum model params across GPUs and store resutls in param_avg blob. _AllReduceBlobs( model_parameter_names, devices, model_helper_obj, model_helper_obj._global_model_param_updates_net, rendezvous, use_nccl, max_concurrent_distributed_ops ) # (Step-3) Update momentum params : # param_v = block_momentum * param_v # + block_learning_Rate * (param_avg - param) # param = param + param_v for param_name in model_parameter_names: param = model_helper_obj._device_grouped_blobs[param_name][master_gpu] with core.DeviceScope(master_gpu_opt): # TODO(ataei) : Stop building the graph here to get model average ? 
model_helper_obj._global_model_param_updates_net.Scale( param, param, scale=1.0 / num_workers ) model_helper_obj._global_model_param_updates_net.Sub( [param, _g(param)], param ) model_helper_obj._global_model_param_updates_net.Scale( param, param, scale=block_learning_rate ) model_helper_obj._global_model_param_updates_net.Scale( _v(param), _v(param), scale=block_momentum ) model_helper_obj._global_model_param_updates_net.Add( [_v(param), param], _v(param) ) model_helper_obj._global_model_param_updates_net.Add( [_g(param), _v(param)], _g(param) ) model_helper_obj._global_model_param_updates_net.Copy( _g(param), param ) _SyncAllParams( devices, model_helper_obj, model_helper_obj.param_init_net, model_helper_obj._global_model_param_updates_net, rendezvous, model_parameter_names, max_concurrent_distributed_ops ) # Reset momentum-SGD parameters if reset_momentum_sgd: momentum_ops = [op for op in model_helper_obj.net.Proto().op if op.type == 'MomentumSGDUpdate'] for op in momentum_ops: momentum_blob = op.input[1] with core.DeviceScope(op.device_option): model_helper_obj._global_model_param_updates_net.ConstantFill( [momentum_blob], momentum_blob, value=0.0 ) if optimize_gradient_memory: _OptimizeGradientMemorySimple( model_helper_obj, model_helper_obj._losses_by_gpu, devices ) model_helper_obj._data_parallel_model_init_nets = [ model_helper_obj.param_init_net, model_helper_obj._global_model_init_net ] model_helper_obj._data_parallel_model_nets = [ model_helper_obj.net, (model_helper_obj._global_model_param_updates_net, 1) ]
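# The global update assembled above is blockwise model-update filtering; in
# NumPy terms the per-parameter rule is (v: momentum buffer, g: global model,
# param_sum: sum of the local models after the block of local iterations):
import numpy as np

def bmuf_update(param_sum, g, v, num_workers,
                block_momentum, block_learning_rate):
    param_avg = param_sum / num_workers                              # Step-2
    v = block_momentum * v + block_learning_rate * (param_avg - g)   # Step-3
    g = g + v
    param = g.copy()   # per-device params are reset to the new global model
    return param, g, v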
def test_inject_copy_placeholder_ops(self): ''' Test inject cross device copies with placeholder ops. Placeholder ops are decorator/fake ops that don't have operator schema. ''' # Create CPU and GPU devices on 2 nodes. cpu_device = [] gpu_device = [] for i in range(0, 2): cpu_device.append(caffe2_pb2.DeviceOption()) cpu_device[i].node_name = 'node:' + str(i) gpu_device.append(caffe2_pb2.DeviceOption()) gpu_device[i].device_type = caffe2_pb2.CUDA gpu_device[i].cuda_gpu_id = 0 gpu_device[i].node_name = 'node:' + str(i) send_node = 'node:0' recv_node = 'node:1' placeholder_send = 'Placeholder:Dummy:Send' placeholder_recv = 'Placeholder:Dummy:Recv' # init_net. init_net = core.Net("init_net") with core.DeviceScope(gpu_device[0]): weight = init_net.XavierFill([], 'fc_w', shape=[10, 100]) bias = init_net.ConstantFill([], 'fc_b', shape=[ 10, ]) with core.DeviceScope(cpu_device[0]): op = core.CreateOperator(placeholder_send, [weight, bias], [], dst_node=recv_node) init_net._net.op.extend([op]) # train_net train_net = core.Net("train_net") with core.DeviceScope(cpu_device[1]): # XXX. replace hardcoded op name. Move test to net_transforms. op = core.CreateOperator(placeholder_recv, [], [weight, bias], src_node=send_node) train_net._net.op.extend([op]) train_net.FC(["data", weight, bias], "fc1") # Inject cross device copies. init_net, x_dev_state = core.InjectCrossDeviceCopies( init_net, placeHolderOps=[placeholder_send, placeholder_recv]) train_net, x_dev_state = core.InjectCrossDeviceCopies( train_net, x_dev_state, placeHolderOps=[placeholder_send, placeholder_recv]) # Verify (init_net) op = init_net._net.op[2] self.assertEqual(op.type, "CopyGPUToCPU") self.assertEqual(op.device_option.device_type, 1) self.assertEqual(op.device_option.cuda_gpu_id, 0) self.assertEqual(op.output[0], "fc_w_cpu") op = init_net._net.op[3] self.assertEqual(op.type, "CopyGPUToCPU") self.assertEqual(op.device_option.device_type, 1) self.assertEqual(op.device_option.cuda_gpu_id, 0) self.assertEqual(op.output[0], "fc_b_cpu") op = init_net._net.op[4] self.assertEqual(op.type, placeholder_send) self.assertEqual(op.device_option.device_type, 0) self.assertEqual(op.input[0], "fc_w_cpu") self.assertEqual(op.input[1], "fc_b_cpu") # Verify (train_net) op = train_net._net.op[0] self.assertEqual(op.type, placeholder_recv) self.assertEqual(op.device_option.device_type, 0) self.assertEqual(op.output[0], "fc_w_cpu") self.assertEqual(op.output[1], "fc_b_cpu") op = train_net._net.op[3] self.assertEqual(op.type, "FC") self.assertEqual(op.device_option.device_type, 0) self.assertEqual(op.input[1], "fc_w_cpu") self.assertEqual(op.input[2], "fc_b_cpu")
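# InjectCrossDeviceCopies also works without placeholder ops; a minimal
# sketch with a GPU op consuming a CPU-produced blob:
from caffe2.proto import caffe2_pb2
from caffe2.python import core

xdev_net = core.Net("xdev")
xdev_net.Relu("data", "relu_cpu")            # default (CPU) device
with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
    xdev_net.Relu("relu_cpu", "relu_gpu")    # consumes the CPU blob
fixed_net, _ = core.InjectCrossDeviceCopies(xdev_net)
# fixed_net now contains a CopyCPUToGPU op feeding the second Relu.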
def Parallelize( model_helper_obj, input_builder_fun, forward_pass_builder_fun, param_update_builder_fun=None, optimizer_builder_fun=None, post_sync_builder_fun=None, devices=None, rendezvous=None, net_type='dag', broadcast_computed_params=True, optimize_gradient_memory=False, use_nccl=False, max_concurrent_distributed_ops=16, cpu_device=False, ): ''' Function to create a model that can run on many GPUs or CPUs. model_helper_obj: an object of ModelHelper input_builder_fun: Function that adds the input operators Note: Remember to instantiate reader outside of this function so all devices share same reader object. Signature: input_builder_fun(model) forward_pass_builder_fun: Function to add the operators to the model. Must return list of loss-blob references that are used to build the gradient. Loss scale parameter is passed, as you should scale the loss of your model by 1.0 / the total number of devices. Signature: forward_pass_builder_fun(model, loss_scale) param_update_builder_fun: Function that adds operators that are run after gradient update, such as updating the weights and weight decaying. This is called for each GPU separately. Signature: param_update_builder_fun(model) optimizer_builder_fun: Alternative to param_update_builder_fun, allows one to add an optimizer for the whole model. Called only once, without name or devicescope. post_sync_builder_fun: Function applied after initial parameter sync has been completed, such as keeping multi-precision parameters in sync. Signature: post_sync_builder_fun(model) devices: List of GPU ids, such as [0, 1, 2, 3], rendezvous: used for rendezvous in distributed computation, if None then only one node is used. To create rendezvous, use <TBD>. net_type: Network type optimize_gradient_memory: whether to apply 'memonger' to share blobs in gradient computation to reduce memory footprint cpu_device Use CPU instead of GPU ''' if devices is None: devices = list(range(0, workspace.NumCudaDevices())), if not cpu_device: for gpu in devices: if gpu >= workspace.NumCudaDevices(): log.warning("** Only {} GPUs available, GPUs {} requested".format( workspace.NumCudaDevices(), devices)) break model_helper_obj._device_type = caffe2_pb2.CUDA model_helper_obj._device_prefix = "gpu" device_name = "GPU" else: model_helper_obj._device_type = caffe2_pb2.CPU model_helper_obj._device_prefix = "cpu" device_name = "CPU" log.info("Parallelizing model for devices: {}".format(devices)) extra_workers = 8 if rendezvous is not None else 0 # best-guess num_workers = len(devices) * 4 + extra_workers max_concurrent_distributed_ops =\ min(max_concurrent_distributed_ops, num_workers - 1) model_helper_obj.net.Proto().num_workers = num_workers model_helper_obj.net.Proto().type = net_type # Store some information in the model -- a bit ugly model_helper_obj._devices = devices model_helper_obj._rendezvous = rendezvous model_helper_obj._grad_names = [] assert isinstance(model_helper_obj, model_helper.ModelHelper) # Keep track of params that were in the model before: they are not # data parallel, so we need to handle them separately non_datapar_params = copy.copy(model_helper_obj.params) # Add input and model log.info("Create input and model training operators") losses_by_gpu = {} num_shards = 1 if rendezvous is None else rendezvous['num_shards'] loss_scale = 1.0 / (len(devices) * num_shards) has_parameter_updates = param_update_builder_fun is not None or \ optimizer_builder_fun is not None assert not ( param_update_builder_fun is not None and optimizer_builder_fun is not None ), 'Can 
only specify one of param_update_builder_fun, optimizer_builder_fun' for device in devices: device_opt = core.DeviceOption(model_helper_obj._device_type, device) with core.DeviceScope(device_opt): with core.NameScope("{}_{}".format(model_helper_obj._device_prefix, device)): log.info("Model for {} : {}".format(device_name, device)) input_builder_fun(model_helper_obj) losses = forward_pass_builder_fun(model_helper_obj, loss_scale) # Losses are not needed for test net if has_parameter_updates: assert isinstance(losses, list), \ 'Model builder function must return list of loss blobs' for loss in losses: assert isinstance(loss, core.BlobReference), \ 'Model builder func must return list of loss blobs' losses_by_gpu[device] = losses _ValidateParams(model_helper_obj.params) # Create parameter map model_helper_obj._device_grouped_blobs =\ _GroupByDevice(model_helper_obj, devices, model_helper_obj.params, non_datapar_params) # computed params computed_params_grouped =\ _GroupByDevice(model_helper_obj, devices, model_helper_obj.GetComputedParams(''), []) model_helper_obj._device_grouped_blobs.update(computed_params_grouped) model_helper_obj._param_names =\ list(viewkeys(model_helper_obj._device_grouped_blobs)) model_helper_obj._computed_param_names =\ list(viewkeys(computed_params_grouped)) if not has_parameter_updates: log.info("Parameter update function not defined --> only forward") _InferBlobDevice(model_helper_obj) return log.info("Adding gradient operators") _AddGradientOperators(devices, model_helper_obj, losses_by_gpu) _ValidateParams(model_helper_obj.params) # Group gradients by device and register to blob lookup param_to_grad = model_helper_obj.param_to_grad grads_ordered = [param_to_grad[p] for p in model_helper_obj.params if p in param_to_grad] non_datapar_grads = [param_to_grad[p] for p in non_datapar_params] gradients_grouped = _GroupByDevice( model_helper_obj, devices, grads_ordered, non_datapar_grads ) model_helper_obj._device_grouped_blobs.update(gradients_grouped) model_helper_obj._grad_names = list(viewkeys(gradients_grouped)) model_helper_obj._losses_by_gpu = losses_by_gpu _InferBlobDevice(model_helper_obj) log.info("Add gradient all-reduces for SyncSGD") if broadcast_computed_params: _BroadcastComputedParams(devices, model_helper_obj, rendezvous, use_nccl) if len(model_helper_obj._grad_names) > 0: # Gradients in reverse order reverse_ordered_grads = _GetReverseOrderedGrads(model_helper_obj) assert(len(reverse_ordered_grads) > 0) _AllReduceBlobs( reverse_ordered_grads, devices, model_helper_obj, model_helper_obj.net, rendezvous, use_nccl, max_concurrent_distributed_ops, ) else: log.info("NOTE: Param builder function did not create any parameters.") log.info("Post-iteration operators for updating params") num_shards = 1 if rendezvous is None else rendezvous['num_shards'] if param_update_builder_fun is not None: for device in devices: device_opt = core.DeviceOption(model_helper_obj._device_type, device) with core.DeviceScope(device_opt): with core.NameScope( "{}_{}".format(model_helper_obj._device_prefix, device) ): param_update_builder_fun(model_helper_obj) else: log.info("Calling optimizer builder function") optimizer_builder_fun(model_helper_obj) (sync_blobs, sync_names) = _ComputeBlobsToSync(model_helper_obj) sync_blobs_grouped = _GroupByDevice( model_helper_obj, devices, sync_blobs, [], ) model_helper_obj._device_grouped_blobs.update(sync_blobs_grouped) _InferBlobDevice(model_helper_obj) _AnalyzeOperators(model_helper_obj) # Configure dagnet to run with only one worker on the 
# first iteration, to prevent concurrency problems with allocs and nccl. arg = model_helper_obj.Proto().arg.add() arg.name = "first_iter_only_one_worker" arg.i = 1 # Add initial parameter syncs log.info("Add initial parameter sync") _SyncAllParams( devices, model_helper_obj, model_helper_obj.param_init_net, model_helper_obj.param_init_net, rendezvous, sync_names, max_concurrent_distributed_ops=1 ) # Handle any operations that need to be done after parameter sync # i.e. making sure multi-precision copies of parameters are up-to-date if post_sync_builder_fun is not None: for device in devices: device_opt = core.DeviceOption(model_helper_obj._device_type, device) with core.DeviceScope(device_opt): with core.NameScope( "{}_{}".format(model_helper_obj._device_prefix, device) ): post_sync_builder_fun(model_helper_obj) if optimize_gradient_memory: _OptimizeGradientMemorySimple(model_helper_obj, losses_by_gpu, devices) model_helper_obj._data_parallel_model_init_nets = [ model_helper_obj.param_init_net, ] model_helper_obj._data_parallel_model_nets = [model_helper_obj.net]
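# A distilled call pattern for Parallelize, mirroring the test helpers earlier
# in this section: input, forward/loss, and optimizer builders are passed in
# and the function replicates them per device.
from caffe2.python import cnn, data_parallel_model, optimizer

def dpm_input_fun(model):
    return None  # readers are usually created outside and shared

def dpm_forward_fun(model, loss_scale):
    fc = model.FC("data", "fc", 16, 1,
                  ("ConstantFill", {}), ("ConstantFill", {}))
    sigm = model.Sigmoid(model.FlattenToVec(fc, "fc_fl"), "sigm")
    sq = model.SquaredL2Distance([sigm, "label"], "sq")
    loss = model.Scale(model.AveragedLoss(sq, "loss"), scale=loss_scale)
    return [loss]

def dpm_optimizer_fun(model):
    optimizer.build_sgd(model, 0.1, policy="fixed")

dpm_model = cnn.CNNModelHelper(order="NHWC", name="parallelize_sketch")
data_parallel_model.Parallelize(
    dpm_model,
    input_builder_fun=dpm_input_fun,
    forward_pass_builder_fun=dpm_forward_fun,
    optimizer_builder_fun=dpm_optimizer_fun,
    devices=[0, 1],
    cpu_device=False,
)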
def test_inject_copy_multi_use(self): net = core.Net("test") device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA device_option.cuda_gpu_id = 1 with core.DeviceScope(device_option): net.Relu("data", "relu1") net.Relu("data", "relu2") with core.DeviceScope(device_option): net.Relu("data", "relu3") net.Relu("data", "relu4") device_option.cuda_gpu_id = 0 with core.DeviceScope(device_option): net.Relu("data", "relu5") device_option.cuda_gpu_id = 1 with core.DeviceScope(device_option): net.Relu("data", "relu6") new_net, _ = core.InjectCrossDeviceCopies(net) op = new_net._net.op[0] self.assertEqual(op.type, "CopyCPUToGPU") self.assertEqual(op.device_option.device_type, 1) self.assertEqual(op.device_option.cuda_gpu_id, 1) self.assertEqual(op.output[0], "data_cuda_1") op = new_net._net.op[1] self.assertEqual(op.type, "Relu") self.assertEqual(op.device_option.device_type, 1) self.assertEqual(op.device_option.cuda_gpu_id, 1) self.assertEqual(op.output[0], "relu1") op = new_net._net.op[2] self.assertEqual(op.type, "Relu") self.assertEqual(op.device_option.device_type, 0) self.assertEqual(op.output[0], "relu2") op = new_net._net.op[3] self.assertEqual(op.type, "Relu") self.assertEqual(op.device_option.device_type, 1) self.assertEqual(op.device_option.cuda_gpu_id, 1) self.assertEqual(op.input[0], "data_cuda_1") self.assertEqual(op.output[0], "relu3") op = new_net._net.op[4] self.assertEqual(op.type, "Relu") self.assertEqual(op.device_option.device_type, 0) self.assertEqual(op.output[0], "relu4") op = new_net._net.op[5] self.assertEqual(op.type, "CopyCPUToGPU") self.assertEqual(op.device_option.device_type, 1) self.assertEqual(op.device_option.cuda_gpu_id, 0) self.assertEqual(op.output[0], "data_cuda_0") op = new_net._net.op[6] self.assertEqual(op.type, "Relu") self.assertEqual(op.device_option.device_type, 1) self.assertEqual(op.device_option.cuda_gpu_id, 0) self.assertEqual(op.input[0], "data_cuda_0") self.assertEqual(op.output[0], "relu5") op = new_net._net.op[7] self.assertEqual(op.type, "Relu") self.assertEqual(op.device_option.device_type, 1) self.assertEqual(op.device_option.cuda_gpu_id, 1) self.assertEqual(op.input[0], "data_cuda_1") self.assertEqual(op.output[0], "relu6") """
def __init__( self, cli_args, model=None, tag=None, enable_prof=False, ): super(MT_Wide_and_Deep_Wrapper, self).__init__() self.args = cli_args # GPU Enable Flags gpu_en = self.args.use_gpu if gpu_en: device_opt = core.DeviceOption(caffe2_pb2.CUDA, 0) ngpus = C.num_cuda_devices # 1 print("(Wrapper) Using {} GPU(s)...".format(ngpus)) else: device_opt = core.DeviceOption(caffe2_pb2.CPU) print("(Wrapper) Using CPU...") self.gpu_en = gpu_en num_tables = len(cli_args.arch_embedding_size.split("-")) # We require 3 datastructures in caffe2 to enable non-blocking inputs for MT_Wide_and_Deep # At a high-level each input needs an input queue. Inputs are enqueued # when they arrive on the "server" or "core" and dequeued by the # model's inference engine # Input Blob -> Input Net -> ID Q ===> MT_Wide_and_Deep model self.id_qs = [] self.id_input_blobs = [] self.id_input_nets = [] # Same thing for the lengths inputs self.len_qs = [] self.len_input_blobs = [] self.len_input_nets = [] for i in range(num_tables): q, input_blob, net = self.build_mtwnd_sparse_queue(tag="id", qid=i) self.id_qs.append(q) self.id_input_blobs.append(input_blob) self.id_input_nets.append(net) q, input_blob, net = self.build_mtwnd_sparse_queue(tag="len", qid=i) self.len_qs.append(q) self.len_input_blobs.append(input_blob) self.len_input_nets.append(net) self.fc_q, self.fc_input_blob, self.fc_input_net = self.build_mtwnd_fc_queue( ) if self.args.queue: with core.DeviceScope(device_opt): self.mtwnd = MT_Wide_and_Deep(cli_args, model, tag, enable_prof, id_qs=self.id_qs, len_qs=self.len_qs, fc_q=self.fc_q) else: with core.DeviceScope(device_opt): self.mtwnd = MT_Wide_and_Deep(cli_args, model, tag, enable_prof)
def assertReferenceChecks( self, device_option, op, inputs, reference, input_device_options=None, threshold=1e-4, output_to_grad=None, grad_reference=None, atol=None, outputs_to_check=None, ): """ This runs the reference Python function implementation (effectively calling `reference(*inputs)`) and compares that to the output of the operator, with an absolute/relative tolerance given by the `threshold` parameter. Useful for checking that the operator implementation matches the Python (typically NumPy) implementation of the same functionality. Usage example: @given(X=hu.tensor(), inplace=st.booleans(), **hu.gcs) def test_softsign(self, X, inplace, gc, dc): op = core.CreateOperator( "Softsign", ["X"], ["X" if inplace else "Y"]) def softsign(X): return (X / (1 + np.abs(X)),) self.assertReferenceChecks(gc, op, [X], softsign) """ op = copy.deepcopy(op) op.device_option.CopyFrom(device_option) with temp_workspace(): if (len(op.input) > len(inputs)): raise ValueError( 'must supply an input for each input on the op: %s vs %s' % (op.input, inputs)) _input_device_options = input_device_options or \ core.InferOpBlobDevicesAsDict(op)[0] for (n, b) in zip(op.input, inputs): workspace.FeedBlob(n, b, device_option=_input_device_options.get( n, device_option)) net = core.Net("opnet") net.Proto().op.extend([op]) test_shape_inference = False try: (shapes, types) = workspace.InferShapesAndTypes([net]) test_shape_inference = True except RuntimeError as e: # Temporarily catch runtime errors when inferring shape # and type info logging.warning(str(e)) if os.getenv('CAFFE2_ASSERT_SHAPEINFERENCE') == '1': raise e workspace.RunNetOnce(net) reference_outputs = reference(*inputs) if not (isinstance(reference_outputs, tuple) or isinstance(reference_outputs, list)): raise RuntimeError( "You are providing a wrong reference implementation. A " "proper one should return a tuple/list of numpy arrays.") if not outputs_to_check: self.assertEqual(len(reference_outputs), len(op.output)) outputs_to_check = list(range(len(op.output))) outs = [] for (output_index, ref) in zip(outputs_to_check, reference_outputs): output_blob_name = op.output[output_index] output = workspace.FetchBlob(output_blob_name) if output.dtype.kind in ('S', 'O'): np.testing.assert_array_equal(output, ref) else: if atol is None: atol = threshold np.testing.assert_allclose( output, ref, atol=atol, rtol=threshold, err_msg=( 'Output {0} is not matching the reference'.format( output_blob_name, )), ) if test_shape_inference: self._assertInferTensorChecks(output_blob_name, shapes, types, output) outs.append(output) if grad_reference is not None: assert output_to_grad is not None, \ "If grad_reference is set, " \ "output_to_grad has to be set as well" with core.DeviceScope(device_option): self._assertGradReferenceChecks(op, inputs, reference_outputs, output_to_grad, grad_reference, threshold=threshold) return outs
def run_model(self, devices, gpu): ''' Helper function for test_equiv ''' def input_builder_fun(model): return None def model_build_fun(model, loss_scale): workspace.FeedBlob( core.ScopedBlobReference("seq_lengths"), np.array([self.T] * self.batch_per_device, dtype=np.int32)) model.param_init_net.ConstantFill( [], "hidden_init", value=0.0, shape=[1, self.batch_per_device, self.hidden_dim]) model.param_init_net.ConstantFill( [], "cell_init", value=0.0, shape=[1, self.batch_per_device, self.hidden_dim]) output, _last_hidden, _, _last_state, = rnn_cell.LSTM( model=model, input_blob="data", seq_lengths="seq_lengths", initial_states=("hidden_init", "cell_init"), dim_in=self.input_dim, dim_out=self.hidden_dim, scope="partest", ) # A silly loss function loss = model.AveragedLoss( model.Sub([output, "target"], "dist"), "loss", ) loss = model.Scale(loss, "loss_scaled", scale=loss_scale) return [loss] def param_update_fun(model): ITER = model.Iter("ITER") LR = model.net.LearningRate( [ITER], "LR", base_lr=(-0.1), policy="fixed", ) ONE = model.param_init_net.ConstantFill( [], "ONE", shape=[1], value=1.0, ) for param in model.GetParams(): param_grad = model.param_to_grad[param] model.WeightedSum([param, ONE, param_grad, LR], param) assert len( model.GetParams()) == len(model.params) // len(model._devices) workspace.ResetWorkspace() model = cnn.CNNModelHelper(name="recurrent_test{}".format(devices), ) self.T = 8 self.batch_size = 64 self.input_dim = 8 self.hidden_dim = 31 self.batch_per_device = self.batch_size // len(devices) data_parallel_model.Parallelize( model, input_builder_fun=input_builder_fun, forward_pass_builder_fun=model_build_fun, param_update_builder_fun=param_update_fun, devices=devices, optimize_gradient_memory=True, cpu_device=not gpu, ) # Change all initialization to be ConstantFills so that # the everything is deterministic for op in model.param_init_net.Proto().op: if op.type.endswith('Fill'): op.type = 'ConstantFill' # Each run has same input, independent of number of gpus np.random.seed(20150210) for i in range(0, 10): full_data = np.random.rand(self.T, self.batch_size, self.input_dim) full_target = np.random.rand(self.T, self.batch_size, self.hidden_dim) for (j, g) in enumerate(devices): st = j * self.batch_per_device en = st + self.batch_per_device data = full_data[:, st:en, :].astype(np.float32) targets = full_target[:, st:en, :].astype(np.float32) with core.DeviceScope(core.DeviceOption(model._device_type, g)): workspace.FeedBlob( "{}_{}/data".format(model._device_prefix, g), data) workspace.FeedBlob( "{}_{}/target".format(model._device_prefix, g), targets) if i == 0: workspace.RunNetOnce(model.param_init_net) workspace.CreateNet(model.net) workspace.RunNet(model.net.Proto().name) return workspace.FetchBlob("{}_0/partest/i2h_w".format( model._device_prefix))
def _build_embedding_encoder( self, model, inputs, input_lengths, vocab_size, embeddings, embedding_size, use_attention, num_gpus, forward_only=False, ): if num_gpus == 0: embedded_encoder_inputs = model.net.Gather( [embeddings, inputs], ['embedded_encoder_inputs'], ) else: with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)): embedded_encoder_inputs_cpu = model.net.Gather( [embeddings, inputs], ['embedded_encoder_inputs_cpu'], ) embedded_encoder_inputs = model.CopyCPUToGPU( embedded_encoder_inputs_cpu, 'embedded_encoder_inputs', ) if self.encoder_type == 'rnn': assert len(self.encoder_params['encoder_layer_configs']) == 1 encoder_num_units = ( self.encoder_params['encoder_layer_configs'][0]['num_units']) encoder_initial_cell_state = model.param_init_net.ConstantFill( [], ['encoder_initial_cell_state'], shape=[encoder_num_units], value=0.0, ) encoder_initial_hidden_state = (model.param_init_net.ConstantFill( [], 'encoder_initial_hidden_state', shape=[encoder_num_units], value=0.0, )) # Choose corresponding rnn encoder function if self.encoder_params['use_bidirectional_encoder']: rnn_encoder_func = seq2seq_util.rnn_bidirectional_encoder encoder_output_dim = 2 * encoder_num_units else: rnn_encoder_func = seq2seq_util.rnn_unidirectional_encoder encoder_output_dim = encoder_num_units ( encoder_outputs, final_encoder_hidden_state, final_encoder_cell_state, ) = rnn_encoder_func( model, embedded_encoder_inputs, input_lengths, encoder_initial_hidden_state, encoder_initial_cell_state, embedding_size, encoder_num_units, use_attention, ) weighted_encoder_outputs = None else: raise ValueError('Unsupported encoder type {}'.format( self.encoder_type)) return ( encoder_outputs, weighted_encoder_outputs, final_encoder_hidden_state, final_encoder_cell_state, encoder_output_dim, )
def run_model(self, V, gpu_devices, cpu_indices): def input_builder_fun(model): return None def model_build_fun(model, loss_scale): if cpu_indices: with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)): gathered_cpu = model.net.Gather([self.vecs, 'indices'], 'gathered_cpu') gathered = model.CopyCPUToGPU(gathered_cpu, "gathered") else: gpu_vecs = model.param_init_net.CopyCPUToGPU( self.vecs, "gpuvecs", ) model.params.append(gpu_vecs) gathered = model.net.Gather([gpu_vecs, 'indices'], 'gathered') flattened = model.Flatten(gathered, "flattened") fc = model.FC(flattened, "fc", 16 * 16, 1, ("ConstantFill", {}), ("ConstantFill", {})) fc_fl = model.FlattenToVec(fc, "fc_fl") sigm = model.Sigmoid(fc_fl, "sigm") sq = model.SquaredL2Distance([sigm, "label"], "sq") loss = model.AveragedLoss(sq, "loss") loss = model.Scale(loss, scale=loss_scale) return [loss] def param_update_fun(model): ONE = model.param_init_net.ConstantFill( [], "ONE", shape=[1], value=1.0, ) LR = model.CopyCPUToGPU(self.LR, "LR") for param in model.GetParams(): param_grad = model.param_to_grad[param] if not isinstance(param_grad, core.GradientSlice): model.WeightedSum([param, ONE, param_grad, LR], param) else: param_momentum = model.param_init_net.ConstantFill( [param], param + '_momentum', value=0.0, ) model.net.SparseMomentumSGDUpdate( [ param_grad.values, param_momentum, LR, param, param_grad.indices, ], [param_grad.values, param_momentum, param], momentum=0.1, nesterov=0, ) workspace.ResetWorkspace() model = cnn.CNNModelHelper( order="NHWC", name="sparse_test{}".format(gpu_devices), ) with core.NameScope("cpu"): with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)): self.ITER = model.Iter("ITER") self.LR = model.net.LearningRate( [self.ITER], "LR", base_lr=(-0.1), policy="fixed", ) self.vecs = model.param_init_net.UniformFill([], "vecs", shape=[V, 16]) if cpu_indices: model.params.append(self.vecs) self.ONE_CPU = model.param_init_net.ConstantFill( [], "ONE_CPU", shape=[1], value=1.0, ) data_parallel_model.Parallelize_GPU( model, input_builder_fun=input_builder_fun, forward_pass_builder_fun=model_build_fun, param_update_builder_fun=param_update_fun, devices=gpu_devices, ) # Update the vecs if cpu_indices: with core.NameScope("cpu"): with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)): for param in model.GetParams(): param_grad = model.param_to_grad[param] model.ScatterWeightedSum([ param, self.ONE_CPU, param_grad.indices, param_grad.values, self.LR ], self.vecs) else: with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)): model.CopyGPUToCPU("gpu_0/gpuvecs", self.vecs) np.random.seed(2603) # Each run has same input, independent of number of gpus batch_size = 64 for i in range(0, 10): full_indices = np.random.permutation(V)[:batch_size * 16].reshape( batch_size, 16) full_labels = full_indices[:, 0] % 2 batch_per_device = batch_size // len(gpu_devices) for (j, g) in enumerate(gpu_devices): st = j * batch_per_device en = st + batch_per_device indices = full_indices[st:en, :].astype(np.int32) labels = full_labels[st:en].astype(np.float32) device_for_indices = core.DeviceOption(caffe2_pb2.CPU) if not cpu_indices: device_for_indices = core.DeviceOption(caffe2_pb2.CUDA, g) with core.DeviceScope(device_for_indices): workspace.FeedBlob("gpu_{}/indices".format(g), indices) with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, g)): workspace.FeedBlob("gpu_{}/label".format(g), labels) if i == 0: workspace.RunNetOnce(model.param_init_net) # Force vecs to be same on all runs orig_vecs = np.random.rand(V, 
16).astype(np.float32) workspace.FeedBlob(self.vecs, orig_vecs) if not cpu_indices: for g in gpu_devices: workspace.FeedBlob( "gpu_{}/gpuvecs".format(g), orig_vecs, device_option=core.DeviceOption( caffe2_pb2.CUDA, g), ) workspace.CreateNet(model.net) workspace.RunNet(model.net.Proto().name) if len(gpu_devices) == 2: if not cpu_indices: idx = workspace.FetchBlob("gpu_0/indices") idx = list(idx.flatten()) n = len(idx) nu = len(set(idx)) assert n == nu, "We cannot have duplicate indices" # Sanity check to see the vecs were updated self.assertFalse(np.allclose(workspace.FetchBlob(self.vecs), orig_vecs)) return [ workspace.FetchBlob(self.vecs if cpu_indices else "gpu_0/gpuvecs"), workspace.FetchBlob("gpu_0/fc_w") ]
def model_build_fun(self, model, forward_only=False, loss_scale=None): encoder_inputs = model.net.AddExternalInput( workspace.GetNameScope() + 'encoder_inputs', ) encoder_lengths = model.net.AddExternalInput( workspace.GetNameScope() + 'encoder_lengths', ) decoder_inputs = model.net.AddExternalInput( workspace.GetNameScope() + 'decoder_inputs', ) decoder_lengths = model.net.AddExternalInput( workspace.GetNameScope() + 'decoder_lengths', ) targets = model.net.AddExternalInput( workspace.GetNameScope() + 'targets', ) target_weights = model.net.AddExternalInput( workspace.GetNameScope() + 'target_weights', ) attention_type = self.model_params['attention'] assert attention_type in ['none', 'regular'] ( encoder_outputs, weighted_encoder_outputs, final_encoder_hidden_state, final_encoder_cell_state, encoder_output_dim, ) = self._build_embedding_encoder( model=model, inputs=encoder_inputs, input_lengths=encoder_lengths, vocab_size=self.source_vocab_size, embeddings=self.encoder_embeddings, embedding_size=self.model_params['encoder_embedding_size'], use_attention=(attention_type != 'none'), num_gpus=self.num_gpus, forward_only=forward_only, ) assert len(self.model_params['decoder_layer_configs']) == 1 decoder_num_units = ( self.model_params['decoder_layer_configs'][0]['num_units']) if attention_type == 'none': decoder_initial_hidden_state = model.FC( final_encoder_hidden_state, 'decoder_initial_hidden_state', encoder_output_dim, decoder_num_units, axis=2, ) decoder_initial_cell_state = model.FC( final_encoder_cell_state, 'decoder_initial_cell_state', encoder_output_dim, decoder_num_units, axis=2, ) else: decoder_initial_hidden_state = model.param_init_net.ConstantFill( [], 'decoder_initial_hidden_state', shape=[decoder_num_units], value=0.0, ) decoder_initial_cell_state = model.param_init_net.ConstantFill( [], 'decoder_initial_cell_state', shape=[decoder_num_units], value=0.0, ) initial_attention_weighted_encoder_context = ( model.param_init_net.ConstantFill( [], 'initial_attention_weighted_encoder_context', shape=[encoder_output_dim], value=0.0, )) if self.num_gpus == 0: embedded_decoder_inputs = model.net.Gather( [self.decoder_embeddings, decoder_inputs], ['embedded_decoder_inputs'], ) else: with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)): embedded_decoder_inputs_cpu = model.net.Gather( [self.decoder_embeddings, decoder_inputs], ['embedded_decoder_inputs_cpu'], ) embedded_decoder_inputs = model.CopyCPUToGPU( embedded_decoder_inputs_cpu, 'embedded_decoder_inputs', ) # seq_len x batch_size x decoder_embedding_size if attention_type == 'none': decoder_outputs, _, _, _ = recurrent.LSTM( model=model, input_blob=embedded_decoder_inputs, seq_lengths=decoder_lengths, initial_states=( decoder_initial_hidden_state, decoder_initial_cell_state, ), dim_in=self.model_params['decoder_embedding_size'], dim_out=decoder_num_units, scope='decoder', outputs_with_grads=[0], ) decoder_output_size = decoder_num_units else: (decoder_outputs, _, _, _, attention_weighted_encoder_contexts, _) = recurrent.LSTMWithAttention( model=model, decoder_inputs=embedded_decoder_inputs, decoder_input_lengths=decoder_lengths, initial_decoder_hidden_state=decoder_initial_hidden_state, initial_decoder_cell_state=decoder_initial_cell_state, initial_attention_weighted_encoder_context=( initial_attention_weighted_encoder_context), encoder_output_dim=encoder_output_dim, encoder_outputs=encoder_outputs, decoder_input_dim=self.model_params['decoder_embedding_size'], decoder_state_dim=decoder_num_units, scope='decoder', 
outputs_with_grads=[0, 4], ) decoder_outputs, _ = model.net.Concat( [decoder_outputs, attention_weighted_encoder_contexts], [ 'states_and_context_combination', '_states_and_context_combination_concat_dims', ], axis=2, ) decoder_output_size = decoder_num_units + encoder_output_dim # we do softmax over the whole sequence # (max_length in the batch * batch_size) x decoder embedding size # -1 because we don't know max_length yet decoder_outputs_flattened, _ = model.net.Reshape( [decoder_outputs], [ 'decoder_outputs_flattened', 'decoder_outputs_and_contexts_combination_old_shape', ], shape=[-1, decoder_output_size], ) output_logits = self.output_projection( model=model, decoder_outputs=decoder_outputs_flattened, decoder_output_size=decoder_output_size, target_vocab_size=self.target_vocab_size, decoder_softmax_size=self.model_params['decoder_softmax_size'], ) targets, _ = model.net.Reshape( [targets], ['targets', 'targets_old_shape'], shape=[-1], ) target_weights, _ = model.net.Reshape( [target_weights], ['target_weights', 'target_weights_old_shape'], shape=[-1], ) output_probs = model.net.Softmax( [output_logits], ['output_probs'], engine=('CUDNN' if self.num_gpus > 0 else None), ) label_cross_entropy = model.net.LabelCrossEntropy( [output_probs, targets], ['label_cross_entropy'], ) weighted_label_cross_entropy = model.net.Mul( [label_cross_entropy, target_weights], 'weighted_label_cross_entropy', ) total_loss_scalar = model.net.SumElements( [weighted_label_cross_entropy], 'total_loss_scalar', ) total_loss_scalar_weighted = model.net.Scale( [total_loss_scalar], 'total_loss_scalar_weighted', scale=1.0 / self.batch_size, ) return [total_loss_scalar_weighted]
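# For reference, a hypothetical model_params dict that is consistent with the
# keys read in model_build_fun above; the values are illustrative only and do
# not come from the original configuration.
model_params_example = {
    'attention': 'regular',              # must be 'none' or 'regular'
    'encoder_embedding_size': 256,
    'decoder_embedding_size': 256,
    'decoder_softmax_size': None,        # forwarded to output_projection (definition not shown)
    'decoder_layer_configs': [{'num_units': 512}],  # exactly one layer is asserted above
}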
def caffe2_yellowfin(self, zero_debias, grad_coef, n_dim, n_iter, gpu): caffe2_res = {} alpha = 1.0 mu = 0.0 beta = 0.999 curv_win_width = 20 epsilon = 1e-6 net = core.Net("net") param_init_net = core.Net("param_init_net") workspace.ResetWorkspace() with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)): iteration = param_init_net.ConstantFill([], "iteration", shape=[1], value=0, dtype=core.DataType.INT64) iter_mutex = param_init_net.CreateMutex([], ["iteration_mutex"]) net.AtomicIter([iter_mutex, iteration], [iteration]) pre_grad = param_init_net.ConstantFill([], "pre_grad", shape=[n_dim], value=grad_coef) if gpu: iteration = net.CopyCPUToGPU([iteration], "iteration_cpu") iteration_float = net.Cast([iteration], "iteration_float") grad = net.Mul([pre_grad, iteration_float], "grad", broadcast=True) w = param_init_net.ConstantFill([], "w", shape=[n_dim], value=0.0) # a hack to create an object with __dict__ param_info = lambda: None param_info.blob = w param_info.grad = grad optimizer.YellowFinOptimizer(alpha=alpha, mu=mu, beta=beta, curv_win_width=curv_win_width, epsilon=epsilon, zero_debias=zero_debias)._run( net, param_init_net, param_info) workspace.RunNetOnce(param_init_net) workspace.CreateNet(net, overwrite=True) for i in range(n_iter): workspace.RunNet(net) scalars_memory_blob = workspace.FetchBlob("w_scalars_memory") g_norm2_avg = scalars_memory_blob[1] g_norm2_min_avg = scalars_memory_blob[2] g_norm2_max_avg = scalars_memory_blob[3] distance_avg = scalars_memory_blob[4] g_avg_blob = workspace.FetchBlob("w_g_avg") res_lr = workspace.FetchBlob("w_lr_avg")[0] res_mu = workspace.FetchBlob("w_mu_avg")[0] g_deb = self.deb(g_avg_blob, beta, i + 1, zero_debias) variance = max( self.deb(g_norm2_avg, beta, i + 1, zero_debias) - g_deb.dot(g_deb), epsilon) if i > 0: caffe2_res[i] = { 'h_max': np.exp(self.deb(g_norm2_max_avg, beta, i + 1, zero_debias)), 'h_min': np.exp(self.deb(g_norm2_min_avg, beta, i + 1, zero_debias)), 'var': variance, 'dist': self.deb(distance_avg, beta, i + 1, zero_debias), 'lr': res_lr, 'mu': res_mu } return caffe2_res
def test_resnet50_core(self): N = 2 warmup = 20 repeat = 100 print("Batch size: {}, repeat inference {} times, warmup {} times". format(N, repeat, warmup)) init_net, pred_net, _ = self._get_c2_model('resnet50') self._add_head_tail(pred_net, 'real_data', 'real_softmax') input_blob_dims = (N, 3, 224, 224) input_name = "real_data" device_option = core.DeviceOption(caffe2_pb2.CUDA, 0) init_net.device_option.CopyFrom(device_option) pred_net.device_option.CopyFrom(device_option) for op in pred_net.op: op.device_option.CopyFrom(device_option) op.engine = 'CUDNN' net_outputs = pred_net.external_output Y_c2 = None data = np.random.randn(*input_blob_dims).astype(np.float32) c2_time = 1 workspace.SwitchWorkspace("gpu_test", True) with core.DeviceScope(device_option): workspace.FeedBlob(input_name, data) workspace.RunNetOnce(init_net) workspace.CreateNet(pred_net) for _ in range(warmup): workspace.RunNet(pred_net.name) start = time.time() for _ in range(repeat): workspace.RunNet(pred_net.name) end = time.time() c2_time = end - start output_values = [workspace.FetchBlob(name) for name in net_outputs] Y_c2 = namedtupledict('Outputs', net_outputs)(*output_values) workspace.ResetWorkspace() # Fill the workspace with the weights with core.DeviceScope(device_option): workspace.RunNetOnce(init_net) # Cut the graph start = time.time() pred_net_cut = transform_caffe2_net(pred_net, {input_name: input_blob_dims}, build_serializable_op=True) del init_net, pred_net #_print_net(pred_net_cut) Y_trt = None input_name = pred_net_cut.external_input[0] print("C2 runtime: {}s".format(c2_time)) with core.DeviceScope(device_option): workspace.FeedBlob(input_name, data) workspace.CreateNet(pred_net_cut) end = time.time() print("Conversion time: {:.2f}s".format(end - start)) for _ in range(warmup): workspace.RunNet(pred_net_cut.name) start = time.time() for _ in range(repeat): workspace.RunNet(pred_net_cut.name) end = time.time() trt_time = end - start print("TRT runtime: {}s, improvement: {}%".format( trt_time, (c2_time - trt_time) / c2_time * 100)) output_values = [workspace.FetchBlob(name) for name in net_outputs] Y_trt = namedtupledict('Outputs', net_outputs)(*output_values) np.testing.assert_allclose(Y_c2, Y_trt, rtol=1e-3)
def _run(self, net, param_init_net, param_info): # Note: This is number of persistent scalars in YellowFin optimizer. # It should always be the number of scalars being used. The same # number should be used in class for the operation. SCALARS_MEMORY_SIZE = 5 param = param_info.blob grad = param_info.grad moment = param_init_net.ConstantFill([param], param + "_moment", value=0.0) curv_win = param_init_net.ConstantFill([], param + "_curv_win", shape=[self.curv_win_width], value=0.0) g_avg = param_init_net.ConstantFill([param], param + "_g_avg", value=0.0) g2_avg = param_init_net.ConstantFill([param], param + "_g2_avg", value=0.0) lr_avg = param_init_net.ConstantFill([], param + "_lr_avg", shape=[1], value=self.alpha) mu_avg = param_init_net.ConstantFill([], param + "_mu_avg", shape=[1], value=self.mu) scalars_memory = param_init_net.ConstantFill( [], param + "_scalars_memory", shape=[SCALARS_MEMORY_SIZE], value=0.0) assert self.alpha > 0 assert not isinstance(grad, core.GradientSlice), \ "Doesn't support sparse gradients" if not param_init_net.BlobIsDefined(_OPTIMIZER_ITERATION_NAME): # Add training operators. with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)): iteration = param_init_net.ConstantFill( [], _OPTIMIZER_ITERATION_NAME, shape=[1], value=0, dtype=core.DataType.INT64) iter_mutex = param_init_net.CreateMutex([], ["iteration_mutex"]) net.AtomicIter([iter_mutex, iteration], [iteration]) else: iteration = param_init_net.GetBlobRef(_OPTIMIZER_ITERATION_NAME) self._aux_params.shared.append(iteration) self._aux_params.local.append(moment) self._aux_params.local.append(lr_avg) self._aux_params.local.append(mu_avg) self._aux_params.local.append(curv_win) self._aux_params.local.append(g_avg) self._aux_params.local.append(g2_avg) self._aux_params.local.append(scalars_memory) yf_in_out_args = [ param, moment, lr_avg, mu_avg, curv_win, g_avg, g2_avg, scalars_memory ] net.YellowFin(yf_in_out_args + [grad, iteration], yf_in_out_args, beta=self.beta, epsilon=self.epsilon, curv_win_width=self.curv_win_width, zero_debias=self.zero_debias)
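# The test further up drives YellowFinOptimizer(...)._run() directly on a
# hand-built net. In ordinary training code the optimizer is usually attached
# through the caffe2.python.optimizer helpers after AddGradientOperators;
# assuming this caffe2 build exposes a build_yellowfin helper, a sketch of that
# path (with a placeholder loss blob name) looks roughly like:
from caffe2.python import optimizer

def add_yellowfin_sketch(model, loss_blob="loss"):
    model.AddGradientOperators([loss_blob])
    # base_learning_rate plays the role of alpha in _run above; the remaining
    # YellowFin hyperparameters keep their defaults unless overridden here.
    optimizer.build_yellowfin(model, base_learning_rate=0.1, zero_debias=True)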
def _AllReduceBlobsSingleHost(blob_names, devices, model, net, use_nccl): """Performs NCCL AllReduce to distribute blobs to all the GPUs.""" if len(devices) == 1: return # Now we need to Allreduce blobs on all the GPUs. # Pick GPU #0 as a master GPU. master_device_opt = core.DeviceOption(model._device_type, devices[0]) last_out = None concatenated_idx = set() for blob_name in blob_names: # Group by blob_name for reduce. blobs_group = list(viewvalues(model._device_grouped_blobs[blob_name])) assert len(blobs_group) == len(devices), \ "Each GPU from {}, should have a copy of {}.".format( devices, blob_name) if _IsGPUBlob(model, blob_name): with core.DeviceScope(master_device_opt): if not isinstance(blobs_group[0], core.GradientSlice): _AllReduce( devices, model, net, blob_name, use_nccl, last_out ) # last_out is used to serialize the execution of nccls last_out = blobs_group[0] else: # Sparse gradients: all-gather for indices and values master_ns = "{}_{}".format(model._device_prefix, devices[0]) ''' Skip if we have already copied concatenated indices to the indices of GradientSlice. This happens when two or more grad blobs are gathered with the same indices blob ''' skip_idx_concat = False for g in blobs_group: if g.indices in concatenated_idx: skip_idx_concat = True if not skip_idx_concat: grad_idx_concat, _ = net.Concat( [g.indices for g in blobs_group], ["{}/{}_index_concat".format(master_ns, blob_name), "{}/{}_index_splitinfo".format(master_ns, blob_name)], axis=0, name="note:data_parallel_model") for gpu, g in viewitems(model._device_grouped_blobs[blob_name]): device_opt = core.DeviceOption(model._device_type, gpu) with core.DeviceScope(device_opt): model.Copy(grad_idx_concat, g.indices) concatenated_idx.add(g.indices) grad_val_concat, _ = net.Concat( [g.values for g in blobs_group], ["{}/{}_val_concat".format(master_ns, blob_name), "{}/{}_val_splitinfo".format(master_ns, blob_name)], axis=0, name="note:data_parallel_model") for gpu, g in viewitems(model._device_grouped_blobs[blob_name]): device_opt = core.DeviceOption(model._device_type, gpu) with core.DeviceScope(device_opt): model.Copy(grad_val_concat, g.values) else: assert not isinstance(blobs_group[0], core.GradientSlice), \ "Synchronizing gradient slices not supported" with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)): # Poor man's allreduce model.net.Sum(blobs_group, [blobs_group[0]]) _Broadcast(devices, model, model.net, blob_name)
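# A minimal standalone sketch of the CPU "poor man's allreduce" used in the
# fallback branch above: sum all device copies into the first blob, then copy
# the reduced value back to every other copy. Blob names are hypothetical.
def cpu_allreduce_sketch(net, blobs):
    # Reduce: blobs[0] ends up holding the sum of all copies.
    net.Sum(blobs, [blobs[0]])
    # Broadcast: overwrite every other copy with the reduced value.
    for blob in blobs[1:]:
        net.Copy(blobs[0], blob)
    return net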
(nbatches, lT) = dc.generate_output_data() ### construct the neural network specified above ### print("Trying to initialize DLRM") libFlashRec = cdll.LoadLibrary("./libflashrec.so") libFlashRec.open_unvme() lru = LRU(1000) global hits global misses hits = 0 misses = 0 print("libFlashRec opened") load_instances = 8 run_instances = 1 dlrm_run_instances = [] with core.DeviceScope(device_opt): for i in xrange(run_instances): dlrm_run_instances.append(DLRM_Net( args, libFlashRec=libFlashRec )) print("Initialized DLRM Net") for dlrm in dlrm_run_instances: dlrm.create(lX[0], lS_l[0], lS_i[0], lT[0]) print("Created network") total_time = 0 dload_time = 0 k = 0 time_start = time.time() print("Running networks") def stage_run_dlrm(dlrm, run_q, stop):
def build_embedding_encoder( model, encoder_params, num_decoder_layers, inputs, input_lengths, vocab_size, embeddings, embedding_size, use_attention, num_gpus=0, forward_only=False, scope=None, ): with core.NameScope(scope or ''): if num_gpus == 0: embedded_encoder_inputs = model.net.Gather( [embeddings, inputs], ['embedded_encoder_inputs'], ) else: with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)): embedded_encoder_inputs_cpu = model.net.Gather( [embeddings, inputs], ['embedded_encoder_inputs_cpu'], ) embedded_encoder_inputs = model.CopyCPUToGPU( embedded_encoder_inputs_cpu, 'embedded_encoder_inputs', ) layer_inputs = embedded_encoder_inputs layer_input_size = embedding_size encoder_units_per_layer = [] final_encoder_hidden_states = [] final_encoder_cell_states = [] num_encoder_layers = len(encoder_params['encoder_layer_configs']) use_bidirectional_encoder = encoder_params.get( 'use_bidirectional_encoder', False, ) for i, layer_config in enumerate(encoder_params['encoder_layer_configs']): if use_bidirectional_encoder and i == 0: layer_func = rnn_bidirectional_layer output_dims = 2 * layer_config['num_units'] else: layer_func = rnn_unidirectional_layer output_dims = layer_config['num_units'] encoder_units_per_layer.append(output_dims) is_final_layer = (i == num_encoder_layers - 1) dropout_keep_prob = layer_config.get( 'dropout_keep_prob', None, ) return_final_state = i >= (num_encoder_layers - num_decoder_layers) ( layer_outputs, final_layer_hidden_state, final_layer_cell_state, ) = layer_func( model=model, inputs=layer_inputs, input_lengths=input_lengths, input_size=layer_input_size, num_units=layer_config['num_units'], dropout_keep_prob=dropout_keep_prob, forward_only=forward_only, return_sequence_output=(not is_final_layer) or use_attention, return_final_state=return_final_state, scope=get_layer_scope(scope, 'encoder', i), ) if not is_final_layer: layer_inputs = layer_outputs layer_input_size = output_dims final_encoder_hidden_states.append(final_layer_hidden_state) final_encoder_cell_states.append(final_layer_cell_state) encoder_outputs = layer_outputs weighted_encoder_outputs = None return ( encoder_outputs, weighted_encoder_outputs, final_encoder_hidden_states, final_encoder_cell_states, encoder_units_per_layer, )
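# For reference, a hypothetical encoder_params dict consistent with the keys
# read by build_embedding_encoder above; layer sizes are illustrative only.
encoder_params_example = {
    'use_bidirectional_encoder': True,  # only the first layer is bidirectional
    'encoder_layer_configs': [
        {'num_units': 512},
        {'num_units': 512, 'dropout_keep_prob': 0.8},  # optional per-layer dropout
    ],
}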