Example #1
def build_embedding_decoder(
    model,
    decoder_layer_configs,
    inputs,
    input_lengths,
    encoder_lengths,
    encoder_outputs,
    weighted_encoder_outputs,
    final_encoder_hidden_states,
    final_encoder_cell_states,
    encoder_units_per_layer,
    vocab_size,
    embeddings,
    embedding_size,
    attention_type,
    forward_only,
    num_gpus=0,
    scope=None,
):
    with core.NameScope(scope or ''):
        if num_gpus == 0:
            embedded_decoder_inputs = model.net.Gather(
                [embeddings, inputs],
                ['embedded_decoder_inputs'],
            )
        else:
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                embedded_decoder_inputs_cpu = model.net.Gather(
                    [embeddings, inputs],
                    ['embedded_decoder_inputs_cpu'],
                )
            embedded_decoder_inputs = model.CopyCPUToGPU(
                embedded_decoder_inputs_cpu,
                'embedded_decoder_inputs',
            )

    decoder_cells = []
    decoder_units_per_layer = []
    for i, layer_config in enumerate(decoder_layer_configs):
        num_units = layer_config['num_units']
        decoder_units_per_layer.append(num_units)

        if i == 0:
            input_size = embedding_size
        else:
            input_size = decoder_cells[-1].get_output_dim()

        cell = rnn_cell.LSTMCell(
            name=get_layer_scope(scope, 'decoder', i),
            forward_only=forward_only,
            input_size=input_size,
            hidden_size=num_units,
            forget_bias=0.0,
            memory_optimization=False,
        )

        dropout_keep_prob = layer_config.get('dropout_keep_prob', None)
        if dropout_keep_prob is not None:
            dropout_ratio = 1.0 - dropout_keep_prob
            cell = rnn_cell.DropoutCell(
                internal_cell=cell,
                dropout_ratio=dropout_ratio,
                forward_only=forward_only,
                is_test=False,
                name=get_layer_scope(scope, 'decoder_dropout', i),
            )

        decoder_cells.append(cell)

    states = build_initial_rnn_decoder_states(
        model=model,
        encoder_units_per_layer=encoder_units_per_layer,
        decoder_units_per_layer=decoder_units_per_layer,
        final_encoder_hidden_states=final_encoder_hidden_states,
        final_encoder_cell_states=final_encoder_cell_states,
        use_attention=(attention_type != 'none'),
    )
    attention_decoder = LSTMWithAttentionDecoder(
        encoder_outputs=encoder_outputs,
        encoder_output_dim=encoder_units_per_layer[-1],
        encoder_lengths=encoder_lengths,
        vocab_size=vocab_size,
        attention_type=attention_type,
        embedding_size=embedding_size,
        decoder_num_units=decoder_units_per_layer[-1],
        decoder_cells=decoder_cells,
        weighted_encoder_outputs=weighted_encoder_outputs,
    )
    decoder_outputs, _ = attention_decoder.apply_over_sequence(
        model=model,
        inputs=embedded_decoder_inputs,
        seq_lengths=input_lengths,
        initial_states=states,
    )

    # we do softmax over the whole sequence
    # (max_length in the batch * batch_size) x decoder embedding size
    # -1 because we don't know max_length yet
    decoder_outputs_flattened, _ = model.net.Reshape(
        [decoder_outputs],
        [
            'decoder_outputs_flattened',
            'decoder_outputs_and_contexts_combination_old_shape',
        ],
        shape=[-1, attention_decoder.get_output_dim()],
    )

    decoder_outputs = decoder_outputs_flattened
    decoder_output_dim = attention_decoder.get_output_dim()

    return (decoder_outputs, decoder_output_dim)
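The CPU-side Gather followed by CopyCPUToGPU above is a common way to keep a
large embedding table in host memory while the rest of the net trains on GPU.
A minimal sketch of that pattern in isolation (blob names 'embeddings' and
'inputs' are placeholders):

from caffe2.proto import caffe2_pb2
from caffe2.python import core, model_helper

model = model_helper.ModelHelper(name="gather_sketch")
# Pin the Gather to CPU: the embedding table lives in host memory.
with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
    embedded_cpu = model.net.Gather(
        ['embeddings', 'inputs'], 'embedded_cpu')
# Move the (much smaller) gathered rows to the ambient GPU device.
embedded = model.net.CopyCPUToGPU(embedded_cpu, 'embedded')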
Example #2
    def test_multiple_optimizers(self):
        from caffe2.python import brew, core, optimizer
        from caffe2.python.model_helper import ModelHelper

        model = ModelHelper(name="test")
        fc1 = brew.fc(model, 'data', 'fc1', 100, 50)
        fc2 = brew.fc(model, fc1, 'fc2', 50, 25)
        pred = brew.fc(model, fc2, 'fc3', 25, 10)
        (softmax, loss) = model.SoftmaxWithLoss(
            [pred, 'label'],
            ['softmax', 'loss'],
        )
        model.AddGradientOperators([loss])

        param_to_device = optimizer._get_param_to_device(model)

        def infer_blob_device(blob_name):
            return optimizer.get_param_device(blob_name,
                                              "{}_grad".format(blob_name),
                                              param_to_device)

        sgd_1 = optimizer.SgdOptimizer(base_learning_rate=0.1)
        sgd_2 = optimizer.SgdOptimizer(base_learning_rate=0.2)
        adagrad = optimizer.AdagradOptimizer()

        # Check that the same optimizer instance shares one learning rate.
        with core.DeviceScope(infer_blob_device("fc1_w")):
            sgd_1(model.net, model.param_init_net, "fc1_w", "fc1_w_grad")
        with core.DeviceScope(infer_blob_device("fc1_b")):
            sgd_1(model.net, model.param_init_net, "fc1_b", "fc1_b_grad")
        fc1_lr_blobs = []
        for op in model.net.Proto().op:
            if op.type == 'WeightedSum' and \
                    op.input[0] in ('fc1_w', 'fc1_b'):
                fc1_lr_blobs.append(op.input[3])
        self.assertEqual(fc1_lr_blobs[0], fc1_lr_blobs[1])

        # Check that different instances of the same optimizer get different lrs.
        with core.DeviceScope(infer_blob_device("fc2_w")):
            sgd_2(model.net, model.param_init_net, "fc2_w", "fc2_w_grad")
        with core.DeviceScope(infer_blob_device("fc2_b")):
            sgd_2(model.net, model.param_init_net, "fc2_b", "fc2_b_grad")
        fc2_lr_blobs = []
        for op in model.net.Proto().op:
            if op.type == 'WeightedSum' and \
                    op.input[0] in ('fc2_w', 'fc2_b'):
                self.assertTrue(op.input[3] not in fc1_lr_blobs)
                fc2_lr_blobs.append(op.input[3])
        self.assertEqual(fc2_lr_blobs[0], fc2_lr_blobs[1])

        # Check different optimizer type case
        with core.DeviceScope(infer_blob_device("fc3_w")):
            adagrad(model.net, model.param_init_net, "fc3_w", "fc3_w_grad")
        with core.DeviceScope(infer_blob_device("fc3_b")):
            adagrad(model.net, model.param_init_net, "fc3_b", "fc3_b_grad")
        fc3_lr_blobs = []
        for op in model.net.Proto().op:
            if op.type == 'Adagrad' and \
                    op.input[0] in ('fc3_w', 'fc3_b'):
                self.assertTrue(op.input[3] not in fc2_lr_blobs)
                self.assertTrue(op.input[3] not in fc1_lr_blobs)
                fc3_lr_blobs.append(op.input[3])
        self.assertEqual(fc3_lr_blobs[0], fc3_lr_blobs[1])
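A condensed sketch of what this test verifies: one optimizer instance creates
a single learning-rate blob per device and reuses it for every parameter it
is applied to (blob names 'w' and 'b' below are placeholders):

from caffe2.python import model_helper, optimizer

model = model_helper.ModelHelper(name="lr_sketch")
sgd = optimizer.SgdOptimizer(base_learning_rate=0.1)
sgd(model.net, model.param_init_net, 'w', 'w_grad')
sgd(model.net, model.param_init_net, 'b', 'b_grad')
lr_blobs = {str(op.input[3]) for op in model.net.Proto().op
            if op.type == 'WeightedSum'}
assert len(lr_blobs) == 1  # both updates share one LR blob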
Example #3
def _build(
    model,
    optimizer,
    weights_only=False,
    use_param_info_optim=True,
    max_gradient_norm=None,
    allow_lr_injection=False,
):
    param_to_device = _get_param_to_device(model)

    # Validate there are no duplicate params
    model.Validate()

    params = []
    for param_info in model.GetOptimizationParamInfo():
        if weights_only and param_info.blob not in model.weights:
            continue
        params.append(param_info)

    lr_multiplier = None
    if max_gradient_norm is not None:
        lr_multiplier = _calc_norm_ratio(
            model,
            params,
            'norm_clipped_grad_update',
            param_to_device,
            max_gradient_norm,
        )

    if allow_lr_injection:
        if not model.net.BlobIsDefined(_LEARNING_RATE_INJECTION):
            lr_injection = model.param_init_net.ConstantFill(
                [],
                _LEARNING_RATE_INJECTION,
                shape=[1],
                value=1.0,
            )
        else:
            lr_injection = _LEARNING_RATE_INJECTION

        if lr_multiplier is None:
            lr_multiplier = lr_injection
        else:
            lr_multiplier = model.net.Mul(
                [lr_multiplier, lr_injection],
                'lr_multiplier',
                broadcast=1,
            )
    optimizer.add_lr_multiplier(lr_multiplier)

    for param_info in params:
        param_name = str(param_info.blob)

        device = get_param_device(param_name, param_info.grad, param_to_device)

        with core.DeviceScope(device):
            if param_info.optimizer and use_param_info_optim:
                param_info.optimizer(model.net, model.param_init_net,
                                     param_info)
            else:
                optimizer(model.net, model.param_init_net, param_info)
    return optimizer
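When allow_lr_injection is enabled, the builder reads a one-element blob
(named by _LEARNING_RATE_INJECTION, 'lr_injection' by default) and folds it
into the LR multiplier, so every learning rate can be rescaled at runtime
without rebuilding the net. A usage sketch, assuming that default blob name:

import numpy as np
from caffe2.python import workspace

# Halve every learning rate for all subsequent iterations.
workspace.FeedBlob('lr_injection', np.array([0.5], dtype=np.float32))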
Example #4
    def run_model(self, devices, gpu):
        '''
        Helper function for test_equiv
        '''
        def input_builder_fun(model):
            return None

        def model_build_fun(model, loss_scale):
            fc = model.FC("data", "fc", 16, 1, ("ConstantFill", {}),
                          ("ConstantFill", {}))
            fc_fl = model.FlattenToVec(fc, "fc_fl")
            sigm = model.Sigmoid(fc_fl, "sigm")
            sq = model.SquaredL2Distance([sigm, "label"], "sq")
            loss = model.AveragedLoss(sq, "loss")
            loss = model.Scale(loss, scale=loss_scale)

            # For testing explicit sync
            model.param_init_net.UniformFill([], ["sync_num"], shape=[1])
            return [loss]

        def add_optimizer(model):
            return optimizer.build_sgd(
                model,
                0.1,
                policy="fixed",
                max_gradient_norm=5.0,
                allow_lr_injection=True,
            )

        workspace.ResetWorkspace()
        model = cnn.CNNModelHelper(
            order="NHWC",
            name="test{}".format(devices),
        )
        data_parallel_model.Parallelize(
            model,
            input_builder_fun=input_builder_fun,
            forward_pass_builder_fun=model_build_fun,
            optimizer_builder_fun=add_optimizer,
            devices=devices,
            cpu_device=not gpu,
            shared_model=not gpu,
            combine_spatial_bn=not gpu,
        )
        data_parallel_model.AddBlobSync(model, ["sync_num"])

        # Light test for LR names
        lr_names = data_parallel_model.GetLearningRateBlobNames(model)
        self.assertGreater(len(lr_names), 0)

        np.random.seed(2603)

        # Each run has same input, independent of number of gpus
        batch_size = 64
        for i in range(0, 10):
            full_data = np.random.rand(batch_size, 16)
            full_labels = np.round(full_data[:, 0])
            batch_per_device = batch_size // len(devices)

            for (j, g) in enumerate(devices):
                st = j * batch_per_device
                en = st + batch_per_device
                data = full_data[st:en, :].astype(np.float32)
                labels = full_labels[st:en].astype(np.float32)
                with core.DeviceScope(core.DeviceOption(model._device_type,
                                                        g)):
                    workspace.FeedBlob(
                        "{}_{}/data".format(model._device_prefix, g), data)
                    workspace.FeedBlob(
                        "{}_{}/label".format(model._device_prefix, g), labels)

            if i == 0:
                workspace.RunNetOnce(model.param_init_net)
                workspace.CreateNet(model.net)

            workspace.FeedBlob(model._device_prefix + "_0/sync_num",
                               np.array([i * 2]).astype(np.float32),
                               device_option=core.DeviceOption(
                                   model._device_type, 0))
            workspace.RunNet(model.net.Proto().name)

            # Test AddBlobSync
            for j in model._devices:
                sync = workspace.FetchBlob(model._device_prefix +
                                           "_{}/sync_num".format(j))[0]
                self.assertTrue(abs(sync - i * 2) < 0.01)

        return workspace.FetchBlob("{}_0/fc_w".format(model._device_prefix))
Example #5
    def run_model(self, V, gpu_devices):
        def input_builder_fun(model):
            return None

        def model_build_fun(model, loss_scale):
            gpu_vecs_gathered = []
            gpu_vecs = []
            for num, vec in enumerate(self.vecs):
                gpu_vec = model.param_init_net.CopyCPUToGPU(
                    vec,
                    'gpuvec_{}'.format(num),
                )
                if num != 2:
                    model.params.append(gpu_vec)
                gpu_vecs.append(gpu_vec)
            for num, gpu_vec in enumerate(gpu_vecs):
                gpu_vec_gathered = model.net.Gather(
                    [gpu_vec, 'indices'], ['gpu_vec_gathered_{}'.format(num)])
                gpu_vecs_gathered.append(gpu_vec_gathered)

            assert len(gpu_vecs_gathered) == 3

            fc = model.net.FC(
                [
                    gpu_vecs_gathered[2],
                    gpu_vecs_gathered[0],
                    gpu_vecs_gathered[1],
                ],
                ['fc'],
            )
            _, loss = model.net.SoftmaxWithLoss(
                [fc, 'label'],
                ['ce_loss', 'avg_loss'],
                only_loss=True,
            )
            loss = model.Scale(loss, scale=loss_scale)
            model.net.Print(loss, [], limit=10)
            return [loss]

        def param_update_fun(model):
            ONE = model.param_init_net.ConstantFill(
                [],
                "ONE",
                shape=[1],
                value=1.0,
            )
            LR = model.CopyCPUToGPU(self.LR, "LR")
            for param in model.GetParams():
                param_grad = model.param_to_grad[param]
                if not isinstance(param_grad, core.GradientSlice):
                    model.WeightedSum([param, ONE, param_grad, LR], param)
                else:
                    model.net.ScatterWeightedSum(
                        [
                            param,
                            ONE,
                            param_grad.indices,
                            param_grad.values,
                            ONE,
                        ],
                        param,
                    )

        workspace.ResetWorkspace()
        model = cnn.CNNModelHelper(
            order="NHWC",
            name="sparse_test{}".format(gpu_devices),
        )
        batch_size = 32
        batch_per_device = batch_size // len(gpu_devices)

        with core.NameScope("cpu"):
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                self.ITER = model.Iter("ITER")
                self.LR = model.net.LearningRate(
                    [self.ITER],
                    "LR",
                    base_lr=(-0.1),
                    policy="fixed",
                )
                '''
                self.vecs consists of 3 big blobs on which we call Gather:
                1) FC weights, shape=(V, 16)
                2) FC bias, shape=(V)
                3) FC input, shape=(batch_per_device, 16)
                '''
                self.vecs = [
                    model.param_init_net.UniformFill([],
                                                     "vec_{}".format(num),
                                                     shape=[V, 16])
                    for num in range(2)
                ]
                self.vecs.append(
                    model.param_init_net.UniformFill(
                        [], "vec_2", shape=[batch_per_device, 16]))
                self.ONE_CPU = model.param_init_net.ConstantFill(
                    [],
                    "ONE_CPU",
                    shape=[1],
                    value=1.0,
                )

        data_parallel_model.Parallelize_GPU(
            model,
            input_builder_fun=input_builder_fun,
            forward_pass_builder_fun=model_build_fun,
            param_update_builder_fun=param_update_fun,
            devices=gpu_devices,
        )

        # Update the vecs
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
            for num, vec in enumerate(self.vecs[:-1]):
                model.CopyGPUToCPU("gpu_0/gpuvec_{}".format(num), vec)

        # Each run has same input, independent of number of gpus
        for i in range(0, 10):
            np.random.seed(2603)
            full_indices = np.random.permutation(V)[:batch_size].reshape(
                batch_size)
            full_labels = full_indices[:] % batch_per_device

            for (j, g) in enumerate(gpu_devices):
                st = j * batch_per_device
                en = st + batch_per_device
                indices = full_indices[st:en].astype(np.int32)
                labels = full_labels[st:en].astype(np.int32)

                with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, g)):
                    workspace.FeedBlob("gpu_{}/indices".format(g), indices)
                    workspace.FeedBlob("gpu_{}/label".format(g), labels)

            if i == 0:
                workspace.RunNetOnce(model.param_init_net)
                # Force vecs to be same on all runs
                orig_vecs = [
                    np.random.rand(V, 16).astype(np.float32),
                    np.random.rand(V).astype(np.float32),
                    np.random.rand(V, 16).astype(np.float32),
                ]
                for vec, orig_vec in zip(self.vecs, orig_vecs):
                    workspace.FeedBlob(vec, orig_vec)
                for g in gpu_devices:
                    for num, orig_vec in enumerate(orig_vecs):
                        workspace.FeedBlob(
                            "gpu_{}/gpuvec_{}".format(g, num),
                            orig_vec,
                            device_option=core.DeviceOption(
                                caffe2_pb2.CUDA, g),
                        )
                workspace.CreateNet(model.net)

            workspace.RunNet(model.net.Proto().name)

            idx = workspace.FetchBlob('gpu_0/indices')
            grad_slices = [
                workspace.FetchBlob('gpu_{}/gpu_vec_gathered_{}_grad'.format(
                    g, num)) for g in gpu_devices for num in range(2)
            ]
            for grad_slice in grad_slices:
                # print (len(idx), len(grad_slice))
                assert len(idx) == len(grad_slice), (
                    'Number of indices {} is not same as number of gradient '
                    'slices {}. This might lead to illegal memory access'.
                    format(len(idx), len(grad_slice)))
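The ScatterWeightedSum update used in param_update_fun only touches the rows
selected by the gradient slice's indices. A NumPy sketch of its semantics
(param[idx] = w0 * param[idx] + w1 * values):

import numpy as np

param = np.zeros((5, 2), dtype=np.float32)
idx = np.array([1, 3])
values = np.ones((2, 2), dtype=np.float32)
param[idx] = 1.0 * param[idx] + (-0.1) * values  # sparse SGD step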
Example #6
def _AllReduceBlobsDistributed(
    blob_names,
    devices,
    model,
    net,
    rendezvous,
    max_concurrent_distributed_ops,
):
    num_workers = model.net.Proto().num_workers
    assert num_workers > 1, "Please specify more than 1 worker"
    all_reduce_engine = rendezvous['engine']

    master_device_opt = core.DeviceOption(model._device_type, devices[0])

    reducing_device_opt = master_device_opt

    context = CollectivesConcurrencyControl(
        "allreduce",
        max_concurrent_distributed_ops,
        model.param_init_net,
        rendezvous
    )

    nccl_control_blob = None

    for blob_name in blob_names:
        master_blob = model._device_grouped_blobs[blob_name][devices[0]]
        blobs_group = list(viewvalues(model._device_grouped_blobs[blob_name]))

        assert master_blob in blobs_group

        # Remark: NCCLReduce does not support in-place modifications
        # so we need a temporary blob
        reduced_blob = str(master_blob) + "_red"

        def allreduce(blobs):
            with core.DeviceScope(reducing_device_opt):
                comm_world, control_input = \
                    context.get_control_and_context(blobs[0])
                net.Allreduce(
                    inputs=[comm_world] + blobs,
                    outputs=blobs,
                    name=blob_name,
                    engine=all_reduce_engine,
                    control_input=control_input,
                    status_blob="allreduce_{}_status".format(blob_name),
                )

        if rendezvous['engine'] == 'GLOO':
            # With Gloo cross GPU and cross machine allreduce
            # can be executed in a single operation
            allreduce(blobs_group)
        else:
            # Step 1: sum blobs from local GPUs to master GPU
            with core.DeviceScope(master_device_opt):
                model.ConstantFill(master_blob, reduced_blob, value=0.0)

                # Temp fix since NCCLReduce does not work
                net.NCCLAllreduce(
                    blobs_group,
                    blobs_group,
                    control_input=nccl_control_blob,
                )
                nccl_control_blob = blobs_group[0]
                net.Copy(master_blob, reduced_blob)

            # Step 2: allreduce between all hosts, between master GPUs
            allreduce([reduced_blob])

            with core.DeviceScope(master_device_opt):
                net.Copy(reduced_blob, master_blob)

            # Step 3: broadcast locally
            _Broadcast(devices, model, net, blob_name)
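A NumPy sketch of the non-GLOO path above, which reduces in three stages
(local sum to the master GPU, cross-host allreduce between master GPUs,
local broadcast):

import numpy as np

local_grads = [np.ones(4) * (g + 1) for g in range(2)]  # per-GPU grads
master = np.sum(local_grads, axis=0)                # step 1: local reduce
# step 2: a cross-host allreduce would sum `master` with its peers here
local_grads = [master.copy() for _ in local_grads]  # step 3: broadcast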
Example #7
    def test_device_scope_check(self):
        with self.assertRaises(AssertionError):
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
                data_parallel_model.Parallelize_GPU(None, None, None)
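The assertion comes from the parallelization helpers themselves: they manage
device placement internally, so they refuse to run under an ambient non-CPU
DeviceScope. A sketch of that precondition check (the exact message is an
assumption):

from caffe2.proto import caffe2_pb2
from caffe2.python import scope

cur = scope.CurrentDeviceScope()
assert cur is None or cur.device_type == caffe2_pb2.CPU, \
    "Parallelize must be called without an ambient device scope"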
Example #8
    def run_model(self, devices, gpu):
        '''
        Helper function for test_equiv
        '''
        def input_builder_fun(model):
            return None

        def model_build_fun(model, loss_scale):
            fc = model.FC("data", "fc", 16, 1, ("ConstantFill", {}),
                          ("ConstantFill", {}))
            fc_fl = model.FlattenToVec(fc, "fc_fl")
            sigm = model.Sigmoid(fc_fl, "sigm")
            sq = model.SquaredL2Distance([sigm, "label"], "sq")
            loss = model.AveragedLoss(sq, "loss")
            loss = model.Scale(loss, scale=loss_scale)
            return [loss]

        def add_optimizer(model):
            optimizer.build_sgd(model, 0.1, policy="fixed")

        workspace.ResetWorkspace()
        model = cnn.CNNModelHelper(
            order="NHWC",
            name="test{}".format(devices),
        )
        data_parallel_model.Parallelize(
            model,
            input_builder_fun=input_builder_fun,
            forward_pass_builder_fun=model_build_fun,
            optimizer_builder_fun=add_optimizer,
            devices=devices,
            cpu_device=not gpu,
        )

        np.random.seed(2603)

        # Each run has same input, independent of number of gpus
        batch_size = 64
        for i in range(0, 10):
            full_data = np.random.rand(batch_size, 16)
            full_labels = np.round(full_data[:, 0])
            batch_per_device = batch_size // len(devices)

            for (j, g) in enumerate(devices):
                st = j * batch_per_device
                en = st + batch_per_device
                data = full_data[st:en, :].astype(np.float32)
                labels = full_labels[st:en].astype(np.float32)
                with core.DeviceScope(core.DeviceOption(model._device_type,
                                                        g)):
                    workspace.FeedBlob(
                        "{}_{}/data".format(model._device_prefix, g), data)
                    workspace.FeedBlob(
                        "{}_{}/label".format(model._device_prefix, g), labels)

            if i == 0:
                workspace.RunNetOnce(model.param_init_net)
                workspace.CreateNet(model.net)

            workspace.RunNet(model.net.Proto().name)
        return workspace.FetchBlob("{}_0/fc_w".format(model._device_prefix))
Example #9
    def normalize_dense_matrix(
        self,
        input_matrix: str,
        features: List[int],
        normalization_parameters: Dict[int, NormalizationParameters],
        blobname_prefix: str,
        split_expensive_feature_groups: bool,
    ) -> Tuple[str, List[str]]:
        """
        Normalizes inputs according to parameters. Expects a dense matrix whose ith
        column corresponds to feature i.

        Note that the Caffe2 BatchBoxCox operator isn't implemented on CUDA GPU so
        we need to use a CPU context.

        :param input_matrix: Input matrix to normalize.
        :param features: Array that maps feature ids to column indices.
        :param normalization_parameters: Mapping from feature ids to
            NormalizationParameters.
        :param blobname_prefix: Prefix for input blobs to norm_net.
        :param split_expensive_feature_groups: Whether to split expensive
            feature groups into smaller slices before preprocessing them.
        """
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
            feature_starts = self._get_type_boundaries(
                features, normalization_parameters)

            normalized_input_blobs = []
            parameters: List[str] = []
            for i, feature_type in enumerate(FEATURE_TYPES):
                start_index = feature_starts[i]
                if (i + 1) == len(FEATURE_TYPES):
                    end_index = len(normalization_parameters)
                else:
                    end_index = feature_starts[i + 1]
                if start_index == end_index:
                    continue  # No features of this type
                slices = []

                split_feature_group, split_intervals = self._should_split_feature_group(
                    split_expensive_feature_groups, start_index, end_index,
                    feature_type)

                if split_feature_group:
                    for j in range(len(split_intervals) - 1):
                        slice_blob = self._get_input_blob_indexed(
                            blobname_prefix, feature_type, j)
                        C2.net().Slice(
                            [input_matrix],
                            [slice_blob],
                            starts=[0, split_intervals[j]],
                            ends=[-1, split_intervals[j + 1]],
                        )
                        slices.append((slice_blob, split_intervals[j],
                                       split_intervals[j + 1]))
                else:
                    sliced_input_features = self._get_input_blob(
                        blobname_prefix, feature_type)

                    C2.net().Slice(
                        [input_matrix],
                        [sliced_input_features],
                        starts=[0, start_index],
                        ends=[-1, end_index],
                    )

                    slices.append(
                        (sliced_input_features, start_index, end_index))

                for (slice_blob, start, end) in slices:
                    normalized_input_blob, blob_parameters = self.preprocess_blob(
                        slice_blob,
                        [
                            normalization_parameters[x]
                            for x in features[start:end]
                        ],
                    )
                    logger.info(
                        "Processed split ({}, {}) for feature type {}".format(
                            start, end, feature_type))
                    parameters.extend(blob_parameters)
                    normalized_input_blobs.append(normalized_input_blob)
            for i, inp in enumerate(normalized_input_blobs):
                logger.info("input# {}: {}".format(i, inp))
            concatenated_input_blob, concatenated_input_blob_dim = C2.Concat(
                *normalized_input_blobs, axis=1)
        return concatenated_input_blob, parameters
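The Slice calls above cut out contiguous column ranges; starts/ends index
[row, column], and -1 on the row axis means "through the end". A NumPy
sketch of the same operation:

import numpy as np

m = np.arange(12, dtype=np.float32).reshape(3, 4)
cols = m[:, 1:3]  # Slice with starts=[0, 1], ends=[-1, 3]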
Example #10
def main(argv_new):
    """Main entrypoint"""
    parser = argparse.ArgumentParser(
        description="Train a RL net to play in openAI GYM.")
    parser.add_argument("-x",
                        "--number-steps-total",
                        type=int,
                        help="total number of training steps",
                        default=1000000)
    parser.add_argument("-w",
                        "--number-steps-timeout",
                        type=int,
                        help="number of steps before time out",
                        default=-1)
    parser.add_argument("-i",
                        "--number-iterations",
                        type=int,
                        help="total number of iterations",
                        default=1000)
    parser.add_argument("-y",
                        "--learn-every-n-iterations",
                        type=int,
                        help="training every n numbers of game iterations",
                        default=2)
    parser.add_argument("-z",
                        "--learn-batch-num-every-iteration",
                        type=int,
                        help="batch number for learning each time",
                        default=100)
    parser.add_argument("-b",
                        "--batch-size",
                        type=int,
                        help="batch size for training",
                        default=128)

    parser.add_argument("-s",
                        "--save-iteration",
                        type=int,
                        help="saving checkpoint every n number of iterations",
                        default=-1)
    parser.add_argument("-p",
                        "--path",
                        help="path of the checkpoint file",
                        default=MODEL_PATH)

    parser.add_argument("-q",
                        "--maxq-learning",
                        help="max q over actions instead of current",
                        action="store_true",
                        default=True)
    parser.add_argument("-c",
                        "--constraint",
                        help="constrained actions",
                        action="store_true",
                        default=False)

    parser.add_argument("-t",
                        "--test",
                        help="test (no learning and minimal epsilon)",
                        action="store_true",
                        default=False)
    parser.add_argument("-u",
                        "--upload",
                        help="upload after finishing training/testing",
                        action="store_true",
                        default=False)
    parser.add_argument("-v",
                        "--verbosity",
                        action="count",
                        help="increase output verbosity",
                        default=0)

    parser.add_argument("-g",
                        "--gymenv",
                        help="specify gym env for training",
                        default="CartPole-v0")
    parser.add_argument("-r",
                        "--render",
                        help="render training",
                        action="store_true",
                        default=False)

    parser.add_argument("-a",
                        "--model-id",
                        help="specify training model unique id",
                        default="new")
    parser.add_argument("-m",
                        "--model-type",
                        help="specify training model type:\
                        DQN or ACTORCRITIC",
                        default="DQN")
    parser.add_argument("-o",
                        "--optimizer",
                        help="specify optimizer for training",
                        default="SGD")
    parser.add_argument("-l",
                        "--learning-rate",
                        type=float,
                        help="specify learning rate for training",
                        default=0.01)
    parser.add_argument("-d",
                        "--discount-gamma",
                        type=float,
                        help="specify discounted factor gamma for RL",
                        default=0.9)

    parser.add_argument("--gpu",
                        action="store_true",
                        help="If set, training is going to use GPU 0",
                        default=False)

    args = parser.parse_args(argv_new)
    print("args:", args)

    workspace.GlobalInit(['caffe2', '--caffe2_log_level=2'])
    workspace.ResetWorkspace()

    device = core.DeviceOption(caffe2_pb2.CUDA if args.gpu else caffe2_pb2.CPU,
                               0)
    with core.DeviceScope(device):
        Run(args)
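Because main() takes its argv list explicitly, it can be driven
programmatically as well as from the shell. A hypothetical invocation:

main(["--gymenv", "CartPole-v0", "--gpu", "--number-steps-total", "50000"])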
Example #11
def Test(args):
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = range(args.num_gpus)
        num_gpus = args.num_gpus

    if num_gpus > 0:
        total_batch_size = args.batch_size * num_gpus
        log.info("Running on GPUs: {}".format(gpus))
        log.info("total_batch_size: {}".format(total_batch_size))
    else:
        total_batch_size = args.batch_size
        log.info("Running on CPU")
        log.info("total_batch_size: {}".format(total_batch_size))

    # Model building functions
    def create_model_ops(model, loss_scale):
        return model_builder.build_model(
            model=model,
            model_name=args.model_name,
            model_depth=args.model_depth,
            num_labels=args.num_labels,
            num_channels=args.num_channels,
            crop_size=args.crop_size,
            clip_length=(
                args.clip_length_of if args.input_type == 1
                else args.clip_length_rgb
            ),
            loss_scale=loss_scale,
            is_test=1,
            pred_layer_name=args.pred_layer_name,
        )

    test_model = cnn.CNNModelHelper(
        order="NCHW",
        name="video_model_test",
        use_cudnn=(args.use_cudnn == 1),
        cudnn_exhaustive_search=True,
    )

    test_reader, number_of_examples = model_builder.create_data_reader(
        test_model,
        name="test_reader",
        input_data=args.test_data,
    )

    if args.num_iter <= 0:
        num_iter = int(number_of_examples / total_batch_size)
    else:
        num_iter = args.num_iter

    def test_input_fn(model):
        model_helper.AddVideoInput(
            test_model,
            test_reader,
            batch_size=args.batch_size,
            clip_per_video=args.clip_per_video,
            decode_type=1,
            length_rgb=args.clip_length_rgb,
            sampling_rate_rgb=args.sampling_rate_rgb,
            scale_h=args.scale_h,
            scale_w=args.scale_w,
            crop_size=args.crop_size,
            num_decode_threads=4,
            num_of_class=args.num_labels,
            random_mirror=False,
            random_crop=False,
            input_type=args.input_type,
            length_of=args.clip_length_of,
            sampling_rate_of=args.sampling_rate_of,
            frame_gap_of=args.frame_gap_of,
            do_flow_aggregation=args.do_flow_aggregation,
            flow_data_type=args.flow_data_type,
            get_rgb=(args.input_type == 0),
            get_optical_flow=(args.input_type == 1),
            get_video_id=args.get_video_id,
            use_local_file=args.use_local_file,
        )

    if num_gpus > 0:
        data_parallel_model.Parallelize_GPU(
            test_model,
            input_builder_fun=test_input_fn,
            forward_pass_builder_fun=create_model_ops,
            param_update_builder_fun=None,
            devices=gpus
        )
    else:
        test_model._device_type = caffe2_pb2.CPU
        test_model._devices = [0]
        device_opt = core.DeviceOption(test_model._device_type, 0)
        with core.DeviceScope(device_opt):
            # Because our loaded models are named with "gpu_x", keep the naming for now.
            # TODO: Save model using `data_parallel_model.ExtractPredictorNet`
            # to extract the model for "gpu_0". It also renames
            # the input and output blobs by stripping the "gpu_x/" prefix
            with core.NameScope("{}_{}".format("gpu", 0)):
                test_input_fn(test_model)
                create_model_ops(test_model, 1.0)

    workspace.RunNetOnce(test_model.param_init_net)
    workspace.CreateNet(test_model.net)

    if args.db_type == 'minidb':
        if num_gpus > 0:
            model_helper.LoadModel(args.load_model_path, args.db_type)
            data_parallel_model.FinalizeAfterCheckpoint(test_model)
        else:
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)):
                model_helper.LoadModel(args.load_model_path, args.db_type)
    elif args.db_type == 'pickle':
        if num_gpus > 0:
            model_loader.LoadModelFromPickleFile(
                test_model,
                args.load_model_path,
                use_gpu=True,
                root_gpu_id=gpus[0]
            )
            data_parallel_model.FinalizeAfterCheckpoint(test_model)
        else:
            model_loader.LoadModelFromPickleFile(
                test_model,
                args.load_model_path,
                use_gpu=False
            )
    else:
        log.warning("Unsupported db_type: {}".format(args.db_type))


    # metric counters for classification
    clip_acc = 0
    video_top1 = 0
    video_topk = 0
    video_count = 0
    clip_count = 0

    for i in range(num_iter):
        workspace.RunNet(test_model.net.Proto().name)
        num_devices = 1  # default for cpu
        if num_gpus > 0:
            num_devices = num_gpus

        for g in range(num_devices):
            # get labels
            label = workspace.FetchBlob(
                "gpu_{}".format(g) + '/label'
            )
            # get predictions
            predicts = workspace.FetchBlob("gpu_{}".format(g) + '/softmax')
            assert predicts.shape[0] == args.batch_size * args.clip_per_video

            for j in range(args.batch_size):
                # get label for one video
                sample_label = label[j * args.clip_per_video]
                # get clip accuracy
                for k in range(args.clip_per_video):
                    c1, _ = metric.accuracy_metric(
                        predicts[j * args.clip_per_video + k, :],
                        label[j * args.clip_per_video + k])
                    clip_acc = clip_acc + c1
                # get all clip predictions for one video
                all_clips = predicts[
                    j * args.clip_per_video:(j + 1) * args.clip_per_video, :]
                # aggregate predictions into one
                video_pred = PredictionAggregation(all_clips, args.aggregation)
                c1, ck = metric.accuracy_metric(
                    video_pred, sample_label, args.top_k)
                video_top1 = video_top1 + c1
                video_topk = video_topk + ck

            video_count = video_count + args.batch_size
            clip_count = clip_count + label.shape[0]

        if i > 0 and i % args.display_iter == 0:
            log.info('Iter {}/{}: clip: {}, top 1: {}, top {}: {}'.format(
                i,
                num_iter,
                clip_acc / clip_count,
                video_top1 / video_count,
                args.top_k,
                video_topk / video_count))

    log.info("Test accuracy: clip: {}, top 1: {}, top {}: {}".format(
        clip_acc / clip_count,
        video_top1 / video_count,
        args.top_k,
        video_topk / video_count
    ))

    if num_gpus > 0:
        flops, params = model_helper.GetFlopsAndParams(test_model, gpus[0])
    else:
        flops, params = model_helper.GetFlopsAndParams(test_model)
    log.info('FLOPs: {}, params: {}'.format(flops, params))
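PredictionAggregation (not shown here) combines the per-clip softmax scores
into a single video-level prediction; averaging is one plausible strategy.
A NumPy sketch under that assumption:

import numpy as np

def average_clips(clip_preds):
    # clip_preds: (clips_per_video, num_labels) softmax scores
    return np.mean(clip_preds, axis=0)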
Example #12
    def testEqualToCudnn(self):
        with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType)):
            T = 8
            batch_size = 4
            input_dim = 8
            hidden_dim = 31

            workspace.FeedBlob("seq_lengths",
                               np.array([T] * batch_size, dtype=np.int32))
            workspace.FeedBlob(
                "target",
                np.zeros([T, batch_size, hidden_dim], dtype=np.float32))
            workspace.FeedBlob(
                "hidden_init",
                np.zeros([1, batch_size, hidden_dim], dtype=np.float32))
            workspace.FeedBlob(
                "cell_init",
                np.zeros([1, batch_size, hidden_dim], dtype=np.float32))

            own_model = model_helper.ModelHelper(name="own_lstm")

            input_shape = [T, batch_size, input_dim]
            cudnn_model = model_helper.ModelHelper(name="cudnn_lstm")
            input_blob = cudnn_model.param_init_net.UniformFill(
                [], "input", shape=input_shape)
            workspace.FeedBlob(
                "CUDNN/hidden_init_cudnn",
                np.zeros([1, batch_size, hidden_dim], dtype=np.float32))
            workspace.FeedBlob(
                "CUDNN/cell_init_cudnn",
                np.zeros([1, batch_size, hidden_dim], dtype=np.float32))

            cudnn_output, cudnn_last_hidden, cudnn_last_state, param_extract = rnn_cell.cudnn_LSTM(
                model=cudnn_model,
                input_blob=input_blob,
                initial_states=("hidden_init_cudnn", "cell_init_cudnn"),
                dim_in=input_dim,
                dim_out=hidden_dim,
                scope="CUDNN",
                return_params=True,
            )
            cudnn_loss = cudnn_model.AveragedLoss(
                cudnn_model.SquaredL2Distance([cudnn_output, "target"],
                                              "CUDNN/dist"), "CUDNN/loss")

            own_output, own_last_hidden, _, own_last_state, own_params = rnn_cell.LSTM(
                model=own_model,
                input_blob=input_blob,
                seq_lengths="seq_lengths",
                initial_states=("hidden_init", "cell_init"),
                dim_in=input_dim,
                dim_out=hidden_dim,
                scope="OWN",
                return_params=True,
            )
            own_loss = own_model.AveragedLoss(
                own_model.SquaredL2Distance([own_output, "target"],
                                            "OWN/dist"), "OWN/loss")

            # Add gradients
            cudnn_model.AddGradientOperators([cudnn_loss])
            own_model.AddGradientOperators([own_loss])

            # Add parameter updates
            LR = cudnn_model.param_init_net.ConstantFill([],
                                                         shape=[1],
                                                         value=0.01)
            ONE = cudnn_model.param_init_net.ConstantFill([],
                                                          shape=[1],
                                                          value=1.0)
            for param in cudnn_model.GetParams():
                cudnn_model.WeightedSum(
                    [param, ONE, cudnn_model.param_to_grad[param], LR], param)
            for param in own_model.GetParams():
                own_model.WeightedSum(
                    [param, ONE, own_model.param_to_grad[param], LR], param)

            # Copy states over
            own_model.net.Copy(own_last_hidden, "hidden_init")
            own_model.net.Copy(own_last_state, "cell_init")
            cudnn_model.net.Copy(cudnn_last_hidden, "CUDNN/hidden_init_cudnn")
            cudnn_model.net.Copy(cudnn_last_state, "CUDNN/cell_init_cudnn")

            workspace.RunNetOnce(cudnn_model.param_init_net)
            workspace.CreateNet(cudnn_model.net)

            ##
            ##  CUDNN LSTM MODEL EXECUTION
            ##
            # Get initial values from CuDNN LSTM so we can feed them
            # to our own.
            (param_extract_net, param_extract_mapping) = param_extract
            workspace.RunNetOnce(param_extract_net)
            cudnn_lstm_params = {
                input_type:
                {k: workspace.FetchBlob(v[0])
                 for k, v in viewitems(pars)}
                for input_type, pars in viewitems(param_extract_mapping)
            }

            # Run the model 3 times, so that some parameter updates are done
            workspace.RunNet(cudnn_model.net.Proto().name, 3)

            ##
            ## OWN LSTM MODEL EXECUTION
            ##
            # Map the cuDNN parameters to our own
            workspace.RunNetOnce(own_model.param_init_net)
            rnn_cell.InitFromLSTMParams(own_params, cudnn_lstm_params)

            # Run the model 3 times, so that some parameter updates are done
            workspace.CreateNet(own_model.net)
            workspace.RunNet(own_model.net.Proto().name, 3)

            ##
            ## COMPARE RESULTS
            ##
            # Then compare that final results after 3 runs are equal
            own_output_data = workspace.FetchBlob(own_output)
            own_last_hidden = workspace.FetchBlob(own_last_hidden)
            own_loss = workspace.FetchBlob(own_loss)

            cudnn_output_data = workspace.FetchBlob(cudnn_output)
            cudnn_last_hidden = workspace.FetchBlob(cudnn_last_hidden)
            cudnn_loss = workspace.FetchBlob(cudnn_loss)

            self.assertTrue(np.allclose(own_output_data, cudnn_output_data))
            self.assertTrue(np.allclose(own_last_hidden, cudnn_last_hidden))
            self.assertTrue(np.allclose(own_loss, cudnn_loss))
Example #13
    def lstm_with_attention(
        self,
        create_lstm_with_attention,
        encoder_output_length,
        encoder_output_dim,
        decoder_input_length,
        decoder_state_dim,
        batch_size,
        ref,
        gc,
    ):
        model = CNNModelHelper(name='external')
        with core.DeviceScope(gc):
            (
                encoder_outputs,
                decoder_inputs,
                decoder_input_lengths,
                initial_decoder_hidden_state,
                initial_decoder_cell_state,
                initial_attention_weighted_encoder_context,
            ) = model.net.AddExternalInputs(
                'encoder_outputs',
                'decoder_inputs',
                'decoder_input_lengths',
                'initial_decoder_hidden_state',
                'initial_decoder_cell_state',
                'initial_attention_weighted_encoder_context',
            )
            create_lstm_with_attention(
                model=model,
                decoder_inputs=decoder_inputs,
                decoder_input_lengths=decoder_input_lengths,
                initial_decoder_hidden_state=initial_decoder_hidden_state,
                initial_decoder_cell_state=initial_decoder_cell_state,
                initial_attention_weighted_encoder_context=(
                    initial_attention_weighted_encoder_context),
                encoder_output_dim=encoder_output_dim,
                encoder_outputs=encoder_outputs,
                decoder_input_dim=decoder_state_dim,
                decoder_state_dim=decoder_state_dim,
                scope='external/LSTMWithAttention',
            )
            op = model.net._net.op[-1]
        workspace.RunNetOnce(model.param_init_net)

        # This is original decoder_inputs after linear layer
        decoder_input_blob = op.input[0]

        workspace.FeedBlob(
            decoder_input_blob,
            np.random.randn(
                decoder_input_length,
                batch_size,
                decoder_state_dim * 4,
            ).astype(np.float32))
        workspace.FeedBlob(
            'external/LSTMWithAttention/encoder_outputs_transposed',
            np.random.randn(
                batch_size,
                encoder_output_dim,
                encoder_output_length,
            ).astype(np.float32),
        )
        workspace.FeedBlob(
            'external/LSTMWithAttention/weighted_encoder_outputs',
            np.random.randn(
                encoder_output_length,
                batch_size,
                encoder_output_dim,
            ).astype(np.float32),
        )
        workspace.FeedBlob(
            decoder_input_lengths,
            np.random.randint(0, decoder_input_length + 1,
                              size=(batch_size, )).astype(np.int32))
        workspace.FeedBlob(
            initial_decoder_hidden_state,
            np.random.randn(1, batch_size,
                            decoder_state_dim).astype(np.float32))
        workspace.FeedBlob(
            initial_decoder_cell_state,
            np.random.randn(1, batch_size,
                            decoder_state_dim).astype(np.float32))
        workspace.FeedBlob(
            initial_attention_weighted_encoder_context,
            np.random.randn(1, batch_size,
                            encoder_output_dim).astype(np.float32))
        inputs = [workspace.FetchBlob(name) for name in op.input]
        self.assertReferenceChecks(
            device_option=gc,
            op=op,
            inputs=inputs,
            reference=ref,
            grad_reference=None,
            output_to_grad=None,
            outputs_to_check=range(6),
        )
        gradients_to_check = [
            index for (index, input_name) in enumerate(op.input)
            if input_name != 'decoder_input_lengths'
        ]
        for param in gradients_to_check:
            self.assertGradientChecks(
                device_option=gc,
                op=op,
                inputs=inputs,
                outputs_to_check=param,
                outputs_with_grads=[0, 4],
                threshold=0.01,
                stepsize=0.001,
            )
Example #14
    def run_model(self, gpu_devices):
        '''
        Helper function for test_equiv
        '''
        def input_builder_fun(model):
            return None

        def model_build_fun(model, loss_scale):
            fc = model.FC("data", "fc", 16, 1, ("ConstantFill", {}),
                          ("ConstantFill", {}))
            fc_fl = model.FlattenToVec(fc, "fc_fl")
            sigm = model.Sigmoid(fc_fl, "sigm")
            sq = model.SquaredL2Distance([sigm, "label"], "sq")
            loss = model.AveragedLoss(sq, "loss")
            loss = model.Scale(loss, scale=loss_scale)
            return [loss]

        def param_update_fun(model):
            ITER = model.Iter("ITER")
            LR = model.net.LearningRate(
                [ITER],
                "LR",
                base_lr=(-0.1),
                policy="fixed",
            )
            ONE = model.param_init_net.ConstantFill(
                [],
                "ONE",
                shape=[1],
                value=1.0,
            )
            for param in model.GetParams():
                grad = model.param_to_grad[param]
                model.WeightedSum([param, ONE, grad, LR], param)

        workspace.ResetWorkspace()
        model = cnn.CNNModelHelper(
            order="NHWC",
            name="test{}".format(gpu_devices),
        )
        data_parallel_model.Parallelize_GPU(
            model,
            input_builder_fun=input_builder_fun,
            forward_pass_builder_fun=model_build_fun,
            param_update_builder_fun=param_update_fun,
            devices=gpu_devices,
        )

        np.random.seed(2603)

        # Each run has same input, independent of number of gpus
        batch_size = 64
        for i in range(0, 10):
            full_data = np.random.rand(batch_size, 16)
            full_labels = np.round(full_data[:, 0])
            batch_per_device = batch_size // len(gpu_devices)

            for (j, g) in enumerate(gpu_devices):
                st = j * batch_per_device
                en = st + batch_per_device
                data = full_data[st:en, :].astype(np.float32)
                labels = full_labels[st:en].astype(np.float32)
                with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, g)):
                    workspace.FeedBlob("gpu_{}/data".format(g), data)
                    workspace.FeedBlob("gpu_{}/label".format(g), labels)

            if i == 0:
                workspace.RunNetOnce(model.param_init_net)
                workspace.CreateNet(model.net)

            print(i, workspace.FetchBlob("gpu_0/fc_w").flatten()[:5])
            workspace.RunNet(model.net.Proto().name)

        return workspace.FetchBlob("gpu_0/fc_w")
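Note the trick in param_update_fun: base_lr is negative, so the plain
WeightedSum computes a descent step. A NumPy sketch:

import numpy as np

param = np.ones(3, dtype=np.float32)
grad = np.ones(3, dtype=np.float32)
lr = -0.1  # base_lr=(-0.1) above
param = 1.0 * param + lr * grad  # -> array([0.9, 0.9, 0.9])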
Example #15
def Parallelize_GPU_BMUF(
    model_helper_obj,
    input_builder_fun,
    forward_pass_builder_fun,
    param_update_builder_fun,
    block_learning_rate=1.0,
    block_momentum=None,
    devices=None,
    rendezvous=None,
    net_type='dag',
    master_gpu=None,
    use_nccl=False,
    optimize_gradient_memory=False,
    reset_momentum_sgd=False,
    warmup_iterations=None,
    max_concurrent_distributed_ops=4,
):
    '''
    Creates a model that runs on many GPUs, plus a net of parameter updates
    that can be run independently for a number of iterations, followed by
    another net that runs once to compute the final parameter updates
    according to the block-wise model update filtering rule described in:
    Scalable Training of Deep Learning Machines by Incremental Block
    Training with Intra-block Parallel Optimization and Blockwise Model-Update
    Filtering (ICASSP 2016).
    '''
    assert isinstance(model_helper_obj, model_helper.ModelHelper)

    if devices is None:
        devices = list(range(0, workspace.NumCudaDevices()))
    if master_gpu is None:
        master_gpu = devices[0]

    model_helper_obj._devices = devices
    model_helper_obj._rendezvous = rendezvous
    model_helper_obj._device_type = caffe2_pb2.CUDA
    model_helper_obj._device_prefix = 'gpu'
    master_gpu_opt = core.DeviceOption(caffe2_pb2.CUDA, master_gpu)

    num_shards = rendezvous['num_shards'] if rendezvous else 1
    num_workers = len(devices) * num_shards
    num_worker_threads = 4 * len(devices)
    if rendezvous:
        num_worker_threads += 8
    loss_scale = 1.0 / num_workers
    if block_momentum is None:
        block_momentum = 1.0 - 1.0 / num_workers

    max_concurrent_distributed_ops = min(
        max_concurrent_distributed_ops,
        num_worker_threads - 1
    )

    model_helper_obj.net.Proto().num_workers = num_worker_threads
    model_helper_obj.net.Proto().type = net_type

    # A net for initializing global model parameters. It is called once in
    # the same step as net parameter initialization.
    model_helper_obj._global_model_init_net = core.Net('global_model_init')
    model_helper_obj._global_model_init_net.Proto().type = net_type
    model_helper_obj._global_model_init_net.Proto().num_workers = \
        num_worker_threads

    # A net for computing final parameter updates. It runs once after the
    # main net (local model updates) has run `num_local_iterations` times.
    model_helper_obj._global_model_param_updates_net = core.Net('global_model')
    model_helper_obj._global_model_param_updates_net.Proto().type = net_type
    model_helper_obj._global_model_param_updates_net.Proto().num_workers = \
        num_worker_threads

    def _v(param):
        return "{}_v".format(param)

    def _g(param):
        return "{}_g".format(param)

    # Keep track of params that were in the model before: they are not
    # data parallel, so we need to handle them separately
    non_datapar_params = copy.copy(model_helper_obj.params)
    model_helper_obj._losses_by_gpu = {}

    def _InitializeModels(gpu_id):
        input_builder_fun(model_helper_obj)
        loss = forward_pass_builder_fun(model_helper_obj, loss_scale)
        model_helper_obj._losses_by_gpu[gpu_id] = loss
    _ForEachGPU(devices, _InitializeModels, scoped=True)

    model_helper_obj._device_grouped_blobs =\
        _GroupByDevice(model_helper_obj, devices,
                       model_helper_obj.params, non_datapar_params)

    model_helper_obj._param_names =\
        model_helper_obj._device_grouped_blobs.keys()

    _AddGradientOperators(
        devices, model_helper_obj, model_helper_obj._losses_by_gpu
    )

    _InferBlobDevice(model_helper_obj)

    def _InitializeParamUpdate(gpu_id):
        param_update_builder_fun(model_helper_obj)
    _ForEachGPU(devices, _InitializeParamUpdate, scoped=True)

    model_parameter_names = list(
        viewkeys(model_helper_obj._device_grouped_blobs)
    )
    if warmup_iterations is not None:
        model_helper_obj._warmup_iterations = warmup_iterations
        # A net for broadcasting gpu-0 (master shard) parameters after
        # running the net for `warmup_iterations`.
        model_helper_obj._warmup_broadcast = core.Net('warmup-broadcast')
        model_helper_obj._warmup_broadcast.Proto().type = net_type
        model_helper_obj._warmup_broadcast.Proto().num_workers = \
            num_worker_threads

        _SyncAllParams(
            devices,
            model_helper_obj,
            model_helper_obj.param_init_net,
            model_helper_obj._warmup_broadcast,
            rendezvous,
            model_parameter_names,
            max_concurrent_distributed_ops
        )
        for param_name in model_helper_obj._device_grouped_blobs.keys():
            param = model_helper_obj._device_grouped_blobs[param_name][master_gpu]
            with core.DeviceScope(master_gpu_opt):
                model_helper_obj._warmup_broadcast.Copy(param, _g(param))

    # (Step-0) Initialize momentum parameters on master GPU.
    for param_name in viewkeys(model_helper_obj._device_grouped_blobs):
        param = model_helper_obj._device_grouped_blobs[param_name][master_gpu]
        with core.DeviceScope(master_gpu_opt):
            model_helper_obj._global_model_init_net.ConstantFill(
                param, _v(param), value=0.0
            )
            model_helper_obj._global_model_init_net.Copy(param, _g(param))

    # (Step-1) Update models for num_local_iterations.

    # (Step-2) Compute the post-local-updates average of the params.
    # Sum model params across GPUs and store the result in place; the
    # division by num_workers that completes the average happens in Step-3.
    _AllReduceBlobs(
        model_parameter_names,
        devices,
        model_helper_obj,
        model_helper_obj._global_model_param_updates_net,
        rendezvous,
        use_nccl,
        max_concurrent_distributed_ops
    )

    # (Step-3) Update momentum params:
    # param_v = block_momentum * param_v
    #           + block_learning_rate * (param_avg - param)
    # param = param + param_v
    for param_name in model_parameter_names:
        param = model_helper_obj._device_grouped_blobs[param_name][master_gpu]
        with core.DeviceScope(master_gpu_opt):
            # TODO(ataei): Stop building the graph here to get the model average?
            model_helper_obj._global_model_param_updates_net.Scale(
                param, param, scale=1.0 / num_workers
            )
            model_helper_obj._global_model_param_updates_net.Sub(
                [param, _g(param)], param
            )
            model_helper_obj._global_model_param_updates_net.Scale(
                param, param, scale=block_learning_rate
            )
            model_helper_obj._global_model_param_updates_net.Scale(
                _v(param), _v(param), scale=block_momentum
            )
            model_helper_obj._global_model_param_updates_net.Add(
                [_v(param), param], _v(param)
            )
            model_helper_obj._global_model_param_updates_net.Add(
                [_g(param), _v(param)], _g(param)
            )
            model_helper_obj._global_model_param_updates_net.Copy(
                _g(param), param
            )
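
    # In plain terms, the op chain above computes (a sketch; `avg` is the
    # cross-worker average of `param`, formed in place by AllReduce + Scale):
    #   v     = block_momentum * v + block_learning_rate * (avg - g)
    #   g     = g + v
    #   param = g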


    _SyncAllParams(
        devices,
        model_helper_obj,
        model_helper_obj.param_init_net,
        model_helper_obj._global_model_param_updates_net,
        rendezvous,
        model_parameter_names,
        max_concurrent_distributed_ops
    )

    # Reset momentum-SGD parameters
    if reset_momentum_sgd:
        momentum_ops = [op for op in model_helper_obj.net.Proto().op
                        if op.type == 'MomentumSGDUpdate']
        for op in momentum_ops:
            momentum_blob = op.input[1]
            with core.DeviceScope(op.device_option):
                model_helper_obj._global_model_param_updates_net.ConstantFill(
                    [momentum_blob], momentum_blob, value=0.0
                )

    if optimize_gradient_memory:
        _OptimizeGradientMemorySimple(
            model_helper_obj, model_helper_obj._losses_by_gpu, devices
        )

    model_helper_obj._data_parallel_model_init_nets = [
        model_helper_obj.param_init_net,
        model_helper_obj._global_model_init_net
    ]

    model_helper_obj._data_parallel_model_nets = [
        model_helper_obj.net,
        (model_helper_obj._global_model_param_updates_net, 1)
    ]
Exemplo n.º 16
0
    def test_inject_copy_placeholder_ops(self):
        '''
        Test injecting cross-device copies with placeholder ops. Placeholder
        ops are decorator/fake ops that don't have an operator schema.
        '''
        # Create CPU and GPU devices on 2 nodes.
        cpu_device = []
        gpu_device = []
        for i in range(0, 2):
            cpu_device.append(caffe2_pb2.DeviceOption())
            cpu_device[i].node_name = 'node:' + str(i)
            gpu_device.append(caffe2_pb2.DeviceOption())
            gpu_device[i].device_type = caffe2_pb2.CUDA
            gpu_device[i].cuda_gpu_id = 0
            gpu_device[i].node_name = 'node:' + str(i)
        send_node = 'node:0'
        recv_node = 'node:1'
        placeholder_send = 'Placeholder:Dummy:Send'
        placeholder_recv = 'Placeholder:Dummy:Recv'

        # init_net.
        init_net = core.Net("init_net")
        with core.DeviceScope(gpu_device[0]):
            weight = init_net.XavierFill([], 'fc_w', shape=[10, 100])
            bias = init_net.ConstantFill([], 'fc_b', shape=[
                10,
            ])
        with core.DeviceScope(cpu_device[0]):
            op = core.CreateOperator(placeholder_send, [weight, bias], [],
                                     dst_node=recv_node)
            init_net._net.op.extend([op])

        # train_net
        train_net = core.Net("train_net")
        with core.DeviceScope(cpu_device[1]):
            # XXX. replace hardcoded op name. Move test to net_transforms.
            op = core.CreateOperator(placeholder_recv, [], [weight, bias],
                                     src_node=send_node)
            train_net._net.op.extend([op])
            train_net.FC(["data", weight, bias], "fc1")

        # Inject cross device copies.
        init_net, x_dev_state = core.InjectCrossDeviceCopies(
            init_net, placeHolderOps=[placeholder_send, placeholder_recv])
        train_net, x_dev_state = core.InjectCrossDeviceCopies(
            train_net,
            x_dev_state,
            placeHolderOps=[placeholder_send, placeholder_recv])

        # Verify (init_net)
        op = init_net._net.op[2]
        self.assertEqual(op.type, "CopyGPUToCPU")
        self.assertEqual(op.device_option.device_type, 1)
        self.assertEqual(op.device_option.cuda_gpu_id, 0)
        self.assertEqual(op.output[0], "fc_w_cpu")
        op = init_net._net.op[3]
        self.assertEqual(op.type, "CopyGPUToCPU")
        self.assertEqual(op.device_option.device_type, 1)
        self.assertEqual(op.device_option.cuda_gpu_id, 0)
        self.assertEqual(op.output[0], "fc_b_cpu")
        op = init_net._net.op[4]
        self.assertEqual(op.type, placeholder_send)
        self.assertEqual(op.device_option.device_type, 0)
        self.assertEqual(op.input[0], "fc_w_cpu")
        self.assertEqual(op.input[1], "fc_b_cpu")
        # Verify (train_net)
        op = train_net._net.op[0]
        self.assertEqual(op.type, placeholder_recv)
        self.assertEqual(op.device_option.device_type, 0)
        self.assertEqual(op.output[0], "fc_w_cpu")
        self.assertEqual(op.output[1], "fc_b_cpu")
        op = train_net._net.op[3]
        self.assertEqual(op.type, "FC")
        self.assertEqual(op.device_option.device_type, 0)
        self.assertEqual(op.input[1], "fc_w_cpu")
        self.assertEqual(op.input[2], "fc_b_cpu")
Exemplo n.º 17
0
def Parallelize(
    model_helper_obj,
    input_builder_fun,
    forward_pass_builder_fun,
    param_update_builder_fun=None,
    optimizer_builder_fun=None,
    post_sync_builder_fun=None,
    devices=None,
    rendezvous=None,
    net_type='dag',
    broadcast_computed_params=True,
    optimize_gradient_memory=False,
    use_nccl=False,
    max_concurrent_distributed_ops=16,
    cpu_device=False,
):
    '''
    Function to create a model that can run on many GPUs or CPUs.
      model_helper_obj: an object of ModelHelper
      input_builder_fun:
                         Function that adds the input operators
                         Note: Remember to instantiate the reader outside of
                         this function so all devices share the same reader
                         object.
                         Signature:  input_builder_fun(model)
      forward_pass_builder_fun:
                        Function to add the operators to the model.
                        Must return a list of loss-blob references that
                        are used to build the gradients. A loss_scale
                        parameter is passed in; scale your model's loss
                        by 1.0 / the total number of devices.
                        Signature: forward_pass_builder_fun(model, loss_scale)
      param_update_builder_fun:
                        Function that adds the operators that run after the
                        gradients are computed: the parameter update itself,
                        e.g. applying the weight update and weight decay.
                        Called for each GPU separately.
                        Signature: param_update_builder_fun(model)
      optimizer_builder_fun:
                        Alternative to param_update_builder_fun: allows one
                        to add an optimizer for the whole model. Called only
                        once, without a name or device scope.
      post_sync_builder_fun:
                        Function applied after initial parameter sync has been
                        completed, such as keeping multi-precision parameters
                        in sync.
                        Signature: post_sync_builder_fun(model)
      devices:          List of GPU ids, such as [0, 1, 2, 3].
      rendezvous:       Used for rendezvous in distributed computation; if
                        None, only one node is used. To create a rendezvous,
                        use <TBD>.
      net_type:         Network execution type (defaults to 'dag').
      optimize_gradient_memory: whether to apply 'memonger' to share blobs
                        in gradient computation to reduce memory footprint
      cpu_device:       Use CPUs instead of GPUs.
    '''
    if devices is None:
        devices = list(range(0, workspace.NumCudaDevices()))

    if not cpu_device:
        for gpu in devices:
            if gpu >= workspace.NumCudaDevices():
                log.warning("** Only {} GPUs available, GPUs {} requested".format(
                    workspace.NumCudaDevices(), devices))
                break
        model_helper_obj._device_type = caffe2_pb2.CUDA
        model_helper_obj._device_prefix = "gpu"
        device_name = "GPU"
    else:
        model_helper_obj._device_type = caffe2_pb2.CPU
        model_helper_obj._device_prefix = "cpu"
        device_name = "CPU"

    log.info("Parallelizing model for devices: {}".format(devices))
    extra_workers = 8 if rendezvous is not None else 0  # best-guess
    num_workers = len(devices) * 4 + extra_workers
    max_concurrent_distributed_ops =\
        min(max_concurrent_distributed_ops, num_workers - 1)
    model_helper_obj.net.Proto().num_workers = num_workers
    model_helper_obj.net.Proto().type = net_type

    # Store some information in the model -- a bit ugly
    model_helper_obj._devices = devices
    model_helper_obj._rendezvous = rendezvous
    model_helper_obj._grad_names = []

    assert isinstance(model_helper_obj, model_helper.ModelHelper)

    # Keep track of params that were in the model before: they are not
    # data parallel, so we need to handle them separately
    non_datapar_params = copy.copy(model_helper_obj.params)

    # Add input and model
    log.info("Create input and model training operators")

    losses_by_gpu = {}
    num_shards = 1 if rendezvous is None else rendezvous['num_shards']
    loss_scale = 1.0 / (len(devices) * num_shards)

    has_parameter_updates = param_update_builder_fun is not None or \
        optimizer_builder_fun is not None
    assert not (
        param_update_builder_fun is not None and
        optimizer_builder_fun is not None
    ), 'Can only specify one of param_update_builder_fun, optimizer_builder_fun'

    for device in devices:
        device_opt = core.DeviceOption(model_helper_obj._device_type, device)
        with core.DeviceScope(device_opt):
            with core.NameScope("{}_{}".format(model_helper_obj._device_prefix,
                                               device)):
                log.info("Model for {} : {}".format(device_name, device))
                input_builder_fun(model_helper_obj)
                losses = forward_pass_builder_fun(model_helper_obj, loss_scale)
                # Losses are not needed for test net
                if has_parameter_updates:
                    assert isinstance(losses, list), \
                        'Model builder function must return list of loss blobs'
                    for loss in losses:
                        assert isinstance(loss, core.BlobReference), \
                            'Model builder func must return list of loss blobs'

                losses_by_gpu[device] = losses
    _ValidateParams(model_helper_obj.params)

    # Create parameter map
    model_helper_obj._device_grouped_blobs =\
        _GroupByDevice(model_helper_obj, devices,
                       model_helper_obj.params, non_datapar_params)

    # computed params
    computed_params_grouped =\
        _GroupByDevice(model_helper_obj, devices,
                       model_helper_obj.GetComputedParams(''), [])
    model_helper_obj._device_grouped_blobs.update(computed_params_grouped)

    model_helper_obj._param_names =\
        list(viewkeys(model_helper_obj._device_grouped_blobs))
    model_helper_obj._computed_param_names =\
        list(viewkeys(computed_params_grouped))

    if not has_parameter_updates:
        log.info("Parameter update function not defined --> only forward")
        _InferBlobDevice(model_helper_obj)
        return

    log.info("Adding gradient operators")
    _AddGradientOperators(devices, model_helper_obj, losses_by_gpu)

    _ValidateParams(model_helper_obj.params)

    # Group gradients by device and register to blob lookup
    param_to_grad = model_helper_obj.param_to_grad
    grads_ordered = [param_to_grad[p] for p in
                     model_helper_obj.params if p in param_to_grad]
    non_datapar_grads = [param_to_grad[p] for p in non_datapar_params]

    gradients_grouped = _GroupByDevice(
        model_helper_obj,
        devices,
        grads_ordered,
        non_datapar_grads
    )
    model_helper_obj._device_grouped_blobs.update(gradients_grouped)
    model_helper_obj._grad_names = list(viewkeys(gradients_grouped))
    model_helper_obj._losses_by_gpu = losses_by_gpu

    _InferBlobDevice(model_helper_obj)

    log.info("Add gradient all-reduces for SyncSGD")
    if broadcast_computed_params:
        _BroadcastComputedParams(devices, model_helper_obj, rendezvous, use_nccl)

    if len(model_helper_obj._grad_names) > 0:
        # Gradients in reverse order
        reverse_ordered_grads = _GetReverseOrderedGrads(model_helper_obj)
        assert(len(reverse_ordered_grads) > 0)
        _AllReduceBlobs(
            reverse_ordered_grads,
            devices,
            model_helper_obj,
            model_helper_obj.net,
            rendezvous,
            use_nccl,
            max_concurrent_distributed_ops,
        )
    else:
        log.info("NOTE: Param builder function did not create any parameters.")

    log.info("Post-iteration operators for updating params")
    num_shards = 1 if rendezvous is None else rendezvous['num_shards']

    if param_update_builder_fun is not None:
        for device in devices:
            device_opt = core.DeviceOption(model_helper_obj._device_type, device)
            with core.DeviceScope(device_opt):
                with core.NameScope(
                    "{}_{}".format(model_helper_obj._device_prefix, device)
                ):
                    param_update_builder_fun(model_helper_obj)
    else:
        log.info("Calling optimizer builder function")
        optimizer_builder_fun(model_helper_obj)

    (sync_blobs, sync_names) = _ComputeBlobsToSync(model_helper_obj)
    sync_blobs_grouped = _GroupByDevice(
        model_helper_obj,
        devices,
        sync_blobs,
        [],
    )
    model_helper_obj._device_grouped_blobs.update(sync_blobs_grouped)

    _InferBlobDevice(model_helper_obj)
    _AnalyzeOperators(model_helper_obj)

    # Configure dagnet to run with only one worker on the first iteration,
    # to prevent concurrency problems with allocs and nccl.
    arg = model_helper_obj.Proto().arg.add()
    arg.name = "first_iter_only_one_worker"
    arg.i = 1

    # Add initial parameter syncs
    log.info("Add initial parameter sync")
    _SyncAllParams(
        devices,
        model_helper_obj,
        model_helper_obj.param_init_net,
        model_helper_obj.param_init_net,
        rendezvous,
        sync_names,
        max_concurrent_distributed_ops=1
    )

    # Handle any operations that need to be done after the parameter sync,
    # e.g. making sure multi-precision copies of parameters are up to date
    if post_sync_builder_fun is not None:
        for device in devices:
            device_opt = core.DeviceOption(model_helper_obj._device_type, device)
            with core.DeviceScope(device_opt):
                with core.NameScope(
                    "{}_{}".format(model_helper_obj._device_prefix, device)
                ):
                    post_sync_builder_fun(model_helper_obj)

    if optimize_gradient_memory:
        _OptimizeGradientMemorySimple(model_helper_obj, losses_by_gpu, devices)

    model_helper_obj._data_parallel_model_init_nets = [
        model_helper_obj.param_init_net,
    ]
    model_helper_obj._data_parallel_model_nets = [model_helper_obj.net]
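
As a quick illustration of the builder-function contract documented above, a minimal, hypothetical caller sketch follows; the builder names, blob shapes, and the two-device CPU run are illustrative assumptions, not part of the original example.

# Hypothetical usage sketch for Parallelize (names and shapes are assumed).
from caffe2.python import brew, data_parallel_model, model_helper
from caffe2.python import optimizer, workspace

model = model_helper.ModelHelper(name="parallelize_sketch")

def input_builder(model):
    # In real use, create the reader outside and read from it here so all
    # devices share the same reader; constants keep this sketch self-contained.
    model.net.ConstantFill([], "data", shape=[16, 8], value=1.0)
    model.net.ConstantFill([], "label", shape=[16, 1], value=0.0)

def forward_pass_builder(model, loss_scale):
    fc = brew.fc(model, "data", "fc", dim_in=8, dim_out=1)
    sq = model.net.SquaredL2Distance([fc, "label"], "sq")
    loss = model.net.AveragedLoss(sq, "loss")
    return [model.net.Scale(loss, "loss_scaled", scale=loss_scale)]

def optimizer_builder(model):
    optimizer.build_sgd(model, base_learning_rate=0.1)

data_parallel_model.Parallelize(
    model,
    input_builder_fun=input_builder,
    forward_pass_builder_fun=forward_pass_builder,
    optimizer_builder_fun=optimizer_builder,
    devices=[0, 1],
    cpu_device=True,  # two CPU "devices"; no GPUs required
)
workspace.RunNetOnce(model.param_init_net)
workspace.CreateNet(model.net)
workspace.RunNet(model.net.Proto().name)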
Exemplo n.º 18
0
    def test_inject_copy_multi_use(self):
        net = core.Net("test")
        device_option = caffe2_pb2.DeviceOption()
        device_option.device_type = caffe2_pb2.CUDA
        device_option.cuda_gpu_id = 1

        with core.DeviceScope(device_option):
            net.Relu("data", "relu1")
        net.Relu("data", "relu2")
        with core.DeviceScope(device_option):
            net.Relu("data", "relu3")
        net.Relu("data", "relu4")
        device_option.cuda_gpu_id = 0
        with core.DeviceScope(device_option):
            net.Relu("data", "relu5")
        device_option.cuda_gpu_id = 1
        with core.DeviceScope(device_option):
            net.Relu("data", "relu6")

        new_net, _ = core.InjectCrossDeviceCopies(net)
        op = new_net._net.op[0]
        self.assertEqual(op.type, "CopyCPUToGPU")
        self.assertEqual(op.device_option.device_type, 1)
        self.assertEqual(op.device_option.cuda_gpu_id, 1)
        self.assertEqual(op.output[0], "data_cuda_1")
        op = new_net._net.op[1]
        self.assertEqual(op.type, "Relu")
        self.assertEqual(op.device_option.device_type, 1)
        self.assertEqual(op.device_option.cuda_gpu_id, 1)
        self.assertEqual(op.output[0], "relu1")
        op = new_net._net.op[2]
        self.assertEqual(op.type, "Relu")
        self.assertEqual(op.device_option.device_type, 0)
        self.assertEqual(op.output[0], "relu2")
        op = new_net._net.op[3]
        self.assertEqual(op.type, "Relu")
        self.assertEqual(op.device_option.device_type, 1)
        self.assertEqual(op.device_option.cuda_gpu_id, 1)
        self.assertEqual(op.input[0], "data_cuda_1")
        self.assertEqual(op.output[0], "relu3")
        op = new_net._net.op[4]
        self.assertEqual(op.type, "Relu")
        self.assertEqual(op.device_option.device_type, 0)
        self.assertEqual(op.output[0], "relu4")
        op = new_net._net.op[5]
        self.assertEqual(op.type, "CopyCPUToGPU")
        self.assertEqual(op.device_option.device_type, 1)
        self.assertEqual(op.device_option.cuda_gpu_id, 0)
        self.assertEqual(op.output[0], "data_cuda_0")
        op = new_net._net.op[6]
        self.assertEqual(op.type, "Relu")
        self.assertEqual(op.device_option.device_type, 1)
        self.assertEqual(op.device_option.cuda_gpu_id, 0)
        self.assertEqual(op.input[0], "data_cuda_0")
        self.assertEqual(op.output[0], "relu5")
        op = new_net._net.op[7]
        self.assertEqual(op.type, "Relu")
        self.assertEqual(op.device_option.device_type, 1)
        self.assertEqual(op.device_option.cuda_gpu_id, 1)
        self.assertEqual(op.input[0], "data_cuda_1")
        self.assertEqual(op.output[0], "relu6")
        """
    def __init__(
        self,
        cli_args,
        model=None,
        tag=None,
        enable_prof=False,
    ):
        super(MT_Wide_and_Deep_Wrapper, self).__init__()
        self.args = cli_args

        # GPU Enable Flags
        gpu_en = self.args.use_gpu

        if gpu_en:
            device_opt = core.DeviceOption(caffe2_pb2.CUDA, 0)
            ngpus = C.num_cuda_devices  # 1
            print("(Wrapper) Using {} GPU(s)...".format(ngpus))
        else:
            device_opt = core.DeviceOption(caffe2_pb2.CPU)
            print("(Wrapper) Using CPU...")

        self.gpu_en = gpu_en

        num_tables = len(cli_args.arch_embedding_size.split("-"))

        # We require three data structures in Caffe2 to enable non-blocking
        # inputs for MT_Wide_and_Deep. At a high level, each input needs an
        # input queue: inputs are enqueued when they arrive on the "server"
        # or "core" and dequeued by the model's inference engine.
        # Input Blob -> Input Net -> ID Q ===> MT_Wide_and_Deep model
        self.id_qs = []
        self.id_input_blobs = []
        self.id_input_nets = []

        # Same thing for the lengths inputs
        self.len_qs = []
        self.len_input_blobs = []
        self.len_input_nets = []

        for i in range(num_tables):

            q, input_blob, net = self.build_mtwnd_sparse_queue(tag="id", qid=i)
            self.id_qs.append(q)
            self.id_input_blobs.append(input_blob)
            self.id_input_nets.append(net)

            q, input_blob, net = self.build_mtwnd_sparse_queue(tag="len",
                                                               qid=i)
            self.len_qs.append(q)
            self.len_input_blobs.append(input_blob)
            self.len_input_nets.append(net)

        self.fc_q, self.fc_input_blob, self.fc_input_net = \
            self.build_mtwnd_fc_queue()

        if self.args.queue:
            with core.DeviceScope(device_opt):
                self.mtwnd = MT_Wide_and_Deep(cli_args,
                                              model,
                                              tag,
                                              enable_prof,
                                              id_qs=self.id_qs,
                                              len_qs=self.len_qs,
                                              fc_q=self.fc_q)
        else:
            with core.DeviceScope(device_opt):
                self.mtwnd = MT_Wide_and_Deep(cli_args, model, tag,
                                              enable_prof)
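
The per-input queue pattern described in the comments above (Input Blob -> Input Net -> Queue -> model) can be sketched with Caffe2's blobs-queue operators. The helper below is a hypothetical reconstruction, since build_mtwnd_sparse_queue's body is not shown in this example; names and the capacity are assumptions.

# Hypothetical sketch of one non-blocking input queue; not the actual
# build_mtwnd_sparse_queue implementation.
from caffe2.python import core, workspace

def build_input_queue(tag, qid, capacity=8):
    # Create the queue itself once, up front.
    init_net = core.Net("{}_q_init_{}".format(tag, qid))
    q = init_net.CreateBlobsQueue(
        [], "{}_q_{}".format(tag, qid), num_blobs=1, capacity=capacity)
    workspace.RunNetOnce(init_net)
    # The input net is run by the "server" whenever an input arrives; it
    # enqueues the input blob. The model's net dequeues on the other end
    # with DequeueBlobs(q, ...).
    input_net = core.Net("{}_q_enqueue_{}".format(tag, qid))
    input_blob = input_net.AddExternalInput("{}_input_{}".format(tag, qid))
    input_net.EnqueueBlobs([q, input_blob], [input_blob])
    return q, input_blob, input_net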
Exemplo n.º 20
0
    def assertReferenceChecks(
        self,
        device_option,
        op,
        inputs,
        reference,
        input_device_options=None,
        threshold=1e-4,
        output_to_grad=None,
        grad_reference=None,
        atol=None,
        outputs_to_check=None,
    ):
        """
        This runs the reference Python function implementation
        (effectively calling `reference(*inputs)`) and compares that
        to the output of the operator, with an absolute/relative
        tolerance given by the `threshold` parameter.

        Useful for checking the implementation matches the Python
        (typically NumPy) implementation of the same functionality.

        Usage example:

            @given(X=hu.tensor(), inplace=st.booleans(), **hu.gcs)
            def test_softsign(self, X, inplace, gc, dc):
                op = core.CreateOperator(
                    "Softsign", ["X"], ["X" if inplace else "Y"])

                def softsign(X):
                    return (X / (1 + np.abs(X)),)

                self.assertReferenceChecks(gc, op, [X], softsign)
        """
        op = copy.deepcopy(op)
        op.device_option.CopyFrom(device_option)

        with temp_workspace():
            if (len(op.input) > len(inputs)):
                raise ValueError(
                    'must supply an input for each input on the op: %s vs %s' %
                    (op.input, inputs))
            _input_device_options = input_device_options or \
                core.InferOpBlobDevicesAsDict(op)[0]
            for (n, b) in zip(op.input, inputs):
                workspace.FeedBlob(n,
                                   b,
                                   device_option=_input_device_options.get(
                                       n, device_option))
            net = core.Net("opnet")
            net.Proto().op.extend([op])
            test_shape_inference = False
            try:
                (shapes, types) = workspace.InferShapesAndTypes([net])
                test_shape_inference = True
            except RuntimeError as e:
                # Temporarily catch runtime errors when inferring shape
                # and type info
                logging.warning(str(e))
                if os.getenv('CAFFE2_ASSERT_SHAPEINFERENCE') == '1':
                    raise e
            workspace.RunNetOnce(net)
            reference_outputs = reference(*inputs)
            if not (isinstance(reference_outputs, tuple)
                    or isinstance(reference_outputs, list)):
                raise RuntimeError(
                    "You are providing a wrong reference implementation. A "
                    "proper one should return a tuple/list of numpy arrays.")
            if not outputs_to_check:
                self.assertEqual(len(reference_outputs), len(op.output))
                outputs_to_check = list(range(len(op.output)))
            outs = []
            for (output_index, ref) in zip(outputs_to_check,
                                           reference_outputs):
                output_blob_name = op.output[output_index]
                output = workspace.FetchBlob(output_blob_name)
                if output.dtype.kind in ('S', 'O'):
                    np.testing.assert_array_equal(output, ref)
                else:
                    if atol is None:
                        atol = threshold
                    np.testing.assert_allclose(
                        output,
                        ref,
                        atol=atol,
                        rtol=threshold,
                        err_msg=(
                            'Output {0} is not matching the reference'.format(
                                output_blob_name, )),
                    )
                if test_shape_inference:
                    self._assertInferTensorChecks(output_blob_name, shapes,
                                                  types, output)
                outs.append(output)
            if grad_reference is not None:
                assert output_to_grad is not None, \
                    "If grad_reference is set, " \
                    "output_to_grad has to be set as well"

                with core.DeviceScope(device_option):
                    self._assertGradReferenceChecks(op,
                                                    inputs,
                                                    reference_outputs,
                                                    output_to_grad,
                                                    grad_reference,
                                                    threshold=threshold)

            return outs
    def run_model(self, devices, gpu):
        '''
        Helper function for test_equiv
        '''
        def input_builder_fun(model):
            return None

        def model_build_fun(model, loss_scale):
            workspace.FeedBlob(
                core.ScopedBlobReference("seq_lengths"),
                np.array([self.T] * self.batch_per_device, dtype=np.int32))
            model.param_init_net.ConstantFill(
                [],
                "hidden_init",
                value=0.0,
                shape=[1, self.batch_per_device, self.hidden_dim])
            model.param_init_net.ConstantFill(
                [],
                "cell_init",
                value=0.0,
                shape=[1, self.batch_per_device, self.hidden_dim])

            output, _last_hidden, _, _last_state, = rnn_cell.LSTM(
                model=model,
                input_blob="data",
                seq_lengths="seq_lengths",
                initial_states=("hidden_init", "cell_init"),
                dim_in=self.input_dim,
                dim_out=self.hidden_dim,
                scope="partest",
            )

            # A silly loss function
            loss = model.AveragedLoss(
                model.Sub([output, "target"], "dist"),
                "loss",
            )
            loss = model.Scale(loss, "loss_scaled", scale=loss_scale)
            return [loss]

        def param_update_fun(model):
            ITER = model.Iter("ITER")
            LR = model.net.LearningRate(
                [ITER],
                "LR",
                base_lr=(-0.1),
                policy="fixed",
            )
            ONE = model.param_init_net.ConstantFill(
                [],
                "ONE",
                shape=[1],
                value=1.0,
            )
            for param in model.GetParams():
                param_grad = model.param_to_grad[param]
                model.WeightedSum([param, ONE, param_grad, LR], param)

            assert len(
                model.GetParams()) == len(model.params) // len(model._devices)

        workspace.ResetWorkspace()
        model = cnn.CNNModelHelper(name="recurrent_test{}".format(devices))

        self.T = 8
        self.batch_size = 64
        self.input_dim = 8
        self.hidden_dim = 31
        self.batch_per_device = self.batch_size // len(devices)

        data_parallel_model.Parallelize(
            model,
            input_builder_fun=input_builder_fun,
            forward_pass_builder_fun=model_build_fun,
            param_update_builder_fun=param_update_fun,
            devices=devices,
            optimize_gradient_memory=True,
            cpu_device=not gpu,
        )

        # Change all initialization to ConstantFills so that
        # everything is deterministic
        for op in model.param_init_net.Proto().op:
            if op.type.endswith('Fill'):
                op.type = 'ConstantFill'

        # Each run has the same input, independent of the number of GPUs
        np.random.seed(20150210)
        for i in range(0, 10):
            full_data = np.random.rand(self.T, self.batch_size, self.input_dim)
            full_target = np.random.rand(self.T, self.batch_size,
                                         self.hidden_dim)

            for (j, g) in enumerate(devices):
                st = j * self.batch_per_device
                en = st + self.batch_per_device
                data = full_data[:, st:en, :].astype(np.float32)
                targets = full_target[:, st:en, :].astype(np.float32)
                with core.DeviceScope(core.DeviceOption(model._device_type,
                                                        g)):
                    workspace.FeedBlob(
                        "{}_{}/data".format(model._device_prefix, g), data)
                    workspace.FeedBlob(
                        "{}_{}/target".format(model._device_prefix, g),
                        targets)

            if i == 0:
                workspace.RunNetOnce(model.param_init_net)
                workspace.CreateNet(model.net)

            workspace.RunNet(model.net.Proto().name)

        return workspace.FetchBlob("{}_0/partest/i2h_w".format(
            model._device_prefix))
Exemplo n.º 22
0
    def _build_embedding_encoder(
        self,
        model,
        inputs,
        input_lengths,
        vocab_size,
        embeddings,
        embedding_size,
        use_attention,
        num_gpus,
        forward_only=False,
    ):
        if num_gpus == 0:
            embedded_encoder_inputs = model.net.Gather(
                [embeddings, inputs],
                ['embedded_encoder_inputs'],
            )
        else:
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                embedded_encoder_inputs_cpu = model.net.Gather(
                    [embeddings, inputs],
                    ['embedded_encoder_inputs_cpu'],
                )
            embedded_encoder_inputs = model.CopyCPUToGPU(
                embedded_encoder_inputs_cpu,
                'embedded_encoder_inputs',
            )

        if self.encoder_type == 'rnn':
            assert len(self.encoder_params['encoder_layer_configs']) == 1
            encoder_num_units = (
                self.encoder_params['encoder_layer_configs'][0]['num_units'])
            encoder_initial_cell_state = model.param_init_net.ConstantFill(
                [],
                ['encoder_initial_cell_state'],
                shape=[encoder_num_units],
                value=0.0,
            )
            encoder_initial_hidden_state = (model.param_init_net.ConstantFill(
                [],
                'encoder_initial_hidden_state',
                shape=[encoder_num_units],
                value=0.0,
            ))
            # Choose corresponding rnn encoder function
            if self.encoder_params['use_bidirectional_encoder']:
                rnn_encoder_func = seq2seq_util.rnn_bidirectional_encoder
                encoder_output_dim = 2 * encoder_num_units
            else:
                rnn_encoder_func = seq2seq_util.rnn_unidirectional_encoder
                encoder_output_dim = encoder_num_units

            (
                encoder_outputs,
                final_encoder_hidden_state,
                final_encoder_cell_state,
            ) = rnn_encoder_func(
                model,
                embedded_encoder_inputs,
                input_lengths,
                encoder_initial_hidden_state,
                encoder_initial_cell_state,
                embedding_size,
                encoder_num_units,
                use_attention,
            )
            weighted_encoder_outputs = None
        else:
            raise ValueError('Unsupported encoder type {}'.format(
                self.encoder_type))

        return (
            encoder_outputs,
            weighted_encoder_outputs,
            final_encoder_hidden_state,
            final_encoder_cell_state,
            encoder_output_dim,
        )
    def run_model(self, V, gpu_devices, cpu_indices):
        def input_builder_fun(model):
            return None

        def model_build_fun(model, loss_scale):
            if cpu_indices:
                with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                    gathered_cpu = model.net.Gather([self.vecs, 'indices'],
                                                    'gathered_cpu')

                gathered = model.CopyCPUToGPU(gathered_cpu, "gathered")
            else:
                gpu_vecs = model.param_init_net.CopyCPUToGPU(
                    self.vecs,
                    "gpuvecs",
                )
                model.params.append(gpu_vecs)
                gathered = model.net.Gather([gpu_vecs, 'indices'], 'gathered')
            flattened = model.Flatten(gathered, "flattened")
            fc = model.FC(flattened, "fc", 16 * 16, 1, ("ConstantFill", {}),
                          ("ConstantFill", {}))
            fc_fl = model.FlattenToVec(fc, "fc_fl")
            sigm = model.Sigmoid(fc_fl, "sigm")
            sq = model.SquaredL2Distance([sigm, "label"], "sq")
            loss = model.AveragedLoss(sq, "loss")
            loss = model.Scale(loss, scale=loss_scale)
            return [loss]

        def param_update_fun(model):
            ONE = model.param_init_net.ConstantFill(
                [],
                "ONE",
                shape=[1],
                value=1.0,
            )
            LR = model.CopyCPUToGPU(self.LR, "LR")
            for param in model.GetParams():
                param_grad = model.param_to_grad[param]
                if not isinstance(param_grad, core.GradientSlice):
                    model.WeightedSum([param, ONE, param_grad, LR], param)
                else:
                    param_momentum = model.param_init_net.ConstantFill(
                        [param],
                        param + '_momentum',
                        value=0.0,
                    )
                    model.net.SparseMomentumSGDUpdate(
                        [
                            param_grad.values,
                            param_momentum,
                            LR,
                            param,
                            param_grad.indices,
                        ],
                        [param_grad.values, param_momentum, param],
                        momentum=0.1,
                        nesterov=0,
                    )

        workspace.ResetWorkspace()
        model = cnn.CNNModelHelper(
            order="NHWC",
            name="sparse_test{}".format(gpu_devices),
        )

        with core.NameScope("cpu"):
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                self.ITER = model.Iter("ITER")
                self.LR = model.net.LearningRate(
                    [self.ITER],
                    "LR",
                    base_lr=(-0.1),
                    policy="fixed",
                )
                self.vecs = model.param_init_net.UniformFill([],
                                                             "vecs",
                                                             shape=[V, 16])
                if cpu_indices:
                    model.params.append(self.vecs)
                self.ONE_CPU = model.param_init_net.ConstantFill(
                    [],
                    "ONE_CPU",
                    shape=[1],
                    value=1.0,
                )

        data_parallel_model.Parallelize_GPU(
            model,
            input_builder_fun=input_builder_fun,
            forward_pass_builder_fun=model_build_fun,
            param_update_builder_fun=param_update_fun,
            devices=gpu_devices,
        )

        # Update the vecs
        if cpu_indices:
            with core.NameScope("cpu"):
                with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                    for param in model.GetParams():
                        param_grad = model.param_to_grad[param]
                        model.ScatterWeightedSum([
                            param, self.ONE_CPU, param_grad.indices,
                            param_grad.values, self.LR
                        ], self.vecs)
        else:
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
                model.CopyGPUToCPU("gpu_0/gpuvecs", self.vecs)

        np.random.seed(2603)

        # Each run has the same input, independent of the number of GPUs
        batch_size = 64
        for i in range(0, 10):
            full_indices = np.random.permutation(V)[:batch_size * 16].reshape(
                batch_size, 16)
            full_labels = full_indices[:, 0] % 2
            batch_per_device = batch_size // len(gpu_devices)

            for (j, g) in enumerate(gpu_devices):
                st = j * batch_per_device
                en = st + batch_per_device
                indices = full_indices[st:en, :].astype(np.int32)
                labels = full_labels[st:en].astype(np.float32)

                device_for_indices = core.DeviceOption(caffe2_pb2.CPU)
                if not cpu_indices:
                    device_for_indices = core.DeviceOption(caffe2_pb2.CUDA, g)

                with core.DeviceScope(device_for_indices):
                    workspace.FeedBlob("gpu_{}/indices".format(g), indices)

                with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, g)):
                    workspace.FeedBlob("gpu_{}/label".format(g), labels)

            if i == 0:
                workspace.RunNetOnce(model.param_init_net)
                # Force vecs to be the same on all runs
                orig_vecs = np.random.rand(V, 16).astype(np.float32)
                workspace.FeedBlob(self.vecs, orig_vecs)
                if not cpu_indices:
                    for g in gpu_devices:
                        workspace.FeedBlob(
                            "gpu_{}/gpuvecs".format(g),
                            orig_vecs,
                            device_option=core.DeviceOption(
                                caffe2_pb2.CUDA, g),
                        )
                workspace.CreateNet(model.net)

            workspace.RunNet(model.net.Proto().name)
            if len(gpu_devices) == 2:
                if not cpu_indices:
                    idx = workspace.FetchBlob("gpu_0/indices")
                    idx = list(idx.flatten())
                    n = len(idx)
                    nu = len(set(idx))
                    assert n == nu, "We cannot have duplicate indices"

        # Sanity check to see the vecs were updated
        self.assertFalse(np.allclose(workspace.FetchBlob(self.vecs),
                                     orig_vecs))
        return [
            workspace.FetchBlob(self.vecs if cpu_indices else "gpu_0/gpuvecs"),
            workspace.FetchBlob("gpu_0/fc_w")
        ]
Exemplo n.º 24
0
    def model_build_fun(self, model, forward_only=False, loss_scale=None):
        encoder_inputs = model.net.AddExternalInput(
            workspace.GetNameScope() + 'encoder_inputs', )
        encoder_lengths = model.net.AddExternalInput(
            workspace.GetNameScope() + 'encoder_lengths', )
        decoder_inputs = model.net.AddExternalInput(
            workspace.GetNameScope() + 'decoder_inputs', )
        decoder_lengths = model.net.AddExternalInput(
            workspace.GetNameScope() + 'decoder_lengths', )
        targets = model.net.AddExternalInput(
            workspace.GetNameScope() + 'targets', )
        target_weights = model.net.AddExternalInput(
            workspace.GetNameScope() + 'target_weights', )
        attention_type = self.model_params['attention']
        assert attention_type in ['none', 'regular']

        (
            encoder_outputs,
            weighted_encoder_outputs,
            final_encoder_hidden_state,
            final_encoder_cell_state,
            encoder_output_dim,
        ) = self._build_embedding_encoder(
            model=model,
            inputs=encoder_inputs,
            input_lengths=encoder_lengths,
            vocab_size=self.source_vocab_size,
            embeddings=self.encoder_embeddings,
            embedding_size=self.model_params['encoder_embedding_size'],
            use_attention=(attention_type != 'none'),
            num_gpus=self.num_gpus,
            forward_only=forward_only,
        )

        assert len(self.model_params['decoder_layer_configs']) == 1
        decoder_num_units = (
            self.model_params['decoder_layer_configs'][0]['num_units'])

        if attention_type == 'none':
            decoder_initial_hidden_state = model.FC(
                final_encoder_hidden_state,
                'decoder_initial_hidden_state',
                encoder_output_dim,
                decoder_num_units,
                axis=2,
            )
            decoder_initial_cell_state = model.FC(
                final_encoder_cell_state,
                'decoder_initial_cell_state',
                encoder_output_dim,
                decoder_num_units,
                axis=2,
            )
        else:
            decoder_initial_hidden_state = model.param_init_net.ConstantFill(
                [],
                'decoder_initial_hidden_state',
                shape=[decoder_num_units],
                value=0.0,
            )
            decoder_initial_cell_state = model.param_init_net.ConstantFill(
                [],
                'decoder_initial_cell_state',
                shape=[decoder_num_units],
                value=0.0,
            )
            initial_attention_weighted_encoder_context = (
                model.param_init_net.ConstantFill(
                    [],
                    'initial_attention_weighted_encoder_context',
                    shape=[encoder_output_dim],
                    value=0.0,
                ))

        if self.num_gpus == 0:
            embedded_decoder_inputs = model.net.Gather(
                [self.decoder_embeddings, decoder_inputs],
                ['embedded_decoder_inputs'],
            )
        else:
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                embedded_decoder_inputs_cpu = model.net.Gather(
                    [self.decoder_embeddings, decoder_inputs],
                    ['embedded_decoder_inputs_cpu'],
                )
            embedded_decoder_inputs = model.CopyCPUToGPU(
                embedded_decoder_inputs_cpu,
                'embedded_decoder_inputs',
            )

        # seq_len x batch_size x decoder_embedding_size
        if attention_type == 'none':
            decoder_outputs, _, _, _ = recurrent.LSTM(
                model=model,
                input_blob=embedded_decoder_inputs,
                seq_lengths=decoder_lengths,
                initial_states=(
                    decoder_initial_hidden_state,
                    decoder_initial_cell_state,
                ),
                dim_in=self.model_params['decoder_embedding_size'],
                dim_out=decoder_num_units,
                scope='decoder',
                outputs_with_grads=[0],
            )
            decoder_output_size = decoder_num_units
        else:
            (decoder_outputs, _, _, _, attention_weighted_encoder_contexts,
             _) = recurrent.LSTMWithAttention(
                 model=model,
                 decoder_inputs=embedded_decoder_inputs,
                 decoder_input_lengths=decoder_lengths,
                 initial_decoder_hidden_state=decoder_initial_hidden_state,
                 initial_decoder_cell_state=decoder_initial_cell_state,
                 initial_attention_weighted_encoder_context=(
                     initial_attention_weighted_encoder_context),
                 encoder_output_dim=encoder_output_dim,
                 encoder_outputs=encoder_outputs,
                 decoder_input_dim=self.model_params['decoder_embedding_size'],
                 decoder_state_dim=decoder_num_units,
                 scope='decoder',
                 outputs_with_grads=[0, 4],
             )
            decoder_outputs, _ = model.net.Concat(
                [decoder_outputs, attention_weighted_encoder_contexts],
                [
                    'states_and_context_combination',
                    '_states_and_context_combination_concat_dims',
                ],
                axis=2,
            )
            decoder_output_size = decoder_num_units + encoder_output_dim

        # we do softmax over the whole sequence
        # (max_length in the batch * batch_size) x decoder embedding size
        # -1 because we don't know max_length yet
        decoder_outputs_flattened, _ = model.net.Reshape(
            [decoder_outputs],
            [
                'decoder_outputs_flattened',
                'decoder_outputs_and_contexts_combination_old_shape',
            ],
            shape=[-1, decoder_output_size],
        )
        output_logits = self.output_projection(
            model=model,
            decoder_outputs=decoder_outputs_flattened,
            decoder_output_size=decoder_output_size,
            target_vocab_size=self.target_vocab_size,
            decoder_softmax_size=self.model_params['decoder_softmax_size'],
        )
        targets, _ = model.net.Reshape(
            [targets],
            ['targets', 'targets_old_shape'],
            shape=[-1],
        )
        target_weights, _ = model.net.Reshape(
            [target_weights],
            ['target_weights', 'target_weights_old_shape'],
            shape=[-1],
        )
        output_probs = model.net.Softmax(
            [output_logits],
            ['output_probs'],
            engine=('CUDNN' if self.num_gpus > 0 else None),
        )
        label_cross_entropy = model.net.LabelCrossEntropy(
            [output_probs, targets],
            ['label_cross_entropy'],
        )
        weighted_label_cross_entropy = model.net.Mul(
            [label_cross_entropy, target_weights],
            'weighted_label_cross_entropy',
        )
        total_loss_scalar = model.net.SumElements(
            [weighted_label_cross_entropy],
            'total_loss_scalar',
        )
        total_loss_scalar_weighted = model.net.Scale(
            [total_loss_scalar],
            'total_loss_scalar_weighted',
            scale=1.0 / self.batch_size,
        )
        return [total_loss_scalar_weighted]
Exemplo n.º 25
0
    def caffe2_yellowfin(self, zero_debias, grad_coef, n_dim, n_iter, gpu):
        caffe2_res = {}

        alpha = 1.0
        mu = 0.0
        beta = 0.999
        curv_win_width = 20
        epsilon = 1e-6

        net = core.Net("net")
        param_init_net = core.Net("param_init_net")
        workspace.ResetWorkspace()

        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
            iteration = param_init_net.ConstantFill([],
                                                    "iteration",
                                                    shape=[1],
                                                    value=0,
                                                    dtype=core.DataType.INT64)
            iter_mutex = param_init_net.CreateMutex([], ["iteration_mutex"])
            net.AtomicIter([iter_mutex, iteration], [iteration])
        pre_grad = param_init_net.ConstantFill([],
                                               "pre_grad",
                                               shape=[n_dim],
                                               value=grad_coef)
        if gpu:
            iteration = net.CopyCPUToGPU([iteration], "iteration_cpu")
        iteration_float = net.Cast([iteration], "iteration_float")
        grad = net.Mul([pre_grad, iteration_float], "grad", broadcast=True)
        w = param_init_net.ConstantFill([], "w", shape=[n_dim], value=0.0)

        # a hack to create an object with __dict__
        param_info = lambda: None
        param_info.blob = w
        param_info.grad = grad

        optimizer.YellowFinOptimizer(alpha=alpha,
                                     mu=mu,
                                     beta=beta,
                                     curv_win_width=curv_win_width,
                                     epsilon=epsilon,
                                     zero_debias=zero_debias)._run(
                                         net, param_init_net, param_info)

        workspace.RunNetOnce(param_init_net)
        workspace.CreateNet(net, overwrite=True)
        for i in range(n_iter):
            workspace.RunNet(net)
            scalars_memory_blob = workspace.FetchBlob("w_scalars_memory")
            g_norm2_avg = scalars_memory_blob[1]
            g_norm2_min_avg = scalars_memory_blob[2]
            g_norm2_max_avg = scalars_memory_blob[3]
            distance_avg = scalars_memory_blob[4]
            g_avg_blob = workspace.FetchBlob("w_g_avg")
            res_lr = workspace.FetchBlob("w_lr_avg")[0]
            res_mu = workspace.FetchBlob("w_mu_avg")[0]
            g_deb = self.deb(g_avg_blob, beta, i + 1, zero_debias)
            variance = max(
                self.deb(g_norm2_avg, beta, i + 1, zero_debias) -
                g_deb.dot(g_deb), epsilon)
            if i > 0:
                caffe2_res[i] = {
                    'h_max':
                    np.exp(self.deb(g_norm2_max_avg, beta, i + 1,
                                    zero_debias)),
                    'h_min':
                    np.exp(self.deb(g_norm2_min_avg, beta, i + 1,
                                    zero_debias)),
                    'var':
                    variance,
                    'dist':
                    self.deb(distance_avg, beta, i + 1, zero_debias),
                    'lr':
                    res_lr,
                    'mu':
                    res_mu
                }
        return caffe2_res
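
The self.deb helper used above is defined elsewhere in the test class; from its call sites it is presumably the standard zero-debias correction for exponential moving averages. A minimal sketch under that assumption:

    # Assumed shape of the zero-debias helper (hypothetical reconstruction,
    # not the original implementation).
    def deb(self, x, beta, i, zero_debias):
        # An EMA m_i = beta * m_{i-1} + (1 - beta) * x_i is biased toward its
        # zero initialization early on; dividing by (1 - beta ** i) corrects it.
        return x / (1.0 - beta ** i) if zero_debias else x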
Exemplo n.º 26
0
    def test_resnet50_core(self):
        N = 2
        warmup = 20
        repeat = 100
        print("Batch size: {}, repeat inference {} times, warmup {} times".
              format(N, repeat, warmup))
        init_net, pred_net, _ = self._get_c2_model('resnet50')
        self._add_head_tail(pred_net, 'real_data', 'real_softmax')
        input_blob_dims = (N, 3, 224, 224)
        input_name = "real_data"

        device_option = core.DeviceOption(caffe2_pb2.CUDA, 0)
        init_net.device_option.CopyFrom(device_option)
        pred_net.device_option.CopyFrom(device_option)
        for op in pred_net.op:
            op.device_option.CopyFrom(device_option)
            op.engine = 'CUDNN'
        net_outputs = pred_net.external_output
        Y_c2 = None
        data = np.random.randn(*input_blob_dims).astype(np.float32)
        c2_time = 1
        workspace.SwitchWorkspace("gpu_test", True)
        with core.DeviceScope(device_option):
            workspace.FeedBlob(input_name, data)
            workspace.RunNetOnce(init_net)
            workspace.CreateNet(pred_net)
            for _ in range(warmup):
                workspace.RunNet(pred_net.name)
            start = time.time()
            for _ in range(repeat):
                workspace.RunNet(pred_net.name)
            end = time.time()
            c2_time = end - start
            output_values = [workspace.FetchBlob(name) for name in net_outputs]
            Y_c2 = namedtupledict('Outputs', net_outputs)(*output_values)
        workspace.ResetWorkspace()

        # Fill the workspace with the weights
        with core.DeviceScope(device_option):
            workspace.RunNetOnce(init_net)

        # Cut the graph
        start = time.time()
        pred_net_cut = transform_caffe2_net(pred_net,
                                            {input_name: input_blob_dims},
                                            build_serializable_op=True)
        del init_net, pred_net
        #_print_net(pred_net_cut)

        Y_trt = None
        input_name = pred_net_cut.external_input[0]
        print("C2 runtime: {}s".format(c2_time))
        with core.DeviceScope(device_option):
            workspace.FeedBlob(input_name, data)
            workspace.CreateNet(pred_net_cut)
            end = time.time()
            print("Conversion time: {:.2f}s".format(end - start))

            for _ in range(warmup):
                workspace.RunNet(pred_net_cut.name)
            start = time.time()
            for _ in range(repeat):
                workspace.RunNet(pred_net_cut.name)
            end = time.time()
            trt_time = end - start
            print("TRT runtime: {}s, improvement: {}%".format(
                trt_time, (c2_time - trt_time) / c2_time * 100))
            output_values = [workspace.FetchBlob(name) for name in net_outputs]
            Y_trt = namedtupledict('Outputs', net_outputs)(*output_values)
        np.testing.assert_allclose(Y_c2, Y_trt, rtol=1e-3)
Exemplo n.º 27
0
    def _run(self, net, param_init_net, param_info):

        # Note: This is the number of persistent scalars in the YellowFin
        #       optimizer. It should always match the number of scalars the
        #       YellowFin operator actually uses.
        SCALARS_MEMORY_SIZE = 5

        param = param_info.blob
        grad = param_info.grad
        moment = param_init_net.ConstantFill([param],
                                             param + "_moment",
                                             value=0.0)
        curv_win = param_init_net.ConstantFill([],
                                               param + "_curv_win",
                                               shape=[self.curv_win_width],
                                               value=0.0)
        g_avg = param_init_net.ConstantFill([param],
                                            param + "_g_avg",
                                            value=0.0)
        g2_avg = param_init_net.ConstantFill([param],
                                             param + "_g2_avg",
                                             value=0.0)
        lr_avg = param_init_net.ConstantFill([],
                                             param + "_lr_avg",
                                             shape=[1],
                                             value=self.alpha)
        mu_avg = param_init_net.ConstantFill([],
                                             param + "_mu_avg",
                                             shape=[1],
                                             value=self.mu)
        scalars_memory = param_init_net.ConstantFill(
            [],
            param + "_scalars_memory",
            shape=[SCALARS_MEMORY_SIZE],
            value=0.0)

        assert self.alpha > 0, "Learning rate alpha must be positive"
        assert not isinstance(grad, core.GradientSlice), \
            "YellowFin does not support sparse gradients"

        if not param_init_net.BlobIsDefined(_OPTIMIZER_ITERATION_NAME):
            # Create the shared iteration counter and the mutex that
            # serializes AtomicIter across parameter updates.
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                iteration = param_init_net.ConstantFill(
                    [],
                    _OPTIMIZER_ITERATION_NAME,
                    shape=[1],
                    value=0,
                    dtype=core.DataType.INT64)
                iter_mutex = param_init_net.CreateMutex([],
                                                        ["iteration_mutex"])
                net.AtomicIter([iter_mutex, iteration], [iteration])
        else:
            iteration = param_init_net.GetBlobRef(_OPTIMIZER_ITERATION_NAME)

        self._aux_params.shared.append(iteration)
        self._aux_params.local.append(moment)
        self._aux_params.local.append(lr_avg)
        self._aux_params.local.append(mu_avg)
        self._aux_params.local.append(curv_win)
        self._aux_params.local.append(g_avg)
        self._aux_params.local.append(g2_avg)
        self._aux_params.local.append(scalars_memory)

        yf_in_out_args = [
            param, moment, lr_avg, mu_avg, curv_win, g_avg, g2_avg,
            scalars_memory
        ]

        net.YellowFin(yf_in_out_args + [grad, iteration],
                      yf_in_out_args,
                      beta=self.beta,
                      epsilon=self.epsilon,
                      curv_win_width=self.curv_win_width,
                      zero_debias=self.zero_debias)
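
Since _run wires up the YellowFin update for a single parameter, a hedged end-to-end sketch may help. The standard entry point is assumed to be optimizer.build_yellowfin, which constructs a YellowFinOptimizer and invokes _run for every (param, grad) pair; the tiny regression model is illustrative:

import numpy as np
from caffe2.python import brew, optimizer, workspace
from caffe2.python.model_helper import ModelHelper

# Illustrative model: a one-layer regression whose FC weight and bias each
# receive the YellowFin update built by _run() above.
model = ModelHelper(name='yf_demo')
fc = brew.fc(model, 'data', 'fc', dim_in=16, dim_out=1)
dist = model.net.SquaredL2Distance([fc, 'label'], 'dist')
loss = model.net.AveragedLoss(dist, 'loss')
model.AddGradientOperators([loss])
# Assumed public entry point; base_learning_rate maps to YellowFin's alpha.
optimizer.build_yellowfin(model, base_learning_rate=0.1)

workspace.FeedBlob('data', np.random.rand(4, 16).astype(np.float32))
workspace.FeedBlob('label', np.random.rand(4, 1).astype(np.float32))
workspace.RunNetOnce(model.param_init_net)
workspace.CreateNet(model.net)
workspace.RunNet(model.net)
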
Example no. 28
def _AllReduceBlobsSingleHost(blob_names, devices, model, net, use_nccl):
    """Performs NCCL AllReduce to distribute blobs to all the GPUs."""

    if len(devices) == 1:
        return

    # Allreduce the blobs across all GPUs, using the first device in
    # `devices` as the master.
    master_device_opt = core.DeviceOption(model._device_type, devices[0])
    last_out = None
    concatenated_idx = set()

    for blob_name in blob_names:
        # Group by blob_name for reduce.
        blobs_group = list(viewvalues(model._device_grouped_blobs[blob_name]))
        assert len(blobs_group) == len(devices), \
            "Each GPU from {} should have a copy of {}.".format(
                devices, blob_name)

        if _IsGPUBlob(model, blob_name):
            with core.DeviceScope(master_device_opt):
                if not isinstance(blobs_group[0], core.GradientSlice):
                    _AllReduce(
                        devices, model, net, blob_name, use_nccl, last_out
                    )
                    # last_out serializes the execution of the NCCL ops
                    last_out = blobs_group[0]

                else:
                    # Sparse gradients: all-gather for indices and values
                    master_ns = "{}_{}".format(model._device_prefix, devices[0])
                    # Skip if we have already copied the concatenated indices
                    # to the indices of this GradientSlice. This happens when
                    # two or more grad blobs are gathered with the same
                    # indices blob.
                    skip_idx_concat = any(
                        g.indices in concatenated_idx for g in blobs_group
                    )

                    if not skip_idx_concat:
                        grad_idx_concat, _ = net.Concat(
                            [g.indices for g in blobs_group],
                            ["{}/{}_index_concat".format(master_ns, blob_name),
                             "{}/{}_index_splitinfo".format(master_ns, blob_name)],
                            axis=0,
                            name="note:data_parallel_model")

                        for gpu, g in viewitems(model._device_grouped_blobs[blob_name]):
                            device_opt = core.DeviceOption(model._device_type, gpu)
                            with core.DeviceScope(device_opt):
                                model.Copy(grad_idx_concat, g.indices)
                                concatenated_idx.add(g.indices)

                    grad_val_concat, _ = net.Concat(
                        [g.values for g in blobs_group],
                        ["{}/{}_val_concat".format(master_ns, blob_name),
                         "{}/{}_val_splitinfo".format(master_ns, blob_name)],
                        axis=0, name="note:data_parallel_model")

                    for gpu, g in viewitems(model._device_grouped_blobs[blob_name]):
                        device_opt = core.DeviceOption(model._device_type, gpu)
                        with core.DeviceScope(device_opt):
                            model.Copy(grad_val_concat, g.values)

        else:
            assert not isinstance(blobs_group[0], core.GradientSlice), \
                "Synchronizing gradient slices not supported"
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                # Poor man's allreduce
                model.net.Sum(blobs_group, [blobs_group[0]])
                _Broadcast(devices, model, model.net, blob_name)
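
Callers normally do not invoke _AllReduceBlobsSingleHost directly; it runs inside data_parallel_model's parallelization pass. Below is a hedged sketch of that public flow with an illustrative two-GPU regression model; the builder contents are assumptions, not from the source:

from caffe2.python import brew, data_parallel_model
from caffe2.python.model_helper import ModelHelper

def input_builder_fun(model):
    pass  # assume 'data' and 'label' are fed per device elsewhere

def forward_pass_builder_fun(model, loss_scale):
    fc = brew.fc(model, 'data', 'fc', dim_in=16, dim_out=1)
    dist = model.net.SquaredL2Distance([fc, 'label'], 'dist')
    loss = model.net.AveragedLoss(dist, 'loss')
    # scale the loss so gradients average correctly across devices
    loss = model.Scale(loss, scale=loss_scale)
    return [loss]

def param_update_builder_fun(model):
    lr = model.param_init_net.ConstantFill([], 'lr', shape=[1], value=-0.01)
    one = model.param_init_net.ConstantFill([], 'one', shape=[1], value=1.0)
    for param in model.GetParams():
        grad = model.param_to_grad[param]
        # plain SGD step: param += lr * grad
        model.WeightedSum([param, one, grad, lr], param)

model = ModelHelper(name='dp_demo')
data_parallel_model.Parallelize_GPU(
    model,
    input_builder_fun=input_builder_fun,
    forward_pass_builder_fun=forward_pass_builder_fun,
    param_update_builder_fun=param_update_builder_fun,
    devices=[0, 1],   # each param's gradient is allreduced across these GPUs
)
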
Example no. 29
        (nbatches, lT) = dc.generate_output_data()

    ### construct the neural network specified above ###
    print("Trying to initialize DLRM")
    # Open the flash-storage I/O library via ctypes (requires
    # `from ctypes import cdll`) and set up an LRU cache with global
    # hit/miss counters; the constructor matches the lru-dict package.
    libFlashRec = cdll.LoadLibrary("./libflashrec.so")
    libFlashRec.open_unvme()
    lru = LRU(1000)
    global hits
    global misses
    hits = 0
    misses = 0
    print("libFlashRec opened")
    load_instances = 8
    run_instances = 1
    dlrm_run_instances = []
    with core.DeviceScope(device_opt):
        for _ in range(run_instances):  # `xrange` in the original Python 2 code
            dlrm_run_instances.append(DLRM_Net(args, libFlashRec=libFlashRec))
    print("Initialized DLRM Net")
    for dlrm in dlrm_run_instances:
        dlrm.create(lX[0], lS_l[0], lS_i[0], lT[0])
    print("Created network")

    total_time = 0
    dload_time = 0
    k = 0

    time_start = time.time()

    print("Running networks")
    def stage_run_dlrm(dlrm, run_q, stop):
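
The listing cuts off here, so the body of stage_run_dlrm is not shown and is not reconstructed. Purely as a hedged sketch of the worker pattern its signature suggests, with every name and method below an assumption rather than the author's code:

import queue

def stage_run_dlrm_sketch(dlrm, run_q, stop):
    # Hypothetical queue-fed worker: consume batch indices until `stop`
    # (assumed to be a threading.Event) is set.
    while not stop.is_set():
        try:
            j = run_q.get(timeout=0.1)   # next batch index to run
        except queue.Empty:
            continue
        dlrm.run(j)                      # assumed per-batch entry point
        run_q.task_done()
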
Example no. 30
def build_embedding_encoder(
    model,
    encoder_params,
    num_decoder_layers,
    inputs,
    input_lengths,
    vocab_size,
    embeddings,
    embedding_size,
    use_attention,
    num_gpus=0,
    forward_only=False,
    scope=None,
):
    with core.NameScope(scope or ''):
        if num_gpus == 0:
            embedded_encoder_inputs = model.net.Gather(
                [embeddings, inputs],
                ['embedded_encoder_inputs'],
            )
        else:
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                embedded_encoder_inputs_cpu = model.net.Gather(
                    [embeddings, inputs],
                    ['embedded_encoder_inputs_cpu'],
                )
            embedded_encoder_inputs = model.CopyCPUToGPU(
                embedded_encoder_inputs_cpu,
                'embedded_encoder_inputs',
            )

    layer_inputs = embedded_encoder_inputs
    layer_input_size = embedding_size
    encoder_units_per_layer = []
    final_encoder_hidden_states = []
    final_encoder_cell_states = []

    num_encoder_layers = len(encoder_params['encoder_layer_configs'])
    use_bidirectional_encoder = encoder_params.get(
        'use_bidirectional_encoder',
        False,
    )

    for i, layer_config in enumerate(encoder_params['encoder_layer_configs']):

        if use_bidirectional_encoder and i == 0:
            layer_func = rnn_bidirectional_layer
            output_dims = 2 * layer_config['num_units']
        else:
            layer_func = rnn_unidirectional_layer
            output_dims = layer_config['num_units']
        encoder_units_per_layer.append(output_dims)

        is_final_layer = (i == num_encoder_layers - 1)

        dropout_keep_prob = layer_config.get(
            'dropout_keep_prob',
            None,
        )

        return_final_state = i >= (num_encoder_layers - num_decoder_layers)
        (
            layer_outputs,
            final_layer_hidden_state,
            final_layer_cell_state,
        ) = layer_func(
            model=model,
            inputs=layer_inputs,
            input_lengths=input_lengths,
            input_size=layer_input_size,
            num_units=layer_config['num_units'],
            dropout_keep_prob=dropout_keep_prob,
            forward_only=forward_only,
            return_sequence_output=(not is_final_layer) or use_attention,
            return_final_state=return_final_state,
            scope=get_layer_scope(scope, 'encoder', i),
        )

        if not is_final_layer:
            layer_inputs = layer_outputs
            layer_input_size = output_dims
        final_encoder_hidden_states.append(final_layer_hidden_state)
        final_encoder_cell_states.append(final_layer_cell_state)

    encoder_outputs = layer_outputs
    weighted_encoder_outputs = None

    return (
        encoder_outputs,
        weighted_encoder_outputs,
        final_encoder_hidden_states,
        final_encoder_cell_states,
        encoder_units_per_layer,
    )
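
A hedged usage sketch for build_embedding_encoder follows. The encoder_params dict and blob names are illustrative, and a plain ModelHelper stands in for whatever seq2seq model helper the real trainer uses:

from caffe2.python.model_helper import ModelHelper

model = ModelHelper(name='seq2seq_encoder_demo')
vocab_size, embedding_size = 50000, 256
# hypothetical embedding table; in a real model this is a trained parameter
embeddings = model.param_init_net.GaussianFill(
    [], 'embeddings', shape=[vocab_size, embedding_size], std=0.01)

encoder_params = {
    'use_bidirectional_encoder': True,
    'encoder_layer_configs': [
        {'num_units': 256, 'dropout_keep_prob': 0.8},  # bidirectional: 512-d out
        {'num_units': 512},
    ],
}

(
    encoder_outputs,
    weighted_encoder_outputs,
    final_encoder_hidden_states,
    final_encoder_cell_states,
    encoder_units_per_layer,
) = build_embedding_encoder(
    model=model,
    encoder_params=encoder_params,
    num_decoder_layers=2,
    inputs='encoder_inputs',        # token-id blob, assumed fed elsewhere
    input_lengths='encoder_lengths',
    vocab_size=vocab_size,
    embeddings=embeddings,
    embedding_size=embedding_size,
    use_attention=True,
    num_gpus=0,
    forward_only=False,
    scope='seq2seq',
)
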