Example #1
    def _test_moe(args, hidden_dim, ep_size, tp_size, enable_expert_tp, use_residual):

        # TODO: replace this with a true parallel mlp in the future
        # and run convergence tests

        tensor_parallel_expert = torch.nn.Sequential(
            torch.nn.Linear(hidden_dim,
                            4 * hidden_dim // tp_size),
            torch.nn.ReLU(),
            torch.nn.Linear(4 * hidden_dim // tp_size,
                            hidden_dim))

        # set num experts to world size
        world_size = deepspeed.comm.get_world_size()
        model = MoE(
            hidden_size=hidden_dim,
            expert=tensor_parallel_expert,
            num_experts=world_size,
            ep_size=ep_size,
            use_residual=use_residual,
            enable_expert_tensor_parallelism=enable_expert_tp,
        )
        optimizer = torch.optim.AdamW(params=model.parameters())
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              optimizer=optimizer,
                                              dist_init_required=False,
                                              mpu=MPU(tp_size))

        assert model.num_local_experts == world_size // ep_size
        if enable_expert_tp:
            assert deepspeed.utils.groups._get_expert_model_parallel_world_size() == tp_size
        else:
            assert deepspeed.utils.groups._get_expert_model_parallel_world_size() == 1
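
The helper above is not self-contained: it relies on the MPU class from the surrounding test file and on an already-initialized distributed environment. The driver below is a hypothetical sketch of how it might be invoked; the config path and all sizes are illustrative assumptions, not part of the original test.

import argparse

import deepspeed

# Hypothetical smoke-test driver (all values are illustrative assumptions).
def run_moe_smoke_test():
    deepspeed.init_distributed()  # distributed must be up, since dist_init_required=False above
    args = argparse.Namespace(local_rank=0,
                              deepspeed_config="ds_config.json")  # assumed DeepSpeed JSON config
    _test_moe(args,
              hidden_dim=16,
              ep_size=2,               # expert-parallel degree; must divide the world size
              tp_size=1,
              enable_expert_tp=False,
              use_residual=False)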
Example #2
    def __init__(self, hidden_size, output_dropout_prob, init_method,
                 output_layer_init_method=None, num_experts=1):
        super(GPT2ParallelMLPMoE, self).__init__()
        # Set output layer initialization if not provided.
        if output_layer_init_method is None:
            output_layer_init_method = init_method

        self.experts = GPT2ParallelMLPExperts(hidden_size,
                                              init_method,
                                              output_layer_init_method=output_layer_init_method,
                                              num_experts=num_experts)

        self.MoE = MoE(
            hidden_size,
            num_experts=num_experts,
            second_policy_train='random',  # in top_2 gating, policy for whether to use a second-place expert
            second_policy_eval='random',   # all (always) | none (never) | threshold (if gate value > the given threshold) | random (if gate value > threshold * random_uniform(0, 1))
            second_threshold_train=0.2,
            second_threshold_eval=0.2,
            capacity_factor_train=1.25,    # experts have fixed capacity per batch; some extra capacity is needed in case gating is not perfectly balanced
            capacity_factor_eval=2.,       # capacity_factor_* should be set to a value >= 1
            loss_coef=1e-2,                # multiplier on the auxiliary expert load-balancing loss
            experts=self.experts)

        self.dropout = torch.nn.Dropout(output_dropout_prob)
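
The example stops at the constructor. The forward pass below is an assumed sketch, not original code: the (output, aux_loss) unpacking is inferred from the top_2-gating and loss_coef comments above and may differ for other MoE variants.

    # Assumed forward pass (not in the original source).
    def forward(self, hidden_states):
        moe_output, moe_loss = self.MoE(hidden_states)   # assumption: (output, aux_loss)
        output = self.dropout(moe_output)
        # The caller is expected to add moe_loss (already scaled by loss_coef)
        # to the training loss.
        return output, moe_loss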
Example #3
    def __init__(self, hidden_dim, num_experts=4):
        super(SimpleMoEModel, self).__init__()
        self.linear = torch.nn.Linear(hidden_dim, hidden_dim)
        linear2 = torch.nn.Linear(hidden_dim, hidden_dim)
        self.linear2 = MoE(hidden_size=hidden_dim,
                           expert=linear2,
                           num_experts=num_experts,
                           k=1)
        self.cross_entropy_loss = torch.nn.CrossEntropyLoss()
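
DeepSpeed's MoE layer returns a tuple rather than a plain tensor, so a forward pass over this model has to unpack it. The forward below is an illustrative sketch and assumes the common (output, aux_loss, expert_counts) return.

    # Illustrative forward pass (assumed, not part of the original snippet).
    def forward(self, x, y):
        hidden = self.linear(x)
        hidden, _, _ = self.linear2(hidden)  # MoE returns (output, aux_loss, expert_counts)
        return self.cross_entropy_loss(hidden, y)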
Example #4
    def __init__(self, hidden_dim, num_experts=2, ep_size=1, use_residual=False):
        super(SimplePRMoEModel, self).__init__()
        self.linear = torch.nn.Linear(hidden_dim, hidden_dim)
        linear2 = torch.nn.Linear(hidden_dim, hidden_dim)
        self.linear2 = MoE(hidden_size=hidden_dim,
                           expert=linear2,
                           ep_size=ep_size,
                           use_residual=use_residual,
                           num_experts=num_experts,
                           k=1)
        linear3 = torch.nn.Linear(hidden_dim, hidden_dim)
        self.linear3 = MoE(hidden_size=hidden_dim,
                           expert=linear3,
                           ep_size=ep_size,
                           use_residual=use_residual,
                           num_experts=int(2 * num_experts),
                           k=1)
        self.cross_entropy_loss = torch.nn.CrossEntropyLoss()
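
The same unpacking applies here, with the input routed first through the smaller MoE layer and then through the one with twice as many experts. Again, this forward pass is an assumed sketch rather than original code.

    # Assumed forward pass chaining both MoE layers (illustrative only).
    def forward(self, x, y):
        hidden = self.linear(x)
        hidden, _, _ = self.linear2(hidden)  # num_experts experts
        hidden, _, _ = self.linear3(hidden)  # 2 * num_experts experts
        return self.cross_entropy_loss(hidden, y)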
Example #5
    def __init__(self,
                 hidden_size,
                 num_attention_heads,
                 attention_dropout_prob,
                 output_dropout_prob,
                 layernorm_epsilon,
                 init_method,
                 output_layer_init_method=None,
                 init_method0=init.xavier_normal_,
                 num_experts=1):
        super(BertParallelTransformerLayer, self).__init__()

        # Self attention.
        self.num_experts = num_experts
        self.attention = BertParallelSelfAttention(hidden_size,
                                                   num_attention_heads,
                                                   attention_dropout_prob,
                                                   output_parallel=True,
                                                   init_method=init_method0)
        # Self attention output.
        self.self_output = BertParallelTransformerOutput(
            hidden_size,
            hidden_size,
            output_dropout_prob,
            layernorm_epsilon=layernorm_epsilon,
            input_is_parallel=True,
            init_method=init_method0)

        self.layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)
        # Intermediate.
        # MLP
        if num_experts == 1:
            self.mlp = BertParallelMLP(
                hidden_size,
                output_dropout_prob,
                init_method,
                output_layer_init_method=output_layer_init_method)
        else:
            from deepspeed.moe.layer import MoE
            import mpu
            # Use the DeepSpeed API to use MoE layer and experts.
            # -- sharding, comm. and parameter handling will be done inside DeepSpeed
            self.mlp = MoE(
                hidden_size,
                output_dropout_prob,
                BertParallelMLP(
                    hidden_size,
                    output_dropout_prob,
                    init_method,
                    output_layer_init_method=output_layer_init_method),
                num_experts=num_experts)
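
Because self.mlp is a plain BertParallelMLP when num_experts == 1 but an MoE wrapper otherwise, the two branches return different types. The helper below is a hedged sketch of how a forward path might reconcile them; the helper name and the tuple arity are assumptions and depend on the DeepSpeed version in use.

    # Hypothetical helper (not in the original file): unify the MLP/MoE return types.
    def _run_mlp(self, layernorm_output):
        if self.num_experts == 1:
            return self.mlp(layernorm_output), None   # dense MLP: plain tensor, no aux loss
        # MoE wrapper: returns a tuple, commonly (output, aux_loss, expert_counts),
        # but the exact arity depends on the DeepSpeed version in use.
        mlp_output, moe_loss, _ = self.mlp(layernorm_output)
        return mlp_output, moe_loss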
Example #6
    def __init__(self,
                 hidden_size,
                 num_attention_heads,
                 attention_dropout_prob,
                 output_dropout_prob,
                 layernorm_epsilon,
                 init_method,
                 output_layer_init_method=None,
                 num_experts=1):
        super(GPT2ParallelTransformerLayer, self).__init__()
        # Set output layer initialization if not provided.
        if output_layer_init_method is None:
            output_layer_init_method = init_method

        # Layernorm on the input data.
        self.input_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)

        # Self attention.
        self.attention = GPT2ParallelSelfAttention(
            hidden_size,
            num_attention_heads,
            attention_dropout_prob,
            output_dropout_prob,
            init_method,
            output_layer_init_method=output_layer_init_method)

        # Layernorm on the attention output.
        self.post_attention_layernorm = LayerNorm(hidden_size,
                                                  eps=layernorm_epsilon)

        # MLP
        if num_experts == 1:
            self.mlp = GPT2ParallelMLP(
                hidden_size,
                output_dropout_prob,
                init_method,
                output_layer_init_method=output_layer_init_method)
        else:
            from deepspeed.moe.layer import MoE
            # Use the DeepSpeed API to use MoE layer and experts.
            # -- sharding, comm. and parameter handling will be done inside DeepSpeed
            self.mlp = MoE(
                hidden_size,
                output_dropout_prob,
                GPT2ParallelMLP(
                    hidden_size,
                    output_dropout_prob,
                    init_method,
                    output_layer_init_method=output_layer_init_method),
                num_experts=num_experts)
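
When a model built from such layers is handed to DeepSpeed, the expert parameters are usually placed into their own optimizer groups so that expert parallelism can be handled correctly. The sketch below follows the pattern from DeepSpeed's MoE examples; the function, model, and config names introduced here are illustrative assumptions.

import deepspeed
from deepspeed.moe.utils import split_params_into_different_moe_groups_for_optimizer

# Illustrative setup (model and config are assumptions supplied by the caller).
def initialize_moe_engine(model, ds_config):
    param_group = {'params': [p for p in model.parameters()], 'name': 'parameters'}
    param_groups = split_params_into_different_moe_groups_for_optimizer(param_group)
    model_engine, optimizer, _, _ = deepspeed.initialize(model=model,
                                                         model_parameters=param_groups,
                                                         config=ds_config)
    return model_engine, optimizer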