def _test_moe(args, hidden_dim, ep_size, tp_size, enable_expert_tp, use_residual):
    # TODO: replace this with a true parallel mlp in the future
    # and run convergence tests
    tensor_parallel_expert = torch.nn.Sequential(
        torch.nn.Linear(hidden_dim, 4 * hidden_dim // tp_size),
        torch.nn.ReLU(),
        torch.nn.Linear(4 * hidden_dim // tp_size, hidden_dim))

    # set num experts to world size
    world_size = deepspeed.comm.get_world_size()
    model = MoE(
        hidden_size=hidden_dim,
        expert=tensor_parallel_expert,
        num_experts=world_size,
        ep_size=ep_size,
        use_residual=use_residual,
        enable_expert_tensor_parallelism=enable_expert_tp,
    )
    optimizer = torch.optim.AdamW(params=model.parameters())
    model, _, _, _ = deepspeed.initialize(args=args,
                                          model=model,
                                          optimizer=optimizer,
                                          dist_init_required=False,
                                          mpu=MPU(tp_size))

    assert model.num_local_experts == world_size // ep_size
    if enable_expert_tp:
        assert deepspeed.utils.groups._get_expert_model_parallel_world_size() == tp_size
    else:
        assert deepspeed.utils.groups._get_expert_model_parallel_world_size() == 1
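# The helper above hands deepspeed.initialize an `mpu` object for tensor
# parallelism. Below is a minimal sketch of the kind of object that call
# assumes -- not necessarily the test suite's actual MPU class -- built only on
# the get_{model,data}_parallel_{rank,group,world_size}() interface that
# deepspeed.initialize documents. The group layout (contiguous ranks per
# tensor-parallel group) is an assumption and requires world_size % tp_size == 0.
import deepspeed


class MPU:
    def __init__(self, tp_size):
        self.tp_size = tp_size
        self.rank = deepspeed.comm.get_rank()
        self.world_size = deepspeed.comm.get_world_size()
        # Contiguous ranks share a tensor-parallel group ...
        for start in range(0, self.world_size, tp_size):
            ranks = list(range(start, start + tp_size))
            group = deepspeed.comm.new_group(ranks)
            if self.rank in ranks:
                self.tp_group = group
        # ... and strided ranks share the matching data-parallel group.
        for offset in range(tp_size):
            ranks = list(range(offset, self.world_size, tp_size))
            group = deepspeed.comm.new_group(ranks)
            if self.rank in ranks:
                self.dp_group = group

    def get_model_parallel_rank(self):
        return self.rank % self.tp_size

    def get_model_parallel_world_size(self):
        return self.tp_size

    def get_model_parallel_group(self):
        return self.tp_group

    def get_data_parallel_rank(self):
        return self.rank // self.tp_size

    def get_data_parallel_world_size(self):
        return self.world_size // self.tp_size

    def get_data_parallel_group(self):
        return self.dp_group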
def __init__(self,
             hidden_size,
             output_dropout_prob,
             init_method,
             output_layer_init_method=None,
             num_experts=1):
    super(GPT2ParallelMLPMoE, self).__init__()
    # Set output layer initialization if not provided.
    if output_layer_init_method is None:
        output_layer_init_method = init_method
    self.experts = GPT2ParallelMLPExperts(
        hidden_size,
        init_method,
        output_layer_init_method=output_layer_init_method,
        num_experts=num_experts)
    self.MoE = MoE(
        hidden_size,
        num_experts=num_experts,
        # in top_2 gating, policy for whether to use a second-place expert:
        # all (always) | none (never) | threshold (if gate value > the given threshold)
        # | random (if gate value > threshold * random_uniform(0, 1))
        second_policy_train='random',
        second_policy_eval='random',
        second_threshold_train=0.2,
        second_threshold_eval=0.2,
        # experts have fixed capacity per batch; we need some extra capacity in
        # case gating is not perfectly balanced. capacity_factor_* should be >= 1.
        capacity_factor_train=1.25,
        capacity_factor_eval=2.,
        # multiplier on the auxiliary expert load-balancing loss
        loss_coef=1e-2,
        experts=self.experts)
    self.dropout = torch.nn.Dropout(output_dropout_prob)
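# Hedged sketch of a matching forward() for GPT2ParallelMLPMoE. It assumes the
# gated MoE layer above returns a (hidden_states, aux_loss) pair with the
# balancing loss already scaled by loss_coef; that return convention and the
# `moe_loss` name are assumptions, not taken from the original code.
def forward(self, hidden_states):
    # Route each token through the top-k gated experts.
    output, moe_loss = self.MoE(hidden_states)
    output = self.dropout(output)
    # Return the auxiliary loss so the caller can add it to the LM loss.
    return output, moe_loss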
def __init__(self, hidden_dim, num_experts=4):
    super(SimpleMoEModel, self).__init__()
    self.linear = torch.nn.Linear(hidden_dim, hidden_dim)
    linear2 = torch.nn.Linear(hidden_dim, hidden_dim)
    self.linear2 = MoE(hidden_size=hidden_dim,
                       expert=linear2,
                       num_experts=num_experts,
                       k=1)
    self.cross_entropy_loss = torch.nn.CrossEntropyLoss()
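# A minimal sketch of how a forward pass for SimpleMoEModel could look. It
# assumes inputs of shape (batch, seq, hidden_dim) and integer class labels,
# and relies on deepspeed.moe.layer.MoE returning an (output, l_aux,
# exp_counts) triple; the residual and mean-pooling choices are illustrative,
# not taken from the original model.
def forward(self, x, y):
    hidden = self.linear(x)
    # The MoE wrapper returns the routed output plus the auxiliary
    # load-balancing loss and per-expert token counts.
    output, _l_aux, _exp_counts = self.linear2(hidden)
    hidden = x + output                      # residual connection
    sentence_embed = hidden.mean(1)          # pool over the sequence dimension
    return self.cross_entropy_loss(sentence_embed, y)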
def __init__(self, hidden_dim, num_experts=2, ep_size=1, use_residual=False):
    super(SimplePRMoEModel, self).__init__()
    self.linear = torch.nn.Linear(hidden_dim, hidden_dim)
    linear2 = torch.nn.Linear(hidden_dim, hidden_dim)
    self.linear2 = MoE(hidden_size=hidden_dim,
                       expert=linear2,
                       ep_size=ep_size,
                       use_residual=use_residual,
                       num_experts=num_experts,
                       k=1)
    linear3 = torch.nn.Linear(hidden_dim, hidden_dim)
    self.linear3 = MoE(hidden_size=hidden_dim,
                       expert=linear3,
                       ep_size=ep_size,
                       use_residual=use_residual,
                       num_experts=int(2 * num_experts),
                       k=1)
    self.cross_entropy_loss = torch.nn.CrossEntropyLoss()
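# Because the pyramid model above holds MoE layers with different expert
# counts, their expert parameters are typically placed into separate optimizer
# groups before deepspeed.initialize so that ZeRO can partition them within
# their expert-parallel data groups. A hedged sketch using DeepSpeed's MoE
# parameter grouping utility; the wrapper name create_moe_param_groups and the
# usage shown in the comment are assumptions.
from deepspeed.moe.utils import split_params_into_different_moe_groups_for_optimizer


def create_moe_param_groups(model):
    # Start from a single group holding every parameter, then let DeepSpeed
    # split out one group per MoE expert-parallel configuration.
    parameters = {'params': [p for p in model.parameters()], 'name': 'parameters'}
    return split_params_into_different_moe_groups_for_optimizer(parameters)


# Usage sketch:
#   model = SimplePRMoEModel(hidden_dim=128, num_experts=2, ep_size=2)
#   optimizer = torch.optim.AdamW(create_moe_param_groups(model))
#   model, optimizer, _, _ = deepspeed.initialize(args=args, model=model,
#                                                 optimizer=optimizer)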
def __init__(self,
             hidden_size,
             num_attention_heads,
             attention_dropout_prob,
             output_dropout_prob,
             layernorm_epsilon,
             init_method,
             output_layer_init_method=None,
             init_method0=init.xavier_normal_,
             num_experts=1):
    super(BertParallelTransformerLayer, self).__init__()

    # Self attention.
    self.num_experts = num_experts
    self.attention = BertParallelSelfAttention(hidden_size,
                                               num_attention_heads,
                                               attention_dropout_prob,
                                               output_parallel=True,
                                               init_method=init_method0)
    # Self attention output.
    self.self_output = BertParallelTransformerOutput(
        hidden_size,
        hidden_size,
        output_dropout_prob,
        layernorm_epsilon=layernorm_epsilon,
        input_is_parallel=True,
        init_method=init_method0)
    self.layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)

    # Intermediate MLP: dense for a single expert, DeepSpeed MoE otherwise.
    if num_experts == 1:
        self.mlp = BertParallelMLP(
            hidden_size,
            output_dropout_prob,
            init_method,
            output_layer_init_method=output_layer_init_method)
    else:
        from deepspeed.moe.layer import MoE
        import mpu
        # Use the DeepSpeed API for the MoE layer and experts
        # -- sharding, comm. and parameter handling are done inside DeepSpeed.
        # Note: MoE takes the expert module as its second argument.
        self.mlp = MoE(
            hidden_size,
            expert=BertParallelMLP(
                hidden_size,
                output_dropout_prob,
                init_method,
                output_layer_init_method=output_layer_init_method),
            num_experts=num_experts)
def __init__(self,
             hidden_size,
             num_attention_heads,
             attention_dropout_prob,
             output_dropout_prob,
             layernorm_epsilon,
             init_method,
             output_layer_init_method=None,
             num_experts=1):
    super(GPT2ParallelTransformerLayer, self).__init__()
    # Set output layer initialization if not provided.
    if output_layer_init_method is None:
        output_layer_init_method = init_method

    # Layernorm on the input data.
    self.input_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)

    # Self attention.
    self.attention = GPT2ParallelSelfAttention(
        hidden_size,
        num_attention_heads,
        attention_dropout_prob,
        output_dropout_prob,
        init_method,
        output_layer_init_method=output_layer_init_method)

    # Layernorm on the attention output.
    self.post_attention_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)

    # MLP: dense for a single expert, DeepSpeed MoE otherwise.
    if num_experts == 1:
        self.mlp = GPT2ParallelMLP(
            hidden_size,
            output_dropout_prob,
            init_method,
            output_layer_init_method=output_layer_init_method)
    else:
        from deepspeed.moe.layer import MoE
        # Use the DeepSpeed API for the MoE layer and experts
        # -- sharding, comm. and parameter handling are done inside DeepSpeed.
        # Note: MoE takes the expert module as its second argument.
        self.mlp = MoE(
            hidden_size,
            expert=GPT2ParallelMLP(
                hidden_size,
                output_dropout_prob,
                init_method,
                output_layer_init_method=output_layer_init_method),
            num_experts=num_experts)
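# Illustrative sketch of constructing the layer above with either the dense or
# the MoE MLP path. The scaled output-layer init (std / sqrt(2 * num_layers))
# mirrors the usual GPT-2/Megatron convention; the helper names and the
# concrete hyperparameter values are assumptions, not from the original code.
import math
import torch.nn.init as init


def normal_init_method(std=0.02):
    def init_(tensor):
        return init.normal_(tensor, mean=0.0, std=std)
    return init_


def scaled_init_method(std, num_layers):
    # Output projections are commonly initialized with std / sqrt(2 * num_layers).
    return normal_init_method(std / math.sqrt(2.0 * num_layers))


def build_layer(hidden_size, num_heads, num_layers, num_experts=1):
    return GPT2ParallelTransformerLayer(
        hidden_size=hidden_size,
        num_attention_heads=num_heads,
        attention_dropout_prob=0.1,
        output_dropout_prob=0.1,
        layernorm_epsilon=1e-5,
        init_method=normal_init_method(0.02),
        output_layer_init_method=scaled_init_method(0.02, num_layers),
        num_experts=num_experts)  # num_experts > 1 selects the DeepSpeed MoE path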