def call(self, inputs, training):
    """
    Forward pass of the One-Gate Mixture of Experts model.

    Parameters
    ----------
    inputs: np.array or tf.Tensor
        Input to the model
    training: bool
        True during training, False otherwise

    Returns
    -------
    outputs: list of tf.Tensor
        Outputs of forward pass for each task
    """
    outputs = []
    # optional shared bottom transformation applied before the MOE layer;
    # forward `training` only if the layer's signature accepts it
    if self.base_layer:
        if has_arg(self.base_layer, "training"):
            inputs = self.base_layer(inputs, training)
        else:
            inputs = self.base_layer(inputs)
    # One-gate model shares a single MOE layer across all tasks. The
    # previous code evaluated every layer in self.moe_layers and discarded
    # all results but the first; call only the first layer directly.
    moe = self.moe_layers[0](inputs, training)
    # each task head consumes the same shared MOE output
    for task in self.task_layers:
        if has_arg(task, "training"):
            outputs.append(task(moe, training))
        else:
            outputs.append(task(moe))
    return outputs
def call(self, inputs, training):
    """
    Forward pass of the Multi-Gate Mixture of Experts model.

    Parameters
    ----------
    inputs: np.array or tf.Tensor
        Input to the model
    training: bool
        If True runs model in training mode, otherwise in prediction mode.

    Returns
    -------
    outputs: list of tf.Tensor
        Outputs of forward pass for each task
    """
    # optional shared bottom transformation; pass `training` only when the
    # layer's call signature accepts it
    if self.base_layer:
        inputs = (
            self.base_layer(inputs, training)
            if has_arg(self.base_layer, "training")
            else self.base_layer(inputs)
        )
    # one MOE layer (one gate) per task
    moe_outputs = [moe_layer(inputs, training) for moe_layer in self.moe_layers]
    # route each gated mixture to its corresponding task head
    outputs = []
    for task_layer, moe_out in zip(self.task_layers, moe_outputs):
        if has_arg(task_layer, "training"):
            outputs.append(task_layer(moe_out, training))
        else:
            outputs.append(task_layer(moe_out))
    return outputs
def test_has_arg(self):
    """has_arg reports whether a callable's signature names a parameter."""
    def diff(a, b):
        return a - b

    def total(a=2, b=3):
        return a + b

    # parameters present in the signature are detected, with or without defaults
    self.assertTrue(has_arg(diff, "a"))
    self.assertTrue(has_arg(total, "b"))
    # absent parameter names are rejected
    self.assertFalse(has_arg(total, "c"))
    self.assertFalse(has_arg(diff, "z"))
def call(self, inputs, training):
    """
    Defines the set of computations performed in the MOE layer.

    The MOE layer can accept a single tensor (in which case the same input
    is fed to every expert) or a collection/sequence of tensors (in which
    case each tensor corresponds to its own expert).

    Parameters
    ----------
    inputs: np.array, tf.Tensor, list/tuple of np.arrays or tf.Tensors
        Inputs to the MOE layer
    training: bool
        True if layer is called in training mode, False otherwise

    Returns
    -------
    moe_output: tf.Tensor
        Output of the mixture of experts layer (linearly weighted output
        of the expert layers).
    """
    # Run every expert; forward the `training` flag only to experts whose
    # call signature accepts it.
    expert_outputs = []
    for expert in self.expert_layers:
        if has_arg(expert, "training"):
            expert_outputs.append(expert(inputs, training))
        else:
            expert_outputs.append(expert(inputs))

    # Gating network: probability of utilizing each expert (degree of
    # expert utilization) for the given inputs.
    gate_inputs = inputs
    if self.base_expert_prob_layer:
        gate_inputs = self.base_expert_prob_layer(gate_inputs)
    utilization_probs = self.expert_probs(gate_inputs)
    if self.add_dropout:
        utilization_probs = self.drop_expert_layer(utilization_probs, training)

    # Linearly combine expert outputs, weighting each expert by its
    # utilization probability (sum starts from int 0, matching the
    # original accumulator).
    moe_output = sum(
        out * tf.expand_dims(utilization_probs[:, idx], axis=-1)
        for idx, out in enumerate(expert_outputs)
    )
    return moe_output
def call(self, inputs, training):
    """
    Forward pass through the constraining layer.

    The layer can accept a single tensor (in which case the same input is
    fed to every sublayer) or a collection/sequence of tensors (in which
    case each tensor corresponds to its own sublayer).

    Parameters
    ----------
    inputs: tf.Tensor, np.array or list/tuple of tf.Tensors/np.arrays
        Input tensor(s)
    training: bool
        True in case of training, False otherwise

    Returns
    -------
    outputs: list
        Output of each sublayer (one entry per layer in self.layers).
    """
    # The original code used two different container checks in the two
    # branches (`isinstance(inputs, Sequence)` vs `isinstance(inputs,
    # (List, Tuple))`), so the "per-layer inputs" decision could differ
    # depending on whether the sublayer accepted a `training` argument.
    # Decide once, then use the same per-layer input in both branches.
    per_layer_inputs = isinstance(inputs, Sequence)
    outputs = [None] * len(self.layers)
    for i, layer in enumerate(self.layers):
        layer_input = inputs[i] if per_layer_inputs else inputs
        if has_arg(layer, "training"):
            outputs[i] = layer(layer_input, training)
        else:
            outputs[i] = layer(layer_input)

    # get all trainable variables from every column of the MTL model
    trainable_vars = [layer.trainable_variables for layer in self.layers]

    # soft parameter sharing: penalize the norm of the difference between
    # corresponding weights of the columns
    sharing_loss = 0.
    if self.l2_regularizer > 0.:
        sharing_loss += self.l2_regularizer * regularize_norm_diff(
            trainable_vars, "L2")
    if self.l1_regularizer > 0.:
        sharing_loss += self.l1_regularizer * regularize_norm_diff(
            trainable_vars, "L1")

    # register the sharing loss with the layer and return outputs
    self.add_loss(sharing_loss)
    return outputs