def __init__(self, config: Munch = None, **kwargs):
    """ Initialize a new XLM synapse module.

    Args:
        config (:obj:`munch.Munch`, `required`):
            munched config class.
    """
    super(XLMSynapse, self).__init__(config=config, **kwargs)
    if config is None:
        config = XLMSynapse.default_config()
    bittensor.config.Config.update_with_kwargs(config.synapse, kwargs)
    XLMSynapse.check_config(config)
    self.config = config

    # Build config.
    xlm_config = XLMConfig(
        vocab_size=bittensor.__vocab_size__,
        emb_dim=bittensor.__network_dim__,
        n_layers=config.synapse.n_layers,
        n_heads=config.synapse.n_heads,
        # More needed
    )

    # model layer: encodes tokenized sequences to network dim.
    self.xlm = XLMModel(xlm_config)

    # pooler layer: pools the hidden units for use by the pkm dendrite rpc query.
    self.pooler = XLMPooler(xlm_config)

    # router: (PKM layer) queries network using embeddings as context.
    self.router = PKMRouter(config, query_dim=bittensor.__network_dim__)

    # hidden layer: transforms context and encoding to network dimension hidden units.
    self.hidden_layer = nn.Linear(bittensor.__network_dim__, bittensor.__network_dim__)

    # target layer: maps from hidden layer to vocab dimension for each token.
    self.target_layer = nn.Linear(bittensor.__network_dim__, bittensor.__vocab_size__, bias=False)

    # Loss function.
    self.loss_fct = nn.CrossEntropyLoss()

    self.to(self.device)
def add_args(parser: argparse.ArgumentParser):
    r""" This function adds the configuration items for the DPN synapse.
    These args are used to instantiate a Dual Path model.
    Instantiating a configuration with the defaults will yield a "shallow" DPN-26 configuration.
    For deeper network configurations, it is possible to set the num_blocks parameter to (3, 4, 20, 3) for a DPN-92.

    For DPN-98 set the following:
        in_planes: (160, 320, 640, 1280)
        out_planes: (256, 512, 1024, 2048)
        num_blocks: (3, 6, 20, 3)
        dense_depth: (16, 32, 32, 128)
    """
    def to_list(arg):
        return [int(i) for i in arg.split(",")]

    parser.add_argument('--synapse.in_planes', default='160, 320, 640, 1280',
                        action="append", type=to_list)
    parser.add_argument('--synapse.out_planes', default='256, 512, 1024, 2048',
                        action="append", type=to_list)
    parser.add_argument('--synapse.num_blocks', default='3, 6, 20, 3',
                        action="append", type=to_list)
    parser.add_argument('--synapse.dense_depth', default='16, 32, 32, 128',
                        action="append", type=to_list)
    parser.add_argument('--synapse.target_dim', default=10, type=int,
                        help='Final logit layer dimension. i.e. 10 for CIFAR-10.')
    parser = PKMRouter.add_args(parser)
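# --- Hedged example (not part of the original source) -------------------------
# The DPN plane/block arguments above are supplied as comma-separated strings and
# converted with the local `to_list` helper; argparse also applies `type`
# converters to string defaults, so the defaults end up as integer lists. A quick
# standalone check of that conversion:
def _to_list_example():
    def to_list(arg):
        return [int(i) for i in arg.split(",")]
    assert to_list('3, 6, 20, 3') == [3, 6, 20, 3]
    assert to_list('160, 320, 640, 1280') == [160, 320, 640, 1280]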
def check_config(config: Munch):
    assert config.synapse.n_layers > 0, "Number of hidden layers in the Transformer encoder must be > 0"
    assert config.synapse.n_heads > 0, "Number of attention heads for each attention layer in the Transformer encoder must be > 0"
    config = PKMRouter.check_config(config)
class GPT2Synapse(bittensor.synapse.Synapse):

    def __init__(self, config, **kwargs):
        """ The full GPT language model, with a context of block_size.

        Args:
            config (:obj:`munch.Munch`, `required`):
                munched config class.
        """
        super(GPT2Synapse, self).__init__(config=config, **kwargs)
        if config is None:
            config = GPT2Synapse.default_config()
        bittensor.config.Config.update_with_kwargs(config.synapse, kwargs)
        GPT2Synapse.check_config(config)
        self.config = config

        gpt_config = GPTConfig(
            vocab_size=bittensor.__vocab_size__,
            n_embd=bittensor.__network_dim__,
            n_head=config.synapse.n_head,
            n_layer=config.synapse.n_layer,
            block_size=config.synapse.block_size,
            embd_pdrop=config.synapse.embd_pdrop,
            resid_pdrop=config.synapse.resid_pdrop,
            attn_pdrop=config.synapse.attn_pdrop)

        # Token embedding layer.
        # [bittensor.__vocab_size__, bittensor.__network_dim__]
        self.tok_emb = nn.Embedding(gpt_config.vocab_size, gpt_config.n_embd)

        # Positional embedding.
        # [1, block_size, bittensor.__network_dim__]
        self.pos_emb = nn.Parameter(torch.zeros(1, gpt_config.block_size, gpt_config.n_embd))
        self.drop = nn.Dropout(gpt_config.embd_pdrop)

        # Transformer blocks.
        self.blocks = nn.Sequential(*[Block(gpt_config) for _ in range(gpt_config.n_layer)])

        # Decoder head.
        self.ln_f = nn.LayerNorm(gpt_config.n_embd)

        # Head.
        # [ bittensor.__network_dim__, bittensor.__network_dim__ ]
        self.head = nn.Linear(gpt_config.n_embd, bittensor.__network_dim__, bias=False)

        # pooler_layer: pools the hidden units for use by the pkm dendrite rpc query.
        self.pooler = GPTPooler(gpt_config)

        # Router: (PKM layer) queries network using pooled embeddings as context.
        self.router = PKMRouter(config, query_dim=bittensor.__network_dim__)

        # Hidden layer.
        self.hidden_layer = nn.Linear(bittensor.__network_dim__, bittensor.__network_dim__)

        # Target layer.
        self.target_layer = nn.Linear(bittensor.__network_dim__, gpt_config.vocab_size, bias=False)

        # Block size here corresponds to sequence length.
        self.block_size = gpt_config.block_size
        self.apply(self._init_weights)

        # Loss function: causal LM (CLM) cross-entropy loss.
        # predicted: [batch_size, sequence_len, 1], targets: [batch_size, sequence_len, 1] -> [1]
        self.loss_fct = nn.CrossEntropyLoss()
        self.num_parameters = sum(p.numel() for p in self.parameters())
        self.to(self.device)

    @staticmethod
    def default_config() -> Munch:
        parser = argparse.ArgumentParser()
        GPT2Synapse.add_args(parser)
        config = bittensor.config.Config.to_config(parser)
        return config

    @staticmethod
    def add_args(parser: argparse.ArgumentParser):
        """ Add model params
        """
        parser.add_argument('--synapse.n_head', default=32, type=int,
                            help='Number of attention heads for each attention layer in the Transformer encoder.')
        parser.add_argument('--synapse.n_layer', default=12, type=int,
                            help='Number of hidden layers in the Transformer encoder.')
        parser.add_argument('--synapse.block_size', default=20, type=int,
                            help='Maximum sequence length (block size) processed by the model.')
        parser.add_argument('--synapse.embd_pdrop', default=0.1, type=float,
                            help='GPT embedding dropout probability.')
        parser.add_argument('--synapse.resid_pdrop', default=0.1, type=float,
                            help='GPT residual dropout probability.')
        parser.add_argument('--synapse.attn_pdrop', default=0.1, type=float,
                            help='GPT attention dropout probability.')
        PKMRouter.add_args(parser)

    @staticmethod
    def check_config(config: Munch):
        pass

    def get_block_size(self):
        return self.block_size

    def _init_weights(self, module):
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward_text(self, inputs: torch.LongTensor):
        """ Local forward inputs through the CLM GPT Synapse.

        Args:
            inputs (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_len)`, `required`):
                Batch_size length list of tokenized sentences.

        Returns:
            hidden (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_len, bittensor.__network_dim__)`, `required`):
                Hidden layer representation produced using the local_context.
        """
        # Truncate the sequence length of incoming inputs if they are too long.
        initial_length = inputs.size(1)
        inputs = inputs if initial_length <= self.block_size else inputs[:, -self.block_size:]
        hidden = self.local_forward(inputs=inputs.to(self.device), training=False).local_hidden

        # Now pad the output tensor back to the original length.
        if initial_length > self.block_size:
            diff = initial_length - self.block_size
            padding = (0, 0, diff, 0)
            hidden = torch.nn.functional.pad(hidden, padding, "constant", 0)
        return hidden

    def local_forward(self, inputs: torch.LongTensor, training: bool = True) -> SimpleNamespace:
        """ Forward pass through the GPT2 synapse.

        Args:
            inputs (:obj:`torch.LongTensor` of shape :obj:`(batch_size, block_size)`, `required`):
                Batch_size length list of text sentences.
            training (:obj:`bool`, `optional`, defaults to True):
                Switch to True if this forward pass computes a CLM loss.

        Returns:
            SimpleNamespace {
                local_context (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_len, bittensor.__network_dim__)`, `required`):
                    Hidden layer context.
                local_hidden (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_len, bittensor.__network_dim__)`, `required`):
                    Hidden layer encoding produced using local_context.
                local_target (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_len, bittensor.__vocab_size__)`, `optional`):
                    GPT CLM target predictions produced using local_context.
                local_target_loss (:obj:`torch.FloatTensor` of shape :obj:`(1)`, `optional`):
                    GPT CLM loss using local_context.
            }
        """
        _, t = inputs.size()
        assert t <= self.block_size, "Cannot forward, model block size is exhausted."

        # FWD locally.
        # Each index maps to a learnable vector.
        token_embeddings = self.tok_emb(inputs)

        # Each position maps to a learnable vector.
        position_embeddings = self.pos_emb[:, :t, :]

        output = SimpleNamespace()

        # Dropout on token embeddings and position embeddings.
        out = self.drop(token_embeddings + position_embeddings)
        out = self.blocks(out)
        out = self.ln_f(out)
        output.local_context = self.head(out)
        output.local_hidden = self.hidden_layer(output.local_context)

        if training:
            output.local_target = self.target_layer(output.local_hidden)
            shift_logits = output.local_target[..., :-1, :].contiguous()
            shift_labels = inputs[..., 1:].contiguous()
            output.local_target_loss = self.loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
        return output

    def remote_forward(self, neuron: bittensor.neuron.Neuron, inputs: torch.LongTensor,
                       training: bool) -> SimpleNamespace:
        """ Forward pass inputs and labels through the GPT2 module and into the remote network.

        Args:
            neuron (:obj:`bittensor.neuron.Neuron`, `required`):
                Bittensor neuron, used for making queries to the remote network.
            inputs (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_len)`, `required`):
                Batch_size length list of text sentences.
            training (:obj:`bool`, `optional`, defaults to True):
                Switch to True if this forward pass computes a CLM loss.

        Returns:
            self.local_forward() + SimpleNamespace (
                remote_hidden (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_len, bittensor.__network_dim__)`, `optional`):
                    Hidden layer encoding produced using the remote_context.
                remote_target (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, bittensor.__vocab_size__)`, `optional`):
                    GPT CLM target predictions using the remote_context.
                remote_target_loss (:obj:`torch.FloatTensor` of shape :obj:`(1)`, `optional`):
                    GPT CLM loss using the remote_context.
                distillation_loss (:obj:`torch.FloatTensor` of shape :obj:`(1)`, `optional`):
                    Distillation loss between local_context and remote_context.
                router (:obj:`SimpleNamespace`, `required`):
                    Outputs from the pkm dendrite.
            )
        """
        # Filter out-of-range tokens.
        inputs = torch.clamp(inputs, 0, bittensor.__vocab_size__)

        # Run the local model.
        # output = SimpleNamespace
        output = self.local_forward(inputs, training)

        # pooled: pooled hidden layer from the local run, used as the query context.
        pooled = self.pooler(output.local_hidden.detach())

        # remote_context: joined responses from a dendrite.forward_text call.
        # remote_context.shape = [batch_size, sequence_len (or block_size), bittensor.__network_dim__]
        output.router = self.router.forward_text(neuron, inputs.to(self.device), pooled)
        remote_context = output.router.response.to(self.device)

        # distillation_loss: distillation loss between local_context and remote_context.
        # distillation_loss.shape = [1]
        output.distillation_loss = F.mse_loss(output.local_context, remote_context.detach())

        # remote_hidden: hidden layer encoding using remote_context.
        # remote_hidden.shape = [batch_size, sequence_len, bittensor.__network_dim__]
        output.remote_hidden = self.hidden_layer(remote_context)

        if training:
            # remote_target: projection of remote_hidden onto the target dimension.
            # remote_target.shape = [batch_size, sequence_len, bittensor.__vocab_size__]
            output.remote_target = self.target_layer(output.remote_hidden)

            # remote_target_loss: CLM loss between remote_target and passed targets.
            # remote_target_loss.shape = [1]
            shift_logits = output.remote_target[..., :-1, :].contiguous()
            shift_labels = inputs[..., 1:].contiguous()
            output.remote_target_loss = self.loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
        return output
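# --- Hedged sketch (not part of the original source) --------------------------
# Illustrates the shifted causal-LM loss used by local_forward/remote_forward
# above: logits at position t are scored against the token at position t+1.
# Relies on the module-level torch/nn imports already used in this file; the
# sizes below are made up purely for illustration.
def _clm_loss_example():
    batch_size, seq_len, vocab_size = 2, 8, 50          # hypothetical sizes
    logits = torch.randn(batch_size, seq_len, vocab_size)
    tokens = torch.randint(0, vocab_size, (batch_size, seq_len))
    shift_logits = logits[..., :-1, :].contiguous()      # drop the last position
    shift_labels = tokens[..., 1:].contiguous()           # drop the first token
    loss = nn.CrossEntropyLoss()(
        shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
    return loss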
class FFNNSynapse(bittensor.synapse.Synapse):
    """ Simple feed forward NN for images.
    """

    def __init__(self, config: Munch, **kwargs):
        r""" Init a new ffnn synapse module.

        :param [config]: munch namespace config item.
        :type [config]: [:obj:`munch.Munch`](, `required`)
        """
        super(FFNNSynapse, self).__init__(config=config, **kwargs)
        if config is None:
            config = FFNNSynapse.default_config()
        bittensor.config.Config.update_with_kwargs(config.synapse, kwargs)
        FFNNSynapse.check_config(config)
        self.config = config

        # transform_layer: transforms images to a common dimension.
        # [batch_size, -1, -1, -1] -> [batch_size, self.transform_dim]
        self.transform = Normalize((0.1307,), (0.3081,), device=self.device)
        self.transform_pool = nn.AdaptiveAvgPool2d((28, 28))
        self.transform_conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.transform_conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.transform_drop = nn.Dropout2d()
        self.transform_dim = 320

        # context_layer: distills the remote_context from the transform layer.
        # [batch_size, transform_dim] -> [batch_size, bittensor.__network_dim__]
        self.context_layer1 = nn.Linear(self.transform_dim, 256)
        self.context_layer2 = nn.Linear(256, bittensor.__network_dim__)

        # hidden_layer: learns hidden units for network and target.
        # [batch_size, transform_dim + bittensor.__network_dim__] -> [batch_size, bittensor.__network_dim__]
        self.hidden_layer1 = nn.Linear(self.transform_dim + bittensor.__network_dim__, bittensor.__network_dim__)
        self.hidden_layer2 = nn.Linear(bittensor.__network_dim__, bittensor.__network_dim__)

        # dendrite: (PKM layer) queries network using pooled embeddings as context.
        # [batch_size, -1] -> topk * [batch_size, bittensor.__network_dim__]
        self.router = PKMRouter(config, query_dim=bittensor.__network_dim__)

        # target_layer: maps from hidden layer to target dimension.
        # [batch_size, bittensor.__network_dim__] -> [batch_size, self.target_dim]
        self.target_layer1 = nn.Linear(bittensor.__network_dim__, 256)
        self.target_layer2 = nn.Linear(256, self.config.synapse.target_dim)

        self.to(self.device)

    @staticmethod
    def default_config() -> Munch:
        parser = argparse.ArgumentParser()
        FFNNSynapse.add_args(parser)
        config = bittensor.config.Config.to_config(parser)
        return config

    @staticmethod
    def add_args(parser: argparse.ArgumentParser):
        parser.add_argument('--synapse.target_dim', default=10, type=int,
                            help='Final logit layer dimension. i.e. 10 for MNIST.')
        parser = PKMRouter.add_args(parser)

    @staticmethod
    def check_config(config: Munch):
        assert config.synapse.target_dim > 0, "target dimension must be greater than 0."
        config = PKMRouter.check_config(config)

    def forward_image(self, images: torch.Tensor):
        r""" Forward image inputs through the FFNN synapse.

        Args:
            images (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_dim, channels, rows, cols)`, `required`):
                Image tensors produced by calling PIL.toTensor() and with a sequence dimension.

        Returns:
            hidden (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_dim, bittensor.__network_dim__)`, `required`):
                Hidden layer encoding produced by using local_context.
        """
        # images: remove sequence dimension from images.
        # images.shape = [batch_size, channels, rows, cols]
        images = images.view(images.shape[0] * images.shape[1], images.shape[2],
                             images.shape[3], images.shape[4]).to(self.device)

        # hidden: hidden layer using local_context for local computation only.
        # hidden.shape = [batch_size, __network_dim__]
        hidden = self.local_forward(images=images).local_hidden

        # hidden: re-add sequence dimension to outputs.
        # hidden.shape = [batch_size, sequence_dim, __network_dim__]
        hidden = torch.unsqueeze(hidden, 1)
        return hidden

    def local_forward(self, images: torch.Tensor, targets: torch.Tensor = None) -> SimpleNamespace:
        r""" Forward pass non-sequential image inputs and targets through the FFNN Synapse.
        The call does not make remote queries to the network and returns only local hidden, target and losses.

        Args:
            images (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, channels, rows, cols)`, `required`):
                PIL.toTensor() encoded images.
            targets (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, target_dim)`, `optional`, defaults to None):
                Image labels.

        Returns:
            local_context (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, bittensor.__network_dim__)`, `required`):
                Pre-hidden layer context, trained to match the remote context.
            local_hidden (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, bittensor.__network_dim__)`, `required`):
                Hidden layer produced from the context.
            local_target (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, target_dim)`, `optional`):
                FFNN target predictions using local_context.
            local_target_loss (:obj:`torch.FloatTensor` of shape :obj:`(1)`, `optional`):
                FFNN classification loss using local_context.
            local_accuracy (:obj:`torch.FloatTensor` of shape :obj:`(1)`, `optional`):
                Accuracy of target predictions.
        """
        # Return vars to be filled.
        output = SimpleNamespace()

        # transform: transform images to common shape.
        # transform.shape = [batch_size, self.transform_dim]
        transform = self.transform(images).to(self.device)
        transform = F.relu(F.max_pool2d(self.transform_conv1(transform), 2))
        transform = F.relu(F.max_pool2d(self.transform_drop(self.transform_conv2(transform)), 2))
        output.transform = transform.view(-1, self.transform_dim)

        # local_context: distillation model for remote_context.
        # local_context.shape = [batch_size, bittensor.__network_dim__]
        local_context = self.context_layer1(output.transform.detach())
        output.local_context = self.context_layer2(local_context)

        # local_hidden: hidden layer encoding using local_context.
        # local_hidden.shape = [batch_size, bittensor.__network_dim__]
        local_hidden = torch.cat((output.transform, output.local_context.detach()), dim=1)
        local_hidden = F.relu(self.hidden_layer1(local_hidden))
        output.local_hidden = F.relu(self.hidden_layer2(local_hidden))

        if targets is not None:
            # local_target: projection of local_hidden onto target dimension.
            # local_target.shape = [batch_size, target_dim]
            targets.to(self.device)
            local_target = self.target_layer1(output.local_hidden)
            local_target = self.target_layer2(local_target)
            output.local_target = F.log_softmax(local_target, dim=1)

            # local_target_loss: loss between local_target and passed targets.
            # local_target_loss.shape = [1]
            output.local_target_loss = F.nll_loss(output.local_target, targets)

            # Record extra metadata accuracy.
            max_logit = local_target.data.max(1, keepdim=True)[1]
            correct = max_logit.eq(targets.data.view_as(max_logit)).sum()
            output.local_accuracy = (100.0 * correct) / targets.shape[0]

        return output

    def remote_forward(self, neuron: bittensor.neuron.Neuron, images: torch.Tensor,
                       targets: torch.Tensor = None) -> SimpleNamespace:
        """ Forward pass non-sequential image inputs and targets through the remote context of the synapse.
        The call makes RPC queries across the network using the passed neuron's metagraph and dendrite.

        Args:
            neuron (:obj:`bittensor.neuron.Neuron`, `required`):
                Bittensor neuron, used for making queries to the remote network.
            images (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, channels, rows, cols)`, `required`):
                PIL.toTensor() encoded images.
            targets (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, target_dim)`, `optional`, defaults to None):
                Image labels.

        Returns:
            self.local_forward() + SimpleNamespace (
                router (:obj:`SimpleNamespace`, `required`):
                    Outputs from the pkm dendrite remote call.
                distillation_loss (:obj:`torch.FloatTensor` of shape :obj:`(1)`, `optional`):
                    Distillation loss between the local and remote context.
                remote_hidden (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, bittensor.__network_dim__)`, `optional`):
                    Hidden layer encoding produced using the remote context.
                remote_target (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, target_dim)`, `optional`):
                    FFNN target predictions using the remote_context.
                remote_target_loss (:obj:`torch.FloatTensor` of shape :obj:`(1)`, `optional`):
                    FFNN classification loss using the remote_context.
            )
        """
        # Call the local forward pass.
        # output = bittensor.SynapseOutput
        output = self.local_forward(images, targets)

        # Make remote queries using the PKMRouter.
        # remote_context: responses from a bittensor remote network call.
        # remote_context.shape = [batch_size, bittensor.__network_dim__]
        images = torch.unsqueeze(images, 1)
        output.router = self.router.forward_image(neuron, images, output.local_hidden)
        remote_context = torch.squeeze(output.router.response, 1).to(self.device)

        # Distill the local context to match the remote context.
        # distillation_loss: distillation loss between local_context and remote_context.
        # distillation_loss.shape = [1]
        output.distillation_loss = F.mse_loss(output.local_context, remote_context.detach())

        # remote_hidden: hidden layer encoding using remote_context.
        # remote_hidden.shape = [batch_size, bittensor.__network_dim__]
        remote_hidden = torch.cat([output.transform, remote_context], dim=1)
        remote_hidden = self.hidden_layer1(remote_hidden)
        output.remote_hidden = self.hidden_layer2(remote_hidden)

        if targets is not None:
            # Project hidden units onto the targets.
            # remote_target: projection of remote_hidden onto target dimension.
            # remote_target.shape = [batch_size, target_dim]
            remote_target = self.target_layer1(remote_hidden)
            remote_target = self.target_layer2(remote_target)
            output.remote_target = F.log_softmax(remote_target, dim=1)

            # Compute the target loss.
            # remote_target_loss: loss between remote_target and passed targets.
            # remote_target_loss.shape = [1]
            output.remote_target_loss = F.nll_loss(output.remote_target, targets)

            # Record extra metadata accuracy.
            max_logit = output.remote_target.data.max(1, keepdim=True)[1]
            correct = max_logit.eq(targets.data.view_as(max_logit)).sum()
            output.remote_accuracy = (100.0 * correct) / targets.shape[0]

        return output
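# --- Hedged usage sketch (not part of the original source) --------------------
# Shows how FFNNSynapse.local_forward might be exercised with a dummy MNIST-style
# batch, assuming the default config resolves and self.device is CPU; shapes
# follow the docstrings above. This is an illustrative sketch, not a test from
# the original repository.
def _ffnn_local_forward_example():
    config = FFNNSynapse.default_config()
    synapse = FFNNSynapse(config)
    images = torch.rand(4, 1, 28, 28)            # [batch_size, channels, rows, cols]
    targets = torch.randint(0, 10, (4,))          # MNIST-style class labels
    output = synapse.local_forward(images, targets)
    # output.local_hidden: [4, bittensor.__network_dim__]
    # output.local_target_loss: scalar classification loss
    return output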
class GPT2LMSynapse(bittensor.synapse.Synapse):
    """ A Bittensor Synapse training GPT2 with Causal Language Modelling (CLM).
    """

    def __init__(self, config: Munch):
        r""" Init a new GPT2 synapse module.

        Args:
            config (:obj:`munch.Munch`, `required`):
                munched config class.
        """
        super(GPT2LMSynapse, self).__init__(config=config)
        if config is None:
            config = GPT2LMSynapse.build_config()

        # Build hugging face config.
        huggingface_config = GPT2Config(
            vocab_size=bittensor.__vocab_size__,
            n_embd=bittensor.__network_dim__,
            n_layer=config.synapse.n_layer,
            n_head=config.synapse.n_head,
            n_inner=config.synapse.n_inner,
            activation_function=config.synapse.activation_function,
            resid_pdrop=config.synapse.resid_pdrop,
            embd_pdrop=config.synapse.embd_pdrop,
            attn_pdrop=config.synapse.attn_pdrop,
            layer_norm_epsilon=config.synapse.layer_norm_epsilon,
            initializer_range=config.synapse.initializer_range,
            summary_type=config.synapse.summary_type,
            summary_use_proj=config.synapse.summary_use_proj,
            summary_activation=config.synapse.summary_activation,
            summary_proj_to_labels=config.synapse.summary_proj_to_labels,
            summary_first_dropout=config.synapse.summary_first_dropout,
        )

        # encoder_layer: encodes tokenized sequences to network dim.
        # [batch_size, sequence_len] -> [batch_size, sequence_len, bittensor.__network_dim__]
        self.transformer = GPT2Model(huggingface_config)

        # pooler_layer: pools the hidden units for use by the pkm dendrite rpc query.
        # [batch_size, bittensor.__network_dim__, sequence_len] -> [batch_size, bittensor.__network_dim__]
        self.pooler = GPT2Pooler(huggingface_config)

        # router: (PKM layer) queries network using pooled embeddings as context.
        # [batch_size, bittensor.__network_dim__] -> topk * [batch_size, bittensor.__network_dim__]
        self.router = PKMRouter(config, query_dim=bittensor.__network_dim__)

        # hidden_layer: transforms context and encoding to network_dim hidden units.
        # [batch_size, sequence_dim, 2 * bittensor.__network_dim__] -> [batch_size, sequence_len, bittensor.__network_dim__]
        self.hidden_layer = nn.Linear(bittensor.__network_dim__, bittensor.__network_dim__)

        # target_layer: maps from hidden layer to vocab dimension for each token. Used by the CLM loss.
        # [batch_size, sequence_len, bittensor.__network_dim__] -> [batch_size, sequence_len, bittensor.__vocab_size__]
        self.target_layer = nn.Linear(bittensor.__network_dim__, bittensor.__vocab_size__, bias=False)

        # Loss function: CLM cross-entropy loss.
        # predicted: [batch_size, sequence_len, 1], targets: [batch_size, sequence_len, 1] -> [1]
        self.loss_fct = nn.CrossEntropyLoss()

        self.to(self.device)

    @staticmethod
    def build_config() -> Munch:
        parser = argparse.ArgumentParser()
        GPT2LMSynapse.add_args(parser)
        config = bittensor.config.Config.to_config(parser)
        GPT2LMSynapse.check_config(config)
        return config

    @staticmethod
    def add_args(parser: argparse.ArgumentParser):
        r""" Add custom params to the parser.
        """
        parser.add_argument('--synapse.n_head', default=1, type=int,
                            help='Number of attention heads for each attention layer in the Transformer encoder.')
        parser.add_argument('--synapse.n_layer', default=2, type=int,
                            help='Number of hidden layers in the Transformer encoder.')
        parser.add_argument('--synapse.n_inner', default=8, type=int,
                            help='The dimensionality of the inner feed-forward layers. :obj:`None` will set it to 4 times n_embd.')
        parser.add_argument('--synapse.activation_function', default='gelu_new', type=str,
                            help='Activation function, to be selected in the list :obj:`["relu", "silu", "gelu", "tanh", "gelu_new"]`.')
        parser.add_argument('--synapse.resid_pdrop', default=0.1, type=float,
                            help='GPT residual dropout probability.')
        parser.add_argument('--synapse.embd_pdrop', default=0.1, type=float,
                            help='GPT embedding dropout probability.')
        parser.add_argument('--synapse.attn_pdrop', default=0.1, type=float,
                            help='GPT attention dropout probability.')
        parser.add_argument('--synapse.layer_norm_epsilon', default=1e-05, type=float,
                            help='GPT the epsilon to use in the layer normalization layers.')
        parser.add_argument('--synapse.summary_type', default='cls_index', type=str,
                            help='Supply a Tensor of classification token position (like GPT/GPT-2).')
        parser.add_argument('--synapse.initializer_range', default=0.02, type=float,
                            help='The standard deviation of the truncated_normal_initializer for initializing all weight matrices.')
        parser.add_argument('--synapse.summary_use_proj', default=True, type=bool,
                            help='Whether or not to add a projection after the vector extraction.')
        parser.add_argument('--synapse.summary_activation', type=str,
                            help='Pass "tanh" for a tanh activation to the output, any other value will result in no activation.')
        parser.add_argument('--synapse.summary_proj_to_labels', default=True, type=bool,
                            help='Whether the projection outputs should have config.num_labels or config.hidden_size classes.')
        parser.add_argument('--synapse.summary_first_dropout', default=0.1, type=float,
                            help='The dropout ratio to be used after the projection and activation.')
        parser.add_argument('--synapse.n_block_filter', default=100, type=int,
                            help='Stale neurons are filtered after this many blocks.')
        PKMRouter.add_args(parser)

    @staticmethod
    def check_config(config: Munch):
        pass

    def forward_text(self, inputs: torch.LongTensor):
        """ Local forward inputs through the CLM GPT Synapse.

        Args:
            inputs (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_len)`, `required`):
                Batch_size length list of tokenized sentences.

        Returns:
            hidden (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_len, bittensor.__network_dim__)`, `required`):
                Hidden layer representation produced using the local_context.
        """
        hidden = self.local_forward(inputs=inputs.to(self.device), training=False).local_hidden
        return hidden

    def local_forward(self, inputs: torch.LongTensor, training: bool = True) -> SimpleNamespace:
        r""" Forward pass through the GPT synapse.

        Args:
            inputs (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_len)`, `required`):
                Batch_size length list of text sentences.
            training (:obj:`bool`, `optional`, defaults to True):
                Switch to True if this forward pass computes a CLM loss.

        Returns:
            SimpleNamespace {
                local_context (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_len, bittensor.__network_dim__)`, `required`):
                    Hidden layer context.
                local_hidden (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_len, bittensor.__network_dim__)`, `required`):
                    Hidden layer encoding produced using local_context.
                local_target (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_len, bittensor.__vocab_size__)`, `optional`):
                    GPT CLM target predictions produced using local_context.
                local_target_loss (:obj:`torch.FloatTensor` of shape :obj:`(1)`, `optional`):
                    GPT CLM loss using local_context.
            }
        """
        # Filter out-of-range tokens.
        inputs = torch.clamp(inputs, 0, bittensor.__vocab_size__)

        # Return vars to be filled.
        output = SimpleNamespace()

        # local_context: distilled version of remote_context.
        # local_context.shape = [batch_size, sequence_len, bittensor.__network_dim__]
        output.local_context = self.transformer(input_ids=inputs, return_dict=True).last_hidden_state

        # local_hidden: hidden layer encoding of sequence with local_context.
        # local_hidden.shape = [batch_size, sequence_len, bittensor.__network_dim__]
        output.local_hidden = self.hidden_layer(output.local_context)

        if training:
            # local_target: projection of local_hidden onto target dimension.
            # local_target.shape = [batch_size, sequence_len, bittensor.__vocab_size__]
            output.local_target = self.target_layer(output.local_hidden)

            # local_target_loss: CLM loss between local_target and passed targets.
            # local_target_loss.shape = [1]
            shift_logits = output.local_target[..., :-1, :].contiguous()
            shift_labels = inputs[..., 1:].contiguous()
            output.local_target_loss = self.loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

        return output

    def remote_forward(self, neuron: bittensor.neuron.Neuron, inputs: torch.LongTensor,
                       training: bool) -> SimpleNamespace:
        """ Forward pass inputs and labels through the GPT2 module.

        Args:
            neuron (:obj:`bittensor.neuron.Neuron`, `required`):
                Bittensor neuron, used for making queries to the remote network.
            inputs (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_len)`, `required`):
                Batch_size length list of text sentences.
            training (:obj:`bool`, `optional`, defaults to True):
                Switch to True if this forward pass computes a CLM loss.

        Returns:
            self.local_forward() + SimpleNamespace (
                remote_hidden (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_len, bittensor.__network_dim__)`, `optional`):
                    Hidden layer encoding produced using the remote_context.
                remote_target (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, bittensor.__vocab_size__)`, `optional`):
                    GPT CLM target predictions using the remote_context.
                remote_target_loss (:obj:`torch.FloatTensor` of shape :obj:`(1)`, `optional`):
                    GPT CLM loss using the remote_context.
                distillation_loss (:obj:`torch.FloatTensor` of shape :obj:`(1)`, `optional`):
                    Distillation loss between local_context and remote_context.
                router (:obj:`SimpleNamespace`, `required`):
                    Outputs from the pkm dendrite.
            )
        """
        # Filter out-of-range tokens.
        inputs = torch.clamp(inputs, 0, bittensor.__vocab_size__)

        # Run the local model.
        # output = SimpleNamespace
        output = self.local_forward(inputs, training)

        # pooled: pooled hidden layer from the local run, used as our query context.
        # pooled.shape = [batch_size, bittensor.__network_dim__]
        pooled = self.pooler(output.local_hidden.detach())

        # remote_context: joined responses from a dendrite.forward_text call.
        # remote_context.shape = [batch_size, sequence_len, bittensor.__network_dim__]
        output.router = self.router.forward_text(neuron, inputs.to(self.device), pooled)
        remote_context = output.router.response

        # distillation_loss: distillation loss between local_context and remote_context.
        # distillation_loss.shape = [1]
        output.distillation_loss = F.mse_loss(output.local_context, remote_context.detach())

        # remote_hidden: hidden layer encoding using remote_context.
        # remote_hidden.shape = [batch_size, sequence_len, bittensor.__network_dim__]
        output.remote_hidden = self.hidden_layer(remote_context)

        if training:
            # remote_target: projection of remote_hidden onto target dimension.
            # remote_target.shape = [batch_size, sequence_len, bittensor.__vocab_size__]
            output.remote_target = self.target_layer(output.remote_hidden)

            # remote_target_loss: CLM loss between remote_target and passed targets.
            # remote_target_loss.shape = [1]
            shift_logits = output.remote_target[..., :-1, :].contiguous()
            shift_labels = inputs[..., 1:].contiguous()
            output.remote_target_loss = self.loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

        return output
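# --- Hedged sketch (not part of the original source) --------------------------
# The remote_forward methods above all train a local "distillation" model to
# imitate the joined network response: an MSE loss against the detached remote
# context, so gradients flow only into the local model. The tensors below are
# dummies; only bittensor.__network_dim__ and the module-level torch/F imports
# from this file are assumed.
def _distillation_loss_example():
    batch_size, seq_len = 2, 8                    # hypothetical sizes
    local_context = torch.randn(batch_size, seq_len, bittensor.__network_dim__, requires_grad=True)
    remote_context = torch.randn(batch_size, seq_len, bittensor.__network_dim__)
    # detach() stops gradients from flowing into the (remote) target.
    distillation_loss = F.mse_loss(local_context, remote_context.detach())
    return distillation_loss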
class BertSynapseBase(bittensor.synapse.Synapse):

    def __init__(self, config: Munch, **kwargs):
        r""" Init a new base-bert synapse.

        Args:
            config (:obj:`munch.Munch`, `required`):
                munched config class.
        """
        super(BertSynapseBase, self).__init__(config=config, **kwargs)
        if config is None:
            config = BertSynapseBase.default_config()
        bittensor.config.Config.update_with_kwargs(config.synapse, kwargs)
        BertSynapseBase.check_config(config)
        self.config = config

        # Hugging face config item.
        huggingface_config = BertConfig(
            vocab_size=bittensor.__vocab_size__,
            hidden_size=bittensor.__network_dim__,
            num_hidden_layers=config.synapse.num_hidden_layers,
            num_attention_heads=config.synapse.num_attention_heads,
            intermediate_size=bittensor.__network_dim__,
            is_decoder=False)

        # dendrite: (PKM layer) queries network using pooled embeddings as context.
        # [batch_size, -1] -> topk * [batch_size, bittensor.__network_dim__]
        self.router = PKMRouter(config, query_dim=bittensor.__network_dim__)

        # encoder_layer: encodes tokenized sequences to network dim.
        # [batch_size, sequence_len] -> [batch_size, sequence_len, bittensor.__network_dim__]
        self.transformer = BertModel(huggingface_config, add_pooling_layer=True)

        # hidden_layer: transforms context and encoding to network_dim hidden units.
        # [batch_size, sequence_dim, bittensor.__network_dim__] -> [batch_size, sequence_len, bittensor.__network_dim__]
        self.hidden_layer = torch.nn.Linear(bittensor.__network_dim__, bittensor.__network_dim__)

        # pooling_layer: transforms the hidden layer into a pooled representation by taking the encoding of the first token.
        # [batch_size, sequence_dim, bittensor.__network_dim__] -> [batch_size, bittensor.__network_dim__]
        self.pooler = BertPooler(huggingface_config)

        self.to(self.device)

    @staticmethod
    def default_config() -> Munch:
        parser = argparse.ArgumentParser()
        BertSynapseBase.add_args(parser)
        config = bittensor.config.Config.to_config(parser)
        return config

    @staticmethod
    def add_args(parser: argparse.ArgumentParser):
        r""" Add custom params to the parser.
        """
        parser.add_argument('--synapse.num_hidden_layers', default=2, type=int,
                            help='Number of hidden layers in the Transformer encoder.')
        parser.add_argument('--synapse.num_attention_heads', default=2, type=int,
                            help='Number of attention heads for each attention layer in the Transformer encoder.')
        parser.add_argument('--synapse.n_block_filter', default=100, type=int,
                            help='Stale neurons are filtered after this many blocks.')
        PKMRouter.add_args(parser)

    @staticmethod
    def check_config(config: Munch):
        r""" Add custom checks to the config.
        """
        pass

    def forward_text(self, inputs: torch.LongTensor):
        """ Local forward inputs through the BERT NSP Synapse.

        Args:
            inputs (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_len)`, `required`):
                Batch_size length list of tokenized sentences.

        Returns:
            hidden (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_len, bittensor.__network_dim__)`, `required`):
                Hidden layer representation produced using the local_context.
        """
        hidden = self.base_local_forward(inputs=inputs).local_hidden
        return hidden

    def base_local_forward(self, inputs: torch.LongTensor, attention_mask: torch.LongTensor = None):
        r""" Forward pass inputs and labels through the NSP BERT module.

        Args:
            inputs (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_len)`, `required`):
                Batch_size length list of text sentences.
            attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_len)`, `optional`):
                Mask to avoid performing attention on padding token indices.
                Mask values selected in ``[0, 1]``:
                    - 1 for tokens that are **not masked**,
                    - 0 for tokens that are **masked**.

        Returns:
            SimpleNamespace {
                local_context (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_len, bittensor.__network_dim__)`, `required`):
                    Hidden layer context.
                local_hidden (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_len, bittensor.__network_dim__)`, `required`):
                    Hidden layer encoding produced using local_context.
                local_pooled (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, bittensor.__network_dim__)`, `required`):
                    Local hidden state pooled by returning the encoding of the first token.
            }
        """
        # Filter out-of-range tokens.
        inputs = torch.clamp(inputs, 0, bittensor.__vocab_size__)

        # Return vars to be filled.
        output = SimpleNamespace()

        # local_context: distilled version of remote_context.
        # local_context.shape = [batch_size, sequence_len, bittensor.__network_dim__]
        output.local_context = self.transformer(
            input_ids=inputs, return_dict=True, attention_mask=attention_mask).last_hidden_state

        # local_hidden: hidden layer encoding of sequence using local context.
        # local_hidden.shape = [batch_size, sequence_len, bittensor.__network_dim__]
        output.local_hidden = self.hidden_layer(output.local_context)
        output.local_pooled = self.pooler(output.local_hidden)

        return output

    def base_remote_forward(self, neuron: bittensor.neuron.Neuron, inputs: torch.LongTensor,
                            attention_mask: torch.LongTensor = None):
        """ Forward pass inputs and labels through the remote BERT networks.

        Args:
            neuron (:obj:`bittensor.Neuron`, `required`):
                Bittensor neuron, used for making queries to the remote network.
            inputs (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_len)`, `required`):
                Batch_size length list of text sentences.
            attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_len)`, `optional`):
                Mask to avoid performing attention on padding token indices.
                Mask values selected in ``[0, 1]``:
                    - 1 for tokens that are **not masked**,
                    - 0 for tokens that are **masked**.

        Returns:
            SimpleNamespace (
                distillation_loss (:obj:`torch.FloatTensor` of shape :obj:`(1)`, `optional`):
                    Distillation loss between local_context and remote_context.
                remote_hidden (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_len, bittensor.__network_dim__)`, `optional`):
                    Hidden layer encoding produced using the remote_context.
                router (:obj:`SimpleNamespace`, `required`):
                    Outputs from the pkm dendrite.
            )
        """
        # Filter out-of-range tokens.
        inputs = torch.clamp(inputs, 0, bittensor.__vocab_size__)
        output = self.base_local_forward(inputs=inputs, attention_mask=attention_mask)

        # remote_context: joined responses from a bittensor.forward_text call.
        # remote_context.shape = [batch_size, sequence_len, bittensor.__network_dim__]
        output.router = self.router.forward_text(neuron=neuron, text=inputs, query=output.local_pooled)

        # distillation_loss: distillation loss between local_context and remote_context.
        # distillation_loss.shape = [1]
        output.distillation_loss = F.mse_loss(output.local_context, output.router.response.detach())

        # remote_hidden: hidden layer encoding using remote_context.
        # remote_hidden.shape = [batch_size, sequence_len, bittensor.__network_dim__]
        output.remote_hidden = self.hidden_layer(output.router.response)
        output.remote_pooled = self.pooler(output.remote_hidden)

        return output
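# --- Hedged usage sketch (not part of the original source) --------------------
# A minimal local pass through BertSynapseBase.forward_text with dummy token ids,
# assuming the default config builds cleanly and the parent Synapse resolves
# self.device; the hidden output should follow the documented shape
# [batch_size, sequence_len, bittensor.__network_dim__].
def _bert_forward_text_example():
    config = BertSynapseBase.default_config()
    synapse = BertSynapseBase(config)
    inputs = torch.randint(0, bittensor.__vocab_size__, (2, 16))  # [batch_size, sequence_len]
    hidden = synapse.forward_text(inputs)
    return hidden.shape  # expected: torch.Size([2, 16, bittensor.__network_dim__])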
def add_args(parser: argparse.ArgumentParser):
    """ Add custom params to the Synapse.

    Args:
        parser (:obj:`argparse.ArgumentParser`):
            Argument Parser object.
    """
    parser.add_argument('--synapse.emb_dim', default=bittensor.__network_dim__, type=int,
                        help='Dimensionality of the encoder layers and the pooler layer.')
    parser.add_argument('--synapse.n_layers', default=12, type=int,
                        help='Number of hidden layers in the Transformer encoder.')
    parser.add_argument('--synapse.n_heads', default=16, type=int,
                        help='Number of attention heads for each attention layer in the Transformer encoder.')
    parser.add_argument('--synapse.dropout', default=0.1, type=float,
                        help='The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.')
    parser.add_argument('--synapse.attention_dropout', default=0.1, type=float,
                        help='The dropout probability for the attention mechanism.')
    parser.add_argument('--synapse.gelu_activation', default=True, type=bool,
                        help='Whether or not to use gelu for the activations instead of relu.')
    parser.add_argument('--synapse.sinusoidal_embeddings', default=False, type=bool,
                        help='Whether or not to use sinusoidal positional embeddings instead of absolute positional embeddings.')
    parser.add_argument('--synapse.causal', default=False, type=bool,
                        help='Whether or not the model should behave in a causal manner. Causal models use a triangular attention mask in order to only attend to the left-side context instead of a bidirectional context.')
    parser.add_argument('--synapse.asm', default=False, type=bool,
                        help='Whether or not to use an adaptive log softmax projection layer instead of a linear layer for the prediction layer.')
    parser.add_argument('--synapse.n_langs', default=1, type=int,
                        help='The number of languages the model handles. Set to 1 for monolingual models.')
    parser.add_argument('--synapse.use_lang_emb', default=True, type=bool,
                        help='Whether to use language embeddings. Some models use additional language embeddings; see the multilingual models page for information on how to use them.')
    parser.add_argument('--synapse.max_position_embeddings', default=512, type=int,
                        help='The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048).')
    parser.add_argument('--synapse.embed_init_std', default=pow(2048, -0.5), type=float,
                        help='The standard deviation of the truncated_normal_initializer for initializing the embedding matrices.')
    parser.add_argument('--synapse.init_std', default=0.02, type=float,
                        help='The standard deviation of the truncated_normal_initializer for initializing all weight matrices except the embedding matrices.')
    parser.add_argument('--synapse.layer_norm_eps', default=1e-12, type=float,
                        help='The epsilon used by the layer normalization layers.')
    parser.add_argument('--synapse.bos_index', default=0, type=int,
                        help='The index of the beginning of sentence token in the vocabulary.')
    parser.add_argument('--synapse.eos_index', default=1, type=int,
                        help='The index of the end of sentence token in the vocabulary.')
    parser.add_argument('--synapse.pad_index', default=2, type=int,
                        help='The index of the padding token in the vocabulary.')
    parser.add_argument('--synapse.unk_index', default=3, type=int,
                        help='The index of the unknown token in the vocabulary.')
    parser.add_argument('--synapse.mask_index', default=5, type=int,
                        help='The index of the masking token in the vocabulary.')
    parser.add_argument('--synapse.is_encoder', default=True, type=bool,
                        help='Whether or not the initialized model should be a transformer encoder or decoder as seen in Vaswani et al.')
    parser.add_argument('--synapse.summary_type', default="first", type=str,
                        help='Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.')
    parser.add_argument('--synapse.summary_use_proj', default=True, type=bool,
                        help='Argument used when doing sequence summary. Used in the sequence classification and multiple choice models. Whether or not to add a projection after the vector extraction.')
    parser.add_argument('--synapse.summary_activation', type=str,
                        help='Pass "tanh" for a tanh activation to the output, any other value will result in no activation.')
    parser.add_argument('--synapse.summary_proj_to_labels', default=True, type=bool,
                        help='Whether the projection outputs should have config.num_labels or config.hidden_size classes.')
    parser.add_argument('--synapse.summary_first_dropout', default=0.1, type=float,
                        help='The dropout ratio to be used after the projection and activation.')
    parser.add_argument('--synapse.start_n_top', default=5, type=int,
                        help='Used in the SQuAD evaluation script.')
    parser.add_argument('--synapse.end_n_top', default=5, type=int,
                        help='Used in the SQuAD evaluation script.')
    parser.add_argument('--synapse.mask_token_id', default=0, type=int,
                        help='Model agnostic parameter to identify masked tokens when generating text in an MLM context.')
    parser.add_argument('--synapse.lang_id', default=1, type=int,
                        help='The ID of the language used by the model. This parameter is used when generating text in a given language.')
    PKMRouter.add_args(parser)
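# --- Hedged usage sketch (not part of the original source) --------------------
# The add_args/default_config pattern above builds a munch-style config from the
# argparse defaults; command-line flags such as --synapse.n_layers override them.
# This assumes the add_args above belongs to XLMSynapse and that
# bittensor.config.Config.to_config behaves as used elsewhere in this file.
def _xlm_config_example():
    parser = argparse.ArgumentParser()
    XLMSynapse.add_args(parser)
    config = bittensor.config.Config.to_config(parser)
    XLMSynapse.check_config(config)          # asserts n_layers > 0 and n_heads > 0
    return config.synapse.n_layers           # 12 by default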
def __init__(self, config, **kwargs): super(GPT2Synapse, self).__init__(config=config, **kwargs) """The full GPT language model, with context of a block size. Args: config (:obj: `munch.Munch`, `required`): munched config class. """ if config == None: config = GPT2Synapse.default_config() bittensor.config.Config.update_with_kwargs(config.synapse, kwargs) GPT2Synapse.check_config(config) self.config = config gpt_config = GPTConfig(vocab_size=bittensor.__vocab_size__, n_embd=bittensor.__network_dim__, n_head=config.synapse.n_head, n_layer=config.synapse.n_layer, block_size=config.synapse.block_size, embd_pdrop=config.synapse.embd_pdrop, resid_pdrop=config.synapse.resid_pdrop, attn_pdrop=config.synapse.attn_pdrop) # Token embedding layer. # [bittensor.__vocab_size__, bittensor.__network_dim__] self.tok_emb = nn.Embedding(gpt_config.vocab_size, gpt_config.n_embd) # Positional embedding. # [1, block_size, bittensor.__network_dim__] self.pos_emb = nn.Parameter( torch.zeros(1, gpt_config.block_size, gpt_config.n_embd)) self.drop = nn.Dropout(gpt_config.embd_pdrop) # Transformer blocks self.blocks = nn.Sequential( *[Block(gpt_config) for _ in range(gpt_config.n_layer)]) # Decoder head self.ln_f = nn.LayerNorm(gpt_config.n_embd) # Head # [ bittensor.__network_dim__, bittensor.__network_dim__ ] self.head = nn.Linear(gpt_config.n_embd, bittensor.__network_dim__, bias=False) # pooler_layer: pools the hidden units for use by the pkm dendrite rpc query. self.pooler = GPTPooler(gpt_config) # Router: (PKM layer) queries network using pooled embeddings as context. self.router = PKMRouter(config, query_dim=bittensor.__network_dim__) # Hidden layer self.hidden_layer = nn.Linear(bittensor.__network_dim__, bittensor.__network_dim__) # Target layer self.target_layer = nn.Linear(bittensor.__network_dim__, gpt_config.vocab_size, bias=False) # Block size here corresponds to sequence lengths self.block_size = gpt_config.block_size self.apply(self._init_weights) # Loss function: MLM cross-entropy loss. # predicted: [batch_size, sequence_len, 1], targets: [batch_size, sequence_len, 1] -> [1] self.loss_fct = nn.CrossEntropyLoss() self.num_parameters = sum(p.numel() for p in self.parameters()) self.to(self.device)
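# Note: the layers built above follow the minGPT pattern: learned token embeddings are
# summed with a learned positional embedding sliced to the current sequence length,
# dropped out, run through the blocks, layer-normed and projected by the head. The
# snippet below is a minimal, self-contained sketch of that composition -- it is not
# the module's actual forward method; the toy dimensions, dropout rate, and the
# omission of the transformer blocks are illustrative assumptions.
import torch
import torch.nn as nn

vocab_size, n_embd, block_size = 50257, 512, 20
tok_emb = nn.Embedding(vocab_size, n_embd)
pos_emb = nn.Parameter(torch.zeros(1, block_size, n_embd))
drop = nn.Dropout(0.1)
ln_f = nn.LayerNorm(n_embd)
head = nn.Linear(n_embd, n_embd, bias=False)

ids = torch.randint(0, vocab_size, (2, 16))        # [batch_size, sequence_len]
t = ids.size(1)
assert t <= block_size, "sequence length may not exceed block_size"
x = drop(tok_emb(ids) + pos_emb[:, :t, :])         # token + positional embeddings
# ... the nn.Sequential of Blocks would run here ...
hidden = head(ln_f(x))                             # [batch_size, sequence_len, n_embd]
print(hidden.shape)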
class DPNSynapse(bittensor.synapse.Synapse): """ Bittensor endpoint trained on PIL images to detect objects using a DPN. """ def __init__( self, config: Munch = None): r""" Init a new DPN synapse module. Args: config (:obj: `munch.Munch`, `required`) munch namespace config item. """ super(DPNSynapse, self).__init__(config = config) if config is None: config = DPNSynapse.build_config() in_planes, out_planes = config.synapse.in_planes, config.synapse.out_planes num_blocks, dense_depth = config.synapse.num_blocks, config.synapse.dense_depth # Transform Network """ Transform network. Takes in image inputs, normalizes them, and applies 4 convolutional layers. Image encoder: transforms PIL-encoded tensors to a common shape. [batch_size, channels, rows, cols] -> [batch_size, -1, -1, -1] Output: [batch_size, self.transform_dim] """ self.transform = Normalize((0.1307,), (0.3081,), device=self.device) self.adaptive_pool = nn.AdaptiveAvgPool2d((32, 32)) self.transform_conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False) self.transform_bn1 = nn.BatchNorm2d(64) self.last_planes = 64 self.transform_layer1 = self._make_layer(in_planes[0], out_planes[0], num_blocks[0], dense_depth[0], stride=1) self.transform_layer2 = self._make_layer(in_planes[1], out_planes[1], num_blocks[1], dense_depth[1], stride=2) self.transform_layer3 = self._make_layer(in_planes[2], out_planes[2], num_blocks[2], dense_depth[2], stride=1) self.transform_layer4 = self._make_layer(in_planes[3], out_planes[3], num_blocks[3], dense_depth[3], stride=2) self.transform_dim = (out_planes[3] * 4)+(((num_blocks[3]+1) * 4)*dense_depth[3]) # dendrite: (PKM layer) queries network using pooled embeddings as context. # [batch_size, -1] -> topk * [batch_size, bittensor.__network_dim__] self.router = PKMRouter(config, query_dim = self.transform_dim) # Context layers. """ Distillation model for remote context. This layer takes input coming from the transform layer, and runs it through 3 linear layers, projecting it to bittensor.__network_dim__. """ self.context_layer1 = nn.Linear(self.transform_dim, 512) self.context_layer2 = nn.Linear(512, 256) self.context_layer3 = nn.Linear(256, bittensor.__network_dim__) # hidden layer. self.hidden_layer1 = nn.Linear(self.transform_dim + bittensor.__network_dim__, 512) self.hidden_layer2 = nn.Linear(512, 256) self.hidden_layer3 = nn.Linear(256, bittensor.__network_dim__) # Layers to project target down to target size passed by config # (number of classes) self.target_layer1 = nn.Linear(bittensor.__network_dim__, 128) self.target_layer2 = nn.Linear(128, self.config.synapse.target_dim) self.to(self.device) @staticmethod def build_config() -> Munch: parser = argparse.ArgumentParser(); DPNSynapse.add_args(parser) config = bittensor.config.Config.to_config(parser); DPNSynapse.check_config(config) return config @staticmethod def add_args(parser: argparse.ArgumentParser): r""" This function adds the configuration items for the DPN synapse. These args are used to instantiate a Dual Path model. Instantiating a configuration with the defaults will yield the DPN-98 configuration listed below. For a shallower DPN-92, set the num_blocks parameter to (3, 4, 20, 3).
For DPN-98 set the following: in_planes: (160, 320, 640, 1280) out_planes: (256, 512, 1024, 2048) num_blocks: (3, 6, 20, 3) dense_depth: (16, 32, 32, 128) """ def to_list(arg): return [int(i) for i in arg.split(",")] parser.add_argument('--synapse.in_planes', default='160, 320, 640, 1280', action="append", type=to_list) parser.add_argument('--synapse.out_planes', default='256, 512, 1024, 2048', action="append", type=to_list) parser.add_argument('--synapse.num_blocks', default='3, 6, 20, 3', action="append", type=to_list) parser.add_argument('--synapse.dense_depth', default='16, 32, 32, 128', action="append", type=to_list) parser.add_argument('--synapse.target_dim', default=10, type=int, help='Final logit layer dimension. i.e. 10 for CIFAR-10.') parser = PKMRouter.add_args(parser) @staticmethod def check_config(config: Munch): assert isinstance(config.synapse.in_planes, list), 'synapse.in_planes must be a tuple, got {}'.format(config.synapse.in_planes) assert isinstance(config.synapse.out_planes, list), 'synapse.out_planes must be a tuple, got {}'.format(config.synapse.out_planes) assert isinstance(config.synapse.num_blocks, list), 'synapse.num_blocks must be a tuple, got {}'.format(config.synapse.num_blocks) assert isinstance(config.synapse.dense_depth, list), 'synapse.dense_depth must be a tuple, got {}'.format(config.synapse.dense_depth) assert all(isinstance(el, int) for el in config.synapse.in_planes), 'synapse.in_planes must be a tuple of ints, got {}'.format(config.synapse.in_planes) assert all(isinstance(el, int) for el in config.synapse.out_planes), 'synapse.out_planes must be a tuple of ints, got {}'.format(config.synapse.out_planes) assert all(isinstance(el, int) for el in config.synapse.num_blocks), 'synapse.num_blocks must be a tuple of ints, got {}'.format(config.synapse.num_blocks) assert all(isinstance(el, int) for el in config.synapse.dense_depth), 'synapse.dense_depth must be a tuple of ints, got {}'.format(config.synapse.dense_depth) def forward_image ( self, images: torch.Tensor): r""" Forward image inputs through the DPN synapse . Args: inputs (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_dim, channels, rows, cols)`, `required`): Image tensors produced by calling PIL.toTensor() and with sequence dimension. Returns: hidden (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_dim, bittensor.__network_dim__)`, `required`): Hidden layer encoding produced by using local_context. """ # images: remove sequence dimension from images. # images.shape = [batch_size, channels, rows, cols] images = images.view(images.shape[0] * images.shape[1], images.shape[2], images.shape[3], images.shape[4]) # hidden: hidden layer using local context for local computation only. # hidden.shape = [batch_size, __network_dim__] hidden = self.forward (images = images.to(self.device), remote = False).local_hidden # hidden: re-add sequence dimension to outputs. # hidden.shape = [batch_size, sequence_dim, __network_dim__] hidden = torch.unsqueeze(hidden, 1) return hidden def local_forward ( self, images: torch.Tensor, targets: torch.Tensor = None ) -> SimpleNamespace: r""" Forward pass non-sequential image inputs and targets through the DPN Synapse. Args: images (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, channels, rows, cols)`, `required`): PIL.toTensor() encoded images. targets (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.target_size)`, `optional`): Image labels. remote (:obj:`bool')`, `optional`): Switch between local and remote context. 
If true, the function makes queries to the remote network. Returns: SimpleNamespace ( local_context (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, bittensor.__network_dim__)`, `required`): Pre-Hidden layer context, trained to match the remote context. local_hidden (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, bittensor.__network_dim__)`, `required`): Hidden layer produced from the context. local_target (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, target_dim)`, `optional`): FFNN Target predictions using local_context. local_target_loss (:obj:`torch.FloatTensor` of shape :obj:`(1)`, `optional`): FFNN Classification loss using local_context. local_accuracy (:obj:`torch.FloatTensor` of shape :obj:`(1)`, `optional`): Accuracy of target predictions. transform (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, transform_dim)`, `optional`): Transformation of variously sized images to the common transform dimension. ) """ # Return vars to be filled. output = SimpleNamespace () r""" Transform the images into a common shape (32x32) """ # transform: transform images to common shape. # transform.shape = [batch_size, self.transform_dim] transform = self.transform(images) transform = self.adaptive_pool(transform) transform = F.relu(self.transform_bn1(self.transform_conv1(transform.detach()))) transform = self.transform_layer1(transform) transform = self.transform_layer2(transform) transform = self.transform_layer3(transform) transform = self.transform_layer4(transform) transform = F.avg_pool2d(transform, 4) output.transform = torch.flatten(transform, start_dim=1) # local_context: distillation model for remote_context. # local_context.shape = [batch_size, bittensor.__network_dim__] local_context = self.context_layer1(output.transform.detach()) local_context = self.context_layer2(local_context) output.local_context = self.context_layer3(local_context) # local_hidden: hidden layer encoding using local_context. # local_hidden.shape = [batch_size, bittensor.__network_dim__] local_hidden = torch.cat([output.transform, output.local_context], dim=1) local_hidden = self.hidden_layer1(local_hidden) local_hidden = self.hidden_layer2(local_hidden) output.local_hidden = self.hidden_layer3(local_hidden) if targets is not None: # local_target: projection of local_hidden onto target dimension. # local_target.shape = [batch_size, target_dim] targets = targets.to(self.device) local_target = self.target_layer1(output.local_hidden) local_target = self.target_layer2(local_target) output.local_target = F.log_softmax(local_target, dim=1) # local_target_loss: loss between local_target and passed targets. # local_target_loss.shape = [1] output.local_target_loss = F.nll_loss(output.local_target, targets) # Record extra metadata accuracy (a standalone sketch of this loss/accuracy bookkeeping follows the Bottleneck class below). max_logit = local_target.data.max(1, keepdim=True)[1] correct = max_logit.eq( targets.data.view_as(max_logit) ).sum() output.local_accuracy = (100.0 * correct) / targets.shape[0] return output def remote_forward(self, neuron: bittensor.neuron.Neuron, images: torch.Tensor, targets: torch.Tensor = None) -> SimpleNamespace: """ Forward pass non-sequential image inputs and targets through the synapse. Makes RPC queries to downstream neurons. Args: neuron (:obj: `bittensor.neuron.Neuron`, `required`): Bittensor neuron, used for making queries to the remote network. images (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, channels, rows, cols)`, `required`): PIL.toTensor() encoded images.
targets (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, target_dim)`, `optional`, defaults to None): Image labels. Returns: self.local_forward() + SimpleNamespace ( router (:obj:`SimpleNamespace`, `required`): Outputs from the pkm dendrite remote call. distillation_loss (:obj:`torch.FloatTensor` of shape :obj:`(1)`, `optional`): Distillation loss between the local and remote context. remote_hidden (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, bittensor.__network_dim__)`, `optional`): Hidden layer encoding produced using the remote context. remote_target (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, target_dim)`, `optional`): FFNN Target predictions using the remote_context. remote_target_loss (:obj:`torch.FloatTensor` of shape :obj:`(1)`, `optional`): FFNN Classification loss using the remote_context. ) """ # Call the local forward pass. # output = bittensor.SynapseOutput output = self.local_forward( images, targets ) # Make remote queries using the PKMRouter. # remote_context: responses from a bittensor remote network call. # remote_context.shape = [batch_size, bittensor.__network_dim__] images = torch.unsqueeze(images, 1) output.router = self.router.forward_image( neuron, images, output.transform ) remote_context = torch.squeeze( output.router.response, 1 ).to(self.device) # Distill the local context to match the remote context. # distillation_loss: distillation loss between local_context and remote_context # distillation_loss.shape = [1] output.distillation_loss = F.mse_loss(output.local_context, remote_context.detach() ) # remote_hidden: hidden layer encoding using remote_context. # remote_hidden.shape = [batch_size, bittensor.__network_dim__] remote_hidden = torch.cat([output.transform, remote_context], dim=1) remote_hidden = self.hidden_layer1(remote_hidden) remote_hidden = self.hidden_layer2(remote_hidden) output.remote_hidden = self.hidden_layer3(remote_hidden) if targets is not None: # remote_target: projection of remote_hidden onto target dimension. # remote_target.shape = [batch_size, config.target_size] remote_target = self.target_layer1(output.remote_hidden) remote_target = self.target_layer2(remote_target) output.remote_target = F.log_softmax(remote_target, dim=1) # remote_target_loss: loss between remote_target and passed targets. # remote_target_loss.shape = [1] output.remote_target_loss = F.nll_loss(output.remote_target, targets) return output def _make_layer(self, in_planes, out_planes, num_blocks, dense_depth, stride): """ Generates a sequential container containing Bottleneck layers. Args: in_planes (tuple): 4-element tuple describing the in_planes config. out_planes (tuple): 4-element tuple describing the out_planes config. num_blocks (tuple): 4-element tuple describing the number of blocks at this layer. dense_depth (tuple): 4-element tuple describing the depth of this layer. stride (int): Convolutional stride length. Returns: nn.Sequential: A torch.nn sequential container containing the layers outlined in the inputs. 
""" strides = [stride] + [1]*(num_blocks-1) layers = [] for i,stride in enumerate(strides): layers.append(self.Bottleneck(self.last_planes, in_planes, out_planes, dense_depth, stride, i==0)) self.last_planes = out_planes + (i+2) * dense_depth return nn.Sequential(*layers) class Bottleneck(nn.Module): def __init__(self, last_planes, in_planes, out_planes, dense_depth, stride, first_layer): super(DPNSynapse.Bottleneck, self).__init__() self.out_planes = out_planes self.dense_depth = dense_depth self.conv1 = nn.Conv2d(last_planes, in_planes, kernel_size=1, bias=False) self.bn1 = nn.BatchNorm2d(in_planes) self.conv2 = nn.Conv2d(in_planes, in_planes, kernel_size=3, stride=stride, padding=1, groups=32, bias=False) self.bn2 = nn.BatchNorm2d(in_planes) self.conv3 = nn.Conv2d(in_planes, out_planes+dense_depth, kernel_size=1, bias=False) self.bn3 = nn.BatchNorm2d(out_planes + dense_depth) self.shortcut = nn.Sequential() if first_layer: self.shortcut = nn.Sequential( nn.Conv2d(last_planes, out_planes + dense_depth, kernel_size=1, stride=stride, bias=False), nn.BatchNorm2d(out_planes + dense_depth) ) def forward(self, x): out = F.relu(self.bn1(self.conv1(x))) out = F.relu(self.bn2(self.conv2(out))) out = self.bn3(self.conv3(out)) x = self.shortcut(x) d = self.out_planes out = torch.cat([x[:,:d,:,:]+out[:,:d,:,:], x[:,d:,:,:], out[:,d:,:,:]], 1) out = F.relu(out) return out
class XLMSynapse(bittensor.synapse.Synapse): """A Bittensor Synapse training XLM. Args: synapse (:obj:`Synapse`): The Synapse superclass, which contains forward and backward logic. """ def __init__(self, config: Munch = None, **kwargs): """ Initialize a new XLM synapse module. Args: config (:obj:`munch.Munch`, `required`): munched config class. """ super(XLMSynapse, self).__init__(config=config, **kwargs) if config is None: config = XLMSynapse.default_config() bittensor.config.Config.update_with_kwargs(config.synapse, kwargs) XLMSynapse.check_config(config) self.config = config # Build config. xlm_config = XLMConfig( vocab_size=bittensor.__vocab_size__, emb_dim=bittensor.__network_dim__, n_layers=config.synapse.n_layers, n_heads=config.synapse.n_heads, # More needed ) # model layer: encodes tokenized sequences to network dim. self.xlm = XLMModel(xlm_config) # pooler layer: pools the hidden units for use by the pkm dendrite rpc query. self.pooler = XLMPooler(xlm_config) # router: (PKM layer) queries network using embeddings as context self.router = PKMRouter(config, query_dim=bittensor.__network_dim__) # hidden layer: transforms context and encoding to network dimension hidden units. self.hidden_layer = nn.Linear(bittensor.__network_dim__, bittensor.__network_dim__) # target layer: maps from hidden layer to vocab dimension for each token. self.target_layer = nn.Linear(bittensor.__network_dim__, bittensor.__vocab_size__, bias=False) # Loss function self.loss_fct = nn.CrossEntropyLoss() self.to(self.device) @staticmethod def default_config() -> Munch: parser = argparse.ArgumentParser() XLMSynapse.add_args(parser) config = bittensor.config.Config.to_config(parser) return config @staticmethod def add_args(parser: argparse.ArgumentParser): """ Add custom params to the Synapse Args: parser (:obj:`argparse.ArgumentParser`): Argument Parser object. """ parser.add_argument( '--synapse.emb_dim', default=bittensor.__network_dim__, type=int, help='Dimensionality of the encoder layers and the pooler layer.') parser.add_argument( '--synapse.n_layers', default=12, type=int, help='Number of hidden layers in the Transformer encoder.') parser.add_argument( '--synapse.n_heads', default=16, type=int, help= 'Number of attention heads for each attention layer in the Transformer encoder.' ) parser.add_argument( '--synapse.dropout', default=0.1, type=float, help= 'The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.' ) parser.add_argument( '--synapse.attention_dropout', default=0.1, type=float, help='The dropout probability for the attention mechanism.') parser.add_argument( '--synapse.gelu_activation', default=True, type=bool, help= 'Whether or not to use gelu for the activations instead of relu.') parser.add_argument( '--synapse.sinusoidal_embeddings', default=False, type=bool, help= 'Whether or not to use sinusoidal positional embeddings instead of absolute positional embeddings.' ) parser.add_argument( '--synapse.causal', default=False, type=bool, help= 'Whether or not the model should behave in a causal manner. Causal models use a triangular attention mask in order to only attend to the left-side context instead of a bidirectional context.' ) parser.add_argument( '--synapse.asm', default=False, type=bool, help= 'Whether or not to use an adaptive log softmax projection layer instead of a linear layer for the prediction layer.' ) parser.add_argument( '--synapse.n_langs', default=1, type=int, help= 'The number of languages the model handles. Set to 1 for monolingual models.'
) parser.add_argument( '--synapse.use_lang_emb', default=True, type=bool, help= 'Whether to use language embeddings. Some models use additional language embeddings, see the multilingual models page for information on how to use them.' ) parser.add_argument( '--synapse.max_position_embeddings', default=512, type=int, help= 'The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048).' ) parser.add_argument( '--synapse.embed_init_std', default=pow(2048, -0.5), type=float, help= 'The standard deviation of the truncated_normal_initializer for initializing the embedding matrices.' ) parser.add_argument( '--synapse.init_std', default=0.02, type=float, help= 'The standard deviation of the truncated_normal_initializer for initializing all weight matrices except the embedding matrices.' ) parser.add_argument( '--synapse.layer_norm_eps', default=1e-12, type=float, help='The epsilon used by the layer normalization layers.') parser.add_argument( '--synapse.bos_index', default=0, type=int, help= 'The index of the beginning of sentence token in the vocabulary.') parser.add_argument( '--synapse.eos_index', default=1, type=int, help='The index of the end of sentence token in the vocabulary.') parser.add_argument( '--synapse.pad_index', default=2, type=int, help='The index of the padding token in the vocabulary.') parser.add_argument( '--synapse.unk_index', default=3, type=int, help='The index of the unknown token in the vocabulary.') parser.add_argument( '--synapse.mask_index', default=5, type=int, help='The index of the masking token in the vocabulary.') parser.add_argument( '--synapse.is_encoder', default=True, type=bool, help= 'Whether or not the initialized model should be a transformer encoder or decoder as seen in Vaswani et al.' ) parser.add_argument( '--synapse.summary_type', default="first", type=str, help= 'Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.' ) parser.add_argument( '--synapse.summary_use_proj', default=True, type=bool, help= 'Argument used when doing sequence summary. Used in the sequence classification and multiple choice models. Whether or not to add a projection after the vector extraction.' ) parser.add_argument( '--synapse.summary_activation', type=str, help= 'Pass "tanh" for a tanh activation to the output, any other value will result in no activation.' ) parser.add_argument( '--synapse.summary_proj_to_labels', default=True, type=bool, help= 'Whether the projection outputs should have config.num_labels or config.hidden_size classes.' ) parser.add_argument( '--synapse.summary_first_dropout', default=0.1, type=float, help= 'The dropout ratio to be used after the projection and activation.' ) parser.add_argument('--synapse.start_n_top', default=5, type=int, help='Used in the SQuAD evaluation script.') parser.add_argument('--synapse.end_n_top', default=5, type=int, help='Used in the SQuAD evaluation script.') parser.add_argument( '--synapse.mask_token_id', default=0, type=int, help= 'Model agnostic parameter to identify masked tokens when generating text in an MLM context.' ) parser.add_argument( '--synapse.lang_id', default=1, type=int, help= 'The ID of the language used by the model. This parameter is used when generating text in a given language.'
) PKMRouter.add_args(parser) @staticmethod def check_config(config: Munch): assert config.synapse.n_layers > 0, "Number of hidden layers in the Transformer encoder must be > 0" assert config.synapse.n_heads > 0, "Number of attention heads for each attention layer in the Transformer encoder must be > 0" def forward_text(self, inputs: torch.LongTensor): """ Local forward inputs through the XLM Synapse. Args: inputs (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_len)`, `required`): A batch_size-length list of tokenized sentences. Returns: hidden (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_len, bittensor.__network_dim__)`, `required`): Hidden layer representation produced using the local_context. """ hidden = self.local_forward(inputs=inputs.to(self.device), training=False).local_hidden return hidden def local_forward(self, inputs: torch.LongTensor, training: bool = True) -> SimpleNamespace: """ Forward pass through the XLM synapse. Args: inputs (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_len)`, `required`): A batch_size-length list of text sentences. training (:obj:`bool`, `optional`, defaults to True): Switch to True if this forward pass computes a CLM loss. Returns: SimpleNamespace { local_context (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_len, bittensor.__network_dim__)`, `required`): Hidden layer context. local_hidden (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_len, bittensor.__network_dim__)`, `required`): Hidden layer encoding produced using local_context. local_target (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_len, bittensor.__vocab_size__)`, `optional`): XLM CLM Target predictions produced using local_context. local_target_loss (:obj:`torch.FloatTensor` of shape :obj:`(1)`, `optional`): XLM CLM loss using local_context. } """ # return variables to be filled. output = SimpleNamespace() # local_context: distilled version of remote context. # local_context.shape = [batch_size, sequence_len, bittensor.__network_dim__] output.local_context = self.xlm(input_ids=inputs, return_dict=True).last_hidden_state # local_hidden: hidden layer encoding of sequence with local_context. # local_hidden.shape = [batch_size, sequence_len, bittensor.__network_dim__] output.local_hidden = self.hidden_layer(output.local_context) if training: # local_target: projection of local_hidden onto target dimension. # local_target.shape = [batch_size, sequence_len, bittensor.__vocab_size__] output.local_target = self.target_layer(output.local_hidden) # local_target_loss: XLM loss between local_target and ground truth targets (passed targets) shift_logits = output.local_target[..., :-1, :].contiguous() shift_labels = inputs[..., 1:].contiguous() output.local_target_loss = self.loss_fct( shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) return output def remote_forward(self, neuron: bittensor.neuron.Neuron, inputs: torch.LongTensor, training: bool) -> SimpleNamespace: """ Forward pass inputs and labels through the XLM module. Args: neuron (:obj: `bittensor.neuron.Neuron`, `required`): Bittensor neuron, used for making queries to the remote network. inputs (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_len)`, `required`): A batch_size-length list of text sentences. training (:obj:`bool`, `optional`, defaults to True): Switch to True if this forward pass computes a CLM loss.
Returns: self.local_forward() + SimpleNamespace ( remote_hidden (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_len, bittensor.__network_dim__)`, `optional`): Hidden layer encoding produced using the remote_context. remote_target (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_len, bittensor.__vocab_size__)`, `optional`): XLM CLM Target predictions using the remote_context. remote_target_loss (:obj:`torch.FloatTensor` of shape :obj:`(1)`, `optional`): XLM CLM loss using the remote_context. distillation_loss (:obj:`torch.FloatTensor` of shape :obj:`(1)`, `optional`): Distillation loss between local_context and remote_context. router (:obj:`SimpleNamespace`, `required`): Outputs from the pkm dendrite. ) """ # Filter out-of-range tokens: valid ids lie in [0, bittensor.__vocab_size__ - 1]. inputs = torch.clamp(inputs, 0, bittensor.__vocab_size__ - 1) # Run local model # output = SimpleNamespace output = self.local_forward(inputs, training) # pooled: pooled hidden layer from local run, used as our query context. # pooled.shape = [batch_size, bittensor.__network_dim__] pooled = self.pooler(output.local_hidden.detach()) # remote_context: joined responses from a dendrite.forward_text call. # remote_context.shape = [batch_size, sequence_len, bittensor.__network_dim__] output.router = self.router.forward_text(neuron, inputs.to(self.device), pooled) remote_context = output.router.response # Distillation loss: distillation loss between local_context and remote_context # distillation_loss.shape = [1] output.distillation_loss = F.mse_loss(output.local_context, remote_context.detach()) # remote_hidden: hidden layer encoding using remote_context. # remote_hidden.shape = [batch_size, sequence_len, bittensor.__network_dim__] output.remote_hidden = self.hidden_layer(remote_context) if training: # remote_target: projection of remote_hidden onto target dimension. # remote_target.shape = [batch_size, sequence_len, bittensor.__vocab_size__] output.remote_target = self.target_layer(output.remote_hidden) # remote_target_loss: CLM loss between remote_target and passed targets. # remote_target_loss.shape = [1] shift_logits = output.remote_target[..., :-1, :].contiguous() shift_labels = inputs[..., 1:].contiguous() output.remote_target_loss = self.loss_fct( shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) return output