def __init__(self, topology):
    self.descs = []
    # first stage: token embedding
    self.descs.append(LayerDesc(EmbeddingPipe))
    # six transformer blocks; seg_method splits pipeline stages on this layer type
    for _ in range(6):
        self.descs.append(LayerDesc(TransformerNetPipe))
    # keep only the first element of the previous stage's output tuple
    self.descs.append(lambda x: x[0])
    super().__init__(
        layers=self.descs,
        loss_fn=CriterionPipe(),
        topology=topology,
        seg_method="layer:TransformerNetPipe")
def __init__(self, topology):
    self.descs = []
    self.descs.append(LayerDesc(EmbeddingPipe))
    for _ in range(2):
        self.descs.append(LayerDesc(TransformerNetPipe))
    super().__init__(
        layers=self.descs,
        loss_fn=CriterionPipe(),
        topology=topology,
        seg_method="layer:TransformerNetPipe",
        recompute_interval=1,        # checkpoint and recompute activations every segment
        recompute_partition=False,   # keep checkpointed activations unpartitioned
        recompute_offload=False)     # keep checkpointed activations on device
def __init__(self, **kwargs):
    self.descs = []
    # first pipeline stage: embedding, registered under the shared key 'embed'
    self.descs.append(
        SharedLayerDesc(
            'embed', EmbeddingPipe, shared_weight_attr='embedding_weight'))
    self.descs.append(LayerDesc(MatmulNet))
    self.descs.append(LayerDesc(BiasNet))

    def _logits_helper(embedding, output):
        # tie the output projection to the embedding weight
        return paddle.matmul(output[0], embedding.embedding_weight)

    # last stage reuses the same 'embed' layer, but runs _logits_helper
    # instead of the layer's own forward
    self.descs.append(
        SharedLayerDesc(
            'embed',
            EmbeddingPipe,
            forward_func=_logits_helper,
            shared_weight_attr='embedding_weight'))

    super(SimpleNetPipe, self).__init__(
        layers=self.descs, loss_fn=LossNet(), **kwargs)
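For context, a minimal sketch of what an EmbeddingPipe used with SharedLayerDesc might look like (hypothetical, not from the source above; the sizes and forward signature are placeholders). The only requirement is that both occurrences of the 'embed' key expose the attribute named by shared_weight_attr, so the pipeline runtime can synchronize that parameter between the first and last stages.

# Hypothetical sketch of a shared embedding stage, assuming paddle.nn.
import paddle
import paddle.nn as nn

class EmbeddingPipe(nn.Layer):
    def __init__(self, vocab_size=1024, hidden_size=64):
        super().__init__()
        self.word_embeddings = nn.Embedding(vocab_size, hidden_size)

    @property
    def embedding_weight(self):
        # Attribute named by shared_weight_attr; its gradient is allreduced
        # across the pipeline stages that share the 'embed' layer.
        return self.word_embeddings.weight

    def forward(self, x):
        return self.word_embeddings(x)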
def __init__(self,
             vocab_size,
             hidden_size=768,
             num_hidden_layers=12,
             num_attention_heads=12,
             intermediate_size=3072,
             hidden_act="gelu",
             hidden_dropout_prob=0.1,
             attention_probs_dropout_prob=0.1,
             max_position_embeddings=512,
             type_vocab_size=16,
             initializer_range=0.02,
             pad_token_id=0,
             eos_token_id=7,
             bos_token_id=0,
             eol_token_id=3,
             num_partitions=1,
             topology=None,
             recompute_interval=0):
    # forward desc
    self.descs = []

    # shared embedding: the first stage runs it as the input embedding
    self.descs.append(
        SharedLayerDesc(
            'embed',
            EmbeddingPipe,
            shared_weight_attr='embedding_weight',
            vocab_size=vocab_size,
            hidden_size=hidden_size,
            hidden_dropout_prob=hidden_dropout_prob,
            max_position_embeddings=max_position_embeddings,
            type_vocab_size=type_vocab_size,
            initializer_range=initializer_range))

    for _ in range(num_hidden_layers):
        self.descs.append(
            LayerDesc(
                TransformerDecoderLayer,
                d_model=hidden_size,
                nhead=num_attention_heads,
                dim_feedforward=intermediate_size,
                dropout=hidden_dropout_prob,
                activation=hidden_act,
                attn_dropout=attention_probs_dropout_prob,
                act_dropout=hidden_dropout_prob,
                weight_attr=paddle.ParamAttr(
                    initializer=nn.initializer.Normal(
                        mean=0.0, std=initializer_range)),
                bias_attr=None,
                num_partitions=num_partitions))

    self.descs.append(LayerDesc(nn.LayerNorm, normalized_shape=hidden_size))

    def _logits_helper(embedding, output):
        # project hidden states back onto the tied embedding matrix
        return parallel_matmul(output, embedding.embedding_weight, True)

    # the same 'embed' layer is reused on the last stage to produce logits
    self.descs.append(
        SharedLayerDesc(
            'embed',
            EmbeddingPipe,
            forward_func=_logits_helper,
            shared_weight_attr='embedding_weight',
            vocab_size=vocab_size,
            hidden_size=hidden_size,
            hidden_dropout_prob=hidden_dropout_prob,
            max_position_embeddings=max_position_embeddings,
            type_vocab_size=type_vocab_size,
            initializer_range=initializer_range))

    super().__init__(
        layers=self.descs,
        loss_fn=GPTPretrainingCriterionPipe(),
        topology=topology,
        seg_method="layer:TransformerDecoderLayer",
        recompute_interval=recompute_interval,
        recompute_partition=False,
        recompute_offload=False)
def __init__(self, num_classes=10, **kwargs):
    self.num_classes = num_classes
    decs = [
        LayerDesc(nn.Conv2D, 1, 64, kernel_size=11, stride=4, padding=5),
        LayerDesc(nn.ReLU),
        LayerDesc(nn.MaxPool2D, kernel_size=2, stride=2),
        LayerDesc(nn.Conv2D, 64, 192, kernel_size=5, padding=2),
        F.relu,
        LayerDesc(nn.MaxPool2D, kernel_size=2, stride=2),
        LayerDesc(nn.Conv2D, 192, 384, kernel_size=3, padding=1),
        F.relu,
        LayerDesc(nn.Conv2D, 384, 256, kernel_size=3, padding=1),
        F.relu,
        LayerDesc(nn.Conv2D, 256, 256, kernel_size=3, padding=1),
        F.relu,
        LayerDesc(nn.MaxPool2D, kernel_size=2, stride=2),
        LayerDesc(ReshapeHelp, shape=[-1, 256]),
        LayerDesc(nn.Linear, 256, self.num_classes),  # classifier
    ]
    super(AlexNetPipeDesc, self).__init__(
        layers=decs, loss_fn=nn.CrossEntropyLoss(), **kwargs)
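A minimal usage sketch for a PipelineLayer subclass such as AlexNetPipeDesc, assuming a two-GPU pipeline-parallel launch with paddle.distributed.fleet; the strategy values, batch shapes, and learning rate are illustrative, not taken from the snippets above.

# Illustrative only; run under e.g. `python -m paddle.distributed.launch --gpus "0,1" train.py`.
import paddle
from paddle.distributed import fleet

strategy = fleet.DistributedStrategy()
strategy.hybrid_configs = {"dp_degree": 1, "mp_degree": 1, "pp_degree": 2}
strategy.pipeline_configs = {"accumulate_steps": 4, "micro_batch_size": 2}
fleet.init(is_collective=True, strategy=strategy)

hcg = fleet.get_hybrid_communicate_group()
model = AlexNetPipeDesc(num_classes=10, topology=hcg.topology())
optimizer = paddle.optimizer.SGD(learning_rate=0.001,
                                 parameters=model.parameters())

# wrap for pipeline scheduling; train_batch then runs the micro-batch schedule
model = fleet.distributed_model(model)
optimizer = fleet.distributed_optimizer(optimizer)

# dummy batch of 8 = accumulate_steps * micro_batch_size (assumed 32x32 grayscale input)
images = paddle.randn([8, 1, 32, 32], dtype='float32')
labels = paddle.randint(0, 10, [8, 1], dtype='int64')
loss = model.train_batch([images, labels], optimizer)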