Example #1
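    # Builds the self-attention block, MLP block, and two LayerNorms for one transformer layer.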
    def __init__(
        self,
        layer_idx,
        hidden_size,
        is_seq_len_dim_leading,
        init_method,
        output_layer_init_method,
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.layer_idx = layer_idx

        args = get_args()
        self.attn = SelfAttention(
            layer_idx,
            hidden_size,
            is_seq_len_dim_leading,
            args.hidden_dropout,
            init_method,
            output_layer_init_method,
        )
        self.mlp = MLP(
            layer_idx,
            hidden_size,
            args.hidden_dropout,
            init_method,
            output_layer_init_method,
        )

        self.layernorm_1 = LayerNorm(layer_idx, (self.hidden_size, ))
        self.layernorm_2 = LayerNorm(layer_idx, (self.hidden_size, ))
Example #2
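    # Reads per-step batch size, sequence length, and padded vocab size from the global args.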
    def __init__(self, name="loss"):
        self.name = name

        args = get_args()
        self.batch_size = args.global_batch_size // args.num_accumulation_steps
        self.seq_length = args.seq_length
        self.vocab_size = args.padded_vocab_size
Example #3
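    # Self-attention setup: head count/size, dropout and fusion flags from args,
    # with query-key layer scaling folded into the softmax norm factor.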
    def __init__(
        self,
        layer_id,
        batch_size,
        seq_length,
        hidden_size,
        hidden_dropout_rate,
        initializer=None,
        output_layer_initializer=None,
    ):
        self.layer_id = layer_id
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.hidden_size = hidden_size
        self.hidden_dropout_rate = hidden_dropout_rate
        self.initializer = initializer
        self.output_layer_initializer = output_layer_initializer

        args = get_args()
        self.num_heads = args.num_attention_heads
        self.head_size = args.hidden_size // args.num_attention_heads
        self.attention_dropout_rate = args.attention_dropout
        self.scale_tril_softmax_dropout_fusion = args.scale_tril_softmax_dropout_fusion
        self.bias_dropout_fusion = args.bias_dropout_fusion
        self.multihead_attention_fusion = args.multihead_attention_fusion

        self.norm_factor = math.sqrt(float(self.head_size))
        self.coeff = 1.0
        if args.apply_query_key_layer_scaling:
            self.coeff = float(self.layer_id)
            self.norm_factor *= self.coeff
Example #4
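    # Token (wte) and position (wpe) embedding tables as global tensors with explicit
    # placement/sbp, initialized from a normal distribution.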
    def __init__(self, seq_length, hidden_size, vocab_size):
        super().__init__()
        self.seq_length = seq_length
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size

        args = get_args()
        self.dropout = flow.nn.Dropout(p=args.hidden_dropout)
        self.enable_amp = args.fp16

        # word token embedding shape (vocab_size, hidden_size)
        # sbp: [B, S(0)]
        self.wte = flow.nn.Parameter(
            flow.empty(
                (self.vocab_size, self.hidden_size),
                dtype=flow.float32,
                placement=dist.get_layer_placement(0),
                sbp=dist.get_nd_sbp([flow.sbp.broadcast,
                                     flow.sbp.split(0)]),
            ))

        # word position embedding shape (seq_len, hidden_size)
        # sbp: [B, B]
        self.wpe = flow.nn.Parameter(
            flow.empty(
                (self.seq_length, self.hidden_size),
                dtype=flow.float32,
                placement=dist.get_layer_placement(0),
                sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
            ))

        flow.nn.init.normal_(self.wte, std=args.init_method_std)
        flow.nn.init.normal_(self.wpe, std=args.init_method_std)
Example #5
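    # Column-parallel linear layer: the weight is split along the output dimension;
    # GeLU and bias-GeLU fusion are controlled by flags.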
    def __init__(self,
                 layer_idx,
                 input_size,
                 output_size,
                 init_method,
                 need_gelu=False):
        super().__init__()
        self.need_gelu = need_gelu

        args = get_args()
        self.bias_gelu_fusion = args.bias_gelu_fusion

        # col parallel linear weight sbp: [B, S(1)]
        self.weight = flow.nn.Parameter(
            flow.empty(
                (input_size, output_size),
                dtype=flow.float32,
                placement=dist.get_layer_placement(layer_idx),
                sbp=dist.get_nd_sbp([flow.sbp.broadcast,
                                     flow.sbp.split(1)]),
            ))
        init_method(self.weight)

        # col parallel linear bias sbp: [B, S(0)]
        self.bias = flow.nn.Parameter(
            flow.empty(
                (output_size, ),
                dtype=flow.float32,
                placement=dist.get_layer_placement(layer_idx),
                sbp=dist.get_nd_sbp([flow.sbp.broadcast,
                                     flow.sbp.split(0)]),
            ))
        flow.nn.init.zeros_(self.bias)
Example #6
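    # nn.Graph for GPT training: attaches optimizer/LR scheduler/grad scaler and enables
    # activation checkpointing, pipeline stages, gradient accumulation, AMP, and fusion passes.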
    def __init__(
        self,
        model,
        data_loader,
        cross_entropy,
        optimizer=None,
        lr_scheduler=None,
        grad_scaler=None,
    ):
        super().__init__()
        self.model = model
        self.data_loader = data_loader
        self.cross_entropy = cross_entropy
        self.is_train = False
        if optimizer is not None:
            self.is_train = True
            self.add_optimizer(optimizer, lr_sch=lr_scheduler)
            if grad_scaler is not None:
                self.set_grad_scaler(grad_scaler)

        args = get_args()
        self.set_activation_checkpointing()
        self.set_pipeline_stage_id()
        self.config.set_gradient_accumulation_steps(
            args.num_accumulation_steps)

        if args.fp16:
            self.config.enable_amp(True)

        self.config.allow_fuse_add_to_output(True)
        self.config.allow_fuse_model_update_ops(True)
        self.config.allow_fuse_cast_scale(True)
Example #7
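    # Trainer setup: model, data loader, loss, optimizer, LR scheduler, an optional
    # training graph (when args.graph is set), and logging metrics.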
    def __init__(self):
        self.args = get_args()
        self.rank = flow.env.get_rank()
        self.world_size = flow.env.get_world_size()
        self.model = GPTModel()
        self.data_loader = GPTDataLoader()
        self.cross_entropy = ParallelSparseSoftmaxCrossEntropyLoss()
        self.optimizer = make_optimizer(self.args, self.model)
        self.lr_scheduler = make_lr_scheduler(self.args, self.optimizer)
        # self.optimizer = None
        # self.lr_scheduler = None
        # NOTE(zwx): grad scaler is not available in eager mode
        self.grad_scaler = make_grad_scaler(self.args)

        if self.args.graph:
            flow.boxing.nccl.enable_use_compute_stream(True)

            self.train_graph = GPTGraph(
                self.model,
                self.data_loader,
                self.cross_entropy,
                self.optimizer,
                self.lr_scheduler,
                self.grad_scaler,
            )

        # self.save("init")

        self.logger = Logger(self.rank)
        self.logger.register_metric("iter", IterationMetric())
        self.logger.register_metric("samples", AccumulationMetric())
        self.logger.register_metric("loss", LossMetric(), "loss: {:.5f}", True)
        self.logger.register_metric("throughput", ThroughputMetric(),
                                    "throughput: {:.2f}", True)
Example #8
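    # Dataset configuration read from the global args (path, per-step batch size,
    # sequence length, seed, split, and sample count).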
    def __init__(self, name):
        self.name = name
        args = get_args()
        assert args.dataset is not None
        self.dataset = args.dataset
        self.batch_size = args.global_batch_size // args.num_accumulation_steps
        self.seq_length = args.seq_length
        self.seed = args.seed
        self.split = args.split
        self.num_samples = args.train_samples
Example #9
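    # Transformer stack: records layer count and sequence-dim layout, builds the layers,
    # and appends a final LayerNorm.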
    def __init__(self, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        args = get_args()
        self.is_seq_len_dim_leading = bool(args.multihead_attention_fusion)
        self.num_layers = args.num_layers

        self._build_layers(args.init_method_std)
        self.layernorm_f = LayerNorm(-1, (self.hidden_size, ))
Example #10
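    # GPT model: embedding, transformer stack, and logits head, sized from the global args.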
    def __init__(self):
        super().__init__()
        args = get_args()
        self.batch_size = args.global_batch_size // args.num_accumulation_steps
        self.seq_length = args.seq_length
        self.hidden_size = args.hidden_size

        self.embedding = Embedding(self.seq_length, self.hidden_size,
                                   args.padded_vocab_size)
        self.transformer = Transformer(self.hidden_size)
        self.logits = Logits()
Example #11
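    # GPT model variant: embedding and transformer constructed from batch, sequence,
    # hidden, and vocab sizes taken from the global args.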
    def __init__(self, name):
        self.name = name

        args = get_args()
        self.batch_size = args.global_batch_size // args.num_accumulation_steps
        self.seq_length = args.seq_length
        self.hidden_size = args.hidden_size
        self.vocab_size = args.padded_vocab_size

        self.embedding = Embedding(self.batch_size, self.seq_length,
                                   self.hidden_size, self.vocab_size)
        self.transformer = Transformer(self.batch_size, self.seq_length,
                                       self.hidden_size)
Example #12
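    # Embedding configuration: shapes, dropout rate, fp16 flag, and normal initializers
    # for the token and position tables.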
    def __init__(self, batch_size, seq_length, hidden_size, vocab_size):
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size

        args = get_args()
        self.embedding_dropout_rate = args.hidden_dropout
        self.use_fp16 = args.fp16

        self.wpe_initializer = flow.random_normal_initializer(
            stddev=args.init_method_std)
        self.wte_initializer = flow.random_normal_initializer(
            stddev=args.init_method_std)
Example #13
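# Heuristically infers the split axis of a 2-D or 3-D tensor by matching its dimensions
# against the configured seq_length; returns -1 when no axis can be inferred.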
def _infer_split_axis(x):
    if len(x.shape) == 2:
        return 0

    if len(x.shape) == 3:
        if x.shape[0] == x.shape[-1]:
            return -1

        args = get_args()
        if x.shape[0] == args.seq_length:
            return 1

        if x.shape[1] == args.seq_length:
            return 0

    return -1
Example #14
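    # Self-attention: fused QKV projection (column-parallel, 3x hidden) plus a row-parallel
    # output projection, with optional fusion flags and query-key layer scaling.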
    def __init__(
        self,
        layer_idx,
        hidden_size,
        is_seq_len_dim_leading,
        hidden_dropout_rate,
        init_method,
        output_layer_init_method,
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.is_seq_len_dim_leading = is_seq_len_dim_leading

        args = get_args()
        self.num_heads = args.num_attention_heads
        self.head_size = args.hidden_size // args.num_attention_heads
        self.attention_dropout_rate = args.attention_dropout
        self.scale_tril_softmax_dropout_fusion = args.scale_tril_softmax_dropout_fusion
        self.multihead_attention_fusion = args.multihead_attention_fusion

        if not self.scale_tril_softmax_dropout_fusion:
            self.multihead_attn_dropout = flow.nn.Dropout(
                p=self.attention_dropout_rate)

        self.norm_factor = math.sqrt(float(self.head_size))
        self.coeff = 1.0
        if args.apply_query_key_layer_scaling:
            self.coeff = float(layer_idx + 1)
            self.norm_factor *= self.coeff

        self.c_attn = ColumnParallelLinear(
            layer_idx,
            self.hidden_size,
            self.hidden_size * 3,
            init_method,
        )

        self.c_proj = RowParallelLinear(
            layer_idx,
            self.hidden_size,
            self.hidden_size,
            output_layer_init_method,
            dropout_rate=hidden_dropout_rate,
        )
Example #15
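    # MLP configuration: shapes, dropout rate, initializers, and the bias-GeLU /
    # bias-dropout fusion flags from args.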
    def __init__(
        self,
        batch_size,
        seq_length,
        hidden_size,
        hidden_dropout_rate,
        initializer=None,
        output_layer_initializer=None,
    ):
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.hidden_size = hidden_size
        self.hidden_dropout_rate = hidden_dropout_rate
        self.initializer = initializer
        self.output_layer_initializer = output_layer_initializer

        args = get_args()
        self.bias_gelu_fusion = args.bias_gelu_fusion
        self.bias_dropout_fusion = args.bias_dropout_fusion
Example #16
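    # Data loader: GPTIndexedBinDataReader over the indexed binary dataset (placed on CPU,
    # split along the batch dimension), plus data/label decoders.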
    def __init__(self):
        super().__init__()
        args = get_args()
        assert args.dataset is not None

        batch_size = args.global_batch_size // args.num_accumulation_steps
        self.reader = flow.nn.GPTIndexedBinDataReader(
            data_file_prefix=args.dataset,
            seq_length=args.seq_length,
            num_samples=args.train_samples,
            batch_size=batch_size,
            dtype=flow.int64,
            shuffle=True,
            random_seed=args.seed,
            split_sizes=args.split,
            split_index=0,
            placement=dist.get_layer_placement(0, "cpu"),
            sbp=dist.get_nd_sbp([flow.sbp.split(0), flow.sbp.broadcast]),
        )
        self.data_decoder = DataDecoder()
        self.label_decoder = LabelDecoder()
Example #17
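    # Transformer stack: creates num_layers TransformerLayer objects and scales the
    # output-layer initializer by 1/sqrt(2 * num_layers).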
    def __init__(self, batch_size, seq_length, hidden_size):
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.hidden_size = hidden_size

        args = get_args()
        self.multihead_attention_fusion = args.multihead_attention_fusion
        self.num_layers = args.num_layers
        self.layers = []
        for i in range(self.num_layers):
            self.layers.append(
                TransformerLayer(
                    f"h{i}",
                    i + 1,
                    batch_size,
                    seq_length,
                    hidden_size,
                    initializer=flow.random_normal_initializer(
                        stddev=args.init_method_std),
                    output_layer_initializer=flow.random_normal_initializer(
                        stddev=(args.init_method_std /
                                math.sqrt(2.0 * self.num_layers))),
                ))
Example #18
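    # Transformer layer: self-attention and MLP blocks, plus profiling and
    # activation-checkpointing flags.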
    def __init__(
        self,
        name,
        layer_id,
        batch_size,
        seq_length,
        hidden_size,
        initializer=None,
        output_layer_initializer=None,
    ):
        self.name = name
        self.layer_id = layer_id
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.hidden_size = hidden_size

        args = get_args()
        self.enable_profiling = args.profile_transformer_layer
        self.attn = SelfAttention(
            layer_id,
            batch_size,
            seq_length,
            hidden_size,
            args.hidden_dropout,
            initializer,
            output_layer_initializer,
        )
        self.mlp = MLP(
            batch_size,
            seq_length,
            hidden_size,
            args.hidden_dropout,
            initializer,
            output_layer_initializer,
        )

        self.checkpoint_activations = args.checkpoint_activations
Example #19
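    # Row-parallel linear layer: weight split along the input dimension, broadcast bias,
    # and dropout (kept as a separate module unless bias_dropout_fusion is enabled).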
    def __init__(
        self,
        layer_idx,
        input_size,
        output_size,
        init_method,
        dropout_rate,
    ):
        super().__init__()
        self.dropout_rate = dropout_rate

        args = get_args()
        self.bias_dropout_fusion = args.bias_dropout_fusion
        if not self.bias_dropout_fusion:
            self.dropout = flow.nn.Dropout(p=dropout_rate)

        # row parallel linear weight sbp: [B, S(0)]
        self.weight = flow.nn.Parameter(
            flow.empty(
                (input_size, output_size),
                dtype=flow.float32,
                placement=dist.get_layer_placement(layer_idx),
                sbp=dist.get_nd_sbp([flow.sbp.broadcast,
                                     flow.sbp.split(0)]),
            ))
        init_method(self.weight)

        # row parallel linear bias sbp: [B, B]
        self.bias = flow.nn.Parameter(
            flow.empty(
                (output_size, ),
                dtype=flow.float32,
                placement=dist.get_layer_placement(layer_idx),
                sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
            ))
        flow.nn.init.zeros_(self.bias)
Example #20
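    # Extra task-specific CLI arguments plus the zero-shot evaluation entry point.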
        "--reset-position-ids",
        action="store_true",
        help="Reset posistion ids after end-of-document token.",
    )
    parser.add_argument(
        "--reset-attention-mask",
        action="store_true",
        help="Reset self attention maske after "
        "end-of-document token.",
    )
    parser.add_argument(
        "--eod-mask-loss",
        action="store_true",
        help="Mask loss for the end of document tokens.",
    )

    return parser


if __name__ == "__main__":

    args = get_args(extra_args_provider=get_tasks_args)

    if args.task in ["LAMBADA"]:
        from zeroshot_gpt.evaluate import main
    else:
        raise NotImplementedError("Task {} is not implemented.".format(
            args.task))

    main(args)
Example #21
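    # Distributed setup: derives parallel sizes, placement groups, and the parallel
    # hierarchy from the global args.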
    def __init__(self):
        args = get_args()
        self._init_parallel_size(args)
        self._init_placement_group(args)
        self._init_parallel_hierarchy()
Example #22
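# Training entry point: initializes environment and config, builds the train function,
# snapshot and metrics, optionally loads an external dataset, and runs the training loop.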
def train():
    args = get_args()
    _init_env(args)
    _init_config(args)
    trainer = _make_gpt_train_func(args)
    snapshot = Snapshot(
        load_dir=args.load,
        save_dir=args.save,
        save_interval=args.save_interval,
        total_iters=args.train_iters,
        save_last=args.save_last,
        save_init=args.save_init,
    )

    metric = Metric(
        print_steps=args.log_interval,
        start_step=snapshot.iter,
        max_step=args.train_iters,
        num_samples_per_batch=args.micro_batch_size * args.data_parallel_size,
        keys=["loss"],
        print_format=args.metric_print_format,
        nvidia_smi_report_step=10,
        nvidia_smi_report_file=None,
    )

    if args.use_external_dataset:
        train_val_test_num_samples = get_train_val_test_num_samples(
            args.split, args.train_samples)
        train_ds, _, _ = build_train_valid_test_datasets(
            data_prefix=[args.dataset],
            data_impl="mmap",
            splits_string=args.split,
            train_valid_test_num_samples=train_val_test_num_samples,
            seq_length=args.seq_length,
            seed=args.seed,
            skip_warmup=0,
        )

    if args.train_iters is None and args.train_samples is None:
        raise ValueError("either train_iters or train_samples must be set")

    print("Training...")
    try:
        batch_size = args.micro_batch_size * args.num_accumulation_steps
        iteration = snapshot.iter
        while iteration < args.train_iters:
            if args.use_external_dataset:
                batch = [
                    train_ds[iteration * batch_size + i]
                    for i in range(batch_size)
                ]
                data = np.stack(batch)
                trainer(data).async_get(metric.metric_cb())
            else:
                trainer().async_get(metric.metric_cb())

            snapshot.step()
            iteration = snapshot.iter

    except KeyboardInterrupt:
        print("interrupted")