def __init__(
    self,
    layer_idx,
    hidden_size,
    is_seq_len_dim_leading,
    init_method,
    output_layer_init_method,
):
    super().__init__()
    self.hidden_size = hidden_size
    self.layer_idx = layer_idx

    args = get_args()

    self.attn = SelfAttention(
        layer_idx,
        hidden_size,
        is_seq_len_dim_leading,
        args.hidden_dropout,
        init_method,
        output_layer_init_method,
    )
    self.mlp = MLP(
        layer_idx,
        hidden_size,
        args.hidden_dropout,
        init_method,
        output_layer_init_method,
    )

    self.layernorm_1 = LayerNorm(layer_idx, (self.hidden_size,))
    self.layernorm_2 = LayerNorm(layer_idx, (self.hidden_size,))
def __init__(self, name="loss"): self.name = name args = get_args() self.batch_size = args.global_batch_size // args.num_accumulation_steps self.seq_length = args.seq_length self.vocab_size = args.padded_vocab_size
def __init__(
    self,
    layer_id,
    batch_size,
    seq_length,
    hidden_size,
    hidden_dropout_rate,
    initializer=None,
    output_layer_initializer=None,
):
    self.layer_id = layer_id
    self.batch_size = batch_size
    self.seq_length = seq_length
    self.hidden_size = hidden_size
    self.hidden_dropout_rate = hidden_dropout_rate
    self.initializer = initializer
    self.output_layer_initializer = output_layer_initializer

    args = get_args()
    self.num_heads = args.num_attention_heads
    self.head_size = args.hidden_size // args.num_attention_heads
    self.attention_dropout_rate = args.attention_dropout
    self.scale_tril_softmax_dropout_fusion = args.scale_tril_softmax_dropout_fusion
    self.bias_dropout_fusion = args.bias_dropout_fusion
    self.multihead_attention_fusion = args.multihead_attention_fusion

    self.norm_factor = math.sqrt(float(self.head_size))
    self.coeff = 1.0
    if args.apply_query_key_layer_scaling:
        self.coeff = float(self.layer_id)
        self.norm_factor *= self.coeff
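# Illustrative, self-contained sketch (not part of the original source): the
# attention-score scaling computed above when apply_query_key_layer_scaling is
# enabled. hidden_size, num_attention_heads, and layer_id are assumed example values.
import math

hidden_size = 1024
num_attention_heads = 16
layer_id = 4  # 1-based layer index, as passed to this config

head_size = hidden_size // num_attention_heads   # 64
norm_factor = math.sqrt(float(head_size))        # 8.0
coeff = float(layer_id)                          # per-layer coefficient
norm_factor *= coeff                             # 32.0; attention scores are divided by this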
def __init__(self, seq_length, hidden_size, vocab_size):
    super().__init__()
    self.seq_length = seq_length
    self.hidden_size = hidden_size
    self.vocab_size = vocab_size

    args = get_args()
    self.dropout = flow.nn.Dropout(p=args.hidden_dropout)
    self.enable_amp = args.fp16

    # word token embedding shape (vocab_size, hidden_size)
    # sbp: [B, S(0)]
    self.wte = flow.nn.Parameter(
        flow.empty(
            (self.vocab_size, self.hidden_size),
            dtype=flow.float32,
            placement=dist.get_layer_placement(0),
            sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)]),
        )
    )
    # word position embedding shape (seq_len, hidden_size)
    # sbp: [B, B]
    self.wpe = flow.nn.Parameter(
        flow.empty(
            (self.seq_length, self.hidden_size),
            dtype=flow.float32,
            placement=dist.get_layer_placement(0),
            sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
        )
    )

    flow.nn.init.normal_(self.wte, std=args.init_method_std)
    flow.nn.init.normal_(self.wpe, std=args.init_method_std)
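# Illustrative arithmetic (not part of the original source): what the wte sbp
# signature [B, S(0)] above implies, assuming the second placement axis is the
# model (tensor) parallel one. All sizes are made-up example values.
padded_vocab_size = 50304
hidden_size = 1024
model_parallel_size = 4

# split(0) shards the (vocab_size, hidden_size) token table along its vocab rows,
# while the position table (sbp [B, B]) is replicated on every rank.
wte_rows_per_rank = padded_vocab_size // model_parallel_size
assert wte_rows_per_rank == 12576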
def __init__(self, layer_idx, input_size, output_size, init_method, need_gelu=False):
    super().__init__()
    self.need_gelu = need_gelu

    args = get_args()
    self.bias_gelu_fusion = args.bias_gelu_fusion

    # col parallel linear weight sbp: [B, S(1)]
    self.weight = flow.nn.Parameter(
        flow.empty(
            (input_size, output_size),
            dtype=flow.float32,
            placement=dist.get_layer_placement(layer_idx),
            sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(1)]),
        )
    )
    init_method(self.weight)

    # col parallel linear bias sbp: [B, S(0)]
    self.bias = flow.nn.Parameter(
        flow.empty(
            (output_size,),
            dtype=flow.float32,
            placement=dist.get_layer_placement(layer_idx),
            sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)]),
        )
    )
    flow.nn.init.zeros_(self.bias)
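# Illustrative arithmetic (not part of the original source): the "column parallel"
# layout encoded by the weight sbp [B, S(1)] above. split(1) shards the
# (input_size, output_size) weight along its output columns, so each model-parallel
# rank computes a slice of y = x @ W + b. Sizes are assumed example values.
input_size = 1024
output_size = 4096
model_parallel_size = 4

cols_per_rank = output_size // model_parallel_size  # each rank holds a (1024, 1024) weight shard
assert cols_per_rank == 1024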
def __init__(
    self,
    model,
    data_loader,
    cross_entropy,
    optimizer=None,
    lr_scheduler=None,
    grad_scaler=None,
):
    super().__init__()
    self.model = model
    self.data_loader = data_loader
    self.cross_entropy = cross_entropy

    self.is_train = False
    if optimizer is not None:
        self.is_train = True
        self.add_optimizer(optimizer, lr_sch=lr_scheduler)
        if grad_scaler is not None:
            self.set_grad_scaler(grad_scaler)

    args = get_args()
    self.set_activation_checkpointing()
    self.set_pipeline_stage_id()
    self.config.set_gradient_accumulation_steps(args.num_accumulation_steps)
    if args.fp16:
        self.config.enable_amp(True)

    self.config.allow_fuse_add_to_output(True)
    self.config.allow_fuse_model_update_ops(True)
    self.config.allow_fuse_cast_scale(True)
def __init__(self):
    self.args = get_args()
    self.rank = flow.env.get_rank()
    self.world_size = flow.env.get_world_size()

    self.model = GPTModel()
    self.data_loader = GPTDataLoader()
    self.cross_entropy = ParallelSparseSoftmaxCrossEntropyLoss()
    self.optimizer = make_optimizer(self.args, self.model)
    self.lr_scheduler = make_lr_scheduler(self.args, self.optimizer)
    # self.optimizer = None
    # self.lr_scheduler = None
    # NOTE(zwx): grad scaler is not available in eager mode
    self.grad_scaler = make_grad_scaler(self.args)

    if self.args.graph:
        flow.boxing.nccl.enable_use_compute_stream(True)
        self.train_graph = GPTGraph(
            self.model,
            self.data_loader,
            self.cross_entropy,
            self.optimizer,
            self.lr_scheduler,
            self.grad_scaler,
        )

    # self.save("init")

    self.logger = Logger(self.rank)
    self.logger.register_metric("iter", IterationMetric())
    self.logger.register_metric("samples", AccumulationMetric())
    self.logger.register_metric("loss", LossMetric(), "loss: {:.5f}", True)
    self.logger.register_metric(
        "throughput", ThroughputMetric(), "throughput: {:.2f}", True
    )
def __init__(self, name):
    self.name = name

    args = get_args()
    assert args.dataset is not None
    self.dataset = args.dataset
    self.batch_size = args.global_batch_size // args.num_accumulation_steps
    self.seq_length = args.seq_length
    self.seed = args.seed
    self.split = args.split
    self.num_samples = args.train_samples
def __init__(self, hidden_size):
    super().__init__()
    self.hidden_size = hidden_size

    args = get_args()
    self.is_seq_len_dim_leading = True if args.multihead_attention_fusion else False
    self.num_layers = args.num_layers

    self._build_layers(args.init_method_std)
    self.layernorm_f = LayerNorm(-1, (self.hidden_size,))
def __init__(self):
    super().__init__()
    args = get_args()
    self.batch_size = args.global_batch_size // args.num_accumulation_steps
    self.seq_length = args.seq_length
    self.hidden_size = args.hidden_size

    self.embedding = Embedding(self.seq_length, self.hidden_size, args.padded_vocab_size)
    self.transformer = Transformer(self.hidden_size)
    self.logits = Logits()
def __init__(self, name):
    self.name = name

    args = get_args()
    self.batch_size = args.global_batch_size // args.num_accumulation_steps
    self.seq_length = args.seq_length
    self.hidden_size = args.hidden_size
    self.vocab_size = args.padded_vocab_size

    self.embedding = Embedding(
        self.batch_size, self.seq_length, self.hidden_size, self.vocab_size
    )
    self.transformer = Transformer(self.batch_size, self.seq_length, self.hidden_size)
def __init__(self, batch_size, seq_length, hidden_size, vocab_size):
    self.batch_size = batch_size
    self.seq_length = seq_length
    self.hidden_size = hidden_size
    self.vocab_size = vocab_size

    args = get_args()
    self.embedding_dropout_rate = args.hidden_dropout
    self.use_fp16 = args.fp16
    self.wpe_initializer = flow.random_normal_initializer(stddev=args.init_method_std)
    self.wte_initializer = flow.random_normal_initializer(stddev=args.init_method_std)
def _infer_split_axis(x):
    if len(x.shape) == 2:
        return 0

    if len(x.shape) == 3:
        if x.shape[0] == x.shape[-1]:
            return -1

        args = get_args()
        if x.shape[0] == args.seq_length:
            return 1
        if x.shape[1] == args.seq_length:
            return 0

    return -1
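# Illustrative, self-contained sketch (not part of the original source): the shape
# rule _infer_split_axis encodes, with get_args().seq_length replaced by an assumed
# example value so the cases can be checked standalone.
_SEQ_LENGTH = 1024

def _infer_split_axis_example(shape, seq_length=_SEQ_LENGTH):
    if len(shape) == 2:
        return 0          # e.g. (batch * seq, hidden): split the leading dim
    if len(shape) == 3:
        if shape[0] == shape[-1]:
            return -1     # ambiguous layout: do not split
        if shape[0] == seq_length:
            return 1      # (seq, batch, hidden): split the batch dim
        if shape[1] == seq_length:
            return 0      # (batch, seq, hidden): split the batch dim
    return -1

assert _infer_split_axis_example((8192, 768)) == 0
assert _infer_split_axis_example((1024, 8, 768)) == 1
assert _infer_split_axis_example((8, 1024, 768)) == 0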
def __init__(
    self,
    layer_idx,
    hidden_size,
    is_seq_len_dim_leading,
    hidden_dropout_rate,
    init_method,
    output_layer_init_method,
):
    super().__init__()
    self.hidden_size = hidden_size
    self.is_seq_len_dim_leading = is_seq_len_dim_leading

    args = get_args()
    self.num_heads = args.num_attention_heads
    self.head_size = args.hidden_size // args.num_attention_heads
    self.attention_dropout_rate = args.attention_dropout
    self.scale_tril_softmax_dropout_fusion = args.scale_tril_softmax_dropout_fusion
    self.multihead_attention_fusion = args.multihead_attention_fusion

    if not self.scale_tril_softmax_dropout_fusion:
        self.multihead_attn_dropout = flow.nn.Dropout(p=self.attention_dropout_rate)

    self.norm_factor = math.sqrt(float(self.head_size))
    self.coeff = 1.0
    if args.apply_query_key_layer_scaling:
        self.coeff = float(layer_idx + 1)
        self.norm_factor *= self.coeff

    self.c_attn = ColumnParallelLinear(
        layer_idx,
        self.hidden_size,
        self.hidden_size * 3,
        init_method,
    )
    self.c_proj = RowParallelLinear(
        layer_idx,
        self.hidden_size,
        self.hidden_size,
        output_layer_init_method,
        dropout_rate=hidden_dropout_rate,
    )
def __init__(
    self,
    batch_size,
    seq_length,
    hidden_size,
    hidden_dropout_rate,
    initializer=None,
    output_layer_initializer=None,
):
    self.batch_size = batch_size
    self.seq_length = seq_length
    self.hidden_size = hidden_size
    self.hidden_dropout_rate = hidden_dropout_rate
    self.initializer = initializer
    self.output_layer_initializer = output_layer_initializer

    args = get_args()
    self.bias_gelu_fusion = args.bias_gelu_fusion
    self.bias_dropout_fusion = args.bias_dropout_fusion
def __init__(self):
    super().__init__()

    args = get_args()
    assert args.dataset is not None
    batch_size = args.global_batch_size // args.num_accumulation_steps

    self.reader = flow.nn.GPTIndexedBinDataReader(
        data_file_prefix=args.dataset,
        seq_length=args.seq_length,
        num_samples=args.train_samples,
        batch_size=batch_size,
        dtype=flow.int64,
        shuffle=True,
        random_seed=args.seed,
        split_sizes=args.split,
        split_index=0,
        placement=dist.get_layer_placement(0, "cpu"),
        sbp=dist.get_nd_sbp([flow.sbp.split(0), flow.sbp.broadcast]),
    )
    self.data_decoder = DataDecoder()
    self.label_decoder = LabelDecoder()
def __init__(self, batch_size, seq_length, hidden_size):
    self.batch_size = batch_size
    self.seq_length = seq_length
    self.hidden_size = hidden_size

    args = get_args()
    self.multihead_attention_fusion = args.multihead_attention_fusion
    self.num_layers = args.num_layers

    self.layers = []
    for i in range(self.num_layers):
        self.layers.append(
            TransformerLayer(
                f"h{i}",
                i + 1,
                batch_size,
                seq_length,
                hidden_size,
                initializer=flow.random_normal_initializer(stddev=args.init_method_std),
                output_layer_initializer=flow.random_normal_initializer(
                    stddev=(args.init_method_std / math.sqrt(2.0 * self.num_layers))
                ),
            )
        )
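# Illustrative arithmetic (not part of the original source): the GPT-2 style
# rescaling of the output-layer init std used above, with assumed example values.
import math

init_method_std = 0.02
num_layers = 24
output_layer_std = init_method_std / math.sqrt(2.0 * num_layers)  # ~0.00289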
def __init__(
    self,
    name,
    layer_id,
    batch_size,
    seq_length,
    hidden_size,
    initializer=None,
    output_layer_initializer=None,
):
    self.name = name
    self.layer_id = layer_id
    self.batch_size = batch_size
    self.seq_length = seq_length
    self.hidden_size = hidden_size

    args = get_args()
    self.enable_profiling = args.profile_transformer_layer

    self.attn = SelfAttention(
        layer_id,
        batch_size,
        seq_length,
        hidden_size,
        args.hidden_dropout,
        initializer,
        output_layer_initializer,
    )
    self.mlp = MLP(
        batch_size,
        seq_length,
        hidden_size,
        args.hidden_dropout,
        initializer,
        output_layer_initializer,
    )

    self.checkpoint_activations = args.checkpoint_activations
def __init__(
    self,
    layer_idx,
    input_size,
    output_size,
    init_method,
    dropout_rate,
):
    super().__init__()
    self.dropout_rate = dropout_rate

    args = get_args()
    self.bias_dropout_fusion = args.bias_dropout_fusion
    if not self.bias_dropout_fusion:
        self.dropout = flow.nn.Dropout(p=dropout_rate)

    # row parallel linear weight sbp: [B, S(0)]
    self.weight = flow.nn.Parameter(
        flow.empty(
            (input_size, output_size),
            dtype=flow.float32,
            placement=dist.get_layer_placement(layer_idx),
            sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)]),
        )
    )
    init_method(self.weight)

    # row parallel linear bias sbp: [B, B]
    self.bias = flow.nn.Parameter(
        flow.empty(
            (output_size,),
            dtype=flow.float32,
            placement=dist.get_layer_placement(layer_idx),
            sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
        )
    )
    flow.nn.init.zeros_(self.bias)
"--reset-position-ids", action="store_true", help="Reset posistion ids after end-of-document token.", ) parser.add_argument( "--reset-attention-mask", action="store_true", help="Reset self attention maske after " "end-of-document token.", ) parser.add_argument( "--eod-mask-loss", action="store_true", help="Mask loss for the end of document tokens.", ) return parser if __name__ == "__main__": args = get_args(extra_args_provider=get_tasks_args) if args.task in ["LAMBADA"]: from zeroshot_gpt.evaluate import main else: raise NotImplementedError("Task {} is not implemented.".format( args.task)) main(args)
def __init__(self):
    args = get_args()
    self._init_parallel_size(args)
    self._init_placement_group(args)
    self._init_parallel_hierarchy()
def train():
    args = get_args()
    _init_env(args)
    _init_config(args)

    trainer = _make_gpt_train_func(args)

    snapshot = Snapshot(
        load_dir=args.load,
        save_dir=args.save,
        save_interval=args.save_interval,
        total_iters=args.train_iters,
        save_last=args.save_last,
        save_init=args.save_init,
    )

    metric = Metric(
        print_steps=args.log_interval,
        start_step=snapshot.iter,
        max_step=args.train_iters,
        num_samples_per_batch=args.micro_batch_size * args.data_parallel_size,
        keys=["loss"],
        print_format=args.metric_print_format,
        nvidia_smi_report_step=10,
        nvidia_smi_report_file=None,
    )

    if args.use_external_dataset:
        train_val_test_num_samples = get_train_val_test_num_samples(
            args.split, args.train_samples
        )
        train_ds, _, _ = build_train_valid_test_datasets(
            data_prefix=[args.dataset],
            data_impl="mmap",
            splits_string=args.split,
            train_valid_test_num_samples=train_val_test_num_samples,
            seq_length=args.seq_length,
            seed=args.seed,
            skip_warmup=0,
        )

    if args.train_iters is None and args.train_samples is None:
        raise ValueError("either train_iters or train_samples must be set")

    print("Training...")
    try:
        batch_size = args.micro_batch_size * args.num_accumulation_steps
        iteration = snapshot.iter
        while iteration < args.train_iters:
            if args.use_external_dataset:
                batch = [
                    train_ds[iteration * batch_size + i] for i in range(batch_size)
                ]
                data = np.stack(batch)
                trainer(data).async_get(metric.metric_cb())
            else:
                trainer().async_get(metric.metric_cb())

            snapshot.step()
            iteration = snapshot.iter
    except KeyboardInterrupt:
        print("interrupted")