def backward_step(optimizer, model, loss):
    """Backward step."""
    args = get_args()
    timers = get_timers()

    # Backward pass.
    timers('backward-backward').start()
    optimizer.zero_grad(set_grads_to_None=True)
    if args.fp16:
        optimizer.backward(loss, update_master_grads=False)
    else:
        loss.backward()
    timers('backward-backward').stop()

    # All-reduce if needed.
    if args.DDP_impl == 'local':
        timers('backward-allreduce').start()
        model.allreduce_params(reduce_after=False,
                               fp32_allreduce=args.fp32_allreduce)
        timers('backward-allreduce').stop()

    # Update master gradients.
    timers('backward-master-grad').start()
    if args.fp16:
        optimizer.update_master_grads()
    timers('backward-master-grad').stop()

    # Clipping gradients helps prevent the exploding gradient problem.
    timers('backward-clip-grad').start()
    if args.clip_grad > 0:
        if not args.fp16:
            mpu.clip_grad_norm(model.parameters(), args.clip_grad)
        else:
            optimizer.clip_master_grads(args.clip_grad)
    timers('backward-clip-grad').stop()
def backward_step(optimizer, model, loss):
    """Backward step."""
    args = get_args()
    timers = get_timers()

    # Backward pass.
    timers('backward-backward').start()
    if args.deepspeed:
        model.backward(loss)
    else:
        raise ValueError("Must be using deepspeed to run neox")
    timers('backward-backward').stop()

    if args.deepspeed:
        # DeepSpeed's backward pass already handles the all-reduce communication.
        # Reset the timer to avoid breaking the timer logs below.
        timers('backward-allreduce').reset()
    else:
        raise ValueError("Must be using deepspeed to run neox")

    if not args.deepspeed:
        # Update master gradients.
        timers('backward-master-grad').start()
        if args.fp16:
            optimizer.update_master_grads()
        timers('backward-master-grad').stop()

        # Clipping gradients helps prevent the exploding gradient problem.
        timers('backward-clip-grad').start()
        if args.clip_grad > 0:
            if not args.fp16:
                mpu.clip_grad_norm(model.parameters(), args.clip_grad)
            else:
                optimizer.clip_master_grads(args.clip_grad)
        timers('backward-clip-grad').stop()
def backward_step(optimizer, model, loss):
    """Backward step."""
    args = get_args()
    timers = get_timers()

    # Backward pass.
    timers('backward-backward').start()
    if args.deepspeed:
        model.backward(loss)
    else:
        optimizer.zero_grad(set_grads_to_None=True)
        if args.fp16:
            optimizer.backward(loss, update_master_grads=False)
        else:
            loss.backward()
    timers('backward-backward').stop()

    if args.deepspeed:
        # DeepSpeed's backward pass already handles the all-reduce communication.
        # Reset the timer to avoid breaking the timer logs below.
        timers('backward-allreduce').reset()
    else:
        # All-reduce if needed.
        if args.DDP_impl == 'local':
            timers('backward-allreduce').start()
            model.allreduce_params(reduce_after=False,
                                   fp32_allreduce=args.fp32_allreduce)
            timers('backward-allreduce').stop()

    if not args.deepspeed:
        # Update master gradients.
        timers('backward-master-grad').start()
        if args.fp16:
            optimizer.update_master_grads()
        timers('backward-master-grad').stop()

        # Clipping gradients helps prevent the exploding gradient problem.
        timers('backward-clip-grad').start()
        if args.clip_grad > 0:
            if not args.fp16:
                mpu.clip_grad_norm(model.parameters(), args.clip_grad)
            else:
                optimizer.clip_master_grads(args.clip_grad)
        timers('backward-clip-grad').stop()
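
# Below is a minimal usage sketch (an illustration, not part of the original
# training code) showing how the DeepSpeed-aware backward_step above would
# typically be driven from a single training iteration. The names
# `train_step_sketch`, `forward_step_func`, and `data_iterator` are
# hypothetical; `get_args`/`get_timers` are assumed to be the same helpers
# already imported by this file (e.g. `from megatron import get_args,
# get_timers`), and the DeepSpeed parameter update is assumed to go through
# the engine's `model.step()`.
def train_step_sketch(forward_step_func, data_iterator, model, optimizer):
    """Hypothetical single training iteration built around backward_step."""
    args = get_args()
    timers = get_timers()

    # Forward pass: assumed to return the scalar loss plus a dict of
    # reduced losses for logging.
    timers('forward').start()
    loss, loss_reduced = forward_step_func(data_iterator, model)
    timers('forward').stop()

    # Backward pass using backward_step defined above.
    timers('backward').start()
    backward_step(optimizer, model, loss)
    timers('backward').stop()

    # Parameter update: with DeepSpeed the engine owns the optimizer step,
    # otherwise the optimizer is stepped directly.
    timers('optimizer').start()
    if args.deepspeed:
        model.step()
    else:
        optimizer.step()
    timers('optimizer').stop()

    return loss_reduced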