def scale_by_grad_accum_steps_wrapper_hook(
    hook_state, bucket: dist.GradBucket
) -> torch.futures.Future[torch.Tensor]:
    # Scale the bucket's gradients by 1 / gradient_accumulation_steps
    # before handing the bucket to the wrapped comm hook.
    bucket.set_buffer(bucket.buffer().div_(args.gradient_accumulation_steps))
    fut = hook(hook_state, bucket)
    return fut
def bf16_compress_wrapper_hook(
    hook_state, bucket: dist.GradBucket
) -> torch.futures.Future[torch.Tensor]:
    # Cast bucket tensor to BF16.
    bucket.set_buffer(bucket.buffer().to(torch.bfloat16))
    fut = hook(hook_state, bucket)

    def decompress(fut):
        decompressed_tensor = bucket.buffer()
        # Decompress in place to reduce the peak memory.
        # See: https://github.com/pytorch/pytorch/issues/45968
        decompressed_tensor.copy_(fut.value())
        return decompressed_tensor

    # Decompress after hook has run.
    return fut.then(decompress)
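Both hooks close over a `hook` argument (the underlying communication hook they wrap) and, in the first case, an `args` namespace holding `gradient_accumulation_steps`, so they are meant to be produced by wrapper factories and registered on the DDP module. Below is a minimal sketch of that registration; the factory name, the explicit `grad_accum_steps` argument, the toy model, and the use of the stock `bf16_compress_wrapper` and `allreduce_hook` from `default_hooks` are assumptions for illustration, not part of the original code.

import torch
import torch.distributed as dist
from torch.distributed.algorithms.ddp_comm_hooks import default_hooks
from torch.nn.parallel import DistributedDataParallel as DDP


def scale_by_grad_accum_steps_wrapper(hook, grad_accum_steps: int):
    # Hypothetical factory: captures the wrapped comm hook and the number of
    # gradient accumulation steps in a closure (the snippet above reads the
    # latter from an `args` namespace instead).
    def scale_by_grad_accum_steps_wrapper_hook(
        hook_state, bucket: dist.GradBucket
    ) -> torch.futures.Future[torch.Tensor]:
        bucket.set_buffer(bucket.buffer().div_(grad_accum_steps))
        return hook(hook_state, bucket)

    return scale_by_grad_accum_steps_wrapper_hook


# Assumes the default process group has already been initialized
# (e.g. dist.init_process_group(...) under torchrun).
ddp_model = DDP(torch.nn.Linear(16, 16))

# Scale each gradient bucket, then compress to BF16 and allreduce, using the
# stock bf16_compress_wrapper / allreduce_hook from default_hooks.
ddp_model.register_comm_hook(
    state=None,
    hook=scale_by_grad_accum_steps_wrapper(
        default_hooks.bf16_compress_wrapper(default_hooks.allreduce_hook),
        grad_accum_steps=8,
    ),
)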