def __init__(self, module, device_ids=None, output_device=None, dim=0):
    super(DataParallel, self).__init__()
    torch._C._log_api_usage_once("torch.nn.parallel.DataParallel")

    # Check whether any GPU is available at all
    device_type = _get_available_device_type()
    if device_type is None:
        self.module = module
        self.device_ids = []
        return

    # Default to using all visible GPUs
    if device_ids is None:
        device_ids = _get_all_device_indices()

    # The output device defaults to the first entry of device_ids
    if output_device is None:
        output_device = device_ids[0]

    self.dim = dim
    self.module = module
    self.device_ids = [_get_device_index(x, True) for x in device_ids]
    self.output_device = _get_device_index(output_device, True)
    self.src_device_obj = torch.device(device_type, self.device_ids[0])

    # Check that the devices are balanced; a warning is issued when the
    # min/max ratio of memory or multi-processor count falls below 0.75
    _check_balance(self.device_ids)

    # Single card: just move the module onto it
    if len(self.device_ids) == 1:
        self.module.to(self.src_device_obj)
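To see the defaults above in action, here is a minimal usage sketch; the toy linear layer and input shapes are made up for illustration:

import torch
import torch.nn as nn

net = nn.Linear(10, 5)
if torch.cuda.is_available():
    net = net.cuda()                     # parameters must start on device_ids[0]
    parallel_net = nn.DataParallel(net)  # device_ids defaults to all visible GPUs
    out = parallel_net(torch.randn(8, 10).cuda())  # batch is split along dim=0

Note that with a single visible GPU the constructor simply moves the module to that device and the forward pass degenerates to an ordinary call.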
def data_parallel(module, inputs, device_ids=None, output_device=None, dim=0,
                  module_kwargs=None):
    r"""Evaluates module(input) in parallel across the GPUs given in device_ids.

    This is the functional version of the DataParallel module.

    Args:
        module (Module): the module to evaluate in parallel
        inputs (Tensor): inputs to the module
        device_ids (list of int or torch.device): GPU ids on which to replicate module
        output_device (int or torch.device): GPU location of the output.
            Use -1 to indicate the CPU. (default: device_ids[0])

    Returns:
        a Tensor containing the result of module(input) located on output_device
    """
    if not isinstance(inputs, tuple):
        inputs = (inputs,) if inputs is not None else ()

    device_type = _get_available_device_type()

    if device_ids is None:
        device_ids = _get_all_device_indices()

    if output_device is None:
        output_device = device_ids[0]

    device_ids = [_get_device_index(x, True) for x in device_ids]
    output_device = _get_device_index(output_device, True)
    src_device_obj = torch.device(device_type, device_ids[0])

    # All parameters and buffers must already live on the source device
    for t in chain(module.parameters(), module.buffers()):
        if t.device != src_device_obj:
            raise RuntimeError("module must have its parameters and buffers "
                               "on device {} (device_ids[0]) but found one of "
                               "them on device: {}".format(src_device_obj, t.device))

    inputs, module_kwargs = scatter_kwargs(inputs, module_kwargs, device_ids, dim)
    # for a module without any inputs, an empty list and dict will be created,
    # so the module can be executed on one device, the first one in device_ids
    if not inputs and not module_kwargs:
        inputs = ((),)
        module_kwargs = ({},)

    if len(device_ids) == 1:
        return module(*inputs[0], **module_kwargs[0])
    used_device_ids = device_ids[:len(inputs)]
    replicas = replicate(module, used_device_ids)
    outputs = parallel_apply(replicas, inputs, module_kwargs, used_device_ids)
    return gather(outputs, output_device, dim)
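A minimal sketch of calling this functional form directly, assuming at least one visible GPU; the module and tensor are placeholders:

import torch
import torch.nn as nn
from torch.nn.parallel import data_parallel

if torch.cuda.is_available():
    net = nn.Linear(10, 5).cuda()  # must live on device_ids[0]
    x = torch.randn(8, 10).cuda()
    # scatters x along dim 0, replicates net, runs the replicas in
    # parallel, and gathers the outputs back onto device_ids[0]
    y = data_parallel(net, x)

Unlike the nn.DataParallel wrapper, this replicates the module on every call, so it is mostly useful for one-off parallel evaluations.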
def __init__(self, module, device_ids=None,
             output_device=None, dim=0, broadcast_buffers=True,
             process_group=None,
             bucket_cap_mb=25,
             find_unused_parameters=False,
             check_reduction=False):

    super(DistributedDataParallel, self).__init__()

    assert any((p.requires_grad for p in module.parameters())), (
        "DistributedDataParallel is not needed when a module "
        "doesn't have any parameter that requires a gradient.")

    self.is_multi_device_module = len(
        {p.device for p in module.parameters()}) > 1
    distinct_device_types = {p.device.type for p in module.parameters()}
    assert len(distinct_device_types) == 1, (
        "DistributedDataParallel's input module must be on "
        "the same type of devices, but input module parameters locate in {}."
    ).format(distinct_device_types)
    self.device_type = list(distinct_device_types)[0]

    if self.device_type == "cpu" or self.is_multi_device_module:
        assert not device_ids and not output_device, (
            "DistributedDataParallel device_ids and output_device arguments "
            "only work with single-device GPU modules, but got "
            "device_ids {}, output_device {}, and module parameters {}."
        ).format(device_ids, output_device,
                 {p.device for p in module.parameters()})

        self.device_ids = None
        self.output_device = None
    else:
        # Use all devices by default for single-device GPU modules
        if device_ids is None:
            device_ids = _get_all_device_indices()

        self.device_ids = [_get_device_index(x, True) for x in device_ids]

        if output_device is None:
            output_device = device_ids[0]

        self.output_device = _get_device_index(output_device, True)

    if process_group is None:
        self.process_group = _get_default_group()
    else:
        self.process_group = process_group

    self.dim = dim
    self.module = module
    self.broadcast_buffers = broadcast_buffers
    self.find_unused_parameters = find_unused_parameters
    self.require_backward_grad_sync = True
    self.require_forward_param_sync = True

    if check_reduction:
        # This argument is no longer used since the reducer
        # will ensure reduction completes even if some parameters
        # do not receive gradients.
        pass

    # used for intra-node param sync and inter-node sync as well
    self.broadcast_bucket_size = int(250 * 1024 * 1024)

    # reduction bucket size
    self.bucket_bytes_cap = int(bucket_cap_mb * 1024 * 1024)

    # Sync params and buffers
    module_states = list(self.module.state_dict().values())
    if len(module_states) > 0:
        self._distributed_broadcast_coalesced(module_states,
                                              self.broadcast_bucket_size)

    self._ddp_init_helper()
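For context, a hedged sketch of the single-process-per-GPU pattern this constructor expects; the env-var rendezvous, the local_rank plumbing, and the toy model are assumptions, not part of the source above:

import torch
import torch.distributed as dist
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP

def wrap_model(local_rank):
    # Assumes MASTER_ADDR/MASTER_PORT/RANK/WORLD_SIZE were set by the launcher
    dist.init_process_group(backend="nccl")
    torch.cuda.set_device(local_rank)
    net = nn.Linear(10, 5).cuda(local_rank)
    # Single-device GPU module: pass exactly one device id,
    # so the CPU/multi-device branch above is not taken
    return DDP(net, device_ids=[local_rank], output_device=local_rank)

With this setup the constructor lands in the else branch, broadcasts the initial state_dict from rank 0 in broadcast_bucket_size chunks, and then builds the gradient reducer via _ddp_init_helper().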
valid_indices = list(range(10000, 11700))
# valid_indices = [2]
# test_indices = [3]
test_indices = list(range(11700, 13300))
# test_indices = list(range(7700, 9800))

print(f'data path : {args.dir}')

if not args.test:
    model = LES(input_channels=input_length * 2,
                output_channels=2,
                kernel_size=kernel_size,
                dropout_rate=dropout_rate,
                time_range=time_range).to(device)

    # DataParallel requires the input tensor to be on the first device in
    # device_ids, so prepend the current cuda device; drop it from the
    # default list so it does not appear twice
    default_list = _get_all_device_indices()
    device_ids = [args.gpu] + [d for d in default_list if d != args.gpu]
    model = nn.DataParallel(model, device_ids=device_ids)

    # Note: when generating the data, batches need to be of at least
    # size 46 (mid<40> + output_length); probably should try 50
    # train_set = Dataset(valid_indices, input_length + time_range - 1, 40,
    #                     output_length, train_direc, True)

    # Create normalization transform
    if args.orig_norm:
        trans_func = transforms.Compose([
            transforms.Normalize(mean=[ORIG_AVG, ORIG_AVG],
                                 std=[ORIG_STD, ORIG_STD])
        ])
    else:
        chan1_mean, chan1_std, chan2_mean, chan2_std = get_train_avg_std(
            train_indices)
        print(f'channel 0 mean {chan1_mean} and std: {chan1_std}')
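get_train_avg_std is not defined in this snippet; a plausible implementation is sketched below, purely to make the per-channel normalization branch concrete. The load_sample helper and the [2, H, W] sample layout are hypothetical assumptions:

import torch

def get_train_avg_std(indices):
    # Hypothetical helper: stack the training samples (each assumed to
    # load as a [2, H, W] tensor via an assumed load_sample function)
    # and reduce over batch and spatial dims to get per-channel stats
    samples = torch.stack([load_sample(i) for i in indices])  # [N, 2, H, W]
    means = samples.mean(dim=(0, 2, 3))
    stds = samples.std(dim=(0, 2, 3))
    return means[0].item(), stds[0].item(), means[1].item(), stds[1].item()

The returned channel-wise means and stds would then feed a transforms.Normalize(mean=[...], std=[...]) mirroring the args.orig_norm branch.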