def gpu_lauum(A, upper, overwrite=True, write_opposite=False, opt: Optional[FalkonOptions] = None):
    """
    Parameters
    ----------
    A : ndarray [N, N]
        2D triangular matrix of which the product U @ U.T (if `upper` is True)
        or L.T @ L (if `upper` is False) will be computed (the LAPACK LAUUM
        operation).
    upper : bool
        Whether the input factor occupies the upper or the lower triangle of `A`.
    overwrite : bool
        Whether to overwrite matrix A or to output the result in a new buffer.
    write_opposite : bool
        Whether to write the result on the triangle opposite the one holding
        the input factor.
    opt : FalkonOptions or None
        Options used to determine GPU memory availability. If None, default
        options are used.

    Notes
    -----
    The computation will always run as the 'lower' version of the operation,
    which could however end up on the upper-triangular part of the matrix in
    case A is not Fortran contiguous to begin with.
    """
    if opt is None:
        opt = FalkonOptions()
    gpu_info = [v for k, v in devices.get_device_info(opt).items() if k >= 0]
    for g in gpu_info:
        g.actual_free_mem = min((g.free_memory - 300 * 2**20) * 0.95,
                                opt.max_gpu_mem * 0.95)

    # Start matrix preparations
    if isinstance(A, np.ndarray):
        Anp = A
    elif isinstance(A, torch.Tensor):
        Anp = A.numpy()
    else:
        raise TypeError("Unexpected type encountered for A: %s" % (type(A)))

    if not overwrite:
        Anp = np.copy(Anp, order='A')

    # Will give a Fortran-contiguous numpy array. No copies are performed.
    Anp, transposed = prepare_matrix(Anp)
    if transposed:
        upper = not upper

    # The parallel runner can only handle the lower triangle of C- or F-contiguous
    # arrays, but by transposing as necessary it runs with every combination of inputs.
    At = torch.from_numpy(Anp)
    if upper:
        At = At.T
    # The parallel runner chooses based on the contiguity pattern of the inputs.
    _parallel_lauum_runner(At, write_opposite, opt, gpu_info)

    if transposed:
        Anp = Anp.T

    if isinstance(A, np.ndarray):
        return Anp
    else:
        return torch.from_numpy(Anp)
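# A minimal usage sketch for the NumPy-based `gpu_lauum` above (illustrative
# only: it assumes at least one CUDA device is available and relies solely on
# names already imported in this module).
def _example_gpu_lauum_numpy():
    rng = np.random.default_rng(0)
    # A Fortran-ordered lower-triangular factor avoids the transpose path.
    L = np.asfortranarray(np.tril(rng.standard_normal((512, 512))))
    # overwrite=False leaves the input factor intact.
    return gpu_lauum(L, upper=False, overwrite=False)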
def _get_gpu_info(opt: BaseOptions, slack: float = 0.9) -> List[DeviceInfo]:
    # List the available GPU devices and compute the amount of RAM usable on
    # each one, keeping some slack to absorb memory-estimation errors.
    gpu_info = [v for k, v in devices.get_device_info(opt).items() if v.isGPU]
    for g in gpu_info:
        g.usable_ram = min(g.free_memory * slack, opt.max_gpu_mem * slack)
    return gpu_info
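# Sketch of how `_get_gpu_info` could be used to split rows across devices in
# proportion to each GPU's usable RAM (an illustrative helper, not library code).
def _example_split_rows_by_ram(n_rows: int, opt: BaseOptions) -> List[int]:
    infos = _get_gpu_info(opt, slack=0.9)
    total_ram = sum(g.usable_ram for g in infos)
    # Each GPU gets a share of rows proportional to its usable memory.
    return [int(n_rows * g.usable_ram / total_ram) for g in infos]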
def gpu_lauum(A, upper, overwrite=True, write_opposite=False, opt: Optional[FalkonOptions] = None):
    """
    Parameters
    ----------
    A : torch.Tensor
        (N x N) triangular matrix of which the product U @ U.T (if `upper` is
        True) or L.T @ L (if `upper` is False) will be computed (the LAPACK
        LAUUM operation).
    overwrite : bool
        Whether to overwrite matrix A or to output the result in a new buffer.

    Returns
    -------
    out : torch.Tensor
        A (N x N) tensor. This will share the same memory as the input tensor `A`
        if `overwrite` is set to True, otherwise it will be a newly allocated tensor.
    """
    if opt is None:
        opt = FalkonOptions()
    if not overwrite:
        A = copy_same_stride(A, pin_memory=True)
    # TODO: There is a helper function in mmv_ops for this.
    gpu_info = [v for k, v in devices.get_device_info(opt).items() if k >= 0]
    for g in gpu_info:
        g.actual_free_mem = min((g.free_memory - 300 * 2**20) * 0.95,
                                opt.max_gpu_mem * 0.95)

    # The parallel runner can only handle the lower triangle of C- or F-contiguous
    # arrays, but by transposing as necessary it runs with every combination of inputs.
    transposed = False
    # noinspection PyUnresolvedReferences
    if upper:
        A = A.T
        transposed = True

    # The parallel runner chooses based on the contiguity pattern of the inputs.
    _parallel_lauum_runner(A, write_opposite, gpu_info)

    if transposed:
        A = A.T
    return A
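# A minimal usage sketch for the tensor-based `gpu_lauum` above (illustrative
# only: assumes a CUDA-enabled build and uses only names imported here).
def _example_gpu_lauum_torch():
    L = torch.tril(torch.randn(512, 512, dtype=torch.float64))
    # overwrite=False first copies L into pinned memory, so the input survives.
    return gpu_lauum(L, upper=False, overwrite=False, opt=FalkonOptions())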
def init(opt: BaseOptions):
    if opt.use_cpu:
        return
    device_ids = [k for k in get_device_info(opt).keys() if k >= 0]
    global _cublas_handles
    global _cusolver_handles
    for i in device_ids:
        with torch.cuda.device(i):
            # CuBLAS handle
            if _cublas_handles.get(i, None) is None:
                handle = cublasCreate()
                _cublas_handles[i] = handle
            # CuSOLVER (Dense) handle
            if _cusolver_handles.get(i, None) is None:
                handle = cusolver.cusolverDnCreate()
                _cusolver_handles[i] = handle
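# Typical initialization sequence (a sketch): handles are created once per GPU
# device and cached in the module-level dictionaries, so repeated calls are
# cheap no-ops.
def _example_init():
    opt = BaseOptions(use_cpu=False)
    init(opt)  # creates cuBLAS/cuSOLVER handles for every visible GPU
    init(opt)  # a second call finds the cached handles and does nothing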
def fit(self,
        X: torch.Tensor,
        Y: torch.Tensor,
        Xts: Optional[torch.Tensor] = None,
        Yts: Optional[torch.Tensor] = None):
    """Fits the Falkon KRR model.

    Parameters
    ----------
    X : torch.Tensor
        The tensor of training data, of shape [num_samples, num_dimensions].
        If X is in Fortran order (i.e. column-contiguous) then we can avoid
        an extra copy of the data. Must be a CUDA tensor.
    Y : torch.Tensor
        The tensor of training targets, of shape [num_samples, num_outputs].
        If X and Y represent a classification problem, Y can be encoded as a
        one-hot vector.
        If Y is in Fortran order (i.e. column-contiguous) then we can avoid
        an extra copy of the data. Must be a CUDA tensor.
    Xts : torch.Tensor or None
        Tensor of validation data, of shape [num_test_samples, num_dimensions].
        If validation data is provided and `error_fn` was specified when
        creating the model, they will be used to print the validation error
        during the optimization iterations.
        If Xts is in Fortran order (i.e. column-contiguous) then we can avoid
        an extra copy of the data. Must be a CUDA tensor.
    Yts : torch.Tensor or None
        Tensor of validation targets, of shape [num_test_samples, num_outputs].
        If validation data is provided and `error_fn` was specified when
        creating the model, they will be used to print the validation error
        during the optimization iterations.
        If Yts is in Fortran order (i.e. column-contiguous) then we can avoid
        an extra copy of the data. Must be a CUDA tensor.

    Returns
    -------
    model: InCoreFalkon
        The fitted model
    """
    # Fix a synchronization bug which occurs when re-using center selector.
    torch.cuda.synchronize()
    X, Y, Xts, Yts = self._check_fit_inputs(X, Y, Xts, Yts)

    self.fit_times_ = []
    self.ny_points_ = None
    self.alpha_ = None

    # Start training timer
    t_s = time.time()

    # Pick Nystrom centers
    if self.weight_fn is not None:
        # noinspection PyTupleAssignmentBalance
        ny_points, ny_indices = self.center_selection.select_indices(X, None)
    else:
        # noinspection PyTypeChecker
        ny_points: Union[torch.Tensor, falkon.sparse.SparseTensor] = \
            self.center_selection.select(X, None)
        ny_indices = None
    num_centers = ny_points.shape[0]

    pc_stream = torch.cuda.Stream(X.device)
    with TicToc("Calculating Preconditioner of size %d" % (num_centers),
                debug=self.options.debug), torch.cuda.stream(pc_stream):
        precond = falkon.preconditioner.FalkonPreconditioner(
            self.penalty, self.kernel, self.options)
        ny_weight_vec = None
        if self.weight_fn is not None:
            ny_weight_vec = self.weight_fn(Y[ny_indices])
        precond.init(ny_points, weight_vec=ny_weight_vec)
    pc_stream.synchronize()

    # Cache must be emptied to ensure enough memory is visible to the optimizer
    torch.cuda.empty_cache()

    # K_NM storage decision
    gpu_info = get_device_info(self.options)[X.device.index]
    available_ram = min(self.options.max_gpu_mem, gpu_info.free_memory) * 0.9
    if self._can_store_knm(X, ny_points, available_ram):
        Knm = self.kernel(X, ny_points, opt=self.options)
    else:
        Knm = None
    self.fit_times_.append(time.time() - t_s)  # Preparation time

    # Here we define the callback function which will run at the end of
    # conjugate gradient iterations. This function computes and displays
    # the validation error.
    validation_cback = None
    if self.error_fn is not None and self.error_every is not None:
        validation_cback = self._get_callback_fn(X, Y, Xts, Yts, ny_points, precond)

    # Start with the falkon algorithm
    with TicToc('Computing Falkon iterations', debug=self.options.debug):
        optim = falkon.optim.FalkonConjugateGradient(self.kernel, precond, self.options,
                                                     weight_fn=self.weight_fn)
        if Knm is not None:
            beta = optim.solve(Knm, None, Y, self.penalty, initial_solution=None,
                               max_iter=self.maxiter, callback=validation_cback)
        else:
            beta = optim.solve(X, ny_points, Y, self.penalty, initial_solution=None,
                               max_iter=self.maxiter, callback=validation_cback)

    self.alpha_ = precond.apply(beta)
    self.ny_points_ = ny_points
    return self
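# End-to-end sketch for the in-core fit above (illustrative: `model` stands for
# an already-constructed in-core Falkon estimator, and `predict` is assumed to
# mirror the scikit-learn-style API used elsewhere in the library).
def _example_incore_fit(model, X_cpu: torch.Tensor, Y_cpu: torch.Tensor):
    # The in-core variant requires all data to live on a single GPU.
    X, Y = X_cpu.cuda(), Y_cpu.cuda()
    model.fit(X, Y)
    return model.predict(X)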
def run_keops_mmv(X1: torch.Tensor,
                  X2: torch.Tensor,
                  v: torch.Tensor,
                  other_vars: List[torch.Tensor],
                  out: Optional[torch.Tensor],
                  formula: str,
                  aliases: List[str],
                  axis: int,
                  reduction: str = 'Sum',
                  opt: Optional[FalkonOptions] = None) -> torch.Tensor:
    if opt is None:
        opt = FalkonOptions()
    # Choose backend
    N, D = X1.shape
    T = v.shape[1]
    backend = _decide_backend(opt, D)
    dtype = _keops_dtype(X1.dtype)
    device = X1.device

    if not check_same_device(X1, X2, v, out, *other_vars):
        raise RuntimeError("All input tensors must be on the same device.")
    if (device.type == 'cuda') and (not backend.startswith("GPU")):
        warnings.warn("KeOps backend was chosen to be CPU, but GPU input tensors found. "
                      "Defaulting to 'GPU_1D' backend. To force usage of the CPU backend, "
                      "please pass CPU tensors; to avoid this warning if the GPU backend is "
                      "desired, check your options (i.e. set 'use_cpu=False').")
        backend = "GPU_1D"

    # Define formula wrapper
    fn = Genred(formula, aliases, reduction_op=reduction, axis=axis,
                dtype=dtype, dtype_acc=opt.keops_acc_dtype,
                sum_scheme=opt.keops_sum_scheme)

    # Compile on a small data subset
    small_data_variables = [X1[:100], X2[:10], v[:10]] + other_vars
    small_data_out = torch.empty((100, T), dtype=X1.dtype, device=device)
    fn(*small_data_variables, out=small_data_out, backend=backend)

    # Create output matrix
    if out is None:
        # noinspection PyArgumentList
        out = torch.empty(N, T, dtype=X1.dtype, device=device,
                          pin_memory=(backend != 'CPU') and (device.type == 'cpu'))

    if backend.startswith("GPU") and device.type == 'cpu':
        # Info about GPUs
        ram_slack = 0.7  # slack is high due to imprecise memory usage estimates
        gpu_info = [dinfo for k, dinfo in devices.get_device_info(opt).items() if k >= 0]
        gpu_ram = [
            min((g.free_memory - 300 * 2**20) * ram_slack, opt.max_gpu_mem * ram_slack)
            for g in gpu_info
        ]
        block_sizes = calc_gpu_block_sizes(gpu_info, N)

        # Create queues
        args = []  # Arguments passed to each subprocess
        for i in range(len(gpu_info)):
            # First round of subdivision
            bwidth = block_sizes[i + 1] - block_sizes[i]
            if bwidth <= 0:
                continue
            args.append((ArgsFmmv(
                X1=X1.narrow(0, block_sizes[i], bwidth),
                X2=X2,
                v=v,
                out=out.narrow(0, block_sizes[i], bwidth),
                other_vars=other_vars,
                function=fn,
                backend=backend,
                gpu_ram=gpu_ram[i]
            ), gpu_info[i].Id))
        _start_wait_processes(_single_gpu_method, args)
    else:
        # Run on CPU or GPU with CUDA inputs
        variables = [X1, X2, v] + other_vars
        out = fn(*variables, out=out, backend=backend)

    return out
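# Example of driving `run_keops_mmv` with a Gaussian-kernel formula (a sketch
# written in standard KeOps Genred syntax; the formula and aliases below are
# illustrative, not a kernel shipped verbatim by this module).
def _example_keops_gaussian_mmv(X1, X2, v, sigma):
    formula = 'Exp(SqDist(x1, x2) * g) * v'
    aliases = [
        'x1 = Vi(%d)' % X1.shape[1],
        'x2 = Vj(%d)' % X2.shape[1],
        'v = Vj(%d)' % v.shape[1],
        'g = Pm(1)',
    ]
    # g multiplies the squared distance: -1 / (2 * sigma^2) gives a Gaussian.
    g = torch.tensor([-0.5 / sigma ** 2], dtype=X1.dtype, device=X1.device)
    return run_keops_mmv(X1, X2, v, other_vars=[g], out=None, formula=formula,
                         aliases=aliases, axis=1, reduction='Sum')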
            t_e = time.time()
            timings.append(t_e - t_s)
            print("Exp %s - N %d - Rep %d - %.2fs" % (exp, N, j, timings[-1]), flush=True)
            if exp['torch']:
                torch.cuda.empty_cache()
        exp['timings'].append(min(timings))
    return experiments


if __name__ == "__main__":
    init_opt = falkon.FalkonOptions(compute_arch_speed=False)
    initialization.init(init_opt)
    gpu_info = [v for k, v in devices.get_device_info(init_opt).items() if k >= 0]
    num_gpu = len(gpu_info)

    defaultN32 = [10_000, 20_000, 30_000, 40_000, 50_000, 65_000,
                  80_000, 100_000, 120_000, 140_000]
    # defaultN64 = [10_000, 20_000, 30_000, 40_000, 50_000, 65_000, 80_000]
    falkon.FalkonOptions(chol_force_ooc=True, chol_par_blk_multiplier=2,
                         compute_arch_speed=False)
    experiments = [
        {
            'name': 'Parallel 32',
def _get_cpu_ram(opt: BaseOptions, slack: float = 0.9) -> float:
    cpu_info = devices.get_device_info(opt)[-1]
    avail_mem = min(cpu_info.free_memory, opt.max_cpu_mem - cpu_info.used_memory)
    return avail_mem * slack
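# Sketch: deciding whether an (n x m) float64 matrix fits in the CPU RAM
# budget computed by the helper above (illustrative only).
def _example_fits_in_cpu_ram(n: int, m: int, opt: BaseOptions) -> bool:
    needed_bytes = n * m * 8  # float64 storage
    return needed_bytes <= _get_cpu_ram(opt, slack=0.9)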
def fit(self,
        X: torch.Tensor,
        Y: torch.Tensor,
        Xts: Optional[torch.Tensor] = None,
        Yts: Optional[torch.Tensor] = None):
    """Fits the Falkon KRR model.

    Parameters
    ----------
    X : torch.Tensor (2D)
        The tensor of training data, of shape [num_samples, num_dimensions].
        If X is in Fortran order (i.e. column-contiguous) then we can avoid
        an extra copy of the data.
    Y : torch.Tensor (1D or 2D)
        The tensor of training targets, of shape [num_samples, num_outputs].
        If X and Y represent a classification problem, Y can be encoded as a
        one-hot vector.
        If Y is in Fortran order (i.e. column-contiguous) then we can avoid
        an extra copy of the data.
    Xts : torch.Tensor (2D) or None
        Tensor of validation data, of shape [num_test_samples, num_dimensions].
        If validation data is provided and `error_fn` was specified when
        creating the model, they will be used to print the validation error
        during the optimization iterations.
        If Xts is in Fortran order (i.e. column-contiguous) then we can avoid
        an extra copy of the data.
    Yts : torch.Tensor (1D or 2D) or None
        Tensor of validation targets, of shape [num_test_samples, num_outputs].
        If validation data is provided and `error_fn` was specified when
        creating the model, they will be used to print the validation error
        during the optimization iterations.
        If Yts is in Fortran order (i.e. column-contiguous) then we can avoid
        an extra copy of the data.

    Returns
    -------
    model: Falkon
        The fitted model
    """
    if X.size(0) != Y.size(0):
        raise ValueError("X and Y must have the same number of "
                         "samples (found %d and %d)" % (X.size(0), Y.size(0)))
    if Y.dim() == 1:
        Y = torch.unsqueeze(Y, 1)
    if Y.dim() != 2:
        raise ValueError("Y is expected 1D or 2D. Found %dD." % (Y.dim()))
    if not check_same_dtype(X, Y):
        raise TypeError("X and Y must have the same data-type.")
    dtype = X.dtype

    # Decide whether to use CUDA for preconditioning based on M
    _use_cuda_preconditioner = (
        self.use_cuda_ and
        (not self.options.cpu_preconditioner) and
        self.M >= get_min_cuda_preconditioner_size(dtype)
    )
    _use_cuda_mmv = (
        self.use_cuda_ and
        X.shape[0] * X.shape[1] * self.M / self.num_gpus >= get_min_cuda_mmv_size(dtype)
    )

    self.fit_times_ = []
    self.ny_points_ = None
    self.alpha_ = None

    t_s = time.time()
    ny_points = self.center_selection.select(X, None, self.M)
    if self.use_cuda_:
        ny_points = ny_points.pin_memory()

    with TicToc("Calculating Preconditioner of size %d" % (self.M),
                debug=self.options.debug):
        pc_opt: FalkonOptions = dataclasses.replace(
            self.options, use_cpu=not _use_cuda_preconditioner)
        if pc_opt.debug:
            print("Preconditioner will run on %s" %
                  ("CPU" if pc_opt.use_cpu else ("%d GPUs" % self.num_gpus)))
        precond = falkon.preconditioner.FalkonPreconditioner(self.penalty, self.kernel, pc_opt)
        precond.init(ny_points)

    if _use_cuda_mmv:
        # Cache must be emptied to ensure enough memory is visible to the optimizer
        torch.cuda.empty_cache()
        X = X.pin_memory()

    # Decide whether it's worthwhile to pre-compute the k_NM kernel.
    # If we do not precompute K_NM, each CG iteration must also evaluate the
    # kernel: a single kernel evaluation between two D-dimensional vectors
    # costs D, and at each CG iteration we must perform N*M kernel evaluations.
    # Other than the kernel evaluations we must perform two matrix-vector
    # products 2*(N*M*T) and a bunch of triangular solves.
    #
    # So if we precompute we have 2*(N*M*T), otherwise we also have N*M*D,
    # but precomputing costs us N*M memory.
    # The heuristic is the following:
    #  - If D is large (here, > 1200) check if RAM is sufficient
    #  - If RAM is sufficient, precompute
    #  - Otherwise do not precompute
    Knm = None
    if X.size(1) > 1200:
        necessary_ram = X.size(0) * ny_points.size(0) * sizeof_dtype(dtype)
        k_opt = dataclasses.replace(self.options, use_cpu=True)
        cpu_info = get_device_info(k_opt)
        available_ram = min(k_opt.max_cpu_mem, cpu_info[-1].free_memory) * 0.9
        del k_opt
        if available_ram > necessary_ram:
            if self.options.debug:
                print("%d*%d Kernel matrix will be stored" %
                      (X.size(0), ny_points.size(0)))
            Knm = self.kernel(X, ny_points, opt=self.options)
            # TODO: Maybe we should do the same for Kts, but this complicates
            #       checks for fitting in memory
        elif self.options.debug:
            print("Cannot store full kernel matrix: not enough memory "
                  "(have %.2fGB, need %.2fGB)" %
                  (available_ram / 2**30, necessary_ram / 2**30))
    self.fit_times_.append(time.time() - t_s)  # Preparation time

    # Here we define the callback function which will run at the end of
    # conjugate gradient iterations. This function computes and displays
    # the validation error.
    val_cback = None
    if self.error_fn is not None and self.error_every is not None:
        def val_cback(it, beta, train_time):
            self.fit_times_.append(self.fit_times_[0] + train_time)
            if it % self.error_every != 0:
                print("Iteration %3d - Elapsed %.1fs" % (it, self.fit_times_[-1]),
                      flush=True)
                return
            err_str = "training" if Xts is None or Yts is None else "validation"
            alpha = precond.apply(beta)
            # Compute error: can be train or test
            if Xts is not None and Yts is not None:
                pred = self._predict(Xts, ny_points, alpha)
                err = self.error_fn(Yts, pred)
            else:
                pred = self._predict(X, ny_points, alpha)
                err = self.error_fn(Y, pred)
            err_name = "error"
            if isinstance(err, tuple) and len(err) == 2:
                err, err_name = err
            print("Iteration %3d - Elapsed %.1fs - %s %s: %.4f" %
                  (it, self.fit_times_[-1], err_str, err_name, err), flush=True)

    # Start with the falkon algorithm
    with TicToc('Computing Falkon iterations', debug=self.options.debug):
        o_opt: FalkonOptions = dataclasses.replace(self.options, use_cpu=not _use_cuda_mmv)
        if o_opt.debug:
            print("Optimizer will run on %s" %
                  ("CPU" if o_opt.use_cpu else ("%d GPUs" % self.num_gpus)), flush=True)
        optim = falkon.optim.FalkonConjugateGradient(self.kernel, precond, o_opt)
        if Knm is not None:
            beta = optim.solve(Knm, None, Y, self.penalty, initial_solution=None,
                               max_iter=self.maxiter, callback=val_cback)
        else:
            beta = optim.solve(X, ny_points, Y, self.penalty, initial_solution=None,
                               max_iter=self.maxiter, callback=val_cback)

    self.alpha_ = precond.apply(beta)
    self.ny_points_ = ny_points
    return self
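# A worked instance of the precompute heuristic above: with N = 1_000_000
# samples and M = 10_000 centers in float64, storing K_NM takes
# N * M * 8 bytes = 8e10 bytes (about 74.5 GiB), so on most machines the
# kernel would be re-evaluated at every CG iteration instead.
def _example_knm_storage_bytes(n: int = 1_000_000, m: int = 10_000) -> int:
    return n * m * 8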
def gpu_cholesky(A: torch.Tensor, upper: bool, clean: bool, overwrite: bool,
                 opt: FalkonOptions) -> torch.Tensor:
    """
    Parameters
    ----------
    A : torch.Tensor
        2D positive-definite matrix of size (n x n) that will be factorized as
        A = U.T @ U (if `upper` is True) or A = L @ L.T if `upper` is False.
    upper : bool
        Whether the triangle which should be factorized is the upper or lower of `A`.
    clean : bool
        Whether the "other" triangle of the output matrix (the one that does not
        contain the factorization) will be filled with zeros or not.
    overwrite : bool
        Whether to overwrite matrix A or to output the result in a new buffer.
    opt : FalkonOptions
        Options forwarded for block calculation, and other knobs in the out-of-core
        parallel POTRF implementation. Useful options are the ones defined in
        :class:`~falkon.options.CholeskyOptions`.

    Notes
    -----
    The factorization will always be the 'lower' version of the factorization
    which could however end up on the upper-triangular part of the matrix
    in case A is not Fortran contiguous to begin with.
    """
    # Handle 'overwrite' option immediately so that its usage is reflected in memory
    # availability (in case A is on GPU).
    if not overwrite:
        # We could change the stride to be more favorable to the POTRF requirements
        # but it gets complicated. We leave such decisions to the user!
        A = copy_same_stride(A, pin_memory=True)

    # Decide which version of the algo we run: can be in-core or parallel.
    # (Note that the original OOC version is not going to run).

    # Determine GPU free RAM
    gpu_info = [v for k, v in devices.get_device_info(opt).items() if k >= 0]
    for g in gpu_info:
        g.actual_free_mem = min((g.free_memory - 300 * 2**20) * 0.95,
                                opt.max_gpu_mem * 0.95)

    if A.is_cuda:
        try:
            device = [d for d in gpu_info if d.Id == A.device.index][0]
        except IndexError:
            # This should never happen!
            raise RuntimeError("Device of matrix A (%s) is not recognized" % (A.device))
    else:
        device = max(gpu_info, key=lambda g: g.actual_free_mem)
    ic = can_do_ic(A, device) and not opt.chol_force_ooc
    if opt.chol_force_in_core and not ic:
        raise RuntimeError("Cannot run in-core POTRF but `chol_force_in_core` was specified.")

    f_order = is_f_contig(A)
    transposed = False
    if not f_order:
        A = A.T
        upper = not upper
        transposed = True

    # Now A is always in f_order. So we can only allow upper=False (ooc)
    if upper:
        # Can do only in-core!
        if not ic:
            raise ValueError("GPU POTRF is only implemented on the "
                             "lower triangle for Fortran-ordered matrices (or on the upper "
                             "triangle for C-ordered matrices)")
    if not ic and A.is_cuda:
        _msg = "Cannot run out-of-core POTRF on CUDA matrix 'A'."
        if opt.chol_force_ooc:
            _msg += " Set the `chol_force_ooc` option to `False` to allow in-core POTRF."
        raise ValueError(_msg)

    # Handle different implementations for POTRF: in-core and out-of-core
    if ic:
        if opt.debug:
            print("Using in-core POTRF")
        _ic_cholesky(A, upper, device=device.Id,
                     cusolver_handle=initialization.cusolver_handle(device.Id))
    else:
        if opt.debug:
            print("Using parallel POTRF")
        _parallel_potrf_runner(A, opt, gpu_info)

    # Perform cleaning of the 'other side' of the matrix
    if clean:
        la_helpers.zero_triang(A, upper=not upper)

    # Undo previous matrix transformations
    if transposed:
        A = A.T

    return A
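# A minimal usage sketch for `gpu_cholesky` (illustrative: assumes a CUDA
# device and uses only names already imported in this module).
def _example_gpu_cholesky():
    A = torch.randn(1000, 1000, dtype=torch.float64)
    A = A @ A.T + 1000 * torch.eye(1000, dtype=torch.float64)  # positive-definite
    # A is symmetric, so its transpose is an equal-valued Fortran-ordered view,
    # which keeps us on the fast lower-triangular path.
    L = gpu_cholesky(A.T, upper=False, clean=True, overwrite=False,
                     opt=FalkonOptions())
    return L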
def fit(self,
        X: torch.Tensor,
        Y: torch.Tensor,
        Xts: Optional[torch.Tensor] = None,
        Yts: Optional[torch.Tensor] = None):
    """Fits the Falkon KRR model.

    Parameters
    ----------
    X : torch.Tensor
        The tensor of training data, of shape [num_samples, num_dimensions].
        If X is in Fortran order (i.e. column-contiguous) then we can avoid
        an extra copy of the data.
    Y : torch.Tensor
        The tensor of training targets, of shape [num_samples, num_outputs].
        If X and Y represent a classification problem, Y can be encoded as a
        one-hot vector.
        If Y is in Fortran order (i.e. column-contiguous) then we can avoid
        an extra copy of the data.
    Xts : torch.Tensor or None
        Tensor of validation data, of shape [num_test_samples, num_dimensions].
        If validation data is provided and `error_fn` was specified when
        creating the model, they will be used to print the validation error
        during the optimization iterations.
        If Xts is in Fortran order (i.e. column-contiguous) then we can avoid
        an extra copy of the data.
    Yts : torch.Tensor or None
        Tensor of validation targets, of shape [num_test_samples, num_outputs].
        If validation data is provided and `error_fn` was specified when
        creating the model, they will be used to print the validation error
        during the optimization iterations.
        If Yts is in Fortran order (i.e. column-contiguous) then we can avoid
        an extra copy of the data.

    Returns
    -------
    model: Falkon
        The fitted model
    """
    X, Y, Xts, Yts = self._check_fit_inputs(X, Y, Xts, Yts)
    dtype = X.dtype
    self.fit_times_ = []
    self.ny_points_ = None
    self.alpha_ = None

    # Start training timer
    t_s = time.time()

    # Pick Nystrom centers
    if self.weight_fn is not None:
        # noinspection PyTupleAssignmentBalance
        ny_points, ny_indices = self.center_selection.select_indices(X, None)
    else:
        # noinspection PyTypeChecker
        ny_points: Union[torch.Tensor, falkon.sparse.SparseTensor] = \
            self.center_selection.select(X, None)
        ny_indices = None
    num_centers = ny_points.shape[0]

    # Decide whether to use CUDA for preconditioning and iterations, based on
    # the number of centers
    _use_cuda_preconditioner = (
        self.use_cuda_ and
        (not self.options.cpu_preconditioner) and
        num_centers >= get_min_cuda_preconditioner_size(dtype, self.options)
    )
    _use_cuda_mmv = (
        self.use_cuda_ and
        X.shape[0] * X.shape[1] * num_centers / self.num_gpus >=
        get_min_cuda_mmv_size(dtype, self.options)
    )

    if self.use_cuda_:
        ny_points = ny_points.pin_memory()

    with TicToc("Calculating Preconditioner of size %d" % (num_centers),
                debug=self.options.debug):
        pc_opt: FalkonOptions = dataclasses.replace(
            self.options, use_cpu=not _use_cuda_preconditioner)
        if pc_opt.debug:
            print("Preconditioner will run on %s" %
                  ("CPU" if pc_opt.use_cpu else ("%d GPUs" % self.num_gpus)))
        precond = falkon.preconditioner.FalkonPreconditioner(
            self.penalty, self.kernel, pc_opt)
        ny_weight_vec = None
        if self.weight_fn is not None:
            ny_weight_vec = self.weight_fn(Y[ny_indices])
        precond.init(ny_points, weight_vec=ny_weight_vec)

    if _use_cuda_mmv:
        # Cache must be emptied to ensure enough memory is visible to the optimizer
        torch.cuda.empty_cache()
        X = X.pin_memory()

    # K_NM storage decision
    k_opt = dataclasses.replace(self.options, use_cpu=True)
    cpu_info = get_device_info(k_opt)
    available_ram = min(k_opt.max_cpu_mem, cpu_info[-1].free_memory) * 0.9
    if self._can_store_knm(X, ny_points, available_ram):
        Knm = self.kernel(X, ny_points, opt=self.options)
    else:
        Knm = None
    self.fit_times_.append(time.time() - t_s)  # Preparation time

    # Here we define the callback function which will run at the end of
    # conjugate gradient iterations. This function computes and displays
    # the validation error.
    validation_cback = None
    if self.error_fn is not None and self.error_every is not None:
        validation_cback = self._get_callback_fn(X, Y, Xts, Yts, ny_points, precond)

    # Start with the falkon algorithm
    with TicToc('Computing Falkon iterations', debug=self.options.debug):
        o_opt: FalkonOptions = dataclasses.replace(self.options, use_cpu=not _use_cuda_mmv)
        if o_opt.debug:
            print("Optimizer will run on %s" %
                  ("CPU" if o_opt.use_cpu else ("%d GPUs" % self.num_gpus)), flush=True)
        optim = falkon.optim.FalkonConjugateGradient(self.kernel, precond, o_opt,
                                                     weight_fn=self.weight_fn)
        if Knm is not None:
            beta = optim.solve(Knm, None, Y, self.penalty, initial_solution=None,
                               max_iter=self.maxiter, callback=validation_cback)
        else:
            beta = optim.solve(X, ny_points, Y, self.penalty, initial_solution=None,
                               max_iter=self.maxiter, callback=validation_cback)

    self.alpha_ = precond.apply(beta)
    self.ny_points_ = ny_points
    return self
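# End-to-end fit/predict sketch for the estimator above (illustrative: it
# assumes the public `falkon.Falkon` estimator and `falkon.kernels.GaussianKernel`,
# as used elsewhere in the library).
def _example_falkon_fit():
    kernel = falkon.kernels.GaussianKernel(sigma=3.0)
    model = falkon.Falkon(kernel=kernel, penalty=1e-6, M=500,
                          options=FalkonOptions(debug=False))
    X = torch.randn(5000, 10, dtype=torch.float64)
    Y = torch.sin(X.sum(dim=1, keepdim=True))
    model.fit(X, Y)
    return model.predict(X)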
def run_keops_mmv(X1: torch.Tensor,
                  X2: torch.Tensor,
                  v: torch.Tensor,
                  other_vars: List[torch.Tensor],
                  out: Optional[torch.Tensor],
                  formula: str,
                  aliases: List[str],
                  axis: int,
                  reduction: str = 'Sum',
                  opt: Optional[FalkonOptions] = None) -> torch.Tensor:
    if opt is None:
        opt = FalkonOptions()
    # Choose backend
    N, D = X1.shape
    M = X2.shape[0]
    T = v.shape[1]
    backend = _decide_backend(opt, D)
    dtype = _keops_dtype(X1.dtype)

    # Define formula wrapper
    fn = Genred(formula, aliases, reduction_op=reduction, axis=axis,
                dtype=dtype, dtype_acc=opt.keops_acc_dtype,
                sum_scheme=opt.keops_sum_scheme)

    # Compile on a small data subset
    small_data_variables = [X1[:100], X2[:10], v[:10]] + other_vars
    small_data_out = torch.empty((100, T), dtype=X1.dtype, device=X1.device)
    fn(*small_data_variables, out=small_data_out, backend=backend)

    # Create output matrix
    if out is None:
        # noinspection PyArgumentList
        out = torch.empty(N, T, dtype=X1.dtype, device='cpu',
                          pin_memory=backend != 'CPU')

    if backend.startswith("GPU"):
        # Info about GPUs
        ram_slack = 0.7  # slack is high due to imprecise memory usage estimates
        gpu_info = [dinfo for k, dinfo in devices.get_device_info(opt).items() if k >= 0]
        gpu_ram = [
            min((g.free_memory - 300 * 2**20) * ram_slack, opt.max_gpu_mem * ram_slack)
            for g in gpu_info
        ]
        block_sizes = calc_gpu_block_sizes(gpu_info, N)

        # Create queues
        args = []  # Arguments passed to each subprocess
        for i in range(len(gpu_info)):
            # First round of subdivision
            bwidth = block_sizes[i + 1] - block_sizes[i]
            if bwidth <= 0:
                continue
            args.append((ArgsFmmv(X1=X1.narrow(0, block_sizes[i], bwidth),
                                  X2=X2,
                                  v=v,
                                  out=out.narrow(0, block_sizes[i], bwidth),
                                  other_vars=other_vars,
                                  function=fn,
                                  backend=backend,
                                  gpu_ram=gpu_ram[i]),
                         gpu_info[i].Id))
        _start_wait_processes(_single_gpu_method, args)
    else:
        # Run on CPU
        variables = [X1, X2, v] + other_vars
        out = fn(*variables, out=out, backend=backend)

    return out