def load_csv(
    path,
    header_lines=0,
    sep=",",
    dtype=types.float32,
    encoding="UTF-8",
    split=None,
    device=None,
    comm=MPI_WORLD,
):
    """
    Loads data from a CSV file. The data will be distributed along axis 0.

    Parameters
    ----------
    path : str
        Path to the CSV file to be read.
    header_lines : int, optional
        The number of lines at the beginning of the file that should not be considered as data.
        default: 0.
    sep : str, optional
        The single char or string that separates the values in each row.
        default: ','
    dtype : ht.dtype, optional
        Data type of the resulting array; default: ht.float32.
    encoding : str, optional
        The type of encoding which will be used to interpret the lines of the csv file as
        strings. default: 'UTF-8'
    split : None, 0, 1, optional
        Along which axis the resulting tensor should be split.
        Default is None which means each node will have the full tensor.
    device : None or str, optional
        The device id on which to place the data, defaults to globally set default device.
    comm : Communication, optional
        The communication to use for the data distribution. Defaults to MPI_COMM_WORLD.

    Returns
    -------
    out : ht.DNDarray
        Data read from the CSV file.

    Raises
    ------
    TypeError
        If any of the input parameters are not of correct type.
    ValueError
        If split is not one of [None, 0, 1].

    Examples
    --------
    >>> import heat as ht
    >>> a = ht.load_csv('data.csv')
    >>> a.shape
    [0/3] (150, 4)
    [1/3] (150, 4)
    [2/3] (150, 4)
    [3/3] (150, 4)
    >>> a.lshape
    [0/3] (38, 4)
    [1/3] (38, 4)
    [2/3] (37, 4)
    [3/3] (37, 4)
    >>> b = ht.load_csv('data.csv', header_lines=10)
    >>> b.shape
    [0/3] (140, 4)
    [1/3] (140, 4)
    [2/3] (140, 4)
    [3/3] (140, 4)
    >>> b.lshape
    [0/3] (35, 4)
    [1/3] (35, 4)
    [2/3] (35, 4)
    [3/3] (35, 4)
    """
    if not isinstance(path, str):
        raise TypeError("path must be str, not {}".format(type(path)))
    if not isinstance(sep, str):
        raise TypeError("separator must be str, not {}".format(type(sep)))
    if not isinstance(header_lines, int):
        raise TypeError("header_lines must be int, not {}".format(type(header_lines)))
    if split not in [None, 0, 1]:
        raise ValueError("split must be in [None, 0, 1], but is {}".format(split))

    # infer the type and communicator for the loaded array
    dtype = types.canonical_heat_type(dtype)
    # determine the comm and device the data will be placed on
    device = devices.sanitize_device(device)
    comm = sanitize_comm(comm)

    file_size = os.stat(path).st_size
    rank = comm.rank
    size = comm.size

    if split is None:
        # every process reads the full file and parses it line by line
        with open(path) as f:
            data = f.readlines()
            data = data[header_lines:]
            result = []
            for line in data:
                values = line.replace("\n", "").replace("\r", "").split(sep)
                values = [float(val) for val in values]
                result.append(values)
            resulting_tensor = factories.array(
                result, dtype=dtype, split=split, device=device, comm=comm
            )
    elif split == 0:
        counts, displs, _ = comm.counts_displs_shape((file_size, 1), 0)
        # in case lines are terminated with '\r\n' we need to skip 2 bytes later
        lineter_len = 1
        # Read a chunk of bytes and count the linebreaks
        with open(path, "rb") as f:
            f.seek(displs[rank], 0)
            line_starts = []
            r = f.read(counts[rank])
            for pos, byte in enumerate(r):
                if chr(byte) == "\n":
                    # Check if it is part of '\r\n'
                    if not chr(r[pos - 1]) == "\r":
                        line_starts.append(pos + 1)
                elif chr(byte) == "\r":
                    # check if the line is terminated by '\r\n'
                    if pos + 1 < len(r) and chr(r[pos + 1]) == "\n":
                        line_starts.append(pos + 2)
                        lineter_len = 2
                    else:
                        line_starts.append(pos + 1)

            if rank == 0:
                line_starts = [0] + line_starts

            # Find the correct starting point
            total_lines = torch.empty(size, dtype=torch.int32)
            comm.Allgather(torch.tensor([len(line_starts)], dtype=torch.int32), total_lines)
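
            # Each process now knows how many line starts every process found;
            # the cumulative sum below locates this process's lines in the
            # global ordering, so the first `header_lines` lines can be skipped
            # regardless of which process's byte chunk they fall into.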
            cumsum = total_lines.cumsum(dim=0).tolist()
            start = next(i for i in range(size) if cumsum[i] > header_lines)
            if rank < start:
                line_starts = []
            if rank == start:
                rem = header_lines - (0 if start == 0 else cumsum[start - 1])
                line_starts = line_starts[rem:]

            # Determine the number of columns that each line consists of
            if len(line_starts) > 1:
                columns = 1
                for li in r[line_starts[0] : line_starts[1]]:
                    if chr(li) == sep:
                        columns += 1
            else:
                columns = 0
            columns = torch.tensor([columns], dtype=torch.int32)
            comm.Allreduce(MPI.IN_PLACE, columns, MPI.MAX)

            # Share how far the processes need to read in their last line
            last_line = file_size
            if size - start > 1:
                if rank == start:
                    last_line = torch.empty(1, dtype=torch.int32)
                    comm.Recv(last_line, source=rank + 1)
                    last_line = last_line.item()
                elif rank == size - 1:
                    first_line = torch.tensor(displs[rank] + line_starts[0] - 1, dtype=torch.int32)
                    comm.Send(first_line, dest=rank - 1)
                elif start < rank < size - 1:
                    last_line = torch.empty(1, dtype=torch.int32)
                    first_line = torch.tensor(displs[rank] + line_starts[0] - 1, dtype=torch.int32)
                    comm.Send(first_line, dest=rank - 1)
                    comm.Recv(last_line, source=rank + 1)
                    last_line = last_line.item()

            # Create empty tensor and iteratively fill it with the values
            local_shape = (len(line_starts), columns)
            actual_length = 0
            local_tensor = torch.empty(
                local_shape, dtype=dtype.torch_type(), device=device.torch_device
            )
            for ind, line_start in enumerate(line_starts):
                if ind == len(line_starts) - 1:
                    # the last line may extend past this process's byte chunk,
                    # so read it directly from the file up to `last_line`
                    f.seek(displs[rank] + line_start, 0)
                    line = f.read(last_line - displs[rank] - line_start)
                else:
                    line = r[line_start : line_starts[ind + 1] - lineter_len]
                # Decode byte array
                line = line.decode(encoding)
                if len(line) > 0:
                    sep_values = [float(val) for val in line.split(sep)]
                    local_tensor[actual_length] = torch.tensor(sep_values, dtype=dtype.torch_type())
                    actual_length += 1
            # In case there are some empty lines in the csv file
            local_tensor = local_tensor[:actual_length]

        resulting_tensor = factories.array(
            local_tensor, dtype=dtype, is_split=0, device=device, comm=comm
        )
        resulting_tensor.balance_()
    elif split == 1:
        data = []
        with open(path) as f:
            # skip the header, then use the first data line to determine the
            # number of columns and this process's share of them
            for i in range(header_lines):
                f.readline()
            line = f.readline()
            values = line.replace("\n", "").replace("\r", "").split(sep)
            values = [float(val) for val in values]
            rows = len(values)

            chunk, displs, _ = comm.counts_displs_shape((1, rows), 1)
            data.append(values[displs[rank] : displs[rank] + chunk[rank]])
            # Read file line by line till EOF reached
            for line in iter(f.readline, ""):
                values = line.replace("\n", "").replace("\r", "").split(sep)
                values = [float(val) for val in values]
                data.append(values[displs[rank] : displs[rank] + chunk[rank]])
        resulting_tensor = factories.array(data, dtype=dtype, is_split=1, device=device, comm=comm)

    return resulting_tensor
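
# A hedged usage sketch (not part of the library): under an MPI launch, each
# process ends up with a contiguous block of rows when split=0. The file name
# "demo_data.csv", the script name and the launcher invocation are
# illustrative only.
#
#     $ mpirun -np 4 python demo.py
#
#     # demo.py
#     import heat as ht
#
#     a = ht.load_csv("demo_data.csv", sep=",", split=0)
#     print(a.shape)   # global shape, identical on every process
#     print(a.lshape)  # this process's local block of rows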
def assert_func_equal_for_tensor(
    self,
    tensor,
    heat_func,
    numpy_func,
    heat_args=None,
    numpy_args=None,
    distributed_result=True,
):
    """
    Tests whether the heat function and the numpy function produce the same result on the given
    tensor.

    Parameters
    ----------
    tensor : torch.Tensor or numpy.ndarray
        The tensor on which the heat function will be executed.
    heat_func : function
        The function that is to be tested.
    numpy_func : function
        The numpy implementation of an equivalent function to test against.
    heat_args : dictionary, optional
        The keyword arguments that will be passed to the heat function. The array and the split
        do not need to be specified. Default is {}.
    numpy_args : dictionary, optional
        The keyword arguments that will be passed to the numpy function. The array does not
        need to be specified. Default is {}.
    distributed_result : bool, optional
        Specify whether the result of the heat function is distributed across all nodes or all
        nodes have the full result. Default is True.

    Raises
    ------
    AssertionError
        If the functions do not perform equally.

    Examples
    --------
    >>> import numpy as np
    >>> import torch
    >>> import heat as ht
    >>> a = np.arange(10)
    >>> self.assert_func_equal_for_tensor(a, ht.exp, np.exp)
    >>> self.assert_func_equal_for_tensor(a, ht.exp, np.log)
    AssertionError: [...]
    >>> self.assert_func_equal_for_tensor(a, ht.any, np.any, distributed_result=False)
    >>> a = torch.ones([5, 5, 5, 5])
    >>> heat_args = {'sorted': True, 'axis': 0}
    >>> numpy_args = {'axis': 0}
    >>> self.assert_func_equal_for_tensor(a, ht.unique, np.unique, heat_args=heat_args, numpy_args=numpy_args)
    """
    self.assertTrue(callable(heat_func))
    self.assertTrue(callable(numpy_func))

    if heat_args is None:
        heat_args = {}
    if numpy_args is None:
        numpy_args = {}

    if isinstance(tensor, np.ndarray):
        torch_tensor = torch.from_numpy(tensor.copy())
        np_array = tensor
    elif isinstance(tensor, torch.Tensor):
        torch_tensor = tensor
        np_array = tensor.numpy().copy()
    else:
        raise TypeError(
            "The input tensor's type must be one of [numpy.ndarray, torch.Tensor] "
            "but is {}".format(type(tensor))
        )

    np_res = numpy_func(np_array, **numpy_args)
    if not isinstance(np_res, np.ndarray):
        np_res = np.array([np_res])

    dtype = types.canonical_heat_type(torch_tensor.dtype)
    # run the heat function once for every possible split axis and compare
    # each result against the numpy reference
    for i in range(len(tensor.shape)):
        ht_array = factories.array(
            torch_tensor, split=i, dtype=dtype, device=self.device, comm=self.comm
        )
        ht_res = heat_func(ht_array, **heat_args)

        self.assertEqual(ht_array.device, ht_res.device)
        self.assertEqual(ht_array._DNDarray__array.device, ht_res._DNDarray__array.device)

        if distributed_result:
            self.assert_array_equal(ht_res, np_res)
        else:
            self.assertTrue(np.array_equal(ht_res._DNDarray__array.cpu().numpy(), np_res))
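
# A hedged usage sketch (class and method names are illustrative only): inside
# a test case that provides ``self.device`` and ``self.comm``, as assumed by
# the helper above, a heat function can be checked against its numpy
# counterpart for every split axis with a single call.
#
#     class TestExponential(TestCase):
#         def test_exp_matches_numpy(self):
#             data = np.arange(24, dtype=np.float32).reshape(4, 6)
#             self.assert_func_equal_for_tensor(data, ht.exp, np.exp)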