def __init__(self, context):
    """Prepare one random masked numpy column for the frame.

    A random (non-void) column type and a random missing-value fraction
    are drawn, a column of ``self.frame.nrows`` values is generated, and
    a numpy masked array is built from it. The column name, type, numpy
    dtype, masked array, mask and python-side data (with ``None`` written
    at masked positions) are stored on the instance.

    NOTE(review): ``self.frame`` is presumably set by the parent
    initializer -- confirm against the base class.
    """
    super().__init__(context)

    # Numpy offers nothing comparable to our "void" column, so that
    # random type is excluded up front.
    column_type = random_type(allow_void=False)
    missing_share = random.random()
    values, mask = random_column(
        self.frame.nrows,
        column_type,
        missing_fraction=missing_share,
        missing_nones=False,
    )

    # numpy's default integer is np.int64 on Linux/Mac but np.int32 on
    # Windows; pin it to np.int64 so results are platform-independent.
    if column_type == int:
        dtype = np.int64
    else:
        dtype = np.dtype(column_type)
    masked_array = np.ma.array(values, mask=mask, dtype=dtype)

    # Roughly half of the arrays are registered for a deferred check to
    # make sure munging does not alter them. The rest are left for Python
    # to garbage-collect eventually, which exercises datatable's
    # behaviour in that situation as well.
    keep_for_check = random.random() > 0.5
    if keep_for_check:
        context.add_deferred_check(deferred_nparray_check(masked_array))

    # Mirror the mask in the python-side data: masked entries become None.
    masked_positions = [pos for pos in range(len(values)) if mask[pos]]
    for pos in masked_positions:
        values[pos] = None

    self.column_name = random_names(1)[0]
    self.column_type = column_type
    self.np_dtype = dtype
    self.np_data = masked_array
    self.np_mask = mask
    self.py_data = values
def cbind_numpy_column(self):
    """Append one random column, built from a numpy masked array, to the frame.

    A random column type and missing-value fraction are drawn and a
    column of ``self.nrows`` values is generated. The values are wrapped
    in a numpy masked array, converted to a single-column ``dt.Frame``
    and cbound to ``self.df``. The python-side bookkeeping (``data``,
    ``types``, ``names``) is extended to match, with ``None`` stored at
    masked positions, and names are de-duplicated afterwards.
    """
    import numpy as np

    column_type = random_type()
    missing_share = random.random()
    values, mask = random_column(self.nrows, column_type, missing_share, False)

    # numpy's default integer is np.int64 on Linux/Mac but np.int32 on
    # Windows; pin it to np.int64 so results are platform-independent.
    if column_type == int:
        dtype = np.int64
    else:
        dtype = np.dtype(column_type)
    masked_array = np.ma.array(values, mask=mask, dtype=dtype)

    # Roughly half of the arrays are retained, together with a deep copy,
    # to make sure munging does not alter them. The rest are left for
    # Python to garbage-collect eventually, which exercises datatable's
    # behaviour in that situation as well.
    keep_array = random.random() > 0.5
    if keep_array:
        self.np_data.append(masked_array)
        self.np_data_deepcopy.append(copy.deepcopy(masked_array))

    new_names = random_names(1)
    new_frame = dt.Frame(masked_array.T, names=new_names)

    # Mirror the mask in the python-side data: masked entries become None.
    masked_rows = [row for row in range(self.nrows) if mask[row]]
    for row in masked_rows:
        values[row] = None

    self.df.cbind(new_frame)
    self.data.append(values)
    self.types.append(column_type)
    self.names.extend(new_names)
    self.dedup_names()
def random(ncols=None, nrows=None, types=None, names=None,
           missing_fraction=None):
    """Create a MetaFrame filled with randomly generated data.

    Any argument left as ``None`` (or, for ``types``/``names``, empty) is
    chosen randomly:

    * ``ncols`` -- taken from ``len(types)`` or ``len(names)`` when one of
      them is given, otherwise drawn from an exponential distribution;
    * ``nrows`` -- drawn from an exponential distribution;
    * ``types`` / ``names`` -- one random entry per column;
    * ``missing_fraction`` -- ``random.random()**10``, reset to ``0.0``
      when it falls below 0.05 or the frame has a single row.

    The chosen parameters and a ``dt.Frame(...)`` construction snippet
    are printed. Returns the populated ``MetaFrame``.
    """
    if ncols is None:
        if types:
            ncols = len(types)
        elif names:
            ncols = len(names)
        else:
            ncols = int(random.expovariate(0.2)) + 1
    if nrows is None:
        nrows = int(random.expovariate(0.01)) + 1
    types = types or [random_type() for _ in range(ncols)]
    names = names or random_names(ncols)
    if missing_fraction is None:
        missing_fraction = random.random()**10
        # Tiny fractions and single-row frames get no missing values.
        if nrows == 1 or missing_fraction < 0.05:
            missing_fraction = 0.0

    assert isinstance(ncols, int)
    assert isinstance(nrows, int)
    assert isinstance(types, list) and len(types) == ncols
    assert isinstance(names, list) and len(names) == ncols
    assert isinstance(missing_fraction, float)

    # Count columns per type; a type outside these four raises KeyError.
    type_counts = dict.fromkeys((bool, int, float, str), 0)
    for column_type in types:
        type_counts[column_type] += 1
    assert len(type_counts) == 4

    print("# Making a frame with nrows=%d, ncols=%d" % (nrows, ncols))
    print("# types: bool=%d, int=%d, float=%d, str=%d"
          % (type_counts[bool], type_counts[int],
             type_counts[float], type_counts[str]))
    print("# missing values: %.3f" % missing_fraction)

    # Only the generated values are kept; the mask returned alongside
    # them is discarded.
    data = []
    for col_index in range(ncols):
        generated = random_column(nrows, types[col_index], missing_fraction)
        data.append(generated[0])

    frame = MetaFrame()
    frame.data = data
    frame.names = names
    frame.types = types
    frame.nkeys = 0
    frame.df = dt.Frame(data, names=names, stypes=types)

    print(f"{frame.name} = dt.Frame(")
    print(f" {repr_data(data, 4)},")
    print(f" names={names},")
    print(f" stypes={repr_types(types)}")
    print(f")")
    return frame