def _compare_value(self, other, op):
    """Select the rows whose cell satisfies `op(cell, other)`.

    Every cell of this column is compared against the single value `other`.
    Cells for which the comparison raises an exception are treated as
    non-matching (best effort).

    Arguments:
        other: The value that each cell is compared against.
        op: A two-argument callable, invoked as `op(cell, other)`.

    Returns:
        Whatever `self._datamatrix._selectrowid()` returns for the index of
        matching row ids.
    """
    _rowid = Index(0)  # Index of length 0, extended with matching row ids
    for rowid, val in zip(self._rowid, self._seq):
        try:
            if op(val, other):
                _rowid.append(rowid)
        except Exception:
            # Incomparable cells simply do not match. Only Exception is
            # caught so that KeyboardInterrupt and SystemExit still
            # propagate instead of being silently swallowed.
            pass
    return self._datamatrix._selectrowid(_rowid)
def _upgrade_datamatrix_index(dm):
    """Fixes the Index object of deprecated versions of DataMatrix.

    The row index of the DataMatrix itself and of each of its columns is
    replaced by a freshly constructed Index. If the old index object has an
    `_l` attribute, the new Index is built from that attribute; otherwise it
    is built from the old index object directly. The same rule is applied to
    the DataMatrix and to its columns.

    Arguments:
        dm: The DataMatrix to upgrade. It is modified in place.

    Returns:
        The same `dm` object.
    """
    from datamatrix._datamatrix._index import Index

    def _upgraded(rowid):
        # Old-style indices keep their payload in `_l`; anything else is
        # handed to the Index constructor as is.
        return Index(getattr(rowid, '_l', rowid))

    # object.__setattr__ is used to write the attribute directly on the
    # object rather than through the class's own attribute assignment.
    object.__setattr__(dm, '_rowid', _upgraded(dm._rowid))
    for _, col in dm.columns:
        object.__setattr__(col, '_rowid', _upgraded(col._rowid))
    return dm
def _compare_sequence(self, other, op):
    """Select the rows whose cell satisfies `op(cell, ref)` element-wise.

    `other` is first passed through `self._tosequence()`, and each cell is
    then compared against the reference value at the same position. Because
    the values are paired with `zip()`, pairing stops at the shortest input.
    Pairs for which the comparison raises an exception are treated as
    non-matching (best effort).

    Arguments:
        other: The object to compare against, converted with
            `self._tosequence()`.
        op: A two-argument callable, invoked as `op(cell, ref)`.

    Returns:
        Whatever `self._datamatrix._selectrowid()` returns for the index of
        matching row ids.
    """
    _rowid = Index(0)  # Index of length 0, extended with matching row ids
    references = self._tosequence(other)
    for rowid, val, ref in zip(self._rowid, self._seq, references):
        try:
            if op(val, ref):
                _rowid.append(rowid)
        except Exception:
            # Incomparable pairs simply do not match. Only Exception is
            # caught so that KeyboardInterrupt and SystemExit still
            # propagate instead of being silently swallowed.
            pass
    return self._datamatrix._selectrowid(_rowid)
def _compare_set(self, other, op): if op == operator.__eq__: test = lambda val: any(val == v for v in other) elif op == operator.__ne__: test = lambda val: all(val != v for v in other) else: raise TypeError('sets can only be compared with == or !=') _rowid = Index(0) for rowid, val in zip(self._rowid, self._seq): try: if test(val): _rowid.append(rowid) except: pass return self._datamatrix._selectrowid(_rowid)
def _merge(self, other, _rowid):
    """
    visible: False

    desc:
        Merges this column with another column, selecting only the rows
        indicated by _rowid. For each requested row id, the value is taken
        from this column if the row id occurs in it, and from the other
        column otherwise.

    arguments:
        other:  Another column.
        _rowid: A list of row ids to select.

    returns:
        type: BaseColumn
    """

    def first_positions(rowids):
        # Maps each row id to the position of its first occurrence, which
        # is the position that a linear `.index()` search would find.
        positions = {}
        for pos, rowid in enumerate(rowids):
            positions.setdefault(rowid, pos)
        return positions

    # Build the lookup tables once, so that each requested row costs a
    # dictionary lookup instead of a linear scan of the row index.
    own_pos = first_positions(self._rowid)
    other_pos = first_positions(other._rowid)
    col = self._empty_col()
    col._rowid = Index(_rowid)
    col._seq = []
    for row in _rowid:
        if row in own_pos:
            col._seq.append(self._seq[own_pos[row]])
        elif row in other_pos:
            col._seq.append(other._seq[other_pos[row]])
        else:
            # The row id is in neither column: use the original lookup so
            # that the failure mode stays exactly what it used to be.
            col._seq.append(other._seq[other._rowid.index(row)])
    return col
def _merge(self, other, _rowid):
    """
    visible: False

    desc:
        Combines this column with another column into a new column that
        contains only the rows listed in _rowid. A value comes from this
        column when its row id occurs here, and from the other column
        otherwise.

    arguments:
        other:  Another column.
        _rowid: A list of row ids to select.

    returns:
        type: BaseColumn
    """
    col = self._empty_col()
    col._rowid = Index(_rowid)
    # Pre-size the value list; every slot is filled in the loop below
    col._seq = [None] * len(_rowid)
    in_self = set(self._rowid).__contains__
    for pos, row in enumerate(_rowid):
        if in_self(row):
            values, rowids = self._seq, self._rowid
        else:
            values, rowids = other._seq, other._rowid
        col._seq[pos] = values[rowids.index(row)]
    return col
def _setlength(self, value):
    """
    visible: False

    desc: |
        Resizes the current DataMatrix to the given length. Rows are removed
        from the end when shrinking, and new rows are added when growing.

        *This modifies the current DataMatrix.*

        __Note__: The preferred way to change the length is by setting the
        length property:

        ~~~
        dm.length = 10
        ~~~

    arguments:
        value:
            desc:   The new length.
            type:   int
    """
    current = len(self)
    if value < current:
        # Shrink: truncate the row index and every column
        object.__setattr__(self, u'_rowid', self._rowid[:value])
        for name, col in self._cols.items():
            self._cols[name] = col[:value]
    else:
        # Grow: new row ids continue after the highest existing row id
        if current:
            startid = self._rowid.max + 1
        else:
            startid = 0
        n_new = value - current
        rowid = Index([i + startid for i in range(n_new)])
        object.__setattr__(self, u'_rowid', self._rowid.copy() + rowid)
        for col in self._cols.values():
            col._addrowid(rowid)
    self._mutate()
def __init__(self, length=0, default_col_type=MixedColumn, **columns):
    """
    desc:
        Constructor.

    keywords:
        length:
            desc:   The initial number of rows of the DataMatrix.
            type:   int

    keyword-dict:
        columns:    Columns can be created right away by passing them as
                    keywords. The keyword is used as the column name and
                    the value as the initial value of the column.
    """
    global _id
    try:
        length = int(length)
    except ValueError:
        raise TypeError('length should be an integer')
    # Attributes are written through object.__setattr__ rather than through
    # regular attribute assignment on self.
    assign = object.__setattr__
    assign(self, u'_cols', OrderedDict())
    assign(self, u'_rowid', Index(length))
    assign(self, u'_default_col_type', default_col_type)
    assign(self, u'_id', _id)
    assign(self, u'_sorted', True)
    # Advance the module-level counter so the next instance gets a new id
    _id += 1
    for column_name in columns:
        self[column_name] = columns[column_name]
def shuffle(obj):
    """
    desc: |
        Randomly reorders a DataMatrix or a column. When a DataMatrix is
        shuffled, the rows change order, but values that were in the same
        row will stay in the same row.

        __Example:__

        %--
        python: |
         from datamatrix import DataMatrix, operations

         dm = DataMatrix(length=5)
         dm.A = 'a', 'b', 'c', 'd', 'e'
         dm.B = operations.shuffle(dm.A)
         print(dm)
        --%

    arguments:
        obj:
            type:   [DataMatrix, BaseColumn]

    returns:
        desc:   The shuffled DataMatrix or column.
        type:   [DataMatrix, BaseColumn]
    """
    # Shuffle a new Index built from the row ids, not obj._rowid itself
    shuffled_ids = Index(obj._rowid)
    random.shuffle(shuffled_ids)
    if not isinstance(obj, DataMatrix):
        shuffled_col = obj._getrowidkey(shuffled_ids)
        # The result is given the original row index of the column
        shuffled_col._rowid = obj._rowid
        return shuffled_col
    return obj._selectrowid(shuffled_ids)
def realdata():
    """Read 'data/real-data.pkl' and return it as a DataMatrix.

    The length of the DataMatrix is printed before it is returned.
    """
    dm = io.readpickle('data/real-data.pkl')
    # A pickled DataMatrix may still carry a plain list as its row index.
    # In that case the list is wrapped in an Index object (a hack to keep
    # old buffered data usable).
    has_list_index = isinstance(dm._rowid, list)
    if has_list_index:
        from datamatrix._datamatrix._index import Index
        object.__setattr__(dm, u'_rowid', Index(dm._rowid))
    print(len(dm))
    return dm
def filter_(fnc, obj):
    """
    desc: |
        Selects rows from a datamatrix or column with a filter function
        (`fnc`).

        If `obj` is a column, `fnc` should be a function that accepts a
        single value. If `obj` is a datamatrix, `fnc` should be a function
        that accepts a keyword `dict`, where column names are keys and cells
        are values. In both cases, `fnc` should return a `bool` indicating
        whether the row or value should be included.

        *New in v0.8.0*: You can also directly compare a column with a
        function or `lambda` expression. However, this is different from
        `filter_()` in that it returns a datamatrix object and not a column.

        __Example:__

        %--
        python: |
         from datamatrix import DataMatrix, functional as fnc

         dm = DataMatrix(length=5)
         dm.col = range(5)
         # Create a column with only odd values
         col_new = fnc.filter_(lambda x: x % 2, dm.col)
         print(col_new)
         # Create a new datamatrix with only odd values in col
         dm_new = fnc.filter_(lambda **d: d['col'] % 2, dm)
         print(dm_new)
        --%

    arguments:
        fnc:
            desc:   A filter function.
            type:   callable
        obj:
            desc:   A datamatrix or column to filter.
            type:   [BaseColumn, DataMatrix]

    returns:
        desc:   A new column or datamatrix.
        type:   [BaseColumn, DataMatrix]
    """
    if not callable(fnc):
        raise TypeError('fnc should be callable')
    if isinstance(obj, BaseColumn):
        # Comparing a column with a function gives a datamatrix, from which
        # the column is then retrieved by name.
        return (obj == fnc)[obj.name]
    if not isinstance(obj, DataMatrix):
        raise TypeError(u'obj should be DataMatrix or BaseColumn')
    selected = []
    for rowid, row in zip(obj._rowid, obj):
        # Each row is passed to fnc as keywords: column name -> cell value
        if fnc(**{name: cell for name, cell in row}):
            selected.append(rowid)
    return obj._selectrowid(Index(selected))
def __eq__(self, other):
    """Implement `column == other` for this column type.

    - If `other` is a type: the full datamatrix is returned when `other` is
      this column's `dtype`; otherwise a selection based on an Index of
      length 0 is returned.
    - If `other` is a sequence (as judged by `self._issequence()`): the
      parent implementation is used, and any error propagates.
    - Otherwise the parent implementation is used, and a TypeError from it
      is handled by falling back to a comparison that matches nothing.
    """
    if isinstance(other, type):
        if other is not self.dtype:
            return self._datamatrix._selectrowid(Index(0))
        return self._datamatrix
    other_is_sequence = self._issequence(other)
    inherited_eq = super(IntColumn, self).__eq__
    if other_is_sequence:
        return inherited_eq(other)
    try:
        return inherited_eq(other)
    except TypeError:
        # If the other value is not an int, then nothing is equal to it.
        # The comparison function ignores its arguments and returns an
        # all-zeros array with one element per row of the datamatrix.
        def nothing_matches(x, y):
            return np.zeros(len(self._datamatrix))
        return self._compare_value(0, nothing_matches)
def _sortedrowid(self):
    """
    visible: False

    desc:
        Gives the row ids of this column in the order that sorts the
        column by value.

    returns:
        An Index with the ordered row ids.
    """
    pairs = list(zip(self._seq, self._rowid))
    # Only the value takes part in the sort key; sortable() converts it
    # into something that can be ordered.
    pairs.sort(key=lambda pair: sortable(pair[0]))
    return Index([rowid for _, rowid in pairs])
def _compare_function(self, other, op): if op == operator.__eq__: test = other elif op == operator.__ne__: test = lambda val: not other(val) else: raise TypeError('functions can only be compared with == or !=') if not len(inspect.getargspec(other).args) == 1: raise TypeError('function must take exactly one argument') return self._datamatrix._selectrowid( Index([ rowid for rowid, val in zip(self._rowid, self._seq) if test(val) ]))
def _compare_nan(self, other, op):
    """Select the rows whose cell is (`==`) or is not (`!=`) NaN.

    Arguments:
        other: Not used by this method.
        op: Either `operator.eq` or `operator.ne`.

    Returns:
        Whatever `self._datamatrix._selectrowid()` returns for the index of
        matching row ids.

    Raises:
        TypeError: If `op` is neither `==` nor `!=`.
    """
    _rowid = Index(0)  # Index of length 0, extended with matching row ids
    if op is operator.eq:
        want_nan = True
    elif op is operator.ne:
        want_nan = False
    else:
        raise TypeError('nans can only be compared with == or !=')
    for rowid, val in zip(self._rowid, self._seq):
        if math.isnan(val) == want_nan:
            _rowid.append(rowid)
    return self._datamatrix._selectrowid(_rowid)
def _compare_type(self, type_, op):
    """Select the rows whose cell is (`==`) or is not (`!=`) of a type.

    Arguments:
        type_: The type to check against, used as the second argument of
            `isinstance()`.
        op: Either `operator.eq` or `operator.ne`.

    Returns:
        Whatever `self._datamatrix._selectrowid()` returns for the index of
        matching row ids.

    Raises:
        TypeError: If `op` is neither `==` nor `!=`.
    """
    _rowid = Index(0)  # Index of length 0, extended with matching row ids
    if op is operator.eq:
        want_instance = True
    elif op is operator.ne:
        want_instance = False
    else:
        raise TypeError('types can only be compared with == or !=')
    for rowid, val in zip(self._rowid, self._seq):
        if isinstance(val, type_) == want_instance:
            _rowid.append(rowid)
    return self._datamatrix._selectrowid(_rowid)
def _getsequencekey(self, key):
    """
    visible: False

    desc:
        Builds a new column from the positions listed in a list or some
        other iterable, in the order in which they are listed.

    arguments:
        key:    A list or other iterable object with positions.

    returns:
        BaseColumn
    """
    col = self._empty_col()
    col._rowid = Index()
    col._seq = []
    own_rowids, own_values = self._rowid, self._seq
    for pos in key:
        # The row id and the value at the same position are copied together
        col._rowid.append(own_rowids[pos])
        col._seq.append(own_values[pos])
    return col
def random_sample(obj, k):
    """
    desc: |
        *New in v0.11.0*

        Takes a random sample of `k` rows from a DataMatrix or column. The
        order of the rows in the returned DataMatrix is random.

        __Example:__

        ```python
        from datamatrix import DataMatrix, operations as ops

        dm = DataMatrix(length=5)
        dm.A = 'a', 'b', 'c', 'd', 'e'
        dm = ops.random_sample(dm, k=3)
        print(dm)
        ```

    arguments:
        obj:
            type:   [DataMatrix, BaseColumn]
        k:
            type:   int

    returns:
        desc:   A random sample from a DataMatrix or column.
        type:   [DataMatrix, BaseColumn]
    """
    # random.sample() returns a plain list; wrap it in an Index so that the
    # selection helpers receive the same type as elsewhere.
    _rowid = Index(random.sample(list(obj._rowid), k))
    if isinstance(obj, DataMatrix):
        return obj._selectrowid(_rowid)
    # The sampled column keeps the row ids that come back from
    # _getrowidkey(). Overwriting them with obj._rowid (which has the full
    # length of the column) would leave a column with k values but
    # len(obj) row ids.
    # NOTE(review): assumes _getrowidkey() sets _rowid on the column it
    # returns - confirm.
    return obj._getrowidkey(_rowid)
def _compare_value(self, other, op):
    """Select the rows whose cell satisfies `op(cell, other)`.

    `other` is first passed through `self._checktype()`. NaN and infinite
    values are special: they can only be compared with `==` and `!=`, and
    the comparison is done with `np.isnan()` / `np.isinf()` on the column
    values. Any other value is compared by applying `op` to the whole
    column at once.

    Arguments:
        other: The value to compare against.
        op: A comparison function from the `operator` module.

    Returns:
        Whatever `self._datamatrix._selectrowid()` returns for the index of
        matching row ids.

    Raises:
        TypeError: If `other` is NaN or infinite and `op` is neither
            `operator.eq` nor `operator.ne`.
    """
    target = self._checktype(other)
    if np.isnan(target):
        # NaN is usually not equal to itself. Here equality is implemented
        # as though NaN is equal to itself. This behavior may change in the
        # future.
        special_test = np.isnan
    elif np.isinf(target):
        special_test = np.isinf
    else:
        special_test = None
    if special_test is None:
        mask = op(self._seq, target)
    elif op is operator.eq:
        mask = special_test(self._seq)
    elif op is operator.ne:
        mask = ~special_test(self._seq)
    else:
        raise TypeError(u'Cannot compare FloatColumn to %s' % other)
    positions = np.where(mask)[0]
    return self._datamatrix._selectrowid(Index(self._rowid[positions]))
def __xor__(self, other):
    """Implement `self ^ other`.

    The row ids that occur in exactly one of the two operands are collected
    and sorted, and the operands are then merged over those rows with
    `self._merge()`.
    """
    own_ids = set(self._rowid)
    other_ids = set(other._rowid)
    exclusive = Index(own_ids.symmetric_difference(other_ids))
    return self._merge(other, exclusive.sorted())
def _compare_sequence(self, other, op):
    """Select the rows for which `op(column values, other)` is true.

    `other` is first passed through `self._tosequence()`. `op` is then
    applied once to the column values and the converted sequence as a
    whole, and the positions where the result is true are selected.

    Arguments:
        other: The object to compare against.
        op: A two-argument comparison function.

    Returns:
        Whatever `self._datamatrix._selectrowid()` returns for the index of
        matching row ids.
    """
    reference = self._tosequence(other)
    mask = op(self._seq, reference)
    # With a single argument, np.where() gives a tuple of index arrays,
    # which is used as is to index the row ids.
    positions = np.where(mask)
    matching = Index(self._rowid[positions])
    return self._datamatrix._selectrowid(matching)
def _sortedrowid(self):
    """Return an Index with the row ids in the order that sorts the values.

    The order is obtained by calling `argsort()` on the column values.
    """
    order = self._seq.argsort()
    return Index(self._rowid[order])