def apply_and_cast(x, fn, dtype, skip_undefined):
    """
    Apply `fn` to `x` and cast each item of the result with `dtype`.

    `fn(x)` must be iterable; every item it produces is converted with
    `dtype` and collected into a list.

    When `skip_undefined` is true, a missing `x` produces an empty list and
    missing items produced by `fn` are left out of the result.

    Returns the list of converted items.  If a TypeError is raised while
    applying `fn` or casting, the TypeError class itself is returned (it is
    not raised).
    NOTE(review): presumably the caller inspects the result for this
    sentinel -- confirm before changing it to a raise.
    """
    if is_missing(x) and skip_undefined:
        return []
    try:
        converted = []
        for item in fn(x):
            # Missing items are only dropped when the caller asked for it.
            if skip_undefined and is_missing(item):
                continue
            converted.append(dtype(item))
        return converted
    except TypeError:
        # The exception class is handed back as a value rather than raised.
        return TypeError
def extend(row, n_cols, na_value):
    """
    Replace missing entries in `row` and pad it to `n_cols` entries.

    `row` is handled as a list when it is a list instance; anything else is
    handled as a dict-like mapping.

    If `na_value` is not None, the row is rebuilt with every missing entry
    replaced by `na_value`.  If the row then has fewer than `n_cols`
    entries it is padded with `na_value`: lists are appended to, mappings
    receive every key of `limit` that they do not already contain.

    NOTE(review): `limit` is not defined in this function; presumably it is
    supplied by an enclosing scope -- confirm.

    Returns the resulting row.  When `na_value` is None the padding is
    applied to the row object that was passed in.
    """
    if isinstance(row, list):
        if na_value is not None:
            row = [na_value if is_missing(value) else value for value in row]
        if len(row) < n_cols:
            # Pad at the end until the row holds n_cols entries.
            row.extend(na_value for _ in range(len(row), n_cols))
        return row

    if na_value is not None:
        row = {
            key: na_value if is_missing(row[key]) else row[key]
            for key in row
        }
    if len(row) < n_cols:
        for key in limit:
            if key not in row:
                row[key] = na_value
    return row
def do_cast(x, dtype, ignore_cast_failure):
    """
    Convert `x` to `dtype`.

    Missing values, and values whose type is already exactly `dtype`, are
    returned unchanged.  Otherwise `dtype(x)` is returned.

    If the conversion raises ValueError or TypeError, the result is None
    when `ignore_cast_failure` is true, and the ValueError class itself
    (returned, not raised) otherwise.
    NOTE(review): presumably the caller checks for this sentinel -- confirm.
    """
    # Exact type comparison: an instance of a subclass of dtype is still cast.
    if is_missing(x) or type(x) == dtype:
        return x
    try:
        converted = dtype(x)
    except (ValueError, TypeError):
        # TODO: this does not seem to catch as it should
        if ignore_cast_failure:
            return None
        return ValueError
    return converted
def num_missing(self):
    """
    Count the missing elements in the RDD.

    Marks the object as materialized, because the count is computed with
    an RDD action.
    """
    def count_if_missing(running_total, element):
        # Fold one element into the running count.
        if is_missing(element):
            return running_total + 1
        return running_total

    def merge_counts(left, right):
        # Combine two partial counts.
        return left + right

    self._entry()
    self.materialized = True
    # aggregate() is an action
    missing_count = self._rdd.aggregate(0, count_if_missing, merge_counts)
    self._exit()
    return missing_count
def drop_missing_values(self):
    """
    Build a new RDD that keeps only the non-missing values of this RDD.

    Missing values appear in an RDD as None; float('nan') is dropped
    as well.
    """
    def is_present(element):
        return not is_missing(element)

    self._entry()
    kept = self._rdd.filter(is_present)
    self._exit()
    return self._rv(kept)
def fill_missing_values(self, value):
    """
    Build a new RDD in which every missing value (None or NaN) is replaced
    by `value`.

    Elements are mapped one-to-one, so the new RDD has the same size as
    the original.

    NOTE(review): this method substitutes `value` as given.  An earlier
    version of this docstring said the value would be converted to the
    RDD's element type and an error raised on failure; no such conversion
    is visible here -- confirm whether it happens elsewhere.
    """
    def fill(element):
        if is_missing(element):
            return value
        return element

    self._entry(value)
    filled = self._rdd.map(fill)
    self._exit()
    return self._rv(filled)
def apply_and_cast(x, fn, dtype, skip_undefined):
    """
    Apply `fn` to `x` and cast the result with `dtype`.

    When `skip_undefined` is true and `x` is missing, None is returned
    without calling `fn`.

    Returns `dtype(fn(x))`.  If a TypeError is raised while applying `fn`
    or casting, the TypeError class itself is returned (it is not raised).
    NOTE(review): presumably the caller inspects the result for this
    sentinel -- confirm before changing it to a raise.
    """
    if is_missing(x) and skip_undefined:
        return None
    try:
        converted = dtype(fn(x))
    except TypeError:
        # The exception class is handed back as a value rather than raised.
        return TypeError
    else:
        return converted
def ne_zero(x):
    """
    Return whether `x` is non-zero, treating a missing `x` as False.

    For a non-missing `x` the result is whatever `x != 0` evaluates to.
    """
    return False if is_missing(x) else x != 0