def extend(row, n_cols, na_value):
    """
    Replace missing entries of a row and pad the row out to `n_cols` entries.

    `row` is treated as a list when it is a list instance and as a mapping
    otherwise.  When `na_value` is not None, a new container is built in
    which every entry flagged by `is_missing` is replaced by `na_value`.
    If the row then has fewer than `n_cols` entries, lists are padded at the
    end with `na_value`, while mappings receive `na_value` for every key of
    `limit` they do not already contain.

    Note: `limit` is not a parameter; it is resolved from a scope that is
    not visible in this function.  When `na_value` is None and padding is
    needed, the row passed in is modified in place.
    """
    row_is_list = isinstance(row, list)

    if na_value is not None:
        if row_is_list:
            row = [na_value if is_missing(item) else item for item in row]
        else:
            row = {
                key: na_value if is_missing(row[key]) else row[key]
                for key in row
            }

    if len(row) < n_cols:
        if row_is_list:
            # Append one filler per missing trailing column.
            row.extend([na_value for _ in range(len(row), n_cols)])
        else:
            for key in limit:
                if key not in row:
                    row[key] = na_value
    return row
def apply_and_cast(x, fn, dtype, skip_undefined):
    """
    Apply `fn` to `x` and return the items it produces as a list.

    When `skip_undefined` is true, a missing `x` yields an empty list and
    missing items in the result of `fn(x)` are left out.  When `dtype` is not
    None, every retained item is passed through `dtype`.  A TypeError raised
    anywhere in this processing is reported as a one-element list holding
    `ApplyError('TypeError')` instead of being raised.
    """
    if is_missing(x) and skip_undefined:
        return []
    try:
        # Do not build a lambda on the fly here: the original author noted
        # that doing so leads to serialization difficulties.
        produced = fn(x)
        if skip_undefined:
            # Lazy filter, so filtering and casting stay interleaved per item.
            produced = (item for item in produced if not is_missing(item))
        if dtype is None:
            return list(produced)
        return [dtype(item) for item in produced]
    except TypeError:
        return [ApplyError('TypeError')]
def apply_and_cast(x, fn, dtype, skip_undefined):
    """
    Expand `x` through `fn` into a list, optionally filtering and casting.

    Returns an empty list for a missing `x` when `skip_undefined` is true.
    Otherwise the items produced by `fn(x)` are collected; missing items are
    dropped when `skip_undefined` is true, and each kept item is converted
    with `dtype` unless `dtype` is None.  If a TypeError occurs, the result
    is `[ApplyError('TypeError')]`.
    """
    if is_missing(x) and skip_undefined:
        return []
    try:
        # The four cases are written out explicitly rather than selecting a
        # lambda on the fly, which was noted to cause serialization
        # difficulties.
        values = fn(x)
        if skip_undefined and dtype is None:
            return [v for v in values if not is_missing(v)]
        if skip_undefined:
            return [dtype(v) for v in values if not is_missing(v)]
        if dtype is None:
            return list(values)
        return [dtype(v) for v in values]
    except TypeError:
        return [ApplyError('TypeError')]
def extend(row, n_cols, na_value):
    """
    Fill in missing entries of `row` and widen it to `n_cols` entries.

    A list row and a non-list (mapping) row are handled separately:

    * If `na_value` is not None, the row is rebuilt with `na_value` in place
      of every entry that `is_missing` reports as missing.
    * If the row has fewer than `n_cols` entries, a list is extended with
      `na_value` up to `n_cols`; a mapping gets `na_value` stored under each
      key of `limit` that is absent.

    Note: `limit` comes from a scope outside this function that is not
    visible here.
    """
    as_list = isinstance(row, list)

    def substitute(entry):
        # Swap a missing entry for the fill value.
        return na_value if is_missing(entry) else entry

    if na_value is not None:
        if as_list:
            row = [substitute(entry) for entry in row]
        else:
            row = {name: substitute(row[name]) for name in row}

    if len(row) < n_cols:
        if as_list:
            for _ in range(len(row), n_cols):
                row.append(na_value)
        else:
            absent = [name for name in limit if name not in row]
            for name in absent:
                row[name] = na_value
    return row
def apply_and_cast(x, fn, dtype, skip_undefined):
    """
    Apply `fn` to `x` and optionally convert the result to `dtype`.

    Failures are reported by returning an `ApplyError` rather than raising.

    Args:
        x: The input value.
        fn: Callable applied to `x`.
        dtype: Target type for the result, or None to return the result of
            `fn(x)` unchanged.  For `array.array` the typecode is derived
            from the first element of the result via `array_typecode`.
        skip_undefined: When true, a missing input or a missing function
            result produces None.

    Returns:
        The (possibly converted) function result, None for skipped values,
        or an `ApplyError` describing an evaluation or conversion failure.
    """
    if is_missing(x) and skip_undefined:
        return None
    # Any failure inside the user-supplied function is turned into an
    # ApplyError, hence the deliberately broad except.
    # noinspection PyBroadException
    try:
        fnx = fn(x)
    except Exception:
        return ApplyError('Error evaluating function on "{}"'.format(x))
    if is_missing(fnx) and skip_undefined:
        return None
    if dtype is None:
        return fnx
    try:
        if dtype is array.array:
            # NOTE(review): an empty `fnx` raises IndexError here, which is
            # not converted to an ApplyError -- confirm the desired result.
            return array.array(array_typecode(fnx[0]), fnx)
        return dtype(fnx)
    except (TypeError, ValueError):
        # ValueError is what constructors such as int() and float() raise
        # for unparsable input; report it like any other conversion failure.
        return ApplyError('Error converting "{}" to {}'.format(fnx, dtype))
def count_missing_values(self):
    """
    Count missing values.

    Every element of the RDD is mapped to 1 when `is_missing` flags it
    (documented as 'NaN' or 'None') and to 0 otherwise; the sum of these
    indicators is returned.
    """
    self._entry()

    def missing_flag(value):
        return 1 if is_missing(value) else 0

    return self._rdd.map(missing_flag).sum()
def num_missing(self):
    """
    Number of missing elements in the RDD.

    Sets `self.materialized` to True, then aggregates over the RDD starting
    from 0, adding one for each element flagged by `is_missing`.
    """
    self._entry()
    self.materialized = True

    def count_if_missing(running, element):
        if is_missing(element):
            return running + 1
        return running

    def merge_counts(left, right):
        return left + right

    # aggregate is marked as an action in the original code.
    return self._rdd.aggregate(0, count_if_missing, merge_counts)
def num_missing(self):
    """
    Return how many elements of the RDD are missing.

    The object is flagged as materialized before the count is computed.
    The count is an aggregate that starts at 0, is incremented for every
    element that `is_missing` reports as missing, and whose partial results
    are combined by addition.
    """
    self._entry()
    self.materialized = True

    initial = 0
    fold_element = lambda total, item: total + 1 if is_missing(item) else total
    combine = lambda first, second: first + second

    missing_count = self._rdd.aggregate(initial, fold_element, combine)  # action
    return missing_count
def drop_missing_values(self):
    """
    Create new RDD containing only the non-missing values of the RDD.

    Elements are kept when `is_missing` does not flag them (documented as
    dropping 'None' as well as float('nan')).  The filtered RDD is wrapped
    with `self._rv` before being returned.
    """
    self._entry()

    def is_present(value):
        return not is_missing(value)

    kept = self._rdd.filter(is_present)
    return self._rv(kept)
def do_cast(x, dtype, ignore_cast_failure):
    """
    Convert `x` to `dtype`.

    Missing values and values that already are instances of `dtype` are
    returned unchanged.  A string is parsed with `date_parser.parse` when
    `dtype` is `datetime.datetime`.  Anything else is converted by calling
    `dtype(x)`.

    If that call raises ValueError or TypeError, the result is None when
    `ignore_cast_failure` is true; otherwise the `ValueError` class itself
    is returned (it is not raised).
    """
    if is_missing(x):
        return x
    if isinstance(x, str) and dtype is datetime.datetime:
        return date_parser.parse(x)
    if isinstance(x, dtype):
        return x
    try:
        converted = dtype(x)
    except (ValueError, TypeError):
        # TODO: this does not seem to catch as it should
        # NOTE(review): the ValueError class is returned as a marker, not
        # raised -- confirm that callers check for it.
        if ignore_cast_failure:
            return None
        return ValueError
    else:
        return converted
def apply_and_cast(x, fn, dtype, skip_undefined):
    """
    Evaluate `fn(x)` and, if requested, cast the result to `dtype`.

    Errors are reported through a returned `ApplyError`; nothing is raised
    for a failing function call or a failing conversion.

    Args:
        x: The input value.
        fn: Callable applied to `x`.
        dtype: Type to convert the result to, or None for no conversion.
            When it is `array.array`, the array typecode is obtained from
            the first element of the result via `array_typecode`.
        skip_undefined: When true, None is returned for a missing `x` and
            for a missing result of `fn(x)`.

    Returns:
        The function result (converted when `dtype` is given), None for
        skipped values, or an `ApplyError` describing what went wrong.
    """
    if is_missing(x) and skip_undefined:
        return None

    # The user-supplied function may fail in arbitrary ways; all of them
    # are reported as an ApplyError.
    # noinspection PyBroadException
    try:
        fnx = fn(x)
    except Exception:
        return ApplyError(
            'Error evaluating function on "{}"'.format(x))

    if is_missing(fnx) and skip_undefined:
        return None
    if dtype is None:
        return fnx

    try:
        if dtype is array.array:
            # NOTE(review): an empty `fnx` raises IndexError on `fnx[0]`,
            # which is not reported as an ApplyError -- confirm intent.
            return array.array(array_typecode(fnx[0]), fnx)
        return dtype(fnx)
    except (TypeError, ValueError):
        # Constructors such as int() and float() signal unparsable input
        # with ValueError, so it is handled like TypeError here.
        return ApplyError('Error converting "{}" to {}'.format(
            fnx, dtype))
def fill_missing_values(self, value):
    """
    Create new rdd with all missing values filled in with the given value.

    Each element flagged by `is_missing` (documented as None or NaN) is
    replaced by `value`; all other elements are passed through, so the new
    rdd has one element per element of the original.  The mapped RDD is
    wrapped with `self._rv` before being returned.

    NOTE(review): `value` is substituted as-is by this method; no conversion
    to the rdd's element type happens here -- confirm whether a caller is
    expected to do it.
    """
    self._entry(value=value)

    def fill(element):
        if is_missing(element):
            return value
        return element

    filled = self._rdd.map(fill)
    return self._rv(filled)
def ne_zero(x):
    """
    Return whether `x` differs from zero.

    A value flagged by `is_missing` yields False; otherwise the result of
    `x != 0` is returned.
    """
    return False if is_missing(x) else x != 0