예제 #1
0
 def extend(row, n_cols, na_value):
     """
     Replace missing entries in a row and pad the row to `n_cols` entries.

     `row` is handled as a list when `isinstance(row, list)` holds and as a
     key-indexed mapping otherwise.  When `na_value` is not None, a new
     container is built in which every entry flagged by `is_missing` is
     replaced with `na_value`.  A row shorter than `n_cols` is then padded:
     a list is extended with `na_value`, while a mapping receives
     `na_value` for each key of `limit` that it does not already hold.

     Note that when `na_value` is None no new container is built, so the
     padding is applied to the object passed in by the caller.

     Returns the resulting row.
     """
     row_is_list = isinstance(row, list)
     if na_value is not None:
         # Substitute na_value for missing entries (creates a new container).
         if row_is_list:
             row = [na_value if is_missing(item) else item for item in row]
         else:
             row = {
                 key: na_value if is_missing(row[key]) else row[key]
                 for key in row
             }
     if len(row) >= n_cols:
         return row
     if row_is_list:
         # Append enough na_value entries to reach n_cols.
         row.extend([na_value] * (n_cols - len(row)))
     else:
         # NOTE(review): `limit` is not defined in this function; it is
         # presumably provided by an enclosing scope -- confirm.
         for key in limit:
             if key not in row:
                 row[key] = na_value
     return row
예제 #2
0
 def apply_and_cast(x, fn, dtype, skip_undefined):
     """
     Apply `fn` to `x` and return the items it produces as a list.

     When `skip_undefined` is true, an input flagged by `is_missing` yields
     an empty list, and produced items flagged by `is_missing` are dropped.
     When `dtype` is not None, every kept item is passed through `dtype`.

     A TypeError raised while producing or converting the items results in
     a one-element list holding `ApplyError('TypeError')`.
     """
     if is_missing(x) and skip_undefined:
         return []
     # This is a named function rather than a lambda defined on the fly,
     #  because the lambda leads to serialization difficulties.
     try:
         produced = fn(x)
         if skip_undefined:
             # Lazy filter: items are still checked and converted one by one.
             produced = (item for item in produced if not is_missing(item))
         if dtype is None:
             return list(produced)
         return [dtype(item) for item in produced]
     except TypeError:
         return [ApplyError('TypeError')]
예제 #3
0
 def apply_and_cast(x, fn, dtype, skip_undefined):
     """
     Run `fn` on `x` and collect the resulting items into a list.

     With `skip_undefined` set, a missing input (per `is_missing`) gives an
     empty list and missing output items are filtered out.  With `dtype`
     set (not None), each remaining item is converted by calling `dtype`.

     If a TypeError occurs while iterating or converting, the return value
     is `[ApplyError('TypeError')]`.
     """
     if is_missing(x) and skip_undefined:
         return []
     # Defining a lambda on the fly instead of this function is tempting,
     #  but it leads to serialization difficulties.
     try:
         outputs = fn(x)
         if skip_undefined:
             # Generator keeps the per-item order: filter first, then cast.
             outputs = (out for out in outputs if not is_missing(out))
         if dtype is None:
             return list(outputs)
         return [dtype(out) for out in outputs]
     except TypeError:
         return [ApplyError('TypeError')]
예제 #4
0
 def extend(row, n_cols, na_value):
     """
     Fill in missing entries of a row and pad it out to `n_cols` entries.

     A `row` that is a list is processed positionally; anything else is
     processed as a mapping indexed by its keys.  If `na_value` is not
     None, a fresh container is created where each entry that `is_missing`
     flags is swapped for `na_value`.  If the row then has fewer than
     `n_cols` entries, a list gets `na_value` appended until it is long
     enough, and a mapping gets `na_value` stored under every key of
     `limit` that is absent.

     When `na_value` is None the caller's own object is the one padded.

     Returns the resulting row.
     """
     list_like = isinstance(row, list)
     if na_value is not None:
         # Build a replacement container with missing entries filled in.
         if list_like:
             row = [na_value if is_missing(entry) else entry for entry in row]
         else:
             row = {
                 name: na_value if is_missing(row[name]) else row[name]
                 for name in row
             }
     shortfall = n_cols - len(row)
     if shortfall <= 0:
         return row
     if list_like:
         row.extend([na_value] * shortfall)
     else:
         # NOTE(review): `limit` comes from outside this function
         # (presumably an enclosing scope) -- confirm.
         for name in limit:
             if name not in row:
                 row[name] = na_value
     return row
예제 #5
0
 def apply_and_cast(x, fn, dtype, skip_undefined):
     """
     Apply `fn` to `x` and optionally convert the result to `dtype`.

     Returns None when `skip_undefined` is true and either the input or the
     function result is flagged by `is_missing`.  Any exception raised by
     `fn` is reported by returning an `ApplyError` instead of propagating.
     When `dtype` is None the raw result is returned; otherwise the result
     is converted, and a TypeError during conversion is likewise reported
     by returning an `ApplyError`.
     """
     if is_missing(x) and skip_undefined:
         return None
     # noinspection PyBroadException
     try:
         result = fn(x)
     except Exception:
         # Failures inside fn are returned as values rather than raised.
         return ApplyError('Error evaluating function on "{}"'.format(x))
     if is_missing(result) and skip_undefined:
         return None
     if dtype is None:
         return result
     try:
         if dtype in (array.array,):
             # The array typecode is derived from the first element.
             converted = array.array(array_typecode(result[0]), result)
         else:
             converted = dtype(result)
     except TypeError:
         return ApplyError('Error converting "{}" to {}'.format(result, dtype))
     return converted
예제 #6
0
    def count_missing_values(self):
        """
        Return the number of missing values.

        Every element is mapped to 1 when `is_missing` flags it and to 0
        otherwise, and the indicators are summed.  A missing value shows up
        in an RDD as 'NaN' or 'None'.
        """
        self._entry()

        def missing_indicator(elem):
            return 1 if is_missing(elem) else 0

        return self._rdd.map(missing_indicator).sum()
예제 #7
0
 def num_missing(self):
     """
     Return the number of missing elements in the RDD.

     Marks the object as materialized, then folds over the RDD starting
     from zero, adding one for each element that `is_missing` flags and
     summing the partial counts.
     """
     self._entry()
     self.materialized = True

     def count_if_missing(running, elem):
         if is_missing(elem):
             return running + 1
         return running

     def merge_counts(left, right):
         return left + right

     # action
     return self._rdd.aggregate(0, count_if_missing, merge_counts)
예제 #8
0
    def count_missing_values(self):
        """
        Count the missing values.

        Elements flagged by `is_missing` contribute 1 and all others
        contribute 0; the sum of these contributions is returned.  A
        missing value shows up in an RDD as 'NaN' or 'None'.
        """
        self._entry()

        def as_flag(value):
            return 1 if is_missing(value) else 0

        flags = self._rdd.map(as_flag)
        return flags.sum()
예제 #9
0
 def num_missing(self):
     """
     Count the missing elements in the RDD.

     Sets `materialized` to True, then aggregates over the RDD from an
     initial count of zero: the count grows by one per element flagged by
     `is_missing`, and partial counts are added together.
     """
     self._entry()
     self.materialized = True

     def bump_on_missing(count, value):
         return count + 1 if is_missing(value) else count

     def add_counts(first, second):
         return first + second

     # action
     missing_total = self._rdd.aggregate(0, bump_on_missing, add_counts)
     return missing_total
예제 #10
0
    def drop_missing_values(self):
        """
        Create a new RDD holding only the non-missing values of this RDD.

        Elements flagged by `is_missing` are filtered out.  A missing value
        shows up in an RDD as 'None'; float('nan') is dropped as well.
        """
        self._entry()

        def is_present(elem):
            return not is_missing(elem)

        kept = self._rdd.filter(is_present)
        return self._rv(kept)
예제 #11
0
    def drop_missing_values(self):
        """
        Build a new RDD from which the missing values have been removed.

        Only elements that `is_missing` does not flag are retained.  A
        missing value shows up in an RDD as 'None'; float('nan') is also
        removed.
        """
        self._entry()

        def has_value(item):
            return not is_missing(item)

        remaining = self._rdd.filter(has_value)
        return self._rv(remaining)
예제 #12
0
 def do_cast(x, dtype, ignore_cast_failure):
     """
     Convert `x` to `dtype`.

     Values flagged by `is_missing`, and values that are already instances
     of `dtype`, are returned unchanged.  A `str` being cast to
     `datetime.datetime` is parsed with `date_parser.parse`.  Everything
     else is converted by calling `dtype(x)`.

     If that call raises ValueError or TypeError, None is returned when
     `ignore_cast_failure` is true; otherwise the `ValueError` class itself
     is returned (it is not raised).
     """
     if is_missing(x):
         return x
     if isinstance(x, str) and dtype is datetime.datetime:
         # NOTE(review): parsing happens outside the try block below, so a
         # parse failure is not covered by ignore_cast_failure -- confirm
         # this is intended.
         return date_parser.parse(x)
     if isinstance(x, dtype):
         return x
     try:
         converted = dtype(x)
     except (ValueError, TypeError):
         # TODO: this does not seem to catch as it should
         # NOTE(review): the exception class is returned as a value here;
         # presumably callers check for it -- verify.
         if ignore_cast_failure:
             return None
         return ValueError
     return converted
예제 #13
0
 def apply_and_cast(x, fn, dtype, skip_undefined):
     """
     Evaluate `fn` on `x`, then cast the outcome to `dtype` if one is given.

     None is returned when `skip_undefined` is true and `is_missing` flags
     either the input or the outcome.  If `fn` raises any exception, an
     `ApplyError` describing the evaluation failure is returned.  With
     `dtype` of None the outcome is returned as is; otherwise it is cast,
     and a TypeError during the cast yields an `ApplyError` describing the
     conversion failure.
     """
     if is_missing(x) and skip_undefined:
         return None
     # noinspection PyBroadException
     try:
         outcome = fn(x)
     except Exception:
         # Errors from fn become return values instead of propagating.
         return ApplyError('Error evaluating function on "{}"'.format(x))
     if is_missing(outcome) and skip_undefined:
         return None
     if dtype is None:
         return outcome
     try:
         if dtype in (array.array,):
             # Typecode is chosen from the first element of the outcome.
             cast_value = array.array(array_typecode(outcome[0]), outcome)
         else:
             cast_value = dtype(outcome)
     except TypeError:
         return ApplyError('Error converting "{}" to {}'.format(outcome, dtype))
     return cast_value
예제 #14
0
 def do_cast(x, dtype, ignore_cast_failure):
     """
     Cast `x` to `dtype`.

     A value flagged by `is_missing` is passed through untouched, as is a
     value that is already an instance of `dtype`.  A `str` headed for
     `datetime.datetime` goes through `date_parser.parse`.  Any other value
     is cast with `dtype(x)`.

     On ValueError or TypeError from that cast, the result is None if
     `ignore_cast_failure` is true, and the `ValueError` class (returned,
     not raised) otherwise.
     """
     if is_missing(x):
         return x
     if isinstance(x, str) and dtype is datetime.datetime:
         # NOTE(review): a failure in the parser is not handled by the
         # try block below -- confirm that ignore_cast_failure is not
         # meant to apply here.
         return date_parser.parse(x)
     if isinstance(x, dtype):
         return x
     try:
         cast_value = dtype(x)
     except (ValueError, TypeError):
         # TODO: this does not seem to catch as it should
         # NOTE(review): returns the exception class as a marker value;
         # presumably checked by the caller -- verify.
         failure_result = None if ignore_cast_failure else ValueError
         return failure_result
     return cast_value
예제 #15
0
    def fill_missing_values(self, value):
        """
        Create a new rdd in which all missing values (None or NaN) are
        replaced with the given value.

        Each element flagged by `is_missing` is swapped for `value`; every
        other element is kept as is.

        NOTE(review): no type conversion of `value` is done in this method;
        if conversion to the rdd's type is expected, it must happen
        elsewhere -- confirm.
        """
        self._entry(value=value)

        def fill(elem):
            return value if is_missing(elem) else elem

        filled = self._rdd.map(fill)
        return self._rv(filled)
예제 #16
0
    def fill_missing_values(self, value):
        """
        Build a new rdd with every missing value (None or NaN) filled in
        with the given value.

        Elements that `is_missing` flags are replaced by `value`; all
        remaining elements pass through unchanged.

        NOTE(review): this method itself does not convert `value` to the
        rdd's element type; any such conversion would have to be done
        elsewhere -- confirm.
        """
        self._entry(value=value)

        def replace_if_missing(item):
            if is_missing(item):
                return value
            return item

        replaced = self._rdd.map(replace_if_missing)
        return self._rv(replaced)
예제 #17
0
 def ne_zero(x):
     """
     Report whether `x` differs from zero, treating missing values as False.

     Returns False when `is_missing(x)` holds; otherwise returns the result
     of `x != 0`.
     """
     return False if is_missing(x) else x != 0
예제 #18
0
 def ne_zero(x):
     """
     Test `x` for being non-zero; a missing value counts as not non-zero.

     A value flagged by `is_missing` gives False.  Any other value gives
     whatever `x != 0` evaluates to.
     """
     missing = is_missing(x)
     if not missing:
         return x != 0
     return False