def run(self, column):
    """
    Compute the 0th through 100th percentiles of a column.

    Null values are excluded from the calculation. If the column contains
    any, ``warn_null_calculation`` is invoked first.

    NOTE(review): a column with no non-null values fails on the
    ``data[0]`` lookup below; that behavior is unchanged.

    :param column:
        The column to analyze. It must provide ``aggregate`` and
        ``values_without_nulls_sorted``.
    :returns:
        ``Quantiles`` built from a list of 101 values, where position
        ``p`` of the list holds the ``p``-th percentile.
    """
    if column.aggregate(HasNulls()):
        warn_null_calculation(self, column)

    data = column.values_without_nulls_sorted()
    count = len(data)

    # Zeroth percentile is first datum
    quantiles = [data[0]]

    for percentile in range(1, 100):
        # The rank is count * percentile / 100. Keep it as an exact integer
        # quotient and remainder: a float product such as 100 * 0.07
        # evaluates to 7.000000000000001 (and 100 * 0.29 to
        # 28.999999999999996), which misclassifies a whole-number rank as
        # fractional and silently skips the averaging branch.
        whole, remainder = divmod(count * percentile, 100)

        # ceil(rank) and floor(rank + 1), clamped to valid 1-based ranks.
        ceil_rank = whole + 1 if remainder else whole
        low = max(1, ceil_rank)
        high = min(count, whole + 1)

        if low == high:
            # Fractional rank: a single datum is the percentile.
            value = data[low - 1]
        else:
            # Whole-number rank: average the two neighbouring data.
            value = (data[low - 1] + data[high - 1]) / 2

        quantiles.append(value)

    # Hundredth percentile is final datum
    quantiles.append(data[-1])

    return Quantiles(quantiles)
def validate(self, table):
    """
    Check that the percent computation can be applied to ``table``.

    :param table:
        The table holding the column named by ``self._column_name``.
    :raises DataTypeError:
        If the column's data type is not ``Number``, or if a total was
        supplied and it is less than or equal to zero.
    """
    target = table.columns[self._column_name]

    if not isinstance(target.data_type, Number):
        raise DataTypeError('Percent column must contain Number data.')

    total = self._total
    if total is not None and total <= 0:
        raise DataTypeError('The total must be a positive number')

    # Nulls do not make the computation invalid, but they are reported.
    contains_nulls = HasNulls(self._column_name).run(table)
    if contains_nulls:
        warn_null_calculation(self, target)
def validate(self, table):
    """
    Check that deciles can be computed for the configured column.

    :param table:
        The table holding the column named by ``self._column_name``.
    :raises DataTypeError:
        If the column's data type is not ``Number``.
    """
    target = table.columns[self._column_name]

    if isinstance(target.data_type, Number):
        # Valid column: only report nulls, if there are any.
        if HasNulls(self._column_name).run(table):
            warn_null_calculation(self, target)
        return

    raise DataTypeError('Deciles can only be applied to columns containing Number data.')
def validate(self, table):
    """
    Check that a median can be computed for the configured column.

    :param table:
        The table holding the column named by ``self._column_name``.
    :raises DataTypeError:
        If the column's data type is not ``Number``.
    """
    subject = table.columns[self._column_name]
    is_numeric = isinstance(subject.data_type, Number)

    if not is_numeric:
        raise DataTypeError('Median can only be applied to columns containing Number data.')

    # Report nulls; they do not prevent the calculation.
    if HasNulls(self._column_name).run(table):
        warn_null_calculation(self, subject)
def run(self, column):
    """
    Compute the mean of a column's non-null values.

    :param column:
        The column to average.
    :returns:
        The column's ``Sum`` aggregate divided by the number of non-null
        values.
    :raises DataTypeError:
        If the column's data type is not ``Number``.
    """
    numeric = isinstance(column.data_type, Number)
    if not numeric:
        raise DataTypeError(
            'Mean can only be applied to columns containing Number data.')

    if column.aggregate(HasNulls()):
        warn_null_calculation(self, column)

    total = column.aggregate(Sum())
    count = len(column.values_without_nulls())

    return total / count
def run(self, column):
    """
    Compute the interquartile range of a column.

    :param column:
        The column to analyze.
    :returns:
        The 75th percentile minus the 25th percentile, both taken from the
        column's ``Percentiles`` aggregate.
    :raises DataTypeError:
        If the column's data type is not ``Number``.
    """
    numeric = isinstance(column.data_type, Number)
    if not numeric:
        raise DataTypeError(
            'IQR can only be applied to columns containing Number data.')

    if column.aggregate(HasNulls()):
        warn_null_calculation(self, column)

    ranks = column.aggregate(Percentiles())
    upper_quartile = ranks[75]
    lower_quartile = ranks[25]

    return upper_quartile - lower_quartile
def validate(self, table):
    """
    Check that a percent change can be computed between the two
    configured columns.

    :param table:
        The table holding the columns named by ``self._before_column_name``
        and ``self._after_column_name``.
    :raises DataTypeError:
        If either column's data type is not ``Number``.
    """
    before_column = table.columns[self._before_column_name]
    after_column = table.columns[self._after_column_name]

    # Type checks run for both columns before any null check is made.
    type_checks = (
        (before_column, 'PercentChange before column must contain Number data.'),
        (after_column, 'PercentChange after column must contain Number data.'),
    )
    for candidate, message in type_checks:
        if not isinstance(candidate.data_type, Number):
            raise DataTypeError(message)

    # Report nulls separately for each column, before first.
    null_checks = (
        (self._before_column_name, before_column),
        (self._after_column_name, after_column),
    )
    for name, candidate in null_checks:
        if HasNulls(name).run(table):
            warn_null_calculation(self, candidate)
def run(self, column):
    """
    Compute the median absolute deviation (MAD) of a column.

    The MAD is the median of the absolute differences between each
    non-null value and the column's 50th percentile.

    :param column:
        The column to analyze.
    :returns:
        The result of ``median`` applied to the sorted absolute deviations.
    :raises DataTypeError:
        If the column's data type is not ``Number``.
    """
    if not isinstance(column.data_type, Number):
        raise DataTypeError(
            'MAD can only be applied to columns containing Number data.')

    if column.aggregate(HasNulls()):
        warn_null_calculation(self, column)

    data = column.values_without_nulls_sorted()
    m = column.aggregate(Percentiles())[50]

    # Although `data` is sorted, the absolute deviations are not: they
    # decrease towards the median and then increase again. Sort them before
    # taking their median so the result is right even if `median` simply
    # picks the middle element(s).
    # NOTE(review): confirm whether `median` requires sorted input; if it
    # sorts internally this extra sort is redundant but harmless.
    deviations = sorted(abs(n - m) for n in data)

    return median(tuple(deviations))
def run(self, column):
    """
    Compute the population variance of a column's non-null values.

    :param column:
        The column to analyze.
    :returns:
        The sum of squared differences from the column's ``Mean``
        aggregate, divided by the number of non-null values.
    :raises DataTypeError:
        If the column's data type is not ``Number``.
    """
    numeric = isinstance(column.data_type, Number)
    if not numeric:
        raise DataTypeError(
            'PopulationVariance can only be applied to columns containing Number data.'
        )

    if column.aggregate(HasNulls()):
        warn_null_calculation(self, column)

    values = column.values_without_nulls()
    average = column.aggregate(Mean())

    squared_error = sum((value - average) ** 2 for value in values)

    return squared_error / len(values)
def run(self, table):
    """
    Compute the interquartile range of the configured column.

    :param table:
        The table holding the column named by ``self._column_name``.
    :returns:
        The 75th percentile minus the 25th percentile, both taken from
        ``Percentiles`` run against the same column.
    :raises DataTypeError:
        If the column's data type is not ``Number``.
    """
    target = table.columns[self._column_name]

    if not isinstance(target.data_type, Number):
        raise DataTypeError('IQR can only be applied to columns containing Number data.')

    if HasNulls(self._column_name).run(table):
        warn_null_calculation(self, target)

    ranks = Percentiles(self._column_name).run(table)
    upper_quartile = ranks[75]
    lower_quartile = ranks[25]

    return upper_quartile - lower_quartile
def validate(self, table):
    """
    Check that a change can be computed between the two configured
    columns.

    Both columns must share one of the supported data types: ``Number``,
    ``Date``, ``DateTime`` or ``TimeDelta``.

    :param table:
        The table holding the columns named by ``self._before_column_name``
        and ``self._after_column_name``.
    :raises DataTypeError:
        If the before column is not of a supported type, or if the after
        column is not of the same type as the before column.
    """
    before_column = table.columns[self._before_column_name]
    after_column = table.columns[self._after_column_name]

    supported = (Number, Date, DateTime, TimeDelta)

    # The first supported type that the before column matches decides the
    # type required of the after column.
    matched = next(
        (kind for kind in supported if isinstance(before_column.data_type, kind)),
        None
    )

    if matched is None:
        raise DataTypeError('Change before and after columns must both contain data that is one of: Number, Date, DateTime or TimeDelta.')

    if not isinstance(after_column.data_type, matched):
        raise DataTypeError('Specified columns must be of the same type')

    if HasNulls(self._before_column_name).run(table):
        warn_null_calculation(self, before_column)

    if HasNulls(self._after_column_name).run(table):
        warn_null_calculation(self, after_column)
def run(self, column):
    """
    Find the most frequently occurring non-null value in a column.

    :param column:
        The column to analyze.
    :returns:
        The value with the highest occurrence count. On a tie, the winner
        is whichever tied value ``max`` meets first while iterating the
        tally.
    :raises DataTypeError:
        If the column's data type is not ``Number``.
    """
    numeric = isinstance(column.data_type, Number)
    if not numeric:
        raise DataTypeError(
            'Mode can only be applied to columns containing Number data.')

    if column.aggregate(HasNulls()):
        warn_null_calculation(self, column)

    # Tally how many times each distinct value occurs.
    tally = defaultdict(int)
    for value in column.values_without_nulls():
        tally[value] += 1

    return max(tally, key=tally.get)
def run(self, table):
    """
    Compute the mean of the configured column's non-null values.

    :param table:
        The table holding the column named by ``self._column_name``.
    :returns:
        The result of ``Sum`` for the column divided by the number of
        non-null values.
    :raises DataTypeError:
        If the column's data type is not ``Number``.
    """
    target = table.columns[self._column_name]

    if not isinstance(target.data_type, Number):
        raise DataTypeError('Mean can only be applied to columns containing Number data.')

    if HasNulls(self._column_name).run(table):
        warn_null_calculation(self, target)

    total = Sum(self._column_name).run(table)
    count = len(target.values_without_nulls())

    return total / count
def run(self, table):
    """
    Compute the median absolute deviation (MAD) of the configured column.

    The MAD is the median of the absolute differences between each
    non-null value and the column's median.

    :param table:
        The table holding the column named by ``self._column_name``.
    :returns:
        The result of ``median`` applied to the sorted absolute deviations.
    :raises DataTypeError:
        If the column's data type is not ``Number``.
    """
    column = table.columns[self._column_name]

    if not isinstance(column.data_type, Number):
        raise DataTypeError('MAD can only be applied to columns containing Number data.')

    has_nulls = HasNulls(self._column_name).run(table)

    if has_nulls:
        warn_null_calculation(self, column)

    data = column.values_without_nulls_sorted()
    m = Median(self._column_name).run(table)

    # Although `data` is sorted, the absolute deviations are not: they
    # decrease towards the median and then increase again. Sort them before
    # taking their median so the result is right even if `median` simply
    # picks the middle element(s).
    # NOTE(review): confirm whether `median` requires sorted input; if it
    # sorts internally this extra sort is redundant but harmless.
    deviations = sorted(abs(n - m) for n in data)

    return median(tuple(deviations))
def run(self, table):
    """
    Compute the population variance of the configured column's non-null
    values.

    :param table:
        The table holding the column named by ``self._column_name``.
    :returns:
        The sum of squared differences from the column's ``Mean`` result,
        divided by the number of non-null values.
    :raises DataTypeError:
        If the column's data type is not ``Number``.
    """
    target = table.columns[self._column_name]

    if not isinstance(target.data_type, Number):
        raise DataTypeError('PopulationVariance can only be applied to columns containing Number data.')

    if HasNulls(self._column_name).run(table):
        warn_null_calculation(self, target)

    values = target.values_without_nulls()
    average = Mean(self._column_name).run(table)

    squared_error = sum((value - average) ** 2 for value in values)

    return squared_error / len(values)
def run(self, table):
    """
    Find the most frequently occurring non-null value in the configured
    column.

    :param table:
        The table holding the column named by ``self._column_name``.
    :returns:
        The value with the highest occurrence count. On a tie, the winner
        is whichever tied value ``max`` meets first while iterating the
        tally.
    :raises DataTypeError:
        If the column's data type is not ``Number``.
    """
    target = table.columns[self._column_name]

    if not isinstance(target.data_type, Number):
        raise DataTypeError('Mode can only be applied to columns containing Number data.')

    if HasNulls(self._column_name).run(table):
        warn_null_calculation(self, target)

    # Tally how many times each distinct value occurs.
    tally = defaultdict(int)
    for value in target.values_without_nulls():
        tally[value] += 1

    return max(tally, key=tally.get)
def run(self, table):
    """
    Compute the 0th through 100th percentiles of the configured column.

    Null values are excluded from the calculation. If the column contains
    any, ``warn_null_calculation`` is invoked first.

    NOTE(review): a column with no non-null values fails on the
    ``data[0]`` lookup below; that behavior is unchanged.

    :param table:
        The table holding the column named by ``self._column_name``.
    :returns:
        An instance of :class:`Quantiles` built from a list of 101 values,
        where position ``p`` of the list holds the ``p``-th percentile.
    """
    column = table.columns[self._column_name]

    has_nulls = HasNulls(self._column_name).run(table)

    if has_nulls:
        warn_null_calculation(self, column)

    data = column.values_without_nulls_sorted()
    count = len(data)

    # Zeroth percentile is first datum
    quantiles = [data[0]]

    for percentile in range(1, 100):
        # The rank is count * percentile / 100. Keep it as an exact integer
        # quotient and remainder: a float product such as 100 * 0.07
        # evaluates to 7.000000000000001 (and 100 * 0.29 to
        # 28.999999999999996), which misclassifies a whole-number rank as
        # fractional and silently skips the averaging branch.
        whole, remainder = divmod(count * percentile, 100)

        # ceil(rank) and floor(rank + 1), clamped to valid 1-based ranks.
        ceil_rank = whole + 1 if remainder else whole
        low = max(1, ceil_rank)
        high = min(count, whole + 1)

        if low == high:
            # Fractional rank: a single datum is the percentile.
            value = data[low - 1]
        else:
            # Whole-number rank: average the two neighbouring data.
            value = (data[low - 1] + data[high - 1]) / 2

        quantiles.append(value)

    # Hundredth percentile is final datum
    quantiles.append(data[-1])

    return Quantiles(quantiles)