def count_na(columns):
    """
    Return the NAN and Null count in a Column
    :param columns: '*', list of columns names or a single column name.
    :return:
    """
    columns = parse_columns(self, columns)
    df = self

    expr = []
    for col_name in columns:
        # isnan/isNull can not handle Structure/Boolean columns, so cast those to string first
        if is_(df.cols.schema_dtype(col_name), (StructType, BooleanType)):
            df = df.cols.cast(col_name, "string")

        # Re-read the dtype after the possible cast and build the matching count expression
        dtype = df.cols.schema_dtype(col_name)
        if is_(dtype, (float, int)):
            # Numeric columns: count both NaN and Null
            counter = F.count(F.when(F.isnan(col_name) | F.col(col_name).isNull(), col_name))
        elif is_(dtype, (NullType)):
            # Null-typed columns: every row counts
            counter = F.count(col_name)
        else:
            # Everything else: only Null can occur
            counter = F.count(F.when(F.col(col_name).isNull(), col_name))
        expr.append(counter.alias(col_name))

    return format_dict(df.select(*expr).to_json())
def percentile(columns, values=None, error=1):
    """
    Return the percentile of a dataframe
    :param columns: '*', list of columns names or a single column name.
    :param values: list of percentiles to be calculated
    :param error:
    :return: percentiles per columns
    """
    values = [0.05, 0.25, 0.5, 0.75, 0.95] if values is None else values

    columns = parse_columns(self, columns, filter_by_column_dtypes=PYSPARK_NUMERIC_TYPES)

    # Compute the requested quantiles column by column
    percentile_results = {}
    for col_name in columns:
        quantiles = (self
                     .rows.drop_na(col_name)
                     .cols.cast(col_name, "double")
                     .approxQuantile(col_name, values, error))
        percentile_results[col_name] = dict(zip(values, quantiles))

    return format_dict(percentile_results)
def mad(columns, more=None):
    """
    Return the Median Absolute Deviation
    :param columns: Column to be processed
    :param more: Return some extra computed values (Median).
    :return:
    """
    columns = parse_columns(self, columns, filter_by_column_dtypes=PYSPARK_NUMERIC_TYPES)

    result = {}
    for col_name in columns:
        # MAD = median(|x - median(x)|)
        median_value = self.cols.median(col_name)
        mad_value = (self.select(col_name)
                     .withColumn(col_name, F.abs(F.col(col_name) - median_value))
                     .cols.median(col_name))

        if more:
            result[col_name] = {"mad": mad_value, "median": median_value}
        else:
            result[col_name] = {"mad": mad_value}

    return format_dict(result)
def count_na(columns):
    """
    Return the NaN and Null count in a Column.
    :param columns: '*', list of columns names or a single column name.
    :return: count per column (single value when only one column is requested)
    """
    columns = parse_columns(self, columns)
    df = self

    expr = []
    for col_name in columns:
        # isnan/isNull can not handle Structure/Boolean columns, so cast them to string first
        if is_(df.cols.schema_dtypes(col_name), (StructType, BooleanType)):
            df = df.cols.cast(col_name, "string")

        # NOTE(review): F.isnan is only defined for numeric columns; non-numeric
        # types (e.g. date/timestamp) may raise here — confirm against callers.
        expr.append(
            F.count(
                F.when(
                    F.isnan(col_name) | F.col(col_name).isNull(),
                    col_name)).alias(col_name))

    return format_dict(collect_as_dict(df.select(*expr).collect()))
def percentile(columns, values=None, error=1):
    """
    Return the percentile of a dataframe.
    :param columns: '*', list of columns names or a single column name.
    :param values: list of percentiles to be calculated
    :param error: relative error passed to approxQuantile (0 = exact, but expensive)
    :return: percentiles per columns
    """
    start_time = timeit.default_timer()

    if values is None:
        values = [0.05, 0.25, 0.5, 0.75, 0.95]

    columns = parse_columns(self, columns)

    # Get percentiles, one approxQuantile call per column
    percentile_results = []
    for c in columns:
        percentile_per_col = self \
            .rows.drop_na(c) \
            .cols.cast(c, "double") \
            .approxQuantile(c, values, error)

        percentile_results.append(dict(zip(values, percentile_per_col)))

    percentile_results = dict(zip(columns, percentile_results))

    # Single lazy-formatted log entry instead of two unlabeled messages
    logging.info("percentile took %s seconds", timeit.default_timer() - start_time)

    return format_dict(percentile_results)
def _exprs(funcs, columns):
    """
    Helper function to apply multiple columns expression to multiple columns
    :param funcs: Aggregation functions from Apache Spark
    :param columns: list or string of columns names or a .
    :return:
    """

    def parse_col_names_funcs_to_keys(data):
        """
        Helper function that return a formatted json with function:value inside columns.
        Transform from
        {'max_antiguedad_anos': 15,
        'max_m2_superficie_construida': 1800000,
        'min_antiguedad_anos': 2,
        'min_m2_superficie_construida': 20}
        to
        {'m2_superficie_construida': {'min': 20, 'max': 1800000},
        'antiguedad_anos': {'min': 2, 'max': 15}}
        :param data: json data
        :return: json
        """
        functions_array = ["min", "max", "stddev", "kurtosis", "mean", "skewness", "sum", "variance",
                           "approx_count_distinct", "na", "zeros", "percentile"]

        if not is_dict(data):
            return data

        result = {}
        for key, value in data.items():
            # Split keys shaped like "<func>_<column>" back into column -> {func: value}
            for func_name in functions_array:
                prefix = func_name + "_"
                if key.startswith(prefix):
                    column_name = key[len(prefix):]
                    result.setdefault(column_name, {})[func_name] = value
        return result

    columns = parse_columns(self, columns)

    # Ensure that is a list
    funcs = val_to_list(funcs)

    df = self

    # Parse the columns to float. Seems that spark can handle some aggregation with string columns giving
    # unexpected results
    # df = df.cols.cast(columns, "float")

    # One aliased aggregation expression per (column, function) pair
    exprs = [func(col_name).alias(func.__name__ + "_" + col_name)
             for col_name in columns
             for func in funcs]

    return parse_col_names_funcs_to_keys(format_dict(df.agg(*exprs).to_json()))
def schema_dtype(columns):
    """
    Return the column(s) data type as Type
    :param columns: Columns to be processed
    :return:
    """
    selected = parse_columns(self, columns)
    data_types = [self.schema[name].dataType for name in selected]
    return format_dict(data_types)
def schema_dtypes(columns):
    """
    Return the columns data type as Type
    :param columns:
    :return:
    """
    selected = parse_columns(self, columns)
    return format_dict([self.schema[name].dataType for name in selected])
def count_zeros(columns):
    """
    Return the count of zero values in a Column.
    :param columns: '*', list of columns names or a single column name.
    :return: zero count per column (single value when only one column is requested)
    """
    # Only numeric columns can meaningfully be compared to 0
    columns = parse_columns(self, columns, filter_by_column_dtypes=PYSPARK_NUMERIC_TYPES)
    df = self
    return format_dict(df.select([F.count(F.when(F.col(c) == 0, c)).alias(c) for c in columns]).to_json())
def dtypes(columns):
    """
    Return the column(s) data type as string
    :param columns: Columns to be processed
    :return:
    """
    selected = parse_columns(self, columns)
    # Map every column of the dataframe to its string dtype, then keep only the requested ones
    data_types = tuple_to_dict(self.dtypes)
    return format_dict({name: data_types[name] for name in selected})
def count_zeros(columns):
    """
    Return the count of zero values in a Column.
    :param columns: '*', list of columns names or a single column name.
    :return: zero count per column (single value when only one column is requested)
    """
    columns = parse_columns(self, columns)
    df = self
    # NOTE(review): columns are not filtered to numeric types here (unlike the
    # other count_zeros variant); `F.col(c) == 0` on non-numeric columns may
    # behave unexpectedly — confirm intended scope.
    return format_dict(collect_as_dict(df.select([F.count(F.when(F.col(c) == 0, c)).alias(c) for c in columns])
                                       .collect()))