def z_score(df, columns, threshold=None):
    """
    Delete outliers using the z-score
    :param df: Spark DataFrame
    :param columns: Column(s) to be processed
    :param threshold: z-score above which a value is considered an outlier
    :return: Spark DataFrame without the outlier rows
    """
    if not is_dataframe(df):
        raise TypeError("Spark Dataframe expected")

    if not is_int(threshold):
        raise TypeError("Integer expected")

    columns = parse_columns(df, columns)
    for c in columns:
        # The column holding the z-score value is always the string "z_col_" plus the column name
        z_col = "z_col_" + c
        df = df.cols.z_score(c) \
            .rows.drop(F.col(z_col) > threshold) \
            .cols.drop(z_col)
    return df
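# A self-contained plain-PySpark sketch of the same idea, for reference
# only: the sample data, the column name "height", and the use of the
# absolute z-score are assumptions for illustration, not the helper's
# exact semantics.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([(1.0,), (2.0,), (2.5,), (100.0,)], ["height"])

stats = sdf.select(F.mean("height").alias("mu"),
                   F.stddev("height").alias("sigma")).first()
z = (F.col("height") - stats["mu"]) / stats["sigma"]

# Keep only the rows whose z-score is within the threshold
sdf.filter(F.abs(z) <= 2).show()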
def rename(columns_old_new=None, func=None):
    """
    Changes the name of one or more columns of the dataFrame.
    :param columns_old_new: List of tuples. Each tuple has the following form: (oldColumnName, newColumnName).
    :param func: can be lower, upper or any string transformation function
    """
    df = self

    # Apply a transformation function to every column name
    if is_function(func):
        exprs = [F.col(c).alias(func(c)) for c in df.columns]
        df = df.select(exprs)

    elif is_list_of_tuples(columns_old_new):
        # Check that the 1st element in each tuple is a valid column
        validate_columns_names(self, columns_old_new)
        for c in columns_old_new:
            old_col_name = c[0]
            if is_str(old_col_name):
                df = df.withColumnRenamed(old_col_name, c[1])
            elif is_int(old_col_name):
                # An integer is treated as a column index
                df = df.withColumnRenamed(self.schema.names[old_col_name], c[1])
    return df
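# A runnable plain-PySpark sketch of the func branch above: alias every
# column through a string function. The data and column names are made
# up for illustration.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([(1, "a")], ["ID", "Letter"])

sdf = sdf.select([F.col(c).alias(c.lower()) for c in sdf.columns])
print(sdf.columns)  # ['id', 'letter']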
def _mad(self, action):
    """
    Select or drop outliers using the median absolute deviation (MAD)
    :param action: "select" keeps only the outlier rows, "drop" removes them
    :return: Spark DataFrame
    """
    df = self.df
    columns = self.columns
    threshold = self.threshold

    if not is_dataframe(df):
        raise TypeError("Spark Dataframe expected")

    if not is_int(threshold):
        raise TypeError("Integer expected")

    columns = parse_columns(df, columns)
    for c in columns:
        mad_value = df.cols.mad(c, more=True)
        lower_bound = mad_value["median"] - threshold * mad_value["mad"]
        upper_bound = mad_value["median"] + threshold * mad_value["mad"]

        # Compare strings with ==; "is" checks identity, not equality
        if action == "select":
            df = df.rows.select((F.col(c) > upper_bound) | (F.col(c) < lower_bound))
        elif action == "drop":
            df = df.rows.drop((F.col(c) > upper_bound) | (F.col(c) < lower_bound))
    return df
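# Hypothetical usage sketch: _mad() reads df/columns/threshold from its
# owning outlier-handler object, so a caller might look like the lines
# below. The class name and wiring are assumptions, not the library's API.
# outlier = MAD(df, "price", threshold=3)
# outlier._mad("select")   # keep only the rows outside the MAD bounds
# outlier._mad("drop")     # remove the rows outside the MAD bounds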
def length_error(var1, var2):
    """
    Raise a ValueError reporting that var2 does not have the same length as var1
    :param var1: variable whose length is expected
    :param var2: variable received (or its length, as an int)
    :return:
    """
    from optimus.helpers.functions import get_var_name

    if is_int(var2):
        length_var2 = str(var2)
    else:
        length_var2 = str(len(var2))

    raise ValueError(
        "'{var2_name}' must be length '{var1_length}', received '{var2_length}'"
        .format(var2_name=get_var_name(var2), var1_length=str(len(var1)), var2_length=length_var2))
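# Illustrative caller (a hypothetical helper, not part of the library):
# fail fast when two sequences that must line up have different lengths.
def check_same_length(var1, var2):
    if len(var1) != len(var2):
        length_error(var1, var2)

# check_same_length([1, 2, 3], [1, 2])  # raises ValueError via length_error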
def mad(df, columns, threshold=None):
    """
    Delete outliers using the median absolute deviation (MAD)
    :param df: Spark DataFrame
    :param columns: Column(s) to be processed
    :param threshold: number of MADs from the median beyond which a value is an outlier
    :return: Spark DataFrame without the outlier rows
    """
    if not is_dataframe(df):
        raise TypeError("Spark Dataframe expected")

    if not is_int(threshold):
        raise TypeError("Integer expected")

    columns = parse_columns(df, columns)
    for c in columns:
        mad_value = df.cols.mad(c, more=True)
        lower_bound = mad_value["median"] - threshold * mad_value["mad"]
        upper_bound = mad_value["median"] + threshold * mad_value["mad"]

        df = df.rows.drop((F.col(c) > upper_bound) | (F.col(c) < lower_bound))
    return df
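# A self-contained plain-PySpark sketch of the same MAD bounds without
# the Optimus helpers; approxQuantile with relativeError=0 returns the
# exact median. The data and column name are made up for illustration.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([(1.0,), (2.0,), (2.5,), (100.0,)], ["price"])

median = sdf.approxQuantile("price", [0.5], 0)[0]
mad_value = sdf.select(F.abs(F.col("price") - median).alias("dev")) \
    .approxQuantile("dev", [0.5], 0)[0]

threshold = 3
sdf.filter(~((F.col("price") > median + threshold * mad_value) |
             (F.col("price") < median - threshold * mad_value))).show()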
def unnest(columns, mark=None, splits=None, index=None):
    """
    Split an array or string into different columns
    :param columns: Columns to be un-nested
    :param mark: Separator to split on if the column is a string
    :param splits: Number of columns to un-nest into. If None it is inferred from the
    first row, because we can not know the number of splits beforehand
    :param index: If given, un-nest only the element at this position
    :return: Spark DataFrame
    """
    # If the number of splits was not defined, try to infer it from the first element
    infer_splits = None
    if splits is None:
        infer_splits = True

    columns = parse_columns(self, columns)

    df = self
    for col_name in columns:
        col_dtype = self.schema[col_name].dataType

        # Array
        if is_(col_dtype, ArrayType):
            expr = F.col(col_name)
            # Try to infer the array length using the first row
            if infer_splits is True:
                splits = len(self.cols.cell(col_name))

            for i in builtins.range(splits):
                df = df.withColumn(col_name + "_" + str(i), expr.getItem(i))

        # String
        elif is_(col_dtype, StringType):
            expr = F.split(F.col(col_name), mark)
            # Try to infer the number of splits using the first row
            if infer_splits is True:
                splits = len(self.cols.cell(col_name).split(mark))

            if is_int(index):
                r = builtins.range(index, index + 1)
            else:
                r = builtins.range(0, splits)

            for i in r:
                df = df.withColumn(col_name + "_" + str(i), expr.getItem(i))

        # Vector
        elif is_(col_dtype, VectorUDT):
            def _unnest(row):
                _dict = row.asDict()
                # Get the column we want to unnest
                _list = _dict[col_name]
                # Ensure that floats are Python floats and not numpy floats
                if index is None:
                    _list = [float(x) for x in _list]
                else:
                    # Extract only the requested element (was hard-coded to 1)
                    _list = [float(_list[index])]
                return row + tuple(_list)

            df = df.rdd.map(_unnest).toDF(df.columns)
    return df
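# Hypothetical usage sketch, assuming unnest() is exposed through the
# .cols accessor used elsewhere in this code (column names are
# illustrative):
# df.cols.unnest("name", mark=",")            # "a,b" -> name_0, name_1
# df.cols.unnest("arr", splits=2)             # first two array elements
# df.cols.unnest("name", mark=",", index=1)   # only the second element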
def unnest(columns, mark=None, n=None, index=None):
    """
    Split an array or string into different columns
    :param columns: Columns to be un-nested
    :param mark: Separator to split on if the column is a string
    :param n: Number of columns to un-nest into
    :param index: If given, un-nest only the element at this position
    :return: Spark DataFrame
    """
    # If the number of splits was not defined, try to infer the length from the first element
    infer_n = None
    if n is None:
        infer_n = True

    columns = parse_columns(self, columns)

    df = self
    for col_name in columns:
        expr = None
        col_dtype = self.schema[col_name].dataType

        # Array
        if is_(col_dtype, ArrayType):
            expr = F.col(col_name)
            # Try to infer the array length using the first row
            if infer_n is True:
                n = len(self.cols.cell(col_name))

            for i in builtins.range(n):
                df = df.withColumn(col_name + "_" + str(i), expr.getItem(i))

        # String
        elif is_(col_dtype, StringType):
            expr = F.split(F.col(col_name), mark)
            # Try to infer the number of splits using the first row
            if infer_n is True:
                n = len(self.cols.cell(col_name).split(mark))

            if is_int(index):
                r = builtins.range(index, index + 1)
            else:
                r = builtins.range(0, n)

            for i in r:
                df = df.withColumn(col_name + "_" + str(i), expr.getItem(i))

        # Vector
        elif is_(col_dtype, VectorUDT):
            def extract(row):
                # Append the vector components to the row as plain Python floats
                # (look the column up by name instead of the hard-coded "vector")
                return row + tuple(row[col_name].toArray().tolist())

            df = df.rdd.map(extract).toDF(df.columns)
    return df
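# A self-contained plain-PySpark sketch of the vector branch above:
# append the vector components as extra columns via the RDD API. The
# data is made up, and the new column names are supplied explicitly here.
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([(1, Vectors.dense([0.5, 1.5])),
                             (2, Vectors.dense([2.5, 3.5]))], ["id", "vector"])

flat = sdf.rdd.map(lambda row: row + tuple(row["vector"].toArray().tolist())) \
    .toDF(sdf.columns + ["vector_0", "vector_1"])
flat.show()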