def get_output_cols(input_cols, output_cols):
    """
    Resolve the output column names that go with the given input columns.

    :param input_cols: Input column name (string) or list of input column names.
    :param output_cols: Output column name (string), list of names, or None.
    :return: The resolved output columns:
        - list in / list out: output_cols unchanged (a length mismatch is reported through RaiseIt.length_error)
        - list in / string out: the string is appended to every input name when there is more than
          one input column, otherwise the string itself is wrapped with val_to_list
        - string in / string out: output_cols wrapped with val_to_list
        - output_cols is None: input_cols is returned as is
        - anything else: output_cols unchanged
    """
    if is_list(input_cols) and is_list(output_cols):
        if len(input_cols) != len(output_cols):
            RaiseIt.length_error(input_cols, output_cols)
        return output_cols

    if is_list(input_cols) and is_str(output_cols):
        if len(input_cols) > 1:
            # With several inputs the string acts as a suffix for each input name
            return [name + output_cols for name in input_cols]
        return val_to_list(output_cols)

    if is_str(input_cols) and is_str(output_cols):
        return val_to_list(output_cols)

    if output_cols is None:
        return input_cols

    return output_cols
def rename(columns_old_new=None, func=None):
    """
    Rename one or more columns of the dataframe.

    The dataframe is read from ``self``, which is not a parameter here and must be
    provided by the enclosing scope.

    :param columns_old_new: List of tuples of the form (old_column, new_column_name).
        ``old_column`` can be a column name (string) or a positional index (int).
    :param func: Function applied to every column name (for example a lower/upper
        string transformation). When it is a function it takes precedence and
        columns_old_new is ignored.
    :return: Dataframe with the columns renamed; the dataframe is returned untouched
        when neither argument is usable.
    """
    # A name-transforming function is applied to every column at once
    if is_function(func):
        return self.select([F.col(name).alias(func(name)) for name in self.columns])

    renamed = self
    if is_list_of_tuples(columns_old_new):
        # Check that the 1st element in the tuple is a valid set of columns
        validate_columns_names(self, columns_old_new)

        for pair in columns_old_new:
            source = pair[0]
            if is_str(source):
                renamed = renamed.withColumnRenamed(source, pair[1])
            elif is_int(source):
                # Positional indexes are resolved against the original schema
                renamed = renamed.withColumnRenamed(self.schema.names[source], pair[1])

    return renamed
def print_json(value):
    """
    Pretty-print a JSON value with an indentation of 2.

    :param value: Already parsed object, or a string. Strings have their single
        quotes replaced by double quotes and are then parsed with json.loads.
    :return: None, the value is only printed.
    """
    if is_str(value):
        value = json.loads(value.replace("'", "\""))

    pprint.PrettyPrinter(indent=2).pprint(value)
def gbt(df, columns, input_col, **kargs):
    """
    Fit a gradient boosting tree classifier on the input dataframe and run the prediction.

    :param df: Pyspark dataframe to analyze.
    :param columns: Columns to select for prediction (resolved with parse_columns).
    :param input_col: Name of the column to predict. It must be one of the selected columns.
    :param kargs: Keyword arguments forwarded to GBTClassifier.
    :return: Tuple (dataframe transformed by the fitted model, fitted model).
    :raises TypeError: If df is not a dataframe or input_col is not a string.
    """
    if not is_dataframe(df):
        raise TypeError("Spark dataframe expected")

    columns = parse_columns(df, columns)

    if not is_str(input_col):
        raise TypeError("Error, input column must be a string")

    # Every selected column except the target is used as a feature
    feature_cols = df.select(columns).columns
    feature_cols.remove(input_col)

    indexed = string_to_index(df, input_cols=input_col)
    assembled = vector_assembler(indexed, input_cols=feature_cols)

    classifier = GBTClassifier(**kargs)

    # The indexed target column is renamed to "label" before fitting
    training = assembled.cols.rename([(input_col + "_index", "label")])

    fitted = classifier.fit(training)
    predictions = fitted.transform(training)
    return predictions, fitted
def normalizer(df, input_cols, p=2.0):
    """
    Transforms a dataset of Vector rows, normalizing each Vector to have unit norm.
    It takes parameter p, which specifies the p-norm used for normalization (p=2 by default).

    Each input column is cast to "vector" and a new column named
    ``<column>_normalized`` is produced for it.

    :param df: Dataframe to be transformed.
    :param input_cols: Column name (string) or list of column names to be normalized.
        Duplicated names are processed once.
    :param p: p-norm used for normalization. Must be a float or an int.
    :return: Dataframe with the normalized columns added.
    """
    # The columns argument must be a string or a list. The original test was inverted
    # and rejected exactly the valid inputs.
    if not isinstance(input_cols, (str, list)):
        RaiseIt.type_error(input_cols, [str, list])

    if is_str(input_cols):
        input_cols = [input_cols]

    # The norm must be numeric. The original re-checked input_cols here instead of p.
    if not isinstance(p, (float, int)):
        RaiseIt.type_error(p, [float, int])

    df = df.cols.cast(input_cols, "vector")

    # dict.fromkeys removes duplicates while keeping the caller's column order,
    # so the pipeline stages (and the resulting columns) come out in a stable order.
    stages = [
        Normalizer(inputCol=column, outputCol=column + "_normalized", p=p)
        for column in dict.fromkeys(input_cols)
    ]

    pipeline = Pipeline(stages=stages)
    return pipeline.fit(df).transform(df)
def print_json(value):
    """
    Print a human readable json.

    :param value: json to be printed. A string is first normalized (single quotes
        replaced by double quotes) and parsed with json.loads; any other object
        is printed as is.
    :return: None, the formatted value is written by the pretty printer.
    """
    parsed = json.loads(value.replace("'", "\"")) if is_str(value) else value

    printer = pprint.PrettyPrinter(indent=2)
    printer.pprint(parsed)
def table_name(self, name=None):
    """
    Create (or replace) a temp view for a data frame.

    :param self: Dataframe to be registered.
    :param name: Name of the temp view. Must be a non-empty string.
    :return: The same dataframe, so the call can be chained.
    """
    if not is_str(name):
        RaiseIt.type_error(name, ["string"])

    # Compare by value: `is 0` tests object identity, which is implementation
    # dependent and triggers a SyntaxWarning on recent Python versions.
    if len(name) == 0:
        RaiseIt.value_error(name, ["> 0"])

    self.createOrReplaceTempView(name)
    return self
def infer(value):
    """
    Infer a Spark data type from a value.

    The checks run in a fixed order: None, bool, int, float, list, datetime, date,
    binary and finally string. For a list the element type is inferred from its
    first element only. A string that parses as a boolean is reported as "bool";
    strings that look like dates or arrays are still reported as "string".

    :param value: value to be inferred
    :return: Whatever get_spark_dtypes_object returns for the inferred type
        (it receives None when no check matches).
    """

    def _detect(v):
        # Return the type label (or ArrayType for lists) for a single value
        if v is None:
            return "null"
        if is_bool(v):
            return "bool"
        if isint(v):
            return "int"
        if isfloat(v):
            return "float"
        if is_list(v):
            return ArrayType(infer(v[0]))
        if is_datetime(v):
            return "datetime"
        if is_date(v):
            return "date"
        if is_binary(v):
            return "binary"
        if is_str(v):
            if str_to_boolean(v):
                return "bool"
            # Date-like and array-like strings are probed but kept as plain strings
            if str_to_date(v) or str_to_array(v):
                return "string"
            return "string"
        return None

    return get_spark_dtypes_object(_detect(value))
def create(self, df, func, suffix=None, output="df", *args, **kwargs):
    """
    This is a helper function that outputs python tests for Spark Dataframes.

    The function to be tested is executed right away and its result is embedded in the
    generated source as the expected value.

    :param df: Spark Dataframe used as source. If None, self.df is used to run the function.
    :param func: Spark dataframe function to be tested, as a dotted path (for example "cols.rename").
        If None the test only covers the creation of the dataframe (self.op.create.df is called with
        args and kwargs).
    :param suffix: The create method will try to create a test function with the func param given.
        If you want to test a function with different params you can use suffix.
    :param output: can be a 'df' or a 'json'. Any other value produces a test without expected
        value or assertion.
    :param args: Arguments to be used in the function
    :param kwargs: Keyword arguments to be used in the functions
    :return: Source code of the generated test function as a string
    """
    buffer = []

    def add_buffer(value):
        # Every emitted line gets one extra leading tab
        buffer.append("\t" + value)

    def render_list(values):
        # Render a list argument as python source. Each list is rendered on its own so a
        # list of unsupported/mixed content can neither crash nor reuse a previous rendering.
        if is_list_of_strings(values):
            items = ["'" + x + "'" for x in values]
        elif is_list_of_numeric(values) or is_list_of_tuples(values):
            items = [str(x) for x in values]
        else:
            items = [repr(x) for x in values]
        return '[' + ','.join(items) + ']'

    if suffix is None:
        suffix = ""
    else:
        suffix = "_" + suffix

    # Create func test name. If func is None we just test the create.df function and do not
    # transform the data frame in any way
    if func is None:
        func_test_name = "test_" + "create_df" + suffix + "()"
    else:
        func_test_name = "test_" + func.replace(".", "_") + suffix + "()"

    print("Creating {test} test function...".format(test=func_test_name))
    logging.info(func_test_name)

    add_buffer("@staticmethod\n")
    add_buffer("def " + func_test_name + ":\n")

    if df is not None:
        source_df = "\tsource_df=op.create.df(" + df.export() + ")\n"
        df_func = df
        add_buffer(source_df)
    else:
        df_func = self.df

    # Process simple arguments. Values that are not strings, numbers or lists are not rendered.
    rendered_args = []
    for v in args:
        if is_str(v):
            rendered_args.append("'" + v + "'")
        elif is_numeric(v):
            rendered_args.append(str(v))
        elif is_list(v):
            rendered_args.append(render_list(v))
    _args = ','.join(rendered_args)

    # Process keywords arguments
    _kwargs = []
    for k, v in kwargs.items():
        if is_str(v):
            v = "'" + v + "'"
        _kwargs.append(k + "=" + str(v))

    # Separator if we have positional and keyword arguments
    separator = ""
    if (not is_list_empty(args)) and (not is_list_empty(kwargs)):
        separator = ","

    if func is None:
        add_buffer("\tactual_df = source_df\n")
    else:
        add_buffer("\tactual_df = source_df." + func + "(" + _args + separator + ','.join(_kwargs) + ")\n")

    # Apply function to the dataframe
    if func is None:
        df_result = self.op.create.df(*args, **kwargs)
    else:
        # Walk the dotted path to get the callable to be applied
        for f in func.split("."):
            df_func = getattr(df_func, f)
        df_result = df_func(*args, **kwargs)

    if output == "df":
        expected = "\texpected_df = op.create.df(" + df_result.export() + ")\n"
    elif output == "json":
        if is_str(df_result):
            df_result = "'" + df_result + "'"
        else:
            df_result = str(df_result)
        expected = "\texpected_value =" + df_result + "\n"
    else:
        # Unknown output kind: emit an empty line instead of failing on an unbound name
        expected = "\t\n"

    add_buffer(expected)

    if output == "df":
        add_buffer("\tassert (expected_df.collect() == actual_df.collect())\n")
    elif output == "json":
        add_buffer("\tassert (expected_value == actual_df)\n")

    return "".join(buffer)
def create(self, df, func, suffix=None, output="df", *args, **kwargs):
    """
    This is a helper function that outputs python tests for Spark Dataframes.

    The function to be tested is executed right away and its result is embedded in the
    generated source as the expected value.

    :param df: Source object. If None, self.df is used to run the function. A Spark Dataframe is
        exported into the generated test as ``source_df``. Any other object is used as is to run
        the function and is referenced as ``op`` in the generated test.
    :param func: Function to be tested, as a dotted path (for example "cols.rename").
        If None the test only covers the creation of the dataframe (self.op.create.df is called with
        args and kwargs).
    :param suffix: The create method will try to create a test function with the func param given.
        If you want to test a function with different params you can use suffix.
    :param output: can be a 'df' or a 'json'. Any other value produces a test without expected
        value or assertion.
    :param args: Arguments to be used in the function
    :param kwargs: Keyword arguments to be used in the functions
    :return: Source code of the generated test function as a string
    """
    buffer = []

    def add_buffer(value):
        # Every emitted line gets one extra leading tab
        buffer.append("\t" + value)

    def render_list(values):
        # Render a list argument as python source. Each list is rendered on its own so a
        # list of unsupported/mixed content can neither crash nor reuse a previous rendering.
        if is_list_of_strings(values):
            items = ["'" + x + "'" for x in values]
        elif is_list_of_numeric(values) or is_list_of_tuples(values):
            items = [str(x) for x in values]
        else:
            items = [repr(x) for x in values]
        return '[' + ','.join(items) + ']'

    if suffix is None:
        suffix = ""
    else:
        suffix = "_" + suffix

    # Create func test name. If func is None we just test the create.df function and do not
    # transform the data frame in any way
    if func is None:
        func_test_name = "test_" + "create_df" + suffix + "()"
    else:
        func_test_name = "test_" + func.replace(".", "_") + suffix + "()"

    print("Creating {test} test function...".format(test=func_test_name))
    logger.print(func_test_name)

    add_buffer("@staticmethod\n")
    add_buffer("def " + func_test_name + ":\n")

    # Name used in the generated code for the object the function is called on
    source = "source_df"
    if df is None:
        # Use the main df
        df_func = self.df
    elif isinstance(df, pyspark.sql.dataframe.DataFrame):
        source_df = "\tsource_df=op.create.df(" + df.export() + ")\n"
        df_func = df
        add_buffer(source_df)
    else:
        # TODO: op is not supposed to be hardcoded
        source = "op"
        df_func = df

    # Process simple arguments. Values of any other kind are not rendered.
    rendered_args = []
    for v in args:
        if is_str(v):
            rendered_args.append("'" + v + "'")
        elif is_numeric(v):
            rendered_args.append(str(v))
        elif is_list(v):
            rendered_args.append(render_list(v))
        elif is_function(v):
            # Functions are referenced by their qualified name in the generated code
            rendered_args.append(v.__qualname__)
    _args = ','.join(rendered_args)

    # Process keywords arguments
    _kwargs = []
    for k, v in kwargs.items():
        if is_str(v):
            v = "'" + v + "'"
        _kwargs.append(k + "=" + str(v))

    # Separator if we have positional and keyword arguments
    separator = ""
    if (not is_list_empty(args)) and (not is_list_empty(kwargs)):
        separator = ","

    if func is None:
        add_buffer("\tactual_df = source_df\n")
    else:
        add_buffer("\tactual_df =" + source + "." + func + "(" + _args + separator + ','.join(_kwargs) + ")\n")

    # Apply function to the dataframe
    if func is None:
        df_result = self.op.create.df(*args, **kwargs)
    else:
        # Here we construct the method to be applied to the source object
        for f in func.split("."):
            df_func = getattr(df_func, f)
        df_result = df_func(*args, **kwargs)

    if output == "df":
        expected = "\texpected_df = op.create.df(" + df_result.export() + ")\n"
    elif output == "json":
        if is_str(df_result):
            df_result = "'" + df_result + "'"
        else:
            df_result = str(df_result)
        add_buffer("\tactual_df =json_enconding(actual_df)\n")
        expected = "\texpected_value =json_enconding(" + df_result + ")\n"
    else:
        expected = "\t\n"

    add_buffer(expected)

    if output == "df":
        add_buffer("\tassert (expected_df.collect() == actual_df.collect())\n")
    elif output == "json":
        add_buffer("\tassert (expected_value == actual_df)\n")

    return "".join(buffer)
def parse_columns(df, cols_args, get_args=False, is_regex=None, filter_by_column_dtypes=None,
                  accepts_missing_cols=False):
    """
    Return a list of columns and check that the columns exist in the dataframe.
    Accepts '*' (or None) as parameter, in which case a list of all the columns in the dataframe is
    returned. Also accepts a regex.
    If a tuple or a list of tuples is given, the first element of every tuple is the column name and
    the other elements are params. These params can be used to create custom transformation
    functions, for example:

        def func(attrs):   # attrs will be (1, 2) and (3, 4)
            return attrs[0] + 1
        df.cols().apply([('col_1', 1, 2), ('cols_2', 3, 4)], func)

    :param df: Dataframe in which the columns are going to be checked
    :param cols_args: '*' or None for all the columns, a column name, a column index, a list of
        names/indexes, a tuple / list of tuples (column, params...), or a regex when is_regex is True
    :param get_args: If True return a tuple (columns, params); if False return only the columns.
        The params are not filtered when filter_by_column_dtypes removes columns.
    :param is_regex: Use True if cols_args is a regex. The pattern can be passed as a string or as
        the first element of a list.
    :param filter_by_column_dtypes: Data type name(s); only the columns with these data types are kept
    :param accepts_missing_cols: if True do not check if the columns exist in the dataframe
    :return: A list of columns string names, or (columns, params) when get_args is True
    """
    cols = None
    attrs = None

    if is_regex is True:
        # Indexing a bare string would keep only its first character, so a string is taken
        # as the whole pattern and a list contributes its first element.
        pattern = cols_args if is_str(cols_args) else cols_args[0]
        r = re.compile(pattern)
        cols = list(filter(r.match, df.columns))

    # if columns value is * get all dataframes columns
    elif cols_args == "*" or cols_args is None:
        cols = df.columns

    # In case we have a tuple or a list of tuples, the first element of every tuple is taken as
    # the column name and the rest as params
    elif is_tuple(cols_args) or is_list_of_tuples(cols_args):
        cols_args = val_to_list(cols_args)
        cols = [i[0] for i in cols_args]
        attrs = [i[1:] for i in cols_args]

    else:
        # if not a list convert to list
        cols = val_to_list(cols_args)
        # Get col name from index
        cols = [c if is_str(c) else df.columns[c] for c in cols]

    # Check for missing columns
    if accepts_missing_cols is False:
        check_for_missing_columns(df, cols)

    # Filter by column data type
    filter_by_column_dtypes = val_to_list(filter_by_column_dtypes)
    if is_list_of_strings(filter_by_column_dtypes):
        # Get columns for every data type
        columns_filtered = filter_col_name_by_dtypes(df, filter_by_column_dtypes)

        # Intersect the columns filtered per datatype with the columns passed to the function.
        # Duplicates are dropped and the requested order is kept so the result is deterministic.
        allowed = set(columns_filtered)
        cols = [c for c in dict.fromkeys(cols) if c in allowed]

    # Return cols or cols and params
    if get_args is True:
        params = cols, attrs
    elif get_args is False:
        params = cols
    else:
        RaiseIfNot.value_error(get_args, ["True", "False"])

    return params