def append(col_name=None, value=None):
    """
    Append a column to a Dataframe
    :param col_name: Name of the new column
    :param value: Literal value, list or tuple of values for the new column
    :return: Spark Dataframe with the new column appended
    """

    def lit_array(_value):
        # Wrap every element in F.lit() so the values can be packed into an array column
        temp = []
        for v in _value:
            temp.append(F.lit(v))
        return F.array(temp)

    df = self

    if is_num_or_str(value):
        value = F.lit(value)
    elif is_list(value):
        value = lit_array(value)
    elif is_tuple(value):
        value = lit_array(list(value))

    if is_(value, F.Column):
        df = df.withColumn(col_name, value)
    return df
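# A minimal usage sketch for append(). It assumes an Optimus-style `op` object
# whose create.df() builds a Spark Dataframe, and that append() is exposed as
# df.cols.append(); both attachment points are illustrative, not guaranteed.
def _example_append(op):
    df = op.create.df([("name", "string", True)], [("alice",), ("bob",)])
    # A single literal is broadcast to every row...
    df = df.cols.append("active", True)
    # ...while a list or tuple becomes one array column shared by all rows
    df = df.cols.append("scores", [1, 2, 3])
    df.show()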
def _add_driver_class_path(self, driver_class_path):
    if self.driver_class_path is None:
        self.driver_class_path = []

    # val_to_list() already normalizes a single value to a list, so both a
    # bare string and a list of strings are accepted here
    for d in val_to_list(driver_class_path):
        self.driver_class_path.append(d)
def _add_jars(self, jar):
    if self.jars is None:
        self.jars = []

    # Same normalization as _add_driver_class_path: accept a single jar or a list
    for j in val_to_list(jar):
        self.jars.append(j)
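# A usage sketch for the two accumulator helpers above. `builder` stands for
# whatever session-builder object defines _add_jars/_add_driver_class_path and
# exposes `jars` and `driver_class_path` attributes; the names are illustrative.
def _example_add_jars(builder):
    builder._add_jars(["mysql-connector-java-8.0.17.jar", "postgresql-42.2.8.jar"])
    builder._add_driver_class_path("mysql-connector-java-8.0.17.jar")
    # The accumulated lists can later be joined into spark-submit style
    # arguments, e.g. ",".join(builder.jars)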
def get_output_cols(input_cols, output_cols):
    # Construct output column names from the input column names
    if is_list(input_cols) and is_list(output_cols):
        if len(input_cols) != len(output_cols):
            RaiseIt.length_error(input_cols, output_cols)
    elif is_list(input_cols) and is_str(output_cols):
        if len(input_cols) > 1:
            # Use the string as a suffix for every input column
            output_cols = list([i + output_cols for i in input_cols])
        else:
            output_cols = val_to_list(output_cols)
    elif is_str(input_cols) and is_str(output_cols):
        output_cols = val_to_list(output_cols)
    elif output_cols is None:
        # Operate in place: write results back to the input columns
        output_cols = input_cols
    return output_cols
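# A quick illustration of get_output_cols() semantics; the expected values
# follow directly from the branches above rather than from any documentation.
def _example_get_output_cols():
    # Suffix mode: one string plus several inputs appends the string to each name
    assert get_output_cols(["a", "b"], "_out") == ["a_out", "b_out"]
    # Explicit rename: single input, single output
    assert get_output_cols("a", "b") == ["b"]
    # In-place mode: None means results overwrite the input columns
    assert get_output_cols(["a", "b"], None) == ["a", "b"]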
def val_to_list(val):
    """
    Convert a single value, string or number, to a list
    :param val:
    :return: val wrapped in a list, or val unchanged if it is already a list
    """
    if not is_list(val):
        val = [val]
    return val
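# val_to_list() behavior in a couple of asserts, derived from the body above:
def _example_val_to_list():
    # Scalars are wrapped; lists pass through untouched
    assert val_to_list("a") == ["a"]
    assert val_to_list(1) == [1]
    assert val_to_list(["a", "b"]) == ["a", "b"]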
def replace(columns, search_and_replace=None, value=None, regex=None):
    """
    Replace a value or a list of values with a specified string
    :param columns: '*', list of column names or a single column name.
    :param search_and_replace: values to look for and replace
    :param value: new value to replace the old one
    :param regex:
    :return:
    """
    replace = None
    search = None

    if is_list_of_tuples(search_and_replace):
        # Unzip the (search, replace) pairs into two parallel lists
        params = list(zip(*search_and_replace))
        search = list(params[0])
        replace = list(params[1])

    elif is_list(search_and_replace):
        search = search_and_replace
        replace = value

    elif is_one_element(search_and_replace):
        search = val_to_list(search_and_replace)
        replace = value

    if regex:
        # In regex mode the pattern is passed through as-is
        search = search_and_replace
        replace = value

    # if regex or normal replace we use regexp or replace functions
    # TODO check if .contains can be used instead of regexp
    def func_regex(_df, _col_name, _search, _replace):
        return _df.withColumn(_col_name, F.regexp_replace(_col_name, _search, _replace))

    def func_replace(_df, _col_name, _search, _replace):
        # Cast the search values to the column's Python type before replacing
        data_type = self.cols.dtypes(_col_name)
        _search = [PYTHON_TYPES_[data_type](s) for s in _search]
        _df = _df.replace(_search, _replace, _col_name)
        return _df

    if regex:
        func = func_regex
    else:
        func = func_replace

    df = self

    columns = parse_columns(self, columns, filter_by_column_dtypes="string")
    for c in columns:
        df = func(df, c, search, replace)

    return df
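# A usage sketch for replace(), assuming it is exposed as df.cols.replace
# (the attachment point is illustrative):
def _example_replace(df):
    # Pairwise replacements via a list of (search, replace) tuples
    df = df.cols.replace("city", [("NYC", "New York"), ("LA", "Los Angeles")])
    # Several search values collapsed into one replacement
    df = df.cols.replace("state", ["N.Y.", "N.Y"], "NY")
    # Regex mode: strip every run of digits from the column
    df = df.cols.replace("address", "\\d+", "", regex=True)
    return df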
def infer(value):
    """
    Infer a Spark data type from a value
    :param value: value to be inferred
    :return: Spark data type
    """
    result = None
    if value is None:
        result = "null"
    elif is_bool(value):
        result = "bool"
    elif isint(value):
        result = "int"
    elif isfloat(value):
        result = "float"
    elif is_list(value):
        # Infer the element type from the first element
        result = ArrayType(infer(value[0]))
    elif is_datetime(value):
        result = "datetime"
    elif is_date(value):
        result = "date"
    elif is_binary(value):
        result = "binary"
    elif is_str(value):
        if str_to_boolean(value):
            result = "bool"
        elif str_to_date(value):
            result = "string"  # date
        elif str_to_array(value):
            result = "string"  # array
        else:
            result = "string"

    return get_spark_dtypes_object(result)
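# A few checks spelling out infer()'s mapping; note from the branches above
# that strings which merely look like dates or arrays still map to the string
# type (the inline comments there mark them for possible special-casing).
def _example_infer():
    assert infer(True) == get_spark_dtypes_object("bool")
    assert infer(1) == get_spark_dtypes_object("int")
    assert infer("2019-01-01") == get_spark_dtypes_object("string")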
def escape_columns(columns):
    """
    Wrap a column name in backticks to prevent problems with dots in the name
    :param columns: a column name or a list of column names
    :return:
    """

    escaped_columns = []
    if is_list(columns):
        for col in columns:
            # Skip columns that are already escaped
            if col[0] != "`" and col[-1] != "`":
                escaped_columns.append("`" + col + "`")
            else:
                escaped_columns.append(col)
    else:
        # Single column name: return a string, not a list
        if columns[0] != "`" and columns[-1] != "`":
            escaped_columns = "`" + columns + "`"
        else:
            escaped_columns = columns
    return escaped_columns
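# escape_columns() keeps the input's shape: lists come back as lists,
# single names as plain strings. Already-escaped names pass through unchanged.
def _example_escape_columns():
    assert escape_columns(["name", "`city`"]) == ["`name`", "`city`"]
    assert escape_columns("first.name") == "`first.name`"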
def create(self, df, func, suffix=None, output="df", *args, **kwargs):
    """
    This is a helper function that outputs Python tests for Spark Dataframes.
    :param df: Spark Dataframe
    :param func: Spark dataframe function to be tested
    :param suffix: The create method will try to create a test function with the func param given.
    If you want to test a function with different params you can use suffix.
    :param output: can be 'df' or 'json'
    :param args: Arguments to be used in the function
    :param kwargs: Keyword arguments to be used in the function
    :return:
    """
    buffer = []

    def add_buffer(value):
        buffer.append("\t" + value)

    if suffix is None:
        suffix = ""
    else:
        suffix = "_" + suffix

    # Create the test function name. If func is None we just test the create.df
    # function and do not transform the data frame in any way
    if func is None:
        func_test_name = "test_" + "create_df" + suffix + "()"
    else:
        func_test_name = "test_" + func.replace(".", "_") + suffix + "()"

    print("Creating {test} test function...".format(test=func_test_name))
    logging.info(func_test_name)

    add_buffer("@staticmethod\n")
    add_buffer("def " + func_test_name + ":\n")

    if df is not None:
        source_df = "\tsource_df=op.create.df(" + df.export() + ")\n"
        df_func = df
        add_buffer(source_df)
    else:
        df_func = self.df

    # Process positional arguments
    _args = []
    for v in args:
        if is_str(v):
            _args.append("'" + v + "'")
        elif is_numeric(v):
            _args.append(str(v))
        elif is_list(v):
            if is_list_of_strings(v):
                lst = ["'" + x + "'" for x in v]
            elif is_list_of_numeric(v):
                lst = [str(x) for x in v]
            elif is_list_of_tuples(v):
                lst = [str(x) for x in v]
            _args.append('[' + ','.join(lst) + ']')

    _args = ','.join(_args)
    _kwargs = []

    # Process keyword arguments
    for k, v in kwargs.items():
        if is_str(v):
            v = "'" + v + "'"
        _kwargs.append(k + "=" + str(v))

    # Separator if we have both positional and keyword arguments
    separator = ""
    if (not is_list_empty(args)) and (not is_list_empty(kwargs)):
        separator = ","

    if func is None:
        add_buffer("\tactual_df = source_df\n")
    else:
        add_buffer("\tactual_df = source_df." + func + "(" + _args + separator + ','.join(_kwargs) + ")\n")

    # Apply the function to the dataframe
    if func is None:
        df_result = self.op.create.df(*args, **kwargs)
    else:
        for f in func.split("."):
            df_func = getattr(df_func, f)
        df_result = df_func(*args, **kwargs)

    if output == "df":
        expected = "\texpected_df = op.create.df(" + df_result.export() + ")\n"
    elif output == "json":
        if is_str(df_result):
            df_result = "'" + df_result + "'"
        else:
            df_result = str(df_result)
        expected = "\texpected_value =" + df_result + "\n"

    add_buffer(expected)

    if output == "df":
        add_buffer("\tassert (expected_df.collect() == actual_df.collect())\n")
    elif output == "json":
        add_buffer("\tassert (expected_value == actual_df)\n")

    return "".join(buffer)
def create(self, df, func, suffix=None, output="df", *args, **kwargs):
    """
    This is a helper function that outputs Python tests for Spark Dataframes.
    :param df: Spark Dataframe
    :param func: Spark dataframe function to be tested
    :param suffix: The create method will try to create a test function with the func param given.
    If you want to test a function with different params you can use suffix.
    :param output: can be 'df' or 'json'
    :param args: Arguments to be used in the function
    :param kwargs: Keyword arguments to be used in the function
    :return:
    """
    buffer = []

    def add_buffer(value):
        buffer.append("\t" + value)

    if suffix is None:
        suffix = ""
    else:
        suffix = "_" + suffix

    # Create the test function name. If func is None we just test the create.df
    # function and do not transform the data frame in any way
    if func is None:
        func_test_name = "test_" + "create_df" + suffix + "()"
    else:
        func_test_name = "test_" + func.replace(".", "_") + suffix + "()"

    print("Creating {test} test function...".format(test=func_test_name))
    logger.print(func_test_name)

    add_buffer("@staticmethod\n")
    add_buffer("def " + func_test_name + ":\n")

    source = "source_df"
    if df is None:
        # Use the main df
        df_func = self.df
    elif isinstance(df, pyspark.sql.dataframe.DataFrame):
        source_df = "\tsource_df=op.create.df(" + df.export() + ")\n"
        df_func = df
        add_buffer(source_df)
    else:
        # TODO: op is not supposed to be hardcoded
        source = "op"
        df_func = df

    # Process positional arguments
    _args = []
    for v in args:
        if is_str(v):
            _args.append("'" + v + "'")
        elif is_numeric(v):
            _args.append(str(v))
        elif is_list(v):
            if is_list_of_strings(v):
                lst = ["'" + x + "'" for x in v]
            elif is_list_of_numeric(v):
                lst = [str(x) for x in v]
            elif is_list_of_tuples(v):
                lst = [str(x) for x in v]
            _args.append('[' + ','.join(lst) + ']')
        elif is_function(v):
            _args.append(v.__qualname__)
        # else: arbitrary functions could be serialized with
        # marshal.dumps(v.__code__) and rebuilt in the generated test via
        # types.FunctionType(marshal.loads(code_string), globals(), name)

    _args = ','.join(_args)
    _kwargs = []

    # Process keyword arguments
    for k, v in kwargs.items():
        if is_str(v):
            v = "'" + v + "'"
        _kwargs.append(k + "=" + str(v))

    # Separator if we have both positional and keyword arguments
    separator = ""
    if (not is_list_empty(args)) and (not is_list_empty(kwargs)):
        separator = ","

    if func is None:
        add_buffer("\tactual_df = source_df\n")
    else:
        add_buffer("\tactual_df =" + source + "." + func + "(" + _args + separator + ','.join(_kwargs) + ")\n")

    # Apply the function to the dataframe
    if func is None:
        df_result = self.op.create.df(*args, **kwargs)
    else:
        # Here we construct the method to be applied to the source object
        for f in func.split("."):
            df_func = getattr(df_func, f)
        df_result = df_func(*args, **kwargs)

    if output == "df":
        expected = "\texpected_df = op.create.df(" + df_result.export() + ")\n"
    elif output == "json":
        if is_str(df_result):
            df_result = "'" + df_result + "'"
        else:
            df_result = str(df_result)
        add_buffer("\tactual_df =json_enconding(actual_df)\n")
        expected = "\texpected_value =json_enconding(" + df_result + ")\n"
    else:
        expected = "\t\n"

    add_buffer(expected)

    if output == "df":
        add_buffer("\tassert (expected_df.collect() == actual_df.collect())\n")
    elif output == "json":
        add_buffer("\tassert (expected_value == actual_df)\n")

    return "".join(buffer)
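# A usage sketch for the test generator above, assuming a test-creator object
# `t` holding `op` and a default dataframe (names are illustrative):
def _example_create_test(t, source_df):
    # Generates the source text of test_cols_upper_all(), which rebuilds
    # source_df, applies cols.upper("name") and asserts the expected rows
    code = t.create(source_df, "cols.upper", "all", "df", "name")
    print(code)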