def get_output_cols(input_cols, output_cols):
    """
    Construct the output column names from the input column names.

    :param input_cols: a single column name or a list of column names
    :param output_cols: a single name, a list of names (paired with input_cols) or None
    :return: the resolved output column name(s)
    """
    if is_list(input_cols):
        if is_list(output_cols):
            # Paired lists must have matching lengths
            if len(input_cols) != len(output_cols):
                RaiseIt.length_error(input_cols, output_cols)
        elif is_str(output_cols):
            if len(input_cols) > 1:
                # Several inputs, one string: use the string as a suffix for every input name
                output_cols = [col_name + output_cols for col_name in input_cols]
            else:
                output_cols = val_to_list(output_cols)
        elif output_cols is None:
            # No output names given: reuse the input names
            output_cols = input_cols
    else:
        if is_str(input_cols) and is_str(output_cols):
            output_cols = val_to_list(output_cols)
        elif output_cols is None:
            output_cols = input_cols
    return output_cols
def format_dict(_dict, tidy=True):
    """
    Format a dict by collapsing single-element containers.

    If the main dict or a nested dict has only one element, the value is pulled
    up, e.g. {"col_name": {0.5: 200}} becomes 200. With tidy=False, only a list
    wrapper is unwrapped (the first element is returned).

    :param _dict: dict (or single-element list of dicts) to be formatted
    :param tidy: when True, recursively collapse single-element dicts
    :return: the simplified value
    """
    from optimus.helpers.check import is_dict, is_list_of_one_element, is_dict_of_one_element, is_list

    if tidy is True:
        def _format_dict(_dict):
            # Non-dicts pass through unchanged
            if not is_dict(_dict):
                return _dict

            for k, v in _dict.items():
                # If the value is a dict
                if is_dict(v):
                    # and only have one value, replace it with that single value
                    # (mutates the dict in place while iterating; keys are not
                    # added or removed, only reassigned, so iteration is safe)
                    if len(v) == 1:
                        _dict[k] = next(iter(v.values()))
                else:
                    # A single-entry dict with a non-dict value collapses to the value.
                    # NOTE(review): this rebinds the local _dict while the loop keeps
                    # iterating over the original dict's items — with one entry the
                    # loop ends immediately, so behavior is as intended.
                    if len(_dict) == 1:
                        _dict = v
            return _dict

        # Unwrap a single-element list or single-element dict before formatting
        if is_list_of_one_element(_dict):
            _dict = _dict[0]
        elif is_dict_of_one_element(_dict):
            # if dict_depth(_dict) >4:
            _dict = next(iter(_dict.values()))

        # Some aggregation like min or max return a string column

        def repeat(f, n, _dict):
            # Apply f to _dict n times, feeding each result into the next call
            if n == 1:  # note 1, not 0
                return f(_dict)
            else:
                return f(repeat(f, n - 1, _dict))  # call f with returned value

        # TODO: Maybe this can be done in a recursive way
        # We apply two passes to the dict so we can process internals dicts and the superiors ones
        return repeat(_format_dict, 2, _dict)
    else:
        # Return the dict from a list
        if is_list(_dict):
            return _dict[0]
        else:
            return _dict
def infer(value):
    """
    Infer a Spark data type from a Python value.

    :param value: value to be inferred
    :return: Spark data type (the type name resolved through parse_spark_class_dtypes)
    """
    result = None
    if value is None:
        result = "null"
    elif is_bool(value):
        result = "bool"
    elif isint(value):
        result = "int"
    elif isfloat(value):
        result = "float"
    elif is_list(value):
        # Infer the array element type from the first element.
        # Guard against an empty list, which previously raised IndexError;
        # an empty list is treated as an array of nulls.
        result = ArrayType(infer(value[0]) if value else infer(None))
    elif is_datetime(value):
        result = "datetime"
    elif is_date(value):
        result = "date"
    elif is_binary(value):
        result = "binary"
    elif is_str(value):
        # Strings that encode other types are still reported as "string";
        # only boolean-looking strings are promoted to "bool".
        if str_to_boolean(value):
            result = "bool"
        elif str_to_date(value):
            result = "string"  # date
        elif str_to_array(value):
            result = "string"  # array
        else:
            result = "string"
    return parse_spark_class_dtypes(result)
def escape_columns(columns):
    """
    Wrap column names in backticks to prevent problems with dots in names.

    :param columns: a single column name or a list/tuple of column names
    :return: escaped name(s) — a list in gives a list out, a single name in
        gives a single name out (the original returned a one-element list for
        an already-escaped single name; that inconsistency is fixed here)
    """

    def _escape(col):
        # A name with a backtick at either end is considered already escaped
        # and is returned untouched (same rule as the original condition).
        if col.startswith("`") or col.endswith("`"):
            return col
        return "`" + col + "`"

    # isinstance replaces the project is_list helper and also accepts tuples;
    # behavior for lists and single strings is unchanged.
    if isinstance(columns, (list, tuple)):
        return [_escape(col) for col in columns]
    return _escape(columns)
def create(self, obj, method, suffix=None, output="df", additional_method=None, *args, **kwargs):
    """
    This is a helper function that output python tests for Spark Dataframes.

    It builds the test source code in a buffer, actually executes the tested
    method to capture the expected result, and writes the generated test to
    "<self.path>//<test_name>.test".

    :param obj: Object to be tested (None uses self.df; a Spark DataFrame is
        re-created inside the generated test via op.create.df)
    :param method: Method to be tested (dotted path resolved with getattr)
    :param suffix: The test name will be created using the method param. suffix
        will add a string in case you want to customize the test name.
    :param output: can be a 'df', a 'json' or a 'dict'
    :param additional_method: extra no-arg method chained after `method`
    :param args: Arguments to be used in the method
    :param kwargs: Keyword arguments to be used in the functions
    :return: None (writes the generated test file as a side effect)
    """
    buffer = []

    def add_buffer(value):
        # Every generated line is indented one tab inside the test function
        buffer.append("\t" + value)

    # Create name from method / additional_method / suffix parts
    name = []
    if method is not None:
        name.append(method.replace(".", "_"))
    if additional_method is not None:
        name.append(additional_method)
    if suffix is not None:
        name.append(suffix)

    test_name = "_".join(name)
    func_test_name = "test_" + test_name + "()"

    print("Creating {test} test function...".format(test=func_test_name))
    logger.print(func_test_name)

    # 'dict' output generates an instance method (uses self.assertDictEqual);
    # everything else generates a @staticmethod
    if not output == "dict":
        add_buffer("@staticmethod\n")
        func_test_name = "test_" + test_name + "()"
    else:
        func_test_name = "test_" + test_name + "(self)"

    # Used unconditionally below when building the output path
    filename = test_name + ".test"

    add_buffer("def " + func_test_name + ":\n")

    source = "source_df"
    if obj is None:
        # Use the main df
        df_func = self.df
    elif isinstance(obj, pyspark.sql.dataframe.DataFrame):
        # Embed the DataFrame construction in the generated test
        source_df = "\tsource_df=op.create.df(" + obj.export() + ")\n"
        df_func = obj
        add_buffer(source_df)
    else:
        # Any other object is referenced in the generated code by its variable name
        source = get_var_name(obj)
        df_func = obj

    # Process simple arguments: render each positional arg as source code
    _args = []
    for v in args:
        if is_str(v):
            _args.append("'" + v + "'")
        elif is_numeric(v):
            _args.append(str(v))
        elif is_list(v):
            # NOTE(review): if v is a list of none of these three kinds,
            # `lst` is unbound and this raises UnboundLocalError — confirm
            # callers only pass lists of strings/numbers/tuples
            if is_list_of_strings(v):
                lst = ["'" + x + "'" for x in v]
            elif is_list_of_numeric(v):
                lst = [str(x) for x in v]
            elif is_list_of_tuples(v):
                lst = [str(x) for x in v]
            _args.append('[' + ','.join(lst) + ']')
        elif is_function(v):
            _args.append(v.__qualname__)
        else:
            _args.append(get_var_name(v))

        # else:
        #     import marshal
        #     code_string = marshal.dumps(v.__code__)
        #     add_buffer("\tfunction = '" + code_string + "'\n")
        # import marshal, types
        #
        # code = marshal.loads(code_string)
        # func = types.FunctionType(code, globals(), "some_func_name")

    _args = ','.join(_args)
    _kwargs = []

    # print(_args)
    # Process keywords arguments: render each as key=value source code
    for k, v in kwargs.items():
        if is_str(v):
            v = "'" + v + "'"
        _kwargs.append(k + "=" + str(v))

    # Separator if we have positional and keyword arguments
    separator = ""
    if (not is_list_empty(args)) & (not is_list_empty(kwargs)):
        separator = ","

    # Emit the line that computes actual_df in the generated test
    if method is None:
        add_buffer("\tactual_df = source_df\n")
    else:
        am = ""
        if additional_method:
            am = "." + additional_method + "()"

        add_buffer("\tactual_df =" + source + "." + method + "(" + _args + separator + ','.join(
            _kwargs) + ")" + am + "\n")

    # Apply function to the dataframe (execute it now to capture the expected value)
    if method is None:
        df_result = self.op.create.df(*args, **kwargs)
    else:
        # Here we construct the method to be applied to the source object
        for f in method.split("."):
            df_func = getattr(df_func, f)

        df_result = df_func(*args, **kwargs)

    # Additional Methods
    if additional_method is not None:
        df_result = getattr(df_result, additional_method)()

    # Emit the expected-value line for the chosen output kind
    if output == "df":
        df_result.table()
        expected = "\texpected_df = op.create.df(" + df_result.export() + ")\n"
    elif output == "json":
        print(df_result)

        if is_str(df_result):
            df_result = "'" + df_result + "'"
        else:
            df_result = str(df_result)

        add_buffer("\tactual_df =json_enconding(actual_df)\n")

        expected = "\texpected_value =json_enconding(" + df_result + ")\n"
    elif output == "dict":
        print(df_result)
        expected = "\texpected_value =" + df_result + "\n"
    else:
        expected = "\t\n"

    add_buffer(expected)

    # Output: emit the assertion matching the output kind
    if output == "df":
        add_buffer("\tassert (expected_df.collect() == actual_df.collect())\n")
    elif output == "json":
        add_buffer("\tassert(expected_value == actual_df)\n")
    elif output == "dict":
        add_buffer("\tself.assertDictEqual(deep_sort(expected_value), deep_sort(actual_df))\n")

    filename = self.path + "//" + filename

    # Create the target directory if needed
    if not os.path.exists(os.path.dirname(filename)):
        try:
            os.makedirs(os.path.dirname(filename))
        except OSError as exc:  # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise

    # Write file
    # NOTE(review): the file handle is never closed in the visible span —
    # confirm a close() follows, or switch to a `with open(...)` block
    test_file = open(filename, 'w', encoding='utf-8')

    for b in buffer:
        test_file.write(b)