import dateutil.parser
from pyspark.sql import functions as F
from pyspark.ml.stat import Correlation

# NOTE: helpers such as parse_columns, one_list_to_val, val_to_list,
# is_list_of_str_or_int, parse_python_dtypes, isint, isfloat, logger and the
# SPARK_* dtype dictionaries are assumed to come from this module's own helper
# imports, which are not shown here.


def sort(self, col_sort):
    """
    Sort rows taking into account multiple columns.

    If a list of plain column names is given, every column is sorted
    descending by default. To control the order per column, pass a list of
    tuples (col_name, "asc") or (col_name, "desc").

    :param col_sort: column and sort type combination (col_name, "asc")
    :type col_sort: list of tuples
    """
    if is_list_of_str_or_int(col_sort):
        # Plain column names: default every column to descending order
        col_sort = [(col_name, "desc") for col_name in col_sort]

    func = []
    for cs in col_sort:
        col_name = one_list_to_val(cs[0])
        order = cs[1]

        if order == "asc":
            sort_func = F.asc
        elif order == "desc":
            sort_func = F.desc
        else:
            raise ValueError("order must be 'asc' or 'desc', received '{}'".format(order))

        func.append(sort_func(col_name))

    # Delegate to Spark's native DataFrame.sort
    df = self.sort(*func)
    return df
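# Usage sketch for sort (hedged; assumes this function is attached to the
# DataFrame the way the rest of this module is, e.g. as df.rows.sort, and the
# column names below are illustrative):
#
#   df.rows.sort([("age", "asc"), ("name", "desc")])  # explicit per-column order
#   df.rows.sort(["age", "name"])                     # plain names default to "desc"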
def correlation(self, columns, method="pearson", output="json"):
    """
    Calculate the correlation between columns. It will try to cast a column
    to float where necessary and impute missing values.

    :param self: Spark DataFrame
    :param columns: Columns to be processed
    :param method: Method used to calculate the correlation ("pearson" by default)
    :param output: "array" or "json"
    :return: correlation matrix as an array, or a sorted list of dicts as json
    """
    columns = parse_columns(self, columns)
    df = self

    # If a single vector column is given, use it directly. Otherwise try to
    # cast the selected columns to float and assemble them into a vector.
    if len(columns) == 1 and is_column_a(df, columns, "vector"):
        output_col = one_list_to_val(columns)
    else:
        output_col = "_correlation_features"

        for col_name in columns:
            df = df.cols.cast(col_name, "float")
            logger.print("Casting {col_name} to float...".format(col_name=col_name))

        # Create the Vector necessary to calculate the correlation
        df = df.cols.nest(columns, "vector", output_cols=output_col)

    corr = Correlation.corr(df, output_col, method).head()[0].toArray()

    if output == "array":
        result = corr
    elif output == "json":
        # Parse the result to json
        col_pair = []
        for col_name in columns:
            for col_name_2 in columns:
                col_pair.append({"between": col_name, "and": col_name_2})

        # Flatten the correlation matrix (column-major) to pair it with col_pair
        values = corr.flatten('F').tolist()

        result = []
        for n, v in zip(col_pair, values):
            # Remove the correlation of a column with itself
            if n["between"] != n["and"]:
                n["value"] = v
                result.append(n)

        result = sorted(result, key=lambda k: k['value'], reverse=True)
    else:
        raise ValueError("output must be 'array' or 'json', received '{}'".format(output))

    return result
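# Usage sketch for correlation (hedged; "price" and "discount" are illustrative
# column names). With output="json" the result is a list of dicts sorted by
# correlation value, e.g. {"between": "price", "and": "discount", "value": 0.9}:
#
#   df.correlation(["price", "discount"], method="pearson", output="json")
#   df.correlation(["price", "discount"], output="array")  # raw correlation matrix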
def filter_list(val, index=0):
    """
    Convert a list to None, int, str or a list filtering a specific index.
    [] to None
    ['test'] to 'test'

    :param val:
    :param index:
    :return:
    """
    if len(val) == 0:
        return None
    else:
        return one_list_to_val([column[index] for column in val])
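# Expected behavior of filter_list (a hedged sketch, derived from the code above):
#
#   filter_list([])                                            # -> None
#   filter_list([("name", "int")])                             # -> "name" (single value unwrapped)
#   filter_list([("name", "int"), ("age", "str")], index=1)    # -> ["int", "str"]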
def is_column_a(df, column, dtypes):
    """
    Check if a column matches a list of data types.

    :param df: Spark DataFrame
    :param column: column name
    :param dtypes: data type(s) to check against
    :return: bool
    """
    data_type = tuple(val_to_list(parse_spark_dtypes(dtypes)))
    column = one_list_to_val(column)

    # Filter columns by data type
    return isinstance(df.schema[column].dataType, data_type)
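# Usage sketch for is_column_a (hedged; assumes "age" exists in df and that the
# short name "int" is mapped by parse_spark_dtypes):
#
#   is_column_a(df, "age", "int")          # True if df.age is an integer column
#   is_column_a(df, "features", "vector")  # used by correlation() above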
def get_spark_dtypes_object(value):
    """
    Get a PySpark data type object from a string representation,
    for example StringType() from 'string'.

    :param value:
    :return:
    """
    value = val_to_list(value)
    try:
        data_type = [SPARK_DTYPES_DICT_OBJECTS[SPARK_SHORT_DTYPES[v]] for v in value]
    except (KeyError, TypeError):
        data_type = value

    data_type = one_list_to_val(data_type)
    return data_type
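# Usage sketch for get_spark_dtypes_object (hedged; the exact mappings live in
# SPARK_SHORT_DTYPES / SPARK_DTYPES_DICT_OBJECTS, which are not shown here):
#
#   get_spark_dtypes_object("string")           # -> StringType()
#   get_spark_dtypes_object(["string", "int"])  # -> [StringType(), IntegerType()]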
def parse_spark_dtypes(value):
    """
    Get a PySpark data type from a string representation,
    for example StringType from 'string'.

    :param value:
    :return:
    """
    value = val_to_list(value)
    try:
        data_type = [SPARK_DTYPES_DICT[SPARK_SHORT_DTYPES[v]] for v in value]
    except KeyError:
        data_type = value

    data_type = one_list_to_val(data_type)
    return data_type
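# Usage sketch for parse_spark_dtypes (hedged): unlike get_spark_dtypes_object,
# this returns the type itself rather than an instance, which is why
# is_column_a above can feed its result straight into isinstance():
#
#   parse_spark_dtypes("string")  # -> StringType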
def filter_row_by_data_type(col_name, data_type=None, get_type=False):
    """
    A Pandas UDF that returns True if a value matches the data_type param
    passed to the function. It can also return the inferred data type itself.

    :param col_name: Column to be processed
    :param data_type: The data type to be compared with
    :param get_type: If True, return the inferred type as a string instead of a boolean
    :return: True or False, or the data type name when get_type is True
    """
    from ast import literal_eval

    if data_type is not None:
        data_type = parse_python_dtypes(data_type)

    def pandas_udf_func(v):
        def str_to_boolean(value):
            """
            Check if a str can be converted to a boolean.
            """
            value = value.lower()
            if value == "true" or value == "false":
                return True

        def str_to_date(value):
            try:
                dateutil.parser.parse(value)
                return True
            except ValueError:
                pass

        def str_to_array(value):
            """
            Check if a value can be parsed to a tuple or an array.
            Because Spark can handle tuples, we try to transform tuples to arrays.
            """
            try:
                if isinstance(literal_eval((value.encode('ascii', 'ignore')).decode("utf-8")),
                              (list, tuple)):
                    return True
            except (ValueError, SyntaxError):
                pass

        def func(value):
            """
            Infer the data type a value can be cast to.

            :param value: value to be checked
            """
            if isinstance(value, bool):
                _data_type = "bool"
            elif isint(value):  # Check if the value is an integer
                _data_type = "int"
            elif isfloat(value):
                _data_type = "float"
            # If it is a string, try to parse it to int, float or bool
            elif isinstance(value, str):
                if str_to_boolean(value):
                    _data_type = "bool"
                elif str_to_date(value):
                    _data_type = "date"
                elif str_to_array(value):
                    _data_type = "array"
                else:
                    _data_type = "string"
            else:
                _data_type = "null"

            if get_type is False:
                return _data_type == data_type
            else:
                return _data_type

        return v.apply(func)

    if get_type is True:
        return_data_type = "string"
    else:
        return_data_type = "boolean"

    col_name = one_list_to_val(col_name)
    return F.pandas_udf(pandas_udf_func, return_data_type)(col_name)
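# Usage sketch for filter_row_by_data_type (hedged; "code" is an illustrative
# column name). As a boolean UDF it can drive a row filter; with get_type=True
# it labels each row with the inferred type:
#
#   df.filter(filter_row_by_data_type("code", "int"))                             # keep int-like rows
#   df.withColumn("code_dtype", filter_row_by_data_type("code", get_type=True))   # tag inferred types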