def drop_by_dtypes(input_cols, data_type=None):
    """
    Drop rows by cell data type
    :param input_cols: Column in which the filter is going to be applied
    :param data_type: filter by string, integer, float or boolean
    :return: Spark DataFrame
    """
    validate_columns_names(self, input_cols)
    return self.rows.drop(fbdt(input_cols, data_type))
def drop_by_dtypes(input_cols, data_type=None):
    """
    Drop rows by cell data type
    :param input_cols: Column in which the filter is going to be applied
    :param data_type: filter by string, integer, float or boolean
    :return: Spark DataFrame
    """
    df = self
    validate_columns_names(df, input_cols)
    df = df.rows.drop(fbdt(input_cols, data_type))
    df = df.preserve_meta(self, Actions.DROP_ROW.value, df.cols.names())
    return df
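# Usage sketch (illustrative, not from the source): assumes `df` is an
# Optimus-patched Spark DataFrame with a string column "filter" whose cells
# mix integers and letters, as in the notebook cells further down. This drops
# every row whose "filter" cell parses as an integer.
rows_kept = df.rows.drop_by_dtypes("filter", "integer")
rows_kept.table()  # only rows whose "filter" cell is not an integer remain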
def select_by_dtypes(input_cols, data_type=None):
    """
    Filter rows by the data type of the cell value, as detected by Python.
    For example, given a column with
    | a |
    | 1 |
    | b |
    filtering by data_type = "integer" returns
    | 1 |
    :param input_cols: Column to be filtered
    :param data_type: Data type used to filter values
    :return: Spark DataFrame
    """
    input_cols = parse_columns(self, input_cols)
    return self.where(fbdt(input_cols, data_type))
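# Usage sketch (illustrative, not from the source): the complement of
# drop_by_dtypes above. Under the same assumptions about `df`, keep only the
# rows whose "filter" cell parses as an integer and discard everything else.
only_ints = df.rows.select_by_dtypes("filter", "integer")
only_ints.table()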
# ### Create a Pandas UDF function that adds a value (10 in this case) to two columns

# +
def func(val, attr):
    return val + attr


df.cols.apply(["num", "new_col_1"], func, "int", 10).table()
# -

# ### Select rows where column "filter" is "integer"

# +
df.rows.select(fbdt("filter", "integer")).table()
# -

# ### Create an abstract UDF to filter rows where the value of column "num" > 1

# +
from optimus.audf import abstract_udf as audf, filter_row_by_data_type as fbdt


def func(val, attr):
    return val > 1


df.rows.select(audf("num", func, "boolean")).table()
# -
def _count_data_types(col_name):
    """
    Determine whether a column's cell values are float, int or string.
    :param col_name:
    :return:
    """
    logger.print("Processing column '" + col_name + "'...")
    # If string, process the data to try to infer which data type is inside.
    # This is a kind of optimization: we do not need to analyze the data if
    # the column data type is integer, boolean, etc.
    temp = col_name + "_type"
    col_data_type = df.cols.dtypes(col_name)

    count_by_data_type = {}
    count_empty_strings = 0

    if infer is True and col_data_type == "string":
        types = (df.h_repartition(col_name=col_name)
                 .withColumn(temp, fbdt(col_name, get_type=True))
                 .groupBy(temp).count()
                 .to_json())
        for row in types:
            count_by_data_type[row[temp]] = row["count"]
        count_empty_strings = df.where(F.col(col_name) == '').count()
    else:
        nulls = df.cols.count_na(col_name)
        count_by_data_type[col_data_type] = int(df.count()) - nulls
        count_by_data_type["null"] = nulls

    count_by_data_type = fill_missing_var_types(count_by_data_type)

    # Subtract white spaces from the total string count
    data_types_count = {"string": count_by_data_type['string'],
                        "bool": count_by_data_type['bool'],
                        "int": count_by_data_type['int'],
                        "float": count_by_data_type['float'],
                        "double": count_by_data_type['double'],
                        "date": count_by_data_type['date'],
                        "array": count_by_data_type['array']
                        }
    null_missed_count = {"null": count_by_data_type['null'],
                         "missing": count_empty_strings,
                         }

    # Get the data type with the greatest count for the column
    greatest_data_type_count = max(data_types_count, key=data_types_count.get)

    if greatest_data_type_count == "string":
        cat = "categorical"
    elif greatest_data_type_count in ("int", "float", "double"):
        cat = "numeric"
    elif greatest_data_type_count == "date":
        cat = "date"
    elif greatest_data_type_count == "bool":
        cat = "bool"
    elif greatest_data_type_count == "array":
        cat = "array"
    else:
        cat = "null"

    col = {}
    col['dtype'] = greatest_data_type_count
    col['type'] = cat
    col['details'] = {**data_types_count, **null_missed_count}

    return col
def _count_data_types(col_name):
    """
    Determine whether a column's cell values are float, int or string.
    :param col_name:
    :return:
    """
    # If string, process the data to try to infer which data type is inside.
    # This is a kind of optimization: we do not need to analyze the data if
    # the column data type is integer, boolean, etc.
    temp = col_name + "_type"
    col_data_type = df.cols.dtypes(col_name)

    # Parse dtype
    if col_data_type == "smallint" or col_data_type == "tinyint":
        col_data_type = "int"
    elif col_data_type == "float" or col_data_type == "double":
        col_data_type = "decimal"
    elif col_data_type.find("array") >= 0:
        col_data_type = "array"

    count_by_data_type = {}
    count_empty_strings = 0

    if infer is True and col_data_type == "string":
        logger.print("Processing column '" + col_name + "'...")
        types = collect_as_dict(df
                                .h_repartition(col_name=col_name)
                                .withColumn(temp, fbdt(col_name, get_type=True))
                                .groupBy(temp).count()
                                )
        for row in types:
            count_by_data_type[row[temp]] = row["count"]
        count_empty_strings = df.where(F.col(col_name) == '').count()
    else:
        # Boolean columns do not support count_na
        if "count_na" in stats[col_name]:
            nulls = stats[col_name]["count_na"]
            count_by_data_type[col_data_type] = int(df_count) - nulls
            count_by_data_type["null"] = nulls

    count_by_data_type = fill_missing_var_types(count_by_data_type)

    # Subtract white spaces from the total string count
    null_missed_count = {"null": count_by_data_type['null'],
                         "missing": count_empty_strings,
                         }

    # Get the data type with the greatest count for the column
    greatest_data_type_count = max(count_by_data_type, key=count_by_data_type.get)

    if greatest_data_type_count == "string" or greatest_data_type_count == "boolean":
        cat = "categorical"
    elif greatest_data_type_count == "int" or greatest_data_type_count == "decimal":
        cat = "numeric"
    elif greatest_data_type_count == "date":
        cat = "date"
    elif greatest_data_type_count == "array":
        cat = "array"
    elif greatest_data_type_count == "binary":
        cat = "binary"
    elif greatest_data_type_count == "null":
        cat = "null"
    else:
        cat = None

    col = {}
    col['dtype'] = greatest_data_type_count
    col['type'] = cat
    col['details'] = {**count_by_data_type, **null_missed_count}

    return col
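# Illustration (not from the source): the rough shape of the dict returned by
# _count_data_types for a string column whose cells are mostly integers.
# All counts below are invented for the example; the exact set of keys in
# "details" depends on what fill_missing_var_types backfills.
expected = {
    "dtype": "int",      # inferred type with the greatest count
    "type": "numeric",   # category derived from that type
    "details": {
        "string": 2, "int": 95, "decimal": 0, "boolean": 0,
        "date": 0, "array": 0, "binary": 0, "null": 3, "missing": 0,
    },
}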