def rename(columns_old_new=None, func=None):
    """
    Rename one or more columns of the DataFrame.

    :param columns_old_new: List of tuples of the form
        (old_column, new_column). The first element may be a column name
        (str) or an integer position in the original schema's column names.
    :param func: Function applied to every column name to produce the new
        name (for example a lower or upper case transformation). When a
        function is given, columns_old_new is not used.
    :return: DataFrame with the renamed columns; the original DataFrame when
        neither argument is usable.
    """
    # A transformation function takes precedence: re-alias every column.
    if is_function(func):
        return self.select([F.col(name).alias(func(name)) for name in self.columns])

    if not is_list_of_tuples(columns_old_new):
        return self

    # Check that the 1st element of each tuple refers to a valid column
    validate_columns_names(self, columns_old_new)

    renamed = self
    for pair in columns_old_new:
        source = pair[0]
        if is_str(source):
            current_name = source
        elif is_int(source):
            # Integer positions are resolved against the original schema,
            # not against the partially renamed DataFrame.
            current_name = self.schema.names[source]
        else:
            # Neither a name nor a position: leave this pair untouched.
            continue
        renamed = renamed.withColumnRenamed(current_name, pair[1])

    return renamed
def years_between(col_name, new_col, date_format):
    """
    Compute the absolute number of years between the dates of a column and
    the current date (for example an age from a birth date).

    :param col_name: Name of the column holding the dates.
    :param new_col: Name of the column that receives the computed years.
    :param date_format: String date format of the values in col_name.
    :return: Result of apply_expr with new_col cast to "float".
    """
    # Asserting that the column exists in the DataFrame
    validate_columns_names(self, col_name)

    # Intermediate date representation handed to months_between
    format_dt = "yyyy-MM-dd"

    def _years_between(new_col, attr):
        # attr carries [date_format, col_name], as passed to apply_expr below
        _date_format = attr[0]
        _col_name = attr[1]

        parsed = F.unix_timestamp(_col_name, _date_format).cast("timestamp")
        normalized = F.date_format(parsed, format_dt)
        elapsed_months = F.months_between(normalized, F.current_date())
        elapsed_years = F.abs(elapsed_months / 12)

        # NOTE(review): format_number yields a formatted string, which is cast
        # to float afterwards - confirm that very large values survive the cast.
        return F.format_number(elapsed_years, 4).alias(new_col)

    expr_args = [date_format, col_name]
    return apply_expr(new_col, _years_between, expr_args).cols.cast(new_col, "float")
def drop_by_dtypes(col_name, data_type=None):
    """
    Drop rows according to the data type of the cells in a column.

    The condition built by fbdt(col_name, data_type) is handed to
    self.rows.drop.

    :param col_name: Column on which the filter is going to be applied.
    :param data_type: Filter by string, integer, float or boolean.
    :return: Spark DataFrame
    """
    validate_columns_names(self, col_name)

    drop_rows = self.rows.drop
    return drop_rows(fbdt(col_name, data_type))
def date_transform(col_name, new_col, current_format, output_format):
    """
    Transform the date format of a column.

    :param col_name: Name of the date column to be transformed.
    :param new_col: Name of the column that receives the reformatted dates.
    :param current_format: Current string date format of the column. All
        values must share this format; values in a different format fail to
        parse and end up as nulls.
    :param output_format: Output date string format to be expected.
    :return: Result of apply_expr for new_col.
    """
    # Asserting that the column exists in the DataFrame
    validate_columns_names(self, col_name)

    def _date_transform(new_col, attr):
        # attr carries [col_name, current_format, output_format]
        source_col = attr[0]
        source_format = attr[1]
        target_format = attr[2]

        as_timestamp = F.unix_timestamp(source_col, source_format).cast("timestamp")
        return F.date_format(as_timestamp, target_format).alias(new_col)

    expr_args = [col_name, current_format, output_format]
    return apply_expr(new_col, _date_transform, expr_args)