def append(col_name=None, value=None):
    """
    Add a new column to the dataframe.

    The dataframe is not a parameter: it is read from ``self`` in the enclosing scope.

    :param col_name: Name of the column to create
    :param value: Data for the new column. A value accepted by is_num_or_str becomes a literal, a
        list or tuple becomes an array of literals, and a Column is used as it is.
    :return: The dataframe with the new column, or the dataframe unchanged when value could not
        be turned into a Column
    """

    def as_literal_array(items):
        # Wrap every element as a literal and pack them into a single array column
        return F.array([F.lit(item) for item in items])

    if is_num_or_str(value):
        column = F.lit(value)
    elif is_list(value):
        column = as_literal_array(value)
    elif is_tuple(value):
        column = as_literal_array(list(value))
    else:
        column = value

    # Anything that did not end up as a Column is ignored and the dataframe is returned untouched
    if not is_(column, F.Column):
        return self

    return self.withColumn(col_name, column)
def data_frame(cols=None, rows=None, infer_schema=True, pdf=None):
    """
    Helper to create a Spark dataframe, either from a pandas dataframe or from column specs and rows.

    :param cols: List of column specs. Each spec is either a single value used as the column name,
        or a tuple (name, data type[, nullable]). nullable defaults to True when it is omitted.
    :param rows: List of tuples with the same number and types as cols. When it is not a list of
        tuples, every element is wrapped in a 1-tuple.
    :param infer_schema: For name-only specs, infer the data type from the first row. When it is
        not True the column is typed as StringType.
    :param pdf: A pandas dataframe. When given, cols and rows are ignored.
    :return: Dataframe
    :raises TypeError: If a column spec is neither a single name nor a tuple
    """
    if is_(pdf, pd.DataFrame):
        return Spark.instance.spark.createDataFrame(pdf)

    # Process the rows: plain values become single-column rows
    if not is_list_of_tuples(rows):
        rows = [(i,) for i in rows]

    # Process the columns. The first row supplies the sample values used for type inference.
    specs = []
    for c, r in zip(cols, rows[0]):
        if is_one_element(c):
            # Only the column name was given
            col_name = c
            var_type = infer(r) if infer_schema is True else StringType()
            nullable = True
        elif is_tuple(c):
            # (name, data type[, nullable])
            col_name = c[0]
            var_type = get_spark_dtypes_object(c[1])
            # Without a third element the column accepts nulls. Deciding this per column keeps a
            # value from the previous column from leaking into this one.
            nullable = c[2] if len(c) >= 3 else True
        else:
            # Fail loudly instead of appending the previous column's spec a second time
            raise TypeError(
                "Column spec must be a column name or a (name, dtype[, nullable]) tuple, "
                "got {!r}".format(c))

        specs.append([col_name, var_type, nullable])

    struct_fields = [StructField(*spec) for spec in specs]

    return Spark.instance.spark.createDataFrame(rows, StructType(struct_fields))
def parse_columns(df, cols_args, get_args=False, is_regex=None, filter_by_column_dtypes=None,
                  accepts_missing_cols=False):
    """
    Return a list of columns and check that the columns exist in the dataframe.

    Accepts '*' (or None) as parameter, in which case all the columns in the dataframe are returned.
    Also accepts a regex. If a list of tuples is given, the first element of every tuple is the
    column name and the remaining elements are params. These params can be used to create custom
    transformation functions. You can find an example in cols().cast()

    :param df: Dataframe in which the columns are going to be checked
    :param cols_args: '*' or None for every column, a regex (with is_regex=True), a column name or
        index, a list of names/indexes, or a tuple / list of tuples (name, param, ...)
    :param get_args: If True return (cols, attrs) instead of just cols. attrs is None unless
        cols_args was a tuple or a list of tuples.
    :param is_regex: Use True if cols_args is a regex. The pattern can be a string or the first
        element of a list/tuple.
    :param filter_by_column_dtypes: Data type name(s); only the columns of these types are kept
    :param accepts_missing_cols: If True do not check that the columns exist in the dataframe
    :return: A list of column names, or a tuple (cols, attrs) when get_args is True
    """
    attrs = None

    if is_regex is True:
        # The pattern may arrive as a plain string or wrapped in a list/tuple. Indexing a plain
        # string with [0] would keep only its first character, so only sequences are unwrapped.
        pattern = cols_args if isinstance(cols_args, str) else cols_args[0]
        matcher = re.compile(pattern)
        cols = list(filter(matcher.match, df.columns))

    # If the columns value is * get all the dataframe columns
    elif cols_args == "*" or cols_args is None:
        cols = df.columns

    # In case we have a list of tuples, the first element of each tuple is taken as the column
    # name and the rest as params. The params can be used in a custom function as follows:
    # def func(attrs):  # attrs is (1, 2) for the first column and (3, 4) for the second
    #     return attrs[0] + 1
    # df.cols().apply([('col_1', 1, 2), ('cols_2', 3, 4)], func)
    elif is_tuple(cols_args) or is_list_of_tuples(cols_args):
        cols_args = val_to_list(cols_args)
        cols = [i[0] for i in cols_args]
        attrs = [i[1:] for i in cols_args]

    else:
        # If not a list convert to list
        cols = val_to_list(cols_args)
        # Anything that is not a string is used as an index into the dataframe columns
        cols = [c if is_str(c) else df.columns[c] for c in cols]

    # Check for missing columns
    if accepts_missing_cols is False:
        check_for_missing_columns(df, cols)

    # Filter by column data type
    filter_by_column_dtypes = val_to_list(filter_by_column_dtypes)
    if is_list_of_strings(filter_by_column_dtypes):
        # Get the columns for every data type
        columns_filtered = filter_col_name_by_dtypes(df, filter_by_column_dtypes)

        # Keep the requested columns that have one of the data types. Duplicates are dropped and
        # the order in which the columns were requested is kept, so the result is deterministic.
        # NOTE(review): attrs is not filtered here, so after a dtype filter it may no longer line
        # up with cols - confirm no caller combines get_args with filter_by_column_dtypes.
        allowed = set(columns_filtered)
        seen = set()
        kept = []
        for c in cols:
            if c in allowed and c not in seen:
                seen.add(c)
                kept.append(c)
        cols = kept

    # Return cols or cols and params
    if get_args is True:
        params = cols, attrs
    elif get_args is False:
        params = cols
    else:
        RaiseIfNot.value_error(get_args, ["True", "False"])

    return params