def fix_name(name: str, make_lower: bool = False) -> str:
    """Turn a column name into a usable identifier.

    A name that already satisfies ``str.isidentifier()`` is kept as is.
    Any other name is cleaned in this order:

    1. Strip leading/trailing whitespace.
    2. Remove every match of ``PUNC_REGEX`` (punctuation other than ``_``).
    3. Prefix ``_`` when ``STARTS_WITH_DIGITS_REGEX`` matches at the start.
    4. Replace every match of ``WS_REGEX`` (presumably the remaining
       whitespace) with ``_``.

    Args:
        name: Raw column name.
        make_lower: When true, lower-case the result. This applies to
            names that were already valid identifiers as well.

    Returns:
        The cleaned name. The cleaned result is returned without being
        re-validated.
    """
    if name.isidentifier():
        cleaned = name
    else:
        # Order matters: punctuation is dropped before the leading-digit
        # check, and inner whitespace is only replaced at the very end.
        cleaned = PUNC_REGEX.sub('', name.strip())
        if STARTS_WITH_DIGITS_REGEX.match(cleaned):
            cleaned = '_' + cleaned
        cleaned = WS_REGEX.sub('_', cleaned)
    return cleaned.lower() if make_lower else cleaned
sql_type = lambda name, dtype: StructField( name, DTYPES_TO_SPARK_TYPES[dtype.kind](), False if name in keys else True) cols_and_dtypes = lambda df: zip(df.columns, df.dtypes) return StructType( [sql_type(col, dtype) for col, dtype in cols_and_dtypes(df)]) @pipeable def to_pandas(rows): names = list(rows[0].asDict().keys()) # pyspark.Row keys perserve order return pd.DataFrame.from_dict([r.asDict() for r in rows ])[names] # reorder to original order col_startswith = pipeable( lambda prefix, cols: [c for c in cols if c.startswith(prefix)]) col_endswith = pipeable( lambda suffix, cols: [c for c in cols if c.endswith(suffix)]) col_contains = pipeable(lambda substr, cols: [c for c in cols if substr in c]) def col_selector(col_names, from_=None, to=None, inclusive=True): from_idx, to_idx = 0, len(col_names) if from_: from_idx = col_names.index( from_) if inclusive else col_names.index(from_) + 1 if to: to_idx = col_names.index( to) if not inclusive else col_names.index(to) + 1 return col_names[from_idx:to_idx]
from functoolz import pipeable


def result_dict(r):
    """Pair the keys of ``r`` with its values and return them as a dict.

    ``r`` must expose ``keys()`` and iterate over its values in the same
    order (presumably a database result row; verify against callers).
    """
    return dict(zip(r.keys(), r))


@pipeable
def result_dicts(rs):
    """Apply ``result_dict`` to every element of ``rs`` and return a list."""
    return [result_dict(row) for row in rs]


@pipeable
def check_unique(df):
    """Return ``(column, is_unique)`` pairs for every column of ``df``.

    ``df`` must expose ``columns`` and per-column ``is_unique``
    (presumably a pandas DataFrame; verify against callers).
    """
    return [(col, df[col].is_unique) for col in df.columns]
import pandas as pd
from functoolz import pipeable


@pipeable
def to_pandas(rows):
    """Convert an iterable of row objects into a ``pandas.DataFrame``.

    Args:
        rows: Iterable of objects exposing ``asDict()`` (presumably
            ``pyspark.Row`` instances; verify against callers).

    Returns:
        A DataFrame with one row per element of ``rows``.
    """
    # The DataFrame constructor is the documented entry point for a list of
    # record dicts; ``DataFrame.from_dict`` is documented for dict input.
    return pd.DataFrame([row.asDict() for row in rows])