示例#1
0
def fix_name(name: str, make_lower: bool = False) -> str:
    """Make a column name a proper identifier.

    A name that already is a valid identifier skips the clean-up.  Any other
    name is cleaned up in this order:

    1. Strip leading/trailing whitespace.
    2. Remove everything matched by ``PUNC_REGEX`` (punctuation other
       than ``_``, per the original description of this function).
    3. Prefix a ``_`` when ``STARTS_WITH_DIGITS_REGEX`` matches at the start.
    4. Replace everything matched by ``WS_REGEX`` with ``_``.

    Args:
        name: The raw column name.
        make_lower: When true, the returned name is lower-cased.

    Returns:
        The cleaned-up (and optionally lower-cased) name.
    """
    if not name.isidentifier():
        name = PUNC_REGEX.sub('', name.strip())
        if STARTS_WITH_DIGITS_REGEX.match(name):
            name = '_' + name
        name = WS_REGEX.sub('_', name)
    # Lower-casing must also cover names that were already valid identifiers
    # and therefore skipped the clean-up above.
    return name.lower() if make_lower else name
示例#2
0
    sql_type = lambda name, dtype: StructField(
        name, DTYPES_TO_SPARK_TYPES[dtype.kind](), False
        if name in keys else True)
    cols_and_dtypes = lambda df: zip(df.columns, df.dtypes)
    return StructType(
        [sql_type(col, dtype) for col, dtype in cols_and_dtypes(df)])


@pipeable
def to_pandas(rows):
    """Convert an iterable of row objects into a pandas DataFrame.

    Args:
        rows: Iterable of objects exposing ``asDict()`` (presumably
            ``pyspark.Row`` instances, whose keys keep their order).

    Returns:
        A DataFrame with one record per row.  The columns follow the key
        order of the first row.  An empty DataFrame is returned when
        ``rows`` yields nothing.
    """
    records = [row.asDict() for row in rows]
    if not records:
        # No first row to take the column order from.
        return pd.DataFrame()
    names = list(records[0])
    # Select by the first row's keys to restore the original column order.
    return pd.DataFrame.from_dict(records)[names]


@pipeable
def col_startswith(prefix, cols):
    """Return the entries of ``cols`` that start with ``prefix``."""
    return [name for name in cols if name.startswith(prefix)]
@pipeable
def col_endswith(suffix, cols):
    """Return the entries of ``cols`` that end with ``suffix``."""
    return [name for name in cols if name.endswith(suffix)]
@pipeable
def col_contains(substr, cols):
    """Return the entries of ``cols`` that contain ``substr``."""
    return [name for name in cols if substr in name]


def col_selector(col_names, from_=None, to=None, inclusive=True):
    """Return the contiguous run of ``col_names`` between two names.

    Args:
        col_names: Sequence of column names; it must support ``index()``
            and slicing (e.g. a list).
        from_: Name at which the selection starts.  ``None`` starts at the
            first column.
        to: Name at which the selection ends.  ``None`` runs through the
            last column.
        inclusive: When true, ``from_`` and ``to`` themselves are part of
            the result; when false, both are left out.

    Returns:
        The slice of ``col_names`` delimited by the two bounds.

    Raises:
        ValueError: For a list, when ``from_`` or ``to`` is given but is
            not present in ``col_names``.
    """
    start, stop = 0, len(col_names)
    # Compare against None instead of relying on truthiness, so that falsy
    # column names such as 0 or "" are still honoured as bounds.
    if from_ is not None:
        start = col_names.index(from_) + (0 if inclusive else 1)
    if to is not None:
        stop = col_names.index(to) + (1 if inclusive else 0)
    return col_names[start:stop]
示例#3
0
from functoolz import pipeable
def result_dict(r):
    """Map each key from ``r.keys()`` to the item at the same position
    when iterating ``r``."""
    return {key: value for key, value in zip(r.keys(), r)}
@pipeable
def result_dicts(rs):
    """Apply ``result_dict`` to every element of ``rs`` and collect the
    results in a list."""
    return [result_dict(r) for r in rs]
@pipeable
def check_unique(df):
    """Return a list of ``(column, df[column].is_unique)`` pairs, one per
    entry of ``df.columns``, in column order."""
    return [(name, df[name].is_unique) for name in df.columns]
示例#4
0
import pandas as pd
from functoolz import pipeable

@pipeable
def to_pandas(rows):
    """Build a pandas DataFrame with one record per element of ``rows``,
    each element being converted through its ``asDict()`` method."""
    records = [row.asDict() for row in rows]
    return pd.DataFrame.from_dict(records)