import pandas as pd
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, StructField, StructType

# The remaining helpers referenced below (is_list_of_tuples, is_list,
# is_one_element, is_tuple, is_, val_to_list, parse_columns, infer,
# get_spark_dtypes_object, PYTHON_TYPES_ and Spark) are assumed to come
# from the surrounding Optimus modules.


def replace(columns, search_and_replace=None, value=None, regex=None):
    """
    Replace a value or a list of values with a specified string
    :param columns: '*', a list of column names or a single column name.
    :param search_and_replace: value(s) to search for; also accepts a list
    of (search, replace) tuples
    :param value: new value to replace the old one
    :param regex: if truthy, treat search_and_replace as a regex pattern
    :return: Spark DataFrame with the values replaced
    """
    replace = None
    search = None

    if is_list_of_tuples(search_and_replace):
        # Unzip a list of (search, replace) pairs into two parallel lists
        params = list(zip(*search_and_replace))
        search = list(params[0])
        replace = list(params[1])
    elif is_list(search_and_replace):
        search = search_and_replace
        replace = value
    elif is_one_element(search_and_replace):
        search = val_to_list(search_and_replace)
        replace = value

    if regex:
        # In regex mode the raw pattern string is passed through untouched
        search = search_and_replace
        replace = value

    # If regex or normal replace we use the regexp or replace functions
    # TODO check if .contains can be used instead of regexp
    def func_regex(_df, _col_name, _search, _replace):
        return _df.withColumn(_col_name, F.regexp_replace(_col_name, _search, _replace))

    def func_replace(_df, _col_name, _search, _replace):
        # Cast the search values to the column's data type before replacing
        data_type = self.cols.dtypes(_col_name)
        _search = [PYTHON_TYPES_[data_type](s) for s in _search]
        _df = _df.replace(_search, _replace, _col_name)
        return _df

    if regex:
        func = func_regex
    else:
        func = func_replace

    # `self` is the Spark DataFrame captured from the enclosing scope that
    # attaches these helpers as DataFrame methods
    df = self

    columns = parse_columns(self, columns, filter_by_column_dtypes="string")
    for c in columns:
        df = func(df, c, search, replace)

    return df
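# A minimal usage sketch, not from the original source: it assumes the
# surrounding module attaches replace() to Spark DataFrames as
# df.cols.replace (Optimus style); the column name and sample values
# below are hypothetical.
#
#   df.cols.replace("animals", [("dog", "puppy"), ("cat", "kitten")])
#   df.cols.replace("animals", "frog", "toad")               # single value
#   df.cols.replace("animals", ["dog", "cat"], "pet")        # list of values
#   df.cols.replace("animals", "^fr.*", "toad", regex=True)  # regex pattern
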
def data_frame(cols=None, rows=None, infer_schema=True, pdf=None):
    """
    Helper to create a Spark dataframe:
    :param cols: list of plain column names, or of tuples with name, data
    type and an optional flag to accept nulls
    :param rows: list of tuples with the same number and types as cols
    :param infer_schema: try to infer the schema data types.
    :param pdf: a pandas dataframe
    :return: Dataframe
    """
    if is_(pdf, pd.DataFrame):
        result = Spark.instance.spark.createDataFrame(pdf)
    else:
        specs = []

        # Process the rows: wrap scalar rows into 1-tuples
        if not is_list_of_tuples(rows):
            rows = [(i,) for i in rows]

        # Process the columns, using the first row to infer data types
        for c, r in zip(cols, rows[0]):
            if is_one_element(c):
                # A plain column name
                col_name = c

                if infer_schema is True:
                    var_type = infer(r)
                else:
                    var_type = StringType()
                nullable = True
            elif is_tuple(c):
                # A (name, data type[, nullable]) tuple
                col_name = c[0]
                var_type = get_spark_dtypes_object(c[1])

                count = len(c)
                if count == 2:
                    # No third element: default to accepting nulls
                    nullable = True
                elif count == 3:
                    nullable = c[2]

            specs.append([col_name, var_type, nullable])

        struct_fields = list(map(lambda x: StructField(*x), specs))

        result = Spark.instance.spark.createDataFrame(rows, StructType(struct_fields))

    return result
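# A minimal usage sketch, not from the original source; the column specs
# and sample rows are hypothetical, and the "string"/"int" type names
# assume get_spark_dtypes_object resolves them to Spark types.
#
#   # Explicit schema: (name, data type[, nullable]) tuples
#   df = data_frame(cols=[("name", "string", True), ("age", "int")],
#                   rows=[("Alice", 34), ("Bob", 29)])
#
#   # Plain column names; types inferred from the first row
#   df = data_frame(cols=["name", "age"], rows=[("Alice", 34), ("Bob", 29)])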