def cast_factory(cls):
    """
    Return the parse function, the Spark return type and the function type
    ("udf" or "column_exp") needed to cast a column to cls.
    """
    # Parse to Vector
    if is_type(cls, Vectors):
        func_type = "udf"

        def cast_to_vectors(val, attr):
            return Vectors.dense(val)

        func_return_type = VectorUDT()

    # Parse standard data types
    elif get_spark_dtypes_object(cls):
        func_type = "column_exp"

        def cast_to_vectors(col_name, attr):
            return F.col(col_name).cast(get_spark_dtypes_object(cls))

        func_return_type = None

    # Add any other parser you want here
    else:
        RaiseIt.value_error(cls)

    return func_return_type, cast_to_vectors, func_type
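
# Usage sketch for cast_factory (an assumption about intended use, not part of
# the library): for a standard dtype the factory yields a column-expression
# builder. The column name "age" and target type "float" are invented.
def _demo_cast_factory(df):
    func_return_type, cast_func, func_type = cast_factory("float")
    if func_type == "column_exp":
        # cast_func builds F.col("age").cast(<float dtype>) under the hood
        df = df.withColumn("age", cast_func("age", None))
    return df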
def data_frame(cols=None, rows=None, infer_schema=True, pdf=None):
    """
    Helper to create a Spark dataframe
    :param cols: List of tuples with column name, data type and a flag to accept null
    :param rows: List of tuples with values matching the number and types of cols
    :param infer_schema: Try to infer the schema data type.
    :param pdf: A pandas dataframe
    :return: Dataframe
    """
    if is_(pdf, pd.DataFrame):
        result = Spark.instance.spark.createDataFrame(pdf)
    else:
        specs = []
        # Process the rows
        if not is_list_of_tuples(rows):
            rows = [(i,) for i in rows]

        # Process the columns
        for c, r in zip(cols, rows[0]):
            # Get the column name
            if is_one_element(c):
                col_name = c

                if infer_schema is True:
                    var_type = infer(r)
                else:
                    var_type = StringType()
                nullable = True

            elif is_tuple(c):
                # Get the column data type
                col_name = c[0]
                var_type = get_spark_dtypes_object(c[1])

                count = len(c)
                if count == 2:
                    # The tuple has no third param, so default to True and accept nulls
                    nullable = True
                elif count == 3:
                    nullable = c[2]

            specs.append([col_name, var_type, nullable])

        struct_fields = list(map(lambda x: StructField(*x), specs))

        result = Spark.instance.spark.createDataFrame(rows, StructType(struct_fields))

    return result
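
# Usage sketch for data_frame (the column specs and rows are invented for
# illustration): (name, type, nullable) tuples with matching row tuples.
def _demo_data_frame():
    df = data_frame(
        cols=[("name", "string", True), ("age", "int", True)],
        rows=[("optimus", 35), ("bumblebee", 30)],
    )
    df.show()
    return df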
def func_factory(func_type=None, func_return_type=None):
    """
    Return a column expression, udf or pandas udf function.
    :param func_type: Type of function: "column_exp", "udf" or "pandas_udf"
    :param func_return_type: Spark return data type of the function
    :return:
    """
    # if func_return_type is not None:
    func_return_type = get_spark_dtypes_object(func_return_type)

    def pandas_udf_func(attr=None, func=None):
        # TODO: Get the column type, so it is not necessary to pass the return type as a param.

        # Apply the function over the whole series
        def apply_to_series(val, attr):
            if attr is None:
                attr = (None,)
            else:
                attr = (attr,)

            return val.apply(func, args=attr)

        def to_serie(value):
            return apply_to_series(value, attr)

        return F.pandas_udf(to_serie, func_return_type)

    def udf_func(attr, func):
        return F.udf(lambda value: func(value, attr), func_return_type)

    def expression_func(attr, func):
        def inner(col_name):
            return func(col_name, attr)

        return inner

    if func_type == "pandas_udf":
        return pandas_udf_func
    elif func_type == "udf":
        return udf_func
    elif func_type == "column_exp":
        return expression_func
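
# Usage sketch for func_factory (an assumption about intended use): build a
# plain udf that upper-cases a string column. The helper _upper and the column
# name "name" are invented; "string" is assumed to be parseable by
# get_spark_dtypes_object, as in the helpers above.
def _demo_func_factory(df):
    def _upper(value, attr):
        return value.upper() if value is not None else None

    upper_udf = func_factory("udf", "string")(None, _upper)
    return df.withColumn("name", upper_udf(F.col("name")))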
def data_frame(cols, rows):
    """
    Helper to create a Spark dataframe
    :param cols: List of tuples with column name, data type and nullable flag
    :param rows: List of tuples with the row values
    :return: Dataframe
    """
    specs = []
    for c in cols:
        value = c[1]
        # Use the value directly if it is already a Spark datatype
        if isinstance(value, SPARK_DTYPES):
            var_type = value
        # else, try to parse a str, int, float...
        else:
            var_type = get_spark_dtypes_object(c[1])
        specs.append([c[0], var_type, c[2]])

    struct_fields = list(map(lambda x: StructField(*x), specs))

    return Spark.instance.spark.createDataFrame(rows, StructType(struct_fields))
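
# Usage sketch for this typed data_frame variant (invented specs): every tuple
# must carry the full (name, type, nullable) triple; mixing a real Spark
# datatype with a plain string exercises both branches above.
def _demo_typed_data_frame():
    from pyspark.sql.types import IntegerType  # local import for the sketch

    return data_frame(
        cols=[("id", IntegerType(), True), ("name", "string", True)],
        rows=[(1, "optimus"), (2, "bumblebee")],
    )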