def __init__(self, master="local[*]", app_name="optimus"):
    """
    Bootstrap Optimus by validating the environment and starting a Spark session.

    :param master: Sets the Spark master URL to connect to, such as “local” to run
        locally, “local[4]” to run locally with 4 cores, or “spark://master:7077”
        to run on a Spark standalone cluster.
    :param app_name: Sets a name for the application, which will be shown in the
        Spark web UI
    """
    self.master = master
    self.app_name = app_name

    logging.info(JUST_CHECKING)
    logging.info("-----")

    # Fail early if the Spark/Hadoop/Java environment is not configured.
    required_env_vars = [
        "SPARK_HOME",
        "HADOOP_HOME",
        "PYSPARK_PYTHON",
        "PYSPARK_DRIVER_PYTHON",
        "JAVA_HOME",
    ]
    check_env_vars(required_env_vars)

    # Pandas UDFs are only available when pyarrow is present.
    if is_pyarrow_installed() is True:
        pyarrow_status = "Pyarrow Installed"
    else:
        pyarrow_status = "Pyarrow not installed. Pandas UDF not available. Install using 'pip install pyarrow'"
    logging.info(pyarrow_status)

    logging.info("-----")
    logging.info(STARTING_SPARK)

    # Build the spark session
    builder = SparkSession.builder.master(self.master).appName(self.app_name)
    self._spark = builder.getOrCreate()
def create(self, master="local[*]", app_name="optimus"):
    """
    Validate the environment, build a Spark session and return this instance.

    :param master: Sets the Spark master URL to connect to, such as 'local' to run
        locally, 'local[4]' to run locally with 4 cores, or spark://master:7077 to
        run on a Spark standalone cluster.
    :param app_name: Sets a name for the application, which will be shown in the
        Spark web UI
    :return: self, so the call can be chained fluently
    """
    logger.print(JUST_CHECKING)
    logger.print("-----")

    # Abort early when the Spark/Hadoop/Java environment is incomplete.
    required_env_vars = [
        "SPARK_HOME",
        "HADOOP_HOME",
        "PYSPARK_PYTHON",
        "PYSPARK_DRIVER_PYTHON",
        "PYSPARK_SUBMIT_ARGS",
        "JAVA_HOME",
    ]
    check_env_vars(required_env_vars)

    # Pandas UDFs require pyarrow; report availability either way.
    if is_pyarrow_installed() is True:
        pyarrow_status = "Pyarrow Installed"
    else:
        pyarrow_status = "Pyarrow not installed. Pandas UDF not available. Install using 'pip install pyarrow'"
    logger.print(pyarrow_status)

    logger.print("-----")
    logger.print(STARTING_SPARK)

    # Build the spark session
    builder = SparkSession.builder.appName(app_name).master(master)
    self._spark = builder.getOrCreate()
    self._sc = self._spark.sparkContext
    logger.print("Spark Version:" + self._sc.version)

    return self
def abstract_udf(col, func, func_return_type=None, attrs=None, func_type=None, verbose=False):
    """
    Abstract User defined functions. This is a helper function to create udf,
    pandas udf or a Column Exp and apply it to ``col``.

    :param col: Column to created or transformed
    :param func: Function to be applied to the data
    :param attrs: If required attributes to be passed to the function
    :param func_return_type: Required by UDF and Pandas UDF.
    :param func_type: "column_exp", "udf" or "pandas_udf". When omitted, the
        function uses "pandas_udf" if pyarrow is installed and falls back to
        a plain "udf" otherwise.
    :param verbose: print additional info about which strategy is used
    :return: A function, UDF or Pandas UDF applied to ``col``
    """
    if func_type is None:
        # By default try to use a (vectorized) pandas UDF. Previously, when
        # pyarrow was missing, func_type stayed None and the validation below
        # raised a confusing error; fall back to a plain UDF instead, which
        # works without pyarrow.
        func_type = "pandas_udf" if is_pyarrow_installed() is True else "udf"

    types = ["column_exp", "udf", "pandas_udf"]
    if func_type not in types:
        RaiseIt.value_error(func_type, types)

    if verbose is True:
        # Function-local import keeps the module surface unchanged.
        import logging
        logging.info("Using '{func_type}' to process column '{column}' with function {func_name}"
                     .format(func_type=func_type, column=col, func_name=func.__name__))

    df_func = func_factory(func_type, func_return_type)
    return df_func(attrs, func)(col)
def __init__(self, master="local[*]", app_name="optimus"):
    """
    Check the runtime environment and start a configured Spark session.

    :param master: Sets the Spark master URL to connect to, such as 'local' to run
        locally, 'local[4]' to run locally with 4 cores, or spark://master:7077 to
        run on a Spark standalone cluster.
    :param app_name: Sets a name for the application, which will be shown in the
        Spark web UI
    """
    self.master = master
    self.app_name = app_name

    logger.info(message=JUST_CHECKING)
    logger.info("-----")

    # Verify the Spark/Hadoop/Java environment before touching Spark.
    check_env_vars([
        "SPARK_HOME",
        "HADOOP_HOME",
        "PYSPARK_PYTHON",
        "PYSPARK_DRIVER_PYTHON",
        "PYSPARK_SUBMIT_ARGS",
        "JAVA_HOME",
    ])

    # Pandas UDFs depend on pyarrow being importable.
    if is_pyarrow_installed() is True:
        pyarrow_status = "Pyarrow Installed"
    else:
        pyarrow_status = "Pyarrow not installed. Pandas UDF not available. Install using 'pip install pyarrow'"
    logger.info(pyarrow_status)

    logger.info("-----")
    logger.info(STARTING_SPARK)

    # Build the spark session; a longer heartbeat interval avoids executor
    # timeouts during long-running tasks.
    builder = SparkSession.builder \
        .master(master) \
        .config("spark.executor.heartbeatInterval", "110") \
        .appName(app_name)
    self._spark = builder.getOrCreate()
    self._sc = self._spark.sparkContext
def abstract_udf(col, func, func_return_type=None, args=None, func_type=None):
    """
    Abstract User defined functions. This is a helper function to create udf,
    pandas udf or a Column Exp and apply it to ``col``.

    :param col: Column to created or transformed
    :param func: Function to be applied to the data
    :param args: If required attributes to be passed to the function
    :param func_return_type: Required by UDF and Pandas UDF.
    :param func_type: pandas_udf or udf. The function is going to try to use
        pandas_udf if func_type is not defined
    :return: A function, UDF or Pandas UDF applied to ``col``
    """
    if func_return_type is None:
        # No return type means we can only build a plain column expression.
        func_type = "column_expr"
    elif func_type is None and is_pyarrow_installed() is True:
        # By default prefer the vectorized pandas UDF when pyarrow is present.
        func_type = "pandas_udf"

    valid_types = ["column_expr", "udf", "pandas_udf"]
    if func_type not in valid_types:
        RaiseIt.value_error(func_type, valid_types)

    if is_column(func):
        # `func` is already a Column expression: wrap it in a constant
        # function so the factory machinery can treat it uniformly.
        def func_col_exp(col_name, attr):
            return func
        _func = func_col_exp
    else:
        _func = func

    logger.print(
        "Using '{func_type}' to process column '{column}' with function {func_name}"
        .format(func_type=func_type, column=col, func_name=_func.__name__))

    df_func = func_factory(func_type, func_return_type)

    # The factory expects attributes as a tuple; normalize scalars.
    if not is_tuple(args):
        args = (args, )

    return df_func(_func, args)(col)