    def define_job(self, function_dict=None, perform_search=True, persist_search=False,
                   calc_stats=False, resolution='5m', previous_results=None):
        """
        Define a job that runs calculations on parameter sample data in the ARES cluster.
        Read the documentation for a more thorough description of the input dictionaries.

        :param function_dict: Dict containing the UDFs and the columns to apply them to.
        :param perform_search: Bool flagging whether to search for params in the job.
        :param persist_search: Bool to persist the results of your search query as a separate df in HDFS.
        :param calc_stats: Bool flagging whether to calculate and persist statistics at the given resolution.
        :param resolution: String '5m', '30m' or '1d', OR any int in microseconds.
        :param previous_results: Dict defining which job_ids and which result types to reuse in the job.
        """
        # TODO be able to pass multiple functions on multiple columns
        # TODO columns is a (nested) array of columns
        # TODO columns can also be 'all'
        # TODO create a standard statistics calc function
        # TODO confirm that the statistics job is being correctly parallelized
        # TODO confirm whether a single df for the stat job results is preferred over separate ones per param
        # TODO there is a lot going on in this function; consider segregating the duties into subfunctions
        """
        Spark SQL defines the following types of functions
        (https://jaceklaskowski.gitbooks.io/mastering-spark-sql/spark-sql.html):
        - standard functions or User-Defined Functions (UDFs), which take values from a single
          row as input and generate a single return value for every input row;
        - basic aggregate functions, which operate on a group of rows and calculate a single
          return value per group;
        - window aggregate functions, which operate on a group of rows and calculate a single
          return value for each row in a group.
        """
        # Security checks #
        # If you try to run an empty job.
        if perform_search is False and previous_results is None and calc_stats is False:
            return "Can't run an empty job."
        # If you try to run calculations on no data.
        if perform_search is False and previous_results is None and function_dict is not None:
            return "You need data to do calculations. Either use a search or previous results to run the job."
        #if calc_stats is True and resolution not in ('5m', '1h', '1d'):
        #    return "Resolution is in the wrong format. Please choose '5m', '1h' or '1d'."
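        # Expected shapes of the input dicts (illustrative, matching how they are
        # consumed below and in the test scripts further down):
        #   function_dict:    {'new_column_name': (callable, ['input_col', ...]), ...}
        #   previous_results: {'<job_id>': ['search' | 'stat' | 'udf', ...], ...}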
        # if function_dict is None and calc_stats is False:
        #     return "You can't run search jobs"

        # Get the HDFS path to write the results to
        conf = paconf('pyarex_conf.ini').get_conf('HDFS')
        result_path = conf['result_path']

        # Initialize the Spark context and the logger
        sc = SparkContext(conf=SparkConf().setAppName(self.__name))
        log4jlogger = sc._jvm.org.apache.log4j
        logger = log4jlogger.LogManager.getLogger('PyArEx')
        logger.info("Running PyArEx job.")

        # part of the fix related to JIRA ARESPY-20
        #sqlc = HiveContext(sc)
        sqlc = SQLContext(sc)

        # Get the needed data from the pickle file
        with open('pyarex.pkl', 'rb') as file:
            param_names, start, end = pickle.load(file)

        # If the data needs to come from HBase
        if perform_search:
            params = sc.parallelize(param_names)
            try:
                samples = params.flatMap(lambda x: self.__get_samples(x, start, end))\
                    .filter(lambda x: x is not None)
                rows = samples.map(lambda x: Row(timestamp=x.get_time(),
                                                 value=x.get_value(),
                                                 var_name=x.get_name()))
                acdf = sqlc.createDataFrame(rows, schema=['timestamp', 'value', 'var_name'])
                # TODO: for now the type is inferred from the sampling ratio; later this should be in the sample object
            except ValueError:
                logger.info(
                    "Search returned no data. "
                    "Please consider using different parameters or a different time period.")
                return

            df = acdf.groupBy('timestamp').pivot('var_name').sum('value')
            #df = self.__fill_nans(df)
            acdf.unpersist()  # remove the old df from memory
            old_columns = df.columns[1:]

            # Check which parameters didn't return data from MariaDB
            for param_name in param_names:
                if param_name not in old_columns:
                    logger.info(
                        "Parameter ['%s'] was not found in the database or was not found within the "
                        "given time frame. Please check if this is a valid parameter name and consider "
                        "using a different time frame." % param_name)

            if persist_search:
                df.write.save(('%s/%s_search_result' % (result_path, str(sc.applicationId))),
                              mode='append')

        # If you want to use results from previous jobs
        if previous_results is not None:
            paths = ['%s/%s_%s_result/' % (result_path, key, value)
                     for key in previous_results.keys()
                     for value in previous_results[key]]
            if perform_search:
                for path in paths:
                    df = df.join(sqlc.parquetFile(path), on='timestamp')
                old_columns = df.columns[1:]
            else:
                df = sqlc.parquetFile(paths[0])
                for path in paths[1:]:
                    df = df.join(sqlc.parquetFile(path), on='timestamp')
                old_columns = df.columns[1:]

        # If you want to run a statistics job
        if calc_stats:
            stat_df = self.__calc_stats(df, resolution)
            stat_df.write.save(('%s/%s_stat_result' % (result_path, str(sc.applicationId))),
                               mode='append')

        """
        If you want to run UDFs on the parameters:
        according to
        https://medium.com/@mrpowers/performing-operations-on-multiple-columns-in-a-pyspark-dataframe-36e97896c378
        performance should be equal to doing this with map reduce.
""" if function_dict is not None: for key in function_dict.keys(): try: f = F.udf(function_dict[key][0], self.__check_f_type(function_dict[key][0])) df = df.withColumn( '%s' % key, f(*[F.col(x) for x in function_dict[key][1]])) # part of the fix related to JIRA ARESPY-20 #bad_func = F.udf(self.__check_not_bad, BooleanType()) #cols = [F.col(x) for x in function_dict[key][1]] #df = df.withColumn('%s' % key, f(*[F.col(x) for x in cols])) #df = df.withColumn('test', F.lit(bad_func(*cols))) #df = df.withColumn('test', F.lit(bad_func(*cols)).cast('string')=='false') #df = df.withColumn('test_%s' % key , F.when(F.lit(bad_func(*cols)).cast('string')=='false', f(*cols)).otherwise(F.lit('None'))) #*[F.col(x) for x in function_dict[key][1]], # f(*[F.col(x) for x in function_dict[key][1]]))) except utils.AnalysisException: logger.info( "Cannot find parameter of query %s in result given current columns %s. " "Please consider using different parameters or a different time period." % (str(function_dict[key][1]), str(df.columns))) pass # persist only the new results df.select([column for column in df.columns if column not in old_columns])\ .write.save(('%s/%s_udf_result' % (result_path, str(sc.applicationId))), mode='append') logger.info("Finished PyArEx job.")
sc = SparkContext(conf=SparkConf().setAppName('sparkoptimizationtest'))
sqlc = SQLContext(sc)

# Persisted results are also referenced via a dict (job_id -> result types)
results_dict = {'application_1533282160533_0003': ['udf']}

# create a function that can check the column names from persisted results
paths = ['hdfs://nameservice1/tmp/sparkdev/%s_%s_result/' % (key, value)
         for key in results_dict.keys()
         for value in results_dict[key]]

df = sqlc.parquetFile(paths[0])
for path in paths[1:]:
    df = df.join(sqlc.parquetFile(path), on='timestamp')
#df.show()

df = df.withColumn('add100_sa', F.col('add100_sa').astype('int'))
#print(df.dtypes)

#function_dict = {'add10_sa': (add10, ['sa']), 'add10_sx': (add10, ['sx']), 'addcols_sxsy': (addcols, ['sx', 'sy'])}
#function_dict = {'add10_sa': (add_udf, ['sa'])}
function_dict = {'add10_sa': (add10, ['add100_sa'])}

add_udf = F.udf(lambda x: '1' if x >= 30 else '0', StringType())


def check_type(f):
## warning message says to use
# /usr/lib/spark/bin/spark-submit spark_2.py

from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext('local', 'pyspark')
print(" *** hello world from pyspark")

## https://databricks-training.s3.amazonaws.com/data-exploration-using-spark-sql.html
sqlCtx = SQLContext(sc)
print(" *** hello world spark sql context created")

#wikiData = sqlCtx.parquetFile("data/wiki_parquet")
wikiData = sqlCtx.parquetFile("password.txt")
# get error:
# maybe the file was spooled to HDFS, but could not be processed correctly?
# py4j.protocol.Py4JJavaError: An error occurred while calling o36.parquetFile.
# : java.lang.AssertionError: assertion failed: No predefined schema found, and no Parquet data
#   files or summary files found under hdfs://clp24/user/hoti1/password.txt.

wikiData.count()
wikiData.registerAsTable("wikiData")
result = sqlCtx.sql("SELECT COUNT(*) AS pageCount FROM wikiData").collect()
print(result[0].pageCount)

print(" *** bye bye world")
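
# The assertion error above is expected: parquetFile() needs real Parquet data,
# not a plain text file. A minimal sketch of the working path (assuming Spark >= 1.4,
# that the failing parquetFile call above is commented out, and that 'demo_parquet'
# is a writable location -- the path and sample rows are illustrative assumptions):
from pyspark.sql import Row

demo = sqlCtx.createDataFrame([Row(title='a'), Row(title='b')])
demo.write.parquet('demo_parquet')  # produces Parquet data files plus schema metadata

demoData = sqlCtx.read.parquet('demo_parquet')
demoData.registerTempTable('demoData')
print(sqlCtx.sql("SELECT COUNT(*) AS pageCount FROM demoData").collect()[0].pageCount)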