Example #1
    def define_job(self,
                   function_dict=None,
                   perform_search=True,
                   persist_search=False,
                   calc_stats=False,
                   resolution='5m',
                   previous_results=None):
        """
        Define a job that can run calculations on parameter sample data in the ARES cluster.
        Read documentation for a more thorough description of the input dictionaries.
        :param function_dict: Dict containing the UDFs and the columns to perform them on.
        :param perform_search: Bool to flag whether you want to search for params in the job
        :param persist_search: Bool to persist the results of your search query as a separate df in HDFS
        :param calc_stats: Bool if you want to calculate and persist statistics with the resolution
        :param resolution: Can be the String '5m', '30m' or '1d', or any int in microseconds
        :param previous_results: Dict defining which job IDs and result types you want to reuse in the job
        """
        # TODO able to pass multiple functions on multiple columns
        # TODO columns is (nested) array of columns
        # TODO columns can also be 'all'
        # TODO create standard statistics calc function

        # TODO confirm that the statistics job is being correctly parallelized
        # TODO confirm if it's preferred to have a single DF for the stat job results or separate ones for each param

        # TODO there is a lot going on in this function. Is there a way to segregate the duties into subfunctions?
        """
        Spark SQL defines the following types of functions:
        (https://jaceklaskowski.gitbooks.io/mastering-spark-sql/spark-sql.html)
        standard functions or User-Defined Functions (UDFs) that take values from a single row as input to generate a single return value for every input row.
        basic aggregate functions that operate on a group of rows and calculate a single return value per group.
        window aggregate functions that operate on a group of rows and calculate a single return value for each row in a group.
        """
        # Security checks #
        # If you try to run an empty job.
        if perform_search is False and previous_results is None and calc_stats is False:
            return "Can't run an empty job."

        # If you try to run calculations on no data.
        if perform_search is False and previous_results is None and function_dict is not None:
            return "You need data to do calculations. Either use a search or previous results to run the job."

        #if calc_stats is True and resolution not in ('5m', '1h', '1d'):
        #    return "Resolution is in the wrong format. Please choose '5m', '1h' or '1d'."

        #if function_dict is None and calc_stats is False:
        #    return "You can't run search jobs"

        # Get the HDFS path to write the results to
        conf = paconf('pyarex_conf.ini').get_conf('HDFS')
        result_path = conf['result_path']

        # initialize the spark context and the logger
        sc = SparkContext(conf=SparkConf().setAppName(self.__name))
        log4jlogger = sc._jvm.org.apache.log4j
        logger = log4jlogger.LogManager.getLogger('PyArEx')
        logger.info("Running PyArEx job.")
        # part of the fix related to JIRA ARESPY-20
        #sqlc = HiveContext(sc)
        sqlc = SQLContext(sc)

        # get the needed data from the pickle file
        with open('pyarex.pkl', 'rb') as file:
            param_names, start, end = pickle.load(file)

        # if the data needs to come from HBase
        if perform_search:
            params = sc.parallelize(param_names)
            try:
                samples = params.flatMap(lambda x: self.__get_samples(
                    x, start, end)).filter(lambda x: x is not None)
                rows = samples.map(lambda x: Row(timestamp=x.get_time(),
                                                 value=x.get_value(),
                                                 var_name=x.get_name()))
                acdf = sqlc.createDataFrame(
                    rows, schema=['timestamp', 'value', 'var_name'])
                # TODO: for now the type is inferred from the sampling ratio; later this should be in the sample object
            except ValueError:
                logger.info(
                    "Search returned no data. "
                    "Please consider using different parameters or a different time period."
                )
                return

            df = acdf.groupBy('timestamp').pivot('var_name').sum('value')
            #df = self.__fill_nans(df)
            acdf.unpersist()  # remove the old df from memory
            old_columns = df.columns[1:]

            # check which parameters didn't return data from MariaDB
            for param_name in param_names:
                if param_name not in old_columns:
                    logger.info(
                        "Parameter ['%s'] was not found in the database or was not found within the "
                        "given time frame. Please check if this is a valid parameter name and consider "
                        "using a different time frame." % param_name)

            if persist_search:
                df.write.save(('%s/%s_search_result' %
                               (result_path, str(sc.applicationId))),
                              mode='append')

        # If you want to use results from previous jobs
        if previous_results is not None:
            paths = [
                '%s/%s_%s_result/' % (result_path, key, value)
                for key in previous_results.keys()
                for value in previous_results[key]
            ]
            if perform_search:
                for path in paths:
                    df = df.join(sqlc.parquetFile(path), on='timestamp')
                old_columns = df.columns[1:]
            else:
                df = sqlc.parquetFile(paths[0])
                for path in paths[1:]:
                    df = df.join(sqlc.parquetFile(path), on='timestamp')
                old_columns = df.columns[1:]

        # If you want to run a statistics job
        if calc_stats:
            stat_df = self.__calc_stats(df, resolution)
            stat_df.write.save(
                ('%s/%s_stat_result' % (result_path, str(sc.applicationId))),
                mode='append')
        """ If you want to run udfs on the parameters
        According to https://medium.com/@mrpowers/performing-operations-on-multiple-columns-in-a-pyspark-dataframe-36e97896c378
        performance should be equal to doing this with map reduce.
        """
        if function_dict is not None:
            for key in function_dict.keys():
                try:
                    f = F.udf(function_dict[key][0],
                              self.__check_f_type(function_dict[key][0]))
                    df = df.withColumn(
                        '%s' % key,
                        f(*[F.col(x) for x in function_dict[key][1]]))

                    # part of the fix related to JIRA ARESPY-20
                    #bad_func = F.udf(self.__check_not_bad, BooleanType())
                    #cols = [F.col(x) for x in function_dict[key][1]]
                    #df = df.withColumn('%s' % key, f(*[F.col(x) for x in cols]))

                    #df = df.withColumn('test', F.lit(bad_func(*cols)))
                    #df = df.withColumn('test', F.lit(bad_func(*cols)).cast('string')=='false')
                    #df = df.withColumn('test_%s' % key , F.when(F.lit(bad_func(*cols)).cast('string')=='false', f(*cols)).otherwise(F.lit('None')))

                    #*[F.col(x) for x in function_dict[key][1]],
                    #                              f(*[F.col(x) for x in function_dict[key][1]])))
                except utils.AnalysisException:
                    logger.info(
                        "Cannot find parameter of query %s in result given current columns %s. "
                        "Please consider using different parameters or a different time period."
                        % (str(function_dict[key][1]), str(df.columns)))
                    pass

            # persist only the new results
            df.select([column for column in df.columns if column not in old_columns])\
                .write.save(('%s/%s_udf_result' % (result_path, str(sc.applicationId))), mode='append')

        logger.info("Finished PyArEx job.")
Example #2

# imports required by this snippet
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, functions as F
from pyspark.sql.types import StringType

sc = SparkContext(conf=SparkConf().setAppName('sparkoptimizationtest'))
sqlc = SQLContext(sc)

# persisted results are also referenced through a dict
results_dict = {'application_1533282160533_0003': ['udf']}

# create a function that can check the column names from persisted results
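# Possible sketch of that helper (hypothetical, not part of the original script):
# read each persisted result and return its column names keyed by path.
def persisted_columns(sql_context, result_paths):
    return {path: sql_context.parquetFile(path).columns for path in result_paths}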

paths = [
    'hdfs://nameservice1/tmp/sparkdev/%s_%s_result/' % (key, value)
    for key in results_dict.keys() for value in results_dict[key]
]

df = sqlc.parquetFile(paths[0])
for path in paths[1:]:
    df = df.join(sqlc.parquetFile(path), on='timestamp')

#df.show()
df = df.withColumn('add100_sa', F.col('add100_sa').astype('int'))
#print(df.dtypes)

#function_dict = {'add10_sa': (add10, ['sa']), 'add10_sx': (add10, ['sx']), 'addcols_sxsy': (addcols, ['sx', 'sy'])}
#function_dict = {'add10_sa': (add_udf, ['sa'])}
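# add10 is assumed to be a plain Python function defined elsewhere in the original script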
function_dict = {'add10_sa': (add10, ['add100_sa'])}

add_udf = F.udf(lambda x: '1' if x >= 30 else '0', StringType())


def check_type(f):
Example #3
## warning message says to use
# /usr/lib/spark/bin/spark-submit  spark_2.py

from pyspark import SparkContext
from pyspark import SQLContext

sc = SparkContext('local', 'pyspark')
print("  *** hello world from pyspark")

## https://databricks-training.s3.amazonaws.com/data-exploration-using-spark-sql.html
sqlCtx = SQLContext(sc)

print("   *** hello world spark sql context created")

#wikiData = sqlCtx.parquetFile("data/wiki_parquet")
wikiData = sqlCtx.parquetFile("password.txt")

# get an error:
# maybe the file was spooled to HDFS, but Spark is unable to process it correctly?
# py4j.protocol.Py4JJavaError: An error occurred while calling o36.parquetFile.
#: java.lang.AssertionError: assertion failed: No predefined schema found, and no Parquet data files or summary files found under hdfs://clp24/user/hoti1/password.txt.
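# If the goal is only to read a plain text file, sc.textFile would work where
# parquetFile does not (sketch, assuming the same path is readable):
#textData = sc.textFile("password.txt")
#print(textData.count())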

wikiData.count()
wikiData.registerAsTable("wikiData")
result = sqlCtx.sql("SELECT COUNT(*) AS pageCount FROM wikiData").collect()

print(result[0].pageCount)

print("  *** bye bye world")