    def define_job(self, function_dict=None, perform_search=True, persist_search=False,
                   calc_stats=False, resolution='5m', previous_results=None):
        """
        Define a job that runs calculations on parameter sample data in the ARES cluster.
        Read the documentation for a more thorough description of the input dictionaries.

        :param function_dict: Dict containing the UDFs and the columns to apply them to.
        :param perform_search: Bool flagging whether to search for params in the job.
        :param persist_search: Bool to persist the results of your search query as a separate df in HDFS.
        :param calc_stats: Bool flagging whether to calculate and persist statistics at the given resolution.
        :param resolution: String '5m', '30m' or '1d', OR any int in microseconds.
        :param previous_results: Dict defining which job_ids and which result types to reuse in the job.
        """
        # TODO be able to pass multiple functions on multiple columns
        # TODO columns is a (nested) array of columns
        # TODO columns can also be 'all'
        # TODO create a standard statistics calc function
        # TODO confirm that the statistics job is being correctly parallelized
        # TODO confirm whether a single df for the stat job results is preferred over separate ones per param
        # TODO there is a lot going on in this function; consider segregating the duties into subfunctions
        """
        Spark SQL defines the following types of functions
        (https://jaceklaskowski.gitbooks.io/mastering-spark-sql/spark-sql.html):
        - standard functions or User-Defined Functions (UDFs), which take values from a single
          row as input and generate a single return value for every input row;
        - basic aggregate functions, which operate on a group of rows and calculate a single
          return value per group;
        - window aggregate functions, which operate on a group of rows and calculate a single
          return value for each row in a group.
        """
        # Security checks #
        # If you try to run an empty job.
        if perform_search is False and previous_results is None and calc_stats is False:
            return "Can't run an empty job."
        # If you try to run calculations on no data.
        if perform_search is False and previous_results is None and function_dict is not None:
            return "You need data to do calculations. Either use a search or previous results to run the job."
        #if calc_stats is True and resolution not in ('5m', '1h', '1d'):
        #    return "Resolution is in the wrong format. Please choose '5m', '1h' or '1d'."
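        # Expected shapes of the input dicts (illustrative, matching how they are
        # consumed below and in the test scripts further down):
        #   function_dict:    {'new_column_name': (callable, ['input_col', ...]), ...}
        #   previous_results: {'<job_id>': ['search' | 'stat' | 'udf', ...], ...}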
        # if function_dict is None and calc_stats is False:
        #     return "You can't run search jobs"

        # Get the HDFS path to write the results to
        conf = paconf('pyarex_conf.ini').get_conf('HDFS')
        result_path = conf['result_path']

        # Initialize the Spark context and the logger
        sc = SparkContext(conf=SparkConf().setAppName(self.__name))
        log4jlogger = sc._jvm.org.apache.log4j
        logger = log4jlogger.LogManager.getLogger('PyArEx')
        logger.info("Running PyArEx job.")

        # part of the fix related to JIRA ARESPY-20
        #sqlc = HiveContext(sc)
        sqlc = SQLContext(sc)

        # Get the needed data from the pickle file
        with open('pyarex.pkl', 'rb') as file:
            param_names, start, end = pickle.load(file)

        # If the data needs to come from HBase
        if perform_search:
            params = sc.parallelize(param_names)
            try:
                samples = params.flatMap(lambda x: self.__get_samples(x, start, end))\
                    .filter(lambda x: x is not None)
                rows = samples.map(lambda x: Row(timestamp=x.get_time(),
                                                 value=x.get_value(),
                                                 var_name=x.get_name()))
                acdf = sqlc.createDataFrame(rows, schema=['timestamp', 'value', 'var_name'])
                # TODO: for now the type is inferred from the sampling ratio; later this should be in the sample object
            except ValueError:
                logger.info(
                    "Search returned no data. "
                    "Please consider using different parameters or a different time period.")
                return

            df = acdf.groupBy('timestamp').pivot('var_name').sum('value')
            #df = self.__fill_nans(df)
            acdf.unpersist()  # remove the old df from memory
            old_columns = df.columns[1:]

            # Check which parameters didn't return data from MariaDB
            for param_name in param_names:
                if param_name not in old_columns:
                    logger.info(
                        "Parameter ['%s'] was not found in the database or was not found within the "
                        "given time frame. Please check if this is a valid parameter name and consider "
                        "using a different time frame." % param_name)

            if persist_search:
                df.write.save(('%s/%s_search_result' % (result_path, str(sc.applicationId))),
                              mode='append')

        # If you want to use results from previous jobs
        if previous_results is not None:
            paths = ['%s/%s_%s_result/' % (result_path, key, value)
                     for key in previous_results.keys()
                     for value in previous_results[key]]
            if perform_search:
                for path in paths:
                    df = df.join(sqlc.parquetFile(path), on='timestamp')
                old_columns = df.columns[1:]
            else:
                df = sqlc.parquetFile(paths[0])
                for path in paths[1:]:
                    df = df.join(sqlc.parquetFile(path), on='timestamp')
                old_columns = df.columns[1:]

        # If you want to run a statistics job
        if calc_stats:
            stat_df = self.__calc_stats(df, resolution)
            stat_df.write.save(('%s/%s_stat_result' % (result_path, str(sc.applicationId))),
                               mode='append')

        """
        If you want to run UDFs on the parameters:
        according to
        https://medium.com/@mrpowers/performing-operations-on-multiple-columns-in-a-pyspark-dataframe-36e97896c378
        performance should be equal to doing this with map reduce.
""" if function_dict is not None: for key in function_dict.keys(): try: f = F.udf(function_dict[key][0], self.__check_f_type(function_dict[key][0])) df = df.withColumn( '%s' % key, f(*[F.col(x) for x in function_dict[key][1]])) # part of the fix related to JIRA ARESPY-20 #bad_func = F.udf(self.__check_not_bad, BooleanType()) #cols = [F.col(x) for x in function_dict[key][1]] #df = df.withColumn('%s' % key, f(*[F.col(x) for x in cols])) #df = df.withColumn('test', F.lit(bad_func(*cols))) #df = df.withColumn('test', F.lit(bad_func(*cols)).cast('string')=='false') #df = df.withColumn('test_%s' % key , F.when(F.lit(bad_func(*cols)).cast('string')=='false', f(*cols)).otherwise(F.lit('None'))) #*[F.col(x) for x in function_dict[key][1]], # f(*[F.col(x) for x in function_dict[key][1]]))) except utils.AnalysisException: logger.info( "Cannot find parameter of query %s in result given current columns %s. " "Please consider using different parameters or a different time period." % (str(function_dict[key][1]), str(df.columns))) pass # persist only the new results df.select([column for column in df.columns if column not in old_columns])\ .write.save(('%s/%s_udf_result' % (result_path, str(sc.applicationId))), mode='append') logger.info("Finished PyArEx job.")
sc = SparkContext(conf=SparkConf().setAppName('sparkoptimizationtest'))
sqlc = SQLContext(sc)

# Persisted results are also referenced via a dict (job_id -> result types)
results_dict = {'application_1533282160533_0003': ['udf']}

# create a function that can check the column names from persisted results
paths = ['hdfs://nameservice1/tmp/sparkdev/%s_%s_result/' % (key, value)
         for key in results_dict.keys()
         for value in results_dict[key]]

df = sqlc.parquetFile(paths[0])
for path in paths[1:]:
    df = df.join(sqlc.parquetFile(path), on='timestamp')
#df.show()

df = df.withColumn('add100_sa', F.col('add100_sa').astype('int'))
#print(df.dtypes)

#function_dict = {'add10_sa': (add10, ['sa']), 'add10_sx': (add10, ['sx']), 'addcols_sxsy': (addcols, ['sx', 'sy'])}
#function_dict = {'add10_sa': (add_udf, ['sa'])}
function_dict = {'add10_sa': (add10, ['add100_sa'])}

add_udf = F.udf(lambda x: '1' if x >= 30 else '0', StringType())


def check_type(f):
## warning message says to use
# /usr/lib/spark/bin/spark-submit spark_2.py

from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext('local', 'pyspark')
print(" *** hello world from pyspark")

## https://databricks-training.s3.amazonaws.com/data-exploration-using-spark-sql.html
sqlCtx = SQLContext(sc)
print(" *** hello world spark sql context created")

#wikiData = sqlCtx.parquetFile("data/wiki_parquet")
wikiData = sqlCtx.parquetFile("password.txt")
# get error:
# maybe the file was spooled to HDFS, but could not be processed correctly?
# py4j.protocol.Py4JJavaError: An error occurred while calling o36.parquetFile.
# : java.lang.AssertionError: assertion failed: No predefined schema found, and no Parquet data
#   files or summary files found under hdfs://clp24/user/hoti1/password.txt.

wikiData.count()
wikiData.registerAsTable("wikiData")
result = sqlCtx.sql("SELECT COUNT(*) AS pageCount FROM wikiData").collect()
print(result[0].pageCount)

print(" *** bye bye world")
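
# The assertion error above is expected: parquetFile() needs real Parquet data,
# not a plain text file. A minimal sketch of the working path (assuming Spark >= 1.4,
# that the failing parquetFile call above is commented out, and that 'demo_parquet'
# is a writable location -- the path and sample rows are illustrative assumptions):
from pyspark.sql import Row

demo = sqlCtx.createDataFrame([Row(title='a'), Row(title='b')])
demo.write.parquet('demo_parquet')  # produces Parquet data files plus schema metadata

demoData = sqlCtx.read.parquet('demo_parquet')
demoData.registerTempTable('demoData')
print(sqlCtx.sql("SELECT COUNT(*) AS pageCount FROM demoData").collect()[0].pageCount)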