Example #1
    def test_window_functions(self):
        df = self.sqlCtx.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
        w = Window.partitionBy("value").orderBy("key")
        from pyspark.sql import functions as F

        sel = df.select(
            df.value,
            df.key,
            F.max("key").over(w.rowsBetween(0, 1)),
            F.min("key").over(w.rowsBetween(0, 1)),
            F.count("key").over(w.rowsBetween(float("-inf"), float("inf"))),
            F.rowNumber().over(w),
            F.rank().over(w),
            F.denseRank().over(w),
            F.ntile(2).over(w),
        )
        rs = sorted(sel.collect())
        expected = [
            ("1", 1, 1, 1, 1, 1, 1, 1, 1),
            ("2", 1, 1, 1, 3, 1, 1, 1, 1),
            ("2", 1, 2, 1, 3, 2, 1, 1, 1),
            ("2", 2, 2, 2, 3, 3, 3, 2, 2),
        ]
        for r, ex in zip(rs, expected):
            self.assertEqual(tuple(r), ex[: len(r)])
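Note: rowNumber() and denseRank() were renamed to row_number() and dense_rank() in Spark 1.6, and the old names were removed in Spark 2.0. A minimal sketch of the same query against the current API, assuming a SparkSession named spark:

from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.appName("window-demo").getOrCreate()
df = spark.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
w = Window.partitionBy("value").orderBy("key")
sel = df.select(
    df.value,
    df.key,
    F.max("key").over(w.rowsBetween(Window.currentRow, 1)),
    F.min("key").over(w.rowsBetween(Window.currentRow, 1)),
    F.count("key").over(w.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)),
    F.row_number().over(w),
    F.rank().over(w),
    F.dense_rank().over(w),
    F.ntile(2).over(w),
)
sel.show()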
Example #2
# imports assumed to be present in the original server.py:
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import udf, unix_timestamp, rowNumber, max, min
from pyspark.sql.types import IntegerType, FloatType
from pyspark.sql.window import Window


def do_something_only_once():
    # the command I use to run this script:
    #~/spark-1.6.1/bin/spark-submit --packages=com.databricks:spark-avro_2.10:2.0.1,com.databricks:spark-csv_2.10:1.4.0 server.py
    global topdis, meta, dic, towo, cluto, doctopdat, maxdate, mindate, lda
    ## Loading of data
    sc = SparkContext(appName='Simple App') #"local"
    sqlContext = SQLContext(sc)
    # Load metadata avro
    reader = sqlContext.read.format('com.databricks.spark.avro')
    meta = reader.load('data/spark_metadata.avro')
    # # Loading topic distributions
    topdisFile = 'data/spark_output.tuples'
    csvLoader = sqlContext.read.format('com.databricks.spark.csv')
    topdis = csvLoader.options(delimiter=',',header='false', inferschema='true').load(topdisFile)
    # the .tuples rows apparently carry wrapper characters (e.g. parentheses); these
    # udfs strip them from the first and last columns and cast the values to numbers
    strip_first_col_int = udf(lambda row: int(row[1:]), IntegerType())
    topdis = topdis.withColumn('C0', strip_first_col_int(topdis['C0']))
    strip_first_col_float = udf(lambda row: float(row[1:]), FloatType())
    topdis = topdis.withColumn('C1', strip_first_col_float(topdis['C1']))
    strip_last_col = udf(lambda row: float(row[:-2]), FloatType())
    topdis = topdis.withColumn('C20', strip_last_col(topdis['C20']))
    # # Load dictionary CSV
    dicFile = 'data/spark_dic.csv'
    csvLoader = sqlContext.read.format('com.databricks.spark.csv')
    dic = csvLoader.options(delimiter='\t', header='false', inferschema='true').load(dicFile)
    dic = dic.select(dic['C0'].alias('id'), dic['C1'].alias('word'), dic['C2'].alias('count'))
    ldaFile = 'data/spark_lda.csv'
    csvLoader = sqlContext.read.format('com.databricks.spark.csv')
    lda = csvLoader.options(delimiter='\t', header='false', inferschema='true').load(ldaFile)
    # rowNumber() needs a window spec; ordering by the first CSV column is an
    # assumption about how the original meant to assign sequential ids
    w_lda = Window.orderBy(lda['C0'])
    lda = lda.select(rowNumber().over(w_lda).alias('id'), *lda.columns)
    lda = lda.join(dic, dic['id'] == lda['id'], 'inner').cache()
    # dic = dic.select(dic['C0'].alias('id'), dic['C1'].alias('word'), dic['C2'].alias('count'))
    # # # Load clustertopics CSV
    # clutoFile = 'enron_small_clustertopics.csv'
    # csvLoader = sqlContext.read.format('com.databricks.spark.csv')
    # cluto = csvLoader.options(delimiter=',', header='false', inferschema='true').load(clutoFile)
    # # # Load topicswords CSV
    # towoFile = 'enron_small_lda_transposed.csv'
    # csvLoader = sqlContext.read.format('com.databricks.spark.csv')
    # towo = csvLoader.options(delimiter=',', header='false', inferschema='true').load(towoFile)
    # # Merge topdis which has document id and with metadata, based on document id
    metasmall = meta.select('id',unix_timestamp(meta['date'],"yyyy-MM-dd'T'HH:mm:ssX").alias("timestamp"))
    doctopdat = topdis.join(metasmall, metasmall.id == topdis.C0,'inner').cache()
    maxdate = doctopdat.select(max('timestamp').alias('maxtimestamp')).collect()[0]['maxtimestamp']
    mindate = doctopdat.select(min('timestamp').alias('mintimestamp')).collect()[0]['mintimestamp']
Example #3
def proportion_samples(sdf, proportions_sdf, count_column='rows_count'):
    '''Stratified sampling: keep at most a given number of rows per stratum
    Parameters
    ----------
    sdf : spark DataFrame to sample from
    proportions_sdf : spark DataFrame with per-stratum row counts to sample from sdf
    count_column : column name with the counts; the other columns are used as stratifiers

    Returns
    ----------
    sampled : spark DataFrame with at most the count given in proportions_sdf for each stratum
    '''
    import pyspark.sql.functions as F
    from pyspark.sql.window import Window
    groupers = [c for c in proportions_sdf.columns if c != count_column]

    # rowNumber() generally needs an ordered window spec; ordering randomly within
    # each stratum (F.rand()) is an assumption about the intended sampling here
    window = Window.partitionBy(groupers).orderBy(F.rand())
    sampled = sdf.join(proportions_sdf, groupers, how='inner') \
        .withColumn('rownum', F.rowNumber().over(window)) \
        .filter(F.col('rownum') <= F.col(count_column)) \
        .drop(count_column).drop('rownum')
    return sampled
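A hypothetical call, assuming an existing Spark 1.x sqlContext as in the other examples: keep at most two 'NY' rows and one 'SF' row.

sdf = sqlContext.createDataFrame(
    [("NY", 1), ("NY", 2), ("NY", 3), ("SF", 4)], ["city", "val"])
proportions_sdf = sqlContext.createDataFrame(
    [("NY", 2), ("SF", 1)], ["city", "rows_count"])
proportion_samples(sdf, proportions_sdf).show()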
Example #4
def main():
    "Main function"
    optmgr = OptionParser()
    opts = optmgr.parser.parse_args()

    config = ConfigParser.ConfigParser()
    config.read(CONFIG_PATH)
    if opts.es:
        esnode = config.get('ElasticSearch', 'node')
        esport = config.get('ElasticSearch', 'port')
        esresource = config.get('ElasticSearch', 'resource')

    # setup spark/sql context to be used for communication with HDFS
    sc = SparkContext(appName="phedex_br")

    # setting spark log level
    logs = opts.logs.upper()
    validateLogLevel(logs)
    sc.setLogLevel(logs)

    # setting up spark sql variables
    sqlContext = HiveContext(sc)
    schema_def = schema()

    # read given file(s) into RDD
    if opts.fname:
        pdf = sqlContext.read.format('com.databricks.spark.csv')\
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                        .load(opts.fname, schema = schema_def)
    elif opts.basedir:
        fromdate, todate = defDates(opts.fromdate, opts.todate)
        datedic = generateDateDict(fromdate, todate, opts.interval)
        boundic = generateBoundDict(datedic)
        max_interval = max(datedic.values())
        lastdate = sorted(datedic.keys())[-1]
        files = getFileList(opts.basedir, fromdate, lastdate)
        msg = "Between dates %s and %s found %d directories" % (
            fromdate, lastdate, len(files))
        print(msg)

        if not files:
            return
        pdf = unionAll([sqlContext.read.format('com.databricks.spark.csv')
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                        .load(file_path, schema = schema_def) \
                        for file_path in files])
    else:
        raise ValueError(
            "File or directory not specified. Specify fname or basedir parameters."
        )

    # parse additional data (add group name, node kind, acquisition era, data tier and current date to the given data)
    groupdic, nodedic = getJoinDic()
    acquisition_era_reg = r"^/[^/]*/([^/^-]*)-[^/]*/[^/]*$"
    data_tier_reg = r"^/[^/]*/[^/^-]*-[^/]*/([^/]*)$"
    node_tier_reg = r"^(.{2})"
    campaign_reg = r"^/[^/]*/([^/]*)-[^/]*/[^/]*$"
    groupf = udf(lambda x: groupdic.get(x, ''), StringType())
    nodef = udf(lambda x: nodedic.get(x, ''), StringType())
    # note: this udf is defined but never used below; regexp_extract is a Column
    # function and would not work on plain Python values inside a udf anyway
    regexudf = udf(lambda x, y: bool(regexp_extract(x, y, 1)), BooleanType())

    ndf = pdf.withColumn("br_user_group", groupf(pdf.br_user_group_id)) \
         .withColumn("node_kind", nodef(pdf.node_id)) \
         .withColumn("now", from_unixtime(pdf.now_sec, "yyyy-MM-dd")) \
         .withColumn("acquisition_era", when(regexp_extract(pdf.dataset_name, acquisition_era_reg, 1) == "",\
                     lit("null")).otherwise(regexp_extract(pdf.dataset_name, acquisition_era_reg, 1))) \
         .withColumn("data_tier", when(regexp_extract(pdf.dataset_name, data_tier_reg, 1) == "",\
                     lit("null")).otherwise(regexp_extract(pdf.dataset_name, data_tier_reg, 1)))\
         .withColumn("node_tier", when(regexp_extract(pdf.node_name, node_tier_reg, 1) == "",\
                     lit("null")).otherwise(regexp_extract(pdf.node_name, node_tier_reg, 1)))\
         .withColumn("campaign", when(regexp_extract(pdf.dataset_name, campaign_reg, 1) == "",\
                     lit("null")).otherwise(regexp_extract(pdf.dataset_name, campaign_reg, 1)))

    # print dataframe schema
    if opts.verbose:
        print("pdf data type", type(ndf))
        ndf.printSchema()
        print("ndf total:", ndf.count())
        for row in ndf.head(10):
            print(row)

    # process aggregation parameters
    keys = [key.lower().strip() for key in opts.keys.split(',')]
    results = [result.lower().strip() for result in opts.results.split(',')]
    aggregations = [agg.strip() for agg in opts.aggregations.split(',')]
    order = [orde.strip()
             for orde in opts.order.split(',')] if opts.order else []
    asc = [asce.strip() for asce in opts.asc.split(',')] if opts.order else []
    filtc = [fil.split(':')[0]
             for fil in opts.filt.split(',')] if opts.filt else []
    filtv = [fil.split(':')[1]
             for fil in opts.filt.split(',')] if opts.filt else []
    isavgday = (AVERAGEDAY in aggregations)

    validateAggregationParams(keys, results, aggregations, order, filtc)

    # filtering data by regex
    for index, val in enumerate(filtc):
        ndf = ndf.filter(
            regexp_extract(getattr(ndf, val), filtv[index], 0) != "")

    # if delta aggregation is used
    if DELTA in aggregations:
        if len(results) != 1:
            raise ValueError("Delta aggregation can have only 1 result field")
        result = results[0]

        #1 for all dates generate interval group dictionary
        interval_group = udf(lambda x: datedic[x], IntegerType())
        interval_start = udf(lambda x: boundic[x][0], StringType())
        interval_end = udf(lambda x: boundic[x][1], StringType())

        #2 group data by block, node, interval and last result in the interval
        ndf = ndf.select(ndf.block_name, ndf.node_name, ndf.now,
                         getattr(ndf, result))
        idf = ndf.withColumn("interval_group", interval_group(ndf.now))
        win = Window.partitionBy(idf.block_name, idf.node_name,
                                 idf.interval_group).orderBy(idf.now.desc())
        idf = idf.withColumn("row_number", rowNumber().over(win))
        rdf = idf.where((idf.row_number == 1) & (idf.interval_group != 0))\
                 .withColumn(result, when(idf.now == interval_end(idf.interval_group), getattr(idf, result)).otherwise(lit(0)))
        rdf = rdf.select(rdf.block_name, rdf.node_name, rdf.interval_group,
                         getattr(rdf, result))

        #3 create intervals that do not exist but have a negative delta
        win = Window.partitionBy(idf.block_name,
                                 idf.node_name).orderBy(idf.interval_group)
        adf = rdf.withColumn("interval_group_aft",
                             lead(rdf.interval_group, 1, 0).over(win))
        hdf = adf.filter(((adf.interval_group + 1) != adf.interval_group_aft) & (adf.interval_group != max_interval))\
                 .withColumn("interval_group", adf.interval_group + 1)\
                 .withColumn(result, lit(0))\
                 .drop(adf.interval_group_aft)

        #4 union the two data frames
        idf = rdf.unionAll(hdf)

        #5 join every interval with the previous interval
        win = Window.partitionBy(idf.block_name,
                                 idf.node_name).orderBy(idf.interval_group)
        fdf = idf.withColumn(
            "delta",
            getattr(idf, result) - lag(getattr(idf, result), 1, 0).over(win))

        #6 calculate delta_plus and delta_minus columns and aggregate by date and node
        ddf = fdf.withColumn("delta_plus", when(fdf.delta > 0, fdf.delta).otherwise(0)) \
                 .withColumn("delta_minus", when(fdf.delta < 0, fdf.delta).otherwise(0))

        aggres = ddf.groupBy(ddf.node_name, ddf.interval_group).agg(sum(ddf.delta_plus).alias("delta_plus"),\
                                                                    sum(ddf.delta_minus).alias("delta_minus"))

        aggres = aggres.select(
            aggres.node_name,
            interval_end(aggres.interval_group).alias("date"),
            aggres.delta_plus, aggres.delta_minus)

    else:
        # clean-up unnecessary dataframe and columns
        pdf.unpersist()

        if isavgday:
            datescount = ndf.select(ndf.now).distinct().count()
            aggregations = [
                "sum" if aggregation == "avg-day" else aggregation
                for aggregation in aggregations
            ]

        resAgg_dic = zipResultAgg(results, aggregations)
        order, asc = formOrdAsc(order, asc, resAgg_dic)

        # perform aggregation
        if order:
            aggres = ndf.groupBy(keys).agg(resAgg_dic).orderBy(order,
                                                               ascending=asc)
        else:
            aggres = ndf.groupBy(keys).agg(resAgg_dic)
        ndf.unpersist()

        # if average day then divide by dates count
        if isavgday:
            resfields = [
                resAgg_dic[result] + "(" + result + ")" for result in results
            ]
            for field in resfields:
                aggres = aggres.withColumn(field,
                                           getattr(aggres, field) / datescount)

    # output results
    if opts.fout:
        is_header = str(opts.header).lower()
        fout_header = formFileHeader(opts.fout)

        if opts.collect:
            fout_header = fout_header + ".json"
            aggres = aggres.toJSON().collect()
            with open(fout_header, 'w+') as f:
                f.write('[')
                f.write(",".join(aggres))
                f.write(']')
        else:
            lines = aggres.map(to_csv)
            lines.saveAsTextFile(opts.fout)

        if opts.es:
            validateEsParams(esnode, esport, esresource)
            aggres = aggres.withColumn("origin", lit(opts.esorigin))
            aggres.repartition(1).write.format("org.elasticsearch.spark.sql")\
                  .option("es.nodes", esnode)\
                  .option("es.port", esport)\
                  .option("es.resource", esresource)\
                  .save(mode="append")
    else:
        for row in aggres.head(10):
            print(row)
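The delta branch above uses lag() over a (block, node) window to turn per-interval cumulative values into per-interval changes. A small self-contained sketch of that step with made-up data (column names are illustrative; an existing sqlContext is assumed):

from pyspark.sql import functions as F
from pyspark.sql.window import Window

data = [("b1", "n1", 1, 10), ("b1", "n1", 2, 25), ("b1", "n1", 3, 20)]
df = sqlContext.createDataFrame(data, ["block_name", "node_name", "interval_group", "size"])
win = Window.partitionBy("block_name", "node_name").orderBy("interval_group")
delta_df = df.withColumn("delta", F.col("size") - F.lag("size", 1, 0).over(win)) \
             .withColumn("delta_plus", F.when(F.col("delta") > 0, F.col("delta")).otherwise(0)) \
             .withColumn("delta_minus", F.when(F.col("delta") < 0, F.col("delta")).otherwise(0))
delta_df.show()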
Example #5
# imports assumed for this snippet:
import uuid

import pyspark_cassandra
from pyspark import SparkConf
from pyspark.sql import HiveContext
from pyspark.sql import functions as func
from pyspark.sql.window import Window


def main():
    cluster_seeds = ['199.60.17.136', '199.60.17.173']
    conf = SparkConf().set('spark.cassandra.connection.host',
                           ','.join(cluster_seeds))
    sc = pyspark_cassandra.CassandraSparkContext(conf=conf)
    sqlContext = HiveContext(sc)

    # Loading yelp_business data
    yelp_business = sc.cassandraTable("bigbang", "yelp_business").\
        select('business_id', 'city', 'full_address', 'latitude', 'longitude', 'name', 'open', 'review_count', 'stars', 'state')\
        .toDF()
    yelp_business.registerTempTable('business')

    # Loading yelp_user data
    yelp_user = sc.cassandraTable("bigbang",
                                  "yelp_user").select('user_id', 'name',
                                                      'yelping_since').toDF()
    yelp_user.registerTempTable('user')

    # Loading user_pred_ratings data
    user_pred_ratings = sc.cassandraTable("bigbang", "user_pred_ratings").\
       select('user_id', 'business_id', 'user_pred_rating').toDF()
    user_pred_ratings.registerTempTable('userPredRatings')

    # Loading checkin data
    business_checkin = sc.cassandraTable("bigbang", "yelp_business_checkin")\
        .select('business_id', 'checkin','day','hour').toDF()
    business_checkin.registerTempTable('businessCheckin')

    # Joining user, business and ratings to get denormalized data
    user_res_pred_denorm_df = sqlContext.sql(
        """SELECT a.user_id as user_id, b.city as city, a.business_id as business_id, b.full_address as full_address,
       b.latitude as latitude, b.longitude as longitude, b.name as business_name, b.open as open,
       b.review_count as review_count, b.stars as stars, b.state as state, c.name as user_name,
       a.user_pred_rating as user_pred_rating
       from userPredRatings a inner join business b
       on a.business_id = b.business_id
       inner join user c
       on a.user_id = c.user_id""")

    # user_res_pred_denorm_df.registerTempTable('userResPredDenorm')
    #
    # recommendations_top10_df = sqlContext.sql(
    #    """SELECT user_id, city, business_id, full_address, latitude, longitude, business_name, open, review_count,
    #    stars, state, user_name, user_pred_rating FROM
    #     (SELECT user_id, city, business_id, full_address, latitude, longitude, business_name, open, review_count, stars,
    #     state, user_name, user_pred_rating, row_number() OVER (PARTITION BY user_id, state ORDER BY user_pred_rating desc) rn
    #     FROM userResPredDenorm) temp
    #     WHERE temp.rn <= 20""")

    # Retrieve the top 20 restaurants per user per state using window analytic functions
    windowSpec = Window.partitionBy(user_res_pred_denorm_df['user_id'],user_res_pred_denorm_df['state']).\
        orderBy(user_res_pred_denorm_df['user_pred_rating'].desc())

    rn_business = (func.rowNumber().over(windowSpec))
    recommendations_ranked_df = user_res_pred_denorm_df.select(
        user_res_pred_denorm_df['user_id'], user_res_pred_denorm_df['city'],
        user_res_pred_denorm_df['business_id'],
        user_res_pred_denorm_df['full_address'],
        user_res_pred_denorm_df['latitude'],
        user_res_pred_denorm_df['longitude'],
        user_res_pred_denorm_df['business_name'],
        user_res_pred_denorm_df['open'],
        user_res_pred_denorm_df['review_count'],
        user_res_pred_denorm_df['stars'], user_res_pred_denorm_df['state'],
        user_res_pred_denorm_df['user_name'],
        user_res_pred_denorm_df['user_pred_rating'], rn_business.alias("rn"))

    recommendations_top20_df = recommendations_ranked_df.filter(
        recommendations_ranked_df["rn"] <= 20).select(
            recommendations_ranked_df['user_id'],
            recommendations_ranked_df['city'],
            recommendations_ranked_df['business_id'],
            recommendations_ranked_df['full_address'],
            recommendations_ranked_df['latitude'],
            recommendations_ranked_df['longitude'],
            recommendations_ranked_df['business_name'],
            recommendations_ranked_df['open'],
            recommendations_ranked_df['review_count'],
            recommendations_ranked_df['stars'],
            recommendations_ranked_df['state'],
            recommendations_ranked_df['user_name'],
            recommendations_ranked_df['user_pred_rating'])
    recommendations_top20_df.registerTempTable('recommendations')

    recommendations_top20_rdd = recommendations_top20_df.rdd.map(lambda p: (
        str(uuid.uuid1()), p['user_id'], p['city'], p['business_id'],
        p['full_address'], p['latitude'], p['longitude'], p['business_name'],
        p['open'], p['review_count'], p['stars'], p['state'], p['user_name'],
        p['user_pred_rating']))
    # Loading the recommendations table in cassandra
    recommendations_top20_rdd.saveToCassandra(
        'bigbang',
        'recommendations',
        columns=[
            "uid", "user_id", "business_city", "business_id",
            "business_full_address", "business_latitude", "business_longitude",
            "business_name", "business_open", "business_review_count",
            "business_stars", "business_state", "user_name", "user_pred_rating"
        ],
        parallelism_level=1000)

    # Joining recommendations and checkin data
    user_res_checkin_denorm_df = sqlContext.sql(
        """SELECT a.user_id as user_id, a.city as business_city, a.business_id as business_id,
       a.full_address as business_full_address, a.latitude as business_latitude,
       a.longitude as business_longitude, a.business_name as business_name, a.open as business_open,
       a.review_count as business_review_count, a.stars as business_stars,
       a.state as business_state, a.user_name as user_name,
       a.user_pred_rating as user_pred_rating, b.day as day, b.hour as hour, b.checkin as checkin
       from recommendations a inner join businessCheckin b
       on a.business_id = b.business_id""")

    recommendations_checkin_rdd = user_res_checkin_denorm_df.rdd.map(lambda p: (
        str(uuid.uuid1()), p['user_id'], p['business_city'], p['business_id'],
        p['business_full_address'], p['business_latitude'], p['business_longitude'],
        p['business_name'], p['business_open'], p['business_review_count'],
        p['business_stars'], p['business_state'], p['user_name'],
        p['user_pred_rating'], p['day'], p['hour'], p['checkin']))
    # Loading the recommendations_checkin table in cassandra
    recommendations_checkin_rdd.saveToCassandra(
        'bigbang',
        'recommendations_checkin',
        columns=[
            "uid", "user_id", "business_city", "business_id",
            "business_full_address", "business_latitude", "business_longitude",
            "business_name", "business_open", "business_review_count",
            "business_stars", "business_state", "user_name",
            "user_pred_rating", "day", "hour", "checkin"
        ],
        parallelism_level=1000)
Example #6
# assumed imports for this snippet (sqlContext, textScorePair and tfidf come from
# earlier parts of the original script):
from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField
from pyspark.sql.functions import rowNumber
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.linalg import VectorUDT

# build a DataFrame with the columns listed in `schema`; the DataFrame is needed
# because VectorAssembler reads its input columns from a DataFrame
schema = 'score,gilded,distinguished,controversiality'.split(',')
itemsForDataFrame = textScorePair.map(
    lambda x: [x[1], x[0][2], x[0][3], x[0][4]])
otherFeaturesDF = sqlContext.createDataFrame(itemsForDataFrame, schema)

# build a DataFrame for tf_idf, for the same reason as above
tfidfSchema = StructType([StructField("tf_idf", VectorUDT(), True)])
row = Row("tf_idf")
tfidfDF = tfidf.map(lambda x: row(x)).toDF(tfidfSchema)

# add a row number to both DataFrames so they can be joined
w = Window().orderBy()
otherFeaturesDF = otherFeaturesDF.withColumn("columnindex",
                                             rowNumber().over(w))
tfidfDF = tfidfDF.withColumn("columnindex", rowNumber().over(w))

mergedDF = otherFeaturesDF.join(
    tfidfDF, otherFeaturesDF.columnindex == tfidfDF.columnindex, 'inner')

# assemble the tf-idf and other features to form a single vector
assembler = VectorAssembler(
    inputCols=["tf_idf", "gilded", "distinguished", "controversiality"],
    outputCol="features")
mergedDF = assembler.transform(mergedDF)
mergedDF.show()
scoreFeaturesPair = mergedDF.map(lambda x: (x[7], x[0])).repartition(500)
features = scoreFeaturesPair.map(lambda x: x[0])
scores = scoreFeaturesPair.map(lambda x: int(x[1]))
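Pairing two DataFrames with rowNumber() over an empty window, as above, only works if both sides keep exactly the same row order, which Spark does not guarantee after shuffles. A sketch of a more order-explicit alternative using zipWithIndex on the underlying RDDs (reusing otherFeaturesDF and tfidfDF from above):

def with_row_index(df):
    # append each row's position in the RDD as a 'columnindex' column
    rdd = df.rdd.zipWithIndex().map(lambda pair: tuple(pair[0]) + (pair[1],))
    return rdd.toDF(df.columns + ["columnindex"])

merged_by_index = with_row_index(otherFeaturesDF).join(
    with_row_index(tfidfDF), "columnindex", "inner")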
Example #7
def main():
    "Main function"
    optmgr  = OptionParser()
    opts = optmgr.parser.parse_args()

    # setup spark/sql context to be used for communication with HDFS
    sc = SparkContext(appName="phedex_br")
    if not opts.yarn:
        sc.setLogLevel("ERROR")
    sqlContext = HiveContext(sc)

    schema_def = schema()

    # read given file(s) into RDD
    if opts.fname:
        pdf = sqlContext.read.format('com.databricks.spark.csv')\
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                        .load(opts.fname, schema = schema_def)
    elif opts.basedir:
        fromdate, todate = defDates(opts.fromdate, opts.todate)
        files = getFileList(opts.basedir, fromdate, todate)
        msg = "Between dates %s and %s found %d directories" % (fromdate, todate, len(files))
        print(msg)

        if not files:
            return
        pdf = unionAll([sqlContext.read.format('com.databricks.spark.csv')
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                        .load(file_path, schema = schema_def) \
                        for file_path in files])
    else:
        raise ValueError("File or directory not specified. Specify fname or basedir parameters.")

    # parse additional data (add group name, node kind, acquisition era, data tier and current date to the given data)
    groupdic, nodedic = getJoinDic()
    acquisition_era_reg = r"^/[^/]*/([^/^-]*)-[^/]*/[^/]*$"	
    data_tier_reg = r"^/[^/]*/[^/^-]*-[^/]*/([^/]*)$"
    groupf = udf(lambda x: groupdic[x], StringType())
    nodef = udf(lambda x: nodedic[x], StringType())

    ndf = pdf.withColumn("br_user_group", groupf(pdf.br_user_group_id)) \
         .withColumn("node_kind", nodef(pdf.node_id)) \
         .withColumn("now", from_unixtime(pdf.now_sec, "yyyy-MM-dd")) \
         .withColumn("acquisition_era", when(regexp_extract(pdf.dataset_name, acquisition_era_reg, 1) == "",\
                     lit("null")).otherwise(regexp_extract(pdf.dataset_name, acquisition_era_reg, 1))) \
         .withColumn("data_tier", when(regexp_extract(pdf.dataset_name, data_tier_reg, 1) == "",\
                     lit("null")).otherwise(regexp_extract(pdf.dataset_name, data_tier_reg, 1)))

    # print dataframe schema
    if opts.verbose:
        ndf.show()
        print("pdf data type", type(ndf))
        ndf.printSchema()

    # process aggregation parameters
    keys = [key.lower().strip() for key in opts.keys.split(',')]
    results = [result.lower().strip() for result in opts.results.split(',')]
    aggregations = [agg.strip() for agg in opts.aggregations.split(',')]
    order = [orde.strip() for orde in opts.order.split(',')] if opts.order else []
    asc = [asce.strip() for asce in opts.asc.split(',')] if opts.order else []
    filtc, filtv = opts.filt.split(":") if opts.filt else (None,None)

    validateAggregationParams(keys, results, aggregations, order, filtc)

    if filtc and filtv:
        ndf = ndf.filter(getattr(ndf, filtc) == filtv)

    # if delta aggregation is used
    if DELTA in aggregations:
        validateDeltaParam(opts.interval, results)			
        result = results[0]

        #1 for all dates generate interval group dictionary
        # note: fromdate and todate are only defined in the basedir branch above
        datedic = generateDateDict(fromdate, todate, opts.interval)
        boundic = generateBoundDict(datedic)
        max_interval = max(datedic.values())

        interval_group = udf(lambda x: datedic[x], IntegerType())
        interval_start = udf(lambda x: boundic[x][0], StringType())		
        interval_end = udf(lambda x: boundic[x][1], StringType())

        #2 group data by block, node, interval and last result in the interval
        ndf = ndf.select(ndf.block_name, ndf.node_name, ndf.now, getattr(ndf, result))
        idf = ndf.withColumn("interval_group", interval_group(ndf.now))
        win = Window.partitionBy(idf.block_name, idf.node_name, idf.interval_group).orderBy(idf.now.desc())	
        idf = idf.withColumn("row_number", rowNumber().over(win))
        rdf = idf.where((idf.row_number == 1) & (idf.interval_group != 0))\
                 .withColumn(result, when(idf.now == interval_end(idf.interval_group), getattr(idf, result)).otherwise(lit(0)))
        rdf = rdf.select(rdf.block_name, rdf.node_name, rdf.interval_group, getattr(rdf, result))
        rdf.cache()

        #3 create intervals that do not exist but have a negative delta
        win = Window.partitionBy(idf.block_name, idf.node_name).orderBy(idf.interval_group)
        adf = rdf.withColumn("interval_group_aft", lead(rdf.interval_group, 1, 0).over(win))
        hdf = adf.filter(((adf.interval_group + 1) != adf.interval_group_aft) & (adf.interval_group != max_interval))\
                 .withColumn("interval_group", adf.interval_group + 1)\
                 .withColumn(result, lit(0))\
                 .drop(adf.interval_group_aft)

        #4 union the two data frames
        idf = rdf.unionAll(hdf)
		
        #5 join every interval with the previous interval
        win = Window.partitionBy(idf.block_name, idf.node_name).orderBy(idf.interval_group)
        fdf = idf.withColumn("delta", getattr(idf, result) - lag(getattr(idf, result), 1, 0).over(win))

        #6 calculate delta_plus and delta_minus columns and aggregate by date and node
        ddf = fdf.withColumn("delta_plus", when(fdf.delta > 0, fdf.delta).otherwise(0)) \
                 .withColumn("delta_minus", when(fdf.delta < 0, fdf.delta).otherwise(0))

        aggres = ddf.groupBy(ddf.node_name, ddf.interval_group).agg(sum(ddf.delta_plus).alias("delta_plus"),\
                                                                    sum(ddf.delta_minus).alias("delta_minus"))

        aggres = aggres.select(aggres.node_name, interval_end(aggres.interval_group).alias("date"), aggres.delta_plus, aggres.delta_minus)
		
    else:	
        resAgg_dic = zipResultAgg(results, aggregations)
        order, asc = formOrdAsc(order, asc, resAgg_dic)

        # perform aggregation
        if order:
            aggres = ndf.groupBy(keys).agg(resAgg_dic).orderBy(order, ascending=asc)
        else:
            aggres = ndf.groupBy(keys).agg(resAgg_dic)

    # output results
    if opts.fout:
        fout_header = formFileHeader(opts.fout)
        if opts.header:
            aggres.write.format('com.databricks.spark.csv').options(header = 'true').save(fout_header)
        else:
            aggres.write.format('com.databricks.spark.csv').save(fout_header)
    else:
        aggres.show(50)