Example #1
 def generateExpr(columnName, listIntervals):
     if (len(listIntervals) == 1):
         return when(col(columnName).between(listIntervals[0][0], listIntervals[0][1]), 0).otherwise(None)
     else:
         return (when((col(columnName) >= listIntervals[0][0]) & (col(columnName) < listIntervals[0][1]),
                      len(listIntervals) - 1)
                 .otherwise(generateExpr(columnName, listIntervals[1:])))
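A minimal usage sketch for the helper above (the DataFrame df, its numeric "value" column, and the interval list are all illustrative assumptions):

from pyspark.sql.functions import col, when

# generateExpr builds a single nested when(...).otherwise(...) Column expression
intervals = [(0, 10), (10, 20), (20, 30)]
df = df.withColumn("bucket", generateExpr("value", intervals))
df.show()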
Example #2
def log_loss(df):

	epsilon = 1e-12
	temp = df.select("label", when(df.outcome == 1.0, 1.0-epsilon).otherwise(df.outcome).alias("p"))
	temp = temp.select("label", when(temp.p == .0,epsilon).otherwise(temp.p).alias("p"))
	temp = temp.select("p","label", when(temp.label == 1, -log(temp.p)).otherwise(-log(1-temp.p)).alias("log_loss"))	
	
	return temp.selectExpr("mean(log_loss)").first()[0]
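A small sketch of calling log_loss, assuming a SparkSession named spark and a DataFrame with the "label" and "outcome" columns the function expects:

from pyspark.sql.functions import when, log

preds = spark.createDataFrame([(1, 0.9), (0, 0.2), (1, 0.6)], ["label", "outcome"])
print(log_loss(preds))  # mean of the per-row negative log-likelihood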
Example #3
    def runBPwithGraphFrames(cls, g, numIter):
        """Run Belief Propagation using GraphFrame.

        This implementation of BP shows how to use GraphFrame's aggregateMessages method.
        """
        # choose colors for vertices for BP scheduling
        colorG = cls._colorGraph(g)
        numColors = colorG.vertices.select('color').distinct().count()

        # TODO: handle vertices without any edges

        # initialize vertex beliefs at 0.0
        gx = GraphFrame(colorG.vertices.withColumn('belief', sqlfunctions.lit(0.0)), colorG.edges)

        # run BP for numIter iterations
        for iter_ in range(numIter):
            # for each color, have that color receive messages from neighbors
            for color in range(numColors):
                # Send messages to vertices of the current color.
                # We may send to source or destination since edges are treated as undirected.
                msgForSrc = sqlfunctions.when(
                    AM.src['color'] == color,
                    AM.edge['b'] * AM.dst['belief'])
                msgForDst = sqlfunctions.when(
                    AM.dst['color'] == color,
                    AM.edge['b'] * AM.src['belief'])
                # numerically stable sigmoid
                logistic = sqlfunctions.udf(cls._sigmoid, returnType=types.DoubleType())
                aggregates = gx.aggregateMessages(
                    sqlfunctions.sum(AM.msg).alias("aggMess"),
                    sendToSrc=msgForSrc,
                    sendToDst=msgForDst)
                v = gx.vertices
                # receive messages and update beliefs for vertices of the current color
                newBeliefCol = sqlfunctions.when(
                    (v['color'] == color) & (aggregates['aggMess'].isNotNull()),
                    logistic(aggregates['aggMess'] + v['a'])
                ).otherwise(v['belief'])  # keep old beliefs for other colors
                newVertices = (v
                    .join(aggregates, on=(v['id'] == aggregates['id']), how='left_outer')
                    .drop(aggregates['id'])  # drop duplicate ID column (from outer join)
                    .withColumn('newBelief', newBeliefCol)  # compute new beliefs
                    .drop('aggMess')  # drop messages
                    .drop('belief')  # drop old beliefs
                    .withColumnRenamed('newBelief', 'belief')
                )
                # cache new vertices using workaround for SPARK-1334
                cachedNewVertices = AM.getCachedDataFrame(newVertices)
                gx = GraphFrame(cachedNewVertices, gx.edges)

        # Drop the "color" column from vertices
        return GraphFrame(gx.vertices.drop('color'), gx.edges)
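The belief-propagation method above depends on names defined elsewhere in the GraphFrames example (plus the class's own _colorGraph and _sigmoid helpers, which are not shown); a hedged guess at the imports it assumes:

from graphframes import GraphFrame
from graphframes.lib import AggregateMessages as AM
from pyspark.sql import functions as sqlfunctions, types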
Example #4
 def gen_report_table(hc,curUnixDay):
     rows_indoor=sc.textFile("/data/indoor/*/*").map(lambda r: r.split(",")).map(lambda p: Row(clientmac=p[0], entityid=int(p[1]),etime=int(p[2]),ltime=int(p[3]),seconds=int(p[4]),utoday=int(p[5]),ufirstday=int(p[6])))
     HiveContext.createDataFrame(hc,rows_indoor).registerTempTable("df_indoor")
     #ClientMac|etime|ltime|seconds|utoday|ENTITYID|UFIRSTDAY 
     sql="select entityid,clientmac,utoday,UFIRSTDAY,seconds,"
     sql=sql+"count(1) over(partition by entityid,clientmac) as total_cnt,"
     sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range  2505600 preceding) as day_30," # 2505600 is 29 days
     sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range  518400 preceding)  as day_7," #518400 is 6 days
     sql=sql+"count(1) over (partition by entityid,clientmac,UFIRSTDAY order by UFIRSTDAY  range 1 preceding) as pre_mon "
     sql=sql+"from df_indoor order by entityid,clientmac,utoday" 
     df_id_stat=hc.sql(sql)
     df_id_mm=df_id_stat.withColumn("min", func.min("utoday").over(Window.partitionBy("entityid","clientmac"))).withColumn("max", func.max("utoday").over(Window.partitionBy("entityid","clientmac")))
     #df_id_mm is the min/max df, used to calculate first arrival and last arrival
     df_id_stat_distinct=df_id_stat.drop("seconds").drop("day_30").drop("day_7").drop("utoday").drop("total_cnt").distinct()
     #distinct df is needed for the lag function to work
     df_id_prepremon=df_id_stat_distinct.withColumn("prepre_mon",func.lag("pre_mon").over(Window.partitionBy("entityid","clientmac").orderBy("entityid","clientmac","UFIRSTDAY"))).drop("pre_mon").na.fill(0)
     
     cond_id = [df_id_mm.clientmac == df_id_prepremon.clientmac, df_id_mm.entityid == df_id_prepremon.entityid, df_id_mm.UFIRSTDAY==df_id_prepremon.UFIRSTDAY]
     df_indoor_fin_tmp=df_id_mm.join(df_id_prepremon, cond_id, 'outer').select(df_id_mm.entityid,df_id_mm.clientmac,df_id_mm.utoday,df_id_mm.UFIRSTDAY,df_id_mm.seconds,df_id_mm.day_30,df_id_mm.day_7,df_id_mm.min,df_id_mm.max,df_id_mm.total_cnt,df_id_prepremon.prepre_mon)
     df_indoor_fin_tmp=df_indoor_fin_tmp.selectExpr("entityid as entityid","clientmac as  clientmac","utoday as utoday","UFIRSTDAY as ufirstday","seconds as secondsbyday","day_30 as indoors30","day_7 as indoors7","min as FirstIndoor","max as LastIndoor","total_cnt as indoors","prepre_mon as indoorsPrevMonth")
     
     #newly added part for indoors7 and indoors30 based on current date
     df_indoor_fin_tmp1= df_indoor_fin_tmp.withColumn("r_day_7", func.when((curUnixDay- df_indoor_fin_tmp.utoday)/86400<7 , 1).otherwise(0))
     df_indoor_fin_tmp2=df_indoor_fin_tmp1.withColumn("r_day_30", func.when((curUnixDay- df_indoor_fin_tmp1.utoday)/86400<30 , 1).otherwise(0))
     df_indoor_fin_tmp3=df_indoor_fin_tmp2.withColumn("r_indoors7",func.sum("r_day_7").over(Window.partitionBy("entityid","clientmac")))
     df_indoor_fin_tmp4=df_indoor_fin_tmp3.withColumn("r_indoors30",func.sum("r_day_30").over(Window.partitionBy("entityid","clientmac")))
     df_indoor_fin=df_indoor_fin_tmp4.drop("r_day_7").drop("r_day_30")
     hc.sql("drop table if exists df_indoor_fin")
     df_indoor_fin.write.saveAsTable("df_indoor_fin")
     
     rows_flow=sc.textFile("/data/flow/*/*").map(lambda r: r.split(",")).map(lambda p: Row(clientmac=p[0], entityid=int(p[1]),etime=int(p[2]),ltime=int(p[3]),utoday=int(p[4]),ufirstday=int(p[5])))
     HiveContext.createDataFrame(hc,rows_flow).registerTempTable("df_flow")
     
     # ClientMac|ENTITYID|UFIRSTDAY|etime|ltime|utoday
     sql="select entityid,clientmac,utoday,UFIRSTDAY,"
     sql=sql+"count(1) over(partition by entityid,clientmac) as total_cnt,"
     sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range  2505600 preceding) as day_30," # 2505600 is 29 days
     sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range  518400 preceding)  as day_7," #518400 is 6 days
     sql=sql+"count(1) over (partition by entityid,clientmac,UFIRSTDAY order by UFIRSTDAY  range 1 preceding) as pre_mon "
     sql=sql+"from df_flow order by entityid,clientmac,utoday" 
     df_fl_stat=hc.sql(sql)
     df_fl_mm=df_fl_stat.withColumn("min", func.min("utoday").over(Window.partitionBy("entityid","clientmac"))).withColumn("max", func.max("utoday").over(Window.partitionBy("entityid","clientmac")))
     #df_fl_mm is the min/max df, used to calculate first arrival and last arrival
     df_fl_stat_distinct=df_fl_stat.drop("day_30").drop("day_7").drop("utoday").drop("total_cnt").distinct()
     #distinct df is needed for the lag function to work
     df_fl_prepremon=df_fl_stat_distinct.withColumn("prepre_mon",func.lag("pre_mon").over(Window.partitionBy("entityid","clientmac").orderBy("entityid","clientmac","UFIRSTDAY"))).drop("pre_mon").na.fill(0)
     
     cond_fl = [df_fl_mm.clientmac == df_fl_prepremon.clientmac, df_fl_mm.entityid == df_fl_prepremon.entityid, df_fl_mm.UFIRSTDAY==df_fl_prepremon.UFIRSTDAY]
     df_flow_fin=df_fl_mm.join(df_fl_prepremon, cond_fl, 'outer').select(df_fl_mm.entityid,df_fl_mm.clientmac,df_fl_mm.utoday,df_fl_mm.UFIRSTDAY,df_fl_mm.day_30,df_fl_mm.day_7,df_fl_mm.min,df_fl_mm.max,df_fl_mm.total_cnt,df_fl_prepremon.prepre_mon)
     df_flow_fin=df_flow_fin.selectExpr("entityid as entityid","clientmac as  clientmac","utoday as utoday","UFIRSTDAY as ufirstday","day_30 as visits30","day_7 as visits7","min as FirstVisit","max as LastVisit","total_cnt as visits","prepre_mon as visitsPrevMonth")
     hc.sql("drop table if exists df_flow_fin")
     df_flow_fin.write.saveAsTable("df_flow_fin") 
Example #5
 def test_first_last_ignorenulls(self):
     from pyspark.sql import functions
     df = self.spark.range(0, 100)
     df2 = df.select(functions.when(df.id % 3 == 0, None).otherwise(df.id).alias("id"))
     df3 = df2.select(functions.first(df2.id, False).alias('a'),
                      functions.first(df2.id, True).alias('b'),
                      functions.last(df2.id, False).alias('c'),
                      functions.last(df2.id, True).alias('d'))
     self.assertEqual([Row(a=None, b=1, c=None, d=98)], df3.collect())
Example #6
    def create_hist_data(df, column, minim, maxim, bins=10):

        def create_all_conditions(current_col, column, left_edges, count=1):
            """
            Recursive function that exploits the
            ability to call the Spark SQL Column method
            .when() in a recursive way.
            """
            left_edges = left_edges[:]
            if len(left_edges) == 0:
                return current_col
            if len(left_edges) == 1:
                next_col = current_col.when(col(column) >= float(left_edges[0]), count)
                left_edges.pop(0)
                return create_all_conditions(next_col, column, left_edges[:], count+1)
            next_col = current_col.when((float(left_edges[0]) <= col(column))
                                        & (col(column) < float(left_edges[1])), count)
            left_edges.pop(0)
            return create_all_conditions(next_col, column, left_edges[:], count+1)

        num_range = maxim - minim
        bin_width = num_range / float(bins)
        left_edges = [minim]
        for _bin in range(bins):
            left_edges = left_edges + [left_edges[-1] + bin_width]
        left_edges.pop()
        expression_col = when((float(left_edges[0]) <= col(column))
                              & (col(column) < float(left_edges[1])), 0)
        left_edges_copy = left_edges[:]
        left_edges_copy.pop(0)
        bin_data = (df.select(col(column))
                    .na.drop()
                    .select(col(column),
                            create_all_conditions(expression_col,
                                                  column,
                                                  left_edges_copy
                                                 ).alias("bin_id")
                           )
                    .groupBy("bin_id").count()
                   ).toPandas()

        # If no data goes into one bin, it won't 
        # appear in bin_data; so we should fill
        # in the blanks:
        bin_data.index = bin_data["bin_id"]
        new_index = list(range(bins))
        bin_data = bin_data.reindex(new_index)
        bin_data["bin_id"] = bin_data.index
        bin_data = bin_data.fillna(0)

        # We add the left edges and bin width:
        bin_data["left_edge"] = left_edges
        bin_data["width"] = bin_width

        return bin_data
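A short sketch of driving the histogram helper above, assuming it is reachable as a plain function, a SparkSession named spark, and that col and when come from pyspark.sql.functions:

from pyspark.sql.functions import col, when

df = spark.createDataFrame([(float(v),) for v in range(100)], ["value"])
hist = create_hist_data(df, "value", minim=0.0, maxim=100.0, bins=10)
print(hist[["bin_id", "left_edge", "width", "count"]])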
    def create_binary_feature(self, dataframe, base_field, binary_field):
        """Produces a PySpark dataframe containing a field that is 0 or 1.

        The value of the binary field will be 1 if the value of the evaluated field is greater than 0; otherwise it will be 0.

        :param dataframe: the PySpark dataframe
        :param base_field: the field to use as the basis for the binary field
        :param binary_field: the name to give to the field that will contain values of 0 or 1
        :returns: the PySpark dataframe containing the binary field and all fields in the supplied dataframe.
        """
        return(dataframe.withColumn(binary_field, when(dataframe[base_field] > 0, 1).otherwise(0)))
def processMSC():
    """
    Parses MSC records as per defined rules
    :return: Records returned in pipe-delimited format
    """
    # Assumption: MSC folder under the provided input path
    inputDir = os.path.join(args.inputdir, "INPUT")
    lines = sc.textFile(inputDir)

    # Call the parsing function
    parsedMSCLines = lines.map(parseMSCRecords)

    # The schema is encoded in a string.
    schemaString = "RecordType FirstNum SecondNum CallDate CallHour Duration StartTower StartLAC CallType"
    fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]
    schema = StructType(fields)

    # Apply the schema to the RDD.
    schemaData = sqlContext.createDataFrame(parsedMSCLines, schema)
    
    modify_phone_number_udf = udf(mod_number, StringType())
    ph_num_mod = schemaData.select(
        schemaData.RecordType,
        modify_phone_number_udf(schemaData.FirstNum).alias('FirstNum'),
        modify_phone_number_udf(schemaData.SecondNum).alias('SecondNum'),
        schemaData.CallDate,
        schemaData.CallHour,
        schemaData.Duration,
        schemaData.StartTower,
        schemaData.StartLAC,
        schemaData.CallType)

    get_phone_type_udf = udf(get_phone_type, StringType())

    first_ph_type = ph_num_mod.withColumn('FirstPhoneType', get_phone_type_udf(ph_num_mod.FirstNum))

    sec_ph_type = first_ph_type.withColumn('SecondPhoneType', get_phone_type_udf(first_ph_type.SecondNum))

    final_df = sec_ph_type.select(
        sec_ph_type.RecordType,
        sec_ph_type.FirstNum,
        sec_ph_type.SecondNum,
        sec_ph_type.CallDate,
        sec_ph_type.CallHour,
        sec_ph_type.Duration,
        sec_ph_type.StartTower,
        sec_ph_type.StartLAC,
        sec_ph_type.CallType,
        F.when(sec_ph_type.FirstPhoneType.isin(["mobile", "landline", "shortcode"])
               & sec_ph_type.SecondPhoneType.isin(["mobile", "landline", "shortcode"]), "National")
            .otherwise("International").alias('PhoneType'))

    print final_df.show()
    def create_valence_column(self, dataframe, base_field, valence_field):
        """Produces a PySpark dataframe containing a field that is -1, 0, or 1 depending on the value of a specified field.

        The valence will be:
        -1 if the value in the specified column is negative
        0 if the value in the specified column is zero
        1 if the value in the specified column is positive

        :param dataframe: the PySpark dataframe
        :param base_field: the field containing values to use to determine the valence
        :param valence_field: the name of the field that will contain the valence
        :returns: the PySpark dataframe containing the valence field and all fields in the supplied dataframe
        """
        return(dataframe.withColumn(valence_field, when(dataframe[base_field] < 0, -1).when(dataframe[base_field] > 0, 1).otherwise(0)))
    def create_levels_column(self, dataframe, base_field, levels_field):
        """Produces a PySpark dataframe containing a field based on the level of a specified field

        The level will be:
        0 if the value in the specified column is an integer less than 1
        1 if the value in the specified column is an integer between 1 and 2
        2 if the value in the specified column is an integer between 3 and 4
        3 if the value in the specified column is an integer that is 5 or greater

        :param dataframe: the PySpark dataframe
        :param base_field: the field containing integers to use to determine the level
        :param levels_field: the name of the field that will contain the levels
        :returns: the PySpark dataframe containing the levels field and all fields in the supplied dataframe
        """
        return(dataframe.withColumn(levels_field, when(dataframe[base_field].between(1,2), 1) \
                        .when(dataframe[base_field].between(3,4), 2).when(dataframe[base_field] >= 5, 3).otherwise(0)))
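A hedged sketch of exercising the two helpers above, assuming they are methods on some transformer object (the enclosing class is not shown) and a SparkSession named spark:

from pyspark.sql.functions import when

df = spark.createDataFrame([(-3,), (0,), (2,), (7,)], ["score"])
df = transformer.create_valence_column(df, "score", "score_valence")  # -1 / 0 / 1
df = transformer.create_levels_column(df, "score", "score_level")     # levels 0..3
df.show()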
Example #11
def reduce_to_ohlc(time, rdd):
    row_rdd = rdd.map(lambda row: row.split(',')) \
                 .filter(lambda row: len(row) == 3) \
                 .map(lambda row: Row(
                       symbol=row[0],
                       tx_time=datetime.strptime(row[2], '%Y-%m-%d %H:%M:%S.%f'),
                       price=float(row[1])
                 ))
    sql_context = get_sql_context_instance(rdd.context)
    data = sql_context.createDataFrame(row_rdd)
    data.cache()
    data.write.format('org.apache.spark.sql.cassandra') \
            .options(table='transactions2', keyspace='stock', cluster='Test Cluster') \
            .mode('append') \
            .save()

    ohlc = data.select('symbol', truncate_min(data.tx_time).alias('batch_time'), 'price', 'tx_time') \
                .orderBy('tx_time') \
                .groupBy('symbol', 'batch_time') \
                .agg(
                   F.first(data.price).alias('open'),
                   F.max(data.price).alias('high'),
                   F.min(data.price).alias('low'),
                   F.last(data.price).alias('close'),
                   F.first(data.tx_time).alias('open_time'),
                   F.last(data.tx_time).alias('close_time')
                )

    existing_ohlc = sql_context.read.format('org.apache.spark.sql.cassandra') \
            .options(table='ohlc_1_min2', keyspace='stock', cluster='Test Cluster') \
            .load() \
            .select('symbol', 'batch_time', 'open', 'open_time', 'high', 'low', 'close', 'close_time')

    merged_ohlc = ohlc.join(existing_ohlc,
                             (ohlc.symbol == existing_ohlc.symbol) &
                             (ohlc.batch_time == existing_ohlc.batch_time),
                             'left'
                           )

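    # Merge the freshly computed bar with the stored one: keep the earlier
    # open, the later close, the lower low and the higher high.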
    merged_ohlc = merged_ohlc.select(
        ohlc.symbol.alias('symbol'),
        ohlc.batch_time.alias('batch_time'),
        F.when(existing_ohlc.open_time < ohlc.open_time, existing_ohlc.open).otherwise(ohlc.open).alias('open'),
        F.when(existing_ohlc.open_time < ohlc.open_time, existing_ohlc.open_time).otherwise(ohlc.open_time).alias('open_time'),
        F.when(existing_ohlc.close_time > ohlc.close_time, existing_ohlc.close).otherwise(ohlc.close).alias('close'),
        F.when(existing_ohlc.close_time > ohlc.close_time, existing_ohlc.close_time).otherwise(ohlc.close_time).alias('close_time'),
        F.when(existing_ohlc.low < ohlc.low, existing_ohlc.low).otherwise(ohlc.low).alias('low'),
        F.when(existing_ohlc.high > ohlc.high, existing_ohlc.high).otherwise(ohlc.high).alias('high')
    )
    merged_ohlc.write.format('org.apache.spark.sql.cassandra') \
                .options(table='ohlc_1_min2', keyspace='stock', cluster='Test Cluster') \
                .mode('append') \
                .save()
Example #12
 def test_aggregate_messages(self):
     g = self._graph("friends")
     # For each user, sum the ages of the adjacent users,
     # plus 1 for the src's sum if the edge is "friend".
     sendToSrc = (
         AM.dst['age'] +
         sqlfunctions.when(
             AM.edge['relationship'] == 'friend',
             sqlfunctions.lit(1)
         ).otherwise(0))
     sendToDst = AM.src['age']
     agg = g.aggregateMessages(
         sqlfunctions.sum(AM.msg).alias('summedAges'),
         sendToSrc=sendToSrc,
         sendToDst=sendToDst)
     # Run the aggregation again providing SQL expressions as String instead.
     agg2 = g.aggregateMessages(
         "sum(MSG) AS `summedAges`",
         sendToSrc="(dst['age'] + CASE WHEN (edge['relationship'] = 'friend') THEN 1 ELSE 0 END)",
         sendToDst="src['age']")
     # Convert agg and agg2 to a mapping from id to the aggregated message.
     aggMap = {id_: s for id_, s in agg.select('id', 'summedAges').collect()}
     agg2Map = {id_: s for id_, s in agg2.select('id', 'summedAges').collect()}
     # Compute the truth via brute force.
     user2age = {id_: age for id_, age in g.vertices.select('id', 'age').collect()}
     trueAgg = {}
     for src, dst, rel in g.edges.select("src", "dst", "relationship").collect():
         trueAgg[src] = trueAgg.get(src, 0) + user2age[dst] + (1 if rel == 'friend' else 0)
         trueAgg[dst] = trueAgg.get(dst, 0) + user2age[src]
     # Check that the agg mappings match the brute-force mapping
     self.assertEqual(aggMap, trueAgg)
     self.assertEqual(agg2Map, trueAgg)
     # Check that TypeError is raised for messages of the wrong type
     with self.assertRaises(TypeError):
         g.aggregateMessages(
             "sum(MSG) AS `summedAges`",
             sendToSrc=object(),
             sendToDst="src['age']")
     with self.assertRaises(TypeError):
         g.aggregateMessages(
             "sum(MSG) AS `summedAges`",
             sendToSrc=AM.dst['age'],
             sendToDst=object())
Example #13
def comb_creation(apps,cpu_perc_list):
    from pyspark.sql.functions import when

    q1 = datetime.now()

    df_t = df.registerTempTable('dummy')
    df_t = sqlContext.sql('select sum(byte_count) as byte_count_sum , time_stamp , location from dummy  group by location, time_stamp')
    df_t = df_t[df_t.byte_count_sum!=0]

    cpu_perc_list.append(py.cpu_percent())
    cpu_perc_list = [max(cpu_perc_list)]


    #df_needed = df[df.application.isin(apps)]
    df_t = df_t.registerTempTable('dummy')
    df_t = sqlContext.sql('select count(*) as count, location  from dummy group by location')

    df_t= df_t.withColumn('count_flag', when(df_t['count']>config.limit,1).otherwise(0))
    df_t = df_t[df_t.count_flag==1]
    
    # fetching the  location which is to be filtered from filter_db table
    with conn.cursor() as cursor:
        # Read a  record
        sql = "select * from filter_db" 
        cursor.execute(sql)
        so_result = pd.DataFrame(cursor.fetchall())
    
    #filtering
    from pyspark.sql.functions import col
    #print(so_result)
    s_filter = list(so_result.source)
    df_t = df_t.filter(~col('source').isin(s_filter))
    #df_t = df_t[df_t.location!='134.141.5.104']
    df2 = df_t.toPandas()

    q2 = datetime.now()

    print('time for reference data preparation is ', str(q2-q1))
    print('length of table is ',len(df2))
    
    return df2
def get_http_filter(time, rdd):
    try:
        print "========= %s =========" % str(time)
        sqlContext = getSqlContextInstance(rdd.context)
        df = json_rdd_to_sql_df(rdd)
        url_type = udf(lambda url: url.split('/')[-1].split('.')[-1])
        df.select('*', when(url_type(df['url']).inSet(STATIC_URL_TYPE), 1).otherwise(0).alias("url_type")).registerAsTable("http")
        url_info = sqlContext.sql("""SELECT url_type, 
                                          avg(in_bytes) as in_bytes, 
                                          avg(out_bytes) as out_bytes, 
                                          avg(latency_sec) as latency_sec,
                                          avg(latency_usec) as latency_usec, 
                                          count(*) as requests 
                                          FROM http group by url_type""").toJSON().collect()
        output = {}
        for info in url_info:
            temp = json.loads(info)
            if temp['url_type'] == 1:
                output['static_request'] = temp
            else:
                output['dynamic_request'] = temp
        dump_file("http", output, "http_filter")
    except Exception as e:
        print e     
Example #15
File: base.py  Project: tomspur/koalas
 def rtruediv(left, right):
     return F.when(left == 0,
                   F.lit(np.inf).__div__(right)).otherwise(
                       F.lit(right).__truediv__(left))
Example #16
File: base.py  Project: tomspur/koalas
 def rfloordiv(left, right):
     return F.when(F.lit(left == 0),
                   F.lit(np.inf).__div__(right)).otherwise(
                       F.when(F.lit(left) == np.nan, np.nan).otherwise(
                           F.floor(F.lit(right).__div__(left))))
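Both koalas helpers above implement the reflected (right-hand) division operators, using when() to guard the division-by-zero case. A minimal sketch of calling Example #15's rtruediv, assuming F and np are pyspark.sql.functions and numpy and spark is a SparkSession:

import numpy as np
from pyspark.sql import functions as F

df = spark.createDataFrame([(0.0,), (2.0,), (5.0,)], ["x"])
# computes 10.0 / x, mapping x == 0 to +infinity
df.select(rtruediv(df["x"], 10.0).alias("ten_div_x")).show()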
Example #17
# stays_df.withColumn('AGE', stays_df['INDATE'] - stays_df["DOBDATE"])
stays_df = stays_df.select("*",
                           psql.to_date(stays_df["INTIME"]).alias("INDATE"))
stays_df = stays_df.select(
    "*",
    psql.to_date(stays_df["DOB"]).alias("DOBDATE"))  #.alias("DOBDATE")
stays_df = stays_df.select(
    "*",
    psql.floor((psql.datediff(stays_df['INDATE'], stays_df["DOBDATE"]) /
                365.0)).alias("AGE"))
# alias("AGE"))#.alias("AGE")
# print(stays_df.filter("AGE > 250").count())
stays_df = stays_df.withColumn(
    "AGE",
    psql.when(stays_df.AGE > 250, 90).otherwise(stays_df.AGE))
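# ages above 250 generally come from de-identified (date-shifted) dates of birth, so they are capped at 90 here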
# print(stays_df.filter("AGE < 0").count())

stays = add_inunit_mortality_to_icustays(stays)

#add_inunit_mortality_to_icustays below
mortality = stays_df.withColumn("DODFILTER", (stays_df.INTIME <= stays_df.DOD)
                                & (stays_df.OUTTIME >= stays_df.DOD))
# print(mortality.filter("DODFILTER = true").count())
mortality = mortality.withColumn("DEATHTIMEFILTER",
                                 (stays_df.INTIME <= stays_df.DEATHTIME) &
                                 (stays_df.OUTTIME >= stays_df.DEATHTIME))
mortality = mortality.withColumn(
    "MORTALITY_InUnit", mortality.DODFILTER | mortality.DEATHTIMEFILTER)
# mortality = mortality.withColumn("MORTALITY", mortality.MORTALITY_InUnit.cast('int'))
stays_df = mortality.withColumn("MORTALITY_InUnit",
                                mortality.MORTALITY_InUnit.cast('int'))
Example #18
    df = sqlContext.read.parquet(config.proj_path+'/datas/appid_datapoint_parquet1')

    from sqlalchemy import create_engine
    engine = create_engine(str("mysql+pymysql://"+config.db_user+":"+config.db_pass+"@"+config.db_host+":"+str(config.db_port)+"/"+config.db_name))


    from pyspark.sql.functions import when

    q1 = datetime.now()
    df1 = df[(df.app_rsp_time !=0)]
    
    df_t = df1.registerTempTable('dummy')
    df_t = sqlContext.sql('select count(*) as count, source , application, target_address  from dummy group by source, application, target_address')

    df_t= df_t.withColumn('count_flag', when(df_t['count']>config.limit,1).otherwise(0))
    df_t = df_t[df_t.count_flag==1]
    
    # fetching the  source which is to be filtered from filter_db table
    with conn.cursor() as cursor:
        # Read a  record
        sql = "select * from filter_db" 
        cursor.execute(sql)
        so_result = pd.DataFrame(cursor.fetchall())
    
    #filtering
    from pyspark.sql.functions import col
    #print(so_result)
    s_filter = list(so_result.source)
    df_t = df_t.filter(~col('source').isin(s_filter))
    #df_t = df_t[df_t.source!='134.141.5.104']
Example #19
def gg(x, index):
    return F.when(dd[index] == dd[index+2], "").otherwise(dd[index]).alias(index)
def test_auto_mapper_concat_multiple_items_structs_different_elements(
    spark_session: SparkSession, ) -> None:
    # Arrange
    clean_spark_session(spark_session)
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, None, "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients",
        enable_schema_pruning=True).columns(dst2=AutoMapperList([
            AutoMapperDataTypeComplexBase(a=A.column("first_name"),
                                          b=A.column("last_name"))
        ], ).concat(
            AutoMapperList([
                AutoMapperDataTypeComplexBase(a=A.column("first_name"),
                                              c=A.column("last_name")),
            ], )))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    array1 = when(
        array(
            struct(
                col("b.first_name").alias("a"),
                col("b.last_name").alias("b"),
                lit(None).alias("c"),
            ), ).isNotNull(),
        filter(
            coalesce(
                array(
                    struct(
                        col("b.first_name").alias("a"),
                        col("b.last_name").alias("b"),
                        lit(None).alias("c"),
                    ), ),
                array(),
            ),
            lambda x: x.isNotNull(),
        ),
    )
    array2 = when(
        array(
            struct(
                col("b.first_name").alias("a"),
                lit(None).alias("b"),
                col("b.last_name").alias("c"),
            ), ).isNotNull(),
        filter(
            coalesce(
                array(
                    struct(
                        col("b.first_name").alias("a"),
                        lit(None).alias("b"),
                        col("b.last_name").alias("c"),
                    ), ),
                array(),
            ),
            lambda x: x.isNotNull(),
        ),
    )
    assert_compare_expressions(sql_expressions["dst2"],
                               concat(array1, array2).alias("dst2"))

    result_df: DataFrame = mapper.transform(df=source_df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert (result_df.where("member_id == 1").select("dst2").collect()[0][0][0]
            [0] == "Imran")
    assert (result_df.where("member_id == 1").select("dst2").collect()[0][0][0]
            [1] == "Qureshi")
    assert (result_df.where("member_id == 2").select("dst2").collect()[0][0][0]
            [0] == "Michael")
    assert (
        result_df.where("member_id == 2").select("dst2").collect()[0][0][0][1]
        is None)
Example #21
			'School Not Found',
			'School or City Wide Complaint',
			'Bridge Highway Name',
			'Bridge Highway Direction',
			'Road Ramp',
			'Bridge Highway Segment',
			'Garage Lot Name',
			'Ferry Direction',
			'Ferry Terminal Name',
			'Latitude',
			'Longitude']

df = df.select([name for name in df.schema.names if name not in drop_list])

#Replacing invalid zip codes with N/A; zip codes should be either 5 digits or 5 digits followed by 4 digits.
df = df.withColumn('Incident Zip', when(col('Incident Zip').rlike('^\d{5}(?:[-\s]\d{4})?$')!= True, 'N/A').otherwise(df['Incident Zip']))

# Replacing invalid closed dates (before 2009) with N/A
years = [str(i) for i in range(2009, 2018)]
df = df.withColumn('Closed Date',when( col('Closed Date').substr(7,4).isin(years), col('Closed Date')).otherwise('N/A'))

#Replacing Null values with "N/A"
for x in df.schema.names:
	# Basic replacement 
	df = df.withColumn(x, when(col(x).isin("", "Unspecified", "0 Unspecified") != True, col(x)).otherwise('N/A'))  


#writing the cleaned dataframe onto a csv for the comparison and analysis phase.
df.write.format('com.databricks.spark.csv').options(header='true').save('/user/sdv267/cleaned_311.csv') 

Example #22
# %%
#Test data samples
df_test.show(10)
df_train.printSchema()
df_train = df_train.withColumn("usefulCount",round(df_train["usefulCount"]).cast('integer'))


# %%
#Joining train and test data set
df = df_train.join(df_test, on=['uniqueID', 'drugName', 'condition','review','rating','date','usefulCount'], how='left_outer')


# %%
#Computing sentiment column based on rating
sentiment = when(col("rating")<=5, 0).otherwise(1)

df = df.withColumn("sentiment",sentiment)
df = df.withColumn('length',length(df['review']))

# %% [markdown]
# ## Feature Transformation

# %%
from pyspark.ml.feature import Tokenizer,StopWordsRemover, CountVectorizer,IDF,StringIndexer

tokenizer = Tokenizer(inputCol="review", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
count_vec = CountVectorizer(inputCol='stop_tokens',outputCol='c_vec')
idf = IDF(inputCol="c_vec", outputCol="tf_idf")
pos_neg = StringIndexer(inputCol='sentiment',outputCol='label')
Example #23
def defaultDebtRatioToThreshold(df):
    df = df.withColumn(
        'DebtRatio',
        F.when((F.col('DebtRatio') > 1.5), 1.5).otherwise(F.col('DebtRatio')))

    return df
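A quick sketch of the cap applied above, assuming a SparkSession named spark; DebtRatio values above 1.5 are clamped to 1.5 and everything else passes through unchanged:

from pyspark.sql import functions as F

df = spark.createDataFrame([(0.3,), (1.2,), (4.8,)], ["DebtRatio"])
defaultDebtRatioToThreshold(df).show()  # 0.3, 1.2, 1.5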
Example #24
##StringEncoding of categorical variables
cat_x_vars = ["term", "grade", "home_ownership", "pred_KM", "emp_length"]

#df2 = df #backup in case of trouble

for cat_var in cat_x_vars:
    df = StringIndexer(inputCol=cat_var, outputCol=cat_var +
                       'Idx').fit(df).transform(df).drop(cat_var)
    df = df.withColumnRenamed(cat_var + 'Idx', cat_var)

#df.select(cat_x_vars).show(5) #check

##Create y or target variables for neural networks
#probability/indicator for default
df = df.withColumn('probDef',
                   F.when(df['loan_status'] == 1,
                          1.0).otherwise(0.0))  #default is 1, repaid is 0
#indicator for early repayment
df = df.withColumn(
    'probER',
    F.when((df['loan_status'] == 0) & (df['fracNumPmts'] < 1),
           1.0).otherwise(0.0))
#indicator for on-schedule repayment can be inferred as probDef=0 and probER=0

#visually:
#plot of timing of either default or eventual (not early repayment)
#df.filter((df['loan_status']==1)|(df.fracNumPmts >=1)).select(df.fracNumPmts).toPandas().plot.hist()
#plt.show()  #This is  bi-modal, mostly low over 0,1 and then a spike at 1.

#plot of timing of either repayment (whenever)
#df.filter(df['loan_status']==0).select(df.fracNumPmts).toPandas().plot.hist()
#plt.show() #This is more like a uniform over 0,1 + a spike at 1.
Example #25
df = sc.read.parquet('../data/userdata1.parquet')
print(df)

# Handle duplicate values
print(df.drop_duplicates().count())

# Handling missing data
print(df.fillna(0).show())
print(df.dropna().show())

# fill missing values in specific columns
print(df.fillna({'cc':'6767119071901597' }).show())

# Changing data type in the DF
df1 = df.withColumn("salary",  df["salary"].cast(FloatType()))
print(df1.show())
print(df1.printSchema())

# replace null values with mean salary
print(F.avg(df1.salary))
# df1 = df1.fillna()

# replace empty strings with '0', then cast the credit-card column to a numeric type
# (note: 16-digit card numbers do not fit in IntegerType, so this cast yields nulls; LongType would be safer)
df = df.withColumn("cc", F.when(df.cc != '', df.cc).otherwise('0'))
df = df.withColumn("cc", df.cc.cast(IntegerType()))
print(df.printSchema())

# replace empty String literal with something
df = df.withColumn("birthdate",F.when(df.birthdate != '',df.birthdate).otherwise("05/05/2020"))
df = df.withColumn("birthdate", F.to_date(df.birthdate, 'MM/dd/yyyy'))  # 'MM' is the month pattern; lowercase 'mm' means minutes
print(df.show())
def calculate_metrics(predictions,y,data_type):
    start_time4 = time.time()

    # Calculate ROC
    evaluator = BinaryClassificationEvaluator(labelCol=y,rawPredictionCol='probability')
    auroc = evaluator.evaluate(predictions,{evaluator.metricName: "areaUnderROC"})
    print('AUC calculated',auroc)

    selectedCols = predictions.select(F.col("probability"), F.col('prediction'), F.col(y)).rdd.map(lambda row: (float(row['probability'][1]), float(row['prediction']), float(row[y]))).collect()
    y_score, y_pred, y_true = zip(*selectedCols)

    # Calculate Accuracy
    accuracydf=predictions.withColumn('acc',F.when(predictions.prediction==predictions[y],1).otherwise(0))
    accuracydf.createOrReplaceTempView("accuracyTable")
    RFaccuracy=spark.sql("select sum(acc)/count(1) as accuracy from accuracyTable").collect()[0][0]
    print('Accuracy calculated',RFaccuracy)

#     # Build KS Table
    split1_udf = udf(lambda value: value[1].item(), DoubleType())

    if data_type in ['train','valid','test','oot1','oot2']:
        decileDF = predictions.select(y, split1_udf('probability').alias('probability'))
    else:
        decileDF = predictions.select(y, 'probability')

    decileDF=decileDF.withColumn('non_target',1-decileDF[y])

    window = Window.orderBy(desc("probability"))
    decileDF = decileDF.withColumn("rownum", F.row_number().over(window))
    decileDF.cache()
    decileDF=decileDF.withColumn("rownum",decileDF["rownum"].cast("double"))

    window2 = Window.orderBy("rownum")
    RFbucketedData=decileDF.withColumn("deciles", F.ntile(10).over(window2))
    RFbucketedData = RFbucketedData.withColumn('deciles',RFbucketedData['deciles'].cast("int"))
    RFbucketedData.cache()
    #a = RFbucketedData.count()
    #print(RFbucketedData.show())

    ## to pandas from here
    print('KS calculation starting')
    target_cnt=RFbucketedData.groupBy('deciles').agg(F.sum(y).alias('target')).toPandas()
    non_target_cnt=RFbucketedData.groupBy('deciles').agg(F.sum("non_target").alias('non_target')).toPandas()
    overall_cnt=RFbucketedData.groupBy('deciles').count().alias('Total').toPandas()
    overall_cnt = overall_cnt.merge(target_cnt,on='deciles',how='inner').merge(non_target_cnt,on='deciles',how='inner')
    overall_cnt=overall_cnt.sort_values(by='deciles',ascending=True)
    overall_cnt['Pct_target']=(overall_cnt['target']/overall_cnt['count'])*100
    overall_cnt['cum_target'] = overall_cnt.target.cumsum()
    overall_cnt['cum_non_target'] = overall_cnt.non_target.cumsum()
    overall_cnt['%Dist_Target'] = (overall_cnt['cum_target'] / overall_cnt.target.sum())*100
    overall_cnt['%Dist_non_Target'] = (overall_cnt['cum_non_target'] / overall_cnt.non_target.sum())*100
    overall_cnt['spread'] = builtins.abs(overall_cnt['%Dist_Target']-overall_cnt['%Dist_non_Target'])
    decile_table=overall_cnt.round(2)
    print("KS_Value =", builtins.round(overall_cnt.spread.max(),2))
    #print "Test Error =", builtin.round((1.0 - RFaccuracy),3)
    #print "Accuracy =", builtin.round(RFaccuracy,3)
    #print "AUC=", builtin.round(auroc,3)
    decileDF.unpersist()
    RFbucketedData.unpersist()
    print("Metrics calculation process Completed in : "+ " %s seconds" % (time.time() - start_time4))
    return auroc,RFaccuracy,builtins.round(overall_cnt.spread.max(),2), y_score, y_pred, y_true, overall_cnt
Example #27
def load_test_silver_data():
  return load_test_data().withColumn("nulls", when(col("tz").isNull(), array(lit(11))).otherwise(array())).cache()
Example #28
	
	trainFeatPos = [1,3]
	trainFeatPos = trainFeatPos + [k for k in range(5,40)]
	trainFeatPos = trainFeatPos + [k for k in xrange(43,55)]
	trainLabelPos = 42+13
	delta = 0.99
	#trainDF,temp = trainDF.randomSplit([delta,1-delta])
	
	leakageParser = ParseDF([2,4],trainLabelPos)
	leakageTrain = leakageParser.parse_raw_df(trainDF).drop("text").drop("PeopleID")
	leakageParser = ParseDF([2,4],42)
	leakageTest = leakageParser.parse_raw_df(testDF,train = False).drop("text").drop("PeopleID")
	leakageTest = leakageTest.withColumnRenamed("label","activity_id")

	leakageTrain = leakageTrain.groupBy("features").avg().withColumnRenamed("avg(label)","leakage")
	leakageTrain = leakageTrain.select("features", when(leakageTrain.leakage >=0.5,1).otherwise(0).alias("leak")).drop("leakage")
	leakageTrain.printSchema()
	
	leakageTrain.show(5)
	leakageTest.show(3)
	print leakageTest.count()
	leakageTest = leakageTest.join(leakageTrain, "features", "left_outer").drop("features")
	print leakageTest.count()
	leakageTest.show(3)
	Parser = ParseDF(trainFeatPos,trainLabelPos) 	
	trainDF = Parser.parse_raw_df(trainDF).drop("text")

	testParser = ParseDF(trainFeatPos,42) 	
	testDF = testParser.parse_raw_df(testDF,train = False).drop("text")
	testDF = testDF.withColumnRenamed("label","activity_id")
	
def execute_process(options):
    spark = (pyspark.sql.session.SparkSession.builder.appName(
        "Radar").enableHiveSupport().getOrCreate())

    spark.sql("""
        SELECT  d.docu_dk,
                v.vist_orgi_orga_dk as orgao_id,
                a.pcao_dt_andamento,
                s.stao_tppr_dk
        FROM {schema}.mcpr_documento d
        join {schema}.mcpr_vista v on v.vist_docu_dk = d.docu_dk
        join {schema}.mcpr_andamento a on a.pcao_vist_dk = v.vist_dk
        join {schema}.mcpr_sub_andamento s on s.stao_pcao_dk = a.pcao_dk
        WHERE to_date(pcao_dt_andamento)
            > to_date(date_sub(current_timestamp(), {days_ago}))
        AND to_date(pcao_dt_andamento) <= to_date(current_timestamp())
        AND pcao_dt_cancelamento IS NULL
        AND docu_tpst_dk != 11
        GROUP BY docu_dk, v.vist_orgi_orga_dk,
            a.pcao_dt_andamento, s.stao_tppr_dk
    """.format(
        schema=options["schema_exadata"],
        days_ago=options["days_ago"])).createOrReplaceTempView("andamentos")
    spark.catalog.cacheTable("andamentos")

    spark.sql("""
    select docu_dk, orgao_id, pcao_dt_andamento, stao_tppr_dk
    from andamentos
    where stao_tppr_dk in (7912,6548,6326,6681,6678,6645,6682,6680,6679,6644,
                           6668,6666,6665,6669,6667,6664,6655,6662,6659,6658,
                           6663,6661,6660,6657,6670,6676,6674,6673,6677,6675,
                           6672,6018,6341,6338,6019,6017,6591,6339,6553,7871,
                           6343,6340,6342,6021,6334,6331,6022,6020,6593,6332,
                           7872,6336,6333,6335,7745,6346,6345,6015,6016,6325,
                           6327,6328,6329,6330,6337,6344,6656,6671,7869,7870,
                           6324,6322,6011,6012,6013,1092,1094,1095,6251,7834,
                           6007)
    """).createOrReplaceTempView("andamentos_codigos")

    # -1 indicates cancellation of an indeferimento (cancels 1)
    # -2 indicates unarchiving (cancels arquivamento - 2, 4, 5)
    # -3 indicates indeferimento (i.e., cancels instauracao 3)
    cancela_indeferimento = spark.sql("""
        select docu_dk, orgao_id, pcao_dt_andamento, -1 as tipo_andamento
        from andamentos where stao_tppr_dk = 6007
        union all
        select docu_dk, orgao_id, pcao_dt_andamento, -2 as tipo_andamento
        from andamentos where stao_tppr_dk IN (
            6075, 1028, 6798, 7245, 6307, 1027, 7803, 6003, 7802,
            7801, 6004, 6696)
        union all
        select docu_dk, orgao_id, pcao_dt_andamento, -3 as tipo_andamento
        from andamentos where stao_tppr_dk = 6322
        """)

    # cancellation of an indeferimento counts as an instauracao
    # tipo_andamento acts as a hierarchy to prioritize certain types
    # when they occur on the same day:
    # Aj.Acao (5) > TAC (4) > Instauracao (3) > Arquivamento (2) > Indeferimento (1)
    documento_andamentos = spark.sql("""
        select
            docu_dk,
            orgao_id,
            CASE WHEN stao_tppr_dk IN (7912,6548,6681,6678,6645,6682,6680,
                                       6679,6644,6668,6666,6665,6669,6667,6664,
                                       6662,6659,6658,6663,6661,6660,6657,
                                       6670,6676,6674,6673,6677,6675,6672,6018,
                                       6341,6338,6019,6017,6591,6339,6553,7871,
                                       6343,6340,6342,6021,6334,6331,6022,6020,
                                       6593,6332,7872,6336,6333,6335,7745,6346,
                                       6345,6015,6016,6325,6327,6328,6329,6330,
                                       6337,6344,6656,6671,7869,7870,6324,7834)
             THEN 2
             WHEN stao_tppr_dk = 6322 THEN 1
             WHEN stao_tppr_dk IN (6011, 6012, 6013, 1092, 1094, 1095, 6007) THEN 3
             WHEN stao_tppr_dk IN (6655, 6326) THEN 4
             WHEN stao_tppr_dk = 6251 THEN 5 end tipo_andamento,
            pcao_dt_andamento
        from andamentos_codigos
    """)
    cancela_df = cancela_indeferimento.groupby([
        "orgao_id", "docu_dk", "pcao_dt_andamento"
    ]).agg(max("tipo_andamento").alias("tipo_andamento"))

    documento_df = documento_andamentos.groupby([
        "orgao_id", "docu_dk", "pcao_dt_andamento"
    ]).agg(max("tipo_andamento").alias("tipo_andamento"))

    final_df = (
        documento_df\
        .withColumn("group_type", when(col("tipo_andamento").isin(2, 4, 5), 2).otherwise(col("tipo_andamento")))
        .alias("d")
        .join(
            cancela_df.alias("c"),
            (col("d.docu_dk") == col("c.docu_dk"))
            & (col("c.pcao_dt_andamento") >= col("d.pcao_dt_andamento"))
            & (col("c.tipo_andamento") + col("d.group_type") == 0),
            "left",
        )
        .where("c.tipo_andamento is null")
        .groupby(["d.orgao_id"])
        .pivot("d.tipo_andamento")
        .agg(count("d.tipo_andamento"))
        .na.fill(0)
        .withColumnRenamed("2", "arquivamento")
        .withColumnRenamed("1", "indeferimento")
        .withColumnRenamed("3", "instauracao")
        .withColumnRenamed("4", "tac")
        .withColumnRenamed("5", "acao")
    )

    final_df.createOrReplaceTempView("final_andamentos")
    spark.sql("""
            SELECT fa.*, ap.cod_pct, ap.pacote_atribuicao,
            ap.orgi_nm_orgao nm_orgao
            FROM final_andamentos fa
            INNER JOIN {schema_aux}.atualizacao_pj_pacote ap
            ON ap.id_orgao = fa.orgao_id
    """.format(schema_aux=options["schema_exadata_aux"])
              ).createOrReplaceTempView("final_com_pacote")

    max_pacote = spark.sql("""
                   SELECT cod_pct, nm_orgao,
                   max(arquivamento) as max_arq,
                   max(indeferimento) as max_indef,
                   max(instauracao) as max_inst,
                   max(tac) as max_tac,
                   max(acao) as max_acoes
                   FROM
                   final_com_pacote fp
                   GROUP BY cod_pct, nm_orgao
    """)
    w = Window.partitionBy("cod_pct")
    orgao_max_arq = (max_pacote.withColumn(
        "m_max_arq",
        max("max_arq").over(w)).where(
            col("max_arq") == col("m_max_arq")).select(
                ["cod_pct", "nm_orgao"]).groupBy("cod_pct").agg(
                    concat_ws(", ", collect_list("nm_orgao")).alias(
                        "nm_max_arquivamentos")).withColumnRenamed(
                            "cod_pct", "arq_cod_pct"))
    orgao_max_indef = (max_pacote.withColumn(
        "m_max_indef",
        max("max_indef").over(w)).where(
            col("max_indef") == col("m_max_indef")).select(
                ["cod_pct", "nm_orgao"]).groupBy("cod_pct").agg(
                    concat_ws(", ", collect_list("nm_orgao")).alias(
                        "nm_max_indeferimentos")).withColumnRenamed(
                            "cod_pct", "indef_cod_pct"))
    orgao_max_inst = (max_pacote.withColumn(
        "m_max_inst",
        max("max_inst").over(w)).where(
            col("max_inst") == col("m_max_inst")).select(
                ["cod_pct", "nm_orgao"]).groupBy("cod_pct").agg(
                    concat_ws(", ", collect_list("nm_orgao")).alias(
                        "nm_max_instauracoes")).withColumnRenamed(
                            "cod_pct", "inst_cod_pct"))
    orgao_max_tac = (max_pacote.withColumn(
        "m_max_tac",
        max("max_tac").over(w)).where(
            col("max_tac") == col("m_max_tac")).select([
                "cod_pct", "nm_orgao"
            ]).groupBy("cod_pct").agg(
                concat_ws(", ", collect_list("nm_orgao")).alias(
                    "nm_max_tac")).withColumnRenamed("cod_pct", "tac_cod_pct"))
    orgao_max_acoes = (max_pacote.withColumn(
        "m_max_acoes",
        max("max_acoes").over(w)).where(
            col("max_acoes") == col("m_max_acoes")).select([
                "cod_pct", "nm_orgao"
            ]).groupBy("cod_pct").agg(
                concat_ws(", ", collect_list("nm_orgao")).alias(
                    "nm_max_acoes")).withColumnRenamed("cod_pct",
                                                       "acoes_cod_pct"))

    spark.sql("""
            SELECT cod_pct, max(arquivamento) as max_pacote_arquivamentos,
                   max(indeferimento) as max_pacote_indeferimentos,
                   max(instauracao) as max_pacote_instauracoes,
                   max(tac) as max_pacote_tac,
                   max(acao) as max_pacote_acoes,
                   percentile(arquivamento, 0.5) as med_pacote_arquivamentos,
                   percentile(indeferimento, 0.5) as med_pacote_indeferimentos,
                   percentile(instauracao, 0.5) as med_pacote_instauracoes,
                   percentile(tac, 0.5) as med_pacote_tac,
                   percentile(acao, 0.5) as med_pacote_acoes
                   FROM final_com_pacote
                   GROUP BY cod_pct
    """).createOrReplaceTempView("stats_pacote")
    stats = (spark.sql("""
            SELECT fp.cod_pct,
                   fp.pacote_atribuicao,
                   fp.orgao_id,
                   arquivamento as nr_arquivamentos,
                   indeferimento as nr_indeferimentos,
                   instauracao as nr_instauracaoes,
                   tac as nr_tac,
                   acao as nr_acoes,
                   max_pacote_arquivamentos,
                   max_pacote_indeferimentos,
                   max_pacote_instauracoes,
                   max_pacote_tac,
                   max_pacote_acoes,
                   arquivamento / max_pacote_arquivamentos
                       as perc_arquivamentos,
                   indeferimento / max_pacote_indeferimentos
                       as perc_indeferimentos,
                   instauracao / max_pacote_instauracoes
                       as perc_instauracaoes,
                   tac / max_pacote_tac as perc_tac,
                   acao / max_pacote_acoes as perc_acoes,
                   med_pacote_arquivamentos,
                   med_pacote_indeferimentos,
                   med_pacote_instauracoes,
                   med_pacote_tac,
                   med_pacote_acoes,
                   (arquivamento - med_pacote_arquivamentos)
                       / med_pacote_arquivamentos as var_med_arquivaentos,
                   (indeferimento - med_pacote_indeferimentos)
                       / med_pacote_indeferimentos as var_med_indeferimentos,
                   (instauracao - med_pacote_instauracoes)
                       / med_pacote_instauracoes as var_med_instauracoes,
                   (tac - med_pacote_tac) / med_pacote_tac as var_med_tac,
                   (acao - med_pacote_acoes)
                       / med_pacote_acoes as var_med_acoes,
                   current_timestamp() as dt_calculo
            FROM final_com_pacote fp
            INNER JOIN stats_pacote sp
            ON fp.cod_pct = sp.cod_pct
    """).join(
        orgao_max_arq,
        col("cod_pct") == col("arq_cod_pct")).drop("arq_cod_pct").join(
            orgao_max_indef,
            col("cod_pct") == col("indef_cod_pct")).drop("indef_cod_pct").join(
                orgao_max_inst,
                col("cod_pct") == col("inst_cod_pct")).drop(
                    "inst_cod_pct").join(orgao_max_tac,
                                         col("cod_pct") == col("tac_cod_pct")).
             drop("tac_cod_pct").join(
                 orgao_max_acoes,
                 col("cod_pct") == col("acoes_cod_pct")).drop("acoes_cod_pct"))

    table_name = options['table_name']
    table_name = "{}.{}".format(options["schema_exadata_aux"], table_name)

    stats.write.mode("overwrite").saveAsTable("temp_table_radar_performance")
    temp_table = spark.table("temp_table_radar_performance")

    temp_table.write.mode("overwrite").saveAsTable(table_name)
    spark.sql("drop table temp_table_radar_performance")

    execute_compute_stats(table_name)
    def usage(transform_context, record_store_df):
        """component which groups together record store records by
        the provided group-by columns list, sorts within each group by the event
        timestamp field, applies the group stats udf and returns the latest
        quantity as an instance usage dataframe.

        This component groups records by event_type (a.k.a. metric name)
        and expects two kinds of records in the record store data:
        total quantity records - the total available quantity,
        e.g. cpu.total_logical_cores
        idle perc records - the percentage that is idle,
        e.g. cpu.idle_perc

        To calculate the utilized quantity this component uses the following
        formula:

        utilized quantity = ceil((100 - idle_perc) * total_quantity / 100)

        """

        sql_context = SQLContext.getOrCreate(record_store_df.rdd.context)

        transform_spec_df = transform_context.transform_spec_df_info

        # get rollup operation (sum, max, avg, min)
        agg_params = transform_spec_df.select(
            "aggregation_params_map.usage_fetch_operation"). \
            collect()[0].asDict()
        usage_fetch_operation = agg_params["usage_fetch_operation"]

        # check if operation is valid
        if not FetchQuantityUtil. \
                _is_valid_fetch_quantity_util_operation(usage_fetch_operation):
            raise FetchQuantityUtilException(
                "Operation %s is not supported" % usage_fetch_operation)

        # get the quantities for idle perc and quantity
        instance_usage_df = FetchQuantity().usage(
            transform_context, record_store_df)

        # get aggregation period for instance usage dataframe
        agg_params = transform_spec_df.select(
            "aggregation_params_map.aggregation_period").collect()[0].asDict()
        aggregation_period = agg_params["aggregation_period"]
        group_by_period_list = ComponentUtils.\
            _get_instance_group_by_period_list(aggregation_period)

        # get what we want to group by
        agg_params = transform_spec_df.select(
            "aggregation_params_map.aggregation_group_by_list").\
            collect()[0].asDict()
        aggregation_group_by_list = agg_params["aggregation_group_by_list"]

        # group by columns list
        group_by_columns_list = group_by_period_list + \
            aggregation_group_by_list

        # get quantity event type
        agg_params = transform_spec_df.select(
            "aggregation_params_map.usage_fetch_util_quantity_event_type").\
            collect()[0].asDict()
        usage_fetch_util_quantity_event_type = \
            agg_params["usage_fetch_util_quantity_event_type"]

        # check if driver parameter is provided
        if usage_fetch_util_quantity_event_type is None or \
                usage_fetch_util_quantity_event_type == "":
            raise FetchQuantityUtilException(
                "Driver parameter  '%s' is missing"
                % "usage_fetch_util_quantity_event_type")

        # get idle perc event type
        agg_params = transform_spec_df.select(
            "aggregation_params_map.usage_fetch_util_idle_perc_event_type").\
            collect()[0].asDict()
        usage_fetch_util_idle_perc_event_type = \
            agg_params["usage_fetch_util_idle_perc_event_type"]

        # check if driver parameter is provided
        if usage_fetch_util_idle_perc_event_type is None or \
                usage_fetch_util_idle_perc_event_type == "":
            raise FetchQuantityUtilException(
                "Driver parameter  '%s' is missing"
                % "usage_fetch_util_idle_perc_event_type")

        # get quantity records dataframe
        event_type_quantity_clause = "processing_meta.event_type='%s'" \
            % usage_fetch_util_quantity_event_type
        quantity_df = instance_usage_df.select('*').where(
            event_type_quantity_clause).alias("quantity_df_alias")

        # get idle perc records dataframe
        event_type_idle_perc_clause = "processing_meta.event_type='%s'" \
            % usage_fetch_util_idle_perc_event_type
        idle_perc_df = instance_usage_df.select('*').where(
            event_type_idle_perc_clause).alias("idle_perc_df_alias")

        # join quantity records with idle perc records
        # create a join condition without the event_type
        cond = [item for item in group_by_columns_list
                if item != 'event_type']
        quant_idle_perc_df = quantity_df.join(idle_perc_df, cond, 'left')

        #
        # Find utilized quantity based on idle percentage
        #
        # utilized quantity = (100 - idle_perc) * total_quantity / 100
        #
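        # worked example, using the quantities from the docstring: 8 total
        # logical cores reporting 75% idle -> ceil((100 - 75) * 8 / 100) = 2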
        quant_idle_perc_calc_df = quant_idle_perc_df.select(
            col("quantity_df_alias.*"),
            when(col("idle_perc_df_alias.quantity") != 0.0,
                 ceil(((100.0 - col(
                     "idle_perc_df_alias.quantity"))) * col(
                     "quantity_df_alias.quantity") / 100.0))
            .otherwise(col("quantity_df_alias.quantity"))
            .alias("utilized_quantity"),

            col("quantity_df_alias.quantity")
            .alias("total_quantity"),

            col("idle_perc_df_alias.quantity")
            .alias("idle_perc"))

        instance_usage_json_rdd = \
            quant_idle_perc_calc_df.rdd.map(
                FetchQuantityUtil._format_quantity_util)

        instance_usage_df = \
            InstanceUsageUtils.create_df_from_json_rdd(sql_context,
                                                       instance_usage_json_rdd)

        return instance_usage_df
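# Note: a minimal standalone sketch, not part of the original source, illustrating
# the "utilized quantity = (100 - idle_perc) * total_quantity / 100" rule above on
# a toy DataFrame; the column names here are hypothetical.
from pyspark.sql import SparkSession
from pyspark.sql.functions import ceil, col, when

spark = SparkSession.builder.getOrCreate()
toy_df = spark.createDataFrame(
    [(100.0, 25.0), (8.0, 0.0)], ["total_quantity", "idle_perc"])
toy_df.select(
    "total_quantity",
    "idle_perc",
    # fall back to the raw quantity when idle_perc is 0.0, as in the code above
    when(col("idle_perc") != 0.0,
         ceil((100.0 - col("idle_perc")) * col("total_quantity") / 100.0))
    .otherwise(col("total_quantity"))
    .alias("utilized_quantity")).show()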
# Creating spark session
spark = SparkSession.builder.appName("DecisionTree App").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

# Loading the data
data = spark.read.format("csv").option("header", True) \
                               .option("inferSchema", True) \
                               .option("delimiter", ",") \
                               .load("adult.data")

data.printSchema()

data = data.withColumn(
    "X",
    F.when(F.col("X") == ' <=50K', 0).when(F.col("X") == ' >50K', 1))

data = data.withColumnRenamed("X", "label")
data = data.select(data.label.cast("double"), "age", "education-num",
                   "hours-per-week")
data.show()

assembler = VectorAssembler(inputCols=data.columns[1:], outputCol="features")
data = assembler.transform(data)
data.show()

# Splitting the data into training and test sets
training, test = data.select("label", "features").randomSplit([0.70, 0.30])

# Create Random Forest model and fit the model with training dataset
rf = RandomForestClassifier()
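# Note: the example above is cut off; the continuation below is an assumption, not
# the original code. It fits the Random Forest on the training split and reports
# accuracy on the test split with the standard MLlib evaluator.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

model = rf.fit(training)
predictions = model.transform(test)
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
print("Test accuracy:", evaluator.evaluate(predictions))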
    def rdd_to_recordstore(rdd_transform_context_rdd):

        if rdd_transform_context_rdd.isEmpty():
            MonMetricsKafkaProcessor.log_debug(
                "rdd_to_recordstore: nothing to process...")
        else:

            sql_context = SQLContext(rdd_transform_context_rdd.context)
            data_driven_specs_repo = DataDrivenSpecsRepoFactory.\
                get_data_driven_specs_repo()
            pre_transform_specs_df = data_driven_specs_repo.\
                get_data_driven_specs(
                    sql_context=sql_context,
                    data_driven_spec_type=DataDrivenSpecsRepo.
                    pre_transform_specs_type)

            #
            # extract second column containing raw metric data
            #
            raw_mon_metrics = rdd_transform_context_rdd.map(
                lambda nt: nt.rdd_info[1])

            #
            # convert raw metric data rdd to dataframe rdd
            #
            raw_mon_metrics_df = \
                MonMetricUtils.create_mon_metrics_df_from_json_rdd(
                    sql_context,
                    raw_mon_metrics)

            #
            # filter out unwanted metrics and keep metrics we are interested in
            #
            cond = [
                raw_mon_metrics_df.metric.name ==
                pre_transform_specs_df.event_type]
            filtered_metrics_df = raw_mon_metrics_df.join(
                pre_transform_specs_df, cond)

            #
            # validate filtered metrics to check that required fields
            # are present and not empty.
            # In order to apply the filter function, the data frame had to be
            # converted to a plain RDD; after validation the RDD is converted
            # back to a data frame.
            #
            # FIXME: find a way to apply filter function on dataframe rdd data
            validated_mon_metrics_rdd = filtered_metrics_df.rdd.filter(
                MonMetricsKafkaProcessor._validate_raw_mon_metrics)
            validated_mon_metrics_df = sql_context.createDataFrame(
                validated_mon_metrics_rdd, filtered_metrics_df.schema)

            #
            # record generator
            # generate a new intermediate metric record when a given metric's
            # metric_id_list in the pre_transform_specs table defines several
            # intermediate metrics.
            # intermediate metrics are a convenient way to process an
            # (aggregated) metric in multiple ways by making a copy
            # of the source data for each processing step
            #
            gen_mon_metrics_df = validated_mon_metrics_df.select(
                validated_mon_metrics_df.meta,
                validated_mon_metrics_df.metric,
                validated_mon_metrics_df.event_processing_params,
                validated_mon_metrics_df.event_type,
                explode(validated_mon_metrics_df.metric_id_list).alias(
                    "this_metric_id"),
                validated_mon_metrics_df.service_id)

            #
            # transform metrics data to record_store format.
            # record_store format is the common format that serves as the
            # source for aggregation processing.
            # converting metrics to this common standard format makes it
            # possible to write generic, reusable aggregation routines
            # driven by configuration parameters
            #
            record_store_df = gen_mon_metrics_df.select(
                (gen_mon_metrics_df.metric.timestamp / 1000).alias(
                    "event_timestamp_unix"),
                from_unixtime(
                    gen_mon_metrics_df.metric.timestamp / 1000).alias(
                    "event_timestamp_string"),
                gen_mon_metrics_df.event_type.alias("event_type"),
                gen_mon_metrics_df.event_type.alias("event_quantity_name"),
                (gen_mon_metrics_df.metric.value / 1.0).alias(
                    "event_quantity"),
                when(gen_mon_metrics_df.metric.dimensions.state != '',
                     gen_mon_metrics_df.metric.dimensions.state).otherwise(
                    'NA').alias("event_status"),
                lit('1.0').alias('event_version'),
                lit('metrics').alias("record_type"),

                # resource_uuid
                when(gen_mon_metrics_df.metric.dimensions.instanceId != '',
                     gen_mon_metrics_df.metric.dimensions.instanceId).when(
                    gen_mon_metrics_df.metric.dimensions.resource_id != '',
                    gen_mon_metrics_df.metric.dimensions.resource_id).
                otherwise('NA').alias("resource_uuid"),

                when(gen_mon_metrics_df.metric.dimensions.tenantId != '',
                     gen_mon_metrics_df.metric.dimensions.tenantId).when(
                    gen_mon_metrics_df.metric.dimensions.tenant_id != '',
                    gen_mon_metrics_df.metric.dimensions.tenant_id).when(
                    gen_mon_metrics_df.metric.dimensions.project_id != '',
                    gen_mon_metrics_df.metric.dimensions.project_id).otherwise(
                    'NA').alias("tenant_id"),

                when(gen_mon_metrics_df.metric.dimensions.mount != '',
                     gen_mon_metrics_df.metric.dimensions.mount).otherwise(
                    'NA').alias("mount"),

                when(gen_mon_metrics_df.metric.dimensions.device != '',
                     gen_mon_metrics_df.metric.dimensions.device).otherwise(
                    'NA').alias("device"),

                when(gen_mon_metrics_df.meta.userId != '',
                     gen_mon_metrics_df.meta.userId).otherwise('NA').alias(
                    "user_id"),

                when(gen_mon_metrics_df.meta.region != '',
                     gen_mon_metrics_df.meta.region).when(
                    gen_mon_metrics_df.event_processing_params
                    .set_default_region_to != '',
                    gen_mon_metrics_df.event_processing_params
                    .set_default_region_to).otherwise(
                    'NA').alias("region"),

                when(gen_mon_metrics_df.meta.zone != '',
                     gen_mon_metrics_df.meta.zone).when(
                    gen_mon_metrics_df.event_processing_params
                    .set_default_zone_to != '',
                    gen_mon_metrics_df.event_processing_params
                    .set_default_zone_to).otherwise(
                    'NA').alias("zone"),

                when(gen_mon_metrics_df.metric.dimensions.hostname != '',
                     gen_mon_metrics_df.metric.dimensions.hostname).when(
                    gen_mon_metrics_df.metric.value_meta.host != '',
                    gen_mon_metrics_df.metric.value_meta.host).otherwise(
                    'NA').alias("host"),

                when(gen_mon_metrics_df.service_id != '',
                     gen_mon_metrics_df.service_id).otherwise(
                    'NA').alias("service_group"),

                when(gen_mon_metrics_df.service_id != '',
                     gen_mon_metrics_df.service_id).otherwise(
                    'NA').alias("service_id"),

                from_unixtime(gen_mon_metrics_df.metric.timestamp / 1000,
                              'yyyy-MM-dd').alias("event_date"),
                from_unixtime(gen_mon_metrics_df.metric.timestamp / 1000,
                              'HH').alias("event_hour"),
                from_unixtime(gen_mon_metrics_df.metric.timestamp / 1000,
                              'mm').alias("event_minute"),
                from_unixtime(gen_mon_metrics_df.metric.timestamp / 1000,
                              'ss').alias("event_second"),
                gen_mon_metrics_df.this_metric_id.alias("metric_group"),
                gen_mon_metrics_df.this_metric_id.alias("metric_id"))

            #
            # get transform context
            #
            rdd_transform_context = rdd_transform_context_rdd.first()
            transform_context = rdd_transform_context.transform_context_info

            #
            # cache record store rdd
            #
            if cfg.CONF.service.enable_record_store_df_cache:
                storage_level_prop = \
                    cfg.CONF.service.record_store_df_cache_storage_level
                storage_level = StorageUtils.get_storage_level(
                    storage_level_prop)
                record_store_df.persist(storage_level)

            #
            # start processing metrics available in record_store data
            #
            MonMetricsKafkaProcessor.process_metrics(transform_context,
                                                     record_store_df)

            # remove df from cache
            if cfg.CONF.service.enable_record_store_df_cache:
                record_store_df.unpersist()

            #
            # extract kafka offsets and batch processing time
            # stored in transform_context and save offsets
            #
            offsets = transform_context.offset_info

            # batch time
            batch_time_info = \
                transform_context.batch_time_info

            MonMetricsKafkaProcessor.save_kafka_offsets(
                offsets, rdd_transform_context_rdd.context.appName,
                batch_time_info)

            # call pre hourly processor, if its time to run
            if (cfg.CONF.stage_processors.pre_hourly_processor_enabled
                    is True and PreHourlyProcessor.is_time_to_run(
                        batch_time_info)):
                PreHourlyProcessor.run_processor(
                    record_store_df.rdd.context,
                    batch_time_info)
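# Note: a minimal, self-contained sketch, not part of the original source, of the
# chained when()/otherwise() fallback pattern used above to default empty
# dimensions to 'NA'; the column names here are hypothetical.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when

spark = SparkSession.builder.getOrCreate()
dims = spark.createDataFrame(
    [("inst-1", ""), ("", "res-2"), ("", "")],
    ["instanceId", "resource_id"])
dims.select(
    # take instanceId if set, else resource_id, else 'NA'
    when(col("instanceId") != '', col("instanceId"))
    .when(col("resource_id") != '', col("resource_id"))
    .otherwise('NA')
    .alias("resource_uuid")).show()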
Example #33
0
    def __init__(
        self,
        dataframe: DataFrame,
        spark: SparkSession,
        profiling_config: DataLakeProfilerConfig,
        report: DataLakeSourceReport,
        file_path: str,
    ):
        self.spark = spark
        self.dataframe = dataframe
        self.analyzer = AnalysisRunner(spark).onData(dataframe)
        self.column_specs = []
        self.row_count = dataframe.count()
        self.profiling_config = profiling_config
        self.file_path = file_path
        self.columns_to_profile = []
        self.ignored_columns = []
        self.profile = DatasetProfileClass(timestampMillis=get_sys_time())
        self.report = report

        self.profile.rowCount = self.row_count
        self.profile.columnCount = len(dataframe.columns)

        column_types = {x.name: x.dataType for x in dataframe.schema.fields}

        if self.profiling_config.profile_table_level_only:

            return

        # get column distinct counts
        for column in dataframe.columns:

            if not self.profiling_config.allow_deny_patterns.allowed(column):
                self.ignored_columns.append(column)
                continue

            self.columns_to_profile.append(column)
            # Normal CountDistinct is ridiculously slow
            self.analyzer.addAnalyzer(ApproxCountDistinct(column))

        if self.profiling_config.max_number_of_fields_to_profile is not None:
            if (len(self.columns_to_profile) >
                    self.profiling_config.max_number_of_fields_to_profile):
                columns_being_dropped = self.columns_to_profile[
                    self.profiling_config.max_number_of_fields_to_profile:]
                self.columns_to_profile = self.columns_to_profile[
                    :self.profiling_config.max_number_of_fields_to_profile]

                self.report.report_file_dropped(
                    f"The max_number_of_fields_to_profile={self.profiling_config.max_number_of_fields_to_profile} limit was reached. "
                    f"Dropping profiles for columns of {self.file_path}: ({', '.join(sorted(columns_being_dropped))})"
                )

        analysis_result = self.analyzer.run()
        analysis_metrics = AnalyzerContext.successMetricsAsJson(
            self.spark, analysis_result)

        # reshape distinct counts into dictionary
        column_distinct_counts = {
            x["instance"]: int(x["value"])
            for x in analysis_metrics if x["name"] == "ApproxCountDistinct"
        }

        select_numeric_null_counts = [
            count(when(
                isnan(c) | col(c).isNull(),
                c,
            )).alias(c) for c in self.columns_to_profile
            if isinstance(column_types[c], (DoubleType, FloatType))
        ]

        # PySpark doesn't support isnan() on non-float/double columns
        select_nonnumeric_null_counts = [
            count(when(
                col(c).isNull(),
                c,
            )).alias(c) for c in self.columns_to_profile
            if not isinstance(column_types[c], (DoubleType, FloatType))
        ]

        null_counts = dataframe.select(select_numeric_null_counts +
                                       select_nonnumeric_null_counts)
        column_null_counts = null_counts.toPandas().T[0].to_dict()
        column_null_fractions = {
            c: column_null_counts[c] / self.row_count
            for c in self.columns_to_profile
        }
        column_nonnull_counts = {
            c: self.row_count - column_null_counts[c]
            for c in self.columns_to_profile
        }

        column_unique_proportions = {
            c:
            (column_distinct_counts[c] /
             column_nonnull_counts[c] if column_nonnull_counts[c] > 0 else 0)
            for c in self.columns_to_profile
        }

        if self.profiling_config.include_field_sample_values:
            # take sample and convert to Pandas DataFrame
            if self.row_count < NUM_SAMPLE_ROWS:
                # if row count is less than number to sample, just take all rows
                rdd_sample = dataframe.rdd.take(self.row_count)
            else:
                rdd_sample = dataframe.rdd.takeSample(False,
                                                      NUM_SAMPLE_ROWS,
                                                      seed=0)

        # init column specs with profiles
        for column in self.columns_to_profile:
            column_profile = DatasetFieldProfileClass(fieldPath=column)

            column_spec = _SingleColumnSpec(column, column_profile)

            column_profile.uniqueCount = column_distinct_counts.get(column)
            column_profile.uniqueProportion = column_unique_proportions.get(
                column)
            column_profile.nullCount = column_null_counts.get(column)
            column_profile.nullProportion = column_null_fractions.get(column)
            if self.profiling_config.include_field_sample_values:
                column_profile.sampleValues = sorted(
                    [str(x[column]) for x in rdd_sample])

            column_spec.type_ = column_types[column]
            column_spec.cardinality = _convert_to_cardinality(
                column_distinct_counts[column],
                column_null_fractions[column],
            )

            self.column_specs.append(column_spec)
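# Note: a small standalone sketch, not part of the original profiler, of the
# count(when(...)) null-counting idiom used above, applied to a hypothetical
# DataFrame with one double column and one string column.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, isnan, when

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame(
    [(1.0, "a"), (float("nan"), None), (None, "c")], ["x", "y"])
toy.select(
    count(when(isnan("x") | col("x").isNull(), "x")).alias("x_nulls"),
    count(when(col("y").isNull(), "y")).alias("y_nulls")).show()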
Example #34
0
def tune_ALS_NLP(spark, train_data, validation_data, val_true_list, maxIter, regParams, ranks, review_val_predictions):
    # initial
    min_error = float('inf')
    best_iter1 = -1
    best_rank1 = -1
    best_regularization1 = 0
    best_model_rmse = None
    max_map = 0.0
    best_iter2 = -1
    best_rank2 = -1
    best_regularization2 = 0
    best_model_map = None

    for iteration in maxIter:
        for current_rank in ranks:
            for reg in regParams:
                als=ALS(maxIter=iteration,regParam=reg,rank=current_rank, \
                        userCol='user_id',itemCol='book_id',ratingCol='rating', \
                        coldStartStrategy="drop",nonnegative=True)
                als_model = als.fit(train_data)
                predictions = als_model.transform(validation_data)
                
                review_predictions = review_val_predictions.withColumnRenamed('prediction','review_prediction')
                als_predictions = predictions.withColumnRenamed('prediction','als_prediction')
                total_predictions = als_predictions.join(review_predictions,['user_id','book_id','rating'],'outer')
                total_predictions = total_predictions.withColumn('total_prediction', \
                                                                 when(total_predictions['review_prediction'].isNotNull(), \
                                                                      total_predictions['review_prediction']) \
                                                                 .otherwise(total_predictions['als_prediction']))
                              
                window = Window.partitionBy(total_predictions['user_id']).orderBy(total_predictions['total_prediction'].desc())
                top_predictions = total_predictions.select('*', rank().over(window).alias('row_num')).filter(col('row_num') <= 500)

                # rmse
                evaluator=RegressionEvaluator(metricName='rmse', labelCol='rating',predictionCol='total_prediction')
                rmse = evaluator.evaluate(top_predictions)
                if rmse < min_error:
                    min_error = rmse
                    best_rank1 = current_rank
                    best_regularization1 = reg
                    best_iter1 = iteration
                    best_model_rmse = als_model

                # MAP
                current_map = MAP.getMAP(top_predictions, val_true_list)
                if current_map > max_map:
                    max_map = current_map
                    best_rank2 = current_rank
                    best_regularization2 = reg
                    best_iter2 = iteration
                    best_model_map = als_model

                print('{} latent factors and regularization = {} with maxIter {}: '
                      'validation RMSE is {}, validation MAP is {}'.format(
                          current_rank, reg, iteration, rmse, current_map))
              
                with open('train05_review_eval.csv', 'ab') as f:
                    np.savetxt(f, [np.array([iteration, current_rank, reg, rmse, current_map])],delimiter=",")

    print('\nThe best model selected by RMSE has {} latent factors and '
          'regularization = {} with maxIter = {}: RMSE = {}'.format(
              best_rank1, best_regularization1, best_iter1, min_error))
    print('\nThe best model selected by MAP has {} latent factors and '
          'regularization = {} with maxIter = {}: MAP = {}'.format(
              best_rank2, best_regularization2, best_iter2, max_map))

    return best_model_rmse,best_model_map
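# Note: a minimal, self-contained sketch, not part of the original source, of the
# blending rule above: prefer the review-based prediction when it is present,
# otherwise fall back to the ALS prediction. F.coalesce expresses the same
# when(isNotNull)/otherwise logic more compactly.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
preds = spark.createDataFrame(
    [(1, 10, 4.0, 3.5), (1, 11, None, 4.2)],
    ["user_id", "book_id", "review_prediction", "als_prediction"])
preds.withColumn(
    "total_prediction",
    F.coalesce(F.col("review_prediction"), F.col("als_prediction"))).show()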
Example #35
0
def transform_predictions_missing_values(dataframe):
    df_transformed_null = dataframe.select(
        [func.count(func.when(func.isnan(c) | func.isnull(c), c)).alias(c) for (c, c_type) in
         dataframe.dtypes])

    return df_transformed_null
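# Note: a hypothetical usage of the helper above, not from the original source; it
# returns a single-row DataFrame holding the null/NaN count for each column.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sample = spark.createDataFrame(
    [(1.0, 2.0), (None, None), (float("nan"), 3.0)], ["score", "other"])
transform_predictions_missing_values(sample).show()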
Example #36
0
def Validate(ngrams \
			, sampleSizes \
			, ctxSize \
			, sqc \
			, seqs \
			, outFile \
			, minval \
			, maxval \
			, avg \
			, nlines):

	accuracy = []
	gramSize = GramSize(ctxSize, lookahead)

	c1 = (((maxval - minval) * 1.0) / nlines) / avg
	c2 = ((minval * 1.0) / nlines) / avg
	print(seqs.count())
				


	ngrams = ngrams.repartition(1 << nPartLog)
	ngrams.cache()

	#we will validate separately for each vector size
	for vecSize in vecSizes:
		print('======TESTING FOR VECTOR SIZE', vecSize)
		#start fresh
		old_ngrams = ngrams
		ngrams = ngrams.withColumn('correct', lit(0))



		#use models from each sample
		modelId = 0
		for sampleSize in sampleSizes:

			w2v = Word2VecModel.load(w2vFile(outDir, ctxSize, sampleSize, vecSize))
			lrmodels = []
			for dim in range(0, vecSize):
				lrmodels.append(LinearRegressionModel.load(lrmFile(outDir, ctxSize, sampleSize, vecSize, dim)))

			success = 0
			fail = 0
			unopt = 0

			#add columns to store model success and failure
			modelSucc = 'succ_' + str(modelId)
			modelFail = 'fail_' + str(modelId)
			modelUnopt = 'unopt_' + str(modelId)
			seqs = seqs.withColumn(modelSucc, lit(0)) \
						.withColumn(modelFail, lit(0)) \
						.withColumn(modelUnopt, lit(0))
			modelId = modelId + 1



			ngrams = ngrams \
				.withColumn('predSeq', lit(''))

			#create initial feature vector
			#transform each word into a cluster center
			words, d, centers = ClusterWords(w2v \
											, seqs \
											)
		
			#record correctness for this model only
			old_ngrams = ngrams
			ngrams = ngrams.withColumn('sample_correct', lit(0)).withColumn('sample_confi', lit(1.0))

			for nextPos in range(0,lookahead):
				#build the feature vector
				ngrams = BuildSubstringFeature(ngrams, w2v, nextPos, nextPos + ctxSize, ctxSize, lookahead,)

				#build the prediction vector
				ngrams = BuildPredictionVector(ngrams, lrmodels, ctxSize, vecSize)


			

				#now assign a cluster id to each prediction vector
				old_ngrams = ngrams
				ngrams = centers.transform(ngrams).withColumnRenamed('cluster', 'predWord').withColumnRenamed('vector', 'predictionVector')
				
				
				#get the predicted word
				ngrams = ngrams.join(broadcast(words), words.cluster == ngrams.predWord, 'inner') \
								.drop('cluster') #\

				#calculate the cosine similarity between prediction vector and center vector 
				epsilon = 0.0001
				def CosineSimi (v1, v2):
					d1 = DenseVector(v1)
					d2 = DenseVector(v2)
					n1 = d1.norm(2)
					n2 = d2.norm(2)
					return float(d1.dot(d2) / (n1 * n2))
				cossim = udf(lambda v1, v2: CosineSimi(v1, v2), DoubleType())
				ngrams = ngrams.withColumn('simi', cossim('centerVector', 'predictionVector'))
				ngrams = ngrams.drop('centerVector').drop('predictionVector')


				#update predicted sequence
				ngrams = ngrams.withColumn('predSeq', concat_ws(' ', 'predSeq', 'word')) 
				ngrams = ngrams.withColumn('predSeq', ltrim(ngrams.predSeq))


				#get actual sequence
				ngrams = CreateSubstring(ngrams, 'sentence', 'actualSeq', gramSize, ' ', ctxSize, ctxSize + nextPos + 1)


				#now get the cluster id for the predicted word in the sentence
				ngrams = BuildLabelVector(ngrams, w2v, ctxSize, lookahead, nextPos).withColumnRenamed('labelVec', 'vector').drop('ngrams')
				ngrams = centers.transform(ngrams).drop('vector')

				#and host latency for actual word
				ngrams = ngrams.join(broadcast(words), 'cluster', 'inner') \
						.drop('word') \
						.drop('centerVector') #\
				
				
			
				#record correctness
				ngrams = ngrams.withColumn('round_correct', when((ngrams.predWord != ngrams.cluster) | (ngrams.simi < confidence), 0).otherwise(nextPos + 1)).drop('predWord').drop('cluster')
				ngrams = ngrams.withColumn('sample_correct', when(ngrams.sample_correct + 1 == ngrams.round_correct, ngrams.round_correct).otherwise(ngrams.sample_correct)) 




				#get overall correctness
				ngrams = ngrams.withColumn('correct', greatest('sample_correct', 'correct'))

				#get binary correctness
				ngrams = ngrams.withColumn('binary_correct', when(ngrams.correct >= nextPos + 1, 1).otherwise(0))
				ngrams = ngrams.withColumn('sample_confi', when(ngrams.binary_correct == 1, 1.0).otherwise(least(ngrams.simi, ngrams.sample_confi)))
				ngrams = ngrams.withColumn('simi', when(ngrams.binary_correct == 1, ngrams.simi).otherwise(ngrams.sample_confi))


				ngrams = ngrams.withColumn('predSeq', when((ngrams.binary_correct == 1) | (ngrams.simi < confidence), ngrams.actualSeq).otherwise(ngrams.predSeq))
				ngrams = ngrams.withColumn('succ_wt', when(ngrams.binary_correct == 1, ngrams.wt).otherwise(0))
				ngrams = ngrams.withColumn('fail_wt', when((ngrams.binary_correct == 1) | (ngrams.simi < confidence), 0).otherwise(ngrams.wt))
				ngrams = ngrams.withColumn('unopt_wt', when((ngrams.binary_correct == 0) & (ngrams.simi < confidence), ngrams.wt).otherwise(0))
				ngrams = ngrams.drop('simi')

				#now summarize success and failure rates by predicted sequence
				seqWts = ngrams.groupBy('predSeq').agg(sum('succ_wt').alias('succ_wt'), sum('fail_wt').alias('fail_wt'), sum('unopt_wt').alias('unopt_wt'))

				#update sequences table
				seqs = seqWts.join(broadcast(seqs), seqWts.predSeq==seqs.word, 'right_outer').drop('predSeq').fillna(-c2/c1, ['succ_wt', 'fail_wt', 'unopt_wt'])


				scaleback = udf(lambda s: float(s*c1 + c2), DoubleType())
				seqs = seqs.withColumn(modelSucc, col(modelSucc) + scaleback(seqs.succ_wt)).drop('succ_wt')
				seqs = seqs.withColumn(modelFail, col(modelFail) + scaleback(seqs.fail_wt)).drop('fail_wt')
				seqs = seqs.withColumn(modelUnopt, col(modelUnopt) + scaleback(seqs.unopt_wt)).drop('unopt_wt')
				seqs.cache()

				aggregated = seqs.agg(sum(modelSucc), sum(modelFail), sum(modelUnopt))
				aggregated.cache()
				new_success = aggregated.head()['sum(' + modelSucc + ')']
				new_fail = aggregated.head()['sum(' + modelFail + ')']
				new_unopt = aggregated.head()['sum(' + modelUnopt + ')']
				print(nextPos, new_success - success, new_fail - fail, new_unopt - unopt)
				success = new_success
				fail = new_fail
				unopt = new_unopt


		#end for testing for each model for a particular vector size

	#end for each vector size


	seqs.orderBy('succ_0', ascending=False).write.mode('overwrite').csv(outputFile(outDir, ctxSize, vecSize, sampleSizes))


	return accuracy
Example #37
0
    def expect_column_values_to_be_between(
        self,
        column,
        min_value=None,
        max_value=None,
        strict_min=False,
        strict_max=False,
        parse_strings_as_datetimes=None,
        output_strftime_format=None,
        allow_cross_type_comparisons=None,
        mostly=None,
        result_format=None,
        include_config=True,
        catch_exceptions=None,
        meta=None,
    ):
        # NOTE: This function is implemented using native functions instead of UDFs, which is a faster
        # implementation. Please ensure new spark implementations migrate to the new style where possible
        if allow_cross_type_comparisons:
            raise ValueError(
                "Cross-type comparisons are not valid for SparkDFDataset")

        if parse_strings_as_datetimes:
            min_value = parse(min_value)
            max_value = parse(max_value)

        if min_value is None and max_value is None:
            raise ValueError("min_value and max_value cannot both be None")
        elif min_value is None:
            if strict_max:
                return column.withColumn(
                    "__success",
                    when(column[0] < max_value,
                         lit(True)).otherwise(lit(False)),
                )
            else:
                return column.withColumn(
                    "__success",
                    when(column[0] <= max_value,
                         lit(True)).otherwise(lit(False)),
                )
        elif max_value is None:
            if strict_min:
                return column.withColumn(
                    "__success",
                    when(column[0] > min_value,
                         lit(True)).otherwise(lit(False)),
                )
            else:
                return column.withColumn(
                    "__success",
                    when(column[0] >= min_value,
                         lit(True)).otherwise(lit(False)),
                )
        else:
            if min_value > max_value:
                raise ValueError("minvalue cannot be greater than max_value")
            if strict_min and strict_max:
                return column.withColumn(
                    "__success",
                    when((min_value < column[0]) & (column[0] < max_value),
                         lit(True)).otherwise(lit(False)),
                )
            elif strict_min:
                return column.withColumn(
                    "__success",
                    when((min_value < column[0]) & (column[0] <= max_value),
                         lit(True)).otherwise(lit(False)),
                )
            elif strict_max:
                return column.withColumn(
                    "__success",
                    when((min_value <= column[0]) & (column[0] < max_value),
                         lit(True)).otherwise(lit(False)),
                )
            else:
                return column.withColumn(
                    "__success",
                    when((min_value <= column[0]) & (column[0] <= max_value),
                         lit(True)).otherwise(lit(False)),
                )
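# Note: a small standalone sketch, not part of the original source, of the native
# when()/otherwise() range check used above, flagging rows inside
# [min_value, max_value] on a toy DataFrame.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, when

spark = SparkSession.builder.getOrCreate()
vals = spark.createDataFrame([(1,), (5,), (12,)], ["value"])
min_value, max_value = 2, 10
vals.withColumn(
    "__success",
    when((lit(min_value) <= col("value")) & (col("value") <= lit(max_value)),
         lit(True)).otherwise(lit(False))).show()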
Example #38
0
#[2] Weather data ingestion
sensors = spark.read.option("delimiter", ",")\
  .csv('inputs/mi_meteo_legend.csv')\
  .toDF('sensor_id','street_name','lat', 'lon','sensor_type','unity_of_measure')
sensors = sensors.filter(sensors['sensor_type'] == "Precipitation")
sensors = sensors.filter(sensors['street_name'] == "Milano - via Lambrate")

weather = spark.read.option("delimiter", ",")\
   .csv('inputs/weather_phenomena/*.csv')\
  .toDF('sensor_id','time_istant','measurement')

#[3] Grouping by date and precipitation intensity classification
ws = weather.join(sensors, on='sensor_id', how='left')
ws = ws.withColumn('rain_intensity', func.when(func.col('measurement') == 0, 0)\
                   .when((func.col('measurement') > 0) & (func.col('measurement') < 2.6),1)\
                   .when((func.col('measurement') >= 2.6) & (func.col('measurement') < 7.6),2)\
                   .otherwise(3))
ws = ws.filter(ws['rain_intensity'] > 0)
city = mobileDF.join(ws, on='time_istant', how='left')
city.show()
city = city.withColumn(
    'date_again',
    func.from_unixtime(
        func.unix_timestamp(city.time_istant, 'yyyy/MM/dd HH:mm'),
        'yyyy-MM-dd HH:mm'))
city = city.filter(city.sensor_type.isNotNull())

#[4] Filtering only for working hours
city.createOrReplaceTempView("sparkCityData")
novemberactivity = city
novemberactivity = novemberactivity.withColumn(
Example #39
0
def prepare_df(
    df: pyspark.sql.DataFrame,
    store_csv: pyspark.sql.DataFrame,
    store_states_csv: pyspark.sql.DataFrame,
    state_names_csv: pyspark.sql.DataFrame,
    google_trend_csv: pyspark.sql.DataFrame,
    weather_csv: pyspark.sql.DataFrame,
) -> pyspark.sql.DataFrame:
    num_rows = df.count()

    # expand dates
    df = expand_date(df)

    # convert the Open, Promo, StateHoliday and SchoolHoliday flags into boolean columns
    df = (df.withColumn("Open", df.Open != "0").withColumn(
        "Promo",
        df.Promo != "0").withColumn("StateHoliday",
                                    df.StateHoliday != "0").withColumn(
                                        "SchoolHoliday",
                                        df.SchoolHoliday != "0"))

    # merge store information
    store = store_csv.join(store_states_csv, "Store")
    df = df.join(store, "Store")

    # merge Google Trend information
    google_trend_all = prepare_google_trend(google_trend_csv)
    df = df.join(google_trend_all,
                 ["State", "Year", "Week"]).select(df["*"],
                                                   google_trend_all.trend)

    # merge in Google Trend for whole Germany
    google_trend_de = google_trend_all[google_trend_all.file ==
                                       "Rossmann_DE"].withColumnRenamed(
                                           "trend", "trend_de")
    df = df.join(google_trend_de,
                 ["Year", "Week"]).select(df["*"], google_trend_de.trend_de)

    # merge weather
    weather = weather_csv.join(state_names_csv,
                               weather_csv.file == state_names_csv.StateName)
    df = df.join(weather, ["State", "Date"])

    # fix null values
    df = (df.withColumn(
        "CompetitionOpenSinceYear",
        F.coalesce(df.CompetitionOpenSinceYear, F.lit(1900)),
    ).withColumn(
        "CompetitionOpenSinceMonth",
        F.coalesce(df.CompetitionOpenSinceMonth, F.lit(1)),
    ).withColumn("Promo2SinceYear",
                 F.coalesce(df.Promo2SinceYear, F.lit(1900))).withColumn(
                     "Promo2SinceWeek", F.coalesce(df.Promo2SinceWeek,
                                                   F.lit(1))))

    # days and months since the competition has been open, cap it to 2 years
    df = df.withColumn(
        "CompetitionOpenSince",
        F.to_date(
            F.format_string("%s-%s-15", df.CompetitionOpenSinceYear,
                            df.CompetitionOpenSinceMonth)),
    )
    df = df.withColumn(
        "CompetitionDaysOpen",
        F.when(
            df.CompetitionOpenSinceYear > 1900,
            F.greatest(
                F.lit(0),
                F.least(F.lit(360 * 2),
                        F.datediff(df.Date, df.CompetitionOpenSince)),
            ),
        ).otherwise(0),
    )
    df = df.withColumn("CompetitionMonthsOpen",
                       (df.CompetitionDaysOpen / 30).cast(T.IntegerType()))

    # days and weeks of promotion, cap it to 25 weeks
    df = df.withColumn(
        "Promo2Since",
        F.expr(
            'date_add(format_string("%s-01-01", Promo2SinceYear), (cast(Promo2SinceWeek as int) - 1) * 7)'
        ),
    )
    df = df.withColumn(
        "Promo2Days",
        F.when(
            df.Promo2SinceYear > 1900,
            F.greatest(
                F.lit(0),
                F.least(F.lit(25 * 7), F.datediff(df.Date, df.Promo2Since))),
        ).otherwise(0),
    )
    df = df.withColumn("Promo2Weeks",
                       (df.Promo2Days / 7).cast(T.IntegerType()))

    # ensure that no row was lost through inner joins
    assert num_rows == df.count(), "lost rows in joins"
    return df
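# Note: a minimal, self-contained sketch, not part of the original source, of the
# capping pattern used above: clamp a day count into [0, 360 * 2], but only when
# the open-since year is known; the toy column names are hypothetical.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame(
    [(2012, 900), (1900, 900), (2015, -30)],
    ["CompetitionOpenSinceYear", "days_since_open"])
toy.withColumn(
    "CompetitionDaysOpen",
    F.when(
        F.col("CompetitionOpenSinceYear") > 1900,
        F.greatest(F.lit(0),
                   F.least(F.lit(360 * 2), F.col("days_since_open"))),
    ).otherwise(0)).show()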
Example #40
0
        meter_id = row.meter
        building_meter = get_meter(building, meter_id)

        print("Predicting meter readings for building {0} meter {1}".format(
            building_id, meter_id))
        model = load_model(building_id, meter_id)
        predictions = model.transform(building_meter)
        predictions = predictions.withColumn("prediction",
                                             F.expm1(predictions.prediction))

        print("Saving submission")
        predictions = predictions.withColumn("submitted_ts",
                                             F.current_timestamp())
        predictions = predictions.withColumn("submit_id", F.lit(submit_id))
        predictions = predictions.withColumn("algo", F.lit(algo))
        predictions = predictions.withColumnRenamed(
            "prediction",
            "meter_reading").select("row_id", "building_id", "meter",
                                    "timestamp", "meter_reading", "submit_id",
                                    "submitted_ts", "algo")
        predictions = predictions.withColumn(
            "meter_reading",
            F.when(predictions.meter_reading < 0,
                   F.lit(0.0)).otherwise(predictions.meter_reading))
        predictions = predictions.fillna(0.0, "meter_reading")
        predictions.coalesce(1).write.saveAsTable("submitted_predictions",
                                                  format="parquet",
                                                  mode="append")

to_csv(submit_id, algo)
    except:
        pass


# device_categories = json_cleaned.select("deviceType").distinct().rdd.flatMap(lambda x: x).collect()
device_categories = [u'Personal computer', u'Tablet', u'Smartphone']


refDomain_categories = json_cleaned.groupBy("refDomain").count().orderBy(desc("count"))\
                        .select("refDomain").rdd.flatMap(lambda x: x).collect()


refDomain_categories_filter = []
for x in refDomain_categories:
    try:
        temp = x.split('.')[-2]
        if temp not in refDomain_categories_filter:
            refDomain_categories_filter.append(temp)
    except:
        pass



exprs_device = [F.when(F.col("deviceType") == category, 1).otherwise(0).alias("is_device_"+category)
         for category in device_categories]
exprs_domain = [F.when(F.col("refDomain") == category, 1).otherwise(0).alias("is_domain_"+category)
         for category in refDomain_categories_filter[0:100]]

labeled_json_cleaned = json_cleaned.select("*", *exprs_device ) \
        .select("*", *exprs_domain).drop("deviceType").drop("refDomain")
def nscore(words):
    scores = [wordlist[t] for t in words if t in wordlist and wordlist[t] < 0]
    return 0.0 if len(scores) == 0 else float(sum(scores)) / len(scores)
neg_score = F.udf(lambda w: nscore(w), FloatType()) 

# Create feature matrix for the model
tw1 = hc.sql("""
SELECT text, query, polarity, date,
       regexp_extract(date, '([0-9]{2}):([0-9]{2}):([0-9]{2})', 1) as hour,
       regexp_extract(date, '(Sun|Mon|Tue|Wed|Thu|Fri|Sat)', 1) as dayofweek,
       regexp_extract(date, '(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)', 1) as month
FROM tweets 
""")
tw2 = tw1.filter("polarity != 2").withColumn('words', tokenize(tw1['text']))
tw3 = (tw2.select("user", "hour", "dayofweek", "month", "words",
            	  F.when(tw2.polarity == 4, "Pos").otherwise("Neg").alias("sentiment"),
            	  pos_score(tw2["words"]).alias("pscore"), 	
	    	  neg_score(tw2["words"]).alias("nscore")))
tw3.registerTempTable("fm")

# paramaters for modeling
numFeatures = 5000
minDocFreq = 50
numTrees = 1000

# Build Machine Learning pipeline
inx1 = StringIndexer(inputCol="hour", outputCol="hour-inx")
inx2 = StringIndexer(inputCol="month", outputCol="month-inx")
inx3 = StringIndexer(inputCol="dayofweek", outputCol="dow-inx")
inx4 = StringIndexer(inputCol="sentiment", outputCol="label")
hashingTF = HashingTF(numFeatures=numFeatures, inputCol="words", outputCol="hash-tf")
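# Note: the pipeline above is truncated; the continuation below is an assumption,
# not the original code. It adds an IDF stage, assembles the features, and fits a
# RandomForestClassifier using the parameters declared earlier (numFeatures,
# minDocFreq, numTrees).
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IDF, VectorAssembler

idf = IDF(inputCol="hash-tf", outputCol="tf-idf", minDocFreq=minDocFreq)
assembler = VectorAssembler(
    inputCols=["hour-inx", "month-inx", "dow-inx", "tf-idf", "pscore", "nscore"],
    outputCol="features")
rf = RandomForestClassifier(labelCol="label", featuresCol="features",
                            numTrees=numTrees)
pipeline = Pipeline(stages=[inx1, inx2, inx3, inx4, hashingTF, idf, assembler, rf])
model = pipeline.fit(tw3)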
Example #43
0
def summary(df, cols):
    spark = df.sql_ctx
    types = {x.name: x.dataType for x in list(df.schema) if x.name in cols}

    res = pd.DataFrame.from_dict(types, orient='index')
    res.columns = ['datatype']

    count = df.count()
    res['count'] = count

    d = df.select([F.approx_count_distinct(c).alias(c)
                   for c in cols]).toPandas().T
    d.columns = ['approx_distinct']
    d.index.name = 'index'
    res = res.join(d)

    res['unique_ratio'] = res['approx_distinct'] / count

    sel = []
    for c, v in types.items():
        if isinstance(v, (T.NumericType)):
            sel += [F.mean(c).alias(c)]
        else:
            sel += [F.min(F.lit(None)).alias(c)]
    d = df.select(sel).toPandas().T
    d.columns = ['mean']
    d.index.name = 'index'
    res = res.join(d)

    d = df.select([F.min(c).alias(c) for c in cols]).toPandas().T
    d.columns = ['min']
    d.index.name = 'index'
    res = res.join(d)

    d = df.select([F.max(c).alias(c) for c in cols]).toPandas().T
    d.columns = ['max']
    d.index.name = 'index'
    res = res.join(d)

    d = df.select([F.count(F.when(F.isnull(c), c)).alias(c)
                   for c in cols]).toPandas().T
    d.columns = ['null']
    d.index.name = 'index'
    res = res.join(d)

    sel = []
    for c, v in types.items():
        if isinstance(v, (T.NumericType)):
            sel += [F.count(F.when(F.isnan(c), c)).alias(c)]
        else:
            sel += [F.min(F.lit(0)).alias(c)]
    d = df.select(sel).toPandas().T
    d.columns = ['nan']
    d.index.name = 'index'
    res = res.join(d)

    sel = []
    for c, v in types.items():
        if isinstance(v, (T.StringType)):
            sel += [F.count(F.when(F.col(c).isin(''), c)).alias(c)]
        else:
            sel += [F.min(F.lit(0)).alias(c)]
    d = df.select(sel).toPandas().T
    d.columns = ['empty']
    d.index.name = 'index'
    res = res.join(d)

    return res
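# Note: a hypothetical usage of summary() above, not from the original source.
from pyspark.sql import SparkSession

spark_session = SparkSession.builder.getOrCreate()
sdf = spark_session.createDataFrame(
    [(1.0, "a"), (2.0, None), (None, "")], ["num_col", "str_col"])
print(summary(sdf, ["num_col", "str_col"]))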
Example #44
0
def delayed_flights(spark, flights_file_path, other_files_path, year):
    """

    PARAMETERS
    ----------

    spark: SparkSession
    flights_file_path: path to the flights data, e.g. "s3://air-traffic-dataset/ontimeperformance_flights_test.csv".
    other_files_path: path to the supporting CSV files, e.g. ontimeperformance_airlines.csv.
    year: 1994-2008 inclusive for the tiny dataset.

    """

    flights_tiny_df = (spark.read.format("csv").options(header="true").load(
        "{}/year={}".format(flights_file_path, year)))

    airlines_df = (spark.read.format("csv").options(header="true").load(
        "{}/ontimeperformance_airlines.csv".format(other_files_path)))

    flights_tiny_df = flights_tiny_df \
        .withColumn(
            "scheduled_departure_timestamp",
            F.to_timestamp(
                F.when(
                    F.col("scheduled_depature_time") == "24:00:00", "00:00:00"
                ).otherwise(F.col("scheduled_depature_time")),
                "HH:mm:ss",
            ),
        ) \
        .withColumn(
            "actual_departure_timestamp",
            F.to_timestamp(
                F.when(F.col("actual_departure_time") == "24:00:00", "00:00:00").otherwise(
                    F.col("actual_departure_time")
                ),
                "HH:mm:ss",
            ),
        ) \
        .withColumn(
            "delayed_time",
            F.when(
                F.col("actual_departure_timestamp").cast("long")
                - F.col("scheduled_departure_timestamp").cast("long")
                > (60 * 60 * 12),
                (
                    F.col("scheduled_departure_timestamp").cast("long")
                    + (60 * 60 * 24)
                    - F.col("actual_departure_timestamp").cast("long")
                )
                / 60,
            )
            .when(
                F.col("scheduled_departure_timestamp").cast("long")
                - F.col("actual_departure_timestamp").cast("long")
                > (60 * 60 * 12),
                (
                    F.col("actual_departure_timestamp").cast("long")
                    + (60 * 60 * 24)
                    - F.col("scheduled_departure_timestamp").cast("long")
                )
                / 60,
            )
            .otherwise(
                (
                    F.col("actual_departure_timestamp").cast("long")
                    - F.col("scheduled_departure_timestamp").cast("long")
                )
                / 60
            ),
        ) \
        .filter(F.col("actual_departure_timestamp").isNotNull()) \
        .filter(F.col("delayed_time") > 0) \
        .groupBy("carrier_code") \
        .agg(
            F.count("delayed_time").alias("numOfDelays"),
            F.mean("delayed_time").alias("avgDelays"),
            F.min("delayed_time").alias("minDelay"),
            F.max("delayed_time").alias("maxDelay"),
        ) \
        .withColumn("avgDelays", F.round(F.col("avgDelays"), 2)) \
        .select(
            "carrier_code",
            "numOfDelays",
            "avgDelays",
            "minDelay",
            "maxDelay"
        ) \

    avg_flights = F.broadcast(airlines_df) \
        .join(
            flights_tiny_df,
            flights_tiny_df.carrier_code == airlines_df.carrier_code
        ) \
        .select(
            "name",
            "numOfDelays",
            "avgDelays",
            "minDelay",
            "maxDelay"
        )

    parseAvgFlights(avg_flights)
Example #45
0
 def na(col_name):
     return F.count(
         F.when(F.isnan(col_name) | F.col(col_name).isNull(), col_name))
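# Note: a hypothetical usage of the na() helper above, not from the original source:
# apply it to every column of a toy DataFrame in a single select.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sample_df = spark.createDataFrame(
    [(1.0, 2.0), (None, float("nan"))], ["a", "b"])
sample_df.select([na(c).alias(c) for c in sample_df.columns]).show()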
Example #46
0
File: base.py  Project: yiming1012/koalas
    def shift(self, periods=1, fill_value=None):
        """
        Shift Series/Index by desired number of periods.

        .. note:: the current implementation of shift uses Spark's Window without
            specifying partition specification. This leads to move all data into
            single partition in single machine and could cause serious
            performance degradation. Avoid this method against very large dataset.

        Parameters
        ----------
        periods : int
            Number of periods to shift. Can be positive or negative.
        fill_value : object, optional
            The scalar value to use for newly introduced missing values.
            The default depends on the dtype of self. For numeric data, np.nan is used.

        Returns
        -------
        Copy of input Series/Index, shifted.

        Examples
        --------
        >>> df = ks.DataFrame({'Col1': [10, 20, 15, 30, 45],
        ...                    'Col2': [13, 23, 18, 33, 48],
        ...                    'Col3': [17, 27, 22, 37, 52]},
        ...                   columns=['Col1', 'Col2', 'Col3'])

        >>> df.Col1.shift(periods=3)
        0     NaN
        1     NaN
        2     NaN
        3    10.0
        4    20.0
        Name: Col1, dtype: float64

        >>> df.Col2.shift(periods=3, fill_value=0)
        0     0
        1     0
        2     0
        3    13
        4    23
        Name: Col2, dtype: int64

        >>> df.index.shift(periods=3, fill_value=0)
        Int64Index([0, 0, 0, 0, 1], dtype='int64')
        """
        if len(self._internal.index_columns) == 0:
            raise ValueError("Index must be set.")

        if not isinstance(periods, int):
            raise ValueError('periods should be an int; however, got [%s]' %
                             type(periods))

        col = self._scol
        window = Window.orderBy(self._kdf._internal.index_scols).rowsBetween(
            -periods, -periods)
        shifted_col = F.lag(col, periods).over(window)
        col = F.when(shifted_col.isNull() | F.isnan(shifted_col),
                     fill_value).otherwise(shifted_col)

        return self._with_new_scol(col).rename(self.name)
Example #47
0
 def zeros(col_name):
     return F.count(F.when(F.col(col_name) == 0, col_name))
Example #48
0
    df = sqlContext.read \
        .format('com.databricks.spark.csv') \
        .options(header='false') \
        .load(args.file, schema=StructType(fields))

    # calculate the totals summed across all dates
    countDF = df.groupBy('name').agg({"count": "sum"}).withColumnRenamed('sum(count)', 'total')

    # read from the column dates
    dates = sorted(df.select("date")
        .distinct()
        .rdd.map(lambda row: row[0])
        .collect())

    # find the counts for each date
    cols = [when(col("date") == m, col("percentage")).otherwise(None).alias(m)
        for m in  dates]
    maxs = [max(col(m)).alias(m) for m in dates]

    # reformat dataframe
    series = (df
        .select(col("name"), *cols)
        .groupBy("name")
        .agg(*maxs)
        .na.fill(0))

    compressedTimeseries = series.select("name", concat_ws(",", *dates).alias("timeseries"))

    # add totals to timeseries table
    resultDF = compressedTimeseries.join(countDF, 'name', 'inner')
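# Note: a sketch, not part of the original source, showing that Spark's built-in
# groupBy().pivot() can produce the same wide layout as the when()/max()
# expressions above; it reuses the df and dates variables from this example.
series_alt = (df
    .groupBy("name")
    .pivot("date", dates)
    .agg(max(col("percentage")))
    .na.fill(0))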
Example #49
0
# Extract Trip time
def time_delta(pickup_time, dropoff_time):
    pickup_time_out  = datetime.datetime.strptime(pickup_time, '%m/%d/%y %H:%M')
    dropoff_time_out = datetime.datetime.strptime(dropoff_time, '%m/%d/%y %H:%M')
    trip_time        = (dropoff_time_out - pickup_time_out).seconds / float(60)
    return trip_time

f = udf(time_delta, FloatType())

# (1) Calculate "trip_time"
# (2) Create a "tip_flag" for any record where a customer leaves a tip
# (3) Extract the Pickup Day (as an integer)
# (4) Extract the Pickup Hour (as an integer)
transformed1 = rawdata.withColumn("trip_time", f(rawdata.pickup_datetime, rawdata.dropoff_datetime)) \
                      .withColumn("tip_flag", (when(rawdata.tip_amount > 0.0, 1).otherwise(0)) ) \
                      .withColumn("pickup_day", split(rawdata.pickup_datetime,"\/")[1].cast("integer") ) \
                      .withColumn("pickup_hour", split(split(rawdata.pickup_datetime," ")[1],":")[0].cast("integer") )


#######################################################################################
#
#   Model Prep
#
#######################################################################################

# String Indexer
strindexer = StringIndexer(inputCol="vehicle_id", outputCol="vehicle_id_index")
modelprep1 = strindexer.fit(transformed1).transform(transformed1)

features = ['pickup_longitude','passenger_count','tolls_amount','tip_amount','trip_distance']
Example #50
0
def main(inputFile, outputFile):
    def getItemName(payload):
        payload = json.loads(payload)
        return payload['item_name'].split('.')[0]

    item_udf = udf(lambda payload: getItemName(payload))
    df = spark.read.parquet(inputFile+'/*')

     #DF for Item Play Started
    start = df.filter(col('event').isin(['ITEM_PLAY_STARTED']))
    start = start.withColumn('Content Name', item_udf(start['payload'])) # get content name
    start = start.withColumn('Time Start', df['time'].cast("timestamp")) #turn Time Start String to easy to use Time Stamp Object
    cols = ["device_id", "Content Name", "Time Start"]    #Select just the columns we need
    start = start.select(*cols)
    
    def getEndOfStream(payload):
        payload = json.loads(payload)
        if "did_reach_end_of_stream" in payload:
            return payload['did_reach_end_of_stream']
        else:
            return "false"

    stream_end_UDF = udf(lambda x: getEndOfStream(x))
        
        #DF for Item Play Finished
    finished = df.filter(col('event').isin(['ITEM_PLAY_FINISHED'])) 
    finished = finished.withColumn('Content Name', item_udf(finished['payload'])) # get content name
    finished = finished.withColumn('reach_end_of_stream', stream_end_UDF(finished['payload'])) # get did_reach_end_of_stream        
    finished = finished.withColumn("reach_end_of_stream", F.trim(col("reach_end_of_stream"))) #Get rid of white space

        
        #Convert True/False strings to actual boolean values
    finished = finished.withColumn(
    'reach_end_of_stream',
    F.when(F.col("reach_end_of_stream") == "true", True)\
    .otherwise(False)
    )
        #turn Time End String to easy to use Time Stamp Object
    finished = finished.withColumn('Time End', df['time'].cast("timestamp"))
    #Select just the columns we need
    cols = ["device_id", "Content Name", "Time End", "reach_end_of_stream"]
    finished = finished.select(*cols)
        
        #combine two dataframes for our transformed Schema
    transformed = start.join(finished, on=["device_id", "Content Name"], how='left_outer')
    
    #Make sure Time Start before time end
    transformed = transformed.where(col("Time Start") <= col("Time End"))
    
    #Convert time stamps to unix
    #transformed = transformed.withColumn('Time Start', F.unix_timestamp('Time Start'))
    #transformed = transformed.withColumn('Time End', F.unix_timestamp('Time End'))
    
    #Get correct Time Ends
    def getEndTime(end_time_list):
        return end_time_list[0]
    
    end_time_udf = udf(lambda end_time_list: end_time_list[0], T.TimestampType())
    df = transformed.withColumn("end_time_list", F.collect_list("Time End").over(Window.partitionBy("device_id",'Content Name','Time Start', "reach_end_of_stream").orderBy('Time End')))
    df = df.groupBy('device_id','Time Start','Content Name', "reach_end_of_stream").agg(F.max('end_time_list').alias('end_time_list'))
    #Still gets laggy here running the udf that takes first item of list (aka the smallest date time)
    df = df.withColumn('Time End', end_time_udf("end_time_list"))
    df = df.drop('end_time_list')
    
    #rename columns + reorder
    df = df.withColumnRenamed("Time Start", "start").withColumnRenamed("Time End", "end").withColumnRenamed("Content Name", "item_name")
    df = df.select("device_id", "item_name", "start", "end", "reach_end_of_stream")

    df.write.parquet(outputFile) # Write onto output Parquet
Example #51
0
        return None
    new_row = []
    new_row.append(row[0])
    for i in range(1, 3):
        if row[i] == row[i+2]:
            new_row.append(-99)
        else:
            new_row.append(row[i+2])
    print(new_row)



from pyspark.sql import functions as F


ds.withColumn("new_ALM", F.when(ds['AL MOBILE'] == df['AL MOBILE'], "").otherwise(ds['ALM'])).show() # or with ds. too



F.when((df["col-1"] > 0.0) & (df["col-2"] > 0.0), 1).otherwise(0)

df.select(F.when(df['age'] == 2, 3).otherwise(4).alias("age")).collect()


def get_row(row):
    F.when(row['age'] == 2, 3).otherwise(4).alias("age")

new_cols = df_new.select([x for x in df.columns])


ds.select(ds['HOTEL ID'], F.when(ds['AL MOBILE'] == df['AL MOBILE'], "").otherwise(ds['ALM']), F.when(ds['AL DESKTOP'] == df['ALD'], "").otherwise(ds['ALD']))
    
convert_enqueue_utcdtstr_date_udf = udf(convert_enqueue_utcdtstr_date, TimestampType())

# COMMAND ----------

# Add a column EnqueuedTimeUtc which has the Enqueued Date as a Date
sourceTransactionsDf = sourceTransactionsDf.withColumn("EnqueuedDateTimeUTC", convert_enqueue_utcdtstr_date_udf(sourceTransactionsDf['EnqueuedTimeUtc']))
sourceAccrualsDf = sourceAccrualsDf.withColumn("EnqueuedDateTimeUTC", convert_enqueue_utcdtstr_date_udf(sourceAccrualsDf['EnqueuedTimeUtc']))
sourceRedemptionsDf = sourceRedemptionsDf.withColumn("EnqueuedDateTimeUTC", convert_enqueue_utcdtstr_date_udf(sourceRedemptionsDf['EnqueuedTimeUtc']))
sourceMemberBalancesDf = sourceMemberBalancesDf.withColumn("EnqueuedDateTimeUTC", convert_enqueue_utcdtstr_date_udf(sourceMemberBalancesDf['EnqueuedTimeUtc']))

# COMMAND ----------

# Add a column which checks whether the Enqueued Date is between yesterday 3 AM and today 4 AM UTC
from pyspark.sql import functions as F
sourceTransactionsDf = sourceTransactionsDf.withColumn("ValidForDate", F.when((yesterday_utc_date_3am_datetime <= sourceTransactionsDf.EnqueuedDateTimeUTC) & (sourceTransactionsDf.EnqueuedDateTimeUTC <= today_utc_date_4am_datetime),"YES").otherwise("NO"))
sourceAccrualsDf = sourceAccrualsDf.withColumn("ValidForDate", F.when((yesterday_utc_date_3am_datetime <= sourceAccrualsDf.EnqueuedDateTimeUTC) & (sourceAccrualsDf.EnqueuedDateTimeUTC <= today_utc_date_4am_datetime),"YES").otherwise("NO"))
sourceRedemptionsDf = sourceRedemptionsDf.withColumn("ValidForDate", F.when((yesterday_utc_date_3am_datetime <= sourceRedemptionsDf.EnqueuedDateTimeUTC) & (sourceRedemptionsDf.EnqueuedDateTimeUTC <= today_utc_date_4am_datetime),"YES").otherwise("NO"))
sourceMemberBalancesDf = sourceMemberBalancesDf.withColumn("ValidForDate", F.when((yesterday_utc_date_3am_datetime <= sourceMemberBalancesDf.EnqueuedDateTimeUTC) & (sourceMemberBalancesDf.EnqueuedDateTimeUTC <= today_utc_date_4am_datetime),"YES").otherwise("NO"))

# COMMAND ----------

# get data valid only from 3.00 AM UTC previous day to 4 AM UTC today
validTransactionsDf = sourceTransactionsDf.filter("ValidForDate = 'YES'")
validAccrualsDf = sourceAccrualsDf.filter("ValidForDate = 'YES'")
validRedemptionsDf = sourceRedemptionsDf.filter("ValidForDate = 'YES'")
validMemberBalancesDf = sourceMemberBalancesDf.filter("ValidForDate = 'YES'")

# COMMAND ----------

# UDF - this would take just the body dataframe and decrypt it using UDF 
Example #53
0
def get_row(row):
    F.when(row['age'] == 2, 3).otherwise(4).alias("age")
Example #54
0
File: utils.py  Project: tahervali/koalas
def combine_frames(this, *args, how="full"):
    """
    This method combines `this` DataFrame with a different `that` DataFrame or
    Series from a different DataFrame.

    It returns a DataFrame that has prefix `this_` and `that_` to distinct
    the columns names from both DataFrames

    It internally performs a join operation which can be expensive in general.
    So, if `compute.ops_on_diff_frames` option is False,
    this method throws an exception.
    """
    from databricks.koalas import Series
    from databricks.koalas import DataFrame
    from databricks.koalas.config import get_option

    if all(isinstance(arg, Series) for arg in args):
        assert all(
            arg._kdf is args[0]._kdf for arg in args
        ), "Currently only one different DataFrame (from given Series) is supported"
        if this is args[0]._kdf:
            return  # We don't need to combine. All series are already in `this`.
        that = args[0]._kdf[list(args)]
    elif len(args) == 1 and isinstance(args[0], DataFrame):
        assert isinstance(args[0], DataFrame)
        if this is args[0]:
            return  # We don't need to combine. `this` and `that` are same.
        that = args[0]
    else:
        raise AssertionError("args should be single DataFrame or "
                             "single/multiple Series")

    if get_option("compute.ops_on_diff_frames"):
        this_index_map = this._internal.index_map
        that_index_map = that._internal.index_map
        assert len(this_index_map) == len(that_index_map)

        join_scols = []
        merged_index_scols = []

        # Note that the order of each element in index_map is guaranteed according to the index
        # level.
        this_and_that_index_map = zip(this_index_map.items(),
                                      that_index_map.items())

        # If the same named index is found, that's used.
        for (this_column, this_name), (that_column,
                                       that_name) in this_and_that_index_map:
            if this_name == that_name:
                # We should merge the Spark columns into one
                # to mimic pandas' behavior.
                this_scol = scol_for(this._sdf, this_column)
                that_scol = scol_for(that._sdf, that_column)
                join_scol = this_scol == that_scol
                join_scols.append(join_scol)
                merged_index_scols.append(
                    F.when(this_scol.isNotNull(),
                           this_scol).otherwise(that_scol).alias(this_column))
            else:
                raise ValueError(
                    "Index names must be exactly matched currently.")

        assert len(
            join_scols) > 0, "cannot join with no overlapping index names"

        joined_df = this._sdf.alias("this").join(that._sdf.alias("that"),
                                                 on=join_scols,
                                                 how=how)

        joined_df = joined_df.select(merged_index_scols + [
            this[label].spark_column.alias(
                "__this_%s" % this._internal.spark_column_name_for(label))
            for label in this._internal.column_labels
        ] + [
            that[label].spark_column.alias(
                "__that_%s" % that._internal.spark_column_name_for(label))
            for label in that._internal.column_labels
        ])

        index_columns = set(this._internal.index_spark_column_names)
        new_data_columns = [
            c for c in joined_df.columns if c not in index_columns
        ]
        level = max(this._internal.column_labels_level,
                    that._internal.column_labels_level)
        column_labels = [
            tuple(["this"] + ([""] * (level - len(label))) + list(label))
            for label in this._internal.column_labels
        ] + [
            tuple(["that"] + ([""] * (level - len(label))) + list(label))
            for label in that._internal.column_labels
        ]
        column_label_names = (
            (([None] * (1 + level - this._internal.column_labels_level)) +
             this._internal.column_label_names)
            if this._internal.column_label_names is not None else None)
        return DataFrame(
            this._internal.copy(
                spark_frame=joined_df,
                column_labels=column_labels,
                data_spark_columns=[
                    scol_for(joined_df, col) for col in new_data_columns
                ],
                column_label_names=column_label_names,
            ))
    else:
        raise ValueError(
            "Cannot combine the series or dataframe because it comes from a different dataframe. "
            "In order to allow this operation, enable 'compute.ops_on_diff_frames' option."
        )
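
# The merged_index_scols expression above is essentially a COALESCE over the joined
# index columns; a minimal standalone PySpark sketch of the same when/otherwise
# pattern (DataFrame and column names here are hypothetical):
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
left = spark.createDataFrame([(1, "a"), (2, "b")], ["idx", "x"])
right = spark.createDataFrame([(2, "p"), (3, "q")], ["idx", "y"])

joined = left.alias("this").join(right.alias("that"),
                                 F.col("this.idx") == F.col("that.idx"), "full_outer")
merged_idx = (F.when(F.col("this.idx").isNotNull(), F.col("this.idx"))
              .otherwise(F.col("that.idx")).alias("idx"))
joined.select(merged_idx, F.col("this.x"), F.col("that.y")).show()
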
def main():
    spark_session = (SparkSession.builder.appName(APPLICATION_NAME).master(
        MASTER_URL).config('spark.cassandra.connection.host',
                           MORPHL_SERVER_IP_ADDRESS).config(
                               'spark.cassandra.auth.username',
                               MORPHL_CASSANDRA_USERNAME).config(
                                   'spark.cassandra.auth.password',
                                   MORPHL_CASSANDRA_PASSWORD).config(
                                       'spark.sql.shuffle.partitions',
                                       16).config(
                                           'parquet.enable.summary-metadata',
                                           'true').getOrCreate())

    log4j = spark_session.sparkContext._jvm.org.apache.log4j
    log4j.LogManager.getRootLogger().setLevel(log4j.Level.ERROR)

    # All users from the database are already retained (they are filtered from the BQ SQL)
    ga_chp_bq_users = fetch_from_cassandra(
        'ga_chp_bq_features_raw_t' if TRAINING_OR_PREDICTION == 'training' else
        'ga_chp_bq_features_raw_p', spark_session)
    ga_chp_bq_users.createOrReplaceTempView('ga_chp_bq_users')

    # Using window functions: https://databricks.com/blog/2015/07/15/introducing-window-functions-in-spark-sql.html
    grouped_by_client_id_before_dedup_sql_parts = [
        'SELECT', 'client_id,',
        'SUM(bounces) OVER (PARTITION BY client_id) AS bounces,'
        'SUM(events) OVER (PARTITION BY client_id) AS events,'
        'SUM(page_views) OVER (PARTITION BY client_id) AS page_views,'
        'SUM(session_duration) OVER (PARTITION BY client_id) AS session_duration,'
        'SUM(sessions) OVER (PARTITION BY client_id) AS sessions,'
        'FIRST_VALUE(is_desktop) OVER (PARTITION BY client_id ORDER BY day_of_data_capture DESC) AS is_desktop,'
        'FIRST_VALUE(is_mobile) OVER (PARTITION BY client_id ORDER BY day_of_data_capture DESC) AS is_mobile,'
        'FIRST_VALUE(is_tablet) OVER (PARTITION BY client_id ORDER BY day_of_data_capture DESC) AS is_tablet,'
        'ROW_NUMBER() OVER (PARTITION BY client_id ORDER BY day_of_data_capture DESC) AS rownum'
    ]

    if TRAINING_OR_PREDICTION == 'training':
        grouped_by_client_id_before_dedup_sql_parts = grouped_by_client_id_before_dedup_sql_parts + [
            ', FIRST_VALUE(days_since_last_session) OVER (PARTITION BY client_id ORDER BY day_of_data_capture DESC) AS days_since_last_session,',
            'AVG(days_since_last_session) OVER (PARTITION BY client_id) AS avgdays',
        ]

    grouped_by_client_id_before_dedup_sql_parts = grouped_by_client_id_before_dedup_sql_parts + [
        'FROM', 'ga_chp_bq_users'
    ]

    grouped_by_client_id_before_dedup_sql = ' '.join(
        grouped_by_client_id_before_dedup_sql_parts)
    grouped_by_client_id_before_dedup_df = spark_session.sql(
        grouped_by_client_id_before_dedup_sql)
    grouped_by_client_id_before_dedup_df.createOrReplaceTempView(
        'grouped_by_client_id_before_dedup')

    # Only keeping the most recent record from every client id
    # rownum = 1 while day_of_data_capture is sorted in descending order
    grouped_by_client_id_sql = 'SELECT * FROM grouped_by_client_id_before_dedup WHERE rownum = 1'
    grouped_by_client_id_df = spark_session.sql(grouped_by_client_id_sql)
    grouped_by_client_id_df.createOrReplaceTempView('grouped_by_client_id')

    # The schema for grouped_by_client_id_df is:
    # |-- client_id: string (nullable = true)
    # |-- bounces: double (nullable = true)
    # |-- events: double (nullable = true)
    # |-- page_views: double (nullable = true)
    # |-- session_duration: double (nullable = true)
    # |-- sessions: double (nullable = true)
    # |-- is_desktop: double (nullable = true)
    # |-- is_mobile: double (nullable = true)
    # |-- is_tablet: double (nullable = true)
    # |-- days_since_last_session: float (nullable = true)
    # |-- rownum: integer (nullable = true)
    # |-- avgdays: double (nullable = true)

    if TRAINING_OR_PREDICTION == 'training':
        mean_value_of_avg_days_sql = 'SELECT AVG(avgdays) mean_value_of_avgdays FROM grouped_by_client_id'
        mean_value_of_avg_days_df = spark_session.sql(
            mean_value_of_avg_days_sql)
        churn_threshold = mean_value_of_avg_days_df.first(
        ).mean_value_of_avgdays

        final_df = (grouped_by_client_id_df.withColumn(
            'churned',
            f.when(f.col('days_since_last_session') > churn_threshold,
                   1.0).otherwise(0.0)).select('client_id', 'bounces',
                                               'events', 'page_views',
                                               'session_duration', 'sessions',
                                               'is_desktop', 'is_mobile',
                                               'is_tablet',
                                               'churned').repartition(32))

        # The schema for final_df is:
        # |-- client_id: string (nullable = true)
        # |-- bounces: double (nullable = true)
        # |-- events: double (nullable = true)
        # |-- page_views: double (nullable = true)
        # |-- session_duration: double (nullable = true)
        # |-- sessions: double (nullable = true)
        # |-- is_desktop: double (nullable = true)
        # |-- is_mobile: double (nullable = true)
        # |-- is_tablet: double (nullable = true)
        # |-- churned: double (nullable = false)

        final_df.cache()

        final_df.write.parquet(HDFS_DIR_TRAINING)

        save_options_ga_chp_bq_features_training = {
            'keyspace': MORPHL_CASSANDRA_KEYSPACE,
            'table': 'ga_chp_bq_features_training'
        }

        (final_df.write.format('org.apache.spark.sql.cassandra').mode(
            'append').options(
                **save_options_ga_chp_bq_features_training).save())

        with open(CHURN_THRESHOLD_FILE, 'w') as fh:
            fh.write(str(churn_threshold))
    else:
        final_df = (grouped_by_client_id_df.select(
            'client_id', 'bounces', 'events', 'page_views', 'session_duration',
            'sessions', 'is_desktop', 'is_mobile',
            'is_tablet').repartition(32))

        # The schema for final_df is:
        # |-- client_id: string (nullable = true)
        # |-- bounces: double (nullable = true)
        # |-- events: double (nullable = true)
        # |-- page_views: double (nullable = true)
        # |-- session_duration: double (nullable = true)
        # |-- sessions: double (nullable = true)
        # |-- is_desktop: double (nullable = true)
        # |-- is_mobile: double (nullable = true)
        # |-- is_tablet: double (nullable = true)

        final_df.cache()

        final_df.write.parquet(HDFS_DIR_PREDICTION)

        save_options_ga_chp_bq_features_prediction = {
            'keyspace': MORPHL_CASSANDRA_KEYSPACE,
            'table': 'ga_chp_bq_features_prediction'
        }

        (final_df.write.format('org.apache.spark.sql.cassandra').mode(
            'append').options(
                **save_options_ga_chp_bq_features_prediction).save())
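
# The entry-point guard is not shown in this snippet; presumably the script ends with:
if __name__ == '__main__':
    main()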
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression

spark = SparkSession.builder.appName("LogisticRegression App").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")


# Loading the data
data = spark.read.format("csv").option("header", True) \
                               .option("inferSchema", True) \
                               .option("delimiter", ",") \
                               .load("/Users/louis_lyu/Desktop/SourceCode/data/imports-85.data")


data.printSchema()

data = data.withColumn("label", F.when(F.col("num-of-doors") == "four", 1).otherwise(0)).select("label","length", "width","height")
data.show()

# Create vector assembler for feature columns
assembler = VectorAssembler(inputCols=data.columns[1:], outputCol="features")
data = assembler.transform(data)

lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Fit the model
model = lr.fit(data)

# Print the coefficients and intercept for logistic regression
print("Coefficients: " + str(model.coefficients))
print("Intercept: " + str(model.intercept))
Example #57
def main():
    "Main function"
    optmgr  = OptionParser()
    opts = optmgr.parser.parse_args()

    # setup spark/sql context to be used for communication with HDFS
    sc = SparkContext(appName="phedex_br")
    if not opts.yarn:
        sc.setLogLevel("ERROR")
    sqlContext = HiveContext(sc)

    schema_def = schema()

    # read given file(s) into RDD
    if opts.fname:
        pdf = sqlContext.read.format('com.databricks.spark.csv')\
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                        .load(opts.fname, schema = schema_def)
    elif opts.basedir:
        fromdate, todate = defDates(opts.fromdate, opts.todate)
        files = getFileList(opts.basedir, fromdate, todate)
        msg = "Between dates %s and %s found %d directories" % (fromdate, todate, len(files))
        print(msg)

        if not files:
            return
        pdf = unionAll([sqlContext.read.format('com.databricks.spark.csv')
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                        .load(file_path, schema = schema_def) \
                        for file_path in files])
    else:
        raise ValueError("File or directory not specified. Specify fname or basedir parameters.")

    # parsing additional data (to given data adding: group name, node kind, acquisition era, data tier, now date)
    groupdic, nodedic = getJoinDic()
    acquisition_era_reg = r"^/[^/]*/([^/^-]*)-[^/]*/[^/]*$"	
    data_tier_reg = r"^/[^/]*/[^/^-]*-[^/]*/([^/]*)$"
    groupf = udf(lambda x: groupdic[x], StringType())
    nodef = udf(lambda x: nodedic[x], StringType())

    ndf = pdf.withColumn("br_user_group", groupf(pdf.br_user_group_id)) \
         .withColumn("node_kind", nodef(pdf.node_id)) \
         .withColumn("now", from_unixtime(pdf.now_sec, "YYYY-MM-dd")) \
         .withColumn("acquisition_era", when(regexp_extract(pdf.dataset_name, acquisition_era_reg, 1) == "",\
                    lit("null")).otherwise(regexp_extract(pdf.dataset_name, acquisition_era_reg, 1))) \
        .withColumn("data_tier", when(regexp_extract(pdf.dataset_name, data_tier_reg, 1) == "",\
                    lit("null")).otherwise(regexp_extract(pdf.dataset_name, data_tier_reg, 1)))

    # print dataframe schema
    if opts.verbose:
        ndf.show()
        print("pdf data type", type(ndf))
        ndf.printSchema()

    # process aggregation parameters
    keys = [key.lower().strip() for key in opts.keys.split(',')]
    results = [result.lower().strip() for result in opts.results.split(',')]
    aggregations = [agg.strip() for agg in opts.aggregations.split(',')]
    order = [orde.strip() for orde in opts.order.split(',')] if opts.order else []
    asc = [asce.strip() for asce in opts.asc.split(',')] if opts.order else []
    filtc, filtv = opts.filt.split(":") if opts.filt else (None,None)

    validateAggregationParams(keys, results, aggregations, order, filtc)

    if filtc and filtv:
        ndf = ndf.filter(getattr(ndf, filtc) == filtv)

    # if delta aggregation is used
    if DELTA in aggregations:
        validateDeltaParam(opts.interval, results)			
        result = results[0]

        #1 for all dates generate interval group dictionary
        datedic = generateDateDict(fromdate, todate, opts.interval)
        boundic = generateBoundDict(datedic)
        max_interval = max(datedic.values())

        interval_group = udf(lambda x: datedic[x], IntegerType())
        interval_start = udf(lambda x: boundic[x][0], StringType())		
        interval_end = udf(lambda x: boundic[x][1], StringType())

        #2 group data by block, node, interval and last result in the interval
        ndf = ndf.select(ndf.block_name, ndf.node_name, ndf.now, getattr(ndf, result))
        idf = ndf.withColumn("interval_group", interval_group(ndf.now))
        win = Window.partitionBy(idf.block_name, idf.node_name, idf.interval_group).orderBy(idf.now.desc())	
        idf = idf.withColumn("row_number", rowNumber().over(win))
        rdf = idf.where((idf.row_number == 1) & (idf.interval_group != 0))\
                 .withColumn(result, when(idf.now == interval_end(idf.interval_group), getattr(idf, result)).otherwise(lit(0)))
        rdf = rdf.select(rdf.block_name, rdf.node_name, rdf.interval_group, getattr(rdf, result))
        rdf.cache()

        #3 create intervals that do not exist but have a negative delta
        win = Window.partitionBy(idf.block_name, idf.node_name).orderBy(idf.interval_group)
        adf = rdf.withColumn("interval_group_aft", lead(rdf.interval_group, 1, 0).over(win))
        hdf = adf.filter(((adf.interval_group + 1) != adf.interval_group_aft) & (adf.interval_group != max_interval))\
                 .withColumn("interval_group", adf.interval_group + 1)\
                 .withColumn(result, lit(0))\
                 .drop(adf.interval_group_aft)

        #4 union the existing intervals with the filled-in zero intervals
        idf = rdf.unionAll(hdf)
		
        #5 join every interval with the previous interval to compute the delta
        win = Window.partitionBy(idf.block_name, idf.node_name).orderBy(idf.interval_group)
        fdf = idf.withColumn("delta", getattr(idf, result) - lag(getattr(idf, result), 1, 0).over(win))

        #6 calculate delta_plus and delta_minus columns and aggregate by date and node
        ddf = fdf.withColumn("delta_plus", when(fdf.delta > 0, fdf.delta).otherwise(0)) \
                 .withColumn("delta_minus", when(fdf.delta < 0, fdf.delta).otherwise(0))

        aggres = ddf.groupBy(ddf.node_name, ddf.interval_group).agg(sum(ddf.delta_plus).alias("delta_plus"),\
                                                                    sum(ddf.delta_minus).alias("delta_minus"))

        aggres = aggres.select(aggres.node_name, interval_end(aggres.interval_group).alias("date"), aggres.delta_plus, aggres.delta_minus)
		
    else:	
        resAgg_dic = zipResultAgg(results, aggregations)
        order, asc = formOrdAsc(order, asc, resAgg_dic)

        # perform aggregation
        if order:
            aggres = ndf.groupBy(keys).agg(resAgg_dic).orderBy(order, ascending=asc)
        else:
            aggres = ndf.groupBy(keys).agg(resAgg_dic)

    # output results
    if opts.fout:
        fout_header = formFileHeader(opts.fout)
        if opts.header:
            aggres.write.format('com.databricks.spark.csv').options(header = 'true').save(fout_header)
        else:
            aggres.write.format('com.databricks.spark.csv').save(fout_header)
    else:
        aggres.show(50)
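
# A minimal standalone sketch of the delta_plus / delta_minus pattern used above
# (DataFrame and column names here are hypothetical):
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col, lag, lit, when

spark = SparkSession.builder.getOrCreate()
sizes = spark.createDataFrame(
    [("n1", 1, 10), ("n1", 2, 7), ("n1", 3, 12)],
    ["node_name", "interval_group", "size"])

win = Window.partitionBy("node_name").orderBy("interval_group")
deltas = (sizes
          .withColumn("delta", col("size") - lag("size", 1, 0).over(win))
          .withColumn("delta_plus", when(col("delta") > 0, col("delta")).otherwise(lit(0)))
          .withColumn("delta_minus", when(col("delta") < 0, col("delta")).otherwise(lit(0))))
deltas.show()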