def generateExpr(columnName, listIntervals):
    if len(listIntervals) == 1:
        return when(col(columnName).between(listIntervals[0][0], listIntervals[0][1]), 0).otherwise(None)
    else:
        return (when((col(columnName) >= listIntervals[0][0]) & (col(columnName) < listIntervals[0][1]),
                     len(listIntervals) - 1)
                .otherwise(generateExpr(columnName, listIntervals[1:])))
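# A minimal usage sketch for generateExpr (the SparkSession, sample data and
# interval list below are illustrative assumptions, not part of the original
# snippet). The helper recursively chains when() clauses and labels each row
# by the interval that contains the column value; note the bucket ids descend
# from len(listIntervals) - 1 down to 0 across the list, and values outside
# every interval become NULL.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(3,), (12,), (25,), (99,)], ["value"])

intervals = [(0, 10), (10, 20), (20, 30)]
df.withColumn("bucket", generateExpr("value", intervals)).show()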
def log_loss(df):
    epsilon = 1e-12
    temp = df.select("label", when(df.outcome == 1.0, 1.0 - epsilon).otherwise(df.outcome).alias("p"))
    temp = temp.select("label", when(temp.p == .0, epsilon).otherwise(temp.p).alias("p"))
    temp = temp.select("p", "label",
                       when(temp.label == 1, -log(temp.p)).otherwise(-log(1 - temp.p)).alias("log_loss"))
    return temp.selectExpr("mean(log_loss)").first()[0]
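# A hedged usage sketch for log_loss: it clips the predicted probability in
# the "outcome" column away from exactly 0 and 1, then averages the
# cross-entropy -y*log(p) - (1-y)*log(1-p). The session and sample rows are
# assumptions; the column names match what the function above expects.
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, log

spark = SparkSession.builder.getOrCreate()
scored = spark.createDataFrame(
    [(1, 0.9), (0, 0.2), (1, 1.0), (0, 0.0)],
    ["label", "outcome"],  # outcome = predicted probability that label == 1
)
print(log_loss(scored))  # mean log loss over the four rows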
def runBPwithGraphFrames(cls, g, numIter):
    """Run Belief Propagation using GraphFrame.

    This implementation of BP shows how to use GraphFrame's aggregateMessages method.
    """
    # choose colors for vertices for BP scheduling
    colorG = cls._colorGraph(g)
    numColors = colorG.vertices.select('color').distinct().count()

    # TODO: handle vertices without any edges
    # initialize vertex beliefs at 0.0
    gx = GraphFrame(colorG.vertices.withColumn('belief', sqlfunctions.lit(0.0)), colorG.edges)

    # run BP for numIter iterations
    for iter_ in range(numIter):
        # for each color, have that color receive messages from neighbors
        for color in range(numColors):
            # Send messages to vertices of the current color.
            # We may send to source or destination since edges are treated as undirected.
            msgForSrc = sqlfunctions.when(
                AM.src['color'] == color,
                AM.edge['b'] * AM.dst['belief'])
            msgForDst = sqlfunctions.when(
                AM.dst['color'] == color,
                AM.edge['b'] * AM.src['belief'])
            # numerically stable sigmoid
            logistic = sqlfunctions.udf(cls._sigmoid, returnType=types.DoubleType())
            aggregates = gx.aggregateMessages(
                sqlfunctions.sum(AM.msg).alias("aggMess"),
                sendToSrc=msgForSrc,
                sendToDst=msgForDst)
            v = gx.vertices
            # receive messages and update beliefs for vertices of the current color
            newBeliefCol = sqlfunctions.when(
                (v['color'] == color) & (aggregates['aggMess'].isNotNull()),
                logistic(aggregates['aggMess'] + v['a'])
            ).otherwise(v['belief'])  # keep old beliefs for other colors
            newVertices = (v
                           .join(aggregates, on=(v['id'] == aggregates['id']), how='left_outer')
                           .drop(aggregates['id'])  # drop duplicate ID column (from outer join)
                           .withColumn('newBelief', newBeliefCol)  # compute new beliefs
                           .drop('aggMess')  # drop messages
                           .drop('belief')  # drop old beliefs
                           .withColumnRenamed('newBelief', 'belief'))
            # cache new vertices using workaround for SPARK-1334
            cachedNewVertices = AM.getCachedDataFrame(newVertices)
            gx = GraphFrame(cachedNewVertices, gx.edges)

    # Drop the "color" column from vertices
    return GraphFrame(gx.vertices.drop('color'), gx.edges)
def gen_report_table(hc, curUnixDay):
    rows_indoor = sc.textFile("/data/indoor/*/*").map(lambda r: r.split(",")).map(
        lambda p: Row(clientmac=p[0], entityid=int(p[1]), etime=int(p[2]), ltime=int(p[3]),
                      seconds=int(p[4]), utoday=int(p[5]), ufirstday=int(p[6])))
    HiveContext.createDataFrame(hc, rows_indoor).registerTempTable("df_indoor")
    # ClientMac|etime|ltime|seconds|utoday|ENTITYID|UFIRSTDAY
    sql = "select entityid,clientmac,utoday,UFIRSTDAY,seconds,"
    sql = sql + "count(1) over(partition by entityid,clientmac) as total_cnt,"
    sql = sql + "count(1) over (partition by entityid,clientmac order by utoday range 2505600 preceding) as day_30,"  # 2505600 is 29 days
    sql = sql + "count(1) over (partition by entityid,clientmac order by utoday range 518400 preceding) as day_7,"  # 518400 is 6 days
    sql = sql + "count(1) over (partition by entityid,clientmac,UFIRSTDAY order by UFIRSTDAY range 1 preceding) as pre_mon "
    sql = sql + "from df_indoor order by entityid,clientmac,utoday"
    df_id_stat = hc.sql(sql)
    df_id_mm = df_id_stat.withColumn("min", func.min("utoday").over(Window.partitionBy("entityid", "clientmac"))) \
                         .withColumn("max", func.max("utoday").over(Window.partitionBy("entityid", "clientmac")))
    # df_id_mm is the min/max frame, used to calculate first arrival and last arrival
    df_id_stat_distinct = df_id_stat.drop("seconds").drop("day_30").drop("day_7").drop("utoday").drop("total_cnt").distinct()
    # the distinct df is needed for the lag function to work
    df_id_prepremon = df_id_stat_distinct.withColumn(
        "prepre_mon",
        func.lag("pre_mon").over(Window.partitionBy("entityid", "clientmac").orderBy("entityid", "clientmac", "UFIRSTDAY"))
    ).drop("pre_mon").na.fill(0)
    cond_id = [df_id_mm.clientmac == df_id_prepremon.clientmac,
               df_id_mm.entityid == df_id_prepremon.entityid,
               df_id_mm.UFIRSTDAY == df_id_prepremon.UFIRSTDAY]
    df_indoor_fin_tmp = df_id_mm.join(df_id_prepremon, cond_id, 'outer').select(
        df_id_mm.entityid, df_id_mm.clientmac, df_id_mm.utoday, df_id_mm.UFIRSTDAY,
        df_id_mm.seconds, df_id_mm.day_30, df_id_mm.day_7, df_id_mm.min, df_id_mm.max,
        df_id_mm.total_cnt, df_id_prepremon.prepre_mon)
    df_indoor_fin_tmp = df_indoor_fin_tmp.selectExpr(
        "entityid as entityid", "clientmac as clientmac", "utoday as utoday",
        "UFIRSTDAY as ufirstday", "seconds as secondsbyday", "day_30 as indoors30",
        "day_7 as indoors7", "min as FirstIndoor", "max as LastIndoor",
        "total_cnt as indoors", "prepre_mon as indoorsPrevMonth")

    # newly added part for indoors7 and indoors30 based on current date
    df_indoor_fin_tmp1 = df_indoor_fin_tmp.withColumn(
        "r_day_7", func.when((curUnixDay - df_indoor_fin_tmp.utoday) / 86400 < 7, 1).otherwise(0))
    df_indoor_fin_tmp2 = df_indoor_fin_tmp1.withColumn(
        "r_day_30", func.when((curUnixDay - df_indoor_fin_tmp1.utoday) / 86400 < 30, 1).otherwise(0))
    df_indoor_fin_tmp3 = df_indoor_fin_tmp2.withColumn(
        "r_indoors7", func.sum("r_day_7").over(Window.partitionBy("entityid", "clientmac")))
    df_indoor_fin_tmp4 = df_indoor_fin_tmp3.withColumn(
        "r_indoors30", func.sum("r_day_30").over(Window.partitionBy("entityid", "clientmac")))
    df_indoor_fin = df_indoor_fin_tmp4.drop("r_day_7").drop("r_day_30")
    hc.sql("drop table if exists df_indoor_fin")
    df_indoor_fin.write.saveAsTable("df_indoor_fin")

    rows_flow = sc.textFile("/data/flow/*/*").map(lambda r: r.split(",")).map(
        lambda p: Row(clientmac=p[0], entityid=int(p[1]), etime=int(p[2]), ltime=int(p[3]),
                      utoday=int(p[4]), ufirstday=int(p[5])))
    HiveContext.createDataFrame(hc, rows_flow).registerTempTable("df_flow")
    # ClientMac|ENTITYID|UFIRSTDAY|etime|ltime|utoday
    sql = "select entityid,clientmac,utoday,UFIRSTDAY,"
    sql = sql + "count(1) over(partition by entityid,clientmac) as total_cnt,"
    sql = sql + "count(1) over (partition by entityid,clientmac order by utoday range 2505600 preceding) as day_30,"  # 2505600 is 29 days
    sql = sql + "count(1) over (partition by entityid,clientmac order by utoday range 518400 preceding) as day_7,"  # 518400 is 6 days
    sql = sql + "count(1) over (partition by entityid,clientmac,UFIRSTDAY order by UFIRSTDAY range 1 preceding) as pre_mon "
    sql = sql + "from df_flow order by entityid,clientmac,utoday"
    df_fl_stat = hc.sql(sql)
    df_fl_mm = df_fl_stat.withColumn("min", func.min("utoday").over(Window.partitionBy("entityid", "clientmac"))) \
                         .withColumn("max", func.max("utoday").over(Window.partitionBy("entityid", "clientmac")))
    # df_fl_mm is the min/max frame, used to calculate first arrival and last arrival
    df_fl_stat_distinct = df_fl_stat.drop("day_30").drop("day_7").drop("utoday").drop("total_cnt").distinct()
    # the distinct df is needed for the lag function to work
    df_fl_prepremon = df_fl_stat_distinct.withColumn(
        "prepre_mon",
        func.lag("pre_mon").over(Window.partitionBy("entityid", "clientmac").orderBy("entityid", "clientmac", "UFIRSTDAY"))
    ).drop("pre_mon").na.fill(0)
    cond_fl = [df_fl_mm.clientmac == df_fl_prepremon.clientmac,
               df_fl_mm.entityid == df_fl_prepremon.entityid,
               df_fl_mm.UFIRSTDAY == df_fl_prepremon.UFIRSTDAY]
    df_flow_fin = df_fl_mm.join(df_fl_prepremon, cond_fl, 'outer').select(
        df_fl_mm.entityid, df_fl_mm.clientmac, df_fl_mm.utoday, df_fl_mm.UFIRSTDAY,
        df_fl_mm.day_30, df_fl_mm.day_7, df_fl_mm.min, df_fl_mm.max,
        df_fl_mm.total_cnt, df_fl_prepremon.prepre_mon)
    df_flow_fin = df_flow_fin.selectExpr(
        "entityid as entityid", "clientmac as clientmac", "utoday as utoday",
        "UFIRSTDAY as ufirstday", "day_30 as visits30", "day_7 as visits7",
        "min as FirstVisit", "max as LastVisit", "total_cnt as visits",
        "prepre_mon as visitsPrevMonth")
    hc.sql("drop table if exists df_flow_fin")
    df_flow_fin.write.saveAsTable("df_flow_fin")
def test_first_last_ignorenulls(self):
    from pyspark.sql import functions
    df = self.spark.range(0, 100)
    df2 = df.select(functions.when(df.id % 3 == 0, None).otherwise(df.id).alias("id"))
    df3 = df2.select(functions.first(df2.id, False).alias('a'),
                     functions.first(df2.id, True).alias('b'),
                     functions.last(df2.id, False).alias('c'),
                     functions.last(df2.id, True).alias('d'))
    self.assertEqual([Row(a=None, b=1, c=None, d=98)], df3.collect())
def create_hist_data(df, column, minim, maxim, bins=10):

    def create_all_conditions(current_col, column, left_edges, count=1):
        """
        Recursive function that exploits the ability to call the
        Spark SQL Column method .when() in a recursive way.
        """
        left_edges = left_edges[:]
        if len(left_edges) == 0:
            return current_col
        if len(left_edges) == 1:
            next_col = current_col.when(col(column) >= float(left_edges[0]), count)
            left_edges.pop(0)
            return create_all_conditions(next_col, column, left_edges[:], count + 1)
        next_col = current_col.when((float(left_edges[0]) <= col(column))
                                    & (col(column) < float(left_edges[1])), count)
        left_edges.pop(0)
        return create_all_conditions(next_col, column, left_edges[:], count + 1)

    num_range = maxim - minim
    bin_width = num_range / float(bins)
    left_edges = [minim]
    for _bin in range(bins):
        left_edges = left_edges + [left_edges[-1] + bin_width]
    left_edges.pop()
    expression_col = when((float(left_edges[0]) <= col(column))
                          & (col(column) < float(left_edges[1])), 0)
    left_edges_copy = left_edges[:]
    left_edges_copy.pop(0)
    bin_data = (df.select(col(column))
                .na.drop()
                .select(col(column),
                        create_all_conditions(expression_col,
                                              column,
                                              left_edges_copy).alias("bin_id"))
                .groupBy("bin_id").count()
                ).toPandas()

    # If no data goes into one bin, it won't
    # appear in bin_data; so we should fill
    # in the blanks:
    bin_data.index = bin_data["bin_id"]
    new_index = list(range(bins))
    bin_data = bin_data.reindex(new_index)
    bin_data["bin_id"] = bin_data.index
    bin_data = bin_data.fillna(0)

    # We add the left edges and bin width:
    bin_data["left_edge"] = left_edges
    bin_data["width"] = bin_width

    return bin_data
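# A minimal driver sketch for create_hist_data (session, sample column and bin
# parameters are assumptions). It returns a Pandas frame with one row per bin,
# including the bin count, left edge and width computed above.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when

spark = SparkSession.builder.getOrCreate()
values = spark.createDataFrame([(float(v),) for v in range(0, 100)], ["score"])

# 10 equal-width bins between 0 and 100; NULL scores are dropped before binning.
hist = create_hist_data(values, "score", minim=0, maxim=100, bins=10)
print(hist[["bin_id", "left_edge", "width", "count"]])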
def create_binary_feature(self, dataframe, base_field, binary_field):
    """Produces a PySpark dataframe containing a field that is 0 or 1.

    The value of the binary field will be 1 if the value of the evaluated
    field is greater than 0; otherwise it will be 0.

    :param dataframe: the PySpark dataframe
    :param base_field: the field to use as the basis for the binary field
    :param binary_field: the name to give to the field that will contain values of 0 or 1
    :returns: the PySpark dataframe containing the binary field and all fields
        in the supplied dataframe.
    """
    return dataframe.withColumn(binary_field, when(dataframe[base_field] > 0, 1).otherwise(0))
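# Hypothetical usage sketch for create_binary_feature. Since the method never
# touches self, it is called unbound here with None in its place purely for
# illustration; the session and sample data are also assumptions.
from pyspark.sql import SparkSession
from pyspark.sql.functions import when

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(0,), (3,), (-2,)], ["purchases"])

create_binary_feature(None, df, "purchases", "has_purchased").show()
# has_purchased is 1 only for the row where purchases > 0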
def processMSC():
    """
    Parses MSC records as per defined rules
    :return: Records returned in pipe-delimited format
    """
    # Assumption: MSC folder under the provided input path
    inputDir = os.path.join(args.inputdir, "INPUT")
    lines = sc.textFile(inputDir)

    # Call the parsing function
    parsedMSCLines = lines.map(parseMSCRecords)

    # The schema is encoded in a string.
    schemaString = "RecordType FirstNum SecondNum CallDate CallHour Duration StartTower StartLAC CallType"
    fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]
    schema = StructType(fields)

    # Apply the schema to the RDD.
    schemaData = sqlContext.createDataFrame(parsedMSCLines, schema)

    modify_phone_number_udf = udf(mod_number, StringType())
    ph_num_mod = schemaData.select(
        schemaData.RecordType,
        modify_phone_number_udf(schemaData.FirstNum).alias('FirstNum'),
        modify_phone_number_udf(schemaData.SecondNum).alias('SecondNum'),
        schemaData.CallDate,
        schemaData.CallHour,
        schemaData.Duration,
        schemaData.StartTower,
        schemaData.StartLAC,
        schemaData.CallType)

    get_phone_type_udf = udf(get_phone_type, StringType())
    first_ph_type = ph_num_mod.withColumn('FirstPhoneType', get_phone_type_udf(ph_num_mod.FirstNum))
    sec_ph_type = first_ph_type.withColumn('SecondPhoneType', get_phone_type_udf(first_ph_type.SecondNum))

    final_df = sec_ph_type.select(
        sec_ph_type.RecordType,
        sec_ph_type.FirstNum,
        sec_ph_type.SecondNum,
        sec_ph_type.CallDate,
        sec_ph_type.CallHour,
        sec_ph_type.Duration,
        sec_ph_type.StartTower,
        sec_ph_type.StartLAC,
        sec_ph_type.CallType,
        F.when(sec_ph_type.FirstPhoneType.isin(["mobile", "landline", "shortcode"])
               & sec_ph_type.SecondPhoneType.isin(["mobile", "landline", "shortcode"]), "National")
         .otherwise("International").alias('PhoneType'))
    print final_df.show()
def create_valence_column(self, dataframe, base_field, valence_field):
    """Produces a PySpark dataframe containing a field that is -1, 0, or 1
    depending on the value of a specified field.

    The valence will be:
        -1 if the value in the specified column is negative
         0 if the value in the specified column is zero
         1 if the value in the specified column is positive

    :param dataframe: the PySpark dataframe
    :param base_field: the field containing values to use to determine the valence
    :param valence_field: the name of the field that will contain the valence
    :returns: the PySpark dataframe containing the valence field and all fields
        in the supplied dataframe
    """
    return dataframe.withColumn(valence_field,
                                when(dataframe[base_field] < 0, -1)
                                .when(dataframe[base_field] > 0, 1)
                                .otherwise(0))
def create_levels_column(self, dataframe, base_field, levels_field):
    """Produces a PySpark dataframe containing a field based on the level of a
    specified field.

    The level will be:
        0 if the value in the specified column is an integer less than 1
        1 if the value in the specified column is an integer between 1 and 2
        2 if the value in the specified column is an integer between 3 and 4
        3 if the value in the specified column is an integer that is 5 or greater

    :param dataframe: the PySpark dataframe
    :param base_field: the field containing integers to use to determine the level
    :param levels_field: the name of the field that will contain the levels
    :returns: the PySpark dataframe containing the levels field and all fields
        in the supplied dataframe
    """
    return dataframe.withColumn(levels_field,
                                when(dataframe[base_field].between(1, 2), 1)
                                .when(dataframe[base_field].between(3, 4), 2)
                                .when(dataframe[base_field] >= 5, 3)
                                .otherwise(0))
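# Hypothetical sketch exercising the valence and levels helpers above together
# (self is unused, so None stands in; the session and sample data are
# assumptions). The comments show the expected outputs per the docstrings.
from pyspark.sql import SparkSession
from pyspark.sql.functions import when

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(-4,), (0,), (2,), (7,)], ["delta"])

df = create_valence_column(None, df, "delta", "delta_valence")  # -1, 0, 1, 1
df = create_levels_column(None, df, "delta", "delta_level")     # 0, 0, 1, 3
df.show()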
def reduce_to_ohlc(time, rdd):
    row_rdd = rdd.map(lambda row: row.split(',')) \
        .filter(lambda row: len(row) == 3) \
        .map(lambda row: Row(
            symbol=row[0],
            tx_time=datetime.strptime(row[2], '%Y-%m-%d %H:%M:%S.%f'),
            price=float(row[1])
        ))
    sql_context = get_sql_context_instance(rdd.context)
    data = sql_context.createDataFrame(row_rdd)
    data.cache()
    data.write.format('org.apache.spark.sql.cassandra') \
        .options(table='transactions2', keyspace='stock', cluster='Test Cluster') \
        .mode('append') \
        .save()

    ohlc = data.select('symbol', truncate_min(data.tx_time).alias('batch_time'), 'price', 'tx_time') \
        .orderBy('tx_time') \
        .groupBy('symbol', 'batch_time') \
        .agg(
            F.first(data.price).alias('open'),
            F.max(data.price).alias('high'),
            F.min(data.price).alias('low'),
            F.last(data.price).alias('close'),
            F.first(data.tx_time).alias('open_time'),
            F.last(data.tx_time).alias('close_time')
        )

    existing_ohlc = sql_context.read.format('org.apache.spark.sql.cassandra') \
        .options(table='ohlc_1_min2', keyspace='stock', cluster='Test Cluster') \
        .load() \
        .select('symbol', 'batch_time', 'open', 'open_time', 'high', 'low', 'close', 'close_time')

    merged_ohlc = ohlc.join(existing_ohlc,
                            (ohlc.symbol == existing_ohlc.symbol) &
                            (ohlc.batch_time == existing_ohlc.batch_time),
                            'left')

    merged_ohlc = merged_ohlc.select(
        ohlc.symbol.alias('symbol'),
        ohlc.batch_time.alias('batch_time'),
        F.when(existing_ohlc.open_time < ohlc.open_time, existing_ohlc.open).otherwise(ohlc.open).alias('open'),
        F.when(existing_ohlc.open_time < ohlc.open_time, existing_ohlc.open_time).otherwise(ohlc.open_time).alias('open_time'),
        F.when(existing_ohlc.close_time > ohlc.close_time, existing_ohlc.close).otherwise(ohlc.close).alias('close'),
        F.when(existing_ohlc.close_time > ohlc.close_time, existing_ohlc.close_time).otherwise(ohlc.close_time).alias('close_time'),
        F.when(existing_ohlc.low < ohlc.low, existing_ohlc.low).otherwise(ohlc.low).alias('low'),
        F.when(existing_ohlc.high > ohlc.high, existing_ohlc.high).otherwise(ohlc.high).alias('high')
    )
    merged_ohlc.write.format('org.apache.spark.sql.cassandra') \
        .options(table='ohlc_1_min2', keyspace='stock', cluster='Test Cluster') \
        .mode('append') \
        .save()
def test_aggregate_messages(self):
    g = self._graph("friends")
    # For each user, sum the ages of the adjacent users,
    # plus 1 for the src's sum if the edge is "friend".
    sendToSrc = (
        AM.dst['age'] +
        sqlfunctions.when(
            AM.edge['relationship'] == 'friend',
            sqlfunctions.lit(1)
        ).otherwise(0))
    sendToDst = AM.src['age']
    agg = g.aggregateMessages(
        sqlfunctions.sum(AM.msg).alias('summedAges'),
        sendToSrc=sendToSrc,
        sendToDst=sendToDst)
    # Run the aggregation again, providing SQL expressions as strings instead.
    agg2 = g.aggregateMessages(
        "sum(MSG) AS `summedAges`",
        sendToSrc="(dst['age'] + CASE WHEN (edge['relationship'] = 'friend') THEN 1 ELSE 0 END)",
        sendToDst="src['age']")
    # Convert agg and agg2 to a mapping from id to the aggregated message.
    aggMap = {id_: s for id_, s in agg.select('id', 'summedAges').collect()}
    agg2Map = {id_: s for id_, s in agg2.select('id', 'summedAges').collect()}
    # Compute the truth via brute force.
    user2age = {id_: age for id_, age in g.vertices.select('id', 'age').collect()}
    trueAgg = {}
    for src, dst, rel in g.edges.select("src", "dst", "relationship").collect():
        trueAgg[src] = trueAgg.get(src, 0) + user2age[dst] + (1 if rel == 'friend' else 0)
        trueAgg[dst] = trueAgg.get(dst, 0) + user2age[src]
    # Compare the agg mappings against the brute-force mapping
    self.assertEqual(aggMap, trueAgg)
    self.assertEqual(agg2Map, trueAgg)
    # Check that TypeError is raised with messages of the wrong type
    with self.assertRaises(TypeError):
        g.aggregateMessages(
            "sum(MSG) AS `summedAges`",
            sendToSrc=object(),
            sendToDst="src['age']")
    with self.assertRaises(TypeError):
        g.aggregateMessages(
            "sum(MSG) AS `summedAges`",
            sendToSrc=dst['age'],
            sendToDst=object())
def comb_creation(apps, cpu_perc_list):
    from pyspark.sql.functions import when
    q1 = datetime.now()
    df_t = df.registerTempTable('dummy')
    df_t = sqlContext.sql('select sum(byte_count) as byte_count_sum , time_stamp , location from dummy group by location, time_stamp')
    df_t = df_t[df_t.byte_count_sum != 0]
    cpu_perc_list.append(py.cpu_percent())
    cpu_perc_list = [max(cpu_perc_list)]
    #df_needed = df[df.application.isin(apps)]
    df_t = df_t.registerTempTable('dummy')
    df_t = sqlContext.sql('select count(*) as count, location from dummy group by location')
    df_t = df_t.withColumn('count_flag', when(df_t['count'] > config.limit, 1).otherwise(0))
    df_t = df_t[df_t.count_flag == 1]

    # fetching the location which is to be filtered from the filter_db table
    with conn.cursor() as cursor:
        # Read a record
        sql = "select * from filter_db"
        cursor.execute(sql)
        so_result = pd.DataFrame(cursor.fetchall())

    # filtering
    from pyspark.sql.functions import col
    #print(so_result)
    s_filter = list(so_result.source)
    df_t = df_t.filter(~col('source').isin(s_filter))
    #df_t = df_t[df_t.location!='134.141.5.104']
    df2 = df_t.toPandas()
    q2 = datetime.now()
    print('time for reference data preparation is ', str(q2 - q1))
    print('length of table is ', len(df2))
    return df2
def get_http_filter(time, rdd):
    try:
        print "========= %s =========" % str(time)
        sqlContext = getSqlContextInstance(rdd.context)
        df = json_rdd_to_sql_df(rdd)
        url_type = udf(lambda url: url.split('/')[-1].split('.')[-1])
        df.select('*',
                  when(url_type(df['url']).inSet(STATIC_URL_TYPE), 1).otherwise(0).alias("url_type")) \
          .registerAsTable("http")
        url_info = sqlContext.sql("""SELECT url_type,
                                            avg(in_bytes) as in_bytes,
                                            avg(out_bytes) as out_bytes,
                                            avg(latency_sec) as latency_sec,
                                            avg(latency_usec) as latency_usec,
                                            count(*) as requests
                                     FROM http group by url_type""").toJSON().collect()
        output = {}
        for info in url_info:
            temp = json.loads(info)
            if temp['url_type'] == 1:
                output['static_request'] = temp
            else:
                output['dynamic_request'] = temp
        dump_file("http", output, "http_filter")
    except Exception as e:
        print e
def rtruediv(left, right):
    return F.when(left == 0, F.lit(np.inf).__div__(right)).otherwise(
        F.lit(right).__truediv__(left))
def rfloordiv(left, right):
    return F.when(F.lit(left == 0), F.lit(np.inf).__div__(right)).otherwise(
        F.when(F.lit(left) == np.nan, np.nan).otherwise(
            F.floor(F.lit(right).__div__(left))))
# stays_df.withColumn('AGE', stays_df['INDATE'] - stays_df["DOBDATE"])
stays_df = stays_df.select("*", psql.to_date(stays_df["INTIME"]).alias("INDATE"))
stays_df = stays_df.select("*", psql.to_date(stays_df["DOB"]).alias("DOBDATE"))
stays_df = stays_df.select(
    "*",
    psql.floor(psql.datediff(stays_df['INDATE'], stays_df["DOBDATE"]) / 365.0).alias("AGE"))
# print(stays_df.filter("AGE > 250").count())
stays_df = stays_df.withColumn(
    "AGE", psql.when(stays_df.AGE > 250, 90).otherwise(stays_df.AGE))
# print(stays_df.filter("AGE < 0").count())

stays = add_inunit_mortality_to_icustays(stays)

# add_inunit_mortality_to_icustays below
mortality = stays_df.withColumn(
    "DODFILTER", (stays_df.INTIME <= stays_df.DOD) & (stays_df.OUTTIME >= stays_df.DOD))
# print(mortality.filter("DODFILTER = true").count())
mortality = mortality.withColumn(
    "DEATHTIMEFILTER", (stays_df.INTIME <= stays_df.DEATHTIME) & (stays_df.OUTTIME >= stays_df.DEATHTIME))
mortality = mortality.withColumn(
    "MORTALITY_InUnit", mortality.DODFILTER | mortality.DEATHTIMEFILTER)
# mortality = mortality.withColumn("MORTALITY", mortality.MORTALITY_InUnit.cast('int'))
stays_df = mortality.withColumn("MORTALITY_InUnit",
df = sqlContext.read.parquet(config.proj_path + '/datas/appid_datapoint_parquet1')

from sqlalchemy import create_engine
engine = create_engine(str("mysql+pymysql://" + config.db_user + ":" + config.db_pass + "@"
                           + config.db_host + ":" + str(config.db_port) + "/" + config.db_name))

from pyspark.sql.functions import when
q1 = datetime.now()
df1 = df[(df.app_rsp_time != 0)]
df_t = df1.registerTempTable('dummy')
df_t = sqlContext.sql('select count(*) as count, source , application, target_address from dummy group by source, application, target_address')
df_t = df_t.withColumn('count_flag', when(df_t['count'] > config.limit, 1).otherwise(0))
df_t = df_t[df_t.count_flag == 1]

# fetching the source which is to be filtered from filter_db table
with conn.cursor() as cursor:
    # Read a record
    sql = "select * from filter_db"
    cursor.execute(sql)
    so_result = pd.DataFrame(cursor.fetchall())

# filtering
from pyspark.sql.functions import col
#print(so_result)
s_filter = list(so_result.source)
df_t = df_t.filter(~col('source').isin(s_filter))
#df_t = df_t[df_t.source!='134.141.5.104']
def gg(x, index):
    return F.when(dd[index] == dd[index + 2], "").otherwise(dd[index]).alias(index)
def test_auto_mapper_concat_multiple_items_structs_different_elements( spark_session: SparkSession, ) -> None: # Arrange clean_spark_session(spark_session) spark_session.createDataFrame( [ (1, "Qureshi", "Imran"), (2, None, "Michael"), ], ["member_id", "last_name", "first_name"], ).createOrReplaceTempView("patients") source_df: DataFrame = spark_session.table("patients") # Act mapper = AutoMapper( view="members", source_view="patients", enable_schema_pruning=True).columns(dst2=AutoMapperList([ AutoMapperDataTypeComplexBase(a=A.column("first_name"), b=A.column("last_name")) ], ).concat( AutoMapperList([ AutoMapperDataTypeComplexBase(a=A.column("first_name"), c=A.column("last_name")), ], ))) assert isinstance(mapper, AutoMapper) sql_expressions: Dict[str, Column] = mapper.get_column_specs( source_df=source_df) for column_name, sql_expression in sql_expressions.items(): print(f"{column_name}: {sql_expression}") array1 = when( array( struct( col("b.first_name").alias("a"), col("b.last_name").alias("b"), lit(None).alias("c"), ), ).isNotNull(), filter( coalesce( array( struct( col("b.first_name").alias("a"), col("b.last_name").alias("b"), lit(None).alias("c"), ), ), array(), ), lambda x: x.isNotNull(), ), ) array2 = when( array( struct( col("b.first_name").alias("a"), lit(None).alias("b"), col("b.last_name").alias("c"), ), ).isNotNull(), filter( coalesce( array( struct( col("b.first_name").alias("a"), lit(None).alias("b"), col("b.last_name").alias("c"), ), ), array(), ), lambda x: x.isNotNull(), ), ) assert_compare_expressions(sql_expressions["dst2"], concat(array1, array2).alias("dst2")) result_df: DataFrame = mapper.transform(df=source_df) # Assert result_df.printSchema() result_df.show() assert (result_df.where("member_id == 1").select("dst2").collect()[0][0][0] [0] == "Imran") assert (result_df.where("member_id == 1").select("dst2").collect()[0][0][0] [1] == "Qureshi") assert (result_df.where("member_id == 2").select("dst2").collect()[0][0][0] [0] == "Michael") assert ( result_df.where("member_id == 2").select("dst2").collect()[0][0][0][1] is None)
             'School Not Found', 'School or City Wide Complaint', 'Bridge Highway Name',
             'Bridge Highway Direction', 'Road Ramp', 'Bridge Highway Segment',
             'Garage Lot Name', 'Ferry Direction', 'Ferry Terminal Name',
             'Latitude', 'Longitude']
df = df.select([name for name in df.schema.names if name not in drop_list])

# Replacing invalid zip codes with N/A. Zip codes should either be
# 5 digits or 5 digits followed by 4 digits.
df = df.withColumn('Incident Zip',
                   when(col('Incident Zip').rlike('^\d{5}(?:[-\s]\d{4})?$') != True, 'N/A')
                   .otherwise(df['Incident Zip']))

# Replacing invalid closed dates (before 2009) with N/A
years = [str(i) for i in range(2009, 2018)]
df = df.withColumn('Closed Date',
                   when(col('Closed Date').substr(7, 4).isin(years), col('Closed Date'))
                   .otherwise('N/A'))

# Replacing null values with "N/A"
for x in df.schema.names:
    # Basic replacement
    df = df.withColumn(x, when(col(x).isin("", "Unspecified", "0 Unspecified") != True, col(x)).otherwise('N/A'))

# Writing the cleaned dataframe to CSV for the comparison and analysis phase.
df.write.format('com.databricks.spark.csv').options(header='true').save('/user/sdv267/cleaned_311.csv')
# %%
# Test data samples
df_test.show(10)
df_train.printSchema()
df_train = df_train.withColumn("usefulCount", round(df_train["usefulCount"]).cast('integer'))

# %%
# Joining train and test data sets
df = df_train.join(df_test,
                   on=['uniqueID', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
                   how='left_outer')

# %%
# Computing the sentiment column based on rating
sentiment = when(col("rating") <= 5, 0).otherwise(1)
df = df.withColumn("sentiment", sentiment)
df = df.withColumn('length', length(df['review']))

# %% [markdown]
# ## Feature Transformation

# %%
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer
tokenizer = Tokenizer(inputCol="review", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text', outputCol='stop_tokens')
count_vec = CountVectorizer(inputCol='stop_tokens', outputCol='c_vec')
idf = IDF(inputCol="c_vec", outputCol="tf_idf")
pos_neg = StringIndexer(inputCol='sentiment', outputCol='label')
def defaultDebtRatioToThreshold(df):
    df = df.withColumn(
        'DebtRatio',
        F.when(F.col('DebtRatio') > 1.5, 1.5).otherwise(F.col('DebtRatio')))
    return df
## String encoding of categorical variables
cat_x_vars = ["term", "grade", "home_ownership", "pred_KM", "emp_length"]
#df2 = df  # backup in case of trouble
for cat_var in cat_x_vars:
    df = StringIndexer(inputCol=cat_var, outputCol=cat_var + 'Idx').fit(df).transform(df).drop(cat_var)
    df = df.withColumnRenamed(cat_var + 'Idx', cat_var)
#df.select(cat_x_vars).show(5)  # check

## Create y or target variables for neural networks
# probability/indicator for default
df = df.withColumn('probDef', F.when(df['loan_status'] == 1, 1.0).otherwise(0.0))  # default is 1, repaid is 0
# indicator for early repayment
df = df.withColumn(
    'probER',
    F.when((df['loan_status'] == 0) & (df['fracNumPmts'] < 1), 1.0).otherwise(0.0))
# indicator for on-schedule repayment can be inferred as probDef=probER=0,0

# visually:
# plot of timing of either default or eventual (not early) repayment
#df.filter((df['loan_status']==1)|(df.fracNumPmts >=1)).select(df.fracNumPmts).toPandas().plot.hist()
#plt.show()
# This is bi-modal, mostly low over 0,1 and then a spike at 1.
# plot of timing of repayment (whenever it happens)
#df.filter(df['loan_status']==0).select(df.fracNumPmts).toPandas().plot.hist()
#plt.show()
# This is more like a uniform over 0,1 + a spike at 1.
df = sc.read.parquet('../data/userdata1.parquet')
print(df)

# Handle duplicate values
print(df.drop_duplicates().count())

# Handling missing data
print(df.fillna(0).show())
print(df.dropna().show())

# fill missing values in specific columns
print(df.fillna({'cc': '6767119071901597'}).show())

# Changing data type in the DF
df1 = df.withColumn("salary", df["salary"].cast(FloatType()))
print(df1.show())
print(df1.printSchema())

# replace null values with mean salary
print(F.avg(df1.salary))
# df1 = df1.fillna()

# drop empty string literal and cast to integer
df = df.withColumn("cc", F.when(df.cc != '', df.cc).otherwise('0'))
df = df.withColumn("cc", df.cc.cast(IntegerType()))
print(df.printSchema())

# replace empty string literal with a default date
df = df.withColumn("birthdate", F.when(df.birthdate != '', df.birthdate).otherwise("05/05/2020"))
df = df.withColumn("birthdate", F.to_date(df.birthdate, 'MM/dd/yyyy'))
print(df.show())
def calculate_metrics(predictions, y, data_type):
    start_time4 = time.time()

    # Calculate ROC
    evaluator = BinaryClassificationEvaluator(labelCol=y, rawPredictionCol='probability')
    auroc = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})
    print('AUC calculated', auroc)

    selectedCols = predictions.select(F.col("probability"), F.col('prediction'), F.col(y)) \
        .rdd.map(lambda row: (float(row['probability'][1]), float(row['prediction']), float(row[y]))).collect()
    y_score, y_pred, y_true = zip(*selectedCols)

    # Calculate Accuracy
    accuracydf = predictions.withColumn('acc', F.when(predictions.prediction == predictions[y], 1).otherwise(0))
    accuracydf.createOrReplaceTempView("accuracyTable")
    RFaccuracy = spark.sql("select sum(acc)/count(1) as accuracy from accuracyTable").collect()[0][0]
    print('Accuracy calculated', RFaccuracy)

    # Build KS Table
    split1_udf = udf(lambda value: value[1].item(), DoubleType())
    if data_type in ['train', 'valid', 'test', 'oot1', 'oot2']:
        decileDF = predictions.select(y, split1_udf('probability').alias('probability'))
    else:
        decileDF = predictions.select(y, 'probability')
    decileDF = decileDF.withColumn('non_target', 1 - decileDF[y])
    window = Window.orderBy(desc("probability"))
    decileDF = decileDF.withColumn("rownum", F.row_number().over(window))
    decileDF.cache()
    decileDF = decileDF.withColumn("rownum", decileDF["rownum"].cast("double"))
    window2 = Window.orderBy("rownum")
    RFbucketedData = decileDF.withColumn("deciles", F.ntile(10).over(window2))
    RFbucketedData = RFbucketedData.withColumn('deciles', RFbucketedData['deciles'].cast("int"))
    RFbucketedData.cache()
    #a = RFbucketedData.count()
    #print(RFbucketedData.show())

    ## to pandas from here
    print('KS calculation starting')
    target_cnt = RFbucketedData.groupBy('deciles').agg(F.sum(y).alias('target')).toPandas()
    non_target_cnt = RFbucketedData.groupBy('deciles').agg(F.sum("non_target").alias('non_target')).toPandas()
    overall_cnt = RFbucketedData.groupBy('deciles').count().alias('Total').toPandas()
    overall_cnt = overall_cnt.merge(target_cnt, on='deciles', how='inner').merge(non_target_cnt, on='deciles', how='inner')
    overall_cnt = overall_cnt.sort_values(by='deciles', ascending=True)
    overall_cnt['Pct_target'] = (overall_cnt['target'] / overall_cnt['count']) * 100
    overall_cnt['cum_target'] = overall_cnt.target.cumsum()
    overall_cnt['cum_non_target'] = overall_cnt.non_target.cumsum()
    overall_cnt['%Dist_Target'] = (overall_cnt['cum_target'] / overall_cnt.target.sum()) * 100
    overall_cnt['%Dist_non_Target'] = (overall_cnt['cum_non_target'] / overall_cnt.non_target.sum()) * 100
    overall_cnt['spread'] = builtins.abs(overall_cnt['%Dist_Target'] - overall_cnt['%Dist_non_Target'])
    decile_table = overall_cnt.round(2)
    print("KS_Value =", builtins.round(overall_cnt.spread.max(), 2))
    #print "Test Error =", builtin.round((1.0 - RFaccuracy),3)
    #print "Accuracy =", builtin.round(RFaccuracy,3)
    #print "AUC=", builtin.round(auroc,3)
    decileDF.unpersist()
    RFbucketedData.unpersist()
    print("Metrics calculation process Completed in : " + " %s seconds" % (time.time() - start_time4))
    return auroc, RFaccuracy, builtins.round(overall_cnt.spread.max(), 2), y_score, y_pred, y_true, overall_cnt
def load_test_silver_data():
    return load_test_data().withColumn(
        "nulls",
        when(col("tz").isNull(), array(lit(11))).otherwise(array())
    ).cache()
trainFeatPos = [1, 3]
trainFeatPos = trainFeatPos + [k for k in range(5, 40)]
trainFeatPos = trainFeatPos + [k for k in xrange(43, 55)]
trainLabelPos = 42 + 13
delta = 0.99
#trainDF,temp = trainDF.randomSplit([delta,1-delta])
leakageParser = ParseDF([2, 4], trainLabelPos)
leakageTrain = leakageParser.parse_raw_df(trainDF).drop("text").drop("PeopleID")
leakageParser = ParseDF([2, 4], 42)
leakageTest = leakageParser.parse_raw_df(testDF, train=False).drop("text").drop("PeopleID")
leakageTest = leakageTest.withColumnRenamed("label", "activity_id")
leakageTrain = leakageTrain.groupBy("features").avg().withColumnRenamed("avg(label)", "leakage")
leakageTrain = leakageTrain.select("features",
                                   when(leakageTrain.leakage >= 0.5, 1).otherwise(0).alias("leak")).drop("leakage")
leakageTrain.printSchema()
leakageTrain.show(5)
leakageTest.show(3)
print leakageTest.count()
leakageTest = leakageTest.join(leakageTrain, "features", "left_outer").drop("features")
print leakageTest.count()
leakageTest.show(3)

Parser = ParseDF(trainFeatPos, trainLabelPos)
trainDF = Parser.parse_raw_df(trainDF).drop("text")
testParser = ParseDF(trainFeatPos, 42)
testDF = testParser.parse_raw_df(testDF, train=False).drop("text")
testDF = testDF.withColumnRenamed("label", "activity_id")
def execute_process(options): spark = (pyspark.sql.session.SparkSession.builder.appName( "Radar").enableHiveSupport().getOrCreate()) spark.sql(""" SELECT d.docu_dk, v.vist_orgi_orga_dk as orgao_id, a.pcao_dt_andamento, s.stao_tppr_dk FROM {schema}.mcpr_documento d join {schema}.mcpr_vista v on v.vist_docu_dk = d.docu_dk join {schema}.mcpr_andamento a on a.pcao_vist_dk = v.vist_dk join {schema}.mcpr_sub_andamento s on s.stao_pcao_dk = a.pcao_dk WHERE to_date(pcao_dt_andamento) > to_date(date_sub(current_timestamp(), {days_ago})) AND to_date(pcao_dt_andamento) <= to_date(current_timestamp()) AND pcao_dt_cancelamento IS NULL AND docu_tpst_dk != 11 GROUP BY docu_dk, v.vist_orgi_orga_dk, a.pcao_dt_andamento, s.stao_tppr_dk """.format( schema=options["schema_exadata"], days_ago=options["days_ago"])).createOrReplaceTempView("andamentos") spark.catalog.cacheTable("andamentos") spark.sql(""" select docu_dk, orgao_id, pcao_dt_andamento, stao_tppr_dk from andamentos where stao_tppr_dk in (7912,6548,6326,6681,6678,6645,6682,6680,6679,6644, 6668,6666,6665,6669,6667,6664,6655,6662,6659,6658, 6663,6661,6660,6657,6670,6676,6674,6673,6677,6675, 6672,6018,6341,6338,6019,6017,6591,6339,6553,7871, 6343,6340,6342,6021,6334,6331,6022,6020,6593,6332, 7872,6336,6333,6335,7745,6346,6345,6015,6016,6325, 6327,6328,6329,6330,6337,6344,6656,6671,7869,7870, 6324,6322,6011,6012,6013,1092,1094,1095,6251,7834, 6007) """).createOrReplaceTempView("andamentos_codigos") # -1 indica cancelamento de indeferimento (cancela 1) # -2 indica desarquivamento (cancela arquivamento - 2, 4, 5) # -3 indica indeferimento (ou seja, cancela instauracao 3) cancela_indeferimento = spark.sql(""" select docu_dk, orgao_id, pcao_dt_andamento, -1 as tipo_andamento from andamentos where stao_tppr_dk = 6007 union all select docu_dk, orgao_id, pcao_dt_andamento, -2 as tipo_andamento from andamentos where stao_tppr_dk IN ( 6075, 1028, 6798, 7245, 6307, 1027, 7803, 6003, 7802, 7801, 6004, 6696) union all select docu_dk, orgao_id, pcao_dt_andamento, -3 as tipo_andamento from andamentos where stao_tppr_dk = 6322 """) # cancelamento de indeferimento conta como instauracao # tipo_andamento funciona como hierarquia para priorizar certos tipos # quando ocorrem no mesmo dia: # Aj.Acao (5) > TAC (4) > Instauracao (3) > Arquivamento (2) > Indeferimento (1) documento_andamentos = spark.sql(""" select docu_dk, orgao_id, CASE WHEN stao_tppr_dk IN (7912,6548,6681,6678,6645,6682,6680, 6679,6644,6668,6666,6665,6669,6667,6664, 6662,6659,6658,6663,6661,6660,6657, 6670,6676,6674,6673,6677,6675,6672,6018, 6341,6338,6019,6017,6591,6339,6553,7871, 6343,6340,6342,6021,6334,6331,6022,6020, 6593,6332,7872,6336,6333,6335,7745,6346, 6345,6015,6016,6325,6327,6328,6329,6330, 6337,6344,6656,6671,7869,7870,6324,7834) THEN 2 WHEN stao_tppr_dk = 6322 THEN 1 WHEN stao_tppr_dk IN (6011, 6012, 6013, 1092, 1094, 1095, 6007) THEN 3 WHEN stao_tppr_dk IN (6655, 6326) THEN 4 WHEN stao_tppr_dk = 6251 THEN 5 end tipo_andamento, pcao_dt_andamento from andamentos_codigos """) cancela_df = cancela_indeferimento.groupby([ "orgao_id", "docu_dk", "pcao_dt_andamento" ]).agg(max("tipo_andamento").alias("tipo_andamento")) documento_df = documento_andamentos.groupby([ "orgao_id", "docu_dk", "pcao_dt_andamento" ]).agg(max("tipo_andamento").alias("tipo_andamento")) final_df = ( documento_df\ .withColumn("group_type", when(col("tipo_andamento").isin(2, 4, 5), 2).otherwise(col("tipo_andamento"))) .alias("d") .join( cancela_df.alias("c"), (col("d.docu_dk") == col("c.docu_dk")) & (col("c.pcao_dt_andamento") 
>= col("d.pcao_dt_andamento")) & (col("c.tipo_andamento") + col("d.group_type") == 0), "left", ) .where("c.tipo_andamento is null") .groupby(["d.orgao_id"]) .pivot("d.tipo_andamento") .agg(count("d.tipo_andamento")) .na.fill(0) .withColumnRenamed("2", "arquivamento") .withColumnRenamed("1", "indeferimento") .withColumnRenamed("3", "instauracao") .withColumnRenamed("4", "tac") .withColumnRenamed("5", "acao") ) final_df.createOrReplaceTempView("final_andamentos") spark.sql(""" SELECT fa.*, ap.cod_pct, ap.pacote_atribuicao, ap.orgi_nm_orgao nm_orgao FROM final_andamentos fa INNER JOIN {schema_aux}.atualizacao_pj_pacote ap ON ap.id_orgao = fa.orgao_id """.format(schema_aux=options["schema_exadata_aux"]) ).createOrReplaceTempView("final_com_pacote") max_pacote = spark.sql(""" SELECT cod_pct, nm_orgao, max(arquivamento) as max_arq, max(indeferimento) as max_indef, max(instauracao) as max_inst, max(tac) as max_tac, max(acao) as max_acoes FROM final_com_pacote fp GROUP BY cod_pct, nm_orgao """) w = Window.partitionBy("cod_pct") orgao_max_arq = (max_pacote.withColumn( "m_max_arq", max("max_arq").over(w)).where( col("max_arq") == col("m_max_arq")).select( ["cod_pct", "nm_orgao"]).groupBy("cod_pct").agg( concat_ws(", ", collect_list("nm_orgao")).alias( "nm_max_arquivamentos")).withColumnRenamed( "cod_pct", "arq_cod_pct")) orgao_max_indef = (max_pacote.withColumn( "m_max_indef", max("max_indef").over(w)).where( col("max_indef") == col("m_max_indef")).select( ["cod_pct", "nm_orgao"]).groupBy("cod_pct").agg( concat_ws(", ", collect_list("nm_orgao")).alias( "nm_max_indeferimentos")).withColumnRenamed( "cod_pct", "indef_cod_pct")) orgao_max_inst = (max_pacote.withColumn( "m_max_inst", max("max_inst").over(w)).where( col("max_inst") == col("m_max_inst")).select( ["cod_pct", "nm_orgao"]).groupBy("cod_pct").agg( concat_ws(", ", collect_list("nm_orgao")).alias( "nm_max_instauracoes")).withColumnRenamed( "cod_pct", "inst_cod_pct")) orgao_max_tac = (max_pacote.withColumn( "m_max_tac", max("max_tac").over(w)).where( col("max_tac") == col("m_max_tac")).select([ "cod_pct", "nm_orgao" ]).groupBy("cod_pct").agg( concat_ws(", ", collect_list("nm_orgao")).alias( "nm_max_tac")).withColumnRenamed("cod_pct", "tac_cod_pct")) orgao_max_acoes = (max_pacote.withColumn( "m_max_acoes", max("max_acoes").over(w)).where( col("max_acoes") == col("m_max_acoes")).select([ "cod_pct", "nm_orgao" ]).groupBy("cod_pct").agg( concat_ws(", ", collect_list("nm_orgao")).alias( "nm_max_acoes")).withColumnRenamed("cod_pct", "acoes_cod_pct")) spark.sql(""" SELECT cod_pct, max(arquivamento) as max_pacote_arquivamentos, max(indeferimento) as max_pacote_indeferimentos, max(instauracao) as max_pacote_instauracoes, max(tac) as max_pacote_tac, max(acao) as max_pacote_acoes, percentile(arquivamento, 0.5) as med_pacote_arquivamentos, percentile(indeferimento, 0.5) as med_pacote_indeferimentos, percentile(instauracao, 0.5) as med_pacote_instauracoes, percentile(tac, 0.5) as med_pacote_tac, percentile(acao, 0.5) as med_pacote_acoes FROM final_com_pacote GROUP BY cod_pct """).createOrReplaceTempView("stats_pacote") stats = (spark.sql(""" SELECT fp.cod_pct, fp.pacote_atribuicao, fp.orgao_id, arquivamento as nr_arquivamentos, indeferimento as nr_indeferimentos, instauracao as nr_instauracaoes, tac as nr_tac, acao as nr_acoes, max_pacote_arquivamentos, max_pacote_indeferimentos, max_pacote_instauracoes, max_pacote_tac, max_pacote_acoes, arquivamento / max_pacote_arquivamentos as perc_arquivamentos, indeferimento / max_pacote_indeferimentos as 
perc_indeferimentos, instauracao / max_pacote_instauracoes as perc_instauracaoes, tac / max_pacote_tac as perc_tac, acao / max_pacote_acoes as perc_acoes, med_pacote_arquivamentos, med_pacote_indeferimentos, med_pacote_instauracoes, med_pacote_tac, med_pacote_acoes, (arquivamento - med_pacote_arquivamentos) / med_pacote_arquivamentos as var_med_arquivaentos, (indeferimento - med_pacote_indeferimentos) / med_pacote_indeferimentos as var_med_indeferimentos, (instauracao - med_pacote_instauracoes) / med_pacote_instauracoes as var_med_instauracoes, (tac - med_pacote_tac) / med_pacote_tac as var_med_tac, (acao - med_pacote_acoes) / med_pacote_acoes as var_med_acoes, current_timestamp() as dt_calculo FROM final_com_pacote fp INNER JOIN stats_pacote sp ON fp.cod_pct = sp.cod_pct """).join( orgao_max_arq, col("cod_pct") == col("arq_cod_pct")).drop("arq_cod_pct").join( orgao_max_indef, col("cod_pct") == col("indef_cod_pct")).drop("indef_cod_pct").join( orgao_max_inst, col("cod_pct") == col("inst_cod_pct")).drop( "inst_cod_pct").join(orgao_max_tac, col("cod_pct") == col("tac_cod_pct")). drop("tac_cod_pct").join( orgao_max_acoes, col("cod_pct") == col("acoes_cod_pct")).drop("acoes_cod_pct")) table_name = options['table_name'] table_name = "{}.{}".format(options["schema_exadata_aux"], table_name) stats.write.mode("overwrite").saveAsTable("temp_table_radar_performance") temp_table = spark.table("temp_table_radar_performance") temp_table.write.mode("overwrite").saveAsTable(table_name) spark.sql("drop table temp_table_radar_performance") execute_compute_stats(table_name)
def usage(transform_context, record_store_df): """component which groups together record store records by provided group by columns list, sorts within the group by event timestamp field, applies group stats udf and returns the latest quantity as a instance usage dataframe This component does groups records by event_type (a.k.a metric name) and expects two kinds of records in record_store data total quantity records - the total available quantity e.g. cpu.total_logical_cores idle perc records - percentage that is idle e.g. cpu.idle_perc To calculate the utilized quantity this component uses following formula: utilized quantity = ceil((100 - idle_perc) * total_quantity / 100) """ sql_context = SQLContext.getOrCreate(record_store_df.rdd.context) transform_spec_df = transform_context.transform_spec_df_info # get rollup operation (sum, max, avg, min) agg_params = transform_spec_df.select( "aggregation_params_map.usage_fetch_operation"). \ collect()[0].asDict() usage_fetch_operation = agg_params["usage_fetch_operation"] # check if operation is valid if not FetchQuantityUtil. \ _is_valid_fetch_quantity_util_operation(usage_fetch_operation): raise FetchQuantityUtilException( "Operation %s is not supported" % usage_fetch_operation) # get the quantities for idle perc and quantity instance_usage_df = FetchQuantity().usage( transform_context, record_store_df) # get aggregation period for instance usage dataframe agg_params = transform_spec_df.select( "aggregation_params_map.aggregation_period").collect()[0].asDict() aggregation_period = agg_params["aggregation_period"] group_by_period_list = ComponentUtils.\ _get_instance_group_by_period_list(aggregation_period) # get what we want to group by agg_params = transform_spec_df.select( "aggregation_params_map.aggregation_group_by_list").\ collect()[0].asDict() aggregation_group_by_list = agg_params["aggregation_group_by_list"] # group by columns list group_by_columns_list = group_by_period_list + \ aggregation_group_by_list # get quantity event type agg_params = transform_spec_df.select( "aggregation_params_map.usage_fetch_util_quantity_event_type").\ collect()[0].asDict() usage_fetch_util_quantity_event_type = \ agg_params["usage_fetch_util_quantity_event_type"] # check if driver parameter is provided if usage_fetch_util_quantity_event_type is None or \ usage_fetch_util_quantity_event_type == "": raise FetchQuantityUtilException( "Driver parameter '%s' is missing" % "usage_fetch_util_quantity_event_type") # get idle perc event type agg_params = transform_spec_df.select( "aggregation_params_map.usage_fetch_util_idle_perc_event_type").\ collect()[0].asDict() usage_fetch_util_idle_perc_event_type = \ agg_params["usage_fetch_util_idle_perc_event_type"] # check if driver parameter is provided if usage_fetch_util_idle_perc_event_type is None or \ usage_fetch_util_idle_perc_event_type == "": raise FetchQuantityUtilException( "Driver parameter '%s' is missing" % "usage_fetch_util_idle_perc_event_type") # get quantity records dataframe event_type_quantity_clause = "processing_meta.event_type='%s'" \ % usage_fetch_util_quantity_event_type quantity_df = instance_usage_df.select('*').where( event_type_quantity_clause).alias("quantity_df_alias") # get idle perc records dataframe event_type_idle_perc_clause = "processing_meta.event_type='%s'" \ % usage_fetch_util_idle_perc_event_type idle_perc_df = instance_usage_df.select('*').where( event_type_idle_perc_clause).alias("idle_perc_df_alias") # join quantity records with idle perc records # create a join condition 
without the event_type cond = [item for item in group_by_columns_list if item != 'event_type'] quant_idle_perc_df = quantity_df.join(idle_perc_df, cond, 'left') # # Find utilized quantity based on idle percentage # # utilized quantity = (100 - idle_perc) * total_quantity / 100 # quant_idle_perc_calc_df = quant_idle_perc_df.select( col("quantity_df_alias.*"), when(col("idle_perc_df_alias.quantity") != 0.0, ceil(((100.0 - col( "idle_perc_df_alias.quantity"))) * col( "quantity_df_alias.quantity") / 100.0)) .otherwise(col("quantity_df_alias.quantity")) .alias("utilized_quantity"), col("quantity_df_alias.quantity") .alias("total_quantity"), col("idle_perc_df_alias.quantity") .alias("idle_perc")) instance_usage_json_rdd = \ quant_idle_perc_calc_df.rdd.map( FetchQuantityUtil._format_quantity_util) instance_usage_df = \ InstanceUsageUtils.create_df_from_json_rdd(sql_context, instance_usage_json_rdd) return instance_usage_df
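# Toy illustration of the utilized-quantity rule described in the docstring of
# the usage() component above: utilized = ceil((100 - idle_perc) * total / 100),
# falling back to the raw total when idle_perc is 0. The column names and
# sample rows here are invented for the sketch, not the component's real schema.
from pyspark.sql import SparkSession
from pyspark.sql.functions import ceil, col, when

spark = SparkSession.builder.getOrCreate()
metrics = spark.createDataFrame(
    [("host-1", 16.0, 75.0), ("host-2", 8.0, 0.0)],
    ["host", "total_quantity", "idle_perc"],
)
metrics.withColumn(
    "utilized_quantity",
    when(col("idle_perc") != 0.0,
         ceil((100.0 - col("idle_perc")) * col("total_quantity") / 100.0))
    .otherwise(col("total_quantity")),
).show()
# host-1 -> ceil(25 * 16 / 100) = 4 ; host-2 -> 8.0 (fallback)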
# Creating spark session
spark = SparkSession.builder.appName("DecisionTree App").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

# Loading the data
data = spark.read.format("csv").option("header", True) \
    .option("inferSchema", True) \
    .option("delimiter", ",") \
    .load("adult.data")
data.printSchema()

data = data.withColumn(
    "X", F.when(F.col("X") == ' <=50K', 0).when(F.col("X") == ' >50K', 1))
data = data.withColumnRenamed("X", "label")
data = data.select(data.label.cast("double"), "age", "education-num", "hours-per-week")
data.show()

assembler = VectorAssembler(inputCols=data.columns[1:], outputCol="features")
data = assembler.transform(data)
data.show()

# Splitting the data into training and test sets
training, test = data.select("label", "features").randomSplit([0.70, 0.30])

# Create Random Forest model and fit the model with training dataset
rf = RandomForestClassifier()
def rdd_to_recordstore(rdd_transform_context_rdd): if rdd_transform_context_rdd.isEmpty(): MonMetricsKafkaProcessor.log_debug( "rdd_to_recordstore: nothing to process...") else: sql_context = SQLContext(rdd_transform_context_rdd.context) data_driven_specs_repo = DataDrivenSpecsRepoFactory.\ get_data_driven_specs_repo() pre_transform_specs_df = data_driven_specs_repo.\ get_data_driven_specs( sql_context=sql_context, data_driven_spec_type=DataDrivenSpecsRepo. pre_transform_specs_type) # # extract second column containing raw metric data # raw_mon_metrics = rdd_transform_context_rdd.map( lambda nt: nt.rdd_info[1]) # # convert raw metric data rdd to dataframe rdd # raw_mon_metrics_df = \ MonMetricUtils.create_mon_metrics_df_from_json_rdd( sql_context, raw_mon_metrics) # # filter out unwanted metrics and keep metrics we are interested in # cond = [ raw_mon_metrics_df.metric.name == pre_transform_specs_df.event_type] filtered_metrics_df = raw_mon_metrics_df.join( pre_transform_specs_df, cond) # # validate filtered metrics to check if required fields # are present and not empty # In order to be able to apply filter function had to convert # data frame rdd to normal rdd. After validation the rdd is # converted back to dataframe rdd # # FIXME: find a way to apply filter function on dataframe rdd data validated_mon_metrics_rdd = filtered_metrics_df.rdd.filter( MonMetricsKafkaProcessor._validate_raw_mon_metrics) validated_mon_metrics_df = sql_context.createDataFrame( validated_mon_metrics_rdd, filtered_metrics_df.schema) # # record generator # generate a new intermediate metric record if a given metric # metric_id_list, in pre_transform_specs table has several # intermediate metrics defined. # intermediate metrics are used as a convenient way to # process (aggregated) metric in mutiple ways by making a copy # of the source data for each processing # gen_mon_metrics_df = validated_mon_metrics_df.select( validated_mon_metrics_df.meta, validated_mon_metrics_df.metric, validated_mon_metrics_df.event_processing_params, validated_mon_metrics_df.event_type, explode(validated_mon_metrics_df.metric_id_list).alias( "this_metric_id"), validated_mon_metrics_df.service_id) # # transform metrics data to record_store format # record store format is the common format which will serve as # source to aggregation processing. # converting the metric to common standard format helps in writing # generic aggregation routines driven by configuration parameters # and can be reused # record_store_df = gen_mon_metrics_df.select( (gen_mon_metrics_df.metric.timestamp / 1000).alias( "event_timestamp_unix"), from_unixtime( gen_mon_metrics_df.metric.timestamp / 1000).alias( "event_timestamp_string"), gen_mon_metrics_df.event_type.alias("event_type"), gen_mon_metrics_df.event_type.alias("event_quantity_name"), (gen_mon_metrics_df.metric.value / 1.0).alias( "event_quantity"), when(gen_mon_metrics_df.metric.dimensions.state != '', gen_mon_metrics_df.metric.dimensions.state).otherwise( 'NA').alias("event_status"), lit('1.0').alias('event_version'), lit('metrics').alias("record_type"), # resource_uuid when(gen_mon_metrics_df.metric.dimensions.instanceId != '', gen_mon_metrics_df.metric.dimensions.instanceId).when( gen_mon_metrics_df.metric.dimensions.resource_id != '', gen_mon_metrics_df.metric.dimensions.resource_id). 
otherwise('NA').alias("resource_uuid"), when(gen_mon_metrics_df.metric.dimensions.tenantId != '', gen_mon_metrics_df.metric.dimensions.tenantId).when( gen_mon_metrics_df.metric.dimensions.tenant_id != '', gen_mon_metrics_df.metric.dimensions.tenant_id).when( gen_mon_metrics_df.metric.dimensions.project_id != '', gen_mon_metrics_df.metric.dimensions.project_id).otherwise( 'NA').alias("tenant_id"), when(gen_mon_metrics_df.metric.dimensions.mount != '', gen_mon_metrics_df.metric.dimensions.mount).otherwise( 'NA').alias("mount"), when(gen_mon_metrics_df.metric.dimensions.device != '', gen_mon_metrics_df.metric.dimensions.device).otherwise( 'NA').alias("device"), when(gen_mon_metrics_df.meta.userId != '', gen_mon_metrics_df.meta.userId).otherwise('NA').alias( "user_id"), when(gen_mon_metrics_df.meta.region != '', gen_mon_metrics_df.meta.region).when( gen_mon_metrics_df.event_processing_params .set_default_region_to != '', gen_mon_metrics_df.event_processing_params .set_default_region_to).otherwise( 'NA').alias("region"), when(gen_mon_metrics_df.meta.zone != '', gen_mon_metrics_df.meta.zone).when( gen_mon_metrics_df.event_processing_params .set_default_zone_to != '', gen_mon_metrics_df.event_processing_params .set_default_zone_to).otherwise( 'NA').alias("zone"), when(gen_mon_metrics_df.metric.dimensions.hostname != '', gen_mon_metrics_df.metric.dimensions.hostname).when( gen_mon_metrics_df.metric.value_meta.host != '', gen_mon_metrics_df.metric.value_meta.host).otherwise( 'NA').alias("host"), when(gen_mon_metrics_df.service_id != '', gen_mon_metrics_df.service_id).otherwise( 'NA').alias("service_group"), when(gen_mon_metrics_df.service_id != '', gen_mon_metrics_df.service_id).otherwise( 'NA').alias("service_id"), from_unixtime(gen_mon_metrics_df.metric.timestamp / 1000, 'yyyy-MM-dd').alias("event_date"), from_unixtime(gen_mon_metrics_df.metric.timestamp / 1000, 'HH').alias("event_hour"), from_unixtime(gen_mon_metrics_df.metric.timestamp / 1000, 'mm').alias("event_minute"), from_unixtime(gen_mon_metrics_df.metric.timestamp / 1000, 'ss').alias("event_second"), gen_mon_metrics_df.this_metric_id.alias("metric_group"), gen_mon_metrics_df.this_metric_id.alias("metric_id")) # # get transform context # rdd_transform_context = rdd_transform_context_rdd.first() transform_context = rdd_transform_context.transform_context_info # # cache record store rdd # if cfg.CONF.service.enable_record_store_df_cache: storage_level_prop = \ cfg.CONF.service.record_store_df_cache_storage_level storage_level = StorageUtils.get_storage_level( storage_level_prop) record_store_df.persist(storage_level) # # start processing metrics available in record_store data # MonMetricsKafkaProcessor.process_metrics(transform_context, record_store_df) # remove df from cache if cfg.CONF.service.enable_record_store_df_cache: record_store_df.unpersist() # # extract kafka offsets and batch processing time # stored in transform_context and save offsets # offsets = transform_context.offset_info # batch time batch_time_info = \ transform_context.batch_time_info MonMetricsKafkaProcessor.save_kafka_offsets( offsets, rdd_transform_context_rdd.context.appName, batch_time_info) # call pre hourly processor, if its time to run if (cfg.CONF.stage_processors.pre_hourly_processor_enabled is True and PreHourlyProcessor.is_time_to_run( batch_time_info)): PreHourlyProcessor.run_processor( record_store_df.rdd.context, batch_time_info)
def __init__( self, dataframe: DataFrame, spark: SparkSession, profiling_config: DataLakeProfilerConfig, report: DataLakeSourceReport, file_path: str, ): self.spark = spark self.dataframe = dataframe self.analyzer = AnalysisRunner(spark).onData(dataframe) self.column_specs = [] self.row_count = dataframe.count() self.profiling_config = profiling_config self.file_path = file_path self.columns_to_profile = [] self.ignored_columns = [] self.profile = DatasetProfileClass(timestampMillis=get_sys_time()) self.report = report self.profile.rowCount = self.row_count self.profile.columnCount = len(dataframe.columns) column_types = {x.name: x.dataType for x in dataframe.schema.fields} if self.profiling_config.profile_table_level_only: return # get column distinct counts for column in dataframe.columns: if not self.profiling_config.allow_deny_patterns.allowed(column): self.ignored_columns.append(column) continue self.columns_to_profile.append(column) # Normal CountDistinct is ridiculously slow self.analyzer.addAnalyzer(ApproxCountDistinct(column)) if self.profiling_config.max_number_of_fields_to_profile is not None: if (len(self.columns_to_profile) > self.profiling_config.max_number_of_fields_to_profile): columns_being_dropped = self.columns_to_profile[ self.profiling_config.max_number_of_fields_to_profile:] self.columns_to_profile = self.columns_to_profile[:self. profiling_config . max_number_of_fields_to_profile] self.report.report_file_dropped( f"The max_number_of_fields_to_profile={self.profiling_config.max_number_of_fields_to_profile} reached. Profile of columns {self.file_path}({', '.join(sorted(columns_being_dropped))})" ) analysis_result = self.analyzer.run() analysis_metrics = AnalyzerContext.successMetricsAsJson( self.spark, analysis_result) # reshape distinct counts into dictionary column_distinct_counts = { x["instance"]: int(x["value"]) for x in analysis_metrics if x["name"] == "ApproxCountDistinct" } select_numeric_null_counts = [ count(when( isnan(c) | col(c).isNull(), c, )).alias(c) for c in self.columns_to_profile if column_types[column] in [DoubleType, FloatType] ] # PySpark doesn't support isnan() on non-float/double columns select_nonnumeric_null_counts = [ count(when( col(c).isNull(), c, )).alias(c) for c in self.columns_to_profile if column_types[column] not in [DoubleType, FloatType] ] null_counts = dataframe.select(select_numeric_null_counts + select_nonnumeric_null_counts) column_null_counts = null_counts.toPandas().T[0].to_dict() column_null_fractions = { c: column_null_counts[c] / self.row_count for c in self.columns_to_profile } column_nonnull_counts = { c: self.row_count - column_null_counts[c] for c in self.columns_to_profile } column_unique_proportions = { c: (column_distinct_counts[c] / column_nonnull_counts[c] if column_nonnull_counts[c] > 0 else 0) for c in self.columns_to_profile } if self.profiling_config.include_field_sample_values: # take sample and convert to Pandas DataFrame if self.row_count < NUM_SAMPLE_ROWS: # if row count is less than number to sample, just take all rows rdd_sample = dataframe.rdd.take(self.row_count) else: rdd_sample = dataframe.rdd.takeSample(False, NUM_SAMPLE_ROWS, seed=0) # init column specs with profiles for column in self.columns_to_profile: column_profile = DatasetFieldProfileClass(fieldPath=column) column_spec = _SingleColumnSpec(column, column_profile) column_profile.uniqueCount = column_distinct_counts.get(column) column_profile.uniqueProportion = column_unique_proportions.get( column) column_profile.nullCount = 
column_null_counts.get(column) column_profile.nullProportion = column_null_fractions.get(column) if self.profiling_config.include_field_sample_values: column_profile.sampleValues = sorted( [str(x[column]) for x in rdd_sample]) column_spec.type_ = column_types[column] column_spec.cardinality = _convert_to_cardinality( column_distinct_counts[column], column_null_fractions[column], ) self.column_specs.append(column_spec)
def tune_ALS_NLP(spark, train_data, validation_data, val_true_list, maxIter, regParams, ranks, review_val_predictions): # initial min_error = float('inf') best_iter1 = -1 best_rank1 = -1 best_regularization1 = 0 best_model_rmse = None max_map = 0.0 best_iter2 = -1 best_rank2 = -1 best_regularization2 = 0 best_model_map = None for iteration in maxIter: for current_rank in ranks: for reg in regParams: als=ALS(maxIter=iteration,regParam=reg,rank=current_rank, \ userCol='user_id',itemCol='book_id',ratingCol='rating', \ coldStartStrategy="drop",nonnegative=True) als_model = als.fit(train_data) predictions = als_model.transform(validation_data) review_predictions = review_val_predictions.withColumnRenamed('prediction','review_prediction') als_predictions = predictions.withColumnRenamed('prediction','als_prediction') total_predictions = als_predictions.join(review_predictions,['user_id','book_id','rating'],'outer') total_predictions = total_predictions.withColumn('total_prediction', \ when(total_predictions['review_prediction'].isNotNull(), \ total_predictions['review_prediction']) \ .otherwise(total_predictions['als_prediction'])) window = Window.partitionBy(total_predictions['user_id']).orderBy(total_predictions['total_prediction'].desc()) top_predictions = total_predictions.select('*', rank().over(window).alias('row_num')).filter(col('row_num') <= 500) # rmse evaluator=RegressionEvaluator(metricName='rmse', labelCol='rating',predictionCol='total_prediction') rmse = evaluator.evaluate(top_predictions) if rmse < min_error: min_error = rmse best_rank1 = current_rank best_regularization1 = reg best_iter1 = iteration best_model_rmse = als_model # MAP current_map = MAP.getMAP(top_predictions, val_true_list) if current_map > max_map: max_map = current_map best_rank2 = current_rank best_regularization2 = reg best_iter2 = iteration best_model_map = als_model print('{} latent factors and regularization = {} with maxIter {}: ' 'validation RMSE is {}' 'validation MAP is {}' .format(current_rank, reg, iteration, rmse, current_map)) with open('train05_review_eval.csv', 'ab') as f: np.savetxt(f, [np.array([iteration, current_rank, reg, rmse, current_map])],delimiter=",") print('\nThe best model select by RMSE has {} latent factors and ' 'regularization = {}'' with maxIter = {}: RMSE = {}'.format(best_rank1, best_regularization1, best_iter1, min_error)) print('\nThe best model select by MAP has {} latent factors and ' 'regularization = {}'' with maxIter = {}: MAP = {}'.format(best_rank2, best_regularization2, best_iter2, max_map)) return best_model_rmse,best_model_map
def transform_predictions_missing_values(dataframe): df_transformed_null = dataframe.select( [func.count(func.when(func.isnan(c) | func.isnull(c), c)).alias(c) for (c, c_type) in dataframe.dtypes]) return df_transformed_null
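A minimal usage sketch for transform_predictions_missing_values above, not part of the original snippet; it assumes a SparkSession is available and that pyspark.sql.functions is imported as func (as the helper expects), and the sample column names are purely illustrative.

# Hedged usage sketch (assumption: `func` is pyspark.sql.functions, as the helper expects).
from pyspark.sql import SparkSession, functions as func

spark = SparkSession.builder.getOrCreate()
preds = spark.createDataFrame(
    [(1.0, None), (float("nan"), 2.0), (3.0, 4.0)], ["score", "target"])

# Returns a single-row DataFrame with one NaN/NULL count per input column
# (score=1, target=1 for the sample rows above).
transform_predictions_missing_values(preds).show()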
def Validate(ngrams \ , sampleSizes \ , ctxSize \ , sqc \ , seqs \ , outFile \ , minval \ , maxval \ , avg \ , nlines): accuracy = [] gramSize = GramSize(ctxSize, lookahead) c1 = (((maxval - minval) * 1.0) / nlines) / avg c2 = ((minval * 1.0) / nlines) / avg print seqs.count() ngrams = ngrams.repartition(1 << nPartLog) ngrams.cache() #we will validate separately for each vector size for vecSize in vecSizes: print '======TESTING FOR VECTOR SIZE', vecSize #start fresh old_ngrams = ngrams ngrams = ngrams.withColumn('correct', lit(0)) #use models from each sample modelId = 0 for sampleSize in sampleSizes: w2v = Word2VecModel.load(w2vFile(outDir, ctxSize, sampleSize, vecSize)) lrmodels = [] for dim in range(0, vecSize): lrmodels.append(LinearRegressionModel.load(lrmFile(outDir, ctxSize, sampleSize, vecSize, dim))) success = 0 fail = 0 unopt = 0 #add columns to store model success and failure modelSucc = 'succ_' + str(modelId) modelFail = 'fail_' + str(modelId) modelUnopt = 'unopt_' + str(modelId) seqs = seqs.withColumn(modelSucc, lit(0)) \ .withColumn(modelFail, lit(0)) \ .withColumn(modelUnopt, lit(0)) modelId = modelId + 1 ngrams = ngrams \ .withColumn('predSeq', lit('')) #create initial feature vector #transform each word into a cluster center words, d, centers = ClusterWords(w2v \ , seqs \ ) #record correctness for this model only old_ngrams = ngrams ngrams = ngrams.withColumn('sample_correct', lit(0)).withColumn('sample_confi', lit(1.0)) for nextPos in range(0,lookahead): #build the feature vector ngrams = BuildSubstringFeature(ngrams, w2v, nextPos, nextPos + ctxSize, ctxSize, lookahead,) #build the prediction vector ngrams = BuildPredictionVector(ngrams, lrmodels, ctxSize, vecSize) #now assign a cluster id to each prediction vector old_ngrams = ngrams ngrams = centers.transform(ngrams).withColumnRenamed('cluster', 'predWord').withColumnRenamed('vector', 'predictionVector') #get the predicted word ngrams = ngrams.join(broadcast(words), words.cluster == ngrams.predWord, 'inner') \ .drop('cluster') #\ #calculate the cosine similarity between prediction vector and center vector epsilon = 0.0001 def CosineSimi (v1, v2): d1 = DenseVector(v1) d2 = DenseVector(v2) n1 = d1.norm(2) n2 = d2.norm(2) return float(d1.dot(d2) / (n1 * n2)) cossim = udf(lambda v1, v2: CosineSimi(v1, v2), DoubleType()) ngrams = ngrams.withColumn('simi', cossim('centerVector', 'predictionVector')) ngrams = ngrams.drop('centerVector').drop('predictionVector') #update predicted sequence ngrams = ngrams.withColumn('predSeq', concat_ws(' ', 'predSeq', 'word')) ngrams = ngrams.withColumn('predSeq', ltrim(ngrams.predSeq)) #get actual sequence ngrams = CreateSubstring(ngrams, 'sentence', 'actualSeq', gramSize, ' ', ctxSize, ctxSize + nextPos + 1) #now get the cluster id for the predicted word in the sentence ngrams = BuildLabelVector(ngrams, w2v, ctxSize, lookahead, nextPos).withColumnRenamed('labelVec', 'vector').drop('ngrams') ngrams = centers.transform(ngrams).drop('vector') #and host latency for actual word ngrams = ngrams.join(broadcast(words), 'cluster', 'inner') \ .drop('word') \ .drop('centerVector') #\ #record correctness ngrams = ngrams.withColumn('round_correct', when((ngrams.predWord != ngrams.cluster) | (ngrams.simi < confidence), 0).otherwise(nextPos + 1)).drop('predWord').drop('cluster') ngrams = ngrams.withColumn('sample_correct', when(ngrams.sample_correct + 1 == ngrams.round_correct, ngrams.round_correct).otherwise(ngrams.sample_correct)) #get overall correctness ngrams = ngrams.withColumn('correct', 
greatest('sample_correct', 'correct')) #get binary correctness ngrams = ngrams.withColumn('binary_correct', when(ngrams.correct >= nextPos + 1, 1).otherwise(0)) ngrams = ngrams.withColumn('sample_confi', when(ngrams.binary_correct == 1, 1.0).otherwise(least(ngrams.simi, ngrams.sample_confi))) ngrams = ngrams.withColumn('simi', when(ngrams.binary_correct == 1, ngrams.simi).otherwise(ngrams.sample_confi)) ngrams = ngrams.withColumn('predSeq', when((ngrams.binary_correct == 1) | (ngrams.simi < confidence), ngrams.actualSeq).otherwise(ngrams.predSeq)) ngrams = ngrams.withColumn('succ_wt', when(ngrams.binary_correct == 1, ngrams.wt).otherwise(0)) ngrams = ngrams.withColumn('fail_wt', when((ngrams.binary_correct == 1) | (ngrams.simi < confidence), 0).otherwise(ngrams.wt)) ngrams = ngrams.withColumn('unopt_wt', when((ngrams.binary_correct == 0) & (ngrams.simi < confidence), ngrams.wt).otherwise(0)) ngrams = ngrams.drop('simi') #now summarize success and failure rates by predicted sequence seqWts = ngrams.groupBy('predSeq').agg(sum('succ_wt').alias('succ_wt'), sum('fail_wt').alias('fail_wt'), sum('unopt_wt').alias('unopt_wt')) #update sequences table seqs = seqWts.join(broadcast(seqs), seqWts.predSeq==seqs.word, 'right_outer').drop('predSeq').fillna(-c2/c1, ['succ_wt', 'fail_wt', 'unopt_wt']) scaleback = udf(lambda s: float(s*c1 + c2), DoubleType()) seqs = seqs.withColumn(modelSucc, col(modelSucc) + scaleback(seqs.succ_wt)).drop('succ_wt') seqs = seqs.withColumn(modelFail, col(modelFail) + scaleback(seqs.fail_wt)).drop('fail_wt') seqs = seqs.withColumn(modelUnopt, col(modelUnopt) + scaleback(seqs.unopt_wt)).drop('unopt_wt') seqs.cache() aggregated = seqs.agg(sum(modelSucc), sum(modelFail), sum(modelUnopt)) aggregated.cache() new_success = aggregated.head()['sum(' + modelSucc + ')'] new_fail = aggregated.head()['sum(' + modelFail + ')'] new_unopt = aggregated.head()['sum(' + modelUnopt + ')'] print nextPos, new_success - success, new_fail - fail, new_unopt - unopt success = new_success fail = new_fail unopt = new_unopt #end for testing for each model for a particular vector size #end for each vector size seqs.orderBy('succ_0', ascending=False).write.mode('overwrite').csv(outputFile(outDir, ctxSize, vecSize, sampleSizes)) return accuracy
def expect_column_values_to_be_between( self, column, min_value=None, max_value=None, strict_min=False, strict_max=False, parse_strings_as_datetimes=None, output_strftime_format=None, allow_cross_type_comparisons=None, mostly=None, result_format=None, include_config=True, catch_exceptions=None, meta=None, ): # NOTE: This function is implemented using native functions instead of UDFs, which is a faster # implementation. Please ensure new spark implementations migrate to the new style where possible if allow_cross_type_comparisons: raise ValueError( "Cross-type comparisons are not valid for SparkDFDataset") if parse_strings_as_datetimes: min_value = parse(min_value) max_value = parse(max_value) if min_value is None and max_value is None: raise ValueError("min_value and max_value cannot both be None") elif min_value is None: if strict_max: return column.withColumn( "__success", when(column[0] < max_value, lit(True)).otherwise(lit(False)), ) else: return column.withColumn( "__success", when(column[0] <= max_value, lit(True)).otherwise(lit(False)), ) elif max_value is None: if strict_min: return column.withColumn( "__success", when(column[0] > min_value, lit(True)).otherwise(lit(False)), ) else: return column.withColumn( "__success", when(column[0] >= min_value, lit(True)).otherwise(lit(False)), ) else: if min_value > max_value: raise ValueError("minvalue cannot be greater than max_value") if strict_min and strict_max: return column.withColumn( "__success", when((min_value < column[0]) & (column[0] < max_value), lit(True)).otherwise(lit(False)), ) elif strict_min: return column.withColumn( "__success", when((min_value < column[0]) & (column[0] <= max_value), lit(True)).otherwise(lit(False)), ) elif strict_max: return column.withColumn( "__success", when((min_value <= column[0]) & (column[0] < max_value), lit(True)).otherwise(lit(False)), ) else: return column.withColumn( "__success", when((min_value <= column[0]) & (column[0] <= max_value), lit(True)).otherwise(lit(False)), )
#[2] Weather data ingestion sensors = spark.read.option("delimiter", ",")\ .csv('inputs/mi_meteo_legend.csv')\ .toDF('sensor_id','street_name','lat', 'lon','sensor_type','unity_of_measure') sensors = sensors.filter(sensors['sensor_type'] == "Precipitation") sensors = sensors.filter(sensors['street_name'] == "Milano - via Lambrate") weather = spark.read.option("delimiter", ",")\ .csv('inputs/weather_phenomena/*.csv')\ .toDF('sensor_id','time_istant','measurement') #[3] Grouping by date and precipitation intensity classification ws = weather.join(sensors, on='sensor_id', how='left') ws = ws.withColumn('rain_intensity', func.when(func.col('measurement') == 0, 0)\ .when((func.col('measurement') > 0) & (func.col('measurement') < 2.6),1)\ .when((func.col('measurement') >= 2.6) & (func.col('measurement') < 7.6),2)\ .otherwise(3)) ws = ws.filter(ws['rain_intensity'] > 0) city = mobileDF.join(ws, on='time_istant', how='left') city.show() city = city.withColumn( 'date_again', func.from_unixtime( func.unix_timestamp(city.time_istant, 'yyyy/MM/dd HH:mm'), 'yyyy-MM-dd HH:mm')) city = city.filter(city.sensor_type.isNotNull()) #[4] Filtering only for working hours city.createOrReplaceTempView("sparkCityData") novemberactivity = city novemberactivity = novemberactivity.withColumn(
def prepare_df( df: pyspark.sql.DataFrame, store_csv: pyspark.sql.DataFrame, store_states_csv: pyspark.sql.DataFrame, state_names_csv: pyspark.sql.DataFrame, google_trend_csv: pyspark.sql.DataFrame, weather_csv: pyspark.sql.DataFrame, ) -> pyspark.sql.DataFrame: num_rows = df.count() # expand dates df = expand_date(df) # create new columns in the DataFrame by filtering out special events(promo/holiday where sales was zero or store was closed). df = (df.withColumn("Open", df.Open != "0").withColumn( "Promo", df.Promo != "0").withColumn("StateHoliday", df.StateHoliday != "0").withColumn( "SchoolHoliday", df.SchoolHoliday != "0")) # merge store information store = store_csv.join(store_states_csv, "Store") df = df.join(store, "Store") # merge Google Trend information google_trend_all = prepare_google_trend(google_trend_csv) df = df.join(google_trend_all, ["State", "Year", "Week"]).select(df["*"], google_trend_all.trend) # merge in Google Trend for whole Germany google_trend_de = google_trend_all[google_trend_all.file == "Rossmann_DE"].withColumnRenamed( "trend", "trend_de") df = df.join(google_trend_de, ["Year", "Week"]).select(df["*"], google_trend_de.trend_de) # merge weather weather = weather_csv.join(state_names_csv, weather_csv.file == state_names_csv.StateName) df = df.join(weather, ["State", "Date"]) # fix null values df = (df.withColumn( "CompetitionOpenSinceYear", F.coalesce(df.CompetitionOpenSinceYear, F.lit(1900)), ).withColumn( "CompetitionOpenSinceMonth", F.coalesce(df.CompetitionOpenSinceMonth, F.lit(1)), ).withColumn("Promo2SinceYear", F.coalesce(df.Promo2SinceYear, F.lit(1900))).withColumn( "Promo2SinceWeek", F.coalesce(df.Promo2SinceWeek, F.lit(1)))) # days and months since the competition has been open, cap it to 2 years df = df.withColumn( "CompetitionOpenSince", F.to_date( F.format_string("%s-%s-15", df.CompetitionOpenSinceYear, df.CompetitionOpenSinceMonth)), ) df = df.withColumn( "CompetitionDaysOpen", F.when( df.CompetitionOpenSinceYear > 1900, F.greatest( F.lit(0), F.least(F.lit(360 * 2), F.datediff(df.Date, df.CompetitionOpenSince)), ), ).otherwise(0), ) df = df.withColumn("CompetitionMonthsOpen", (df.CompetitionDaysOpen / 30).cast(T.IntegerType())) # days and weeks of promotion, cap it to 25 weeks df = df.withColumn( "Promo2Since", F.expr( 'date_add(format_string("%s-01-01", Promo2SinceYear), (cast(Promo2SinceWeek as int) - 1) * 7)' ), ) df = df.withColumn( "Promo2Days", F.when( df.Promo2SinceYear > 1900, F.greatest( F.lit(0), F.least(F.lit(25 * 7), F.datediff(df.Date, df.Promo2Since))), ).otherwise(0), ) df = df.withColumn("Promo2Weeks", (df.Promo2Days / 7).cast(T.IntegerType())) # ensure that no row was lost through inner joins assert num_rows == df.count(), "lost rows in joins" return df
meter_id = row.meter building_meter = get_meter(building, meter_id) print("Predicting meter readings for building {0} meter {1}".format( building_id, meter_id)) model = load_model(building_id, meter_id) predictions = model.transform(building_meter) predictions = predictions.withColumn("prediction", F.expm1(predictions.prediction)) print("Saving submission") predictions = predictions.withColumn("submitted_ts", F.current_timestamp()) predictions = predictions.withColumn("submit_id", F.lit(submit_id)) predictions = predictions.withColumn("algo", F.lit(algo)) predictions = predictions.withColumnRenamed( "prediction", "meter_reading").select("row_id", "building_id", "meter", "timestamp", "meter_reading", "submit_id", "submitted_ts", "algo") predictions = predictions.withColumn( "meter_reading", F.when(predictions.meter_reading < 0, F.lit(0.0)).otherwise(predictions.meter_reading)) predictions = predictions.fillna(0.0, "meter_reading") predictions.coalesce(1).write.saveAsTable("submitted_predictions", format="parquet", mode="append") to_csv(submit_id, algo)
except: pass # device_categories = json_cleaned.select("deviceType").distinct().rdd.flatMap(lambda x: x).collect() device_categories = [u'Personal computer', u'Tablet', u'Smartphone'] refDomain_categories = json_cleaned.groupBy("refDomain").count().orderBy(desc("count"))\ .select("refDomain").rdd.flatMap(lambda x: x).collect() refDomain_categories_filter = [] for x in refDomain_categories: try: temp = x.split('.')[-2] if temp not in refDomain_categories_filter: refDomain_categories_filter.append(temp) except: pass exprs_device = [F.when(F.col("deviceType") == category, 1).otherwise(0).alias("is_device_"+category) for category in device_categories] exprs_domain = [F.when(F.col("refDomain") == category, 1).otherwise(0).alias("is_domain_"+category) for category in refDomain_categories_filter[0:100]] labeled_json_cleaned = json_cleaned.select("*", *exprs_device ) \ .select("*", *exprs_domain).drop("deviceType").drop("refDomain")
def nscore(words): scores = filter(lambda x: x<0, [wordlist[t] for t in words if t in wordlist]) return 0.0 if len(scores)==0 else (float(sum(scores))/len(scores)) neg_score = F.udf(lambda w: nscore(w), FloatType()) # Create feature matrix for the model tw1 = hc.sql(""" SELECT text, query, polarity, date, regexp_extract(date, '([0-9]{2}):([0-9]{2}):([0-9]{2})', 1) as hour, regexp_extract(date, '(Sun|Mon|Tue|Wed|Thu|Fri|Sat)', 1) as dayofweek, regexp_extract(date, '(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)', 1) as month FROM tweets """) tw2 = tw1.filter("polarity != 2").withColumn('words', tokenize(tw1['text'])) tw3 = (tw2.select("user", "hour", "dayofweek", "month", "words", F.when(tw2.polarity == 4, "Pos").otherwise("Neg").alias("sentiment"), pos_score(tw2["words"]).alias("pscore"), neg_score(tw2["words"]).alias("nscore"))) tw3.registerTempTable("fm") # paramaters for modeling numFeatures = 5000 minDocFreq = 50 numTrees = 1000 # Build Machine Learning pipeline inx1 = StringIndexer(inputCol="hour", outputCol="hour-inx") inx2 = StringIndexer(inputCol="month", outputCol="month-inx") inx3 = StringIndexer(inputCol="dayofweek", outputCol="dow-inx") inx4 = StringIndexer(inputCol="sentiment", outputCol="label") hashingTF = HashingTF(numFeatures=numFeatures, inputCol="words", outputCol="hash-tf")
def summary(df, cols): spark = df.sql_ctx types = {x.name: x.dataType for x in list(df.schema) if x.name in cols} res = pd.DataFrame.from_dict(types, orient='index') res.columns = ['datatype'] count = df.count() res['count'] = count d = df.select([F.approx_count_distinct(c).alias(c) for c in cols]).toPandas().T d.columns = ['approx_distinct'] d.index.name = 'index' res = res.join(d) res['unique_ratio'] = res['approx_distinct'] / count sel = [] for c, v in types.items(): if isinstance(v, (T.NumericType)): sel += [F.mean(c).alias(c)] else: sel += [F.min(F.lit(None)).alias(c)] d = df.select(sel).toPandas().T d.columns = ['mean'] d.index.name = 'index' res = res.join(d) d = df.select([F.min(c).alias(c) for c in cols]).toPandas().T d.columns = ['min'] d.index.name = 'index' res = res.join(d) d = df.select([F.max(c).alias(c) for c in cols]).toPandas().T d.columns = ['max'] d.index.name = 'index' res = res.join(d) d = df.select([F.count(F.when(F.isnull(c), c)).alias(c) for c in cols]).toPandas().T d.columns = ['null'] d.index.name = 'index' res = res.join(d) sel = [] for c, v in types.items(): if isinstance(v, (T.NumericType)): sel += [F.count(F.when(F.isnan(c), c)).alias(c)] else: sel += [F.min(F.lit(0)).alias(c)] d = df.select(sel).toPandas().T d.columns = ['nan'] d.index.name = 'index' res = res.join(d) sel = [] for c, v in types.items(): if isinstance(v, (T.StringType)): sel += [F.count(F.when(F.col(c).isin(''), c)).alias(c)] else: sel += [F.min(F.lit(0)).alias(c)] d = df.select(sel).toPandas().T d.columns = ['empty'] d.index.name = 'index' res = res.join(d) return res
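A hedged usage sketch for the summary helper above, not from the original source; it assumes the helper's own imports (pyspark.sql.functions as F, pyspark.sql.types as T, pandas as pd) are in scope and that a SparkSession exists, and the toy columns are illustrative only.

# Hedged usage sketch (assumptions: a SparkSession exists and the helper's imports are in scope).
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("a", 1.0), ("", None), ("b", float("nan"))], ["name", "value"])

# summary() returns a pandas DataFrame indexed by column name with datatype, count,
# approx_distinct, unique_ratio, mean, min, max, null, nan and empty columns.
stats = summary(df, df.columns)
print(stats)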
def delayed_flights(spark, flights_file_path, other_files_path, year): """ PARAMETERS ---------- spark: SparkSession flights_file_path: Base path of the flights data, e.g. "s3://air-traffic-dataset/ontimeperformance_flights_test.csv". other_files_path: Path to the directory holding ontimeperformance_airlines.csv. year: 1994-2008 inclusive for the tiny dataset. """ flights_tiny_df = (spark.read.format("csv").options(header="true").load( "{}/year={}".format(flights_file_path, year))) airlines_df = (spark.read.format("csv").options(header="true").load( "{}/ontimeperformance_airlines.csv".format(other_files_path))) flights_tiny_df = flights_tiny_df \ .withColumn( "scheduled_departure_timestamp", F.to_timestamp( F.when( F.col("scheduled_depature_time") == "24:00:00", "00:00:00" ).otherwise(F.col("scheduled_depature_time")), "HH:mm:ss", ), ) \ .withColumn( "actual_departure_timestamp", F.to_timestamp( F.when(F.col("actual_departure_time") == "24:00:00", "00:00:00").otherwise( F.col("actual_departure_time") ), "HH:mm:ss", ), ) \ .withColumn( "delayed_time", F.when( F.col("actual_departure_timestamp").cast("long") - F.col("scheduled_departure_timestamp").cast("long") > (60 * 60 * 12), ( F.col("scheduled_departure_timestamp").cast("long") + (60 * 60 * 24) - F.col("actual_departure_timestamp").cast("long") ) / 60, ) .when( F.col("scheduled_departure_timestamp").cast("long") - F.col("actual_departure_timestamp").cast("long") > (60 * 60 * 12), ( F.col("actual_departure_timestamp").cast("long") + (60 * 60 * 24) - F.col("scheduled_departure_timestamp").cast("long") ) / 60, ) .otherwise( ( F.col("actual_departure_timestamp").cast("long") - F.col("scheduled_departure_timestamp").cast("long") ) / 60 ), ) \ .filter(F.col("actual_departure_timestamp").isNotNull()) \ .filter(F.col("delayed_time") > 0) \ .groupBy("carrier_code") \ .agg( F.count("delayed_time").alias("numOfDelays"), F.mean("delayed_time").alias("avgDelays"), F.min("delayed_time").alias("minDelay"), F.max("delayed_time").alias("maxDelay"), ) \ .withColumn("avgDelays", F.round(F.col("avgDelays"), 2)) \ .select( "carrier_code", "numOfDelays", "avgDelays", "minDelay", "maxDelay" ) avg_flights = F.broadcast(airlines_df) \ .join( flights_tiny_df, flights_tiny_df.carrier_code == airlines_df.carrier_code ) \ .select( "name", "numOfDelays", "avgDelays", "minDelay", "maxDelay" ) parseAvgFlights(avg_flights)
def na(col_name): return F.count( F.when(F.isnan(col_name) | F.col(col_name).isNull(), col_name))
def shift(self, periods=1, fill_value=None): """ Shift Series/Index by desired number of periods. .. note:: the current implementation of shift uses Spark's Window without specifying partition specification. This leads to move all data into single partition in single machine and could cause serious performance degradation. Avoid this method against very large dataset. Parameters ---------- periods : int Number of periods to shift. Can be positive or negative. fill_value : object, optional The scalar value to use for newly introduced missing values. The default depends on the dtype of self. For numeric data, np.nan is used. Returns ------- Copy of input Series/Index, shifted. Examples -------- >>> df = ks.DataFrame({'Col1': [10, 20, 15, 30, 45], ... 'Col2': [13, 23, 18, 33, 48], ... 'Col3': [17, 27, 22, 37, 52]}, ... columns=['Col1', 'Col2', 'Col3']) >>> df.Col1.shift(periods=3) 0 NaN 1 NaN 2 NaN 3 10.0 4 20.0 Name: Col1, dtype: float64 >>> df.Col2.shift(periods=3, fill_value=0) 0 0 1 0 2 0 3 13 4 23 Name: Col2, dtype: int64 >>> df.index.shift(periods=3, fill_value=0) Int64Index([0, 0, 0, 0, 1], dtype='int64') """ if len(self._internal.index_columns) == 0: raise ValueError("Index must be set.") if not isinstance(periods, int): raise ValueError('periods should be an int; however, got [%s]' % type(periods)) col = self._scol window = Window.orderBy(self._kdf._internal.index_scols).rowsBetween( -periods, -periods) shifted_col = F.lag(col, periods).over(window) col = F.when(shifted_col.isNull() | F.isnan(shifted_col), fill_value).otherwise(shifted_col) return self._with_new_scol(col).rename(self.name)
def zeros(col_name): return F.count(F.when(F.col(col_name) == 0, col_name))
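A small usage sketch exercising the na() and zeros() helpers above, not part of the original snippets; it assumes pyspark.sql.functions is imported as F (as both helpers do) and a SparkSession is available, with illustrative column names.

# Hedged usage sketch (assumption: F is pyspark.sql.functions, as in the helpers above).
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(0.0, 1), (float("nan"), 0), (None, 3)], ["amount", "clicks"])

# Each helper returns an unevaluated Column expression, so hand them to agg():
# na() counts NaN or NULL cells, zeros() counts exact zeroes.
df.agg(na("amount").alias("amount_missing"),
       zeros("clicks").alias("clicks_zero")).show()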
df = sqlContext.read \ .format('com.databricks.spark.csv') \ .options(header='false') \ .load(args.file, schema=StructType(fields)) # calculate the totals summed across all dates countDF = df.groupBy('name').agg({"count": "sum"}).withColumnRenamed('sum(count)', 'total') # read from the column dates dates = sorted(df.select("date") .distinct() .map(lambda row: row[0]) .collect()) # find the counts for each date cols = [when(col("date") == m, col("percentage")).otherwise(None).alias(m) for m in dates] maxs = [max(col(m)).alias(m) for m in dates] # reformat dataframe series = (df .select(col("name"), *cols) .groupBy("name") .agg(*maxs) .na.fill(0)) compressedTimeseries = series.select("name", concat_ws(",", *dates).alias("timeseries")) # add totals to timeseries table resultDF = compressedTimeseries.join(countDF, 'name', 'inner')
# Extract Trip time def time_delta(pickup_time, dropoff_time): pickup_time_out = datetime.datetime.strptime(pickup_time, '%m/%d/%y %H:%M') dropoff_time_out = datetime.datetime.strptime(dropoff_time, '%m/%d/%y %H:%M') trip_time = (dropoff_time_out - pickup_time_out).seconds / float(60) return trip_time f = udf(time_delta, FloatType()) # (1) Calculate "trip_time" # (2) Create a "tip_flag" for any record where a customer leaves a tip # (3) Extract the Pickup Day (as an integer) # (4) Extract the Pickup Hour (as an integer) transformed1 = rawdata.withColumn("trip_time", f(rawdata.pickup_datetime, rawdata.dropoff_datetime)) \ .withColumn("tip_flag", (when(rawdata.tip_amount > 0.0, 1).otherwise(0)) ) \ .withColumn("pickup_day", split(rawdata.pickup_datetime,"\/")[1].cast("integer") ) \ .withColumn("pickup_hour", split(split(rawdata.pickup_datetime," ")[1],":")[0].cast("integer") ) ####################################################################################### # # Model Prep # ####################################################################################### # String Indexer strindexer = StringIndexer(inputCol="vehicle_id", outputCol="vehicle_id_index") modelprep1 = strindexer.fit(transformed1).transform(transformed1) features = ['pickup_longitude','passenger_count','tolls_amount','tip_amount','trip_distance']
def main(inputFile, outputFile): def getItemName(payload): payload = json.loads(payload) return payload['item_name'].split('.')[0] item_udf = udf(lambda payload: getItemName(payload)) df = spark.read.parquet(inputFile+'/*') #DF for Item Play Started start = df.filter(col('event').isin(['ITEM_PLAY_STARTED'])) start = start.withColumn('Content Name', item_udf(start['payload'])) # get content name start = start.withColumn('Time Start', df['time'].cast("timestamp")) #turn Time Start String to easy to use Time Stamp Object cols = ["device_id", "Content Name", "Time Start"] #Select just the columns we need start = start.select(*cols) def getEndOfStream(payload): payload = json.loads(payload) if "did_reach_end_of_stream" in payload: return payload['did_reach_end_of_stream'] else: return "false" stream_end_UDF = udf(lambda x: getEndOfStream(x)) #DF for Item Play Finished finished = df.filter(col('event').isin(['ITEM_PLAY_FINISHED'])) finished = finished.withColumn('Content Name', item_udf(finished['payload'])) # get content name finished = finished.withColumn('reach_end_of_stream', stream_end_UDF(finished['payload'])) # get did_reach_end_of_stream finished = finished.withColumn("reach_end_of_stream", F.trim(col("reach_end_of_stream"))) #Get rid of white space #Convert True/False strings to actual boolean values finished = finished.withColumn( 'reach_end_of_stream', F.when(F.col("reach_end_of_stream") == "true", True)\ .otherwise(False) ) #turn Time End String to easy to use Time Stamp Object finished = finished.withColumn('Time End', df['time'].cast("timestamp")) #Select just the columns we need cols = ["device_id", "Content Name", "Time End", "reach_end_of_stream"] finished = finished.select(*cols) #combine two dataframes for our transformed Schema transformed = start.join(finished, on=["device_id", "Content Name"], how='left_outer') #Make sure Time Start before time end transformed = transformed.where(col("Time Start") <= col("Time End")) #Convert time stamps to unix #transformed = transformed.withColumn('Time Start', F.unix_timestamp('Time Start')) #transformed = transformed.withColumn('Time End', F.unix_timestamp('Time End')) #Get correct Time Ends def getEndTime(end_time_list): return end_time_list[0] end_time_udf = udf(lambda end_time_list: end_time_list[0], T.TimestampType()) df = transformed.withColumn("end_time_list", F.collect_list("Time End").over(Window.partitionBy("device_id",'Content Name','Time Start', "reach_end_of_stream").orderBy('Time End'))) df = df.groupBy('device_id','Time Start','Content Name', "reach_end_of_stream").agg(F.max('end_time_list').alias('end_time_list')) #Still gets laggy here running the udf that takes first item of list (aka the smallest date time) df = df.withColumn('Time End', end_time_udf("end_time_list")) df = df.drop('end_time_list') #rename columns + reorder df = df.withColumnRenamed("Time Start", "start").withColumnRenamed("Time End", "end").withColumnRenamed("Content Name", "item_name") df = df.select("device_id", "item_name", "start", "end", "reach_end_of_stream") df.write.parquet(outputFile) # Write onto output Parquet
return None new_row = [] new_row.append(row[0]) for i in range(1,3): if row[i] == row[i+2]: new_row.append(-99) else: new_row.append(row[i+2]) print(new_row) from pyspark.sql import functions as F ds.withColumn("new_ALM", F.when(ds['AL MOBILE'] == df['AL MOBILE'], "").otherwise(ds['ALM'])).show() # or with ds. too F.when((df["col-1"] > 0.0) & (df["col-2"] > 0.0), 1).otherwise(0) df.select(F.when(df['age'] == 2, 3).otherwise(4).alias("age")).collect() def get_row(row): return F.when(row['age'] == 2, 3).otherwise(4).alias("age") new_cols = df_new.select([x for x in df.columns]) ds.select(ds['HOTEL ID'], F.when(ds['AL MOBILE'] == df['AL MOBILE'], "").otherwise(ds['ALM']), F.when(ds['AL DESKTOP'] == df['ALD'], "").otherwise(ds['ALD']))
convert_enqueue_utcdtstr_date_udf = udf(convert_enqueue_utcdtstr_date, TimestampType()) # COMMAND ---------- # Add a column EnqueuedDateTimeUTC which holds the EnqueuedTimeUtc string parsed as a timestamp sourceTransactionsDf = sourceTransactionsDf.withColumn("EnqueuedDateTimeUTC", convert_enqueue_utcdtstr_date_udf(sourceTransactionsDf['EnqueuedTimeUtc'])) sourceAccrualsDf = sourceAccrualsDf.withColumn("EnqueuedDateTimeUTC", convert_enqueue_utcdtstr_date_udf(sourceAccrualsDf['EnqueuedTimeUtc'])) sourceRedemptionsDf = sourceRedemptionsDf.withColumn("EnqueuedDateTimeUTC", convert_enqueue_utcdtstr_date_udf(sourceRedemptionsDf['EnqueuedTimeUtc'])) sourceMemberBalancesDf = sourceMemberBalancesDf.withColumn("EnqueuedDateTimeUTC", convert_enqueue_utcdtstr_date_udf(sourceMemberBalancesDf['EnqueuedTimeUtc'])) # COMMAND ---------- # Add a column which checks whether the Enqueued Date is between yesterday 3 AM and today 4 AM UTC from pyspark.sql import functions as F sourceTransactionsDf = sourceTransactionsDf.withColumn("ValidForDate", F.when((yesterday_utc_date_3am_datetime <= sourceTransactionsDf.EnqueuedDateTimeUTC) & (sourceTransactionsDf.EnqueuedDateTimeUTC <= today_utc_date_4am_datetime),"YES").otherwise("NO")) sourceAccrualsDf = sourceAccrualsDf.withColumn("ValidForDate", F.when((yesterday_utc_date_3am_datetime <= sourceAccrualsDf.EnqueuedDateTimeUTC) & (sourceAccrualsDf.EnqueuedDateTimeUTC <= today_utc_date_4am_datetime),"YES").otherwise("NO")) sourceRedemptionsDf = sourceRedemptionsDf.withColumn("ValidForDate", F.when((yesterday_utc_date_3am_datetime <= sourceRedemptionsDf.EnqueuedDateTimeUTC) & (sourceRedemptionsDf.EnqueuedDateTimeUTC <= today_utc_date_4am_datetime),"YES").otherwise("NO")) sourceMemberBalancesDf = sourceMemberBalancesDf.withColumn("ValidForDate", F.when((yesterday_utc_date_3am_datetime <= sourceMemberBalancesDf.EnqueuedDateTimeUTC) & (sourceMemberBalancesDf.EnqueuedDateTimeUTC <= today_utc_date_4am_datetime),"YES").otherwise("NO")) # COMMAND ---------- # get data valid only from 3.00 AM UTC previous day to 4 AM UTC today validTransactionsDf = sourceTransactionsDf.filter("ValidForDate = 'YES'") validAccrualsDf = sourceAccrualsDf.filter("ValidForDate = 'YES'") validRedemptionsDf = sourceRedemptionsDf.filter("ValidForDate = 'YES'") validMemberBalancesDf = sourceMemberBalancesDf.filter("ValidForDate = 'YES'") # COMMAND ---------- # UDF - this would take just the body dataframe and decrypt it using UDF
def combine_frames(this, *args, how="full"): """ This method combines `this` DataFrame with a different `that` DataFrame or Series from a different DataFrame. It returns a DataFrame that has prefix `this_` and `that_` to distinct the columns names from both DataFrames It internally performs a join operation which can be expensive in general. So, if `compute.ops_on_diff_frames` option is False, this method throws an exception. """ from databricks.koalas import Series from databricks.koalas import DataFrame from databricks.koalas.config import get_option if all(isinstance(arg, Series) for arg in args): assert all( arg._kdf is args[0]._kdf for arg in args ), "Currently only one different DataFrame (from given Series) is supported" if this is args[0]._kdf: return # We don't need to combine. All series is in this. that = args[0]._kdf[list(args)] elif len(args) == 1 and isinstance(args[0], DataFrame): assert isinstance(args[0], DataFrame) if this is args[0]: return # We don't need to combine. `this` and `that` are same. that = args[0] else: raise AssertionError("args should be single DataFrame or " "single/multiple Series") if get_option("compute.ops_on_diff_frames"): this_index_map = this._internal.index_map that_index_map = that._internal.index_map assert len(this_index_map) == len(that_index_map) join_scols = [] merged_index_scols = [] # Note that the order of each element in index_map is guaranteed according to the index # level. this_and_that_index_map = zip(this_index_map.items(), that_index_map.items()) # If the same named index is found, that's used. for (this_column, this_name), (that_column, that_name) in this_and_that_index_map: if this_name == that_name: # We should merge the Spark columns into one # to mimic pandas' behavior. this_scol = scol_for(this._sdf, this_column) that_scol = scol_for(that._sdf, that_column) join_scol = this_scol == that_scol join_scols.append(join_scol) merged_index_scols.append( F.when(this_scol.isNotNull(), this_scol).otherwise(that_scol).alias(this_column)) else: raise ValueError( "Index names must be exactly matched currently.") assert len( join_scols) > 0, "cannot join with no overlapping index names" joined_df = this._sdf.alias("this").join(that._sdf.alias("that"), on=join_scols, how=how) joined_df = joined_df.select(merged_index_scols + [ this[label].spark_column.alias( "__this_%s" % this._internal.spark_column_name_for(label)) for label in this._internal.column_labels ] + [ that[label].spark_column.alias( "__that_%s" % that._internal.spark_column_name_for(label)) for label in that._internal.column_labels ]) index_columns = set(this._internal.index_spark_column_names) new_data_columns = [ c for c in joined_df.columns if c not in index_columns ] level = max(this._internal.column_labels_level, that._internal.column_labels_level) column_labels = [ tuple(["this"] + ([""] * (level - len(label))) + list(label)) for label in this._internal.column_labels ] + [ tuple(["that"] + ([""] * (level - len(label))) + list(label)) for label in that._internal.column_labels ] column_label_names = ( (([None] * (1 + level - len(this._internal.column_labels_level))) + this._internal.column_label_names) if this._internal.column_label_names is not None else None) return DataFrame( this._internal.copy( spark_frame=joined_df, column_labels=column_labels, data_spark_columns=[ scol_for(joined_df, col) for col in new_data_columns ], column_label_names=column_label_names, )) else: raise ValueError( "Cannot combine the series or dataframe because it comes from a different dataframe. 
" "In order to allow this operation, enable 'compute.ops_on_diff_frames' option." )
def main(): spark_session = (SparkSession.builder.appName(APPLICATION_NAME).master( MASTER_URL).config('spark.cassandra.connection.host', MORPHL_SERVER_IP_ADDRESS).config( 'spark.cassandra.auth.username', MORPHL_CASSANDRA_USERNAME).config( 'spark.cassandra.auth.password', MORPHL_CASSANDRA_PASSWORD).config( 'spark.sql.shuffle.partitions', 16).config( 'parquet.enable.summary-metadata', 'true').getOrCreate()) log4j = spark_session.sparkContext._jvm.org.apache.log4j log4j.LogManager.getRootLogger().setLevel(log4j.Level.ERROR) # All users from the database are already retained (they are filtered from the BQ SQL) ga_chp_bq_users = fetch_from_cassandra( 'ga_chp_bq_features_raw_t' if TRAINING_OR_PREDICTION == 'training' else 'ga_chp_bq_features_raw_p', spark_session) ga_chp_bq_users.createOrReplaceTempView('ga_chp_bq_users') # Using window functions: https://databricks.com/blog/2015/07/15/introducing-window-functions-in-spark-sql.html grouped_by_client_id_before_dedup_sql_parts = [ 'SELECT', 'client_id,', 'SUM(bounces) OVER (PARTITION BY client_id) AS bounces,' 'SUM(events) OVER (PARTITION BY client_id) AS events,' 'SUM(page_views) OVER (PARTITION BY client_id) AS page_views,' 'SUM(session_duration) OVER (PARTITION BY client_id) AS session_duration,' 'SUM(sessions) OVER (PARTITION BY client_id) AS sessions,' 'FIRST_VALUE(is_desktop) OVER (PARTITION BY client_id ORDER BY day_of_data_capture DESC) AS is_desktop,' 'FIRST_VALUE(is_mobile) OVER (PARTITION BY client_id ORDER BY day_of_data_capture DESC) AS is_mobile,' 'FIRST_VALUE(is_tablet) OVER (PARTITION BY client_id ORDER BY day_of_data_capture DESC) AS is_tablet,' 'ROW_NUMBER() OVER (PARTITION BY client_id ORDER BY day_of_data_capture DESC) AS rownum' ] if TRAINING_OR_PREDICTION == 'training': grouped_by_client_id_before_dedup_sql_parts = grouped_by_client_id_before_dedup_sql_parts + [ ', FIRST_VALUE(days_since_last_session) OVER (PARTITION BY client_id ORDER BY day_of_data_capture DESC) AS days_since_last_session,', 'AVG(days_since_last_session) OVER (PARTITION BY client_id) AS avgdays', ] grouped_by_client_id_before_dedup_sql_parts = grouped_by_client_id_before_dedup_sql_parts + [ 'FROM', 'ga_chp_bq_users' ] grouped_by_client_id_before_dedup_sql = ' '.join( grouped_by_client_id_before_dedup_sql_parts) grouped_by_client_id_before_dedup_df = spark_session.sql( grouped_by_client_id_before_dedup_sql) grouped_by_client_id_before_dedup_df.createOrReplaceTempView( 'grouped_by_client_id_before_dedup') # Only keeping the most recent record from every client id # rownum = 1 while day_of_data_capture is sorted in descending order grouped_by_client_id_sql = 'SELECT * FROM grouped_by_client_id_before_dedup WHERE rownum = 1' grouped_by_client_id_df = spark_session.sql(grouped_by_client_id_sql) grouped_by_client_id_df.createOrReplaceTempView('grouped_by_client_id') # The schema for grouped_by_client_id_df is: # |-- client_id: string (nullable = true) # |-- bounces: double (nullable = true) # |-- events: double (nullable = true) # |-- page_views: double (nullable = true) # |-- session_duration: double (nullable = true) # |-- sessions: double (nullable = true) # |-- is_desktop: double (nullable = true) # |-- is_mobile: double (nullable = true) # |-- is_tablet: double (nullable = true) # |-- days_since_last_session: float (nullable = true) # |-- rownum: integer (nullable = true) # |-- avgdays: double (nullable = true) if TRAINING_OR_PREDICTION == 'training': mean_value_of_avg_days_sql = 'SELECT AVG(avgdays) mean_value_of_avgdays FROM grouped_by_client_id' 
mean_value_of_avg_days_df = spark_session.sql( mean_value_of_avg_days_sql) churn_threshold = mean_value_of_avg_days_df.first( ).mean_value_of_avgdays final_df = (grouped_by_client_id_df.withColumn( 'churned', f.when(f.col('days_since_last_session') > churn_threshold, 1.0).otherwise(0.0)).select('client_id', 'bounces', 'events', 'page_views', 'session_duration', 'sessions', 'is_desktop', 'is_mobile', 'is_tablet', 'churned').repartition(32)) # The schema for final_df is: # |-- client_id: string (nullable = true) # |-- bounces: double (nullable = true) # |-- events: double (nullable = true) # |-- page_views: double (nullable = true) # |-- session_duration: double (nullable = true) # |-- sessions: double (nullable = true) # |-- is_desktop: double (nullable = true) # |-- is_mobile: double (nullable = true) # |-- is_tablet: double (nullable = true) # |-- churned: double (nullable = false) final_df.cache() final_df.write.parquet(HDFS_DIR_TRAINING) save_options_ga_chp_bq_features_training = { 'keyspace': MORPHL_CASSANDRA_KEYSPACE, 'table': 'ga_chp_bq_features_training' } (final_df.write.format('org.apache.spark.sql.cassandra').mode( 'append').options( **save_options_ga_chp_bq_features_training).save()) with open(CHURN_THRESHOLD_FILE, 'w') as fh: fh.write(str(churn_threshold)) else: final_df = (grouped_by_client_id_df.select( 'client_id', 'bounces', 'events', 'page_views', 'session_duration', 'sessions', 'is_desktop', 'is_mobile', 'is_tablet').repartition(32)) # The schema for final_df is: # |-- client_id: string (nullable = true) # |-- bounces: double (nullable = true) # |-- events: double (nullable = true) # |-- page_views: double (nullable = true) # |-- session_duration: double (nullable = true) # |-- sessions: double (nullable = true) # |-- is_desktop: double (nullable = true) # |-- is_mobile: double (nullable = true) # |-- is_tablet: double (nullable = true) final_df.cache() final_df.write.parquet(HDFS_DIR_PREDICTION) save_options_ga_chp_bq_features_prediction = { 'keyspace': MORPHL_CASSANDRA_KEYSPACE, 'table': 'ga_chp_bq_features_prediction' } (final_df.write.format('org.apache.spark.sql.cassandra').mode( 'append').options( **save_options_ga_chp_bq_features_prediction).save())
from pyspark.sql import functions as F spark = SparkSession.builder.appName("LogisticRegression App").getOrCreate() spark.sparkContext.setLogLevel("ERROR") # Loading the data data = spark.read.format("csv").option("header", True) \ .option("inferSchema", True) \ .option("delimiter", ",") \ .load("/Users/louis_lyu/Desktop/SourceCode/data/imports-85.data") data.printSchema() data = data.withColumn("label", F.when(F.col("num-of-doors") == "four", 1).otherwise(0)).select("label","length", "width","height") data.show() # Create vector assembler for feature columns assembler = VectorAssembler(inputCols=data.columns[1:], outputCol="features") data = assembler.transform(data) lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8) # Fit the model model = lr.fit(data) # Print the coefficients and intercept for logistic regression print("Coefficients: " + str(model.coefficients)) print("Intercept: " + str(model.intercept))
def main(): "Main function" optmgr = OptionParser() opts = optmgr.parser.parse_args() # setup spark/sql context to be used for communication with HDFS sc = SparkContext(appName="phedex_br") if not opts.yarn: sc.setLogLevel("ERROR") sqlContext = HiveContext(sc) schema_def = schema() # read given file(s) into RDD if opts.fname: pdf = sqlContext.read.format('com.databricks.spark.csv')\ .options(treatEmptyValuesAsNulls='true', nullValue='null')\ .load(opts.fname, schema = schema_def) elif opts.basedir: fromdate, todate = defDates(opts.fromdate, opts.todate) files = getFileList(opts.basedir, fromdate, todate) msg = "Between dates %s and %s found %d directories" % (fromdate, todate, len(files)) print msg if not files: return pdf = unionAll([sqlContext.read.format('com.databricks.spark.csv') .options(treatEmptyValuesAsNulls='true', nullValue='null')\ .load(file_path, schema = schema_def) \ for file_path in files]) else: raise ValueError("File or directory not specified. Specify fname or basedir parameters.") # parsing additional data (to given data adding: group name, node kind, acquisition era, data tier, now date) groupdic, nodedic = getJoinDic() acquisition_era_reg = r"^/[^/]*/([^/^-]*)-[^/]*/[^/]*$" data_tier_reg = r"^/[^/]*/[^/^-]*-[^/]*/([^/]*)$" groupf = udf(lambda x: groupdic[x], StringType()) nodef = udf(lambda x: nodedic[x], StringType()) ndf = pdf.withColumn("br_user_group", groupf(pdf.br_user_group_id)) \ .withColumn("node_kind", nodef(pdf.node_id)) \ .withColumn("now", from_unixtime(pdf.now_sec, "YYYY-MM-dd")) \ .withColumn("acquisition_era", when(regexp_extract(pdf.dataset_name, acquisition_era_reg, 1) == "",\ lit("null")).otherwise(regexp_extract(pdf.dataset_name, acquisition_era_reg, 1))) \ .withColumn("data_tier", when(regexp_extract(pdf.dataset_name, data_tier_reg, 1) == "",\ lit("null")).otherwise(regexp_extract(pdf.dataset_name, data_tier_reg, 1))) # print dataframe schema if opts.verbose: ndf.show() print("pdf data type", type(ndf)) ndf.printSchema() # process aggregation parameters keys = [key.lower().strip() for key in opts.keys.split(',')] results = [result.lower().strip() for result in opts.results.split(',')] aggregations = [agg.strip() for agg in opts.aggregations.split(',')] order = [orde.strip() for orde in opts.order.split(',')] if opts.order else [] asc = [asce.strip() for asce in opts.asc.split(',')] if opts.order else [] filtc, filtv = opts.filt.split(":") if opts.filt else (None,None) validateAggregationParams(keys, results, aggregations, order, filtc) if filtc and filtv: ndf = ndf.filter(getattr(ndf, filtc) == filtv) # if delta aggregation is used if DELTA in aggregations: validateDeltaParam(opts.interval, results) result = results[0] #1 for all dates generate interval group dictionary datedic = generateDateDict(fromdate, todate, opts.interval) boundic = generateBoundDict(datedic) max_interval = max(datedic.values()) interval_group = udf(lambda x: datedic[x], IntegerType()) interval_start = udf(lambda x: boundic[x][0], StringType()) interval_end = udf(lambda x: boundic[x][1], StringType()) #2 group data by block, node, interval and last result in the interval ndf = ndf.select(ndf.block_name, ndf.node_name, ndf.now, getattr(ndf, result)) idf = ndf.withColumn("interval_group", interval_group(ndf.now)) win = Window.partitionBy(idf.block_name, idf.node_name, idf.interval_group).orderBy(idf.now.desc()) idf = idf.withColumn("row_number", rowNumber().over(win)) rdf = idf.where((idf.row_number == 1) & (idf.interval_group != 0))\ .withColumn(result, when(idf.now == 
interval_end(idf.interval_group), getattr(idf, result)).otherwise(lit(0))) rdf = rdf.select(rdf.block_name, rdf.node_name, rdf.interval_group, getattr(rdf, result)) rdf.cache() #3 create intervals that not exist but has minus delta win = Window.partitionBy(idf.block_name, idf.node_name).orderBy(idf.interval_group) adf = rdf.withColumn("interval_group_aft", lead(rdf.interval_group, 1, 0).over(win)) hdf = adf.filter(((adf.interval_group + 1) != adf.interval_group_aft) & (adf.interval_group != max_interval))\ .withColumn("interval_group", adf.interval_group + 1)\ .withColumn(result, lit(0))\ .drop(adf.interval_group_aft) #4 join data frames idf = rdf.unionAll(hdf) #3 join every interval with previous interval win = Window.partitionBy(idf.block_name, idf.node_name).orderBy(idf.interval_group) fdf = idf.withColumn("delta", getattr(idf, result) - lag(getattr(idf, result), 1, 0).over(win)) #5 calculate delta_plus and delta_minus columns and aggregate by date and node ddf =fdf.withColumn("delta_plus", when(fdf.delta > 0, fdf.delta).otherwise(0)) \ .withColumn("delta_minus", when(fdf.delta < 0, fdf.delta).otherwise(0)) aggres = ddf.groupBy(ddf.node_name, ddf.interval_group).agg(sum(ddf.delta_plus).alias("delta_plus"),\ sum(ddf.delta_minus).alias("delta_minus")) aggres = aggres.select(aggres.node_name, interval_end(aggres.interval_group).alias("date"), aggres.delta_plus, aggres.delta_minus) else: resAgg_dic = zipResultAgg(results, aggregations) order, asc = formOrdAsc(order, asc, resAgg_dic) # perform aggregation if order: aggres = ndf.groupBy(keys).agg(resAgg_dic).orderBy(order, ascending=asc) else: aggres = ndf.groupBy(keys).agg(resAgg_dic) # output results if opts.fout: fout_header = formFileHeader(opts.fout) if opts.header: aggres.write.format('com.databricks.spark.csv').options(header = 'true').save(fout_header) else: aggres.write.format('com.databricks.spark.csv').save(fout_header) else: aggres.show(50)