def get_df_file_rse_ts_size(df_replicas_j_dids):
    """Consolidate size and timestamp columns of the DIDS/REPLICAS join.

    The file size is taken from REPLICAS when present and from DIDS
    otherwise (the order of the ``when`` branches encodes that priority).
    ``accessed_at`` and ``created_at`` are the greater of the DIDS and
    REPLICAS values.

    Returns a cached DataFrame with the columns
    ``f_name, rse_id, accessed_at, f_size, created_at``.
    """
    # No ``otherwise`` branch: rows with neither size stay NULL.
    size_expr = (
        when(col('f_size_replicas').isNotNull(), col('f_size_replicas'))
        .when(col('f_size_dids').isNotNull(), col('f_size_dids'))
    )
    latest_access = greatest(col('dids_accessed_at'), col('rep_accessed_at'))
    latest_creation = greatest(col('dids_created_at'), col('rep_created_at'))

    completed = df_replicas_j_dids.withColumn('f_size', size_expr)
    completed = completed.withColumn('accessed_at', latest_access)
    completed = completed.withColumn('created_at', latest_creation)
    wanted = ['f_name', 'rse_id', 'accessed_at', 'f_size', 'created_at']
    return completed.select(wanted).cache()
def prepare_df(df):
    """Enrich the sales frame with store, Google Trend and weather features.

    Relies on the module-level frames ``store_csv``, ``store_states_csv``,
    ``weather_csv`` and ``state_names_csv`` and on the helpers
    ``expand_date`` and ``prepare_google_trend``. All joins are inner joins;
    the final assertion checks that none of them dropped a row.
    """
    num_rows = df.count()

    # Expand dates.
    df = expand_date(df)

    # Flags arrive as strings; anything other than '0' counts as true.
    raw = df
    for flag in ('Open', 'Promo', 'StateHoliday', 'SchoolHoliday'):
        df = df.withColumn(flag, raw[flag] != '0')

    # Merge in store information.
    df = df.join(store_csv.join(store_states_csv, 'Store'), 'Store')

    # Merge in Google Trend information (per state).
    google_trend_all = prepare_google_trend()
    with_trend = df.join(google_trend_all, ['State', 'Year', 'Week'])
    df = with_trend.select(df['*'], google_trend_all.trend)

    # Merge in Google Trend for whole Germany.
    google_trend_de = google_trend_all.filter(
        google_trend_all.file == 'Rossmann_DE'
    ).withColumnRenamed('trend', 'trend_de')
    with_trend_de = df.join(google_trend_de, ['Year', 'Week'])
    df = with_trend_de.select(df['*'], google_trend_de.trend_de)

    # Merge in weather.
    weather = weather_csv.join(
        state_names_csv, weather_csv.file == state_names_csv.StateName)
    df = df.join(weather, ['State', 'Date'])

    # Fix null values.
    defaults = (
        ('CompetitionOpenSinceYear', 1900),
        ('CompetitionOpenSinceMonth', 1),
        ('Promo2SinceYear', 1900),
        ('Promo2SinceWeek', 1),
    )
    unfilled = df
    for name, fallback in defaults:
        df = df.withColumn(name, F.coalesce(unfilled[name], F.lit(fallback)))

    # Days & months competition was open, cap to 2 years.
    competition_since = F.to_date(
        F.format_string('%s-%s-15',
                        df.CompetitionOpenSinceYear,
                        df.CompetitionOpenSinceMonth))
    df = df.withColumn('CompetitionOpenSince', competition_since)
    competition_days = F.greatest(
        F.lit(0),
        F.least(F.lit(360 * 2),
                F.datediff(df.Date, df.CompetitionOpenSince)))
    df = df.withColumn(
        'CompetitionDaysOpen',
        F.when(df.CompetitionOpenSinceYear > 1900,
               competition_days).otherwise(0))
    df = df.withColumn(
        'CompetitionMonthsOpen',
        (df.CompetitionDaysOpen / 30).cast(T.IntegerType()))

    # Days & weeks of promotion, cap to 25 weeks.
    promo_since = F.expr(
        'date_add(format_string("%s-01-01", Promo2SinceYear), (cast(Promo2SinceWeek as int) - 1) * 7)')
    df = df.withColumn('Promo2Since', promo_since)
    promo_days = F.greatest(
        F.lit(0),
        F.least(F.lit(25 * 7), F.datediff(df.Date, df.Promo2Since)))
    df = df.withColumn(
        'Promo2Days',
        F.when(df.Promo2SinceYear > 1900, promo_days).otherwise(0))
    df = df.withColumn('Promo2Weeks',
                       (df.Promo2Days / 7).cast(T.IntegerType()))

    # Check that we did not lose any rows through inner joins.
    assert num_rows == df.count(), 'lost rows in joins'
    return df
def algorithm1(i, g):
    """Iteratively assign values to vertices until none is left at -1.

    Each round, vertices whose ``value`` is still -1 send their id to their
    destinations; ``getid_maximum_udf2`` turns the collected ids, the round
    counter ``i`` and the current value into a candidate value, and the
    vertex keeps the greater of its old and candidate value. The loop stops
    once no vertex has value -1 and returns the vertex DataFrame.
    """
    while True:
        # Only still-unassigned sources (value == -1) emit their id.
        message = F.when(AM.src['value'] == -1, AM.src["id"])
        aggregates = g.aggregateMessages(
            F.collect_set(AM.msg).alias("agg"), sendToDst=message)

        joined = g.vertices.join(aggregates, on="id", how="left_outer")
        scored = joined.withColumn(
            "newValue",
            getid_maximum_udf2("id", "agg", lit(i), "value")).drop("agg")
        merged = scored.withColumn(
            'max_by_rows', greatest('value', 'newValue'))
        new_vertices = merged.drop("value", "newValue").withColumnRenamed(
            "max_by_rows", "value")

        g = GraphFrame(AM.getCachedDataFrame(new_vertices), g.edges)
        i += 1

        g.vertices.show()
        g.vertices.createOrReplaceTempView("temp_table")
        remaining = spark.sql(
            "SELECT * from temp_table where value = -1").count()
        if remaining == 0:
            return g.vertices
def algorithm2(i, g):
    """Iteratively assign vertex values; stop when unassigned vertices are isolated.

    Runs the same propagation round as a message-passing loop: vertices with
    ``value == -1`` send their id, ``getid_maximum_udf2`` computes a
    candidate value and each vertex keeps the greater one. The loop ends
    when the sub-graph of vertices still at -1 has no edges left; those
    remaining vertices then receive the current round counter ``i``.
    """
    while True:
        # Only still-unassigned sources (value == -1) emit their id.
        message = F.when(AM.src['value'] == -1, AM.src["id"])
        aggregates = g.aggregateMessages(
            F.collect_set(AM.msg).alias("agg"), sendToDst=message)

        joined = g.vertices.join(aggregates, on="id", how="left_outer")
        scored = joined.withColumn(
            "newValue",
            getid_maximum_udf2("id", "agg", lit(i), "value")).drop("agg")
        merged = scored.withColumn(
            'max_by_rows', greatest('value', 'newValue'))
        new_vertices = merged.drop("value", "newValue").withColumnRenamed(
            "max_by_rows", "value")

        g = GraphFrame(AM.getCachedDataFrame(new_vertices), g.edges)
        i += 1

        g.vertices.show()
        unassigned = g.filterVertices("value == -1").dropIsolatedVertices()
        if unassigned.edges.count() != 0:
            continue

        final_df = g.vertices
        # Whatever is still -1 gets the current round number.
        return final_df.withColumn(
            "value",
            F.when(final_df["value"] == -1, i).otherwise(final_df["value"]))
def glean_2(invoice_df, line_item_df):
    """Produce 'accrual_alert' gleans for invoices covering far-future periods.

    Invoices are left-joined to their line items. The end date of an invoice
    is the greater of its own ``period_end_date`` and the latest line-item
    ``period_end_date``. Invoices whose end date lies more than 90 days
    after ``invoice_date`` are kept and decorated with the glean columns.
    """
    invoice = invoice_df.alias('invoice')
    line_item = line_item_df.alias('line_item')

    joined_table = invoice.join(
        line_item, invoice.invoice_id == line_item.invoice_id, how='left')

    per_invoice = joined_table.groupBy(
        invoice.invoice_id,
        invoice.canonical_vendor_id,
        invoice.invoice_date,
        invoice.period_end_date,
    ).agg(max_('line_item.period_end_date').alias('max_line_end_date'))

    with_end = per_invoice.withColumn(
        'end_date',
        funcs.greatest('max_line_end_date', invoice.period_end_date))
    with_diff = with_end.withColumn(
        'difference', funcs.datediff('end_date', 'invoice_date'))
    overdue = with_diff.filter(with_diff['difference'] > 90)

    glean_text = funcs.concat(
        funcs.lit('Line items from vendor '),
        funcs.col('canonical_vendor_id'),
        funcs.lit(' in this invoice cover future periods (through '),
        funcs.col('end_date'),
        funcs.lit(' )'))

    return (overdue
            .withColumn('glean_location', funcs.lit('invoice'))
            .withColumn("glean_type", funcs.lit('accrual_alert'))
            .withColumn("glean_text", glean_text)
            .withColumn('glean_date', funcs.col('invoice_date')))
def calc_min_max():
    """Return a single row holding the overall minimum and maximum of ``sdf``.

    ``sdf`` comes from the enclosing scope. With several columns the
    per-column aggregates are combined with ``least``/``greatest`` (both
    need at least two arguments); a single column is aggregated directly.
    """
    if len(sdf.columns) <= 1:
        bounds = (F.min(sdf.columns[-1]), F.max(sdf.columns[-1]))
    else:
        # Iterating the frame yields its columns one by one.
        lowest = F.least(*[F.min(column) for column in sdf])
        highest = F.greatest(*[F.max(column) for column in sdf])
        bounds = (lowest, highest)
    return sdf.select(*bounds).first()
def compile_greatest(t, expr, scope, **kwargs):
    """Translate a 'greatest' expression into a Spark column.

    The operation's argument is translated through ``t``; a single
    resulting column is returned unchanged because ``F.greatest`` needs at
    least two inputs.
    """
    columns = t.translate(expr.op().arg, scope)
    return columns[0] if len(columns) == 1 else F.greatest(*columns)
def user_item_serendipity(self):
    """Calculate serendipity of each item in the recommendations for each user.

    The metric definition is based on the following references:

    :Citation:

        Y.C. Zhang, D.Ó. Séaghdha, D. Quercia and T. Jambor, Auralist:
        introducing serendipity into music recommendation, WSDM 2012

        Eugene Yan, Serendipity: Accuracy’s unpopular best friend in
        Recommender Systems, eugeneyan.com, April 2020

    The result is computed once and memoised on the instance.

    Returns:
        pyspark.sql.dataframe.DataFrame: A dataframe with columns: col_user,
        col_item, user_item_serendipity.
    """
    if self.df_user_item_serendipity is not None:
        return self.df_user_item_serendipity

    self.df_cosine_similarity = self._get_cosine_similarity()

    # Recommended items, with the item duplicated so one copy survives
    # the pairing below.
    recommended = self.reco_df.select(
        self.col_user,
        self.col_item,
        F.col(self.col_item).alias("reco_item_tmp"),
    )
    interacted = self.train_df.select(
        self.col_user, F.col(self.col_item).alias("train_item_tmp")
    )

    # Every recommended item is paired with every item the same user
    # interacted with; (i1, i2) is the ordered pair used as similarity key.
    reco_item = F.col("reco_item_tmp")
    train_item = F.col("train_item_tmp")
    pairs = recommended.join(interacted, on=[self.col_user]).select(
        self.col_user,
        self.col_item,
        F.least(reco_item, train_item).alias("i1"),
        F.greatest(reco_item, train_item).alias("i2"),
    )

    # Pairs without a known similarity count as 0.
    with_similarity = pairs.join(
        self.df_cosine_similarity, on=["i1", "i2"], how="left"
    ).fillna(0)
    avg_similarity = with_similarity.groupBy(self.col_user, self.col_item).agg(
        F.mean(self.sim_col).alias("avg_item2interactedHistory_sim")
    )

    serendipity = (1 - F.col("avg_item2interactedHistory_sim")) * F.col(
        self.col_relevance
    )
    self.df_user_item_serendipity = (
        avg_similarity.join(self.reco_df, on=[self.col_user, self.col_item])
        .withColumn("user_item_serendipity", serendipity)
        .select(self.col_user, self.col_item, "user_item_serendipity")
        .orderBy(self.col_user, self.col_item)
    )
    return self.df_user_item_serendipity
def __fence(df, values):
    """Clamp one column of ``df`` between a lower and an upper fence.

    ``values`` maps the column name to a ``(lower, upper)`` pair; only the
    first entry of the mapping is used. A temporary ``__fence`` column
    carries the bound so ``greatest``/``least`` can be applied by name, and
    it is dropped before returning.
    """
    colname, (lfence, ufence) = list(values.items())[0]

    # Raise values below the lower fence.
    floored = df.withColumn('__fence', F.lit(lfence))
    floored = floored.withColumn(colname, F.greatest(colname, '__fence'))

    # Lower values above the upper fence.
    capped = floored.withColumn('__fence', F.lit(ufence))
    capped = capped.withColumn(colname, F.least(colname, '__fence'))

    return capped.drop('__fence')
def Silhouette(self,
               clusterlab=None,
               sildistmethod="Euclidean",
               silfilter_str=None,
               dbname="risk"):
    '''Return the Silhouette index of the clustering.

    Args:
        clusterlab: name of the cluster-label column; defaults to
            ``self.clusterLabelCol``.
        sildistmethod: distance method passed to ``self.pairwise_dist``; it
            is also the prefix of the ``<method>_distance`` column read here.
        silfilter_str: filter string forwarded to ``self.pairwise_dist``.
        dbname: database name used in the ``drop table`` statements.

    Returns:
        tuple: ``(mean silhouette, per-point DataFrame)`` where the
        DataFrame carries ``a_i``, ``b_i`` and ``silouette`` for each point.
    '''
    from pyspark.sql.functions import col, avg, greatest
    import pyspark.sql.functions as F

    if clusterlab is None:
        clusterlab = self.clusterLabelCol

    # NOTE(review): the tables are dropped as "<dbname>.<table>" but created
    # without a database qualifier, and each shares its name with the temp
    # view registered just before -- confirm which object the later
    # hive_context.table() lookups are meant to resolve to.
    prwise_cart = self.pairwise_dist(
        self.df.select([self.idcol] + self.varnames),
        distmethod=sildistmethod,
        filter_str=silfilter_str)
    prwise_cart.createOrReplaceTempView("pairwise_dist")
    hive_context.sql("drop table if exists " + dbname + ".pairwise_dist")
    hive_context.sql(
        "create table pairwise_dist as select * from pairwise_dist")
    prwise_cart = hive_context.table("pairwise_dist")

    ID_cluster_link = self.df.select([self.idcol, clusterlab])
    ID_cluster_link.createOrReplaceTempView("id_cluster")
    hive_context.sql("drop table if exists " + dbname + ".id_cluster")
    hive_context.sql("create table id_cluster as select * from id_cluster")
    ID_cluster_link = hive_context.table("id_cluster")

    # A big cartesian join over pairwise points: attach the cluster label of
    # both ends of every distance row (N^2 rows).
    ID_pairwise_cart = prwise_cart.alias("dist").join(ID_cluster_link.alias("id1"), col("id1." + self.idcol) == col("dist.ID1"))\
        .join(ID_cluster_link.alias("id2"), col("id2." + self.idcol) == col("dist.ID2"))\
        .selectExpr("id1." + clusterlab + " as ID1_" + clusterlab,
                    "id2." + clusterlab + " as ID2_" + clusterlab,
                    "dist.*")

    # b_i: for each point, the smallest average distance to any OTHER cluster.
    ID_pairwise_bi = ID_pairwise_cart.filter("ID1_" + clusterlab + " <> " + "ID2_" + clusterlab)\
        .groupBy("ID1_" + clusterlab, "ID1", "ID2_" + clusterlab)\
        .agg(avg(sildistmethod + "_distance").alias("avg_distance_" + "ID2" + clusterlab))
    ID_pairwise_bi = ID_pairwise_bi.groupBy(
        "ID1_" + clusterlab, "ID1").agg(
            F.min("avg_distance_" + "ID2" + clusterlab).alias("b_i"))

    # a_i: for each point, the average distance within its OWN cluster.
    ID_pairwise_ai = ID_pairwise_cart.filter("ID1_" + clusterlab + " = " + "ID2_" + clusterlab)\
        .groupBy("ID1_" + clusterlab, "ID1")\
        .agg(avg(sildistmethod + "_distance").alias("a_i"))

    # Put a_i and b_i side by side for the (b_i - a_i) / max(a_i, b_i) formula.
    ID_pairwise_aibi = ID_pairwise_ai.alias("a").join(ID_pairwise_bi.alias("b"), ID_pairwise_ai.ID1 == ID_pairwise_bi.ID1)\
        .selectExpr("a.*", "b.b_i")

    # Silhouette of each data point.
    ID_pairwise_aibi = ID_pairwise_aibi.withColumn(
        "silouette",
        (ID_pairwise_aibi["b_i"] - ID_pairwise_aibi["a_i"]) /
        greatest(ID_pairwise_aibi["a_i"], ID_pairwise_aibi["b_i"]))

    Silhouette = ID_pairwise_aibi.select("silouette").agg(
        avg("silouette")).collect()[0][0]

    hive_context.sql("drop table if exists " + dbname + ".pairwise_dist")
    hive_context.sql("drop table if exists " + dbname + ".id_cluster")
    return (Silhouette, ID_pairwise_aibi)
def build_tsne_matrix(
        spark,
        latent_matrix,
        genre_df='hdfs:/user/yw2115/gooreads_book_genres_initial.json.gz',
        save_csv='tsne_matrix.csv'):
    """Save the CSV used for the t-SNE plot in viz.py.

    Each book is labelled with the genre that has the highest count, then
    joined (inner, on ``book_id``) to the latent factors.

    reference: https://stackoverflow.com/questions/46179453/how-to-compute-maximum-per-row-and-return-a-colum-of-max-value-and-another-colu

    Args:
        spark: active SparkSession.
        latent_matrix: output from load_latent(model); must have ``book_id``.
        genre_df: path of the Goodreads genre JSON
            (gooreads_book_genres_initial.json.gz, downloaded from goodreads).
        save_csv: output path for the CSV.

    Returns:
        None. Writes ``book_id``, the latent factors and the matched genre
        (``maxColumn``) to ``save_csv``.
    """
    from pyspark.sql.types import StringType
    from pyspark.sql.functions import col, greatest, udf, array
    import pyspark.sql.functions as f

    genre_df = spark.read.json(genre_df)
    genre_at = genre_df.select(
        'book_id',
        f.expr('genres.children'),
        f.expr('genres.`comics, graphic`'),
        f.expr('genres.`fantasy, paranormal`'),
        f.expr('genres.fiction'),
        f.expr('genres.`history, historical fiction, biography`'),
        f.expr('genres.`mystery, thriller, crime`'),
        f.expr('genres.`non-fiction`'),
        f.expr('genres.poetry'),
        f.expr('genres.romance'),
        f.expr('genres.`young-adult`'))

    genre_names = genre_at.columns[1:]
    genre_cols = [col(name) for name in genre_names]
    df1 = genre_at.withColumn("maxValue", greatest(*genre_cols))

    def modify_values(r):
        # r holds the genre counts followed by their row maximum. book_id is
        # deliberately NOT part of r: it used to be compared against the
        # maximum too and could be reported as the "genre".
        *counts, top = r
        if top is None:
            # No genre counts at all for this book: no label, not the first
            # null column.
            return None
        for name, count in zip(genre_names, counts):
            if count == top:
                return name
        return None

    modify_values_udf = udf(modify_values, StringType())
    df1 = df1.withColumn(
        "maxColumn", modify_values_udf(array(*genre_cols, col("maxValue"))))

    book_genre = df1.select('book_id', 'maxColumn')
    tsne_matrix = latent_matrix.join(book_genre, on='book_id', how='inner')

    # save to csv for py script
    tsne_matrix.coalesce(1).write.csv(save_csv)
def main():
    """Join census and zipcode data, tag the largest racial group, export to JDBC.

    Reads AWS credentials from the environment, loads the two CSV files from
    S3, joins them on ``County``, adds ``max_value`` (greatest of the six
    racial columns) and ``MAX_Racial`` (name of the column holding it), and
    appends the result to the ``zipcode_income`` table.
    """
    spark = SparkSession \
        .builder \
        .appName("Supported_Tables_Aggregations") \
        .getOrCreate()
    sc = spark.sparkContext
    hadoop_conf = sc._jsc.hadoopConfiguration()
    hadoop_conf.set(
        "fs.s3n.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
    hadoop_conf.set("fs.s3n.awsAccessKeyId", os.environ['AWS_ACCESS_KEY_ID'])
    hadoop_conf.set("fs.s3n.awsSecretAccessKey",
                    os.environ['AWS_SECRET_ACCESS_KEY'])

    # load zipcode and income datasets from S3
    df_census = spark.read.format("csv").option(
        "header", "true").load("s3n://susiehuang-s3/yelp_all_csv/census_data.csv")
    df_zipcode1 = spark.read.format("csv").option(
        "header", "true").load("s3n://susiehuang-s3/yelp_all_csv/zipcode_county.csv")
    df_zipcode = df_zipcode1.select(
        col("zip_code").alias("zipcode"),
        col("state").alias("state_code"),
        col("county").alias("County"),
        col("city").alias("City"))

    # data transformation for income and zipcode datasets
    # Join by column NAME so the result carries a single County column; an
    # expression join keeps both and makes select('County') ambiguous.
    df1 = df_census.join(df_zipcode, 'County')

    race_columns = ['Hispanic', 'White', 'Black', 'Native', 'Asian', 'Pacific']

    # First column (in frame order) equal to the row maximum names the group.
    # Built directly as a when-chain instead of eval() on generated source.
    max_racial = None
    for c in df1.columns:
        if c not in race_columns:
            continue
        is_max = psf.col(c) == psf.col('max_value')
        if max_racial is None:
            max_racial = psf.when(is_max, psf.lit(c))
        else:
            max_racial = max_racial.when(is_max, psf.lit(c))

    # NOTE(review): the CSVs are read without schema inference, so these
    # columns are presumably strings and greatest() would compare them
    # lexicographically -- confirm and cast if numeric order is intended.
    df2 = df1.withColumn(
        "max_value",
        psf.greatest(df1.Hispanic, df1.White, df1.Black,
                     df1.Native, df1.Asian, df1.Pacific))\
        .withColumn("MAX_Racial", max_racial)

    df3 = df2.select('zipcode', 'state_code', 'State', 'County', 'City',
                     'Income', 'max_value', 'MAX_Racial')

    # export to DB
    df3.createOrReplaceTempView("zipcode_income_table")
    output = spark.sql("SELECT * FROM zipcode_income_table")
    output.write.format('jdbc').options(url='jdbc:xxx://10.0.0.7/business',
                                        driver='com.xxx.jdbc.Driver',
                                        dbtable='zipcode_income',
                                        user='******',
                                        password='******').mode('append').save()
def add_domtopic(df):
    """Find the dominant topic of each sample/row/document.

    Every column from the third one onwards is treated as a topic weight
    (assumes the first two columns are non-topic columns -- confirm against
    the caller).

    input: dataframe of weight of each topic
    output: the same dataframe plus ``domtopic`` (result of ``argmax`` over
        the topic columns) and ``weight`` (greatest topic weight of the row)
    """
    # One list for both columns: 'weight' previously used columns[2:-1] of
    # the ORIGINAL frame and therefore ignored the last topic column that
    # 'domtopic' does take into account.
    topic_cols = df.columns[2:]
    dominant_topic = F.udf(
        lambda *weights: argmax(topic_cols, *weights), StringType())
    return (df
            .withColumn('domtopic', dominant_topic(*topic_cols))
            .withColumn('weight',
                        F.greatest(*[F.col(x) for x in topic_cols])))
def test_greatest(data_gen):
    """Check that ``greatest`` gives the same result on GPU and CPU.

    Twenty generated columns (heavily weighted towards nulls) plus one
    non-null scalar are passed to ``f.greatest``.
    """
    num_cols = 20
    s1 = gen_scalar(data_gen, force_no_nulls=True)
    # we want lots of nulls
    gen = StructGen(
        [('_c' + str(x), data_gen.copy_special_case(None, weight=100.0))
         for x in range(0, num_cols)],
        nullable=False)
    command_args = [f.col('_c' + str(x)) for x in range(0, num_cols)]
    command_args.append(s1)
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: gen_df(spark, gen).select(f.greatest(*command_args)))
def quotient(primary_col: str, secondary_col: str, output_col: str,
             df: DataFrame):
    """Add ``output_col`` = smaller value / larger value of the two columns.

    Identical values give 1.0 and very different values give a score close
    to 0.0. The result is null when either input is null; the explicit
    check is required because ``least``/``greatest`` skip nulls.
    """
    primary = F.col(primary_col)
    secondary = F.col(secondary_col)
    ratio = F.least(primary, secondary) / F.greatest(primary, secondary)
    both_present = primary.isNotNull() & secondary.isNotNull()
    # No ``otherwise``: rows failing the condition come out as null.
    return df.withColumn(output_col, F.when(both_present, ratio))
def get_bins(sdf, bins):
    """Return ``bins + 1`` evenly spaced edges spanning all values of ``sdf``.

    ``sdf`` is a Spark DataFrame whose columns are all taken into account.
    When minimum and maximum coincide the range is widened by 0.5 on each
    side so the edges stay distinct.
    """
    if len(sdf.columns) <= 1:
        lowest = F.min(sdf.columns[-1])
        highest = F.max(sdf.columns[-1])
    else:
        # least/greatest need two or more inputs, hence the separate branch.
        lowest = F.least(*[F.min(column) for column in sdf])
        highest = F.greatest(*[F.max(column) for column in sdf])

    boundaries = sdf.select(lowest, highest).first()
    start, stop = boundaries[0], boundaries[1]
    if start == stop:
        start, stop = start - 0.5, stop + 0.5
    # divides the boundaries into bins
    return np.linspace(start, stop, bins + 1)
def get_predictions(label_col):
    """Return ``id2`` and the predicted class for ``label_col``.

    Uses ``preds_sdf`` and ``preds_cols`` from the enclosing scope. The
    prediction columns whose name contains ``label_col`` are compared with
    the row-wise maximum; the class name is the part of the column name
    after ``_<label_col>_``. Earlier columns win ties.
    """
    separator = "_{}_".format(label_col)
    branches = [
        F.when(F.col(name) == F.col("max_value"),
               F.lit(name.split(separator)[-1]))
        for name in preds_cols if label_col in name
    ]
    # Nest from the right: b1.otherwise(b2.otherwise(b3)). Folding from the
    # left calls otherwise() twice on the same column once there are three
    # or more classes, which Spark rejects.
    cond = reduce(lambda nested, branch: branch.otherwise(nested),
                  reversed(branches))

    # NOTE(review): max_value spans ALL prediction columns, not only those
    # of label_col -- confirm that is intended.
    tmp_sdf = preds_sdf.withColumn("max_value", F.greatest(*(F.col(c) for c in preds_cols)))\
        .withColumn(label_col, cond)
    return tmp_sdf.select(["id2", label_col])
def generate_graphs(data_1, data_2, name_1, name_2):
    """Box-plot the per-product average rating of two review datasets.

    Only products present in both datasets are compared, and only reviews
    written on or after the later of the two first-review dates of the
    product. The figure is saved under ``IMG_PATH + "countries/"``.

    Args:
        data_1, data_2: review DataFrames with ``product_id``,
            ``product_title``, ``star_rating``, ``review_id``, ``review_date``.
        name_1, name_2: labels used in the plot and in the file name.
    """

    def group_by_product(data):
        # Aggregates are named with alias() rather than by renaming Spark's
        # auto-generated column names, whose exact text is not stable and
        # whose rename is a silent no-op when it does not match.
        return data.groupby('product_id', 'product_title').agg(
            f.avg('star_rating').alias('rating'),
            f.count('review_id').alias('n_reviews'),
            f.min(to_date(data['review_date'],
                          'yyyy-MM-dd')).alias('first_date'),
            f.stddev('star_rating').alias('std_rating'))

    avg_1 = group_by_product(data_1)
    avg_2 = group_by_product(data_2)

    # To be able to differentiate columns after a later join
    c1 = avg_1.alias("c1")
    c2 = avg_2.alias("c2")
    c1_c2 = c1.join(c2, f.col('c1.product_id') == f.col('c2.product_id'))

    # Per common product: the later of the two first-review dates.
    latest_date = c1_c2.select(
        f.col('c1.product_id').alias('id'),
        greatest(f.col('c1.first_date'),
                 f.col('c2.first_date')).alias('latest_date'))

    c1_common_with_date = data_1.join(
        latest_date, data_1['product_id'] == latest_date['id'])
    c1_common_reviews = c1_common_with_date.where(
        'review_date >= latest_date')
    c2_common_with_date = data_2.join(
        latest_date, data_2['product_id'] == latest_date['id'])
    c2_common_reviews = c2_common_with_date.where(
        'review_date >= latest_date')

    common_c1_avg = group_by_product(c1_common_reviews)
    common_c2_avg = group_by_product(c2_common_reviews)

    c1_pd = common_c1_avg.toPandas()
    c2_pd = common_c2_avg.toPandas()

    plt.figure(figsize=(10, 6))
    plt.boxplot([c1_pd['rating'], c2_pd['rating']], 0, sym='',
                autorange=True, labels=[name_1, name_2])
    plt.title('Distribution of the average ratings / product - ' + name_1 +
              " vs " + name_2)
    plt.ylabel('Average rating')
    plt.ylim(2.4, 5.1)
    plt.savefig(IMG_PATH + "countries/average_rating_" + name_1 + "_" +
                name_2 + ".png", bbox_inches='tight')
    plt.clf()
def prepare_df(
    df: pyspark.sql.DataFrame,
    store_csv: pyspark.sql.DataFrame,
    store_states_csv: pyspark.sql.DataFrame,
    state_names_csv: pyspark.sql.DataFrame,
    google_trend_csv: pyspark.sql.DataFrame,
    weather_csv: pyspark.sql.DataFrame,
) -> pyspark.sql.DataFrame:
    """Enrich the sales frame with store, Google Trend and weather features.

    All joins are inner joins; the final assertion checks that none of them
    dropped a row. Relies on the helpers ``expand_date`` and
    ``prepare_google_trend``.
    """
    num_rows = df.count()

    # expand dates
    df = expand_date(df)

    # flags arrive as strings; anything other than "0" counts as true
    raw = df
    for flag in ("Open", "Promo", "StateHoliday", "SchoolHoliday"):
        df = df.withColumn(flag, raw[flag] != "0")

    # merge store information
    df = df.join(store_csv.join(store_states_csv, "Store"), "Store")

    # merge Google Trend information (per state)
    google_trend_all = prepare_google_trend(google_trend_csv)
    with_trend = df.join(google_trend_all, ["State", "Year", "Week"])
    df = with_trend.select(df["*"], google_trend_all.trend)

    # merge in Google Trend for whole Germany
    google_trend_de = google_trend_all.filter(
        google_trend_all.file == "Rossmann_DE"
    ).withColumnRenamed("trend", "trend_de")
    with_trend_de = df.join(google_trend_de, ["Year", "Week"])
    df = with_trend_de.select(df["*"], google_trend_de.trend_de)

    # merge weather
    weather = weather_csv.join(
        state_names_csv, weather_csv.file == state_names_csv.StateName)
    df = df.join(weather, ["State", "Date"])

    # fix null values
    defaults = (
        ("CompetitionOpenSinceYear", 1900),
        ("CompetitionOpenSinceMonth", 1),
        ("Promo2SinceYear", 1900),
        ("Promo2SinceWeek", 1),
    )
    unfilled = df
    for name, fallback in defaults:
        df = df.withColumn(name, F.coalesce(unfilled[name], F.lit(fallback)))

    # days and months since the competition has been open, cap it to 2 years
    competition_since = F.to_date(
        F.format_string("%s-%s-15", df.CompetitionOpenSinceYear,
                        df.CompetitionOpenSinceMonth))
    df = df.withColumn("CompetitionOpenSince", competition_since)
    competition_days = F.greatest(
        F.lit(0),
        F.least(F.lit(360 * 2),
                F.datediff(df.Date, df.CompetitionOpenSince)),
    )
    df = df.withColumn(
        "CompetitionDaysOpen",
        F.when(df.CompetitionOpenSinceYear > 1900,
               competition_days).otherwise(0),
    )
    df = df.withColumn("CompetitionMonthsOpen",
                       (df.CompetitionDaysOpen / 30).cast(T.IntegerType()))

    # days and weeks of promotion, cap it to 25 weeks
    promo_since = F.expr(
        'date_add(format_string("%s-01-01", Promo2SinceYear), (cast(Promo2SinceWeek as int) - 1) * 7)'
    )
    df = df.withColumn("Promo2Since", promo_since)
    promo_days = F.greatest(
        F.lit(0),
        F.least(F.lit(25 * 7), F.datediff(df.Date, df.Promo2Since)))
    df = df.withColumn(
        "Promo2Days",
        F.when(df.Promo2SinceYear > 1900, promo_days).otherwise(0),
    )
    df = df.withColumn("Promo2Weeks",
                       (df.Promo2Days / 7).cast(T.IntegerType()))

    # ensure that no row was lost through inner joins
    assert num_rows == df.count(), "lost rows in joins"
    return df
def assignment_transformation(expedia_df, hotels_weather_df):
    """Count stay types per hotel and name the most popular one.

    Bookings are joined to the hotel weather of the check-in day, the stay
    duration (days between ``srch_ci`` and ``srch_co``) is classified, and
    the classes are counted per ``hotel_id`` and ``batch_timestamp``.

    Stay classes, by duration in days:
        erroneous data          null, <= 0 or > 30
        short stay              1
        standard stay           2-7
        standard extended stay  8-14
        long stay               15-30

    Returns:
        DataFrame with the five ``*_cnt`` columns and
        ``most_popular_stay_type``; ties resolve in the order erroneous,
        short, standard, standard extended, long.
    """
    # enriching booking data with avg temp on the srch_ci and duration of stay
    same_hotel_and_day = (
        (expedia_df.hotel_id == hotels_weather_df.id)
        & (expedia_df.srch_ci == hotels_weather_df.wthr_date))
    expedia_enriched = (
        expedia_df.join(hotels_weather_df, same_hotel_and_day)
        .select(expedia_df["*"], hotels_weather_df.avg_c)
        .withColumn("duration_of_stay",
                    datediff(col("srch_co"), col("srch_ci"))))

    # enriching expedia data with stay types for further summing
    # Upper bounds are inclusive so that 7, 14 and 30 days are counted;
    # with strict '<' those durations matched no class at all.
    duration = col("duration_of_stay")
    stay_data = (
        expedia_enriched
        .withColumn("short_stay", when(duration == 1, 1).otherwise(0))
        .withColumn(
            "erroneous_data",
            when((duration <= 0) | (duration > 30) | (duration.isNull()),
                 1).otherwise(0))
        .withColumn(
            "standard_stay",
            when((duration >= 2) & (duration <= 7), 1).otherwise(0))
        .withColumn(
            "standard_extended_stay",
            when((duration >= 8) & (duration <= 14), 1).otherwise(0))
        .withColumn(
            "long_stay",
            when((duration >= 15) & (duration <= 30), 1).otherwise(0))
        .withColumn("batch_timestamp", current_timestamp()))

    # if dataframe is streaming for aggregation we need to define a watermark period.
    stay_data = stay_data.withWatermark(
        "batch_timestamp", "1 minute") if stay_data.isStreaming else stay_data

    # grouping and calculating stay types for each hotel
    cnt = (stay_data.groupBy("hotel_id", "batch_timestamp").agg(
        sum("short_stay").alias("short_stay_cnt"),
        sum("erroneous_data").alias("erroneous_data_cnt"),
        sum("standard_stay").alias("standard_stay_cnt"),
        sum("standard_extended_stay").alias("standard_extended_stay_cnt"),
        sum("long_stay").alias("long_stay_cnt")))

    # calculating most popular stay type for each hotel
    popular = col("popular_stay_cnt")
    most_popular = (
        when(popular == cnt["erroneous_data_cnt"], "Erroneous data")
        .when(popular == cnt["short_stay_cnt"], "Short stay")
        .when(popular == cnt["standard_stay_cnt"], "Standard stay")
        .when(popular == cnt["standard_extended_stay_cnt"],
              "Standard extended stay")
        .when(popular == cnt["long_stay_cnt"], "Long stay"))
    return (
        cnt.withColumn(
            "popular_stay_cnt",
            greatest("erroneous_data_cnt", "short_stay_cnt",
                     "standard_stay_cnt", "standard_extended_stay_cnt",
                     "long_stay_cnt"))
        .withColumn("most_popular_stay_type", most_popular)
        .select(cnt["*"], col("most_popular_stay_type")))
def tocolumns(df, expr):
    """Translate a histbook expression tree into a Spark column expression.

    ``Const`` becomes a literal, ``Name``/``Predicate`` a column of ``df``,
    and ``Call`` is translated recursively. Calls without a translation
    raise ``NotImplementedError`` carrying the function name; any other
    node type raises ``AssertionError``.
    """
    import pyspark.sql.functions as fcns

    if isinstance(expr, histbook.expr.Const):
        return fcns.lit(expr.value)
    if isinstance(expr, (histbook.expr.Name, histbook.expr.Predicate)):
        return df[expr.value]
    if not isinstance(expr, histbook.expr.Call):
        raise AssertionError(expr)

    name = expr.fcn

    def arg(position):
        return tocolumns(df, expr.args[position])

    # histbook name -> pyspark.sql.functions attribute name. Attributes are
    # resolved lazily so that only the function actually needed is looked up.
    one_argument = {
        "abs": "abs", "fabs": "abs",
        "arccos": "acos", "arcsin": "asin", "arctan": "atan",
        "ceil": "ceil", "cos": "cos", "cosh": "cosh",
        "exp": "exp", "expm1": "expm1", "factorial": "factorial",
        "floor": "floor", "isnan": "isnan",
        "log10": "log10", "log1p": "log1p", "log": "log", "log2": "log2",
        "sinh": "sinh", "sin": "sin", "sqrt": "sqrt",
        "tanh": "tanh", "tan": "tan", "rint": "rint",
    }
    if name in one_argument:
        return getattr(fcns, one_argument[name])(arg(0))

    any_arguments = {"max": "greatest", "fmax": "greatest",
                     "min": "least", "fmin": "least"}
    if name in any_arguments:
        return getattr(fcns, any_arguments[name])(
            *[tocolumns(df, x) for x in expr.args])

    two_arguments = {"arctan2": "atan2", "hypot": "hypot", "pow": "pow"}
    if name in two_arguments:
        return getattr(fcns, two_arguments[name])(arg(0), arg(1))

    if name == "rad2deg":
        return arg(0) * (180.0 / math.pi)
    if name == "deg2rad":
        return arg(0) * (math.pi / 180.0)

    # Shifts are only translated when the shift amount is a constant.
    shifts = {"left_shift": "shiftLeft", "right_shift": "shiftRight"}
    if name in shifts and isinstance(expr.args[1], histbook.expr.Const):
        return getattr(fcns, shifts[name])(arg(0), expr.args[1].value)

    # NOTE(review): "fmod" is deliberately absent here -- it has always been
    # reported as not implemented, so only "mod" maps to '%'.
    if name == "mod":
        return arg(0) % arg(1)

    if name == "where":
        return fcns.when(arg(0), arg(1)).otherwise(arg(2))

    if name == "numpy.logical_not":
        return ~arg(0)

    # NOTE(review): "numpy.isin" applies Python's 'in' to two columns, as it
    # always has; confirm this is the intended translation.
    operators = {
        "numpy.equal": lambda a, b: a == b,
        "numpy.not_equal": lambda a, b: a != b,
        "numpy.less": lambda a, b: a < b,
        "numpy.less_equal": lambda a, b: a <= b,
        "numpy.isin": lambda a, b: a in b,
        "numpy.add": lambda a, b: a + b,
        "numpy.subtract": lambda a, b: a - b,
        "numpy.multiply": lambda a, b: a * b,
        "numpy.true_divide": lambda a, b: a / b,
        "numpy.logical_or": lambda a, b: a | b,
        "numpy.logical_and": lambda a, b: a & b,
    }
    if name in operators:
        return operators[name](arg(0), arg(1))

    # Everything else has no translation: arccosh, arcsinh, arctanh,
    # copysign, erfc, erf, fmod, gamma, isinf, lgamma, trunc (fcns.trunc is
    # for dates), xor, conjugate, exp2, heaviside, isfinite, logaddexp2,
    # logaddexp, sign, non-constant shifts, and unknown names.
    raise NotImplementedError(name)
def fillspark(hist, df):
    """Build a Spark aggregation that fills ``hist`` from the DataFrame ``df``.

    Every grouping and fixed axis of ``hist`` is turned into a Spark column
    holding that row's group key or bin index.  The columns are packed into
    one struct, ``df`` is grouped by that struct, and the profile/weight sums
    (plus a row count when the histogram is unweighted) are aggregated per
    group.

    The ``collect`` call is deferred: this function returns a zero-argument
    callable that collects the aggregated rows and accumulates them into
    ``hist._content``.

    Args:
        hist: histogram to fill.  Its ``_group``, ``_fixed``, ``_profile``,
            ``_weightoriginal``, ``_weightparsed`` and ``_shape`` attributes
            are read; ``_content`` is updated by the returned callable.
        df: Spark DataFrame that the axis expressions are evaluated against.

    Returns:
        The ``wait`` callable described above.

    Raises:
        AssertionError: if an axis has an unrecognized type.
    """
    import pyspark.sql.functions as fcns

    # One index column per grouping/fixed axis, in axis order.
    indexes = []
    for axis in hist._group + hist._fixed:
        exprcol = tocolumns(df, histbook.instr.totree(axis._parsed))

        if isinstance(axis, histbook.axis.groupby):
            # The expression value itself is the group key.
            indexes.append(exprcol)

        elif isinstance(axis, histbook.axis.groupbin):
            scaled = (exprcol - float(axis.origin)) * (1.0 / float(axis.binwidth))
            if axis.closedlow:
                discretized = fcns.floor(scaled)
            else:
                discretized = fcns.ceil(scaled) - 1
            # Key is the bin's edge value; NaN results are replaced by the
            # literal "NaN".
            indexes.append(
                fcns.nanvl(
                    discretized * float(axis.binwidth) + float(axis.origin),
                    fcns.lit("NaN")))

        elif isinstance(axis, histbook.axis.bin):
            scaled = (exprcol - float(axis.low)) * (
                int(axis.numbins) / (float(axis.high) - float(axis.low)))
            if axis.closedlow:
                discretized = fcns.floor(scaled) + 1
            else:
                discretized = fcns.ceil(scaled)
            # Clamp to [0, numbins + 1]; null/NaN inputs get numbins + 2.
            indexes.append(
                fcns.when(
                    fcns.isnull(exprcol) | fcns.isnan(exprcol),
                    int(axis.numbins) + 2).otherwise(
                        fcns.greatest(
                            fcns.lit(0),
                            fcns.least(fcns.lit(int(axis.numbins) + 1),
                                       discretized))))

        elif isinstance(axis, histbook.axis.intbin):
            # Clamp round(value - min + 1) to [0, max - min + 1].
            indexes.append(
                fcns.greatest(
                    fcns.lit(0),
                    fcns.least(fcns.lit(int(axis.max) - int(axis.min) + 1),
                               fcns.round(exprcol - int(axis.min) + 1))))

        elif isinstance(axis, histbook.axis.split):
            # Chain one ``when`` per edge; the first edge the value falls
            # below decides the index, otherwise the index is len(edges).
            def build(x, i):
                if i < len(axis.edges):
                    if axis.closedlow:
                        return build(x.when(exprcol < float(axis.edges[i]), i), i + 1)
                    else:
                        return build(
                            x.when(exprcol <= float(axis.edges[i]), i), i + 1)
                else:
                    return x.otherwise(i)

            # null/NaN inputs are tested first and get len(edges) + 1.
            indexes.append(
                build(
                    fcns.when(
                        fcns.isnull(exprcol) | fcns.isnan(exprcol),
                        len(axis.edges) + 1), 0))

        elif isinstance(axis, histbook.axis.cut):
            # 0 when the cut expression is true, 1 otherwise.
            indexes.append(fcns.when(exprcol, 0).otherwise(1))

        else:
            raise AssertionError(axis)

    # Every selected column gets a unique alias "@0", "@1", ... in the order
    # it is created (a one-element list serves as a mutable counter).
    aliasnum = [-1]

    def alias(x):
        aliasnum[0] += 1
        return x.alias("@" + str(aliasnum[0]))

    index = alias(fcns.struct(*indexes))

    selectcols = [index]
    if hist._weightoriginal is not None:
        weightcol = tocolumns(df, histbook.instr.totree(hist._weightparsed))
    for axis in hist._profile:
        exprcol = tocolumns(df, histbook.instr.totree(axis._parsed))
        if hist._weightoriginal is None:
            selectcols.append(alias(exprcol))
            selectcols.append(alias(exprcol * exprcol))
        else:
            selectcols.append(alias(exprcol * weightcol))
            selectcols.append(alias(exprcol * exprcol * weightcol))

    if hist._weightoriginal is None:
        df2 = df.select(*selectcols)
    else:
        selectcols.append(alias(weightcol))
        selectcols.append(alias(weightcol * weightcol))
        df2 = df.select(*selectcols)

    # Sum every value column; unweighted histograms additionally count rows.
    aggs = [fcns.sum(df2[n]) for n in df2.columns[1:]]
    if hist._weightoriginal is None:
        aggs.append(fcns.count(df2[df2.columns[0]]))

    def getornew(content, key, nextaxis):
        # Existing entry, or a fresh dict (another grouping level follows)
        # or a fresh zeroed array (the fixed axes start here).
        if key in content:
            return content[key]
        elif isinstance(nextaxis, histbook.axis.GroupAxis):
            return {}
        else:
            return numpy.zeros(hist._shape, dtype=histbook.hist.COUNTTYPE)

    def recurse(index, columns, axis, content):
        # Walk one index component per axis down into ``content`` and add
        # the aggregated ``columns`` at the leaf.
        if len(axis) == 0:
            content += columns

        elif isinstance(axis[0], (histbook.axis.groupby, histbook.axis.groupbin)):
            content[index[0]] = recurse(
                index[1:], columns, axis[1:],
                getornew(content, index[0], axis[1] if len(axis) > 1 else None))
            if isinstance(axis[0], histbook.axis.groupbin) and None in content:
                content["NaN"] = content[None]
                del content[None]

        elif isinstance(
                axis[0],
                (histbook.axis.bin, histbook.axis.intbin, histbook.axis.split)):
            # Spark index 0 is the below-range slot; shift down by one when
            # the axis does not keep an underflow bin.
            i = index[0] - (1 if not axis[0].underflow else 0)
            # Without the lower bound, i == -1 would address the *last* bin
            # through negative indexing instead of being dropped.
            if 0 <= int(i) < axis[0].totbins:
                recurse(index[1:], columns, axis[1:], content[int(i)])

        elif isinstance(axis[0], histbook.axis.cut):
            recurse(index[1:], columns, axis[1:], content[0 if index[0] else 1])

        else:
            raise AssertionError(axis[0])

        return content

    query = df2.groupBy(df2[df2.columns[0]]).agg(*aggs)

    def wait():
        # row[0] is the index struct, row[1:] the aggregated values.
        for row in query.collect():
            recurse(row[0], row[1:], hist._group + hist._fixed, hist._content)

    return wait
def run(w=14, l=114, threshold=0):
    """Compare the first two sequences of ``data/example.fasta`` by spaced words.

    Builds all ``w``-grams of both sequences, cross-joins them, keeps the
    pairs that form a spaced-word match under a random pattern with a score
    above ``threshold``, de-duplicates the matches and prints the value
    returned by ``JukesCantor`` for the matched pairs.

    :param w: n-gram size; also used as ``k`` by the scoring helpers.
    :param l: first argument passed to ``random_pattern``.
    :param threshold: matches must score strictly above this value.
    """
    pattern = random_pattern(l, w)
    k = w
    substitution_matrix = [[91, -114, -31, -123],
                           [-114, 100, -125, -31],
                           [-31, -125, 100, -114],
                           [-123, -31, -114, 91]]

    spark = SparkSession.builder.appName('Distributed FSWM').getOrCreate()
    raw_lines = spark.read.text("data/example.fasta")

    # Lines containing '>' are dropped; each remaining line becomes a list
    # of its characters.
    residue_rows = raw_lines.where(~raw_lines.value.contains('>')).rdd
    sequences = residue_rows.map(list).map(
        lambda x: (x[0].encode('ascii'))).map(list)

    # Only the first two sequences are used, numbered 0 and 1.
    schema = StructType([
        StructField("id", IntegerType()),
        StructField("Sequence", ArrayType(StringType()))
    ])
    numbered_sequences = list(enumerate(sequences.take(2)))
    df = spark.createDataFrame(numbered_sequences, schema=schema)

    # One row per n-gram.
    ngram = NGram(n=w, inputCol="Sequence", outputCol="ngrams")
    df_clean = ngram.transform(df).select(["id", "ngrams"])
    df_explode = df_clean.withColumn('ngrams', explode('ngrams'))

    # UDF that reduces an n-gram to an integer word.
    udf_object = udf(lambda y: reducer_concat(y), IntegerType())

    def words_of(sequence_id, id_col, gram_col, word_col):
        # Words of one sequence, each tagged with a 1-based running id.
        grams = df_explode.where(df_clean.id == sequence_id)
        grams = grams.withColumn(id_col, monotonically_increasing_id() + 1)
        grams = grams.withColumnRenamed('ngrams', gram_col).select(id_col, gram_col)
        return grams.withColumn(
            word_col, udf_object(grams[gram_col])).select(id_col, word_col)

    df0 = words_of(0, "id0", "w0", "word0")
    df0.show()
    df1 = words_of(1, "id1", "w1", "word1")
    df1.show(truncate=False)

    word_pair = (col("word0"), col("word1"))

    matches = df0.crossJoin(df1)
    matches = matches.withColumn(
        "spaced_word", udf_spaced_words(pattern)(*word_pair))
    matches = matches.where(col("spaced_word").isNotNull())
    matches = matches.withColumn(
        "score", udf_score(pattern, k, substitution_matrix)(*word_pair))
    matches = matches.where(col("score") > threshold)
    matches = matches.orderBy(["spaced_word", "score"], ascending=False)
    matches = matches.withColumn("min", least(col("id0"), col("id1")))
    matches = matches.withColumn("max", greatest(col("id0"), col("id1")))
    matches = matches.drop_duplicates(subset=["spaced_word", "min"])
    matches = matches.drop_duplicates(subset=["spaced_word", "max"])
    df_result = matches.withColumn(
        "JukesCantor", udf_jukes_cantor(pattern, k)(*word_pair))
    df_result.show()

    numerator = df_result.agg(suma("JukesCantor")).collect()[0][0] * 1.0
    denominator = (k - bin(pattern).count("1") / 2) * df_result.count()
    p = numerator / denominator
    print(JukesCantor(p))
def amend_device_tracking(observations_df, tracking_df, last_updated_by):
    # type: (DataFrame, DataFrame, str) -> typing.Tuple[DataFrame, DataFrame, DataFrame]
    """
    Merge a batch of observations into the device tracking data set.

    A tracking record is considered changed when the device is not tracked
    yet, or when the batch saw it earlier than its recorded first
    observation or later than its recorded last observation.

    :param observations_df: Observations to merge in.
    :param tracking_df: Current device tracking data set.
    :param last_updated_by: Value written to the ``last_updated_by`` field of every changed record.
    :return: A 3-tuple of (changed tracking records only, full tracking data set after the merge,
             changed records whose device was not tracked before)
    """
    obs = observations_df.alias('o')
    tracked = tracking_df.alias('t')
    key_columns = ['organization', 'mac']

    # Conditions under which an observation changes the tracking record.
    not_tracked_yet = col('t.mac').isNull()
    observed_earlier = col('o.first_observed_at') < col('t.first_observed_at')
    observed_later = col('o.last_observed_at') > col('t.last_observed_at')

    observed_vs_tracked = obs.select(
        'organization', 'mac', 'first_observed_at', 'last_observed_at'
    ).join(tracked, on=key_columns, how='left')

    # Changed records carry the widened observation window.
    delta_df = observed_vs_tracked.where(
        not_tracked_yet | observed_earlier | observed_later
    ).select(
        'o.organization',
        'o.mac',
        least('o.first_observed_at', 't.first_observed_at').name('first_observed_at'),
        greatest('o.last_observed_at', 't.last_observed_at').name('last_observed_at'),
    ).cache()

    # Full data set = untouched records + stamped changed records.  The
    # eager checkpoint breaks the cyclic lineage caused by reading from and
    # writing to the same table.
    untouched_df = tracked.join(delta_df, on=key_columns, how='left_anti')
    stamped_delta_df = delta_df.select(
        '*',
        current_timestamp().name('last_updated_at'),
        lit(last_updated_by).name('last_updated_by')
    )
    refresh_df = untouched_df.unionByName(stamped_delta_df).coalesce(1).checkpoint(eager=True)

    # Changed records with no counterpart in the existing tracking data.
    new_devices_df = delta_df.join(tracked, on=key_columns, how='left_anti').cache()

    return delta_df, refresh_df, new_devices_df
# Expose the per-player, per-zone hit rates to SQL under the name "df".
sqlcontext.registerDataFrameAsTable(df7, "df")

# One row per player, one hit-rate column per zone.
zone_pivot_sql = """SELECT df.playername , MAX(CASE WHEN df.prediction = 'Zone1' THEN df.hitrate END) AS Zone1hitrate , MAX(CASE WHEN df.prediction = 'Zone2' THEN df.hitrate END) AS Zone2hitrate , MAX(CASE WHEN df.prediction = 'Zone3' THEN df.hitrate END) AS Zone3hitrate , MAX(CASE WHEN df.prediction = 'Zone4' THEN df.hitrate END) AS Zone4hitrate FROM df GROUP BY df.playername"""
df8 = sqlcontext.sql(zone_pivot_sql)

zone_labels = ["Zone1", "Zone2", "Zone3", "Zone4"]
hitrate_columns = ["Zone1hitrate", "Zone2hitrate", "Zone3hitrate", "Zone4hitrate"]

# Same columns plus the highest hit rate across the four zones.
selected_columns = [df8.playername]
for hitrate_column in hitrate_columns:
    selected_columns.append(df8[hitrate_column])
selected_columns.append(greatest(*hitrate_columns).alias("besthitrate"))
df9 = df8.select(*selected_columns)

# The best zone is the first zone whose hit rate equals the best hit rate.
best_zone = None
for zone_label, hitrate_column in zip(zone_labels, hitrate_columns):
    is_best = df9[hitrate_column] == df9.besthitrate
    if best_zone is None:
        best_zone = when(is_best, zone_label)
    else:
        best_zone = best_zone.when(is_best, zone_label)
df10 = df9.withColumn("bestzone", best_zone)

sqlcontext.registerDataFrameAsTable(df10, "df")

print(
    "The NBA player have been classified into four confortable zones, with the following structure: [shotclock,shotdist,closedefdist]"
)
print("The four zones are:")
for zone_label, centroid in (("Zone1", centroid1), ("Zone2", centroid2), ("Zone3", centroid3)):
    print("%s: %s" % (zone_label, centroid))
df = sqlContext.read.format('parquet').load( 'hdfs:/scholar_data/tokens_count_by_year.parquet') # keep only tokens starting from 3 characters in length df = df.filter('LENGTH(entities) > 2') # gather column names linked to years col_years = [col_name for col_name in df.columns] col_years.remove('entities') # Find peak usage of token across the years # https://stackoverflow.com/questions/40874657/pyspark-compute-row-maximum-of-the-subset-of-columns-and-add-to-an-exisiting-da minf = F.lit(float("-inf")) df = df.withColumn( "year_max", F.greatest(*[F.coalesce(F.col(year), minf) for year in col_years])) # forget about tokens that have never been really used df = df.filter("year_max > 10").drop('year_max') # find total number of "valid" tokens used on each year df = df.join(df.groupby().sum(*col_years)) # retrieve token frequency (times common coefficient) for each year # coefficient is to make sure we do not limitations of float precision too hard for year in col_years: df = df.withColumn(year, 100000.0 * F.col(year) / F.col(f'sum({year})')).drop(f'sum({year})') # store results df.write.save('hdfs:/scholar_data/tokens_freq_by_year.parquet',
def startCalculation(self):
    """Compute per-user, per-category tweet features.

    Reads ``self.tweet_df`` and ``self.user_df`` and sets:

    * ``self.list_screen_names`` -- distinct user screen names,
    * ``self.list_categories`` -- sorted distinct tweet categories,
    * ``self.results_df`` -- one row per user id with a single struct
      column ``user_features[_<method_name>]`` holding ``tweets_total``,
      ``dict_tweet_by_topic``, ``dict_days_posted_by_topic``,
      ``dict_focus_rate`` and ``dict_activeness_1`` .. ``_3``.

    Users without any tweet are kept (LEFT join) and get a
    ``tweets_total`` of 0.
    """
    # cache dataframes
    tw = self.tweet_df
    tw.cache()
    u = self.user_df
    u.cache()

    # calculate duration of dataset
    dates = tw.select('created_at').rdd.map(
        lambda r: convert_twitter_date(r[0])).collect()
    duration_of_dataset = self.get_duration_of_dataset(dates)

    # list of names
    self.list_screen_names = u.select('screen_name').rdd.map(
        lambda r: r[0]).distinct().collect()

    # list of categories
    list_categories = tw.select("category").rdd.map(
        lambda r: r[0]).distinct().collect()
    list_categories.sort()
    self.list_categories = list_categories

    # format dates and remove hour info
    format_dates = udf(convert_twitter_date_noHour, DateType())
    updated_tweet_df = tw.withColumn("formatted_date",
                                     format_dates(tw["created_at"]))

    joined_df = u.join(updated_tweet_df,
                       u.id == updated_tweet_df.userId,
                       how='left')

    # calculate tweets count of all users.  A plain row count would report
    # 1 for a user without tweets, because the LEFT join still emits one
    # (null-extended) row for that user; count matched tweets only.
    has_tweet = when(updated_tweet_df.userId.isNotNull(), 1).otherwise(0)
    tweets_total = joined_df.groupBy("id").agg(
        sum(has_tweet).alias("tweets_total")).orderBy(
            "tweets_total", ascending=False)

    # calculate tweets count of all users by topic
    tweets_by_topic = joined_df.groupBy("id").pivot(
        "category").count().fillna(0, subset=list_categories)
    tweets_by_topic_nested = tweets_by_topic.select(
        "id", struct(list_categories).alias("dict_tweet_by_topic"))

    # calculate days posted of all users by topic: first flag each
    # (user, day, category) with 0/1, then add the flags up per user
    days_posted_by_topic = joined_df.groupBy(
        "id", "formatted_date").pivot("category").count().fillna(
            0, subset=list_categories)
    for cat in list_categories:
        days_posted_by_topic = days_posted_by_topic.withColumn(
            cat, when(days_posted_by_topic[cat] > 0, 1).otherwise(0))
    days_posted_by_topic_summed = days_posted_by_topic.groupBy("id").agg(
        *[sum(c).alias(c) for c in list_categories])
    days_posted_by_topic_nested = days_posted_by_topic_summed.select(
        "id", struct(list_categories).alias("dict_days_posted_by_topic"))

    # join tweets_total, tweets_by_topic_nested and days_posted_by_topic_nested
    temp_u = tweets_total.join(tweets_by_topic_nested,
                               "id").join(days_posted_by_topic_nested, "id")

    def tweets_of(cat):
        return col("dict_tweet_by_topic.{}".format(cat))

    def days_of(cat):
        return col("dict_days_posted_by_topic.{}".format(cat))

    def add_nested_feature(frame, kept_columns, feature_name, value_for):
        # Compute one value per category, then pack them into one struct.
        for cat in list_categories:
            frame = frame.withColumn(cat, value_for(cat))
        nested = struct(list_categories).alias(feature_name)
        return frame.select(*(kept_columns + [nested]))

    features = [
        # share of the user's tweets that fall into the category
        ("dict_focus_rate",
         lambda cat: tweets_of(cat) / greatest(lit(1), col("tweets_total"))),
        # days posted in the category / dataset duration
        ("dict_activeness_1",
         lambda cat: days_of(cat) / duration_of_dataset),
        # tweets in the category / dataset duration
        ("dict_activeness_2",
         lambda cat: tweets_of(cat) / duration_of_dataset),
        # tweets * days posted in the category / dataset duration
        ("dict_activeness_3",
         lambda cat: tweets_of(cat) * days_of(cat) / duration_of_dataset),
    ]
    kept_columns = ["id", "tweets_total", "dict_tweet_by_topic",
                    "dict_days_posted_by_topic"]
    for feature_name, value_for in features:
        temp_u = add_nested_feature(temp_u, kept_columns, feature_name,
                                    value_for)
        kept_columns.append(feature_name)

    # set results: everything except "id" goes into one struct column
    suffix = ("_" + self.method_name) if self.method_name else ""
    self.results_df = temp_u.select(
        "id",
        struct(temp_u.columns[1:]).alias("user_features{}".format(suffix)))
def max(df, cols: List[str]):
    """Return ``df`` reduced to one column: the row-wise maximum of ``cols``.

    Args:
        df: Spark DataFrame containing the named columns.
        cols: names of the columns to compare.  ``F.greatest`` requires at
            least two columns.

    Returns:
        A single-column DataFrame with the greatest value of ``cols`` for
        every row.
    """
    # greatest() is variadic; passing the list itself would hand it a single
    # argument, which it rejects.
    return df.select(F.greatest(*cols))
##Jas_Labour_Production_df.select('HOURS_BILLABLE').distinct().show()
##Restrict production records to transactions dated 2019-01-01 through 2019-02-21 (both bounds inclusive)
Jas_Labour_Production_df=Jas_Labour_Production_df.filter(Jas_Labour_Production_df['TRANSACTION_DATE1'] >= lit("2019-01-01"))\
    .filter(Jas_Labour_Production_df['TRANSACTION_DATE1'] <= lit("2019-02-21"))
##Jas_Labour_Production_df.printSchema()
##Inner-join the paid-hours source with the production source on their key columns (KEY_COL == KEY_COL1)
##and keep the result in an intermediate dataframe
Jas_Labour_Interm_df=Jas_Labour_Paid_df.join(Jas_Labour_Production_df,Jas_Labour_Paid_df.KEY_COL==Jas_Labour_Production_df.KEY_COL1,'inner')\
    .select('TRANSACTION_DATE','EMPLOYEE_NUMBER','KEY_COL','EMPLOYEE_NAME','PROGRAM_DESC','AVAIL_OT_HRS','WEEK_NUMBER','DIRECT_INDIRECT',\
    'HOURS_TOTAL',Jas_Labour_Production_df['PROD_REG_HRS'],Jas_Labour_Production_df['PROD_OT_HRS'],Jas_Labour_Production_df['HOURS_BILLABLE'])
##Final dataframe with the required columns
##AVAIL_REG_HRS: HOURS_BILLABLE when it is non-zero, otherwise 0
##EXP_REG_HRS: the larger of HOURS_TOTAL and PROD_REG_HRS when both are non-zero, whichever one is
##non-zero when only one is; no branch matches when both are zero, so the value is then null
##RATE_TYPE, RATE_VALUE, COMPANY_CODE and SOURCE are constants
Jas_Labour_Final_df=Jas_Labour_Interm_df\
    .withColumn('AVAIL_REG_HRS', when(col('HOURS_BILLABLE') !=0, col('HOURS_BILLABLE')).otherwise(0))\
    .withColumn('EXP_REG_HRS',\
    when(((col('HOURS_TOTAL') !=0) & (col('PROD_REG_HRS')!=0)),greatest(col('HOURS_TOTAL'),col('PROD_REG_HRS'))).when(((col('HOURS_TOTAL')!=0) & (col('PROD_REG_HRS') == 0)),col('HOURS_TOTAL'))\
    .when(((col('HOURS_TOTAL')==0) & (col('PROD_REG_HRS') !=0)), col('PROD_REG_HRS')))\
    .withColumn('RATE_TYPE', lit('JAS'))\
    .withColumn('RATE_VALUE', lit(80))\
    .withColumn('COMPANY_CODE', lit(1))\
    .withColumn('SOURCE', lit('Quantum'))
##Jas_Labour_Final_df.printSchema()
##Rename the columns to the output names and append constant placeholder columns.
##Note that EXP_REG_HRS is published as NON_CHARGEABLE_REG_HOURS; AVAIL_REG_HRS, RATE_TYPE and
##RATE_VALUE are not part of this select.
Jas_Labour_Final_df=Jas_Labour_Final_df.select('TRANSACTION_DATE','EMPLOYEE_NUMBER',col('DIRECT_INDIRECT').alias('JOB_CATEGORY'),\
    'PROGRAM_DESC','COMPANY_CODE',col('WEEK_NUMBER').alias('WEEK_NO'),'SOURCE',col('HOURS_TOTAL').alias('TOTAL_REG_HOURS'),\
    col('AVAIL_OT_HRS').alias('TOTAL_OT_HOURS'),col('PROD_REG_HRS').alias('CHARGEABLE_REG_HOURS'),col('PROD_OT_HRS').alias('CHARGEABLE_OT_HOURS'),\
    col('EXP_REG_HRS').alias('NON_CHARGEABLE_REG_HOURS'))\
    .withColumn('EMPLOYEE_BASE', lit('N/A'))\
    .withColumn('BASE_DESCRIPTION', lit('N/A'))\
    .withColumn('SHOP_CODE', lit(0))\
    .withColumn('SHOP_DESCRIPTION', lit('N/A'))\
def Validate(ngrams \
    , sampleSizes \
    , ctxSize \
    , sqc \
    , seqs \
    , outFile \
    , minval \
    , maxval \
    , avg \
    , nlines):
    """Score the saved prediction models against ``ngrams`` and write the
    per-sequence results as CSV.

    For every vector size in ``vecSizes`` and every sample size in
    ``sampleSizes`` the saved Word2Vec model and one linear-regression model
    per vector dimension are loaded.  The next ``lookahead`` words after the
    context are then predicted one position at a time, and the weights of
    correctly predicted, wrongly predicted and low-confidence ("unopt")
    n-grams are accumulated per sequence into the ``succ_<id>``,
    ``fail_<id>`` and ``unopt_<id>`` columns of ``seqs``.

    ``lookahead``, ``nPartLog``, ``vecSizes``, ``outDir`` and ``confidence``
    are not parameters; they are read from the enclosing scope.

    ngrams      -- DataFrame of n-grams to validate
    sampleSizes -- sample sizes whose saved models are loaded
    ctxSize     -- number of context words
    sqc         -- not referenced in this function
    seqs        -- DataFrame of sequences that receives the result columns
    outFile     -- not referenced in this function
    minval, maxval, avg, nlines -- only used to derive the scale factors
                   ``c1`` and ``c2`` below

    Returns ``accuracy``, which is never appended to and is therefore always
    an empty list.
    """
    accuracy = []
    gramSize = GramSize(ctxSize, lookahead)
    # Linear scale-back applied to summed weights further down: s*c1 + c2.
    c1 = (((maxval - minval) * 1.0) / nlines) / avg
    c2 = ((minval * 1.0) / nlines) / avg
    print seqs.count()
    ngrams = ngrams.repartition(1 << nPartLog)
    ngrams.cache()
    #we will validate separately for each vector size
    for vecSize in vecSizes:
        print '======TESTING FOR VECTOR SIZE', vecSize
        #start fresh
        old_ngrams = ngrams
        ngrams = ngrams.withColumn('correct', lit(0))
        #use models from each sample
        modelId = 0
        for sampleSize in sampleSizes:
            w2v = Word2VecModel.load(w2vFile(outDir, ctxSize, sampleSize, vecSize))
            # one regression model per dimension of the word vector
            lrmodels = []
            for dim in range(0, vecSize):
                lrmodels.append(LinearRegressionModel.load(lrmFile(outDir, ctxSize, sampleSize, vecSize, dim)))
            # running totals, used to print per-position deltas
            success = 0
            fail = 0
            unopt = 0
            #add columns to store model success and failure
            modelSucc = 'succ_' + str(modelId)
            modelFail = 'fail_' + str(modelId)
            modelUnopt = 'unopt_' + str(modelId)
            seqs = seqs.withColumn(modelSucc, lit(0)) \
                .withColumn(modelFail, lit(0)) \
                .withColumn(modelUnopt, lit(0))
            modelId = modelId + 1
            ngrams = ngrams \
                .withColumn('predSeq', lit(''))
            #create initial feature vector
            #transform each word into a cluster center
            words, d, centers = ClusterWords(w2v \
                , seqs \
                )
            #record correctness for this model only
            old_ngrams = ngrams
            ngrams = ngrams.withColumn('sample_correct', lit(0)).withColumn('sample_confi', lit(1.0))
            for nextPos in range(0,lookahead):
                #build the feature vector
                ngrams = BuildSubstringFeature(ngrams, w2v, nextPos, nextPos + ctxSize, ctxSize, lookahead,)
                #build the prediction vector
                ngrams = BuildPredictionVector(ngrams, lrmodels, ctxSize, vecSize)
                #now assign a cluster id to each prediction vector
                old_ngrams = ngrams
                ngrams = centers.transform(ngrams).withColumnRenamed('cluster', 'predWord').withColumnRenamed('vector', 'predictionVector')
                #get the predicted word
                ngrams = ngrams.join(broadcast(words), words.cluster == ngrams.predWord, 'inner') \
                    .drop('cluster') #\
                #calculate the cosine similarity between prediction vector and center vector
                epsilon = 0.0001
                def CosineSimi (v1, v2):
                    d1 = DenseVector(v1)
                    d2 = DenseVector(v2)
                    n1 = d1.norm(2)
                    n2 = d2.norm(2)
                    return float(d1.dot(d2) / (n1 * n2))
                cossim = udf(lambda v1, v2: CosineSimi(v1, v2), DoubleType())
                ngrams = ngrams.withColumn('simi', cossim('centerVector', 'predictionVector'))
                ngrams = ngrams.drop('centerVector').drop('predictionVector')
                #update predicted sequence
                ngrams = ngrams.withColumn('predSeq', concat_ws(' ', 'predSeq', 'word'))
                ngrams = ngrams.withColumn('predSeq', ltrim(ngrams.predSeq))
                #get actual sequence
                ngrams = CreateSubstring(ngrams, 'sentence', 'actualSeq', gramSize, ' ', ctxSize, ctxSize + nextPos + 1)
                #now get the cluster id for the predicted word in the sentence
                ngrams = BuildLabelVector(ngrams, w2v, ctxSize, lookahead, nextPos).withColumnRenamed('labelVec', 'vector').drop('ngrams')
                ngrams = centers.transform(ngrams).drop('vector')
                #and host latency for actual word
                ngrams = ngrams.join(broadcast(words), 'cluster', 'inner') \
                    .drop('word') \
                    .drop('centerVector') #\
                #record correctness: this round counts as correct (nextPos + 1) only when the
                #predicted cluster equals the actual one and the similarity reaches `confidence`
                ngrams = ngrams.withColumn('round_correct', when((ngrams.predWord != ngrams.cluster) | (ngrams.simi < confidence), 0).otherwise(nextPos + 1)).drop('predWord').drop('cluster')
                #sample_correct only advances when this round directly extends the correct run
                ngrams = ngrams.withColumn('sample_correct', when(ngrams.sample_correct + 1 == ngrams.round_correct, ngrams.round_correct).otherwise(ngrams.sample_correct))
                #get overall correctness
                ngrams = ngrams.withColumn('correct', greatest('sample_correct', 'correct'))
                #get binary correctness
                ngrams = ngrams.withColumn('binary_correct', when(ngrams.correct >= nextPos + 1, 1).otherwise(0))
                ngrams = ngrams.withColumn('sample_confi', when(ngrams.binary_correct == 1, 1.0).otherwise(least(ngrams.simi, ngrams.sample_confi)))
                ngrams = ngrams.withColumn('simi', when(ngrams.binary_correct == 1, ngrams.simi).otherwise(ngrams.sample_confi))
                ngrams = ngrams.withColumn('predSeq', when((ngrams.binary_correct == 1) | (ngrams.simi < confidence), ngrams.actualSeq).otherwise(ngrams.predSeq))
                #each n-gram's weight goes to exactly one bucket: success (binary_correct == 1),
                #unopt (incorrect and below `confidence`) or fail (the remainder)
                ngrams = ngrams.withColumn('succ_wt', when(ngrams.binary_correct == 1, ngrams.wt).otherwise(0))
                ngrams = ngrams.withColumn('fail_wt', when((ngrams.binary_correct == 1) | (ngrams.simi < confidence), 0).otherwise(ngrams.wt))
                ngrams = ngrams.withColumn('unopt_wt', when((ngrams.binary_correct == 0) & (ngrams.simi < confidence), ngrams.wt).otherwise(0))
                ngrams = ngrams.drop('simi')
                #now summarize success and failure rates by predicted sequence
                seqWts = ngrams.groupBy('predSeq').agg(sum('succ_wt').alias('succ_wt'), sum('fail_wt').alias('fail_wt'), sum('unopt_wt').alias('unopt_wt'))
                #update sequences table; sequences without any n-gram get -c2/c1, which the
                #scale-back below (s*c1 + c2) turns into 0
                seqs = seqWts.join(broadcast(seqs), seqWts.predSeq==seqs.word, 'right_outer').drop('predSeq').fillna(-c2/c1, ['succ_wt', 'fail_wt', 'unopt_wt'])
                scaleback = udf(lambda s: float(s*c1 + c2), DoubleType())
                seqs = seqs.withColumn(modelSucc, col(modelSucc) + scaleback(seqs.succ_wt)).drop('succ_wt')
                seqs = seqs.withColumn(modelFail, col(modelFail) + scaleback(seqs.fail_wt)).drop('fail_wt')
                seqs = seqs.withColumn(modelUnopt, col(modelUnopt) + scaleback(seqs.unopt_wt)).drop('unopt_wt')
                seqs.cache()
                aggregated = seqs.agg(sum(modelSucc), sum(modelFail), sum(modelUnopt))
                aggregated.cache()
                new_success = aggregated.head()['sum(' + modelSucc + ')']
                new_fail = aggregated.head()['sum(' + modelFail + ')']
                new_unopt = aggregated.head()['sum(' + modelUnopt + ')']
                #print what this position added to each running total
                print nextPos, new_success - success, new_fail - fail, new_unopt - unopt
                success = new_success
                fail = new_fail
                unopt = new_unopt
        #end for testing for each model for a particular vector size
    #end for each vector size
    # NOTE(review): placed after both loops to follow the end-of-loop markers
    # above, so `vecSize` is the last vector size tested -- confirm against
    # the original layout.
    seqs.orderBy('succ_0', ascending=False).write.mode('overwrite').csv(outputFile(outDir, ctxSize, vecSize, sampleSizes))
    return accuracy