def interests(filename, n, s, c):
    '''
    Using the same FP-Growth algorithm, write a script that computes the interest of
    association rules (interest = |confidence - frequency(consequent)|; note the
    absolute value) obtained using min support <s> and min confidence <c> (parameters
    of the FP-Growth model), and prints the first <n> rules sorted by (1) descending
    antecedent size in association rule, and (2) descending interest.
    Return value: a CSV string.
    Test: tests/test_interests.py
    '''
    spark = init_spark()
    result = spark.sparkContext.textFile(filename) \
        .map(lambda l: l.split(",")) \
        .zipWithIndex() \
        .map(lambda x: (x[1], x[0][0], x[0][1:]))
    df = spark.createDataFrame(result, ['id', 'plant', 'items'])
    fpGrowth = FPGrowth(itemsCol="items", minSupport=s, minConfidence=c)
    model = fpGrowth.fit(df)
    result = model.associationRules
    modelResult = model.freqItemsets
    # Join each rule with the frequency of its consequent itemset.
    result = modelResult.join(result, modelResult['items'] == result["consequent"])
    total = df.count()
    result = result.withColumn("interest", abs(result["confidence"] - result["freq"] / total))
    result = result.select(size("antecedent").alias('tam'), 'antecedent', 'consequent',
                           'confidence', "items", "freq", "interest")
    result = result.sort(desc('tam'), desc('interest')).limit(n)
    result = result.select('antecedent', 'consequent', 'confidence', "items", "freq", "interest")
    return toCSVLine(result)
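# Several of the exercise snippets below call helpers such as init_spark and
# toCSVLine that are defined elsewhere in the assignment template. The sketch
# below only illustrates what they might look like; the names are taken from the
# snippets, but the exact session setup and CSV formatting are assumptions.
from pyspark.sql import SparkSession

def init_spark():
    # Assumed helper: return (or create) the shared SparkSession used by the exercises.
    return SparkSession.builder.appName("fp_growth_exercises").getOrCreate()

def toCSVLine(df):
    # Assumed helper: serialize a DataFrame to CSV text, one line per row,
    # joining array-typed cells with spaces (the real separator may differ).
    def fmt(value):
        if isinstance(value, (list, tuple)):
            return " ".join(str(v) for v in value)
        return str(value)
    return "\n".join(",".join(fmt(v) for v in row) for row in df.collect())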
def frequent_itemsets(filename, n, s, c):
    '''
    Using the FP-Growth algorithm from the ML library
    (see http://spark.apache.org/docs/latest/ml-frequent-pattern-mining.html),
    write a function that returns the first <n> frequent itemsets obtained using
    min support <s> and min confidence <c> (parameters of the FP-Growth model),
    sorted by (1) descending itemset size, and (2) descending frequency.
    The FP-Growth model should be applied to the DataFrame computed in the previous task.
    Return value: a CSV string. As before, using toCSVLine may help.
    Test: tests/test_frequent_items.py
    '''
    spark = init_spark()
    lines = spark.read.text(filename).rdd
    parts = lines.map(lambda row: row.value.split(","))
    rdd_data = parts.map(lambda p: Row(name=p[0], items=p[1:]))
    df = spark.createDataFrame(rdd_data)
    fpGrowth = FPGrowth(itemsCol="items", minSupport=s, minConfidence=c)
    model = fpGrowth.fit(df)
    model_1 = model.freqItemsets.orderBy([size("items"), "freq"], ascending=[0, 0])
    final_op = toCSVLine(model_1.limit(n))
    return final_op
def frequent_itemsets(filename, n, s, c):
    '''
    Using the FP-Growth algorithm from the ML library
    (see http://spark.apache.org/docs/latest/ml-frequent-pattern-mining.html),
    write a function that returns the first <n> frequent itemsets obtained using
    min support <s> and min confidence <c> (parameters of the FP-Growth model),
    sorted by (1) descending itemset size, and (2) descending frequency.
    The FP-Growth model should be applied to the DataFrame computed in the previous task.
    Return value: a CSV string. As before, using toCSVLine may help.
    Test: tests/test_frequent_items.py
    '''
    spark = init_spark()
    result = spark.sparkContext.textFile(filename) \
        .map(lambda l: l.split(",")) \
        .zipWithIndex() \
        .map(lambda x: (x[1], x[0][0], x[0][1:]))
    df = spark.createDataFrame(result, ['id', 'plant', 'items'])
    fpGrowth = FPGrowth(itemsCol="items", minSupport=s, minConfidence=c)
    model = fpGrowth.fit(df)
    result = model.freqItemsets
    result = result.select("items", "freq", size("items").alias("tam"))
    result = result.sort(desc('tam'), desc('freq')).limit(n)
    result = result.select('items', 'freq')
    return toCSVLine(result)
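# Note: both frequent_itemsets variants assume the input file is a plain
# comma-separated transaction file whose first field is a label (a plant name in
# these exercises) and whose remaining fields are the items of that transaction.
# An illustrative line such as "abelia,fl,nc,va" would therefore be parsed as the
# plant "abelia" with items ["fl", "nc", "va"] (the concrete values here are only
# an example, not taken from the actual dataset).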
def interests(filename, n, s, c):
    '''
    Using the same FP-Growth algorithm, write a script that computes the interest of
    association rules (interest = |confidence - frequency(consequent)|; note the
    absolute value) obtained using min support <s> and min confidence <c> (parameters
    of the FP-Growth model), and prints the first <n> rules sorted by (1) descending
    antecedent size in association rule, and (2) descending interest.
    Return value: a CSV string.
    Test: tests/test_interests.py
    '''
    spark = init_spark()
    lines = spark.read.text(filename).rdd
    parts = lines.map(lambda row: row.value.split(","))
    rdd_data = parts.map(lambda p: Row(name=p[0], items=p[1:]))
    df = spark.createDataFrame(rdd_data)
    total_count = df.count()
    fpGrowth = FPGrowth(itemsCol="items", minSupport=s, minConfidence=c)
    model = fpGrowth.fit(df)
    model_updated = model.associationRules.join(
        model.freqItemsets,
        model.associationRules['consequent'] == model.freqItemsets['items'])
    model_with_interest = model_updated.withColumn(
        "interest",
        lit(calculate_interest(model_updated.confidence, model_updated.freq, total_count)))
    model_1 = model_with_interest.drop("lift")
    model_2 = model_1.orderBy([size("antecedent"), "interest"], ascending=[0, 0])
    final_op = toCSVLine(model_2.limit(n))
    return final_op
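# calculate_interest is not defined in this snippet; the sketch below is a
# plausible form under the stated definition (interest = |confidence -
# frequency(consequent)|). The name and exact signature are assumptions taken
# from the call site above, not the original helper.
from pyspark.sql.functions import abs as sql_abs

def calculate_interest(confidence_col, freq_col, total_count):
    # Build a Column expression: |confidence - supp(consequent)|,
    # where supp(consequent) = freq(consequent) / number of transactions.
    return sql_abs(confidence_col - freq_col / total_count)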
def process():
    data_content = [x.strip().split(',') for x in open(FILE_PATH).readlines()]
    data_content_tuple = []
    for i in range(0, len(data_content)):
        data_content_tuple.append((i, data_content[i]))
    df = spark.createDataFrame(data_content_tuple, ["id", "items"])
    fpGrowth = FPGrowth(itemsCol="items", minSupport=0.1, minConfidence=0.5)
    model = fpGrowth.fit(df)

    # Display frequent itemsets.
    # model.freqItemsets
    model.freqItemsets.filter(size('items') > 0).orderBy('freq', ascending=0).show(50, False)
    print(type(model.freqItemsets))

    # Display generated association rules.
    model.associationRules.orderBy('confidence', ascending=0).show(200, False)

    # transform examines the input items against all the association rules and summarize the
    # consequents as prediction
    model.transform(df).show(50, False)
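# process() refers to two module-level names that are not shown in this snippet.
# A sketch of the assumed setup follows; the real input path and session
# configuration are unknown, so both values below are placeholders.
from pyspark.sql import SparkSession

FILE_PATH = "transactions.csv"  # placeholder; the actual input file is not shown
spark = SparkSession.builder.getOrCreate()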
def fp_growth(df):
    training, test = data_process(df)
    fpGrowth = FPGrowth(itemsCol='itemset', minSupport=0.1, minConfidence=0.2)
    model = fpGrowth.fit(training)

    # Display frequent itemsets.
    # model.freqItemsets.show()

    # Display generated association rules.
    rules = model.associationRules
    rules = get_valid_rules(rules)
    rules.show()

    # Display the predicted purchasing.
    # res = model.transform(test).orderBy('prediction', ascending=False)

    # Calculate conversion rate.
    res = test.join(rules, test.itemset == rules.antecedent).select(
        test["*"], rules["prediction"])
    conversion = F.udf(lambda x, y: 0 if len(set(x) & set(y)) == 0 else 1, IntegerType())
    res = res.withColumn('conversion', conversion('ground_truth', 'prediction'))
    res.show()

    print("The total size of testset is: %d K = %d" % (test.count(), K))
    total_c = res.count()
    print("The number of total recommendation is: %d" % total_c)
    total_v = res.agg(F.sum('conversion')).collect()[0][0]
    print("The number of correct recommendation is: %d" % total_v)
    print(df.count())
def rules(self):
    dataset = self._dataset()
    transactions_count = dataset.count()
    fp = FPGrowth(minSupport=self._min_support_count * 1.0 / transactions_count,
                  minConfidence=self._min_confidence,
                  itemsCol="items",
                  numPartitions=self._partitions)
    fpm = fp.fit(dataset)
    association_rules = (
        fpm.associationRules
        # Keep only rules whose antecedent and consequent each contain a single item,
        # then unwrap the single-element arrays into plain columns.
        .filter((size("antecedent") == 1) & (size("consequent") == 1))
        .withColumn('antecedent', col("antecedent")[0])
        .withColumn('consequent', col('consequent')[0]))
    # For each antecedent, keep only the top-N consequents ranked by descending lift.
    window = Window.partitionBy(association_rules.antecedent).orderBy(
        association_rules.lift.desc())
    association_rules = (association_rules
                         .select('*', rank().over(window).alias('rank'))
                         .filter(col('rank') <= self._top_n)
                         .select("antecedent", "consequent", "lift"))
    return association_rules
def main():
    spark = SparkSession \
        .builder \
        .getOrCreate()
    spark.sparkContext.setCheckpointDir('gs://reddit_data_soen498/checkpoint/')

    @udf("boolean")
    def isNotDefault(x):
        defaultSubs = ["Art", "AskReddit", "DIY", "Documentaries", "EarthPorn",
                       "Futurology", "GetMotivated", "IAmA", "InternetIsBeautiful",
                       "Jokes", "LifeProTips", "Music", "OldSchoolCool",
                       "Showerthoughts", "UpliftingNews", "announcements",
                       "askscience", "aww", "blog", "books", "creepy",
                       "dataisbeautiful", "explainlikeimfive", "food", "funny",
                       "gadgets", "gaming", "gifs", "history", "listentothis",
                       "mildlyinteresting", "movies", "news", "nosleep",
                       "nottheonion", "personalfinance", "philosophy",
                       "photoshopbattles", "pics", "science", "space", "sports",
                       "television", "tifu", "todayilearned", "videos", "worldnews"]
        return x not in defaultSubs

    data = spark.read.json("gs://reddit_data_soen498/RC_2018-02.json")
    keep = [data.author, data.id, data.subreddit]
    data = data.select(*keep)
    data = data.filter(data.author != "[deleted]")
    data = data.filter(isNotDefault(data.subreddit))
    data = data.groupBy(data.author).agg(F.collect_set("subreddit").alias("items"))
    size_ = udf(lambda xs: len(xs), IntegerType())
    data = data.filter(size_(data.items) > 1)
    data = data.select(data.items)

    support = 200 / data.count()
    fp = FPGrowth(minSupport=support, minConfidence=0.5)
    fpm = fp.fit(data)
    fpm.associationRules.show(100)
    fpm.save("gs://reddit_data_soen498/modelFP_noDefaultSub_20support")
def _run_FPGrowth(self, df):
    # Apply Spark ML's FP-growth algorithm for frequent itemset mining.
    fpGrowth = FPGrowth(itemsCol="chordItems",
                        minSupport=self.params["minSupport"],
                        minConfidence=self.params["minConfidence"])
    model = fpGrowth.fit(df)
    return model
def build_association_rule_model(item_set, min_support, min_confidence):
    # Use a low support as we have a large dataset
    fp_growth = FPGrowth(itemsCol="items",
                         minSupport=min_support,
                         minConfidence=min_confidence)
    print('Fitting FPGrowth....')
    model = fp_growth.fit(item_set)
    print('Fit Complete')
    return model
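# Hypothetical usage of build_association_rule_model (a sketch, not part of the
# original code): item_set is assumed to be a DataFrame with an array-typed
# "items" column, and the thresholds below are illustrative only.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
item_set = spark.createDataFrame(
    [(["a", "b"],), (["a", "c"],), (["a", "b", "c"],)], ["items"])
model = build_association_rule_model(item_set, min_support=0.3, min_confidence=0.5)
model.associationRules.show()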
def SurvivalIndexTimeout(timeoutpidsmap):
    global spcon
    sqlcon = SQLContext(spcon)
    timeoutdf = sqlcon.createDataFrame(timeoutpidsmap, ['index', 'process_ids'])
    fpGrowth = FPGrowth(itemsCol="process_ids", minSupport=0.5, minConfidence=0.5)
    fpModel = fpGrowth.fit(timeoutdf)
    fpModel.freqItemsets.show()
    fpModel.associationRules.show()
async def get_model(self, df, min_support=0.1, min_confidence=0.6):
    fpGrowth = FPGrowth(itemsCol="items",
                        minSupport=min_support,
                        minConfidence=min_confidence)
    model = fpGrowth.fit(df)
    return (model.freqItemsets.sort("freq", ascending=False),
            model.associationRules.sort("confidence", ascending=True),
            model)
def arRules(self, transaction):
    spark = SparkSession.builder.getOrCreate()
    R = Row('ID', 'items')
    # use enumerate to add the ID column
    df = spark.createDataFrame([R(i, x) for i, x in enumerate(transaction)])
    fpGrowth = FPGrowth(itemsCol='items', minSupport=0.0001, minConfidence=0.0001)
    model = fpGrowth.fit(df)
    # Collect the generated association rules.
    rules = model.associationRules.collect()
    return rules
def recommendation(data, conf, outputpath):
    sparkSession = SparkSession.builder.getOrCreate()
    # extract algorithm parameters from conf file
    MyMinConfidence = conf["minConfidence"]
    MyMinSupport = conf["minSupport"]
    MyNumPartitions = conf["numPartitions"]
    MyMinFavorScore = conf["minfavorscore"]
    MyResultSavePath = os.path.join("hdfs://{0}".format(outputpath), "FPresult.json")
    MyModelSavePath = os.path.join("hdfs://{0}".format(outputpath), "FPmodel")

    print("============train FPmodel==============")
    df = sparkSession.createDataFrame(transformData(data, MyMinFavorScore),
                                      ["userId", "productIds"])
    fpGrowth = FPGrowth(itemsCol="productIds",
                        minSupport=MyMinSupport,
                        minConfidence=MyMinConfidence)
    model = fpGrowth.fit(df)

    print("============save association rules==============")
    # if the length of result is 0
    if model.associationRules.count() == 0:
        print("============no association rules! retry to change algorithm parameters ==============")
    else:
        # determine if the file exists
        (ret, out, err) = MyUtil.run_cmd(['hdfs', 'dfs', '-test', '-e', MyModelSavePath])
        # if the file already exists, delete it first
        if ret == 0:
            print(MyModelSavePath + " file already exists")
            MyUtil.run_cmd(['hdfs', 'dfs', '-rm', '-r', MyModelSavePath])
        else:
            print(MyModelSavePath + " file doesn't exist")
        model.save(MyModelSavePath)

    print("============save association results==============")
    # if the length of result is 0
    if model.associationRules.count() == 0:
        print("============no association rules! retry to change algorithm parameters ==============")
    else:
        # determine if the file exists
        (ret, out, err) = MyUtil.run_cmd(['hdfs', 'dfs', '-test', '-e', MyResultSavePath])
        # if the file already exists, delete it first
        if ret == 0:
            print(MyResultSavePath + " file already exists")
            MyUtil.run_cmd(['hdfs', 'dfs', '-rm', '-r', MyResultSavePath])
        else:
            print(MyResultSavePath + " file doesn't exist")
        model.transform(df).write.json(MyResultSavePath)
def test_freq_itemsets(self):
    fp = FPGrowth()
    fpm = fp.fit(self.data)

    expected_freq_itemsets = self.spark.createDataFrame(
        [([1], 4), ([2], 3), ([2, 1], 3), ([3], 2), ([3, 1], 2)],
        ["items", "freq"]
    )
    actual_freq_itemsets = fpm.freqItemsets

    self.assertEqual(actual_freq_itemsets.subtract(expected_freq_itemsets).count(), 0)
    self.assertEqual(expected_freq_itemsets.subtract(actual_freq_itemsets).count(), 0)
def test_association_rules(self):
    fp = FPGrowth()
    fpm = fp.fit(self.data)

    expected_association_rules = self.spark.createDataFrame(
        [([3], [1], 1.0, 1.0), ([2], [1], 1.0, 1.0)],
        ["antecedent", "consequent", "confidence", "lift"]
    )
    actual_association_rules = fpm.associationRules

    self.assertEqual(actual_association_rules.subtract(expected_association_rules).count(), 0)
    self.assertEqual(expected_association_rules.subtract(actual_association_rules).count(), 0)
def interests(filename, n, s, c):
    '''
    Using the same FP-Growth algorithm, write a script that computes the interest of
    association rules (interest = |confidence - frequency(consequent)|; note the
    absolute value) obtained using min support <s> and min confidence <c> (parameters
    of the FP-Growth model), and prints the first <n> rules sorted by (1) descending
    antecedent size in association rule, and (2) descending interest.
    Return value: a CSV string.
    Test: tests/test_interests.py
    '''
    spark = init_spark()
    frame = construct()
    # Compute the frequency of each individual item (used as the consequent frequency).
    frame2 = frame.withColumn("items", explode(frame.items))
    frame2 = frame2.groupBy("items").count().sort(desc("count"))
    frame2 = frame2.withColumnRenamed("items", "consequent2")
    frame2 = frame2.withColumnRenamed("count", "freq")
    fpGrowth = FPGrowth(itemsCol="items", minSupport=s, minConfidence=c)
    model = fpGrowth.fit(frame)
    model = model.associationRules
    model = model.withColumn("consequent2", explode(model.consequent))
    model = model.join(frame2, "consequent2", "inner")
    model = model.withColumn(
        "interest", lit(abs(model.confidence - (model.freq / frame.count()))))
    model = model.select("*", size("antecedent"))
    model = model.withColumnRenamed("size(antecedent)", "ln")
    model = model.sort(desc("ln"), desc("interest"))
    model = model.select("antecedent", "consequent", "confidence", "consequent",
                         "freq", "interest")
    model = model.limit(n)
    string = toCSVLine(model)
    print(string)
    return string
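# construct() is a helper this variant (and the association_rules / frequent_itemsets
# variants further below) relies on but does not define. The sketch below assumes it
# builds the transactions DataFrame from the same comma-separated input file as the
# other variants; the file name and column layout are assumptions, not the original.
from pyspark.sql import Row

def construct(filename="plants.data"):
    spark = init_spark()
    rows = (spark.read.text(filename).rdd
            .map(lambda r: r.value.split(","))
            .map(lambda p: Row(plant=p[0], items=p[1:])))
    return spark.createDataFrame(rows)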
def arRules(self, transaction):
    spark = SparkSession.builder.config("spark.executor.memory", MAX_MEMORY) \
        .config("spark.driver.memory", MAX_MEMORY).getOrCreate()
    R = Row('ID', 'items')
    # use enumerate to add the ID column
    df = spark.createDataFrame([R(i, x) for i, x in enumerate(transaction)])
    fpGrowth = FPGrowth(itemsCol='items', minSupport=0.001, minConfidence=0.001)
    model = fpGrowth.fit(df)
    return model
def test_freq_itemsets():
    data = spark.createDataFrame(
        [([1, 2],), ([1, 2],), ([1, 2, 3],), ([1, 3],)], ["items"])
    fp = FPGrowth()
    fpm = fp.fit(data)

    expected_freq_itemsets = spark.createDataFrame(
        [([1], 4), ([2], 3), ([2, 1], 3), ([3], 2), ([3, 1], 2)],
        ["items", "freq"])
    actual_freq_itemsets = fpm.freqItemsets

    assert actual_freq_itemsets.subtract(expected_freq_itemsets).count() == 0
    assert expected_freq_itemsets.subtract(actual_freq_itemsets).count() == 0
def test_association_rules():
    data = spark.createDataFrame(
        [([1, 2],), ([1, 2],), ([1, 2, 3],), ([1, 3],)], ["items"])
    fp = FPGrowth()
    fpm = fp.fit(data)

    expected_association_rules = spark.createDataFrame(
        [([3], [1], 1.0, 1.0), ([2], [1], 1.0, 1.0)],
        ["antecedent", "consequent", "confidence", "lift"])
    actual_association_rules = fpm.associationRules

    assert actual_association_rules.subtract(expected_association_rules).count() == 0
    assert expected_association_rules.subtract(actual_association_rules).count() == 0
def cluster(request):
    unique_fields = custom_fields(request)
    # First, read the data
    data_df = read_df(request, 'clean')
    data_df.cache()
    json_df = data_df.toPandas()
    json_df.to_json()

    # Create a tuple of id and items from the Data Frame
    dd = []
    for p in data_df:
        dd.append(p)
    data = []
    for row in json_df.itertuples():
        id = row[1]
        items = []
        for column in range(2, (len(dd) + 1)):
            items.append(row[column])
        data.append((id, items))

    # Create a Data Frame from the data dictionary
    final_data = Spark.sqlContext.createDataFrame(data, ["id", "items"])

    # Create the FPGrowth instance with its arguments and train the model
    fpGrowth = FPGrowth(itemsCol='items', minSupport=0.5, minConfidence=0.6)
    model = fpGrowth.fit(final_data)

    # Frequent Item sets
    itemSets = model.freqItemsets
    # Generated Association Rules
    assocRules = model.associationRules
    # Examines input items against all association rules and summarize consequents as prediction.
    # Note: transform expects a DataFrame, so it is applied to final_data (not the raw Python list).
    prediction = model.transform(final_data)

    context = {
        'all_data': json_df,
        'itemSets': itemSets,
        'assocRules': assocRules,
        'predicted': prediction
    }
    return render(request, 'show_clusters.html', context)
def SAR(self, transaction):
    MAX_MEMORY = "12g"
    spark = SparkSession.builder.master("local") \
        .config("spark.memory.fraction", 0.8) \
        .config("spark.executor.memory", MAX_MEMORY) \
        .config("spark.driver.memory", MAX_MEMORY).getOrCreate()
    R = Row('ID', 'items')
    # use enumerate to add the ID column
    df = spark.createDataFrame([R(i, x) for i, x in enumerate(transaction)])
    fp_growth = FPGrowth(itemsCol='items', minSupport=0.001, minConfidence=0.001,
                         numPartitions=100)
    df_fit = fp_growth.fit(df)
    # Keep frequent itemsets containing more than one item.
    freq = df_fit.freqItemsets.collect()
    freq_list = list(filter(lambda x: len(x[0]) > 1, freq))
    # Keep rules whose lift (fourth column of associationRules) exceeds 1.
    rule = df_fit.associationRules.collect()
    rule_list = list(filter(lambda x: x[3] > 1, rule))
    return rule_list, freq_list
def train(self):
    trainDataList, testDataList = self.doRandomSplitData(self.dbData)
    print("random split. input list size:{}, train size:{}, test size:{}".format(
        len(self.dbData), len(trainDataList), len(testDataList)))
    trainUsrItemMap = self.getItemsForUsr(trainDataList)
    testUsrItemMap = self.getItemsForUsr(testDataList)
    print('trainUsrItemMap len:' + str(len(trainUsrItemMap)) +
          ", testUsrItemMap:" + str(len(testUsrItemMap)))

    trainDf = self.spark.createDataFrame(trainUsrItemMap.items(), ["id", "items"])
    trainDf.cache()
    fpGrowth = FPGrowth(itemsCol="items", minSupport=self.minSupport,
                        minConfidence=self.minConfidence)
    fgModel = fpGrowth.fit(trainDf)

    associateRules = fgModel.associationRules.collect()
    antecedentTmpList = [value['antecedent'] for value in associateRules]
    # Deduplicate antecedents while preserving their order.
    antecedentList = []
    [antecedentList.append(i) for i in antecedentTmpList if i not in antecedentList]
    print('associateRules len:', len(associateRules),
          ', antecedentList len:', len(antecedentList))
    freqItemsets = fgModel.freqItemsets.collect()
    print('freqItemsets len:', len(freqItemsets))

    antecedentPredictionList = self.transformAllAntecdents(antecedentList, fgModel)
    print('antecedentPredictionList size:', len(antecedentPredictionList))
    usrPredictionMap = self.predictForUsers(antecedentPredictionList, trainUsrItemMap)
    print('usrPredictionMap len:' + str(len(usrPredictionMap)))

    totalTP, totalFP, totalFN = self.getTestPrecionAndRecall(usrPredictionMap, testUsrItemMap)
    precision = float(totalTP) / float(totalTP + totalFP)
    recall = float(totalTP) / float(totalTP + totalFN)
    print('precision:', precision, ", recall:", recall)
def main():
    # Read from the transactions database and transactions collection; this will
    # generate a DataFrame object.
    print("Reading from transactions db... \n")
    transactions_data = spark_session.read \
        .format("com.mongodb.spark.sql.DefaultSource") \
        .option("database", "transactions") \
        .option("collection", "transactions") \
        .load()
    print('Our read transactions are of the type: ', type(transactions_data), '\n')
    print("The generated transactions schema is: \n")
    transactions_data.printSchema()
    print("The data fetched from the db is: \n")
    transactions_data.show()

    product_codes = transactions_data.select("ProductCode")
    fpGrowth = FPGrowth(itemsCol="ProductCode", minSupport=0.0001, minConfidence=0.05)
    print('Fitting the model...')
    model = fpGrowth.fit(product_codes)

    # Display frequent itemsets.
    model.freqItemsets.show()
    # Display generated association rules.
    model.associationRules.show(100)
    # transform examines the input items against all the association rules and summarize the
    # consequents as prediction
    model.transform(transactions_data).show()

    # Simple test stuff to write to the db
    print("Writing to the mongodb")
    model.associationRules.write.format("com.mongodb.spark.sql.DefaultSource") \
        .option("database", "transactions") \
        .option("collection", "recommendations") \
        .mode("append") \
        .save()
def temp():
    from pyspark.sql import SparkSession
    from pyspark.ml.fpm import FPGrowth

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([(0, ['a', 'b', 'e']),
                                (1, ['a', 'b', 'c', 'e']),
                                (2, ['a', 'b'])], ["id", "items"])
    fpGrowth = FPGrowth(itemsCol="items", minSupport=0.5, minConfidence=0.6)
    model = fpGrowth.fit(df)

    # Display frequent itemsets.
    model.freqItemsets.show()
    # Display generated association rules.
    model.associationRules.show()
    # transform examines the input items against all the association rules and summarize the
    # consequents as prediction
    model.transform(df).show()
def PAR(self, transaction):
    MAX_MEMORY = "14g"
    spark = SparkSession.builder.master("local") \
        .config("spark.memory.fraction", 0.8) \
        .config("spark.executor.memory", MAX_MEMORY) \
        .config("spark.driver.memory", MAX_MEMORY).getOrCreate()
    R = Row('ID', 'items')
    # use enumerate to add the ID column
    df = spark.createDataFrame([R(i, x) for i, x in enumerate(transaction)])
    fp_growth = FPGrowth(itemsCol='items', minSupport=0.001, minConfidence=0.001,
                         numPartitions=100)
    freq = fp_growth.fit(df).freqItemsets.collect()

    # Split frequent itemsets into singletons (x) and pairs (x, y),
    # dropping the placeholder item '$MISS'.
    supp_x = sorted(list(filter(lambda x: len(x[0]) == 1, freq)))
    supp_xy = sorted(list(filter(lambda x: len(x[0]) == 2, freq)))
    supp_x = {k[0]: v for k, v in supp_x if k[0] != '$MISS'}
    supp_xy = list(filter(lambda k: k[0][0] != '$MISS' and k[0][1] != '$MISS', supp_xy))

    # Rule Power Factor (RPF)
    par_result = dict()
    for i, j in supp_x.items():
        if i != '$MISS':
            par_result[i] = dict()
            for m, n in supp_xy:
                if m[0] == i and m[1] != '$MISS':
                    par_result[i][m[1]] = (((n / len(transaction)) ** 2) / (j / len(transaction)), n)
                elif m[1] == i and m[0] != '$MISS':
                    par_result[i][m[0]] = (((n / len(transaction)) ** 2) / (j / len(transaction)), n)
    return supp_x, {k: v for k, v in par_result.items() if len(v) > 0}
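# Note on the computation above: the nested loop evaluates the Rule Power Factor
# for each item pair,
#     RPF(X -> Y) = supp(X u Y) * conf(X -> Y) = supp(X u Y)^2 / supp(X),
# which is exactly ((n / N)^2) / (j / N) with n = freq(X, Y), j = freq(X) and
# N = len(transaction); each entry also keeps the raw pair count n alongside the score.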
def association_rules(filename, n, s, c):
    '''
    Using the same FP-Growth algorithm, write a script that returns the first <n>
    association rules obtained using min support <s> and min confidence <c>
    (parameters of the FP-Growth model), sorted by (1) descending antecedent size
    in association rule, and (2) descending confidence.
    Return value: a CSV string.
    Test: tests/test_association_rules.py
    '''
    spark = init_spark()
    lines = spark.read.text(filename).rdd
    parts = lines.map(lambda row: row.value.split(","))
    rdd_data = parts.map(lambda p: Row(name=p[0], items=p[1:]))
    df = spark.createDataFrame(rdd_data)
    fpGrowth = FPGrowth(itemsCol="items", minSupport=s, minConfidence=c)
    model = fpGrowth.fit(df)
    model_1 = model.associationRules.orderBy(
        [size("antecedent"), "confidence"], ascending=[0, 0])
    model_2 = model_1.drop("lift")
    final_op = toCSVLine(model_2.limit(n))
    return final_op
def association_rules(filename, n, s, c):
    '''
    Using the same FP-Growth algorithm, write a script that returns the first <n>
    association rules obtained using min support <s> and min confidence <c>
    (parameters of the FP-Growth model), sorted by (1) descending antecedent size
    in association rule, and (2) descending confidence.
    Return value: a CSV string.
    Test: tests/test_association_rules.py
    '''
    spark = init_spark()
    result = spark.sparkContext.textFile(filename) \
        .map(lambda l: l.split(",")) \
        .zipWithIndex() \
        .map(lambda x: (x[1], x[0][0], x[0][1:]))
    df = spark.createDataFrame(result, ['id', 'plant', 'items'])
    fpGrowth = FPGrowth(itemsCol="items", minSupport=s, minConfidence=c)
    model = fpGrowth.fit(df)
    result = model.associationRules
    result = result.select(size("antecedent").alias('tam'), 'antecedent',
                           'consequent', 'confidence')
    result = result.sort(desc('tam'), desc('confidence')).limit(n)
    result = result.select('antecedent', 'consequent', 'confidence')
    return toCSVLine(result)
class FPGEstimator(Estimator):
    def __init__(self, spark, user_col, item_col, grade_col, min_support, min_confidence):
        self.spark = spark
        self.item_col = item_col
        self.user_col = user_col
        self.grade_col = grade_col
        self.list_item_col = self.item_col + "_list"
        self.min_support = min_support
        self.min_confidence = min_confidence
        self.model = FPGrowth(itemsCol=self.list_item_col,
                              minSupport=self.min_support,
                              minConfidence=self.min_confidence,
                              numPartitions=1000)

    def _fit(self, transformed_df):
        # Collect each user's items into a set-valued column before mining.
        train_fp_data = transformed_df.groupBy(self.user_col).agg(
            collect_set(self.item_col).alias(self.list_item_col)).select(
                self.user_col, self.list_item_col)
        # train_fp_data = train_fp_data.cache()
        fp_model = self.model.fit(train_fp_data)
        return FPGTransformer(self.spark, self.user_col, self.item_col,
                              self.list_item_col, self.grade_col,
                              fp_model.associationRules)
def association_rules(filename, n, s, c):
    '''
    Using the same FP-Growth algorithm, write a script that returns the first <n>
    association rules obtained using min support <s> and min confidence <c>
    (parameters of the FP-Growth model), sorted by (1) descending antecedent size
    in association rule, and (2) descending confidence.
    Return value: a CSV string.
    Test: tests/test_association_rules.py
    '''
    spark = init_spark()
    frame = construct()
    fpGrowth = FPGrowth(itemsCol="items", minSupport=s, minConfidence=c)
    model = fpGrowth.fit(frame)
    model = model.associationRules
    model = model.select("*", size("antecedent"))
    model = model.withColumnRenamed("size(antecedent)", "ln")
    model = model.sort(desc("ln"), desc("confidence"))
    model = model.select("antecedent", "consequent", "confidence")
    model = model.limit(n)
    string = toCSVLine(model)
    # print(string)
    return string
def frequent_itemsets(filename, n, s, c):
    '''
    Using the FP-Growth algorithm from the ML library
    (see http://spark.apache.org/docs/latest/ml-frequent-pattern-mining.html),
    write a function that returns the first <n> frequent itemsets obtained using
    min support <s> and min confidence <c> (parameters of the FP-Growth model),
    sorted by (1) descending itemset size, and (2) descending frequency.
    The FP-Growth model should be applied to the DataFrame computed in the previous task.
    Return value: a CSV string. As before, using toCSVLine may help.
    Test: tests/test_frequent_items.py
    '''
    spark = init_spark()
    frame = construct()
    fpGrowth = FPGrowth(itemsCol="items", minSupport=s, minConfidence=c)
    model = fpGrowth.fit(frame)
    model = model.freqItemsets
    model = model.select("*", size("items"))
    model = model.withColumnRenamed("size(items)", "ln")
    model = model.sort(desc("ln"), desc("freq"))
    model = model.select("items", "freq")
    model = model.limit(n)
    string = toCSVLine(model)
    return string
from pyspark.sql import SparkSession
from pyspark.ml.fpm import FPGrowth

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("FPGrowthExample")\
        .getOrCreate()

    # $example on$
    df = spark.createDataFrame([
        (0, [1, 2, 5]),
        (1, [1, 2, 3, 5]),
        (2, [1, 2])
    ], ["id", "items"])

    fpGrowth = FPGrowth(itemsCol="items", minSupport=0.5, minConfidence=0.6)
    model = fpGrowth.fit(df)

    # Display frequent itemsets.
    model.freqItemsets.show()

    # Display generated association rules.
    model.associationRules.show()

    # transform examines the input items against all the association rules and summarize the
    # consequents as prediction
    model.transform(df).show()
    # $example off$

    spark.stop()