# Build (id, items) transactions from DescriptionGrp and mine frequent
# itemsets / association rules with FPGrowth.
#
# NOTE(review): `minSupport` below is an *absolute* transaction count
# (5% of rows), but FPGrowth's `minSupport` parameter is a *fraction*
# in [0, 1] — and the variable is never used; the model is fit with a
# hardcoded 0.5.  Confirm which threshold was intended.
minSupport = 0.05 * DescriptionGrp.rdd.count()

# Group every item under its id: (id, [item, item, ...]).
apr_tem = DescriptionGrp.rdd.map(lambda x: (x[0], list([x[1]]))).reduceByKey(
    lambda x, y: x + y)

schema = StructType([
    StructField("id", StringType(), True),
    StructField("items", ArrayType(StringType()), True),
])
transactions = spark.createDataFrame(apr_tem, schema)
# .show() prints the rows itself and returns None, so it is called
# directly (the original `print(transactions.show(2))` emitted an
# extra "None" line).
transactions.show(2)

fpgrowth = FPGrowth(itemsCol="items", minSupport=0.5, minConfidence=0.6)
model = fpgrowth.fit(transactions)

# Display frequent itemsets.
model.freqItemsets.show()

# Display generated association rules.
model.associationRules.show()

# transform examines the input items against all the association rules
# and summarizes the consequents as prediction.
# NOTE(review): `df` is defined elsewhere in the file — presumably it
# has an "items" column; verify, or use `transactions` here instead.
model.transform(df).show()
# Transform df_0's Txn_description into an array column and mine
# frequent itemsets / association rules with FPGrowth, timing each
# stage with datetime stamps.
from pyspark.ml.fpm import FPGrowth

# udf_1 rewrites Txn_description in place.
# NOTE(review): udf_1 is defined elsewhere — presumably it normalizes
# the description into a comma-separated string; confirm.
df_1 = df_0.withColumn('Txn_description', udf_1(col('Txn_description')))
# printSchema()/show() print and return None, so they are called
# directly — the original `print(df_1.printSchema())` form emitted a
# spurious "None" line after each.
df_1.printSchema()
df_1.show(2)

# Split the comma-separated string into array<string>, the input
# format FPGrowth expects for its items column.
df_2 = df_1.withColumn(
    "Txn_description",
    split(col("Txn_description"), ",").cast("array<string>"))
df_2.printSchema()
df_2.show(2)

print(datetime.now())
fpGrowth = FPGrowth(itemsCol="Txn_description", minSupport=0.0005,
                    minConfidence=0.1)
model = fpGrowth.fit(df_2)
print(datetime.now())

# Display frequent itemsets.
freq_set = model.freqItemsets.collect()
print(len(freq_set))
pprint(freq_set)
print(datetime.now())

# Display generated association rules.
association_set = model.associationRules.collect()
pprint(association_set)
print(len(association_set))

# transform examines the input items against all the association rules
# and summarizes the consequents as prediction.
transformed_input = model.transform(df_2).take(1000)