# Build (id, [items]) transactions from DescriptionGrp and mine frequent
# itemsets / association rules with Spark ML's FPGrowth.

# NOTE(review): this value is an absolute row count, but FPGrowth's
# minSupport parameter expects a fraction in [0, 1]; the variable is
# currently unused (0.5 is hard-coded below) — confirm the intended threshold.
minSupport = 0.05 * DescriptionGrp.rdd.count()

# Collect every value observed for a key into one list: (id, [item, item, ...]).
apr_tem = DescriptionGrp.rdd.map(lambda x: (x[0], [x[1]])).reduceByKey(
    lambda x, y: x + y)

schema = StructType([
    StructField("id", StringType(), True),
    StructField("items", ArrayType(StringType()), True)
])
transactions = spark.createDataFrame(apr_tem, schema)
# show() prints to stdout and returns None, so no print() wrapper is needed.
transactions.show(2)

fpgrowth = FPGrowth(itemsCol="items", minSupport=0.5, minConfidence=0.6)
model = fpgrowth.fit(transactions)

# Display frequent itemsets.
model.freqItemsets.show()
# Display generated association rules.
model.associationRules.show()
# transform examines the input items against all the association rules and
# summarizes the consequents as prediction.
# NOTE(review): original referenced `df`, which is not defined in this
# section; transforming the training data mirrors the Spark docs example.
model.transform(transactions).show()
# Example #2
# Normalise the Txn_description column with udf_1, then split the
# comma-separated string into an array<string> column for FPGrowth.
df_1 = df_0.withColumn('Txn_description', udf_1(col('Txn_description')))
# printSchema()/show() print to stdout and return None, so wrapping them
# in print() only adds a spurious "None" line — call them directly.
df_1.printSchema()
df_1.show(2)


df_2 = df_1.withColumn("Txn_description", split(col("Txn_description"), ",").cast("array<string>"))
df_2.printSchema()
df_2.show(2)




from pyspark.ml.fpm import FPGrowth
# Crude timing: timestamps bracket the fit() call below.
print(datetime.now())
fpGrowth = FPGrowth(itemsCol="Txn_description", minSupport=0.0005, minConfidence=0.1)
model = fpGrowth.fit(df_2)
print(datetime.now())
# Display frequent itemsets.
# NOTE(review): collect() pulls the full result to the driver; with
# minSupport this low the itemset list may be large — confirm it fits.
freq_set = model.freqItemsets.collect()

print(len(freq_set))
pprint(freq_set)
print(datetime.now())
# Display generated association rules.
association_set = model.associationRules.collect()
pprint(association_set)
print(len(association_set))
# transform examines the input items against all the association rules and summarize the
# consequents as prediction
transformed_input = model.transform(df_2).take(1000)