def rule2():
    """Mine association rules between products per order and export to Excel.

    One-hot encodes `data` (rows = '订单数量', columns = '产品名称'), runs
    apriori (min_support=0.01), derives rules by lift, prints the frequent
    itemsets and the rules with lift >= 1 and confidence >= 0.3, and writes
    both tables to fixed Excel paths.

    Relies on module-level names: pd, time, data, encode_units.
    """
    from mlxtend.frequent_patterns import apriori as ap
    from mlxtend.frequent_patterns import association_rules

    pd.options.display.max_columns = 1000
    start = time.time()

    # Pivot order-number x product-name counts into a basket matrix,
    # then binarise every cell via encode_units.
    counts = data.groupby(['订单数量', '产品名称'])['产品名称'].count()
    basket = counts.unstack().reset_index().fillna(0).set_index('订单数量')
    basket = basket.applymap(encode_units)
    print(basket)

    itemsets = ap(basket, min_support=0.01, use_colnames=True)
    found = association_rules(itemsets, metric="lift", min_threshold=0.2)

    print("频繁项集:", itemsets)
    strong = (found['lift'] >= 1) & (found['confidence'] >= 0.3)
    print("关联规则:", found[strong])

    found.to_excel(
        'E:\\python_project\\GIT\\DATA_ANALYSIS\\test\\test\\关联规则2.xlsx')
    itemsets.to_excel(
        'E:\\python_project\\GIT\\DATA_ANALYSIS\\test\\test\\频繁项集2.xlsx')

    # NOTE(review): `end` is captured but the elapsed time is never printed
    # here (the later variant does print it) — kept as-is.
    end = time.time()
def rule2():
    """Mine association rules between alarm messages per fault and export to Excel.

    One-hot encodes `data` (rows = 'FaultNo.', columns = 'Alarm Msg.'), runs
    apriori (min_support=0.01), derives rules by lift, prints the frequent
    itemsets, the rules with lift >= 1 and confidence >= 0.2, the confidence
    column, writes both tables to Excel (path keyed by global `F`), and
    prints the elapsed time.

    Relies on module-level names: pd, time, data, encode_units, F.

    NOTE(review): this redefines rule2() declared earlier in the file; the
    later definition wins at import time — confirm which variant is intended.
    """
    from mlxtend.frequent_patterns import apriori as ap
    from mlxtend.frequent_patterns import association_rules
    pd.options.display.max_columns = 1000
    start = time.time()
    # Pivot fault-number x alarm-message counts into a basket matrix.
    hot_encoded_df = data.groupby(
        ['FaultNo.', 'Alarm Msg.'])['Alarm Msg.'].count().unstack().reset_index().fillna(
            0).set_index('FaultNo.')
    hot_encoded_df = hot_encoded_df.applymap(encode_units)
    frequent_itemsets = ap(hot_encoded_df, min_support=0.01, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric="lift", min_threshold=0.2)
    print("频繁项集:", frequent_itemsets)
    print("关联规则:", rules[(rules['lift'] >= 1) & (rules['confidence'] >= 0.2)])
    print(rules['confidence'])
    # Bug fix: the original path literal started with '\D', an invalid escape
    # sequence (DeprecationWarning; SyntaxWarning from Python 3.12). Doubling
    # the backslash keeps the runtime value byte-identical.
    rules.to_excel('E:\\DATA_ENGIN\\apiority\\' + F + '关联规则2.xlsx')
    frequent_itemsets.to_excel('E:\\DATA_ENGIN\\apiority\\' + F + '频繁项集2.xlsx')
    end = time.time()
    print("总用时:", end - start)
def data_load(file):
    """Read a semicolon-separated, UTF-8 CSV into a DataFrame.

    Parameters
    ----------
    file : str or path-like
        Path to the CSV file.

    Returns
    -------
    pandas.DataFrame or None
        The parsed frame, or None when the file cannot be read.
    """
    try:
        # NOTE(review): error_bad_lines was deprecated in pandas 1.3 and
        # removed in 2.0 (replacement: on_bad_lines='skip') — confirm the
        # pandas version this project pins before upgrading.
        df = pd.read_csv(file, sep=';', encoding='utf-8', index_col=False,
                         error_bad_lines=False)
    except (OSError, ValueError) as exc:
        # Bug fix: the original used a bare `except:` and then fell through to
        # `return df` with `df` never bound, raising NameError instead of
        # reporting the problem. Return None explicitly on failure.
        print('incorrect file path')
        return None
    return df


df = data_load(file)
df.head()
df.dtypes
# Drop the 'id' column; every remaining column is a one-hot item flag.
df = df.loc[:, df.columns != 'id']
freq_items = ap(df, min_support=0.11, use_colnames=True)
freq_items.head()
rules = ar(freq_items, metric='lift', min_threshold=1)
rules.head()
rules['confidence'].sort_values(ascending=False).head(100)
rules['lift'].sort_values(ascending=False).head(100)
pd.set_option('display.max_columns', None)
# Strong rules only: lift >= 1 and confidence >= 0.8.
rules[(rules['lift'] >= 1) & (rules['confidence'] >= 0.8)]
# 5. Convert the data into the form the Apriori algorithm expects (a DataFrame
#    indexed by transaction number whose column values are True or False
#    depending on whether the product occurred in the transaction).
# 6. Generate the rule list for a min_support of your choice. Review the rules,
#    then pick the 5 you consider best.
# 7. Try increasing min_support — what happens to the number of rules?
# 8. Print every rule whose lift is greater than 5 and whose confidence is
#    greater than 0.8.
# Restrict to EIRE transactions and the three columns the basket needs.
df_EIRE = df.loc[df['Country'] == 'EIRE', ['InvoiceNo', 'Description', 'Quantity']]
# Invoice x product quantity matrix; absent products become 0.
grouped = df_EIRE.groupby(['InvoiceNo', 'Description'])['Quantity'].sum()
df_EIRE = grouped.unstack().fillna(0)
# Binarise in place: zero quantities -> False, everything else -> True
# (order matters — the False cells still compare equal to 0).
df_EIRE[df_EIRE == 0] = False
df_EIRE[df_EIRE != 0] = True
freq_items = ap(df_EIRE, min_support=0.05, use_colnames=True)
freq_items.head(10)
rules = ar(freq_items, metric='lift', min_threshold=1)
rules.head(10)
rules['confidence'].sort_values(ascending=False).head(10)
rules['lift'].sort_values(ascending=False).head(10)
# Strong rules only: lift >= 1 and confidence >= 0.8.
rules[(rules['lift'] >= 1) & (rules['confidence'] >= 0.8)]
# NOTE(review): this chunk begins mid-statement — `edgecolor='red')` is the
# tail of a plotting call (presumably plt.hist) whose opening lies before this
# view; the fragment is preserved verbatim.
edgecolor='red')
plt.xticks(np.arange(0, 32, step=2.5))
plt.xlabel("The Items number of every customer")
plt.ylabel("Frequency")
plt.show()
print()
## b)
# Build one transaction (list of items) per customer, then one-hot encode it.
# NOTE(review): `DataFrame` here is a variable shadowing the pandas class name,
# and `tec` / `pds` / `customerItem` are bound elsewhere in the file — confirm.
te = tec()
cusItemList = DataFrame.groupby(['Customer'
                                 ])['Item'].apply(list).values.tolist()
te_ary = te.fit(cusItemList).transform(cusItemList)
ItemIndicator = pds.DataFrame(te_ary, columns=te.columns_)
# min_support is 75 occurrences relative to the total count; itemsets are
# capped at 32 items.
frequent_itemsets = ap(ItemIndicator,
                       min_support=(75 / customerItem.count()),
                       max_len=32,
                       use_colnames=True)
print(frequent_itemsets)
print()
## c)
# Discover the association rules
assoc_rules = as_r(frequent_itemsets,
                   metric="confidence",
                   min_threshold=0.01)
print('We can find', len(assoc_rules), 'Association rules')
print(assoc_rules)
print()
## d)
# NOTE(review): showGraph's body continues past this view; only its first
# statement is visible here.
def showGraph():
    plt.figure(facecolor='white', edgecolor='white')
# Collect per-row maxima and a fixed sample (column 5) from the cgm/bol frames,
# and build the transaction list fed to apriori.
# NOTE(review): cgm, bol, maximum, CalculateBins, CGmax, BOLmax, CG0 and apriDF
# are all bound elsewhere in the file — their shapes/semantics are assumed here.
for i in range(len(cgm)):
    CGmax.append(maximum(cgm.loc[i]))
    BOLmax.append(maximum(bol.loc[i]))
    CG0.append(cgm.loc[i][5])
    # Each transaction: binned CGM max, binned CGM sample at index 5, raw bolus max.
    apriDF.append([
        CalculateBins(max(cgm.loc[i])),
        CalculateBins(cgm.loc[i][5]),
        max(bol.loc[i])
    ])
#Apriori Algorithm
#For Most Frequent Itemsets
transEnc = TransactionEncoder()
transactions = pd.DataFrame(transEnc.fit(apriDF).transform(apriDF),
                            columns=transEnc.columns_)
# Near-zero min_support keeps essentially every itemset; min_threshold=0.0
# keeps every rule.
rules = ar(ap(transactions, min_support=0.00000000001, use_colnames=True),
           min_threshold=0.0)
rules["antecedent_len"] = rules["antecedents"].apply(lambda x: len(x))
# Strip the frozenset({...}) wrapper from the stringified rule columns.
# NOTE(review): str.replace with an escaped pattern relies on the old
# regex=True default — confirm behavior on the pinned pandas version.
for column in ['antecedents', 'consequents']:
    rules[column] = rules[column].astype(str)
    rules[column] = rules[column].str.replace(re.escape('frozenset({'), '')
    rules[column] = rules[column].str.replace(re.escape('})'), '')
# Merge antecedents and consequents into one comma-separated item set, then
# split back into a list of item strings.
rules["SET"] = rules["antecedents"] + ',' + rules['consequents']
rules['SET'] = rules['SET'].str.replace("'", "")
rules['SET'] = rules.SET.apply(lambda x: x.split(','))
#rules.to_csv("Rules.csv")
li = rules['SET'].tolist()
# Convert every item to float and sort each set descending in place.
y = [[(float(j)) for j in i] for i in li]
for i in y:
    i.sort(reverse=True)
b = list()
# NOTE(review): this chunk opens mid-way through a one-hot-encoding loop —
# `labels` and `encoded_vals` are bound before this view, and the exact nesting
# of the `append` relative to the enclosing per-transaction loop is inferred,
# not visible. Confirm against the full file.
for com in commons:
    labels[com] = 1
encoded_vals.append(labels)
encoded_vals[0]
encode_df = pd.DataFrame(encoded_vals)
print(encode_df.head())
##
## Defining the apriori algorithm.
##
freq_items = ap(encode_df,
                min_support=0.0085,
                use_colnames=True,
                verbose=1,
                low_memory=False)
print(freq_items.head())
##
## Defining the association rules algorithms to match and find similar items together based on confidence.
##
assocn_rules_conf = ar(freq_items, metric="confidence", min_threshold=0.25)
print(assocn_rules_conf)
##
## Defining the association rules algorithms to match and find similar items together based on support.
##
assocn_rules_supp = ar(freq_items, metric="support", min_threshold=0.005)