def event_apriori_data(event_id, support):
    support = float(support)
    data = db.events
    lists = list(data.find({'eventId': event_id}))
    if len(lists) != 0:
        df1 = pd.DataFrame(lists)
        item = []
        for i in range(0, len(df1.triggers[0])):
            text = df1.triggers[0][i]['data']
            text = text.split(',')
            item.append(text)
        te = TransactionEncoder()
        te_arry = te.fit_transform(item)
        df2 = pd.DataFrame(te_arry, columns=te.columns_)
        frq_item = apriori(df2, min_support=support, use_colnames=True)
        rule = association_rules(frq_item, metric='confidence', min_threshold=0.6)
        return rule.to_json(orient='records')
    else:
        return "No Data"
def apply_(self):
    df_ = self.frequent_patterns_prepare(min_threshold=1000)
    te = TransactionEncoder()
    # One-hot encode the dataset with TransactionEncoder
    df_tf = te.fit_transform(df_.values)
    df = pd.DataFrame(df_tf, columns=te.columns_)
    start = time()
    # Mine frequent itemsets
    frequent_itemsets = fpgrowth(df, min_support=0.05, use_colnames=True)
    logging.debug('Frequent-itemset mining took: %s\n' % (time() - start))
    print('Frequent-itemset mining took:', time() - start)
    print()
    frequent_itemsets.sort_values(by='support', ascending=False, inplace=True)
    logging.debug(f'freqSet:\n{frequent_itemsets}\n')
    print(f'freqSet:\n{frequent_itemsets}')
    print('\n\n', '**' * 30)
    # Generate association rules, using confidence as the metric
    association_rule = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.7)
    # Sort the rules by leverage
    association_rule.sort_values(by='leverage', ascending=False, inplace=True)
    logging.debug('Association rules:\n{}'.format(association_rule))
    print('Association rules:\n{}'.format(association_rule))
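# A minimal, self-contained sketch (toy transactions, not the method's real
# data) illustrating the point of the timing above: fpgrowth and apriori find
# the same frequent itemsets for the same min_support, so they are
# interchangeable here and only differ in speed.
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, fpgrowth

toy = [['milk', 'bread'], ['milk', 'beer'], ['milk', 'bread', 'beer']]
enc = TransactionEncoder()
toy_df = pd.DataFrame(enc.fit_transform(toy), columns=enc.columns_)
fp = fpgrowth(toy_df, min_support=0.5, use_colnames=True)
ap = apriori(toy_df, min_support=0.5, use_colnames=True)
# Same itemset/support pairs, possibly in a different row order
assert set(map(frozenset, fp['itemsets'])) == set(map(frozenset, ap['itemsets']))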
def rule():
    data = get_data()
    te = TransactionEncoder()
    te_ary = te.fit_transform(data)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    frequent_itemsets = apriori(df, min_support=0.05, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.2)
    return frequent_itemsets, rules
def rule():
    data = pd.read_csv('shopping_data.csv', header=None)
    df_arr = data.apply(deal, axis=1).tolist()
    te = TransactionEncoder()  # define the encoder
    te_ary = te.fit_transform(df_arr)  # encode the transactions
    df = pd.DataFrame(te_ary, columns=te.columns_)
    frequent_itemsets = apriori(df, min_support=0.05, use_colnames=True)
    # Do not shadow the imported association_rules function, and return the
    # name that was actually assigned
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.2)
    return frequent_itemsets, rules  # both results as DataFrames
def transaction_encoder(transaction):
    """
    Read in transaction records and convert them to 0-1 encoded records.

    Parameters:
        transaction (2-D list): transaction records

    Returns:
        encoded_transaction (DataFrame): 0-1 encoded transaction records
    """
    te = TransactionEncoder()  # define the encoder
    df_tf = te.fit_transform(transaction)
    encoded_transaction = pd.DataFrame(df_tf, columns=te.columns_)
    return encoded_transaction
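# Hypothetical usage of the helper above (toy basket data for illustration;
# assumes the same pd / TransactionEncoder imports the snippet relies on):
basket = [['apple', 'beer'], ['apple', 'milk'], ['beer', 'milk', 'apple']]
encoded = transaction_encoder(basket)
print(encoded)
#    apple   beer   milk
# 0   True   True  False
# 1   True  False   True
# 2   True   True   True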
def fp_growth_retail(TOP_PERCENTAGE, file_name, no_of_trx):
    data = pd.read_csv('../Datasets/' + str(file_name) + '.csv', header=None)
    print("\n --- FP Growth on File " + str(file_name) + " : and Top Percentage: " + str(TOP_PERCENTAGE))
    # Convert into the list-of-lists format required by TransactionEncoder()
    trans = []
    for i in range(0, no_of_trx):
        trans.append([str(data.values[i, j]) for j in range(0, 20)])
    Items = dict(collections.Counter([x for sublist in trans for x in sublist]))
    Items['nan'] = 0
    print("Frequencies of Each Item:")
    print(Items)
    top_items = top_x_per_products(Items, TOP_PERCENTAGE)
    print("Top Items:")
    print(top_items)
    plot_graph(top_items, 'fp_growth', TOP_PERCENTAGE)
    # Keep only transactions that contain at least one of the top items
    Output = [b for b in trans if any(a in b for a in top_items.keys())]
    # Using TransactionEncoder
    trans = np.array(trans)
    Output = np.array(Output)
    # print(Output.shape)
    t = TransactionEncoder()
    data = t.fit_transform(Output)
    data = pd.DataFrame(data, columns=t.columns_, dtype=int)
    # print(data.shape)
    # 'nan' also shows up as one of the columns, so drop that column
    data.drop('nan', axis=1, inplace=True)
    # print(data.shape)
    # print(data.head())
    # Run the FP-growth algorithm
    res = fpgrowth(data, min_support=0.01, use_colnames=True)
    print("Number of Frequent Item sets: " + str(len(res)))
    res = association_rules(res, metric="confidence", min_threshold=0.5)
    print("\n=============== ASSOCIATION RULES ======================")
    cols = [0, 1, 4, 5]
    res = res[res.columns[cols]]
    print(res)
def rule(): df = pd.read_csv("shopping_data.csv", header=None) dataset = df.stack().groupby(level=0).apply(list).tolist() te = TransactionEncoder() # 定义模型 te_ary = te.fit_transform(dataset) # 转换数据集 df = pd.DataFrame(te_ary, columns=te.columns_) # 将数组处理为 DataFrame frequent_itemsets = apriori(df, min_support=0.05, use_colnames=True) association_rules = rules(frequent_itemsets, metric="confidence", min_threshold=0.2) # 置信度阈值为 0.1 return frequent_itemsets, association_rules
def rule3():
    from mlxtend.frequent_patterns import fpgrowth
    from mlxtend.frequent_patterns import association_rules
    from mlxtend.preprocessing import TransactionEncoder
    now = time.time()
    te = TransactionEncoder()
    te_ary = te.fit_transform(transactions)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    frequent_itemsets = fpgrowth(df, min_support=0.03, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric="lift", min_threshold=0.2)
    print("Frequent itemsets:", frequent_itemsets)
    print("Association rules:", rules[(rules['lift'] >= 1) & (rules['confidence'] >= 0.2)])
    print("Elapsed time:", time.time() - now)
def encode_orders_materials(orders):
    # Group materials by order so each order becomes one transaction
    orders_grouped = orders[['order_id', 'material']].groupby('order_id')
    orders_materials = [
        list(orders_group.material) for (_, orders_group) in orders_grouped
    ]
    encoder = TransactionEncoder()
    orders_materials = encoder.fit_transform(orders_materials, sparse=True)
    orders_index = list(orders_grouped.groups.keys())
    orders_columns = [str(column) for column in encoder.columns_]
    return pandas.DataFrame.sparse.from_spmatrix(orders_materials,
                                                 index=orders_index,
                                                 columns=orders_columns)
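# Hypothetical usage of encode_orders_materials (toy orders table; the real
# column contents are assumed, not confirmed by the source):
import pandas
orders = pandas.DataFrame({
    'order_id': [1, 1, 2, 2, 2],
    'material': ['steel', 'glass', 'steel', 'wood', 'glass'],
})
encoded = encode_orders_materials(orders)
print(encoded)
# Sparse boolean frame: one row per order_id, one column per material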
def unitfiy_sample_dataset(self):
    start = time.perf_counter()
    print("Starting further reduction of the sample dataset")
    shopping_df = pd.DataFrame(self.sampleList)
    df_arr = shopping_df.stack().groupby(level=0).apply(list).tolist()  # method 1
    # df_arr = shopping_df.apply(self.deal, axis=1).tolist()  # method 2
    te = TransactionEncoder()  # define the encoder
    df_tf = te.fit_transform(df_arr)
    # df_01 = df_tf.astype('int')  # convert True/False to 0/1
    # Another option from the official docs:
    # df_name = te.inverse_transform(df_tf)  # map encoded values back to item names
    self.sample_df = pd.DataFrame(df_tf, columns=te.columns_)
    elapsed = time.perf_counter() - start
    print("Time used:", elapsed)
    print("Sample dataset reduction finished")
def rule():
    df = pd.read_csv('shopping_data.csv')
    dataset = df.stack().groupby(level=0).apply(list).tolist()
    te = TransactionEncoder()
    te_ary = te.fit_transform(dataset)
    data = pd.DataFrame(te_ary, columns=te.columns_)
    frequent_itemsets = apriori(data, min_support=0.05, use_colnames=True)
    # Call association_rules directly instead of the undefined rules()
    rules_df = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.2)
    return frequent_itemsets, rules_df
def ele(date_start, date_end, asset_id, support):
    support = float(support)
    date_start = parser.parse(date_start, dayfirst=True)
    date_end = parser.parse(date_end, dayfirst=True)
    data = db.reports
    lists = list(data.find({
        'date': {'$gt': date_start, '$lt': date_end},
        'assetId': asset_id
    }))
    if len(lists) != 0:
        df = pd.DataFrame(lists)
        data_ele = []
        for i in range(0, len(df.elements)):
            data_ele.append(df.elements[i])
        # Replace missing elements so the encoder does not choke on None
        for i in range(0, len(data_ele)):
            for j in range(0, len(data_ele[i])):
                if data_ele[i][j] is None:
                    data_ele[i][j] = "other"
        te = TransactionEncoder()
        te_arry = te.fit_transform(data_ele)
        df1 = pd.DataFrame(te_arry, columns=te.columns_)
        frq_item = apriori(df1, min_support=support, use_colnames=True)
        rule = association_rules(frq_item, metric='confidence', min_threshold=0.5)
        return rule.to_json(orient='records')
    else:
        return "No Data"
def rule():
    df_data = pd.read_csv('shopping_data.csv', header=None)
    dataset = []
    for i in range(len(df_data)):
        list_data = list(df_data.loc[i])
        list_no_nan = []
        for j in range(len(list_data)):
            # Trailing empty cells are read as NaN floats; stop at the first one
            if isinstance(list_data[j], float):
                break
            else:
                list_no_nan.append(list_data[j])
        dataset.append(list_no_nan)
    te = TransactionEncoder()
    te_ary = te.fit_transform(dataset)
    df_te = pd.DataFrame(te_ary, columns=te.columns_)
    frequent_itemsets = apriori(df_te, min_support=0.05, use_colnames=True)
    association_rules_df = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.2)
    print(frequent_itemsets)
    print(association_rules_df)
    return frequent_itemsets, association_rules_df
def generate_rules(df):
    # Discretize the numeric columns into 6 bins each
    df['cat_suicides_num'] = pd.cut(df['num_of_suicides'], bins=6)
    df['cat_suicides/100k'] = pd.cut(df['suicides/100k'], bins=6)
    df['cat_population'] = pd.cut(df['population'], bins=6)
    df['cat_gdp_for_year'] = pd.cut(df['gdp_for_year'], bins=6)
    df['cat_gdp_per_capita'] = pd.cut(df['gdp_per_capita'], bins=6)
    df.drop(['num_of_suicides', 'population', 'suicides/100k',
             'gdp_for_year', 'gdp_per_capita'], axis=1, inplace=True)
    # for i in df['continent'].unique():
    #     df_continent = df.loc[df.continent == i]
    #     df_continent = df_continent.drop(['continent'], axis=1)
    # print('start: ', datetime.now().time())
    # Turn each row into a transaction of stringified values
    trans = []
    for y in range(0, df.shape[0]):
        trans.append([str(df.values[y, j]) for j in range(0, df.shape[1])])
    # print('just transformed the dataset into array: ', datetime.now().time())
    te = TransactionEncoder()
    data = te.fit_transform(trans)
    data = pd.DataFrame(data, columns=te.columns_)
    print(data)
    frequent_items = apriori(data, min_support=0.5, use_colnames=True)
    print(frequent_items)
    rules = association_rules(frequent_items, metric="confidence", min_threshold=0.5)
    print(rules)
    # print('finished mining: ', datetime.now().time())
    rules.to_csv('generated_rules.csv')
def data_for_apriori(self):
    # rating data
    rating_df = pd.read_pickle(self.rating_data_path)
    # Keep only highly rated movies (rating >= 4)
    over_4_rating = rating_df[rating_df['rating'] >= 4]
    user_movie_basket = over_4_rating.groupby('user_id')['movie_id'].apply(set)
    # basket -> vector (fix: pd.DataFrame, not pd.Dataframe)
    transaction = TransactionEncoder()
    basket_array = transaction.fit_transform(user_movie_basket)
    basket_df = pd.DataFrame(basket_array, columns=transaction.columns_)
    # Top 5000 movies by number of ratings
    top_5000_movie = (rating_df.groupby('movie_id')['rating'].count()
                      .sort_values(ascending=False).iloc[:5000].index)
    top_5000_basket = basket_df[top_5000_movie]
    top_5000_basket = top_5000_basket[top_5000_basket.sum(axis=1) > 0]
    return top_5000_basket
def data_transform():
    # Load the data and filter by condition
    df = pd.read_excel('./销售基础表查询.xlsx', sheet_name='销售基础表查询', header=0)
    print('Raw data: {}'.format(df.shape))
    bool_content = ((df['实销数量'] > 0) & (df['实销金额'] > 0))
    df = df[bool_content]
    print('After filtering: {}'.format(df.shape))
    # Select the fields we need
    df2 = df[['单据号', '商品']]
    print('After column selection: {}'.format(df2.shape))
    # Consolidate: one unique receipt number mapped to its products
    df3 = pd.DataFrame([(i, df2[df2['单据号'] == i]['商品'].tolist())
                        for i in df2['单据号'].unique()])
    print('After consolidation: {}'.format(df3.shape))
    # Build the basket-to-products list
    df_arr = df3[1].tolist()
    print('Basket product lists: {}'.format(len(df_arr)))
    # Apply the encoder
    te = TransactionEncoder()
    df_tf = te.fit_transform(df_arr)
    # Build the encoded dataset
    df4 = pd.DataFrame(df_tf, columns=te.columns_)
    print('Transform finished: {}'.format(df4.shape))
    # Return the data
    return df4
def data_transform():
    # Load the data and filter by condition
    df = pd.read_excel('./销售基础表查询.xlsx', sheet_name='销售基础表查询', header=0)
    # print(df.head())
    # print(df.shape)
    bool_content = ((df['实销数量'] > 0) & (df['实销金额'] > 0))
    df = df[bool_content]
    print('df: {}'.format(df.shape))
    # Select the fields we need
    df2 = df[['单据号', '商品']]
    print('df2: {}'.format(df2.shape))
    # Consolidate: one unique receipt number mapped to its products
    df3 = pd.DataFrame([(i, df2[df2['单据号'] == i]['商品'].tolist())
                        for i in df2['单据号'].unique()])
    print('df3: {}'.format(df3.shape))
    # Build the basket-to-products list
    # shopping_lists = []
    # for shopping_list in df3[1]:
    #     shopping_lists.append(shopping_list)
    # shopping_df = pd.DataFrame(shopping_lists)
    # Drop empty values via apply():
    # df_arr = shopping_df.apply(deal, axis=1).tolist()
    df_arr = df3[1].tolist()
    print('df_arr: {}'.format(len(df_arr)))
    # Apply the encoder
    te = TransactionEncoder()
    df_tf = te.fit_transform(df_arr)
    # Build the encoded dataset
    df4 = pd.DataFrame(df_tf, columns=te.columns_)
    print('df4: {}'.format(df4.shape))
    # Return the data
    return df4
import matplotlib.pyplot as plt

all_data = pd.read_csv('dataset_group.csv', header=None)
print(all_data)
unique_id = all_data[1].unique()
print(unique_id.size)
items = all_data[2].unique()
print(items.size)
dataset = [[elem for elem in all_data[all_data[1] == id][2] if elem in items]
           for id in unique_id]
te = TransactionEncoder()
te_ary = te.fit_transform(dataset)
df = pd.DataFrame(te_ary, columns=te.columns_)
print(df)
# 1
results = apriori(df, min_support=0.3, use_colnames=True)
results['length'] = results['itemsets'].apply(lambda x: len(x))
print(results)
results_orig = apriori(df, min_support=0.3, use_colnames=True, max_len=1)
results_orig['length'] = results_orig['itemsets'].apply(lambda x: len(x))
print(results_orig)
results = apriori(df, min_support=0.3, use_colnames=True)
results['length'] = results['itemsets'].apply(lambda x: len(x))
results_2 = results[results['length'] == 2]
def test_fit_transform():
    oht = TransactionEncoder()
    trans = oht.fit_transform(dataset)
    np.testing.assert_array_equal(expect, trans)
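# The test above relies on module-level `dataset` and `expect` fixtures defined
# elsewhere; a hypothetical minimal pair that would satisfy it:
import numpy as np
from mlxtend.preprocessing import TransactionEncoder

dataset = [['a', 'b'], ['b', 'c']]
# Columns come out sorted alphabetically: a, b, c
expect = np.array([[True, True, False],
                   [False, True, True]])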
"D:\\Machine Learning_Algoritms\\Apriori\\GroceryStoreDataSet.csv", encoding='latin1', names=['products'], header=None) num_records = len(Dataframe) print(num_records) transactions = [] for i in range(0, num_records): transactions.append([str(Dataframe.values[i, j]) for j in range(0, 3)]) Dataframe = list(Dataframe["products"].apply(lambda x: x.split(','))) from mlxtend.preprocessing import TransactionEncoder te = TransactionEncoder() te_data = te.fit_transform(Dataframe) Dataframe = pd.DataFrame(te_data, columns=te.columns_) count = Dataframe.loc[:, :].sum() reverse_count = count.sort_values(0, ascending=False).head(11) reverse_count = reverse_count.to_frame() reverse_count = reverse_count.reset_index() #reverse_count = reverse_count.rename(columns = {“index”: “items” ,0: “count”}) plt.style.available plt.rcParams['figure.figsize'] = (10, 6) plt.style.use('dark_background') ax = reverse_count.plot.barh() plt.title("Popular items")
import youtube_process
from mlxtend.preprocessing import TransactionEncoder

file_US = "USvideos.csv"
US_data = pd.read_csv(file_US, keep_default_na=False, low_memory=False)
US_data
df = US_data[['category_id', 'views']]
df
with open("US_category_id.json", 'r') as f:
    content = json.load(f)
category_map = {}
for i in content['items']:
    category_map[int(i['id'])] = i['snippet']['title']
category_map
t = df['category_id'].map(category_map)
df = pd.concat([df, t], axis=1)
df.columns = ['category_id', 'views', 'category']

# Grade view counts into bands A-E; clearer than the original dict-of-booleans
# trick (which also raised KeyError for fewer than 549 views)
def views_grade(x):
    if x >= 4194399:
        return 'A'
    elif x >= 1823157:
        return 'B'
    elif x >= 681861:
        return 'C'
    elif x >= 242329:
        return 'D'
    else:
        return 'E'

df['views_grade'] = [views_grade(i) for i in df['views'].values]
df = df.drop(['category_id', 'views'], axis=1)
df

def deal(data):
    return data.dropna().tolist()

df_arr = df.apply(deal, axis=1).tolist()  # convert rows to lists
TE = TransactionEncoder()  # define the encoder
df_tf = TE.fit_transform(df_arr)
df = pd.DataFrame(df_tf, columns=TE.columns_)
df
    ['莴苣', '豆奶', '尿布', '葡萄酒'],
    ['莴苣', '豆奶', '尿布', '橙汁']]
shopping_df = pd.DataFrame(shopping_list)

def deal(data):
    return data.dropna().tolist()

df_arr = shopping_df.apply(deal, axis=1).tolist()
"""mlxtend models only accept a specific data format. (TransactionEncoder works like one-hot encoding: each value becomes a unique bool column.)"""
from mlxtend.preprocessing import TransactionEncoder
# The model needs a specific input format; this converts the data to bool
# values (a custom function converting to 0/1 would also work)
te = TransactionEncoder()  # define the encoder
df_tf = te.fit_transform(df_arr)
# df_01 = df_tf.astype('int')  # convert True/False to 0/1
# Another option from the official docs:
# df_name = te.inverse_transform(df_tf)  # map encoded values back to item names
df = pd.DataFrame(df_tf, columns=te.columns_)
"""Frequent itemsets:
Import the apriori method and set min_support=0.05 to mine frequent itemsets;
you can then keep only itemsets longer than a given length x.
"""
from mlxtend.frequent_patterns import apriori
frequent_itemsets = apriori(df, min_support=0.05, use_colnames=True)
# use_colnames=True keeps item names; the default False uses column indices
# frequent_itemsets = apriori(df, min_support=0.05)
frequent_itemsets.sort_values(by='support', ascending=False, inplace=True)  # frequent itemsets can be sorted by support
print('Frequent itemsets')
print(frequent_itemsets[frequent_itemsets.itemsets.apply(lambda x: len(x)) >= 2])  # keep itemsets of length >= 2
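# A possible next step for the walkthrough above (not part of the original
# snippet): mine association rules from the frequent itemsets just computed,
# with an assumed confidence threshold of 0.6.
from mlxtend.frequent_patterns import association_rules

rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.6)
rules.sort_values(by='lift', ascending=False, inplace=True)  # strongest associations first
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])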
from datetime import datetime
import json

global rules
data = pd.read_csv('groceryinfo.csv', header=None)
# print(data)
records = []
for i in range(0, 7501):
    records.append([str(data.values[i, j]) for j in range(0, 20)])
removed_records = []
for row in records:
    row = list(filter(lambda a: a != 'nan', row))
    row = list(filter(lambda a: a != 'mineral water', row))
    removed_records.append(row)
te = TransactionEncoder()
data = te.fit_transform(removed_records)
data = pd.DataFrame(data, columns=te.columns_)
from mlxtend.frequent_patterns import apriori, association_rules
frq_items = apriori(data, min_support=0.004, use_colnames=True)
rules = association_rules(frq_items, metric="confidence", min_threshold=0.2)
rules = rules.sort_values(['confidence', 'lift'], ascending=[False, False])
print(len(rules))
global products
products = {
    "olive oil": 'https://images-na.ssl-images-amazon.com/images/I/71JLJ0MQT8L._SY679_.jpg',
    "frozen vegetables": 'https://images-na.ssl-images-amazon.com/images/I/81Dxf-0CzwL._SL1500_.jpg',
# Split each product string into a separate list
transactions = list(items["mehsul_ad"].transform(lambda x: x.split(";")))

# In[12]:

transactions[0]

# In[13]:

# Pivot the products into baskets: each row (basket) records which products it contains
from mlxtend.preprocessing import TransactionEncoder
tr_enc = TransactionEncoder()
basket = pd.DataFrame(tr_enc.fit_transform(transactions), columns=tr_enc.columns_)

# In[14]:

basket

# In[15]:

# Functions needed for basket analysis
from mlxtend.frequent_patterns import apriori, association_rules

# In[16]:

# How often products appear in baskets, individually and together
""" import csv dict = dkey w = csv.writer(open("output.csv", "w")) for key, val in dict.items(): w.writerow([key, val]) """ # To create a list of lists from the dictionary values i=10002 while i in range(10002,42580): dkey[i]=list(map(str,dkey[i])) we=list(dkey.values()) #Fitting the association rule learning model from mlxtend.preprocessing import TransactionEncoder te = TransactionEncoder() dat=we[1:50] te_ary = te.fit_transform(dat,sparse=False) df = pd.DataFrame(te_ary, columns=te.columns_) from mlxtend.frequent_patterns import apriori frequent_itemsets = apriori(df, min_support=0.1, use_colnames=True) print (frequent_itemsets) from mlxtend.frequent_patterns import association_rules t=association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)
from apyori import apriori
rules = apriori(symptoms, min_support=0.1, min_confidence=0.7)
results = list(rules)
for i in results:
    for j in i.ordered_statistics:
        X = j.items_base
        Y = j.items_add
        x = ', '.join([item for item in X])
        y = ', '.join([item for item in Y])
        if x != '':
            print(x + ' → ' + y)

from mlxtend.preprocessing import TransactionEncoder
TE = TransactionEncoder()
data = TE.fit_transform(symptoms)
print(data)
import pandas as pd
df = pd.DataFrame(data, columns=TE.columns_)
df.head()
from mlxtend.frequent_patterns import apriori
items = apriori(df, min_support=0.1, use_colnames=True)
print(items)
print(items[items['itemsets'].apply(lambda x: len(x)) >= 2])
from mlxtend.frequent_patterns import association_rules
rules = association_rules(items, min_threshold=0.7)
def Mlx(itemsets, minimumSup):
    te = TransactionEncoder()  # define the encoder
    df_tf = te.fit_transform(itemsets)
    df = pd.DataFrame(df_tf, columns=te.columns_)
    return apriori(df, min_support=minimumSup, use_colnames=True)
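# Hypothetical call to Mlx above (toy transactions for illustration):
toy_itemsets = [['a', 'b', 'c'], ['a', 'b'], ['a', 'c'], ['a']]
print(Mlx(toy_itemsets, 0.5))
# Expected frequent itemsets at min_support=0.5:
# {a} (support 1.0), {b}, {c}, {a, b}, {a, c} (each support 0.5)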
# %%
items = [
    'whole milk', 'yogurt', 'soda', 'tropical fruit', 'shopping bags',
    'sausage', 'whipped/sour cream', 'rolls/buns', 'other vegetables',
    'root vegetables', 'pork', 'bottled water', 'pastry', 'citrus fruit',
    'canned beer', 'bottled beer'
]
np_data_new = all_data.to_numpy()
np_data_new = [[
    elem for elem in row[1:] if isinstance(elem, str) and elem in items
] for row in np_data_new]

# %%
te_new = TransactionEncoder()
te_ary_new = te_new.fit_transform(np_data_new)
data_new = pd.DataFrame(te_ary_new, columns=te_new.columns_)
data_new

# %%
fpg_result_new = fpgrowth(data_new, min_support=0.03,
                          use_colnames=True).sort_values('support', ascending=False)
fpg_result_new

# %%
fpm_result_new = fpmax(data_new, min_support=0.03,
                       use_colnames=True).sort_values('support', ascending=False)
fpm_result_new
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
import requests

file = open('/root/PycharmProjects/DATA_MINING/groceries.csv', 'w')
data = requests.get(
    'https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/groceries.csv'
)
file.write(data.text)
file.close()
file = open('/root/PycharmProjects/DATA_MINING/groceries.csv', 'r')
lines = file.readlines()
# Split each line into items, stripping the trailing newline
data = [[x if x[len(x) - 1] != '\n' else x[:len(x) - 1] for x in y.split(',')]
        for y in lines]
print(data)
encoder = TransactionEncoder()
data = encoder.fit_transform(data)
print(data)
data = data.astype('int')
data = pd.DataFrame(data, columns=encoder.columns_)
print(data)
print(" Minimum support 5 % and confidence 9 % ")
frq_items = apriori(data, min_support=0.05, use_colnames=True)
rules = association_rules(frq_items, metric="confidence", min_threshold=0.09)
print(rules)
print(" Minimum support 7 % and confidence 10 % ")
frq_items = apriori(data, min_support=0.07, use_colnames=True)
rules = association_rules(frq_items, metric="confidence", min_threshold=0.10)
print(rules)
def get_ferq_with_txt(txt, freq):
    # Reconstructed header: the original snippet started mid-function; the
    # name and arguments are taken from the test() call below
    res = []
    for i in range(len(freq)):
        if freq[i] in txt:
            res.append(freq[i])
    if len(set(res)) == 0:
        return set(freq)
    elif len(set(res)) < 15:
        # Pad with more frequent tags until there are 15
        for j in range(len(freq)):
            if len(set(res)) < 15:
                res.append(freq[j])
            else:
                return set(res)
        return set(res)
    else:
        return set(res)

df = pd.read_csv("dataset/sample.csv")['Tags']
df = df.apply(lambda x: x.split(" "))
data = df.values
td = TransactionEncoder()
td_data = td.fit_transform(df)
df2 = pd.DataFrame(td_data, columns=td.columns_)
freq_data = apriori(df2, min_support=0.009, use_colnames=True)  # 0.0004
freq_data['len'] = freq_data.itemsets.apply(lambda x: len(x))

def test():
    df = pd.read_csv("sample.csv").head(1)
    txt = df["Title"] + " " + df["Body"]
    return get_ferq_with_txt(txt, ["linux", "c#", "php"])
'''
[Plain-Language Machine Learning] Theory + Practice of Association Rules
https://mp.weixin.qq.com/s/KXoKE0cY7hiJIA2hE86mDw
'''
# 1. A directly usable dataset
data = [('牛奶', '面包', '尿布'),
        ('可乐', '面包', '尿布', '啤酒'),
        ('牛奶', '尿布', '啤酒', '鸡蛋'),
        ('面包', '牛奶', '尿布', '啤酒'),
        ('面包', '牛奶', '尿布', '可乐')]
# 1.1 Third-party library:
from mlxtend.frequent_patterns import apriori as mlxtend_apriori, association_rules as mlxtend_association_rules
from mlxtend.preprocessing import TransactionEncoder
# TransactionEncoder does the data conversion: the data above must first be
# turned into a wide table, i.e. one boolean column per item, as built below:
"""Data conversion"""
transEn = TransactionEncoder()
oht_ary = transEn.fit_transform(data)
new_data = pd.DataFrame(oht_ary, columns=transEn.columns_)

# In[]:
print(new_data.iloc[0][0])
print(type(new_data.iloc[0][0]))

# In[]:
# Step 1: compute the frequent itemsets; a minimum support can be set here to
# filter them:
"""Compute frequent itemsets"""
frequent_itemset = mlxtend_apriori(new_data, min_support=0.5, use_colnames=True)
frequent_itemset

# In[]:
# Step 2: mine the association rules; the metric can be confidence or lift
rules = mlxtend_association_rules(frequent_itemset, metric='confidence',