def create_rules():
    fname = './Sample_Data/sample_variants.txt'
    dataset = []
    with open(fname, 'r') as fhandle:
        for line in fhandle:
            if line.strip():
                patientid, genes = line.split('\t')
                # strip the trailing newline before splitting the gene list
                genes = genes.rstrip('\n').split(',')
                dataset.append(genes)
    print(dataset)
    oht = OnehotTransactions()
    oht_ary = oht.fit(dataset).transform(dataset)
    df = pd.DataFrame(oht_ary, columns=oht.columns_)
    frequent_itemsets = apriori(df, min_support=0.6, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)
    return rules
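# A minimal usage sketch for create_rules() (an illustrative assumption, not
# part of the original source): association_rules returns a DataFrame with
# 'support', 'confidence' and 'lift' columns, so the result can be ranked
# directly with ordinary pandas calls.
if __name__ == '__main__':
    rules = create_rules()
    print(rules.sort_values('lift', ascending=False).head(10))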
def get_itemsets(inputdata):
    oht = OnehotTransactions()
    oht_ary = oht.fit(inputdata).transform(inputdata)
    df = pd.DataFrame(oht_ary, columns=oht.columns_)
    frequent_itemsets = apriori(df, min_support=0.1, max_len=4, use_colnames=True)
    # rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)
    frequent_itemsets.to_csv('AANFrequentItemsets.csv')
def test_cloning():
    oht = OnehotTransactions()
    oht.fit(dataset)
    oht2 = clone(oht)

    msg = "'OnehotTransactions' object has no attribute 'columns_'"
    assert_raises(AttributeError, msg, oht2.transform, dataset)

    trans = oht2.fit_transform(dataset)
    np.testing.assert_array_equal(expect, trans)
def get_oht_dataframe(self):
    self.short_list = self.get_processed_list()
    # cap the number of transactions at 15,000 (slicing past the end of a
    # shorter list is a no-op, so no length check is needed)
    self.short_list = self.short_list[:15000]
    print(len(self.short_list))
    oht = OnehotTransactions()
    oht_ary = oht.fit(self.short_list).transform(self.short_list)
    data_frame = pd.DataFrame(oht_ary, columns=oht.columns_)
    return data_frame
def main():
    # assumes a module-level `dataset` (a list of transaction lists) is defined
    oht = OnehotTransactions()
    oht_ary = oht.fit(dataset).transform(dataset)
    df = pd.DataFrame(oht_ary, columns=oht.columns_)
    frequent_itemsets = apriori(df, min_support=0.5, use_colnames=True)
    print(frequent_itemsets)
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.6)
    print(rules)
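# A hedged usage sketch (an assumption, not from the source): main() expects
# a module-level `dataset` of transaction lists, e.g. a small demo set.
dataset = [['Milk', 'Bread', 'Eggs'],
           ['Milk', 'Bread'],
           ['Bread', 'Eggs'],
           ['Milk', 'Eggs']]

if __name__ == '__main__':
    main()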
def define_segments(QLINK_URLS, UNKNOWN_URLS, QUOTA):
    global OH, lr, quota, i, km, part
    quota = QUOTA

    # collect URL properties for the quota links
    list_q = []
    obj_q = []
    for url in QLINK_URLS:
        prop = get_prop(url)
        list_q += prop
        obj_q.append(prop)

    # rank properties by frequency, most frequent first
    params, count = np.unique(list_q, return_counts=True)
    params = np.array(params, dtype='object')
    count = np.array(count, dtype='object')
    feat_q_num = np.hstack((params.reshape(-1, 1), count.reshape(-1, 1)))
    feat_q_num = feat_q_num[np.argsort(feat_q_num[:, 1])]
    feat_q_num = feat_q_num[::-1]

    obj_g = []
    for url in UNKNOWN_URLS:
        prop = get_prop(url)
        obj_g.append(prop)

    # one-hot encode the 100 most frequent properties
    imp_features = (feat_q_num[:100][:, 0]).reshape(-1, 1)
    OH = OnehotTransactions()
    OH.fit(imp_features)
    q_matrix = OH.transform(obj_q)
    g_matrix = OH.transform(obj_g)

    X = np.vstack((q_matrix, g_matrix))
    y = np.array([1] * 500 + [0] * 500)
    lr = LinearRegression()
    lr.fit(X, y)
    i = 0

    # divide into n clusters and set the quota share for each one
    n = 15
    km = KMeans(n_clusters=n)
    a = np.hstack((km.fit_predict(X).reshape(-1, 1), y.reshape(-1, 1)))
    part = np.zeros(n)
    for i in range(n):
        part[i] = a[a[:, 0] == i][:, 1].sum()
    part /= part.sum()
def findAssociationWord(sentenceList):
    transactionID = list()
    for sentence in sentenceList:
        word_list = word_tokenize(sentence, engine='mm')
        # drop stopwords and pure-punctuation tokens; filtering into a new
        # list avoids mutating the list while iterating over it
        word_list = [word for word in word_list
                     if word not in stopword and not re.match(r'[\W]+', word)]
        transactionID.append(word_list)
    oht = OnehotTransactions()
    oht_ary = oht.fit(transactionID).transform(transactionID)
    df = pd.DataFrame(oht_ary, columns=oht.columns_)
    frequent_itemsets = apriori(df, min_support=0.01, use_colnames=True)
    result = association_rules(frequent_itemsets, metric="lift", min_threshold=0.5)
    print(result)
def experiment3():
    print('Experiment 3 Association Rules of Actors and Directors')
    print('-------------------------------------------------------')
    # read in the data as a pandas dataframe
    df = pd.read_csv('./data/assocRules_withDirector.csv')
    df = df.dropna()
    # get just the values, without the header
    df_values = df.values
    # one-hot encode the transactions to vectorize the categorical data
    oht = OnehotTransactions()
    df_processed = oht.fit(df_values).transform(df_values)
    # rebuild the dataframe from the transformed data
    df = pd.DataFrame(df_processed, columns=oht.columns_)
    # find frequent itemsets with the apriori algorithm
    frequent_combinations = apriori(df, min_support=0.001, use_colnames=True)
    # create the tabular ruleset
    rules = association_rules(frequent_combinations, metric="lift", min_threshold=1)
    rules.to_csv('./results/association_rules.csv', sep='\t')
    print(rules)
def Apriori(id, dataset, principal):
    pasos = "Dataset loaded" + '\n'
    dataset = pickdataset(int(id), dataset)
    oht = OnehotTransactions()
    oht_ary = oht.fit(dataset).transform(dataset)
    df = pd.DataFrame(oht_ary, columns=oht.columns_)
    frequent_itemsets = apriori(df, min_support=0.6, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)
    pasos += "Processed dataset: " + '\n'
    pasos += str(df) + '\n'
    pasos += "Item set: " + '\n'
    pasos += str(frequent_itemsets) + '\n'
    # mean support across all mined rules (as_matrix() is deprecated)
    avgReal = str(np.mean(rules['support'].values) * 100) + "% average support"
    reglas = rules[['antecedents', 'consequents', 'support']]
    if principal:
        context = {
            'algoritmoPrincipal': 'Apriori',
            'resultado': avgReal,
            'pasos': pasos,
            'reglas': reglas,
            'img': 'Not applicable'
        }
    else:
        context = {
            'algoritmoComparar': 'Apriori',
            'resultado2': avgReal,
            'pasos2': pasos,
            'reglas2': reglas,
            'img2': 'Not applicable'
        }
    return context
def test_fit():
    oht = OnehotTransactions()
    oht.fit(dataset)
    assert (oht.columns_ == ['Apple', 'Bananas', 'Beer',
                             'Chicken', 'Milk', 'Rice'])
import pandas as pd
import numpy as np
from mlxtend.preprocessing import OnehotTransactions
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# read the file as lines of strings
with open("categories.txt") as file:
    default = file.read().splitlines()

# convert the strings into lists
data = []
for s in default:
    data.append(list(map(str, s.split(';'))))
# dataNew = pd.get_dummies(data)

# convert the dataset into one-hot transactions (1 if the item is present, 0 if not)
oht = OnehotTransactions()
dataNew = oht.fit(data).transform(data)
dataNew = pd.DataFrame(dataNew, columns=oht.columns_)
# print(dataNew.head())

# apriori algorithm with the given parameters
frequent_itemsets = apriori(dataNew, min_support=0.01, use_colnames=True)
frequent_itemsets = np.array(frequent_itemsets)
# for i in range(len(frequent_itemsets)):
#     print("{}:{}".format(int(frequent_itemsets[i][0]*len(data)), frequent_itemsets[i][1][0]))

# writing to the text file
# write_file = open("new_pattern.txt", "w")
# for i in range(len(frequent_itemsets)):
import pandas as pd
from mlxtend.preprocessing import OnehotTransactions
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

data = pd.read_csv('./inout_flow_data.csv')
data = data.loc[(data['station_id'] == 519) &
                ((data['in_flow_count'] != 0) | (data['out_flow_count'] != 0)),
                ['in_flow_count', 'out_flow_count']]

# bin the in/out flow counts into buckets of 5 and tag them 'i'/'o'
lst = []
for value in data.itertuples():
    lst.append(['i' + str(int(value[1] / 5)), 'o' + str(int(value[2] / 5))])

oht = OnehotTransactions()
oht_ary = oht.fit(lst).transform(lst)
df = pd.DataFrame(oht_ary, columns=oht.columns_)
frequent_itemsets = apriori(df, min_support=0.03, use_colnames=True)
print(frequent_itemsets)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.4)
print(rules)
def test_inverse_transform():
    oht = OnehotTransactions()
    oht.fit(dataset)
    np.testing.assert_array_equal(np.array(data_sorted),
                                  np.array(oht.inverse_transform(expect)))
def test_fit_transform():
    oht = OnehotTransactions()
    trans = oht.fit_transform(dataset)
    np.testing.assert_array_equal(expect, trans)
# dataset = [['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
#            ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
#            ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
#            ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
#            ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs']]
from datetime import timedelta, date

import pandas as pd
from mlxtend.preprocessing import OnehotTransactions
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

from es02 import es02

dataset = []

if __name__ == "__main__":
    es = es02()
    es.set_service("display")
    es.load_datas2(date(2017, 12, 1), date(2018, 1, 8))
    dataset2 = es.dset
    # keep only transactions with more than one item
    dataset = [item for item in dataset2 if len(item) > 1]
    for item in dataset:
        print(item)

    oht = OnehotTransactions()
    oht_ary = oht.fit(dataset).transform(dataset)
    df = pd.DataFrame(oht_ary, columns=oht.columns_)

    frequent_itemsets = apriori(df, min_support=0.3, use_colnames=True)
    print(frequent_itemsets)

    arule = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)
    print(arule)
"""
@author: AliOthman
"""
import pandas as pd
from mlxtend.preprocessing import OnehotTransactions
from mlxtend.frequent_patterns import apriori

# dataset = [['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
#            ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
#            ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
#            ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
#            ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs']]

data = pd.read_csv('bakery.csv')
# OnehotTransactions expects a list of transactions (lists of items), so the
# dataframe rows are converted to lists before fitting (an assumption about
# the layout of bakery.csv; the original fit directly on the dataframe)
transactions = data.values.tolist()
process = OnehotTransactions()
matrix = process.fit(transactions).transform(transactions)
df = pd.DataFrame(matrix, columns=process.columns_)

# apriori(df, min_support=0.6)
# apriori(df, min_support=0.6, use_colnames=True)
frequent_itemsets = apriori(df, min_support=0.0123, use_colnames=True)
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
# frequent_itemsets[(frequent_itemsets['length'] == 2) &
#                   (frequent_itemsets['support'] >= 0.8)]
print(frequent_itemsets)
            'register': u_r,
            'trannum': u_t,
            'basket': basket_SKUs
        }
        list_of_baskets.append(d)
    else:
        pass

basket_df = pd.DataFrame(list_of_baskets)
all_baskets = basket_df['basket'].tolist()

from mlxtend.preprocessing import OnehotTransactions
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

oht = OnehotTransactions()
oht_array = oht.fit(all_baskets).transform(all_baskets)
oht_df = pd.DataFrame(oht_array, columns=oht.columns_)
print('One Hot Encoding Successful')
print('-----------------------------')

frequent_SKUs = apriori(oht_df, min_support=0.0004, use_colnames=True)
print('Frequent SKUs Successful')
print('----------------------------------------------------------')

assoc_rules = association_rules(frequent_SKUs, metric="confidence", min_threshold=0.5)

# pickle the association rules dataframe for easy reference
# In[19]:

# .values returns the underlying numpy array (as_matrix() is deprecated)
numpyMatrix = df.values

# In[21]:

numpyMatrix

# In[22]:

from mlxtend.preprocessing import OnehotTransactions

# In[23]:

oht = OnehotTransactions()
oht_ary = oht.fit(numpyMatrix).transform(numpyMatrix)
dataframe = pd.DataFrame(oht_ary, columns=oht.columns_)

# In[35]:

# drop() returns a new dataframe, so assign the result back
dataframe = dataframe.drop(dataframe.columns[0], axis=1)

# In[56]:

from mlxtend.frequent_patterns import apriori

frequent_itemsets = apriori(dataframe, min_support=0.05, use_colnames=True)
frequent_itemsets

# In[57]:
Data = [['Power Bank', 'Screen Guard', 'Travel Charger'],
        ['Screen Guard', 'Bluetooth Headset', 'Mobile Cover'],
        ['Screen Guard', 'Arm Band', 'Mobile Cover'],
        ['Power Bank', 'Screen Guard', 'Leather Pouch'],
        ['Bluetooth Headset', 'Power Bank', 'Mobile Cover']]

import pandas as pd
from mlxtend.preprocessing import OnehotTransactions
from mlxtend.frequent_patterns import apriori

###Start code here
oht = OnehotTransactions()
oht_ary = oht.fit(Data).transform(Data)
dataFrame = pd.DataFrame(oht_ary, columns=oht.columns_)
frequent_itemsets = apriori(dataFrame, min_support=0.1, use_colnames=True)
###End code(approx 4 lines)

from mlxtend.frequent_patterns import association_rules

###Start code here
association_rule = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)
print(association_rule)
###End code(approx 2 lines)

'''
                      antecedants                 consequents  \
0                      (Arm Band)              (Screen Guard)
1    (Travel Charger, Power Bank)              (Screen Guard)
2  (Travel Charger, Screen Guard)                (Power Bank)
3                (Travel Charger)  (Power Bank, Screen Guard)
4             (Bluetooth Headset)              (Mobile Cover)
def get(self, request, *args, **kwargs):
    context = self.get_context_data(**kwargs)
    threshold = request.GET.get('parameter')
    percentage = float(request.GET.get('percentage'))
    support = float(request.GET.get('support'))
    nodes = int(request.GET.get('nodes'))

    # Read the working file data and prepare transactions for
    # the Apriori algorithm
    file = WFile.objects.get(pk=kwargs['pk'])
    store_data = pd.read_csv(file.file.path)
    columns = request.GET.getlist('selected')
    dataset = []
    for _, row in store_data.iterrows():
        aux = []
        for c in columns:
            aux.append(c + '=' + row[c])
        dataset.append(aux)

    # Apriori algorithm: extraction of frequent itemsets
    # and association rules
    oht = OnehotTransactions()
    oht_ary = oht.fit(dataset).transform(dataset)
    df = pd.DataFrame(oht_ary, columns=oht.columns_)
    frequent_itemsets = apriori(df, min_support=support, use_colnames=True)
    context['frequent_itemsets'] = frequent_itemsets
    rules = association_rules(frequent_itemsets, metric=threshold, min_threshold=percentage)
    # rules = association_rules(frequent_itemsets, metric="lift", min_threshold=0.2)
    context['rules'] = rules

    # Build a scatter plot of support vs. confidence
    # (as_matrix() is deprecated; [['col']].values is equivalent)
    support = rules[['support']].values
    confidence = rules[['confidence']].values
    for i in range(len(support)):
        # add a small jitter so overlapping points remain visible
        support[i] = support[i] + 0.0025 * (random.randint(1, 10) - 5)
        confidence[i] = confidence[i] + 0.0025 * (random.randint(1, 10) - 5)
    plt.gcf().clear()
    plt.scatter(support, confidence, alpha=0.5, marker="*")
    plt.xlabel('support')
    plt.ylabel('confidence')
    plt.tight_layout()
    file1 = io.BytesIO()
    plt.savefig(file1)
    file1 = ContentFile(file1.getvalue())

    # Build a histogram of single-item frequencies
    frequency_array = frequent_itemsets[frequent_itemsets['itemsets'].map(len) == 1]
    total_transactions = len(dataset)
    histogram_labels = []
    histogram_frequency = []
    for index, row in frequency_array.iterrows():
        histogram_frequency.append(int(row['support'] * total_transactions))
        histogram_labels.append(list(row['itemsets'])[0])
    histogram_data = []
    for i in range(len(histogram_labels)):
        histogram_data += [histogram_labels[i] for x in range(histogram_frequency[i])]

    #####
    # Create the plot
    #####
    plt.gcf().clear()
    fig, ax = plt.subplots()
    # the histogram of the data
    n, bins, patches = ax.hist(histogram_data)
    ax.set_ylabel('Frequency')
    plt.xticks(histogram_labels, rotation=90, fontsize='x-small')
    # Tweak spacing to prevent clipping of ylabel
    fig.tight_layout()
    file2 = io.BytesIO()
    plt.savefig(file2)
    file2 = ContentFile(file2.getvalue())

    # Build the heat map: convert the input into a 2D co-occurrence dictionary
    freqMap = {}
    for line in dataset:
        for item in line:
            if item not in freqMap:
                freqMap[item] = {}
            for other_item in line:
                if other_item not in freqMap:
                    freqMap[other_item] = {}
                freqMap[item][other_item] = freqMap[item].get(other_item, 0) + 1
                freqMap[other_item][item] = freqMap[other_item].get(item, 0) + 1
    df = DataFrame(freqMap).T.fillna(0)

    #####
    # Create the plot
    #####
    plt.gcf().clear()
    plt.pcolormesh(df, edgecolors='black')
    plt.yticks(np.arange(0.5, len(df.index), 1), df.index, fontsize='x-small')
    plt.xticks(np.arange(0.5, len(df.columns), 1), df.columns,
               rotation=90, fontsize='x-small')
    plt.tight_layout()
    file3 = io.BytesIO()
    plt.savefig(file3)
    file3 = ContentFile(file3.getvalue())

    # Draw the graph for the association rules
    file4 = draw_graph(rules, nodes)
    print(file1, file2, file3, file4)

    results = AssociationRules.objects.create()
    results.scatter.save('scatter.png', file1)
    results.histogram.save('histogram.png', file2)
    results.heat_map.save('heat_map.png', file3)
    results.graph.save('graph.png', file4)
    context['results'] = results
    return self.render_to_response(context)
import json
import pandas as pd

recipe_data = pd.read_json('./train.json')
print(recipe_data)

# In[2]:

ingredients = recipe_data['ingredients'].tolist()
print(ingredients[:5])

# In[3]:

from mlxtend.preprocessing import OnehotTransactions
from mlxtend.frequent_patterns import apriori

oht = OnehotTransactions()
oht_ary = oht.fit(ingredients).transform(ingredients)
df_train = pd.DataFrame(oht_ary, columns=oht.columns_)
df_train

# In[4]:

frequent_itemsets_train = apriori(df_train, min_support=0.05, use_colnames=True)
print(frequent_itemsets_train)

# In[5]:

from mlxtend.frequent_patterns import association_rules
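# In[6]:

# A hedged continuation sketch: the original notebook breaks off at the
# import above, so this cell is an assumption showing the natural next step;
# the metric and min_threshold values are illustrative, not from the source.
rules_train = association_rules(frequent_itemsets_train, metric="lift", min_threshold=1.0)
print(rules_train.head())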
import os
import csv

import pandas as pd
from mlxtend.preprocessing import OnehotTransactions
from mlxtend.frequent_patterns import apriori, association_rules

os.chdir('E:\\20.association analysis\\dataset\\75000')
with open('75000-out1.csv') as f:
    rows = list(csv.reader(f))
# drop the leading transaction-id field from every row
for row in rows:
    row.pop(0)

oht = OnehotTransactions()
oht_ary = oht.fit(rows).transform(rows)
df = pd.DataFrame(oht_ary, columns=oht.columns_)

frequent_itemsets = apriori(df, min_support=0.05, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
# keep only the strongest rules
strong_rules = rules[(rules['lift'] >= 6) & (rules['confidence'] >= 0.8)]
print(strong_rules)
all_cells_list = []

# Loop that appends the parsed value of each cell onto the list
for cell in sheet.col(spreadsheet_column):
    # Get the value found in the cell
    cell_value = cell.value
    # Remove unneeded characters to reduce clutter
    cell_value = cell_value.replace('"', "")
    cell_value = cell_value.replace(r'[', '')
    cell_value = cell_value.replace(r']', '')
    # Split the values apart on commas and append them to the cell list
    cell_value_parsed = cell_value.split(',')
    all_cells_list.append(cell_value_parsed)

print(all_cells_list)

# Creation of the data frame based on the cell list
oht = OnehotTransactions()
oht_ary = oht.fit(all_cells_list).transform(all_cells_list)
df = pd.DataFrame(oht_ary, columns=oht.columns_)
print(df)

# ----------------------------------------------------------------------------
# Configuration for the apriori algorithm
# ----------------------------------------------------------------------------
# minimum support threshold
min_co = 0.3
# True or False: whether or not to include the column names in the output
use_colnames_bool = True
# maximum length of the itemsets (None = no limit)
max_len_value = None

# the original call was truncated here; it is completed with the three
# configuration variables defined above
frequent_itemsets = apriori(df, min_support=min_co,
                            use_colnames=use_colnames_bool,
                            max_len=max_len_value)
disease = "" print("ENTER") with open("rollup_dataset.csv", "rt", encoding='utf-8') as csvfile: reader = csv.reader(csvfile) for row in reader: row_clean = [i for i in row if i] bucket_clean = [i for i in bucket if i] if len(row_clean) == (len(bucket_clean) + 1): if all(values in row_clean for values in bucket_clean): disease = row_clean[0] break return disease buckets = [] with open("buckets_new.csv") as csvfile: reader = csv.reader(csvfile) for row in reader: buckets.append(row) oht = OnehotTransactions() oht_ary = oht.fit(buckets).transform(buckets) df = pd.DataFrame(oht_ary, columns=oht.columns_) frequent_itemsets = apriori(df, min_support=0.4, use_colnames=True) frequent_itemsets = frequent_itemsets[frequent_itemsets['support'] >= 0.7] frequent_itemsets = frequent_itemsets[frequent_itemsets['itemsets']] print(list(frequent_itemsets)) #get_disease(frequent_itemsets[0]['itemsets'], buckets)
import csv

import pandas as pd
from IPython.display import display
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import OnehotTransactions

dataset = []
with open('supermarket.csv', newline='') as f:
    transactions = csv.reader(f)
    dataset = list(transactions)

oht = OnehotTransactions()
oht_ary = oht.fit(dataset).transform(dataset)
data_frame = pd.DataFrame(oht_ary, columns=oht.columns_)

frequent_itemsets = apriori(data_frame, min_support=0.3, use_colnames=True)

# get the association rules
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.84).round(2)
display(rules)
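# A hedged follow-up sketch (not part of the original script): the frame
# returned by association_rules also carries 'support' and 'lift' columns,
# so it can be narrowed further with ordinary pandas indexing. The 1.2
# threshold below is an illustrative assumption.
strong_rules = rules[rules['lift'] > 1.2]
display(strong_rules[['antecedents', 'consequents', 'confidence', 'lift']])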
f.write("%s " % content2) f.write("%s\n" % content3) total_eval_count = total_eval_count + 1 print "Total Number of evaluations", print total_eval_count # Make a list of all the Side Effects, so that we can convert it to One Hot Notation side_effect_list = [] for patient in patient_data.keys(): for drug in patient_data[patient]["Treatments List"].keys(): side_effect_list.append( patient_data[patient]["Treatments List"][drug]["Side Effects"]) # Converting to One Hot Vector convert = OnehotTransactions() side_effect_list = convert.fit(side_effect_list).transform(side_effect_list) # Making the Librec Dictionary i.e. the vectorised feature dataset side_effect_counter = 0 librec_dict = {} for patient in patient_data.keys(): patient_counter = patient_id[patient] librec_dict[patient_counter] = {} for drug in patient_data[patient]["Treatments List"].keys(): key = int(drug) librec_dict[patient_counter][key] = {} librec_dict[patient_counter][key]["Side Effect Rating"] = patient_data[ patient]["Treatments List"][drug]["Side Effect Rating"]