def get_freqitemsets(fname, minsupport, maxlhs):
    # minsupport is an integer percentage (e.g. 10 for 10%)
    # maxlhs is the maximum size of the lhs
    # first load the data
    data, Ydata = load_data(fname)
    # Now find frequent itemsets
    # Mine separately for each class
    data_pos = [x for i, x in enumerate(data) if Ydata[i, 0] == 0]
    data_neg = [x for i, x in enumerate(data) if Ydata[i, 0] == 1]
    assert len(data_pos) + len(data_neg) == len(data)
    Y = [0, 0]
    Y[0] = sum([1 << i for i, x in enumerate(data) if Ydata[i, 0] == 1])
    Y[1] = sum([1 << i for i, x in enumerate(data) if Ydata[i, 1] == 1])
    itemsets = [r[0] for r in fpgrowth(data_pos, supp=minsupport, zmax=maxlhs)]
    itemsets.extend([r[0] for r in fpgrowth(data_neg, supp=minsupport, zmax=maxlhs)])
    itemsets = list(set(itemsets))
    print len(itemsets), 'rules mined'
    # Now form the data-vs.-lhs set
    # X[j] is the bit vector of data points that contain itemset j (that is, satisfy rule j)
    X = [0 for j in range(len(itemsets) + 1)]
    global trainingSize
    trainingSize = len(data)
    X[0] = (1 << trainingSize) - 1  # the default rule satisfies all data, so all bits are 1's
    for (j, lhs) in enumerate(itemsets):
        X[j + 1] = sum([1 << i for (i, xi) in enumerate(data) if set(lhs).issubset(xi)])
    # now form lhs_len
    lhs_len = [0]
    for lhs in itemsets:
        lhs_len.append(len(lhs))
    nruleslen = Counter(lhs_len)
    lhs_len = array(lhs_len)
    itemsets_all = ['null']
    itemsets_all.extend(itemsets)
    return X, Y, nruleslen, lhs_len, itemsets_all

def construct_fp_train(filename):
    setfile = open(filename)
    reader = csv.reader(setfile, delimiter=";")
    # General statistics
    set_ids = []
    measures = []
    notes_fp = []
    results = []
    for row in reader:
        id_file = row[0]
        set_ids.append(id_file)
        measures.append(get_measures(id_file))
    measures = itertools.chain(*measures)
    notes_fp = fpgrowth(list(measures), report='S', zmin=2)
    print len(notes_fp)
    setfile = open(filename)
    reader = csv.reader(setfile, delimiter=";")
    for row in reader:
        id_file = row[0]
        measure = get_measures(id_file)
        fp_song = fpgrowth(measure, report='S', zmin=2)
        result = compare_fp(notes_fp, fp_song)
        results.append(result)
    return results, notes_fp

def get_freqitemsets_1(dataset, Y, minsupport, maxlhs):
    # minsupport is an integer percentage (e.g. 10 for 10%)
    # maxlhs is the maximum size of the lhs
    # Now find frequent itemsets
    # Mine separately for each class
    data_pos = [x for i, x in enumerate(dataset) if Y[i] == 0]
    data_neg = [x for i, x in enumerate(dataset) if Y[i] == 1]
    print 'ok'
    #data_pos = dataset[0:72]
    #data_neg = dataset[72:144]
    assert len(data_pos) + len(data_neg) == len(dataset)
    try:
        itemsets = [r[0] for r in fpgrowth(data_pos, supp=minsupport, zmax=maxlhs)]
        itemsets.extend([r[0] for r in fpgrowth(data_neg, supp=minsupport, zmax=maxlhs)])
    except TypeError:
        itemsets = [r[0] for r in fpgrowth(data_pos, supp=minsupport, max=maxlhs)]
        itemsets.extend([r[0] for r in fpgrowth(data_neg, supp=minsupport, max=maxlhs)])
    itemsets = list(set(itemsets))
    print(len(itemsets), 'rules mined')
    # Now form the data-vs.-lhs set
    # X[j] is the set of data points that contain itemset j (that is, satisfy rule j)
    X = [set() for j in range(len(itemsets) + 1)]
    X[0] = set(range(len(dataset)))  # the default rule satisfies all data
    for (j, lhs) in enumerate(itemsets):
        X[j + 1] = set([i for (i, xi) in enumerate(dataset) if set(lhs).issubset(xi)])
    # now form lhs_len
    lhs_len = [0]
    for lhs in itemsets:
        lhs_len.append(len(lhs))
    nruleslen = Counter(lhs_len)
    lhs_len = np.array(lhs_len)
    itemsets_all = ['null']
    itemsets_all.extend(itemsets)
    return X, Y, nruleslen, lhs_len, itemsets_all

def fi(data):
    print("Using apriori for fim : ")
    freq_list = fim.apriori(tracts=data, supp=5)
    print("The frequent item list is : ")
    print(freq_list)
    rules = fim.apriori(tracts=data, target='r', eval='c', report='c')
    print("The rules are : ")
    print(rules)
    rules = fim.apriori(tracts=data, target='r', eval='l', report='l')
    print("The rules are (evaluated with lift): ")
    print(rules)
    print("lfi using apriori : ")
    lfi(freq_list)
    print("Using fp-growth for fim : ")
    freq_list = fim.fpgrowth(tracts=data, supp=5)
    print("The frequent item list is : ")
    print(freq_list)
    rules = fim.fpgrowth(tracts=data, target='r', eval='c', report='c', conf=60)
    print("The rules are (evaluated with confidence): ")
    print(rules)
    rules = fim.fpgrowth(tracts=data, target='r', eval='l', report='l', conf=60)
    print("The rules are (evaluated with lift): ")
    print(rules)
    print("lfi using fpgrowth is : ")
    lfi(freq_list)

def gen_rule(df_combine, Y, Supp, Maxlen, N):
    # generate rules using the FP-growth algorithm
    df_combine = 1 - df_combine
    itemMatrix = [[item for item in df_combine.columns if row[item] == 1]
                  for i, row in df_combine.iterrows()]
    pindex = np.where(Y == 1)[0]
    nindex = np.where(Y != 1)[0]
    prules = fpgrowth([itemMatrix[i] for i in pindex], supp=Supp, zmin=1, zmax=Maxlen)
    prules = [np.sort(x[0]).tolist() for x in prules]
    nrules = fpgrowth([itemMatrix[i] for i in nindex], supp=Supp, zmin=1, zmax=Maxlen)
    nrules = [np.sort(x[0]).tolist() for x in nrules]
    prules, pRMatrix, psupp, pprecision, perror = screen_rules(prules, df_combine, Y, N, Supp)
    nrules, nRMatrix, nsupp, nprecision, nerror = screen_rules(nrules, df_combine, 1 - np.array(Y), N, Supp)
    premined_rules = prules
    premined_rules.extend(nrules)
    return premined_rules

def get_freqitemsets(fname, minsupport, maxlhs, verbose=True):
    # minsupport is an integer percentage (e.g. 10 for 10%)
    # maxlhs is the maximum size of the lhs
    # first load the data
    data, Y = load_data(fname)
    # Now find frequent itemsets
    # Mine separately for each class
    data_pos = [x for i, x in enumerate(data) if Y[i, 0] == 0]
    data_neg = [x for i, x in enumerate(data) if Y[i, 0] == 1]
    assert len(data_pos) + len(data_neg) == len(data)
    try:
        itemsets = [r[0] for r in fpgrowth(data_pos, supp=minsupport, zmax=maxlhs)]
        itemsets.extend([r[0] for r in fpgrowth(data_neg, supp=minsupport, zmax=maxlhs)])
    except TypeError:
        itemsets = [r[0] for r in fpgrowth(data_pos, supp=minsupport, max=maxlhs)]
        itemsets.extend([r[0] for r in fpgrowth(data_neg, supp=minsupport, max=maxlhs)])
    itemsets = list(set(itemsets))
    if verbose:
        print(len(itemsets), 'rules mined')
    # Now form the data-vs.-lhs set
    # X[j] is the set of data points that contain itemset j (that is, satisfy rule j)
    X = [set() for j in range(len(itemsets) + 1)]
    X[0] = set(range(len(data)))  # the default rule satisfies all data
    for (j, lhs) in enumerate(itemsets):
        X[j + 1] = set([i for (i, xi) in enumerate(data) if set(lhs).issubset(xi)])
    # now form lhs_len
    lhs_len = [0]
    for lhs in itemsets:
        lhs_len.append(len(lhs))
    nruleslen = Counter(lhs_len)
    lhs_len = array(lhs_len)
    itemsets_all = ['null']
    itemsets_all.extend(itemsets)
    return X, Y, nruleslen, lhs_len, itemsets_all

def generate_rulespace(self, supp, maxlen, N, need_negcode=False, njobs=5,
                       method='fpgrowth', criteria='IG', add_rules=[]):
    if method == 'fpgrowth':
        if need_negcode:
            df = 1 - self.df
            df.columns = [name.strip() + 'neg' for name in self.df.columns]
            df = pd.concat([self.df, df], axis=1)
        else:
            df = 1 - self.df
        pindex = np.where(self.Y == 1)[0]
        nindex = np.where(self.Y != 1)[0]
        itemMatrix = [[item for item in df.columns if row[item] == 1]
                      for i, row in df.iterrows()]
        prules = fpgrowth([itemMatrix[i] for i in pindex], supp=supp, zmin=1, zmax=maxlen)
        prules = [np.sort(x[0]).tolist() for x in prules]
        nrules = fpgrowth([itemMatrix[i] for i in nindex], supp=supp, zmin=1, zmax=maxlen)
        nrules = [np.sort(x[0]).tolist() for x in nrules]
    else:
        print('Using random forest to generate rules ...')
        prules = []
        for length in range(2, maxlen + 1, 1):
            n_estimators = 250 * length  # min(5000,int(min(comb(df.shape[1], length, exact=True),10000/maxlen)))
            clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=length)
            clf.fit(self.df, self.Y)
            for n in range(n_estimators):
                prules.extend(extract_rules(clf.estimators_[n], self.df.columns))
        prules = [list(x) for x in set(tuple(np.sort(x)) for x in prules)]
        nrules = []
        for length in range(2, maxlen + 1, 1):
            n_estimators = 250 * length  # min(5000,int(min(comb(df.shape[1], length, exact=True),10000/maxlen)))
            clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=length)
            clf.fit(self.df, 1 - self.Y)
            for n in range(n_estimators):
                nrules.extend(extract_rules(clf.estimators_[n], self.df.columns))
        nrules = [list(x) for x in set(tuple(np.sort(x)) for x in nrules)]
        df = 1 - self.df
        df.columns = [name.strip() + 'neg' for name in self.df.columns]
        df = pd.concat([self.df, df], axis=1)
    self.prules, self.pRMatrix, self.psupp, self.pprecision, self.perror = self.screen_rules(prules, df, self.Y, N, supp)
    self.nrules, self.nRMatrix, self.nsupp, self.nprecision, self.nerror = self.screen_rules(nrules, df, 1 - self.Y, N, supp)

def get_freqitemsets(fname, minsupport, maxlhs):
    # minsupport is an integer percentage (e.g. 10 for 10%)
    # maxlhs is the maximum size of the lhs
    # first load the data
    data, Y = load_data(fname)
    # Now find frequent itemsets
    # Mine separately for each class
    if Y.shape[-1] == 2:  # currently only mine itemsets for binary classification
        data_pos = [x for i, x in enumerate(data) if Y[i, 0] == 0]
        data_neg = [x for i, x in enumerate(data) if Y[i, 0] == 1]
        assert len(data_pos) + len(data_neg) == len(data)
        try:
            itemsets = [r[0] for r in fpgrowth(data_pos, supp=minsupport, zmax=maxlhs)]
            itemsets.extend([r[0] for r in fpgrowth(data_neg, supp=minsupport, zmax=maxlhs)])
        except TypeError:
            print("TypeError in fpgrowth")
            itemsets = [r[0] for r in fpgrowth(data_pos, supp=minsupport, max=maxlhs)]
            itemsets.extend([r[0] for r in fpgrowth(data_neg, supp=minsupport, max=maxlhs)])
    else:
        data_classes = [[] for _ in range(Y.shape[-1])]
        for row, y in zip(data, Y):
            i = list(y).index(1)
            data_classes[i].append(row)
        assert sum([len(x) for x in data_classes]) == len(data)
        itemsets = [
            [r[0] for r in fpgrowth(data_class, supp=minsupport, zmax=maxlhs)]
            for data_class in data_classes
        ]
        # flatten
        itemsets = [x for class_itemset in itemsets for x in class_itemset]
    itemsets = list(set(itemsets))
    print(len(itemsets), 'rules mined')
    # Now form the data-vs.-lhs set
    # X[j] is the set of data points that contain itemset j (that is, satisfy rule j)
    X = [set() for j in range(len(itemsets) + 1)]
    X[0] = set(range(len(data)))  # the default rule satisfies all data
    for (j, lhs) in enumerate(itemsets):
        X[j + 1] = set([i for (i, xi) in enumerate(data) if set(lhs).issubset(xi)])
    # now form lhs_len
    lhs_len = [0]
    for lhs in itemsets:
        lhs_len.append(len(lhs))
    nruleslen = Counter(lhs_len)
    lhs_len = array(lhs_len)
    itemsets_all = ['null']
    itemsets_all.extend(itemsets)
    return X, Y, nruleslen, lhs_len, itemsets_all

def get_freqitemsets(fname, minsupport=10, maxlhs=2):
    # Load the data
    data, Y = load_data(fname)
    # Open output file
    fout = open(fname + ".out", "w")
    # Now find frequent itemsets, mining separately for each class
    data_pos = [x for i, x in enumerate(data) if Y[i, 0] == 0]
    data_neg = [x for i, x in enumerate(data) if Y[i, 0] == 1]
    assert len(data_pos) + len(data_neg) == len(data)
    print "About to calculate positive itemsets"
    itemsets = [r[0] for r in fpgrowth(data_pos, supp=minsupport, zmax=maxlhs)]
    print "About to calculate negative itemsets"
    itemsets.extend([r[0] for r in fpgrowth(data_neg, supp=minsupport, zmax=maxlhs)])
    itemsets = list(set(itemsets))
    print "Done"
    n_rules = len(itemsets)
    # Now for each rule we want to write out a line of output
    # containing the rule and a bit for each training sample
    # indicating if the sample satisfies the rule or not.
    for lhs in itemsets:
        print lhs
        fout.write('&'.join(lhs) + '\t')
        for (j, attrs) in enumerate(data):
            if set(lhs).issubset(attrs):
                fout.write('1 ')
            else:
                fout.write('0 ')
        fout.write('\n')
    fout.close()
    fout = open(fname + ".label", "w")
    for label in xrange(2):
        if label == 0:
            fout.write('negative' + '\t')
        else:
            fout.write('positive' + '\t')
        for i, x in enumerate(data):
            if Y[i, label] == 1:
                fout.write('1 ')
            else:
                fout.write('0 ')
        fout.write('\n')
    fout.close()

def generate_rules(self, supp, maxlen, N, method='randomforest'):
    self.maxlen = maxlen
    self.supp = supp
    df = 1 - self.df  # df has negative associations
    df.columns = [name.strip() + '_neg' for name in self.df.columns]
    df = pd.concat([self.df, df], axis=1)
    if method == 'fpgrowth' and maxlen <= 3:
        itemMatrix = [[item for item in df.columns if row[item] == 1]
                      for i, row in df.iterrows()]
        pindex = np.where(self.Y == 1)[0]
        nindex = np.where(self.Y != 1)[0]
        print 'Generating rules using fpgrowth'
        start_time = time.time()
        rules = fpgrowth([itemMatrix[i] for i in pindex], supp=supp, zmin=1, zmax=maxlen)
        rules = [tuple(np.sort(rule[0])) for rule in rules]
        rules = list(set(rules))
        # note: do not reset start_time here, otherwise the reported elapsed time is meaningless
        print '\tTook %0.3fs to generate %d rules' % (time.time() - start_time, len(rules))
    else:
        rules = []
        start_time = time.time()
        for length in xrange(1, maxlen + 1, 1):
            n_estimators = min(pow(df.shape[1], length), 4000)
            clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=length)
            clf.fit(self.df, self.Y)
            for n in xrange(n_estimators):
                rules.extend(extract_rules(clf.estimators_[n], df.columns))
        rules = [list(x) for x in set(tuple(x) for x in rules)]
        print '\tTook %0.3fs to generate %d rules' % (time.time() - start_time, len(rules))
    self.screen_rules(rules, df, N)  # select the top N rules using a secondary criterion, information gain
    self.getPatternSpace()

def find_freq_mining_pred_deps(infile, outfile, mode='fim.fpgrowth', support=10,
                               confidence=80, zmin=2, zmax=2, sample_to_print=5):
    with open(infile) as f:
        transactions = pickle.load(f)
    if mode == 'fim.fpgrowth':
        import fim
        patterns = fim.fpgrowth(transactions, zmin=zmin, zmax=zmax, supp=support, conf=confidence)
        print "## Sample of rules ({} total): ##".format(len(patterns))
        print patterns[0:sample_to_print]
    elif mode == 'fim.carpenter':
        import fim
        patterns = fim.carpenter(transactions, zmin=2, zmax=2)
        print "## Sample of rules ({} total): ##".format(len(patterns))
        print patterns[0:sample_to_print]
    with open(outfile, 'w') as f:
        pickle.dump(patterns, f, -1)

def get_nonsingle_itemsets(transactions, output_file):
    # Closed Non-Single Itemsets (ResponseBot 7)
    print("Building non-single itemsets with FP-Growth...")
    patterns = fpgrowth(transactions, target='c', supp=-1000, zmin=2)
    # output
    for (pattern, support) in sorted(patterns, key=lambda x: -x[1]):
        p = ','.join(pattern)
        output_file.write('{} {} \n'.format(p, str(support)))
    print 'Number of patterns:', len(patterns)

def freq_set_mining(self, community_list_1, community_list_2):
    tracts = []
    for cl in [community_list_1, community_list_2]:
        for c in cl:
            tracts.append(c)
    fip = fim.fpgrowth(tracts, "m", -2)
    # size of overlap weighted by the number of clusters in which this overlap occurs
    distance = np.sum([len(tup[0]) * tup[1] for tup in fip]) / np.array([community_list_1, community_list_2]).size
    return distance

def generate_rules(self, supp=2, maxlen=10, N=10000, method='fpgrowth'):
    '''
    fp-growth, apriori, or a tree-based method. Note that, for frequent
    itemset mining, the data needs to be discretized first.
    '''
    self.maxlen = maxlen
    self.supp = supp
    df = 1 - self.df  # df has negative associations
    df.columns = [name.strip() + '_neg' for name in self.df.columns]
    df = pd.concat([self.df, df], axis=1)
    # if method =='fpgrowth' and maxlen<=3:
    if method == 'fpgrowth' and self.maxlen <= 3:
        # if method =='fpgrowth':
        print('Generating rules using fpgrowth, of support', self.supp, "max len", self.maxlen)
        # itemMatrix = [[item for item in df.columns if row[item] ==1] for i,row in df.iterrows() ]
        cols = df.columns.values.astype(str)
        R, C = np.where(df.values == 1)
        itemMatrix = np.split(cols[C], np.unique(R, return_index=True)[1])[1:]
        itemMatrix = [item.tolist() for item in itemMatrix]
        pindex = np.where(self.Y == self.target_class_idx)[0]
        nindex = np.where(self.Y != self.target_class_idx)[0]
        # pindex = np.where(self.Y==1)[0]
        # nindex = np.where(self.Y!=1)[0]
        start_time = time.time()
        rules = fpgrowth([itemMatrix[i] for i in pindex], supp=supp, zmin=1, zmax=self.maxlen)
        rules = [tuple(np.sort(rule[0])) for rule in rules]
        rules = list(set(rules))
        print('\tTook %0.3fs to generate %d rules' % (time.time() - start_time, len(rules)))
    else:
        rules = []
        print('Generating rules using tree-based method ')
        start_time = time.time()
        for length in range(1, maxlen + 1, 1):
            n_estimators = min(pow(df.shape[1], length), 300)
            clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=length)
            clf.fit(self.df, self.Y)
            for n in range(n_estimators):
                rules.extend(extract_rules(clf.estimators_[n], df.columns,
                                           target_class_idx=self.target_class_idx))
        rules = [list(x) for x in set(tuple(x) for x in rules)]
        print('\tTook %0.3fs to generate %d rules' % (time.time() - start_time, len(rules)))
    self.screen_rules(rules, df, N)  # select the top N rules using a secondary criterion, information gain
    self.getPatternSpace()

def mine_antecedents(data, Y, minsupport, max_predicates_per_antecedent):
    # data is the training data
    # Y is the training labels: 1 for positive and 0 for negative
    # minsupport is an integer percentage (e.g. 10 for 10%)
    # max_predicates_per_antecedent is the maximum number of predicates in a rule
    # mine the rule set
    n = len(data)
    data_pos = [x for i, x in enumerate(data) if Y[i] == 1]
    data_neg = [x for i, x in enumerate(data) if Y[i] == 0]
    assert len(data_pos) + len(data_neg) == n
    antecedent_set = [r[0] for r in fpgrowth(data_pos, supp=minsupport,
                                             zmax=max_predicates_per_antecedent)]
    antecedent_set.extend([r[0] for r in fpgrowth(data_neg, supp=minsupport,
                                                  zmax=max_predicates_per_antecedent)])
    antecedent_set = list(set(antecedent_set))
    print len(antecedent_set), 'rules mined'
    # form the rule-versus-data set
    # X_pos[j] is the set of positive data points that satisfy rule j
    # X_neg[j] is the set of negative data points that satisfy rule j
    X_pos = [0 for j in range(len(antecedent_set) + 1)]
    X_neg = [0 for j in range(len(antecedent_set) + 1)]
    # X_pos[0] (X_neg[0]) is the set of all positive (negative) data points
    X_pos[0] = sum([1 << i for i, x in enumerate(data) if Y[i] == 1])
    X_neg[0] = sum([1 << i for i, x in enumerate(data) if Y[i] == 0])
    for (j, antecedent) in enumerate(antecedent_set):
        X_pos[j + 1] = sum([1 << i for (i, xi) in enumerate(data)
                            if Y[i] == 1 and set(antecedent).issubset(xi)])
        X_neg[j + 1] = sum([1 << i for (i, xi) in enumerate(data)
                            if Y[i] == 0 and set(antecedent).issubset(xi)])
    # form antecedent_len and nantecedents
    antecedent_len = [0]
    for antecedent in antecedent_set:
        antecedent_len.append(len(antecedent))
    nantecedents = Counter(antecedent_len)
    antecedent_len = np.array(antecedent_len)
    antecedent_set_all = ['null']
    antecedent_set_all.extend(antecedent_set)
    return X_pos, X_neg, nantecedents, antecedent_len, antecedent_set_all

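# The function above stores rule coverage as integer bit vectors (bit i set
# means data point i satisfies the rule). A minimal illustrative sketch of that
# encoding follows; the values and the helper name are invented for the example
# and are not part of any snippet in this collection.
def _bitvector_encoding_demo():
    satisfied_by = [0, 2]                        # say data points 0 and 2 satisfy a rule
    bitvec = sum(1 << i for i in satisfied_by)   # -> 0b101 == 5
    assert bitvec == 5
    # membership test: is data point 2 covered by the rule?
    assert (bitvec >> 2) & 1 == 1
    # coverage count: number of satisfying data points
    assert bin(bitvec).count('1') == 2
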
def write_frequent_itemsets(input_path, output_path, support=-10, min_set_size=1, max_set_size=3):
    # parse transactions from file
    transactions = parser.parse_csv_to_mat(input_path)
    # mine frequent itemsets
    frequent_itemsets = fpgrowth(transactions, supp=support, min=min_set_size, max=max_set_size)
    # write result to file
    with open(output_path, 'w+') as fd:
        pickle.dump(frequent_itemsets, fd)

def compute_item_distribution_in_trans(fin_str):
    item_distribution_dict = dict()
    print fin_str
    for fre_item in fpgrowth(bca.iter_trans_data(fin_str).values(), supp=0, zmax=1, report='[a'):
        print fre_item[0][0], fre_item[1][0]
        base = 5
        key = int(fre_item[1][0]) / base
        item_distribution_dict.setdefault(key, 0)
        item_distribution_dict[key] += 1
    for key in item_distribution_dict:
        print >> sys.stdout, '[%d,%d) itemnums is %d' % (key * base, (key + 1) * base, item_distribution_dict[key])

def __call__(self, x_ns, feat_names=None):
    import pandas as pd

    def which_are_1(v):
        return list(pd.Series(range(len(v)))[map(bool, v)])

    feat_names = np.array(feat_names)
    length = float(x_ns.shape[0])
    raw = fim.fpgrowth([which_are_1(x_n) for x_n in x_ns], supp=self.supp, zmax=self.zmax)
    return [binary_rule(list(r), 1, feat_names[list(r)]) for (r, s) in raw]

def construct_fp_test(filename, notes_fp):
    setfile = open(filename)
    reader = csv.reader(setfile, delimiter=";")
    results = []
    ids = []
    for row in reader:
        id_file = row[0]
        measure = get_measures(id_file)
        fp_song = fpgrowth(measure, report='S', zmin=2)
        result = compare_fp(notes_fp, fp_song)
        results.append(result)
        ids.append(id_file)
    return results, ids

def mining_rule_and_add_candidate(trans_dict, fout_str, min_supp=-5):
    candidate_dict = dict()
    # use the min_supp argument instead of a hard-coded support value
    for r in fpgrowth(trans_dict.values(), target="r", supp=min_supp, zmin=2, report="[ac"):
        # for r in fpgrowth(trans_dict.values(), supp=-5, zmin=1, report='[a'):
        # r: (item_body, (item_head1, item_head2), [values])
        print >> sys.stdout, r
        item_heads_set = set(r[1])
        for user in trans_dict:
            buy_items_set = set(trans_dict[user])
            if item_heads_set.issubset(buy_items_set):
                if r[0] not in buy_items_set:
                    candidate_dict.setdefault(user, set())
                    candidate_dict[user].add(r[0])
    fout = open(fout_str, "w")
    for user in candidate_dict:
        print >> fout, "%s,%s" % (user, "#".join(candidate_dict[user]))
    fout.close()

def item_stats():
    """Plot stats on frequent itemset occurrences."""
    transactions = parser.parse_csv_to_mat('/Users/ahkj/Dropbox/SAAS/data/csv/sample-big/customers.txt')
    frequent_itemsets = fpgrowth(transactions, supp=0.0005, max=3)
    frequencies_1 = []
    frequencies_2 = []
    frequencies_3 = []
    for frequent_itemset in frequent_itemsets:
        if len(frequent_itemset[0]) == 1:
            frequencies_1.append(frequent_itemset[1][0])
        elif len(frequent_itemset[0]) == 2:
            frequencies_2.append(frequent_itemset[1][0])
        elif len(frequent_itemset[0]) == 3:
            frequencies_3.append(frequent_itemset[1][0])
    frequencies_counts_1 = [0 for x in range(max(frequencies_1) + 1)]
    frequencies_counts_2 = [0 for x in range(max(frequencies_2) + 1)]
    frequencies_counts_3 = [0 for x in range(max(frequencies_3) + 1)]
    for frequencie in frequencies_1:
        frequencies_counts_1[frequencie] += 1
    for frequencie in frequencies_2:
        frequencies_counts_2[frequencie] += 1
    for frequencie in frequencies_3:
        frequencies_counts_3[frequencie] += 1
    cleaned_ys_1 = frequencies_counts_1[0:30]
    xs_1 = [x for x in range(len(cleaned_ys_1))]
    plt.scatter(xs_1, cleaned_ys_1)
    plot_item_stats(xs_1, cleaned_ys_1, '../tmp/plots/item_stats/signletons.png')
    cleaned_ys_2 = frequencies_counts_2[0:30]
    xs_2 = [x for x in range(len(cleaned_ys_2))]
    plot_item_stats(xs_2, cleaned_ys_2, '../tmp/plots/item_stats/pairs.png')
    cleaned_ys_3 = frequencies_counts_3[0:30]
    xs_3 = [x for x in range(len(cleaned_ys_3))]
    plot_item_stats(xs_3, cleaned_ys_3, '../tmp/plots/item_stats/triples.png')

# item_stats()

def queryGene(D1, thre):
    """
    Use fpgrowth to generate a finite queries pool
    :param D1: local database {'uniqueid': ['database', 'laboratory']}
    :param thre: threshold of queries' frequency
    :return: a closed frequent itemset of the local database
    """
    D1bags = []
    for k, v in D1.iteritems():
        D1bags.append(v)
    queries_old = fim.fpgrowth(D1bags, 'c', 100.0 * thre / len(D1bags))
    queries = {}
    for i in queries_old:
        queries[frozenset(i[0])] = i[1]
    print >> perr, len(queries), 'queries generated in total.'
    return queries

def __call__(self, data):

    def which_are_1(v):
        return list(pd.Series(range(len(v)))[map(bool, v)])

    length = float(len(data))
    import pdb
    raw = fim.fpgrowth([which_are_1(x_n) for x_n in data.x_ns], supp=self.supp, zmax=self.zmax)
    data_idx = hash(data)
    # for (r, s) in raw:
    #     try:
    #         print data.x_names[r]
    #     except:
    #         pdb.set_trace()
    if data.x_names is not None:
        return [rule_f((data_idx, i), r, s[0] / length, list(data.x_names[list(r)]))
                for (i, (r, s)) in enumerate(raw)]
    else:
        return [rule_f((data_idx, i), r, s[0] / length) for (i, (r, s)) in enumerate(raw)]

def collaborationDiscovery(papers):
    support = 9
    frequentCollaborators = []
    allAuthorsPerPaper = []
    for key, value in papers.items():
        if 'affiliations' in papers[key]:
            authorsPerPaper = set()
            for affiliation in papers[key]['affiliations']:
                authorsPerPaper.add(authors[affiliation['aid']])
            allAuthorsPerPaper.append(authorsPerPaper)
    patterns = fpgrowth(allAuthorsPerPaper, supp=-support)
    for pattern, support in sorted(patterns, key=lambda x: -x[1]):
        if len(pattern) > 1:
            frequentCollaborators.append((pattern, support))
    return frequentCollaborators

def generate_rules(self, supp, maxlen, N, need_negcode=False, njobs=5,
                   method='fpgrowth', criteria='IG', add_rules=[]):
    self.maxlen = maxlen
    self.supp = supp
    if method == 'fpgrowth':
        print('Using fpgrowth to generate rules with support {} and max length {}'.format(supp, maxlen))
        itemMatrix = [[item for item in self.df.columns if row[item] == 1]
                      for i, row in self.df.iterrows()]
        pindex = np.where(self.Y == 1)[0]
        nindex = np.where(self.Y != 1)[0]
        start_time = time.time()
        rules = fpgrowth([itemMatrix[i] for i in pindex], supp=supp, zmin=1, zmax=maxlen)
        rules = [np.sort(x[0]).tolist() for x in rules]
        df = self.df
    else:
        print('Using random forest to generate rules ...')
        rules = []
        start_time = time.time()
        for length in range(2, maxlen + 1, 1):
            n_estimators = 500 * length  # min(5000,int(min(comb(df.shape[1], length, exact=True),10000/maxlen)))
            clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=length)
            clf.fit(self.df.iloc[:, list(range(int(self.df.shape[1] / 2)))], self.Y)
            for n in range(n_estimators):
                rules.extend(extract_rules(clf.estimators_[n], self.df.columns[:int(self.df.shape[1] / 2)]))
        rules = [list(x) for x in set(tuple(np.sort(x)) for x in rules)]
        df = 1 - self.df
        df.columns = [name.strip() + 'neg' for name in self.df.columns]
        df = pd.concat([self.df, df], axis=1)
    self.generate_time = time.time() - start_time
    print('\tTook %0.3fs to generate %d rules' % (self.generate_time, len(rules)))
    count = 0
    index = []
    for rule in add_rules:
        if np.sort(rule).tolist() not in rules:
            rules.append(rule)
            index.append(len(rules) - 1)
        else:
            index.append(rules.index(rule))
    self.rulespace = [len(rules)]
    self.all_rulelen = np.array([len(rule) for rule in rules])
    self.screen_rules(rules, df, N, supp, criteria, njobs, index)  # select the top N rules using a secondary criterion, information gain

def compute_frequent_transactions(lsynchs, sup, lsensors):
    """
    Applies FP-growth for finding the frequent transactions in the synchronizations
    :return:
    """
    ltrans = []
    for synch in lsynchs:
        trans = []
        for sn, _, cl in synch:
            trans.append('%s-C%s' % (lsensors[sn], str(cl)))
        ltrans.append(trans)

    lfreq = []
    cnt_len = np.zeros(len(lsensors))
    for itemset, sval in fpgrowth(ltrans, supp=-sup, zmin=2, target='m'):
        lfreq.append((itemset, sval))
        cnt_len[len(itemset) - 2] += 1
    return lfreq, cnt_len

def generate_rules(self, X_trans, y):
    itemNames = dict()
    for i, item in enumerate(X_trans.columns):
        itemNames[i + 1] = item
    self.itemNames = itemNames
    if self.method == 'fpgrowth':
        from fim import fpgrowth, fim
        items = np.arange(1, len(X_trans.columns) + 1)
        itemMatrix = (X_trans * items).to_numpy()
        itemMatrix_numerical = np.array([row[row > 0] for row in itemMatrix])
        rules = fpgrowth(itemMatrix_numerical[np.where(y == 1)].tolist(),
                         supp=self.support, zmin=1, zmax=self.maxlen)
        self.rules = [sorted(rule[0]) for rule in rules]
    else:
        items = np.arange(1, len(X_trans.columns) + 1)
        rules = []
        for length in range(1, self.maxlen + 1):
            # if the data set is too small, the fit will complain that
            # n_estimators cannot be larger than the number of possible trees
            n_estimators = self.forest_size * length
            clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=length)
            clf.fit(X_trans, y)
            for n in range(n_estimators):
                rules.extend(extract_rules(clf.estimators_[n], items))
        # To-do: not sure which deduplication is faster; needs to be tested on a large dataset
        rules = [list(x) for x in set(tuple(np.sort(y)) for y in rules)]
        # rules = [list(x) for x in remove_duplicates(tuple(np.sort(y)) for y in rules)]
        self.rules = rules
    # this needs to be modified, because it requires the user to add a
    # numerical version of the rules
    for add_rule in self.add_rules:
        if np.sort(add_rule).tolist() not in self.rules:
            self.rules.append(add_rule)

def getfrequentitems(data, cl, minsupp, maxlhs):
    # returns all patterns/frequent itemsets given a certain minsupp and max length (maxlhs)
    itemsets = []
    for c in cl:
        #start_time = time.time()
        data_aux = [t.difference(c) for t in data if c <= t]
        #print("Time for set difference: " + str(time.time() - start_time))
        start_time = time.time()
        itemsets.extend([r[0] for r in fpgrowth(data_aux, supp=minsupp, zmin=2, zmax=maxlhs)])
        #print("Time for fpgrowth: " + str(time.time() - start_time))
    # remove repeated sets
    #start_time = time.time()
    itemsets = list(set(itemsets))
    #print("Time for list set transform: " + str(time.time() - start_time))
    #start_time = time.time()
    itemsets.sort(key=len)
    #print("Time for sorting list: " + str(time.time() - start_time))
    return itemsets

def get_freq_itemsets(data, y, min_support=50, max_lhs=2):
    """
    Xtrain,Ytrain,nruleslen,lhs_len,itemsets = get_freqitemsets(fname+'_train',minsupport,maxlhs)
    # Do frequent itemset mining from the training data
    """
    if y.shape[-1] == 2:
        # currently only mine itemsets for binary classification
        data_pos = [x for i, x in enumerate(data) if y[i, 0] == 0]
        data_neg = [x for i, x in enumerate(data) if y[i, 0] == 1]
        print(len(data_pos))
        print(len(data_neg))
        print(len(data))
        assert len(data_pos) + len(data_neg) == len(data)
        itemsets = [r[0] for r in fpgrowth(data_pos, supp=min_support, zmax=max_lhs)]
        itemsets.extend([r[0] for r in fpgrowth(data_neg, supp=min_support, zmax=max_lhs)])
    else:
        raise NotImplementedError
    itemsets = list(set(itemsets))
    print("{} rules mined".format(len(itemsets)))
    # build S (antecedent vs. datapoint matrix)
    # S[i] is the i-th antecedent
    # S[0] is for the default rule (which satisfies all data)
    print("Building S...")
    """
    S = [set() for _ in range(len(itemsets) + 1)]
    S[0] = set(range(len(data)))
    for j, lhs in enumerate(itemsets):
        s_lhs = set(lhs)
        S[j+1] = set([i for i, xi in enumerate(data) if s_lhs.issubset(xi)])
    """
    n_antes = len(itemsets)
    S = np.zeros((n_antes + 1, len(data)))
    S[0] = 1.
    for j, lhs in enumerate(itemsets):
        s_lhs = set(lhs)
        for i, xi in enumerate(data):
            S[j + 1, i] = s_lhs.issubset(xi)
    S = S.transpose()
    print("S built.")
    # get the cardinality of each antecedent
    # default rule has cardinality 0
    lhs_len = [0]
    lhs_len.extend([len(lhs) for lhs in itemsets])
    lhs_len = np.array(lhs_len)
    itemsets = ['null'] + itemsets
    return S, lhs_len, itemsets

# for row in range(filtered.shape[0]):
#     to_append = list(filtered.indices[filtered.indptr[row]:filtered.indptr[row + 1]]
#                      [np.argsort(filtered.data[filtered.indptr[row]:filtered.indptr[row + 1]])])
#     sequences_spm.append(to_append)
# save_obj(name="sequences_cat1_" + str(i), obj=sequences_spm, path=ROOT_DIR + '/data/cat1/')

costante_di_popolarita = 15
pred_lil = sps.lil_matrix((10000, 2262292))
for i in tqdm(range(1000, 2000)):
    sequences = load_obj(path=ROOT_DIR + '/data/cat1/', name='sequences_cat1_' + str(i))
    popularity = len(sequences)
    preds_line = np.zeros(2262292)
    for seq in fpgrowth(sequences, supp=-popularity / costante_di_popolarita, target='m'):
        for song in seq[0]:
            preds_line[song] += seq[1] * (len(seq[0]) - 1) * (len(seq[0]) - 1)
    vals = fast_argpart(preds_line)
    pred_lil[i, vals] = preds_line[vals]

eurm = sps.csr_matrix(pred_lil)
eurm = eurm_remove_seed(eurm, dr)
rec_list = eurm_to_recommendation_list(eurm)
ev.evaluate(rec_list, "cat2_spm_max", verbose=True, do_plot=True, show_plot=True, save=True)
exit()

# parallel association rule

def association_rule(i):
    sequences = load_obj(path=ROOT_DIR + '/data/cat1/', name='sequences_cat1_' + str(i))
    popularity_iniziale = len(sequences)
    preds_line = np.zeros(2262292)
    # keep popularity defined even when no filtering pass runs below
    popularity = popularity_iniziale
    if popularity_iniziale > 2000:
        mean_len = 0
        for seq in sequences:
            mean_len += len(seq)
        mean_len = mean_len / len(sequences)
        count = 0
        for j in range(len(sequences)):
            if len(sequences[j]) > (mean_len * 2) or len(sequences[j]) < (mean_len / 2):
                sequences[j] = []
                count += 1
        popularity = popularity_iniziale - count
        print(i, "iniziale", popularity_iniziale, "new_pop", popularity, "rimosse", count,
              " mean_l", mean_len, "num_seq", len(sequences))
        if popularity > 2000:
            mean_len = 0
            for seq in sequences:
                mean_len += len(seq)
            mean_len = mean_len / len(sequences)
            count = 0
            for j in range(len(sequences)):
                if len(sequences[j]) > (mean_len * 2) or len(sequences[j]) < (mean_len / 2):
                    sequences[j] = []
                    count += 1
            popularity -= count
            print(i, popularity_iniziale, "new_pop", popularity, "rimosse", count,
                  " mean_l", mean_len, "num_seq", len(sequences))
        if popularity > 2000:
            mean_len = 0
            for seq in sequences:
                mean_len += len(seq)
            mean_len = mean_len / len(sequences)
            count = 0
            for j in range(len(sequences)):
                if len(sequences[j]) > (mean_len * 2) or len(sequences[j]) < (mean_len / 2):
                    sequences[j] = []
                    count += 1
            popularity -= count
            print(i, popularity_iniziale, "new_pop", popularity, "rimosse", count,
                  " mean_l", mean_len, "num_seq", len(sequences))
    # keep only the sequences that survived the filtering passes
    sequences = [seq for seq in sequences if len(seq) > 0]
    const = costante_di_pop
    sequences = fpgrowth(sequences, supp=-popularity / const, target=target)
    for seq in sequences:
        for song in seq[0]:
            preds_line[song] += seq[1] * (len(seq[0]) - 1) * (len(seq[0]) - 1)
    indices = fast_argpart(preds_line)
    preds_line_lil = sps.lil_matrix((1, 2262292))
    vals = fast_argpart(preds_line)
    preds_line_lil[0, vals] = preds_line[vals]
    del sequences, indices, preds_line, vals
    gc.collect()
    print("nnz", preds_line_lil.nnz)
    return preds_line_lil

def cross_validation_compact(transactions, sample_pct=0.50, support=-3, all_frequent_items=None):
    from fim import fpgrowth
    """
    Cross validation. Using compact representation from Forward.
    """
    # init
    _id = str(time()).replace('.', '')
    # if all_frequent_items is None:
    #     all_frequent_items = fpgrowth(transactions, supp=support, min=1, max=3)
    cv_start = time()
    print "\n### Running cross validation {}###".format(_id)
    print "Total transactions:{}".format(len(transactions))
    # print "Total frequent items:{}".format(len(all_frequent_items))

    # run results
    avg_errors = []
    var_errors = []
    # all_triangles, all_triples = filter_items(all_frequent_items)
    for chunk, index, rest in chunks(transactions, int(len(transactions) * sample_pct)):  # TODO insert proper sampling
        all_frequent_items = fpgrowth(rest, supp=support, min=1, max=3)
        all_triangles, all_triples = Forward.forward_compact(all_frequent_items)

        # Get triples for estimates
        frequent_items = fpgrowth(chunk, supp=support, min=1, max=3)
        if len(frequent_items) > 0:
            print 'frequent items: {}'.format(len(frequent_items))
        else:
            print 'No frequent items in chunk: {}'.format(index)
            continue
        triangle_tree, triples = Forward.forward_compact(frequent_items)
        print 'triangle roots: {}'.format(len(triangle_tree))

        estimates = []
        observations = []
        abs_errors = []
        max_est = 0
        max_obs = 0
        # DFS of the tree holding all triangles
        for n1 in triangle_tree.keys():
            s1, s2_dict = triangle_tree[n1]
            for n2 in s2_dict.keys():
                s2, s12, s3_dict = s2_dict[n2]
                for n3 in s3_dict.keys():
                    s3, s13, s23, s123 = s3_dict[n3]
                    est = ent.maxent_est_rosa(s1, s2, s3, s12, s23, s13,
                                              float(len(transactions) - len(chunk)),
                                              num=int(math.log(len(transactions), 2)) + 1)
                    # maximum estimate seen (for plotting)
                    max_est = max(max_est, est)
                    # record the estimate
                    estimates.append(est)
                    # from all observed triples get the actual observed number of triples
                    observed = 0
                    if all_triples.has_key((n1, n2, n3)):
                        observed = all_triples[(n1, n2, n3)]
                    # maximum observation of the triple (for plotting)
                    max_obs = max(max_obs, observed)
                    # record the observed
                    observations.append(observed)
                    # record abs error
                    error = abs(observed - est) / float(observed) * 100
                    abs_errors.append(error)
        if len(abs_errors) > 0:  # TODO handle this, probably when nothing has been found
            # evaluation
            min_error = min(abs_errors)
            max_error = max(abs_errors)
            avg_error = sum(abs_errors) / float(len(abs_errors))
            avg_errors.append(avg_error)
            var_error = 0
            if len(abs_errors) > 1:
                var_error = tvar(abs_errors)  # tvar is the sample variance
            var_errors.append(var_error)
            res_string = "\nResult:\nSample size:{} min_error:{} max_error:{} avg_error:{} var_error:{}".format(
                len(chunk), min_error, max_error, avg_errors[-1], var_error)
            print res_string
        else:
            print 'No abs errors!'
    print "Cross validation done!"
    print "time: ", (time() - cv_start)
    total_avg_error = sum(avg_errors) / float(len(avg_errors))
    total_res_string = "Avg error:{}".format(total_avg_error)

# In[5]:

# http://www.borgelt.net/pyfim.html
from fim import apriori, fpgrowth

patterns = apriori(transactions, supp=-3)  # +: percentage  -: absolute number

# output
print '-------- Apriori --------'
for (pattern, support) in sorted(patterns, key=lambda x: -x[1]):
    print pattern, support
print 'Number of patterns:', len(patterns)

# In[6]:

patterns = fpgrowth(transactions, supp=-3)

# output
print '-------- FP-Growth --------'
for (pattern, support) in sorted(patterns, key=lambda x: -x[1]):
    print pattern, support
print 'Number of patterns:', len(patterns)

# In[7]:

patterns = fpgrowth(transactions, target='c', supp=-2, zmin=2)

# output
print '-------- Closed Non-single Itemsets --------'
for (pattern, support) in sorted(patterns, key=lambda x: -x[1]):
    print pattern, support
print 'Number of patterns:', len(patterns)

def cross_validation(transactions, sample_pct=0.50, support=-3, all_frequent_items=None):
    from fim import fpgrowth
    """
    Cross validation, 'old' version not using the compact triangle representation from Forward.
    """
    # init
    _id = str(time()).replace('.', '')
    # if all_frequent_items is None:
    #     all_frequent_items = fpgrowth(transactions, supp=support, min=1, max=3)
    cv_start = time()
    print "\n### Running cross validation {}###".format(_id)
    print "Total transactions:{}".format(len(transactions))
    # print "Total frequent items:{}".format(len(all_frequent_items))

    # run results
    avg_errors = []
    var_errors = []
    # all_triangles, all_triples = filter_items(all_frequent_items)
    for chunk, index, rest in chunks(transactions, int(len(transactions) * sample_pct)):  # TODO insert proper sampling
        all_frequent_items = fpgrowth(rest, supp=support, min=1, max=3)
        all_triangles, all_triples = Forward.forward(all_frequent_items)

        # Get triples for estimates
        frequent_items = fpgrowth(chunk, supp=support, min=1, max=3)
        if len(frequent_items) > 0:
            print 'frequent items: {}'.format(len(frequent_items))
        else:
            print 'No frequent items in chunk: {}'.format(index)
            continue
        triangles, triples = Forward.forward(frequent_items)
        print 'triangles: {}'.format(len(triangles))

        estimates = []
        observations = []
        abs_errors = []
        max_est = 0
        max_obs = 0
        for (s1, s2, s3, s12, s23, s13, s123) in triangles:
            # if s123[1] != 0:
            #     continue
            # maxent estimate from the sample.
            # Index [1] of the tuples holds the occurrences in the sample
            est = ent.maxent_est_rosa(s1[1], s2[1], s3[1], s12[1], s23[1], s13[1],
                                      float(len(transactions) - len(chunk)),
                                      num=int(math.log(len(transactions), 2)) + 1)
            # maximum estimate seen (for plotting)
            max_est = max(max_est, est)
            # record the estimate
            estimates.append(est)
            # from all observed triples get the actual observed number of triples
            observed = 0
            if all_triples.has_key(s123[0]):
                observed = all_triples[s123[0]]
            # maximum observation of the triple (for plotting)
            max_obs = max(max_obs, observed)
            # record the observed
            observations.append(observed)
            # record abs error
            error = abs(observed - est) / float(observed) * 100
            abs_errors.append(error)
        if len(abs_errors) > 0:  # TODO handle this, probably when nothing has been found
            # evaluation
            min_error = min(abs_errors)
            max_error = max(abs_errors)
            avg_error = sum(abs_errors) / float(len(abs_errors))
            avg_errors.append(avg_error)
            var_error = 0
            if len(abs_errors) > 1:
                var_error = tvar(abs_errors)  # tvar is the sample variance
            var_errors.append(var_error)
            # TODO histogram of the average errors: max-ent, extrapolation, heuristic
            # TODO print the average of the average errors to the log.
            res_string = "\nResult:\nSample size:{} min_error:{} max_error:{} avg_error:{} var_error:{}".format(
                len(chunk), min_error, max_error, avg_errors[-1], var_error)
            print res_string
        else:
            print 'No abs errors!'
    print "Cross validation done!"
    print "time: ", (time() - cv_start)
    total_avg_error = sum(avg_errors) / float(len(avg_errors))
    total_res_string = "Avg error:{}".format(total_avg_error)
    return path

]

word_counts = Counter()
for w in wymagania:
    word_counts.update(w)
word_counts.most_common(100)

wymagania = [w for w in wymagania if w]

itemsets = fim.fpgrowth(wymagania, target='r', zmin=2, zmax=2,
                        supp=0.40, conf=20, eval='l', report='(acl')
'''
nodes = [{"id": id, "group": 1} for id, count in word_counts.items()]
links = [{"source": left, "target": right[0], "value": math.log(numbers[2] + 1)}
         for left, right, numbers in itemsets]
graph = {

# Read the csv file
data_frame = pd.read_csv("./data/newTrainSet.csv")

# Get all unique order ids
array = data_frame.order_id.unique()

# Build a list with the products of every order
data_list = []
for p in data_frame.order_id.unique():
    data_list.append((data_frame[data_frame['order_id'] == p].product_id).tolist())

# Run the FP-Growth algorithm
result = fpgrowth(data_list, supp=min_sup, conf=min_conf, target='r', report='XC')

# Write the results to files
i = 0
for p in result:
    filename = "res" + str(i) + ".out"
    i = i + 1
    f = open(filename, 'w')
    f.write(repr(p))
    f.close()

concat_files(0, i)
id_to_name_pyfim()

def get_association_rules(transactions):
    result = fpgrowth(transactions, target='r', conf=80, eval='c', report='hbalc')
    result = sorted(result, key=lambda x: (-x[-1], -x[-2]))[:10]
    return [(x[1], x[0]) for x in result]

def fit(self, X, y, feature_labels=[], undiscretized_features=[], verbose=False):
    """Fit rule lists to data

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training data

    y : array_like, shape = [n_samples]
        Labels

    feature_labels : array_like, shape = [n_features], optional (default: [])
        String labels for each feature. If empty and X is a DataFrame, column labels are used.
        If empty and X is not a DataFrame, then features are simply enumerated

    undiscretized_features : array_like, shape = [n_features], optional (default: [])
        String labels for each feature which is NOT to be discretized.
        If empty, all numeric features are discretized

    verbose : bool
        Currently doesn't do anything

    Returns
    -------
    self : returns an instance of self.
    """
    self.seed()

    if len(set(y)) != 2:
        raise Exception("Only binary classification is supported at this time!")

    # deal with pandas data
    if type(X) in [pd.DataFrame, pd.Series]:
        X = X.values
    if type(y) in [pd.DataFrame, pd.Series]:
        y = y.values

    X, y = self._setdata(X, y, feature_labels, undiscretized_features)
    permsdic = defaultdict(default_permsdic)  # We will store here the MCMC results
    data = list(X[:])

    # Now find frequent itemsets
    # Mine separately for each class
    data_pos = [x for i, x in enumerate(data) if y[i] == 0]
    data_neg = [x for i, x in enumerate(data) if y[i] == 1]
    assert len(data_pos) + len(data_neg) == len(data)
    try:
        itemsets = [r[0] for r in fpgrowth(data_pos, supp=self.minsupport,
                                           zmin=self._zmin, zmax=self.maxcardinality)]
        itemsets.extend([r[0] for r in fpgrowth(data_neg, supp=self.minsupport,
                                                zmin=self._zmin, zmax=self.maxcardinality)])
    except TypeError:
        itemsets = [r[0] for r in fpgrowth(data_pos, supp=self.minsupport,
                                           min=self._zmin, max=self.maxcardinality)]
        itemsets.extend([r[0] for r in fpgrowth(data_neg, supp=self.minsupport,
                                                min=self._zmin, max=self.maxcardinality)])
    itemsets = list(set(itemsets))
    if self.verbose:
        print(len(itemsets), 'rules mined')

    # Now form the data-vs.-lhs set
    # X[j] is the set of data points that contain itemset j (that is, satisfy rule j)
    X = [set() for j in range(len(itemsets) + 1)]
    X[0] = set(range(len(data)))  # the default rule satisfies all data
    for (j, lhs) in enumerate(itemsets):
        X[j + 1] = set([i for (i, xi) in enumerate(data) if set(lhs).issubset(xi)])

    # now form lhs_len
    lhs_len = [0]
    for lhs in itemsets:
        lhs_len.append(len(lhs))
    nruleslen = Counter(lhs_len)
    lhs_len = array(lhs_len)
    itemsets_all = ['null']
    itemsets_all.extend(itemsets)

    Xtrain, Ytrain, nruleslen, lhs_len, self.itemsets = (
        X, np.vstack((1 - np.array(y), y)).T.astype(int), nruleslen, lhs_len, itemsets_all)

    # Do MCMC
    res, Rhat = run_bdl_multichain_serial(self.max_iter, self.thinning, self.alpha,
                                          self.listlengthprior, self.listwidthprior,
                                          Xtrain, Ytrain, nruleslen, lhs_len,
                                          self.maxcardinality, permsdic, self.burnin,
                                          self.n_chains, [None] * self.n_chains,
                                          verbose=self.verbose, seed=self.random_state)

    # Merge the chains
    permsdic = merge_chains(res)

    # The point estimate, BRL-point
    self.d_star = get_point_estimate(permsdic, lhs_len, Xtrain, Ytrain, self.alpha,
                                     nruleslen, self.maxcardinality,
                                     self.listlengthprior, self.listwidthprior,
                                     verbose=self.verbose)  # get the point estimate

    if self.d_star:
        # Compute the rule consequent
        self.theta, self.ci_theta = get_rule_rhs(Xtrain, Ytrain, self.d_star, self.alpha, True)

    return self

def minging_rule(trans_dict, fout_str, min_supp=-5):
    fout = open(fout_str, "w")
    # use the min_supp argument instead of a hard-coded support value
    for r in fpgrowth(trans_dict.values(), target="r", supp=min_supp, conf=50, zmin=2, report="[ac"):
        print >> fout, "%s,%s,%d#%.2f" % (r[0], "#".join(r[1]), r[2][0], r[2][1])
    fout.close()

def fpgrowth(tracts, target='s', min_c=2, min_z=2, max=None, report='a', algo='s'):
    '''
    Find frequent item sets with the fpgrowth algorithm.

    INPUT:
    tracts [list of lists]
        transaction database to mine. The database must be an iterable of
        transactions; each transaction must be an iterable of items; each
        item must be a hashable object. If the database is a dictionary, the
        transactions are the keys, the values their (integer) multiplicities.
    target [str. Default: 's']
        type of frequent item sets to find
        s/a : sets/all  all frequent item sets
        c   : closed    closed frequent item sets
        m   : maximal   maximal frequent item sets
        g   : gens      generators
    min_c [int. Default: 2]
        minimum support of an item set
        (positive: absolute number, negative: percentage)
    min_z [int. Default: 2]
        minimum number of items per item set
    max [int. Default: no limit]
        maximum number of items per item set
    report [str. Default: 'a']
        values to report with an item set
        a    absolute item set support (number of transactions)
        s    relative item set support as a fraction
        S    relative item set support as a percentage
        e    value of item set evaluation measure
        E    value of item set evaluation measure as a percentage
        #    pattern spectrum instead of full pattern set
    algo [str. Default: 's']
        algorithm variant to use:
        s    simple     simple tree nodes with only link and parent
        c    complex    complex tree nodes with children and siblings
        d    single     top-down processing on a single prefix tree
        t    topdown    top-down processing of the prefix trees
        Variant d does not support closed/maximal item set mining.

    OUTPUT:
    * If *report* == 'a'/'s'/'S'/'e'/'E', return a list of pairs, each
      consisting of a frequent itemset (as a tuple of unit IDs) and a value
      representing that itemset's support or evaluation measure.
    * If *report* == '#', return a pattern spectrum as a list of triplets
      (size, supp, cnt), representing pattern size, pattern support, and the
      number of patterns with that size and that support found in *tracts*.
    '''
    import fim
    import numpy

    # By default, set the maximum pattern size to the number of spike trains
    if max is None:
        max = numpy.max([len(t) for t in tracts]) + 1

    # Run the original fpgrowth
    fpgrowth_output = fim.fpgrowth(tracts=tracts, target=target, supp=-min_c,
                                   min=min_z, max=max, report=report, algo='s')

    # Return the output
    if report != '#':
        return [(cfis, s[0]) for (cfis, s) in fpgrowth_output]
    else:
        return fpgrowth_output

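# A minimal usage sketch for the wrapper above. The toy transactions are
# invented for illustration; report='#' asks for a pattern spectrum, which the
# wrapper passes through from fim.fpgrowth unchanged, so this does not depend
# on how the installed pyfim version packages per-itemset report values.
if __name__ == '__main__':
    toy_tracts = [[1, 2, 3], [1, 2, 4], [2, 3, 4], [1, 2, 3, 4]]
    # (size, support, count) triplets for item sets with at least 2 items
    # occurring in at least 2 transactions
    for size_supp_cnt in fpgrowth(toy_tracts, target='s', min_c=2, min_z=2, report='#'):
        print(size_supp_cnt)
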
tid = int(argv[1])
if tid < -2:
    print(fpgrowth.__doc__)
elif tid < -1:
    print(eclat.__doc__)
elif tid < 0:
    print(apriori.__doc__)
else:
    tracts = [[1, 2, 3], [1, 4, 5], [2, 3, 4], [1, 2, 3, 4], [2, 3],
              [1, 2, 4], [4, 5], [1, 2, 3, 4], [3, 4, 5], [1, 2, 3]]
    print('transactions:')
    for t in tracts:
        print(t)
    if tid < 1:
        print('apriori(tracts, supp=-3, zmin=2):')
        for r in apriori(tracts, supp=-3, zmin=2):
            print r
    elif tid < 2:
        print('eclat(tracts, supp=-3, zmin=2):')
        for r in eclat(tracts, supp=-3, zmin=2):
            print r
    elif tid < 3:
        print('fpgrowth(tracts, supp=-3, zmin=2):')
        for r in fpgrowth(tracts, supp=-3, zmin=2):
            print r
    else:
        print('fim(tracts, supp=-3, zmin=2, report=\'#\'):')
        for r in fim(tracts, supp=-3, zmin=2, report='#'):
            print r