# Assumes `import pyfpgrowth as fp` and that tr_data / untr_data
# (transaction lists) are defined at module level.
def build_rules(min_supp=2):
    print('minimum support = ', min_supp)
    # Get frequent itemsets using FP-growth.
    tr = fp.find_frequent_patterns(tr_data, min_supp)
    utr = fp.find_frequent_patterns(untr_data, min_supp)
    # Convert the frequent itemsets (dict keys are tuples) to sets.
    tr_rules = [set(x) for x in tr]
    utr_rules = [set(x) for x in utr]
    return tr_rules, utr_rules
def get_seq_patterns(self, simple_log, length):
    # A support threshold of 0 keeps every pattern; filter to the requested length.
    patterns = pyfpgrowth.find_frequent_patterns(simple_log, 0)
    seq = []
    for item in patterns.keys():
        if len(item) == length:
            seq.append(item)
    return seq
def generate_rules(dataframe, information=[], minsupport=25,
                   minconfidence=0.55, satisfied_value=3):
    '''
    :param dataframe: must include reviewerID, productID, and rating columns
    :param information: transactions we already have
    :param minsupport: an itemset must appear at least minsupport times
    :param minconfidence: the minimum probability that a consumer will follow a rule
    :param satisfied_value: a user is considered to like a product only if the
                            rating is at least this value
    :return: rules: the association rules
    '''
    # Transform the dataframe into per-user transactions.
    raw_dict = {}
    for row in dataframe.itertuples():
        raw_dict.setdefault(row.reviewerID, {})
        if float(row.rating) >= satisfied_value:
            raw_dict[row.reviewerID].update({row.productID: row.rating})
    transaction = []
    for user in raw_dict:
        transaction.append(list(raw_dict[user].keys()))
    if information:
        transaction.extend(information)
    # Generate the rules.
    patterns = pyfpgrowth.find_frequent_patterns(transaction, minsupport)
    rules = pyfpgrowth.generate_association_rules(patterns, minconfidence)
    return rules
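# Hedged usage sketch for generate_rules above. The column names
# (reviewerID, productID, rating) come from the function; the toy data is
# invented, and pyfpgrowth is assumed to be imported at module level.
import pandas as pd

reviews = pd.DataFrame({
    'reviewerID': ['u1', 'u1', 'u2', 'u2', 'u3'],
    'productID':  ['p1', 'p2', 'p1', 'p2', 'p1'],
    'rating':     [5, 4, 5, 3, 4],
})
rules = generate_rules(reviews, minsupport=2, minconfidence=0.5)
print(rules)  # e.g. {('p2',): (('p1',), 1.0)}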
def upload():
    if request.method == "POST":
        if request.files:
            file = request.files['inputFile']
            support = request.form['support']
            confidence = request.form['confidence']
            file.save(os.path.join(app.config["FILE_UPLOADS"], file.filename))
            with open('./uploads/' + file.filename, newline='') as f:
                reader = csv.reader(f)
                data = list(reader)
            transactions = data
            # Support is given as a percentage of the transaction count.
            patterns = pyfpgrowth.find_frequent_patterns(
                transactions, len(transactions) * int(support) / 100)
            rules = pyfpgrowth.generate_association_rules(
                patterns, int(confidence) / 100)
            res = rules
            newFile = File(name=file.filename, sup=support,
                           con=confidence, result=res)
            db.session.add(newFile)
            db.session.commit()
            # The original referenced an undefined `result`; pass `res` instead.
            return render_template("data.html", result=res)
def run_rule_manually(self):
    for record in self:
        transactions = self.get_sale_data()
        if record.rule_type == 'apriori':
            results = self.format_rules(
                list(apriori(transactions,
                             min_support=record.min_supp,
                             min_confidence=record.min_conf)))
            self.update_rule(results, 'apriori')
        else:
            # pyfpgrowth expects an absolute count, so scale the
            # fractional min_supp by the number of transactions.
            totalRow = len(transactions)
            results = self.format_rules_fp(
                pyfpgrowth.generate_association_rules(
                    pyfpgrowth.find_frequent_patterns(
                        transactions, totalRow * record.min_supp),
                    record.min_conf))
            self.update_rule(results, 'fpgrowth')
    self.update_on_web()
    return {
        'type': 'ir.actions.act_window',
        'name': 'View Rules',
        'view_type': 'form',
        'view_mode': 'tree,form',
        'res_model': 'data.mining.show',
        'target': 'current',
    }
import time
import numpy as np
import pyfpgrowth

def fpgrowth1(item_dataset, min_support=0.1, repetitions=10):
    # Transform the binary matrix into lists of column indices (itemsets).
    itemsets = []
    for row in item_dataset:
        vals = []
        for i, val in enumerate(row):
            if val == 1:
                vals.append(i)
        itemsets.append(vals)
    # Time the FP-growth run over several repetitions.
    times = []
    for _ in range(repetitions):
        t0 = time.time()
        patterns = pyfpgrowth.find_frequent_patterns(
            itemsets, min_support * len(item_dataset))
        times.append(time.time() - t0)
    # Convert counts to relative support and pair each item with the value 1.
    fp_item_sets = []
    for items, count in patterns.items():
        sup = count / len(item_dataset)
        conds = [[x, 1] for x in items]
        fp_item_sets.append([sup, conds])
    exc_time = np.mean(times)
    return fp_item_sets, exc_time
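# Hedged usage sketch for fpgrowth1 above: a toy 0/1 matrix where each row is
# a transaction and each column an item index. Data and thresholds are invented.
toy = [
    [1, 1, 0],
    [1, 1, 1],
    [0, 1, 1],
    [1, 0, 1],
]
item_sets, seconds = fpgrowth1(toy, min_support=0.5, repetitions=3)
print(item_sets)  # e.g. [[0.5, [[0, 1], [1, 1]]], ...] -> [support, [[item, 1], ...]]
print(seconds)    # mean FP-growth runtime over the repetitions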
def do_patterns(years):
    results = {}
    counts = {}
    for year in years:
        results[year] = {}
        counts[year] = {}
        delays = pd.read_sql(
            'SELECT * from delays_binned where year = {}'.format(year),
            db_connection)
        # Get rid of fields we don't need.
        df = delays.drop(columns=['level_0', 'index', 'delay_id', 'report_date'])
        # Stringify values for pyfpgrowth.
        df['Route'] = df['Route'].apply(str)
        df['year'] = df['year'].apply(str)
        for i in range(0, len(pat_types)):
            # Drop the columns for this pattern type.
            df_dropped = df.drop(columns=col_drop_types[i])
            # Find patterns and store the results.
            fps = pyfpgrowth.find_frequent_patterns(df_dropped.values, MINSUP)
            # Only keep patterns longer than the category's base length.
            results[year][pat_types[i]] = {
                x: fps[x] for x in fps.keys()
                if len(x) > len(pat_types[i].split(','))
            }
            # Store counts as we go.
            counts[year][pat_types[i]] = len(results[year][pat_types[i]])
    return counts
def categorize_queries(self, **data):
    """
    Executes the Apriori or FP-growth algorithm and stores the resulting
    categories on the instance.

    Arguments:
        transactions -- A transaction iterable object
                        (e.g. [['A', 'B'], ['B', 'C']]).

    Keyword arguments:
        min_support -- The minimum support of relations (float).
        min_lift -- The minimum lift of relations (float; >1 is likely).
        min_probability -- Finds patterns that are associated with another
                           with a certain minimum probability.
        min_confidence -- The minimum confidence of relations (float).
    """
    min_support = data.get('min_support', 10)
    min_lift = data.get('min_lift', 1)
    min_probability = data.get('min_probability', 0.5)
    min_confidence = data.get('min_confidence', 0)
    print("Converting to transactions.")
    transactions, match_queries = self.create_transactions(self.df, self.col)
    if self.alg.lower() == "apriori":
        print("Running Apriori")
        # apriori expects support as a fraction of the transaction count.
        min_support = float(min_support / len(transactions))
        results = list(
            apriori(transactions,
                    min_support=min_support,
                    min_confidence=min_confidence,
                    min_lift=min_lift,
                    max_length=None))
        print("Making Categories")
        self.categories = [' '.join(list(l.items)) for l in results]
    elif self.alg.lower() == "fpgrowth":
        print("Running FP-Growth")
        # pyfpgrowth expects support as an absolute count.
        results = list(
            pg.generate_association_rules(
                pg.find_frequent_patterns(transactions, min_support),
                min_probability))
        print("Making Categories")
        self.categories = [' '.join(l) for l in results]
    else:
        raise Exception(
            "{} is not one of the available algorithms "
            "(`apriori`, `fpgrowth`)".format(self.alg))
    print('Total Categories: {}'.format(len(set(self.categories))))
    self.df['match_queries'] = match_queries
    self.df['category'] = self.df.match_queries.map(
        lambda x: self.match_labels(x, self.categories))
    self.counts = pd.DataFrame(self.df.category.value_counts())
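# Hedged usage sketch for categorize_queries above. The enclosing class is not
# shown in this snippet, so the constructor below is hypothetical; only the
# attributes it reads (df, col, alg) and the keyword arguments are from the source.
# categorizer = SomeCategorizer(...)  # hypothetical constructor
# categorizer.df, categorizer.col, categorizer.alg = queries_df, 'query', 'fpgrowth'
# categorizer.categorize_queries(min_support=10, min_probability=0.5)
# print(categorizer.counts.head())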
def assign_servers_test_output(df_train, df_test, percentile, confidence,
                               apps_server):
    df_train['hour'] = None
    df_train['hour'] = pd.DatetimeIndex(df_train['Date']).hour
    data_l = list(df_train['pairs'])
    pairs_count = (df_train.groupby('pairs2').agg({
        'Date': 'count',
        'norm_latency': 'mean',
        'Duration': 'sum',
        'Packets': 'sum'
    }).reset_index())
    pairs_count.columns = [
        'pairs', 'frequency', 'avg_norm_latency', 'total_duration',
        'total_packets'
    ]
    # Sum of the duration time divided by the sum of all packets
    # transferred, expressed as a percentage.
    pairs_count['norm_latency'] = (
        pairs_count['total_duration'] / pairs_count['total_packets'].sum()) * 100
    # Use the given frequency quantile as the support threshold.
    per_n = pairs_count['frequency'].quantile(percentile)
    patterns = pyfpgrowth.find_frequent_patterns(data_l, per_n)
    rules = pyfpgrowth.generate_association_rules(patterns, confidence)
    # Format the rules and bring back the other info on latency rank.
    formated_rules = format_rules(rules, df_train, apps_server)
    # Make server assignments by applying the training rules to the test data.
    (server_df, server_assignments, total_latency, total_latency_model,
     avg_latency, avg_latency_model) = server_association(
        formated_rules, df_test, apps_server)
    return (server_df, server_assignments, total_latency, total_latency_model,
            avg_latency, avg_latency_model)
def get_representative_label(items_df, min_sup):
    tmp = np.array(items_df)
    data = tmp.tolist()
    # Dict in the form {(1, 2): 4, (2,): 7}.
    patterns = pyfpgrowth.find_frequent_patterns(data, min_sup)
    # Keys in the form [(1, 2), (2,), (1, 3), (2, 3), (1,)].
    freq_items = patterns.keys()
    if freq_items:  # Frequent itemsets matching the threshold were found.
        label_list = list()
        max_len = sorted(patterns.values())[-1]  # Highest support count.
        for k, v in patterns.items():
            if v == max_len:  # Collect the labels with the highest support.
                label_list += k
        label_list = list(set(label_list))  # Deduplicate.
        # Remove empty strings.
        if '' in label_list:
            label_list.remove('')
        label_str = '","'.join(label_list)  # Join with commas.
        label_str = '"' + label_str + '"'   # Format as "tag1","tag2", ...
        return label_str  # Return the label string.
    else:
        return '""'  # Return an empty label string.
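# Hedged usage sketch for get_representative_label above; the tag data is invented.
tags = [
    ['rock', 'indie'],
    ['rock', 'indie'],
    ['rock', 'jazz'],
]
print(get_representative_label(tags, min_sup=2))  # e.g. "rock"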
def fpgrouth(id, dataset, principal):
    pasos = "Dataset loaded" + '\n'
    dataset = pickdataset(int(id), dataset)
    patterns = pyfpgrowth.find_frequent_patterns(dataset, 3)
    rules = pyfpgrowth.generate_association_rules(patterns, 0.6)
    pasos += "Matches: " + '\n'
    pasos += str(patterns) + '\n'
    # Average the confidence (the second element of each rule value) and
    # report it as a percentage. The original extracted it by string
    # splitting; indexing the tuple directly is equivalent.
    avgReal = 0
    for i in rules.values():
        avgReal += float(i[1])
    avgReal = str((avgReal / len(rules.values())) * 100) + '% confidence'
    reglas = str(rules)
    img = 'Not applicable'
    if principal:
        context = {
            'algoritmoPrincipal': 'FP-growth',
            'resultado': avgReal,
            'pasos': pasos,
            'reglas': reglas,
            'img': img
        }
    else:
        context = {
            'algoritmoComparar': 'FP-growth',
            'resultado2': avgReal,
            'pasos2': pasos,
            'reglas2': reglas,
            'img2': img
        }
    return context
def find_rules(data, support_threshold, confidence_threshold):
    patterns = find_frequent_patterns(transactions=data,
                                      support_threshold=support_threshold)
    rules = generate_association_rules(
        patterns=patterns, confidence_threshold=confidence_threshold)
    return rules
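# Hedged usage sketch for find_rules above, assuming
# `from pyfpgrowth import find_frequent_patterns, generate_association_rules`
# at module level; the basket data is invented.
baskets = [['bread', 'milk'],
           ['bread', 'butter'],
           ['bread', 'milk', 'butter'],
           ['milk', 'butter']]
print(find_rules(baskets, support_threshold=2, confidence_threshold=0.6))
# e.g. {('bread', 'butter'): (('milk',), ...), ...}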
def fp_growth(self, total_elements):
    if len(total_elements) == 0:
        print("total_elements is empty!")
        return
    # Use 1% of the element count as the minimum support count.
    min_sup = len(total_elements) / 100
    patterns = pyfpgrowth.find_frequent_patterns(total_elements, min_sup)
    self.fp_status = patterns
    print("All patterns:", patterns)
def fpGrowth(transactions):
    patterns = pyfpgrowth.find_frequent_patterns(transactions, 1)
    rules = pyfpgrowth.generate_association_rules(patterns, 0.2)
    print(patterns)
    print(rules)
    # `salida` (output) summarizes the frequencies followed by the rules.
    salida = ("\nFrequencies" + "\n" + str(patterns) + "\n" +
              "Rules" + "\n" + str(rules))
    return salida
def evandempsey_fpgrowth(minsup, item_no):
    start = datetime.now()
    transactions, y_res = merge_data(item_no)
    patterns = pyfpgrowth.find_frequent_patterns(transactions, minsup)
    itemsets = patterns.items()
    for itemset in itemsets:
        print(itemset)
    print(datetime.now() - start)
def fpgrowth(seq, **kwargs):
    import pyfpgrowth
    sup = int(kwargs.pop('sup'))
    conf = float(kwargs.pop('conf'))
    patterns = pyfpgrowth.find_frequent_patterns(seq, sup)
    rules = pyfpgrowth.generate_association_rules(patterns, conf)
    # Flatten each rule into [antecedent, consequent, confidence].
    ret = [[key, value[0], value[1]] for key, value in rules.items()]
    return ret
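# Hedged usage sketch for the fpgrowth wrapper above; thresholds and data are invented.
rows = [['a', 'b'], ['a', 'b', 'c'], ['b', 'c'], ['a', 'c']]
for antecedent, consequent, confidence in fpgrowth(rows, sup=2, conf=0.5):
    print(antecedent, '->', consequent, 'conf =', confidence)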
def compute_fqis_pyfpgrowth_dict(super_ilists, min_sup=0.6):
    print('computing fpgrowth')
    # Convert to int16 to save memory and avoid a MemoryError.
    super_ilists = [list(np.int16(ds)) for ds in super_ilists]
    patterns = pyfpgrowth.find_frequent_patterns(
        super_ilists, round(min_sup * len(super_ilists)))
    return patterns
def demo():
    transactions = [[1, 2, 5], [2, 4], [2, 3], [1, 2, 4], [1, 3], [2, 3],
                    [1, 3], [1, 2, 3, 5], [1, 2, 3]]
    # support = 2, minconf = 0.7
    patterns = pyfpgrowth.find_frequent_patterns(transactions, 2)
    rules = pyfpgrowth.generate_association_rules(patterns, 0.7)
    print(rules)
def pattern_mine(trans, support, confidence):
    """
    input: [[term1, term2, ...], [term1, term2, ...], ...]
    output: (patterns dict {itemset: count},
             rules dict {antecedent: (consequent, confidence)})
    """
    pattern = pyfpgrowth.find_frequent_patterns(trans, support)
    rule = pyfpgrowth.generate_association_rules(pattern, confidence)
    return pattern, rule
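# Hedged usage sketch for pattern_mine above; the term lists are invented.
docs = [['ml', 'python'], ['ml', 'python', 'nlp'], ['nlp', 'python']]
patterns, rules = pattern_mine(docs, support=2, confidence=0.6)
print(patterns)  # e.g. {('ml', 'python'): 2, ...}
print(rules)     # e.g. {('ml',): (('python',), 1.0), ...}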
def AssociationRule(Voicing, support, confidence):
    # These must be initialized before the loop; the original left them
    # commented out, which raises a NameError on the assignments below.
    Voicing_patterns = {}
    Voicing_rules = {}
    for v in Voicing.items():
        # Support is given as a fraction of this voicing's transaction count.
        patterns = pyfpgrowth.find_frequent_patterns(v[1], len(v[1]) * support)
        Voicing_patterns[v[0]] = patterns
        rules = pyfpgrowth.generate_association_rules(patterns, confidence)
        Voicing_rules[v[0]] = rules
    return Voicing_patterns, Voicing_rules
def fit(self, transactions):
    logging.info("Starting pattern mining!")
    # Sort the mined patterns by support count, descending.
    self.pattern_hash = sorted(
        pyfpgrowth.find_frequent_patterns(transactions, self.min_support).items(),
        key=operator.itemgetter(1),
        reverse=True)
    self.features = [k for k, v in self.pattern_hash][0:self.max_features]
    logging.info("Found rules: {}".format(len(self.features)))
def generate_rules(data: pd.DataFrame, support_threshold: int = 1,
                   confidence_threshold: float = 0.3) -> tuple:
    patterns = fp.find_frequent_patterns(data['items'],
                                         support_threshold=support_threshold)
    rules = fp.generate_association_rules(
        patterns, confidence_threshold=confidence_threshold)
    return patterns, rules
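# Hedged usage sketch for the generate_rules wrapper above, assuming
# `import pyfpgrowth as fp` and `import pandas as pd`; the data is invented.
df = pd.DataFrame({'items': [['a', 'b'], ['a', 'b', 'c'], ['b', 'c']]})
patterns, rules = generate_rules(df, support_threshold=2,
                                 confidence_threshold=0.5)
print(patterns, rules)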
def frequent_pattern_mining(members, Features):
    # Find patterns that occur in at least 20% of the members.
    min_support = 0.2 * len(members)
    patterns = pyfpgrowth.find_frequent_patterns(members, min_support)
    print("# Patterns:", len(patterns))
    # Map the first item of each pattern back to its feature name.
    display_patterns = []
    for p in patterns:
        p_index = int(p[0]) - 1
        display_patterns.append(Features[p_index])
    print("Patterns:", display_patterns)
def find_team(data, freq=5, file_path='./data/author_list/team'):
    """
    Find teams according to the given frequency with the FP-growth algorithm.
    """
    teams = find_frequent_patterns(data, freq)
    with codecs.open(file_path, 'w', encoding='utf8') as f:
        for team in teams:
            # Team size should be at least 3.
            if len(team) >= 3:
                f.write(','.join(team) + '\n')
def get_scores(transactions, resume_words):
    num_resumes = len(transactions)
    suggestion_scores = collections.Counter()
    # A word set must appear in at least two-thirds of the resumes.
    patterns = fpg.find_frequent_patterns(transactions, num_resumes / 1.5)
    rules = fpg.generate_association_rules(patterns, 0.5)
    for antecedent, consequent in rules.items():
        if set(antecedent).issubset(resume_words) and antecedent in patterns:
            # Weight each consequent by its pattern count times its confidence.
            suggestion_scores[consequent[0]] += (patterns[antecedent] *
                                                 consequent[1])
    suggestions = set.union(*[set(x) for x in suggestion_scores
                              if suggestion_scores[x] >= 3])
    return {x for x in suggestions if x not in resume_words}
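# Hedged usage sketch for get_scores above; the resumes and word sets are
# invented, and it assumes the mined rules are non-empty for this data.
resumes = [['python', 'sql', 'flask'],
           ['python', 'sql', 'django'],
           ['python', 'sql', 'flask']]
print(get_scores(resumes, resume_words={'python'}))
# e.g. {'sql'} -- frequently co-occurring words missing from this resume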
def dev_association(data):
    a = 0
    test_14 = 0
    count = [0, 0, 0, 0, 0, 0]
    # Build one itemset per (i, j) cell, listing the indices along the first
    # axis that are active in that cell.
    itemset = [[] for x in range(data.shape[1] * data.shape[2])]
    for i in range(data.shape[1]):
        for j in range(data.shape[2]):
            for z in range(data.shape[0]):
                if data[z, i, j] == 1:
                    itemset[a].append(z)
                    count[z] = count[z] + 1
            if data[1, i, j] == data[4, i, j] == 1:
                test_14 += 1
            a += 1
    print(test_14)
    patterns = fpgrowth.find_frequent_patterns(itemset, 50)
    rules = fpgrowth.generate_association_rules(patterns, .2)
    association_rules = apriori(itemset, min_support=0.002,
                                min_confidence=0.4, min_lift=2,
                                min_length=2.5)
    association_results = list(association_rules)
    # Flatten the apriori results into [items, confidence] pairs. The inner
    # loop variable is renamed from `rules` so it no longer clobbers the
    # FP-growth rules returned below.
    associ_rules = []
    for item in association_results:
        pair = []
        for i in item[0]:
            pair.append(i)
        support = item[1]
        for stat in item[2]:
            tmp = []
            for i in stat[0]:
                tmp.append(i)
            for i in stat[1]:
                tmp.append(i)
            associ_rules.append([tmp, stat[2]])
    dev_rules_no_0 = []
    dev_rules = np.array(associ_rules)
    print(dev_rules[0])
    print(dev_rules[0, 0])
    print(dev_rules[0, 0][0])
    # Skip rules whose first item is 0 and split the rest into
    # (antecedent, consequent, confidence).
    for item in dev_rules:
        if item[0][0] == 0:
            continue
        dev_rules_no_0.append([item[0][0:-1], [item[0][-1]], item[1]])
    return patterns, rules, count, association_results, dev_rules_no_0
def proses(id):
    my_data = File.query.get(id)
    with open('./uploads/' + my_data.name, newline='') as f:
        reader = csv.reader(f)
        data = list(reader)
    transactions = data
    # The stored support is a percentage of the transaction count.
    patterns = pyfpgrowth.find_frequent_patterns(
        transactions, len(transactions) * my_data.sup / 100)
    rules = pyfpgrowth.generate_association_rules(patterns, 0.5)
    return str(rules)
def get_prevalent_interactions(
    rf,
    impurity_decrease_threshold,
    min_support=10,
    weight_scheme="depth",
    signed=False,
):
    '''
    Compute the prevalent interactions and their prevalence.

    First, we use FP-growth to find a series of candidate interactions.
    Second, we compute the weighted prevalence of each candidate.

    Parameters
    ----------
    rf : the random forest model
    impurity_decrease_threshold : float, if a split results in a decrease
        smaller than this parameter, then it will not appear in the path.
        If it is unclear how to select this for a rf, use the
        visualize_impurity_decrease function to look at the histogram of
        impurity decrease for all the splits.
    min_support : int, optional with default 10, the minimum number of
        paths an interaction must appear in to be considered
    weight_scheme : str, ["depth", "samplesize"], how to compute the weight

    Returns
    -------
    prevalence : dictionary, keys correspond to patterns and values
        correspond to their weights.
    '''
    feature_paths, weight = get_filtered_feature_paths(
        rf,
        impurity_decrease_threshold,
        signed=signed,
        weight_scheme=weight_scheme,
    )
    feature_paths = [list(path) for path in feature_paths]
    patterns = pyfpgrowth.find_frequent_patterns(feature_paths, min_support)
    # Accumulate, for each frequent pattern, the weights of all paths
    # that contain it.
    prevalence = {p: 0 for p in patterns}
    for key in patterns:
        p = set(list(key))
        for path, w in zip(feature_paths, weight):
            if p.issubset(path):
                prevalence[key] += w
    # Sort by weight normalized by pattern size (the size-th root), descending.
    prevalence = OrderedDict(
        sorted(prevalence.items(), key=lambda t: -t[1]**(1 / len(t[0]))),
    )
    return prevalence
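# Hedged usage sketch for get_prevalent_interactions above. It assumes a
# fitted scikit-learn random forest and that this module's
# get_filtered_feature_paths helper is importable; the data and thresholds
# are invented.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=200, n_features=8, random_state=0)
rf = RandomForestClassifier(n_estimators=50, random_state=0).fit(X, y)
prevalence = get_prevalent_interactions(rf, impurity_decrease_threshold=0.01,
                                        min_support=10)
for pattern, w in list(prevalence.items())[:5]:
    print(pattern, w)  # top feature interactions by weighted prevalence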
def fp_growth(items, supported=0.5):
    """
    Frequent pattern mining with FP-growth.

    :param items: data in the form list[list[], list[], ...]
    :param supported: the support threshold as a fraction, default 0.5
    :return: the frequent patterns sorted by support count, descending
    """
    # Convert the fractional support into an absolute count.
    supported_num = int(len(items) * supported)
    patterns = pyfpgrowth.find_frequent_patterns(items, supported_num)
    return sorted(patterns.items(), key=lambda x: x[1], reverse=True)
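# Hedged usage sketch for fp_growth above; the transactions are invented.
carts = [['tea', 'sugar'], ['tea', 'sugar', 'lemon'],
         ['tea', 'lemon'], ['sugar']]
print(fp_growth(carts, supported=0.5))
# e.g. [(('sugar', 'tea'), 2), ...] -- itemsets appearing in >= 2 of 4 carts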
def main():
    df = pd.read_csv(r"MarketBasket/Market_Basket_Optimisation.csv",
                     header=None)
    transcation_efficient = transcationGenerator(df, "efficient")
    transcation_non_efficient = transcationGenerator(df, "non-efficient")
    print('-' * 20, 'Apriori', '-' * 20)
    apriori_one(transcation_efficient, support=0.05, confidence=0.3)
    print('-' * 20, 'Apriori', '-' * 20)
    apriori_two(transcation_non_efficient, 0.05, "confidence", 0.3)
    print('-' * 20, 'FP-GROWTH', '-' * 20)
    patterns = fp.find_frequent_patterns(transcation_efficient, 20)
    rules = fp.generate_association_rules(patterns, 0.3)
    print('Association rules:', '\n', rules)
rules01 = association_rules(frequent_itemsets01, metric="confidence",
                            min_threshold=0.1)
rules01

# In[14]:

import pyfpgrowth

# In[15]:

patterns = pyfpgrowth.find_frequent_patterns(dataset, 1000)
patterns

# In[18]:

rules5 = pyfpgrowth.generate_association_rules(patterns, 0.5)
rules5

# In[19]:

patterns500 = pyfpgrowth.find_frequent_patterns(dataset, 500)
patterns500
import pyfpgrowth

transactions = []
with open("../data/KnowledgeGraph/sample7.txt") as f:
    for line in f:
        line = line.strip('\n')
        ip, ua, target = line.split(',')
        # Python 3 print call (the original used Python 2 print statements).
        print("Add (%s %s %s)" % (ip, ua, target))
        transactions.append([ip, ua, target])
patterns = pyfpgrowth.find_frequent_patterns(transactions, 3)
rules = pyfpgrowth.generate_association_rules(patterns, 0.9)
print(rules)