Example #1
def build_rules(min_supp=2):
    
    print('minimum support = ', min_supp)
    
    # get frequent itemsets using FP-growth
    tr = fp.find_frequent_patterns(tr_data, min_supp)
    utr = fp.find_frequent_patterns(untr_data, min_supp)
    
    # convert each frequent itemset to a set
    tr_rules = [set(x) for x in tr]
    utr_rules = [set(x) for x in utr]
    
    return tr_rules, utr_rules
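A quick way to exercise build_rules: tr_data and untr_data are module-level globals in the original project, so this sketch fakes them with made-up transactions (names and values are illustrative only):

import pyfpgrowth as fp

# hypothetical stand-ins for the globals the function reads
tr_data = [['a', 'b'], ['a', 'b', 'c'], ['b', 'c']]
untr_data = [['x', 'y'], ['x', 'y'], ['y', 'z']]

tr_rules, utr_rules = build_rules(min_supp=2)
print(tr_rules)  # a list of sets, e.g. [{'b'}, {'a', 'b'}, {'b', 'c'}, ...]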
Example #2
 def get_seq_patterns(self, simple_log, length):
     patterns = pyfpgrowth.find_frequent_patterns(simple_log, 0)
     seq = []
     for item in patterns.keys():
         if len(item) == length:
             seq.append(item)
     return seq
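The same length filter works without the class; note that a support threshold of 0 asks pyfpgrowth to return every itemset, which can blow up on larger logs (standalone sketch, toy data):

import pyfpgrowth

simple_log = [['a', 'b', 'c'], ['a', 'b'], ['b', 'c']]
patterns = pyfpgrowth.find_frequent_patterns(simple_log, 0)
print([p for p in patterns if len(p) == 2])  # keep only the 2-item patterns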
Example #3
def generate_rules(dataframe, information=[], minsupport=25, minconfidence=0.55, satisfied_value=3):
    '''
    :param dataframe: must include reviewerID, productID and rating columns
    :param information: transactions we already have; appended to those built from the dataframe
    :param minsupport: an itemset must appear at least minsupport times to be kept
    :param minconfidence: the minimum confidence a rule must reach
    :param satisfied_value: a user is considered to like a product only if the rating is at least this value
    :return: rules: the association rules
    '''

    raw_dict = {}

    # transform dataframe to transaction
    for row in dataframe.itertuples():
        raw_dict.setdefault(row.reviewerID, {})
        if float(row.rating) >= satisfied_value:
            raw_dict[row.reviewerID].update({row.productID: row.rating})

    transaction = []
    for user in raw_dict:
        transaction.append(list(raw_dict[user].keys()))
    if information:
        transaction.extend(information)

    # pp.pprint(transaction)
    # pp.pprint(len(transaction))


    # generate the rules
    patterns = pyfpgrowth.find_frequent_patterns(transaction, minsupport)
    rules = pyfpgrowth.generate_association_rules(patterns, minconfidence)
    return rules
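A minimal invocation, assuming pyfpgrowth is imported at module level as in the original, and a DataFrame with the reviewerID, productID and rating columns the function reads via itertuples (all values invented):

import pandas as pd

df = pd.DataFrame({
    'reviewerID': ['u1', 'u1', 'u2', 'u2', 'u3', 'u3'],
    'productID': ['p1', 'p2', 'p1', 'p2', 'p1', 'p2'],
    'rating': [5, 4, 5, 5, 4, 3],
})
rules = generate_rules(df, minsupport=2, minconfidence=0.5)
print(rules)  # e.g. {('p1',): (('p2',), 1.0), ('p2',): (('p1',), 1.0)}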
Example #4
def upload():
    #	path_parent = os.path.dirname(os.getcwd())
    #	os.chdir(path_parent)

    if request.method == "POST":

        if request.files:
            file = request.files['inputFile']
            support = request.form['support']
            confidence = request.form['confidence']

            file.save(os.path.join(app.config["FILE_UPLOADS"], file.filename))
            with open('./uploads/' + file.filename, newline='') as f:
                reader = csv.reader(f)
                data = list(reader)
                transactions = data
                patterns = pyfpgrowth.find_frequent_patterns(
                    transactions,
                    len(transactions) * int(support) / 100)
                rules = pyfpgrowth.generate_association_rules(
                    patterns,
                    int(confidence) / 100)
#                return str(rules)
            res = rules
            newFile = File(name=file.filename,
                           sup=support,
                           con=confidence,
                           result=res)

            db.session.add(newFile)
            db.session.commit()
            return render_template("data.html", result=res)
Example #5
 def run_rule_manually(self):
     for record in self:
         transactions = self.get_sale_data()
         if (record.rule_type == 'apriori'):
             results = self.format_rules(
                 list(
                     apriori(transactions,
                             min_support=record.min_supp,
                             min_confidence=record.min_conf)))
             self.update_rule(results, 'apriori')
         else:
             totalRow = len(transactions)
             results = self.format_rules_fp(
                 pyfpgrowth.generate_association_rules(
                     pyfpgrowth.find_frequent_patterns(
                         transactions, totalRow * record.min_supp),
                     record.min_conf))
             self.update_rule(results, 'fpgrowth')
         self.update_on_web()
         return {
             'type': 'ir.actions.act_window',
             'name': 'View Rules',
             'view_type': 'form',
             'view_mode': 'tree,form',
             'res_model': 'data.mining.show',
             'target': 'current',
         }
Example #6
def fpgrowth1(item_dataset, min_support=0.1, repetitions=10):

    #Transform data
    itemsets = []
    for row in item_dataset:
        vals = []
        for i, val in enumerate(row):
            if val == 1:
                vals.append(i)
        itemsets.append(vals)

    times = []
    for _ in range(repetitions):
        t0 = time.time()
        patterns = pyfpgrowth.find_frequent_patterns(
            itemsets, min_support * len(item_dataset))
        times.append(time.time() - t0)

    fp_item_sets = []
    for items, count in patterns.items():
        sup = count / len(item_dataset)
        conds = [[x, 1] for x in items]
        fp_item_sets.append([sup, conds])
    exc_time = np.mean(times)

    return fp_item_sets, exc_time
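fpgrowth1 expects a 0/1 item matrix, one row per transaction; a tiny fabricated matrix shows the round trip (the time, numpy and pyfpgrowth imports are assumed at module level in the original):

import time
import numpy as np
import pyfpgrowth

matrix = [[1, 1, 0], [1, 1, 1], [0, 1, 1]]  # columns become item ids 0..2
item_sets, avg_seconds = fpgrowth1(matrix, min_support=0.5, repetitions=3)
print(item_sets)  # [[support, [[item, 1], ...]], ...]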
Example #7
def do_patterns(years):
    results = {}
    counts = {}
    for year in years:
        results[year] = {}
        counts[year] = {}
        delays = pd.read_sql(
            'SELECT * from delays_binned where year = {}'.format(year),
            db_connection)
        # get rid of fields we don't need
        df = delays.drop(
            columns=['level_0', 'index', 'delay_id', 'report_date'])
        # stringify values for pyfpgrowth
        df['Route'] = df['Route'].apply(str)
        df['year'] = df['year'].apply(str)
        for i in range(0, len(pat_types)):
            # drop cols
            df_dropped = df.drop(columns=col_drop_types[i])
            # find patterns store results
            fps = pyfpgrowth.find_frequent_patterns(df_dropped.values, MINSUP)
            # only keep max length for each category
            results[year][pat_types[i]] = {
                x: fps[x]
                for x in fps.keys() if len(x) > len(pat_types[i].split(','))
            }
            # store counts as we go
            counts[year][pat_types[i]] = len(results[year][pat_types[i]])
    return counts
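The dict comprehension above keeps only patterns strictly longer than the number of columns named in the category, so each stored pattern adds at least one extra field; the filter in isolation, with a fabricated pattern dict:

fps = {('2018',): 12, ('2018', '501'): 9, ('2018', '501', 'AM'): 4}
pat_type = 'year,Route'  # two fixed columns
kept = {x: fps[x] for x in fps.keys() if len(x) > len(pat_type.split(','))}
print(kept)  # {('2018', '501', 'AM'): 4}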
Example #8
    def categorize_queries(self, **data):
        """
        Executes Apriori algorithm and returns a RelationRecord generator.

        Arguments:
            transactions -- A transaction iterable object
                            (eg. [['A', 'B'], ['B', 'C']]).

        Keyword arguments:
            min_support -- The minimum support of relations (float).
            min_lift -- The minimum lift of relations (float). (>1 is likely)
            min_probability -- The minimum probability with which one pattern implies another (used by the fpgrowth branch).

            min_confidence -- The minimum confidence of relations (float).


        """

        min_support = data.get('min_support', 10)
        min_lift = data.get('min_lift', 1)
        min_probability = data.get('min_probability', 0.5)
        min_confidence = data.get('min_confidence', 0)

        print("Converting to transactions.")
        transactions, match_queries = self.create_transactions(
            self.df, self.col)

        if self.alg.lower() == "apriori":
            print("Running Apriori")
            min_support = float(min_support / len(transactions))
            results = list(
                apriori(transactions,
                        min_support=min_support,
                        min_confidence=min_confidence,
                        min_lift=min_lift,
                        max_length=None))
            print("Making Categories")
            self.categories = [' '.join(list(l.items)) for l in results]

        elif self.alg.lower() == "fpgrowth":
            print("Running FPGrpwth")
            results = list(
                pg.generate_association_rules(
                    pg.find_frequent_patterns(transactions, min_support),
                    min_probability))
            print("Making Categories")
            self.categories = [' '.join(l) for l in results]

        else:
            raise Exception(
                "{} is not one of the available algorithms (`apriori`, `fpgrowth`)"
                .format(self.alg))

        print('Total Categories: {}'.format(len(set(self.categories))))

        self.df['match_queries'] = match_queries
        self.df['category'] = self.df.match_queries.map(
            lambda x: self.match_labels(x, self.categories))

        self.counts = pd.DataFrame(self.df.category.value_counts())
Example #9
def assign_servers_test_output(df_train, df_test, percentile, confidence,
                               apps_server):
    df_train['hour'] = None
    df_train['hour'] = pd.DatetimeIndex(df_train['Date']).hour

    data_l = list(df_train['pairs'])
    pairs_count = (df_train.groupby('pairs2').agg({
        'Date': 'count',
        'norm_latency': 'mean',
        'Duration': 'sum',
        'Packets': 'sum'
    }).reset_index())
    pairs_count.columns = [
        'pairs', 'frequency', 'avg_norm_latency', 'total_duration',
        'total_packets'
    ]
    pairs_count['norm_latency'] = (
        pairs_count['total_duration'] / pairs_count['total_packets'].sum()
    ) * 100  # each pair's total duration divided by the sum of all packets transferred

    per_n = (pairs_count['frequency'].quantile(percentile))
    patterns = pyfpgrowth.find_frequent_patterns(data_l, per_n)
    rules = pyfpgrowth.generate_association_rules(patterns, confidence)

    #format the rules, bring back in the other info on latency rank

    formated_rules = format_rules(rules, df_train, apps_server)

    #now we make the server assignments based on the training rules applied to the test data
    server_df, server_assignments, total_latency, total_latency_model, avg_latency, avg_latency_model = server_association(
        formated_rules, df_test, apps_server)  # server_association is defined elsewhere in the project

    #return(formated_rules)
    return (server_df, server_assignments, total_latency, total_latency_model,
            avg_latency, avg_latency_model)
Example #10
def get_representative_label(items_df, min_sup):
    tmp = np.array(items_df)
    data = tmp.tolist()
    patterns = pyfpgrowth.find_frequent_patterns(data, min_sup)  # dict, e.g. {(1, 2): 4, (2,): 7}
    freq_items = patterns.keys()  # e.g. [(1, 2), (2,), (1, 3), (2, 3), (1,)]

    if freq_items:  # frequent itemsets matching the threshold were found
        label_list = list()
        max_count = sorted(patterns.values())[-1]  # highest support count among the patterns
        for k, v in patterns.items():
            if v == max_count:  # collect labels from the most frequent patterns
                label_list += k

        label_list = list(set(label_list))  # de-duplicate

        # drop empty strings
        if '' in label_list:
            label_list.remove('')

        label_str = '","'.join(label_list)  # comma-join into a single string
        label_str = '"' + label_str + '"'  # format as "tag1","tag2",...
        return label_str
    else:
        return '""'  # no qualifying patterns: return an empty label string
Example #11
def fpgrouth(id, dataset, principal):
    pasos = "Dataset Cargado" + '\n'
    dataset = pickdataset(int(id), dataset)
    patterns = pyfpgrowth.find_frequent_patterns(dataset, 3)
    rules = pyfpgrowth.generate_association_rules(patterns, 0.6)
    pasos += "Encuentros: " + '\n'
    pasos += str(patterns) + '\n'
    avgReal = 0
    for i in rules.values():
        # each rule value is a (consequent, confidence) tuple
        avgReal += float(i[1])
    avgReal = str((avgReal / len(rules.values())) * 100) + '% Confianza'
    reglas = str(rules)
    img = 'No aplica'
    if principal:
        context = {
            'algoritmoPrincipal': 'FP-growth',
            'resultado': avgReal,
            'pasos': pasos,
            'reglas': reglas,
            'img': img
        }
    else:
        context = {
            'algoritmoComparar': 'FP-growth',
            'resultado2': avgReal,
            'pasos2': pasos,
            'reglas2': reglas,
            'img2': img
        }
    return context
Example #12
def find_rules(data, support_threshold, confidence_threshold):
    patterns = find_frequent_patterns(transactions=data,
                                      support_threshold=support_threshold)
    rules = generate_association_rules(
        patterns=patterns, confidence_threshold=confidence_threshold)

    return rules
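Since the names are imported bare, a from-import plus toy transactions is enough to run the wrapper (data invented):

from pyfpgrowth import find_frequent_patterns, generate_association_rules

basket = [['bread', 'milk'], ['bread', 'butter'], ['bread', 'milk', 'butter']]
print(find_rules(basket, support_threshold=2, confidence_threshold=0.6))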
Example #13
 def fp_growth(self, total_elements):
     if len(total_elements) == 0:
         print("total_elements is empty!")
         return
     min_sup = len(total_elements) / 100
     patterns = pyfpgrowth.find_frequent_patterns(total_elements, min_sup)
     self.fp_status = patterns
     print("All patterns:", patterns)
Example #14
def fpGrowth(transactions):
    patterns = pyfpgrowth.find_frequent_patterns(transactions, 1)
    rules = pyfpgrowth.generate_association_rules(patterns, 0.2)
    print(patterns)
    print(rules)
    salida = "\nFrecuencias" + "\n" + str(
        patterns) + "\n" + "Reglas" + "\n" + str(rules)
    return salida
Example #15
def evandempsey_fpgrowth(minsup, item_no):
    start = datetime.now()
    transactions, y_res = merge_data(item_no)
    patterns = pyfpgrowth.find_frequent_patterns(transactions, minsup)
    itemsets = patterns.items()
    for itemset in itemsets:
        print(itemset)
    print(datetime.now() - start)
Example #16
def fpgrowth(seq, **kwargs):
    import pyfpgrowth
    sup = int(kwargs.pop('sup'))
    conf = float(kwargs.pop('conf'))
    patterns = pyfpgrowth.find_frequent_patterns(seq, sup)
    rules = pyfpgrowth.generate_association_rules(patterns, conf)
    ret = [[key, value[0], value[1]] for key, value in rules.items()]
    return ret
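Because sup and conf are popped out of **kwargs and cast, callers pass them as keyword arguments, even as strings (values arbitrary):

seq = [['a', 'b'], ['a', 'b'], ['b', 'c']]
print(fpgrowth(seq, sup='2', conf='0.5'))  # [[antecedent, consequent, confidence], ...]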
Example #17
def compute_fqis_pyfpgrowth_dict(super_ilists, min_sup=0.6):
    print('computing fpgrowth')
    super_ilists = [list(np.int16(ds)) for ds in super_ilists]  # cast to int16 to save memory and avoid MemoryError

    patterns = pyfpgrowth.find_frequent_patterns(
        super_ilists, round(min_sup * len(super_ilists)))
    return patterns
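A minimal call; the int16 cast only matters for large inputs, but the function behaves the same on a toy list (data invented):

import numpy as np
import pyfpgrowth

lists = [[1, 2], [1, 2, 3], [2, 3]]
print(compute_fqis_pyfpgrowth_dict(lists, min_sup=0.6))  # patterns with support >= round(0.6 * 3)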
Example #18
def demo():
    transactions = [[1, 2, 5], [2, 4], [2, 3], [1, 2, 4], [1, 3], [2, 3],
                    [1, 3], [1, 2, 3, 5], [1, 2, 3]]
    # support = 2
    # minconf = 0.7
    patterns = pyfpgrowth.find_frequent_patterns(transactions, 2)
    rules = pyfpgrowth.generate_association_rules(patterns, 0.7)
    print(rules)
Example #19
def pattern_mine(trans, support, confidence):
    """
    input: [[term1, term2, ...], [term1, term2, ...] ...]
    output: 
    """
    pattern = pyfpgrowth.find_frequent_patterns(trans, support)
    rule = pyfpgrowth.generate_association_rules(pattern, confidence)
    return pattern, rule
Example #20
def AssociationRule(Voicing, support, confidence):
    Voicing_patterns = {}
    Voicing_rules = {}
    for v in Voicing.items():
        patterns = pyfpgrowth.find_frequent_patterns(v[1], len(v[1])*support)
        Voicing_patterns[v[0]] = patterns
        rules = pyfpgrowth.generate_association_rules(patterns, confidence)
        Voicing_rules[v[0]] = rules
    return Voicing_patterns, Voicing_rules
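AssociationRule mines each group in the dict independently, scaling the support threshold by that group's size; a fabricated two-group input:

Voicing = {
    'major': [['C', 'E'], ['C', 'E', 'G'], ['C', 'G']],
    'minor': [['A', 'C'], ['A', 'C', 'E']],
}
patterns_by_key, rules_by_key = AssociationRule(Voicing, 0.5, 0.6)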
Example #21
 def fit(self, transactions):
     logging.info("Starting pattern mining!")
     self.pattern_hash = sorted(dict(
         pyfpgrowth.find_frequent_patterns(transactions,
                                           self.min_support)).items(),
                                key=operator.itemgetter(1),
                                reverse=True)
     self.features = [k for k, v in self.pattern_hash][0:self.max_features]
     logging.info("Found rules: {}".format(len(self.features)))
Example #22
def generate_rules(data: pd.DataFrame,
                   support_threshold: int = 1,
                   confidence_threshold: float = 0.3) -> tuple:
    patterns = fp.find_frequent_patterns(data['items'],
                                         support_threshold=support_threshold)
    rules = fp.generate_association_rules(
        patterns, confidence_threshold=confidence_threshold)

    return patterns, rules
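generate_rules reads transactions from an items column; pyfpgrowth only iterates over what it is given, so a column of lists should work directly (sample frame invented, fp assumed to alias pyfpgrowth as in the original):

import pandas as pd
import pyfpgrowth as fp

frame = pd.DataFrame({'items': [['a', 'b'], ['a', 'b', 'c'], ['b', 'c']]})
patterns, rules = generate_rules(frame, support_threshold=2, confidence_threshold=0.5)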
Example #23
def frequent_pattern_mining(members, Features):
    # find patterns that occur at least min_support times.
    min_support = 0.2 * len(members)
    patterns = pyfpgrowth.find_frequent_patterns(members, min_support)
    print("# Patterns:", len(patterns))
    display_patterns = []
    for p in patterns:
        p_index = int(p[0]) - 1
        display_patterns.append(Features[p_index])
    print("Patterns:", display_patterns)
Example #24
def find_team(data, freq=5, file_path='./data/author_list/team'):
    """
    find team according to the freq with fp-growth algorithm
    """
    teams = find_frequent_patterns(data, freq)
    with codecs.open(file_path, 'w', encoding='utf8') as f:
        for team in teams:
            # keep only teams of at least 3 members
            if len(team) >= 3:
                f.write(','.join(team) + '\n')
Example #25
def get_scores(transactions, resume_words):
    num_resumes = len(transactions)
    suggestion_scores = collections.Counter()
    patterns = fpg.find_frequent_patterns(transactions, num_resumes / 1.5)
    rules = fpg.generate_association_rules(patterns, 0.5)
    for antecedent, consequent in rules.items():
        if set(antecedent).issubset(resume_words) and antecedent in patterns:
            suggestion_scores[consequent[0]] += patterns[antecedent] * consequent[1]

    suggestions = set.union(*[set(x) for x in suggestion_scores if suggestion_scores[x] >= 3])
    return {x for x in suggestions if x not in resume_words}
Example #26
def dev_association(data):
    a = 0
    test_14 = 0
    count = [0, 0, 0, 0, 0, 0]
    itemset = [[] for x in range(data.shape[1] * data.shape[2])]
    for i in range(data.shape[1]):
        for j in range(data.shape[2]):
            for z in range(data.shape[0]):
                if data[z, i, j] == 1:
                    itemset[a].append(z)
                    count[z] = count[z] + 1
            if data[1, i, j] == data[4, i, j] == 1:
                test_14 += 1
            a += 1

    # count = [count[z]/data.shape[1]*data.shape[2] for z in range(len(count))]
    print(test_14)
    patterns = fpgrowth.find_frequent_patterns(itemset, 50)
    rules = fpgrowth.generate_association_rules(patterns, .2)

    association_rules = apriori(itemset,
                                min_support=0.002,
                                min_confidence=0.4,
                                min_lift=2,
                                min_length=2.5)
    association_results = list(association_rules)

    associ_rules = []
    for item in association_results:
        # associ_rules = []
        pair = []
        for i in item[0]:
            pair.append(i)
        support = item[1]
        for rules in item[2]:
            tmp = []
            for i in rules[0]:
                tmp.append(i)
            for i in rules[1]:
                tmp.append(i)
            associ_rules.append([tmp, rules[2]])

    dev_rules_no_0 = []
    dev_rules = np.array(associ_rules)
    print(dev_rules[0])
    print(dev_rules[0, 0])
    # print(dev_rules[0,0,0])
    print(dev_rules[0, 0][0])
    for item in dev_rules:
        if item[0][0] == 0:
            continue
        dev_rules_no_0.append([item[0][0:-1], [item[0][-1]], item[1]])

    return patterns, rules, count, association_results, dev_rules_no_0
Example #27
def proses(id):
    my_data = File.query.get(id)
    with open('./uploads/' + my_data.name, newline='') as f:
        reader = csv.reader(f)
        data = list(reader)
        transactions = data
        patterns = pyfpgrowth.find_frequent_patterns(
            transactions,
            len(transactions) * my_data.sup / 100)
        rules = pyfpgrowth.generate_association_rules(patterns, 0.5)
        return str(rules)
Example #28
def get_prevalent_interactions(
    rf,
    impurity_decrease_threshold,
    min_support=10,
    weight_scheme="depth",
    signed=False,
):
    '''
    Compute the prevalent interactions and their prevalence
        First, we use FP growth to find a series of candidate interactions.
        Second, we compute the weighted prevalence of each candidate.
    
    Parameters
    ----------

    rf : the random forest model

    impurity_decrease_threshold : float, if a split results in a decrease
        smaller than this parameter, then it will not appear in the path.
        If it is unclear how to select this for a rf, use the
        visualize_impurity_decrease function to look at the histogram of
        impurity decrease for all the splits.

    min_support : int, optional with default 10,
        the minimum number of paths in which an interaction must appear to be
        considered

    weight_scheme : str, ["depth", "samplesize"],
        how to compute the weight

    Returns
    -------

    prevalence : dictionary, key correspond to patterns and values correspond
        to their weights.
    '''
    feature_paths, weight = get_filtered_feature_paths(
        rf,
        impurity_decrease_threshold,
        signed=signed,
        weight_scheme=weight_scheme,
    )
    feature_paths = [list(path) for path in feature_paths]
    patterns = pyfpgrowth.find_frequent_patterns(feature_paths, min_support)
    #print(feature_paths)
    prevalence = {p: 0 for p in patterns}
    for key in patterns:
        p = set(list(key))
        for path, w in zip(feature_paths, weight):
            if p.issubset(path):
                prevalence[key] += w
    prevalence = OrderedDict(
        sorted(prevalence.items(), key=lambda t: -t[1] ** (1 / len(t[0]))))
    return prevalence
Example #29
def fp_growth(items, supported=0.5):
    """
        FP—Growth的频繁模式挖掘
    :param items: 格式为 list[ list[],list[], ... ,]
    :param supported:支持度 , 默认为0.5
    :return: 
    """

    supported_num = int( len(items) * supported )
    patterns = pyfpgrowth.find_frequent_patterns(items, supported_num)

    return sorted(patterns.items(), key=lambda x: x[1], reverse = True)
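Here support is a fraction that the helper converts to an absolute count; a quick toy call:

import pyfpgrowth

baskets = [['a', 'b'], ['a', 'b'], ['a', 'c'], ['b', 'c']]
print(fp_growth(baskets, supported=0.5))  # e.g. [(('a',), 3), (('b',), 3), ...]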
Example #30
def main():
    df = pd.read_csv(r"MarketBasket/Market_Basket_Optimisation.csv",
                     header=None)
    transcation_efficient = transcationGenerator(df, "efficient")
    transcation_non_efficient = transcationGenerator(df, "non-efficient")
    print('-' * 20, 'Apriori', '-' * 20)
    apriori_one(transcation_efficient, support=0.05, confidence=0.3)
    print('-' * 20, 'Apriori', '-' * 20)
    apriori_two(transcation_non_efficient, 0.05, "confidence", 0.3)
    print('-' * 20, 'FP-GROWTH', '-' * 20)
    patterns = fp.find_frequent_patterns(transcation_efficient, 20)
    rules = fp.generate_association_rules(patterns, 0.3)
    print('关联规则:', '\n', rules)
Example #31

rules01 = association_rules(frequent_itemsets01, metric="confidence", min_threshold=0.1)
rules01


# In[14]:


import pyfpgrowth


# In[15]:


patterns = pyfpgrowth.find_frequent_patterns(dataset, 1000)
patterns


# In[18]:


rules5 = pyfpgrowth.generate_association_rules(patterns, 0.5)
rules5


# In[19]:


patterns500 = pyfpgrowth.find_frequent_patterns(dataset, 500)
patterns500
Example #32
import pyfpgrowth

transactions = []

with open("../data/KnowledgeGraph/sample7.txt") as f:
    for line in f:
        line = line.strip('\n')
        ip, ua, target = line.split(',')
        print("Add (%s %s %s)" % (ip, ua, target))
        transactions.append([ip, ua, target])

patterns = pyfpgrowth.find_frequent_patterns(transactions, 3)
rules = pyfpgrowth.generate_association_rules(patterns, 0.9)

print(rules)