def test_generate_rules():
    data_set = Apriori.load_data_set()
    l, support_data = Apriori.apriori(data_set, 0.5)
    print '='*100
    print l
    rules = Apriori.generate_rules(l, support_data, 0.5)
    print rules
def test_create():
    data_set = Apriori.load_data_set()
    c1 = Apriori.create_c1(data_set)
    all_set = map(set, data_set)
    list_set, support_data = Apriori.scan_d(all_set, c1, 0.5)
    set2 = Apriori.generate_ck(list_set, 2)
    print set2
Example #3
def pcy(baskets: list, minSupport: float, minConfidence: float, maxK: int,
        bucketSize: int, hashFunc1, hashFunc2) -> (list, int, defaultdict):
    C1 = ap.genC1(baskets)
    L1, sup1 = ap.genFreqSet(baskets, C1, minSupport)

    # PCY optimization when generating the 2-item candidate set
    bitmap1 = hashPairs(baskets, L1, bucketSize, minSupport, hashFunc1)
    Ck1 = genCkByBitMap(L1, bitmap1, bucketSize, 2, hashFunc1)

    bitmap2 = hashPairs(baskets, L1, bucketSize, minSupport, hashFunc2)
    Ck2 = genCkByBitMap(L1, bitmap2, bucketSize, 2, hashFunc2)
    Ck = Ck1 & Ck2

    L = [set(), L1]
    sup = [defaultdict(float), sup1]

    k = 2
    while True:
        Lk, supk = ap.genFreqSet(baskets, Ck, minSupport)
        L.append(Lk)
        sup.append(supk)
        if k == maxK:
            break
        k += 1
        Ck = ap.genCk(L[k - 1], k)
    return sup, bitmap1, ap.genRules(L, sup, minConfidence)
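# For context, a minimal sketch of the two PCY helpers used above. These are
# assumptions about their behavior (the real hashPairs/genCkByBitMap are not
# shown here): hashPairs makes one pass that hashes every pair of frequent
# items in each basket into a bucket and returns a bitmap of frequent buckets;
# genCkByBitMap keeps only the candidate pairs that land in a frequent bucket.
from itertools import combinations

def hashPairsSketch(baskets, L1, bucketSize, minSupport, hashFunc):
    # Assumes each basket is a (frozen)set and L1 holds frozensets of size 1.
    counts = [0] * bucketSize
    frequentItems = {item for s in L1 for item in s}
    for basket in baskets:
        # Hash each pair of frequent singletons into a bucket and count it.
        for pair in combinations(basket & frequentItems, 2):
            counts[hashFunc(frozenset(pair)) % bucketSize] += 1
    threshold = minSupport * len(baskets)
    return [c >= threshold for c in counts]  # True marks a frequent bucket

def genCkByBitMapSketch(L1, bitmap, bucketSize, k, hashFunc):
    # k is effectively fixed at 2: PCY prunes only the pair-candidate stage.
    frequentItems = {item for s in L1 for item in s}
    return {frozenset(p) for p in combinations(frequentItems, 2)
            if bitmap[hashFunc(frozenset(p)) % bucketSize]}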
Example #4
def test_create():
    data_set = Apriori.load_data_set()
    c1 = Apriori.create_c1(data_set)
    all_set = map(set, data_set)
    list_set, support_data = Apriori.scan_d(all_set, c1, 0.5)
    set2 = Apriori.generate_ck(list_set, 2)
    print set2
Example #5
def test_generate_rules():
    data_set = Apriori.load_data_set()
    l, support_data = Apriori.apriori(data_set, 0.5)
    print '=' * 100
    print l
    rules = Apriori.generate_rules(l, support_data, 0.5)
    print rules
Example #6
def apriori_classID(lastestTime, appointMap, minSupport=0.1, minConf=0.6):

    dataSet = ReadData.get_all_recent_class(lastestTime, appointMap)

    L, suppData = Apriori.apriori(dataSet, minSupport)

    rules = Apriori.generateRules(L, suppData, minConf)
    return rules
def main():
    # Connect to database
    ppsd_data = connect_db('train')

    fuzzified_data = Fuzzification.fuzzify(ppsd_data)
    # print(fuzzified_data)

    # Insert Fuzzified Data
    insert_db(fuzzified_data, ppsd_data)

    # Apriori Algorithm
    fuzzy_csv = pd.read_csv('fuzzified.csv')

    # FP Growth
    start_time = time.time()

    rules, confi = FPGrowth.mine('fuzzified.csv')
    print("FP: --- %s seconds ---" % (time.time() - start_time))

    insert_fprules(rules, confi)
    # insert_fprulesCSV(rules,confi)
    start_time = time.time()
    ant, con, conf, lift = Apriori.mine('fuzzified.csv')
    print("Apriori: --- %s seconds ---" % (time.time() - start_time))
    insert_arules(ant, con, conf, lift)
Example #8
def aiSub(ingredients, unwantedIng):
    #First get frequent item sets
    L = ap.readFromFile()

    if unwantedIng not in ingredients:
        return "error"

    ingredients.remove(unwantedIng)
    wantedIngredients = set(ingredients)
    
    a = 0
    bestFit = None

    for thing in L:
        if thing <= wantedIngredients:
            continue
        counter = 0
        for ing in thing:
            if ing in wantedIngredients:
                counter += 1
        if counter > a and 'butter' not in thing and 'bread' not in thing:
            a = counter
            bestFit = thing
        elif counter == a:
            pass
            #do something clever here
    if bestFit is None:
        return None
    return bestFit - set(ingredients)
    
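# Hypothetical usage, assuming ap.readFromFile() returns frequent itemsets as
# sets of ingredient names: suggest additions once 'cheese' is removed.
suggestions = aiSub(['bread', 'ham', 'cheese'], 'cheese')
print(suggestions)  # extra items of the best-matching itemset, or None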
Example #9
def main():
    print 'begin:'
    t = time.time()
    print 'fetching data:'
    data = getdata()

#    printdata(data)

    print 'data fetched'
    print 'total shops:', len(data)
    aprioriResult = {}
    for shop in data:
        print 'shop:', shop, '  customers:', len(data[shop])
        assortment = []
        for customer in data[shop]:
            assortment.append(list(data[shop][customer]))

      #  print 'get assortment done',assortment

        adata = Apriori.apriori(assortment, 0.5)

#        if adata:
#            print adata

        aprioriResult[shop] = adata
    print 'saving apriori results to the database'
    saveaprioriresult(aprioriResult)
    print str(time.time() - t)
    print 'done'
Example #10
def arpriori_trade(full_table_name):
    query = \
        """SELECT T.*
        FROM(
          SELECT shop_id, GROUP_CONCAT(DISTINCT(item_id)) AS list
          FROM %s
          GROUP BY shop_id, nick)T
        WHERE LOCATE(',', T.list)<>0""" \
        % full_table_name
    db_conn = ibbdlib.get_db_conn(**db_server)
    metadata = dict()
    try:
        for row in db_conn.iter(query):
            if not metadata.get(row.shop_id):
                metadata[row.shop_id] = [row.list.split(',')]
            else:
                metadata[row.shop_id].append(row.list.split(','))

        for (shop_id, t) in metadata.items():
            print shop_id
            t = Apriori.apriori(t, 0.1)
            print json.dumps(t, indent=4)
            query = 'INSERT IGNORE INTO ibbd2.ststc_shop_apriori values(%%s, CURDATE(), %s, NOW())' % ','.join(['%s'] * 4)
            db_conn.executemany(query, [[shop_id, pair.split(',')[0], pair.split(',')[1], data['sup'], data['num']]
                                for (pair, data) in t.items() if len(pair.split(',')) == 2])
    except Exception, e:
        print e
Example #11
def arpriori_trade():
    while 1:
        full_table_name = tasks_queue.get()
        log.info(full_table_name)
        query = \
            """SELECT T.*
            FROM(
              SELECT shop_id, GROUP_CONCAT(DISTINCT(item_id)) AS list
              FROM %s
              GROUP BY shop_id, nick)T
            WHERE LOCATE(',', T.list)<>0""" \
            % full_table_name
        db_conn = ibbdlib.get_db_conn(**db_server)
        metadata = dict()
        try:
            log.info('%s query data...', full_table_name)
            for row in db_conn.iter(query):
                if not metadata.get(row.shop_id):
                    metadata[row.shop_id] = [row.list.split(',')]
                else:
                    metadata[row.shop_id].append(row.list.split(','))
            log.info('%s run apriori...', full_table_name)
            for (shop_id, t) in metadata.items():
                t = Apriori.apriori(t, 0.1)
                query = 'INSERT IGNORE INTO ibbd2.ststc_shop_apriori values(%%s, CURDATE(), %s, NOW())' % ','.join(['%s'
                        ] * 4)
                db_conn.executemany(query, [[shop_id, pair.split(',')[0], pair.split(',')[1], data['sup'], data['num']]
                                    for (pair, data) in t.items() if len(pair.split(',')) == 2])
            log.info('%s finished', full_table_name)
        except:
            log.error(traceback.format_exc())
        finally:
            db_conn.close()
            tasks_queue.task_done()
def getSupport(minsupport, processbiclusters, winedata):
    print('...', end='')
    tidata = processbiclusters
    apriori = Apriori.Apriori()
    itemlist = apriori.getSupportOnlyAssociationRules(minsupport, tidata)
    #formattedlist = apriori.getDataFormattedItemList(winedata, itemlist)  # there's a problem with the formatted list: winedata does not carry the same format as the test data
    formattedlist = []
    return apriori, itemlist, formattedlist
Example #13
def apriori(train, test, return_pred, num_cluster):
    train_orders_i = train.set_index('order_id')['product_id'].rename('item_id')
    test_orders_i = test.set_index('order_id')['product_id'].rename('item_id')

    #item_name = train['product_id', 'product_name', 'aisle_id', 'department_id'].rename(columns={'product_id': 'item_id', 'product_name': 'item_name'})
    rules_i = Apriori.association_rules(train_orders_i, 0.01)
    #rules_final_i = Apriori.merge_item_name(rules_i, item_name).sort_values('lift', ascending=False)
    #display(rules_final_i)

    # Train set pairs
    train_pairs_gen_i = Apriori.get_item_pairs(train_orders_i)
    train_pairs_i = Apriori.freq(train_pairs_gen_i).to_frame("freqAB")
    train_pairs_i = train_pairs_i.reset_index().rename(columns={'level_0': 'item_A', 'level_1': 'item_B'})
    train_pairs_i['pair'] = train_pairs_i.item_A.astype(str).str.cat(train_pairs_i.item_B.astype(str), sep='-')

    # Test set pairs
    test_pairs_gen_i = Apriori.get_item_pairs(test_orders_i)
    test_pairs_i = Apriori.freq(test_pairs_gen_i).to_frame("freqAB")
    test_pairs_i = test_pairs_i.reset_index().rename(columns={'level_0': 'item_A', 'level_1': 'item_B'})
    test_pairs_i['pair'] = test_pairs_i.item_A.astype(str).str.cat(test_pairs_i.item_B.astype(str), sep='-')

    # Rules set pairs
    rules_i['pair'] = rules_i.item_A.astype(str).str.cat(rules_i.item_B.astype(str), sep='-')

    test_pair_set_i = set(np.unique(test_pairs_i.pair))
    train_pair_set_i = set(np.unique(train_pairs_i.pair))
    rules_pair_set_i = set(np.unique(rules_i.pair))

    # TP = pairs that exist in both the apriori rules and the test set
    tp = len(test_pair_set_i & rules_pair_set_i)

    # TN = pairs that exist in the test set but not in the train set
    tn = len(test_pair_set_i - train_pair_set_i)

    # FN = pairs that exist in test but not in the apriori rules
    fn = len(test_pair_set_i - rules_pair_set_i)

    # FP = pairs that exist in the apriori rules but not in test
    fp = len(rules_pair_set_i - test_pair_set_i)

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 * (recall * precision) / (recall + precision)
    print('APRIORI')
    return recall, precision, f1
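# A quick sanity check of the pair-set arithmetic above, on made-up pairs:
test_pairs = {'1-2', '1-3', '2-3'}
rule_pairs = {'1-2', '2-3', '4-5'}
tp = len(test_pairs & rule_pairs)  # 2: predicted by the rules and in test
fn = len(test_pairs - rule_pairs)  # 1: in test but missed by the rules
fp = len(rule_pairs - test_pairs)  # 1: predicted but absent from test
precision = tp / (tp + fp)         # 2/3
recall = tp / (tp + fn)            # 2/3
f1 = 2 * (recall * precision) / (recall + precision)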
def aprioriWorker(data_fname, sup, out_fname, conn):
    try:
        conn.send(Apriori.Apriori_main(data_fname, sup, out_fname)[1])
        conn.close()
        return
    except MemoryError:
        conn.send(-1)
        conn.close()
        return
    except Exception as e:
        conn.send((-1,e.message,))
        conn.close()
        return
def aprioriWorker(data_fname, sup, conn, flag=True):
    try:
        if flag:
            Apriori.Apriori_main(data_fname, sup)
            conn.send(0)
            conn.close()
            return
        else:
            conn.send(Apriori.Apriori_main(data_fname, sup)[1])
            conn.close()
            return
    except MemoryError:
        conn.send(-1)
        conn.close()
        return
    except Exception as e:
        conn.send((
            -1,
            e.message,
        ))
        conn.close()
        return
Example #16
def main():
	
	#data preprocessing
	filename = "stars_data.csv"
	data = a.read_data(filename)
	data.pop(0)
	random.shuffle(data)
	words = a.frequency_word(data)	
	features = a.create_binary_feature(data,words,6)
	words.append("isPositive")
	words.append("isNegative")
	minsupport = 0.03
	minconf = 0.25
	
	L,support_count = ampriori.frequentItemsetGeneration(features,words,minsupport)
	print len(L[0]) + len(L[1]) + len(L[2])
	
	rules,r = ampriori.ruleGeneration(L,support_count,minconf)
	print len(rules)
	
	rules = sorted(rules.items(),key=operator.itemgetter(1),reverse= True)
	rules = rules[:30]
	for index, rule in enumerate(rules):
		print rule
def metricWorker(fname, sanitized, sens, sup, conn):
    Apriori_results_init = readLargeData(fname)
    S = minSet(readSensitiveSet(sens))
    SS = supersets(S, Apriori_results_init.keys())
    
    r_fd = list(set(Apriori_results_init) - SS)
    Apriori_results = Apriori.Apriori_main(sanitized, sup)[0]

    side_effects = len(r_fd)-len(Apriori_results)
    
    if side_effects<0:
        conn.send((side_effects,0,))
        conn.close()
        return
    else:
##        a1 = 0.
##        a2 = 0.
##        for itemset in convert2frozen_m(apriori(r_fd, target='m', supp = float(0.0))):
##            a1 += 1.0
##            for itemset2 in convert2frozen_m(apriori(Apriori_results.keys(), target='m', supp = float(0.0))):
##                if itemset == itemset2:
##                    a2 += 1.0
##                    
##        Bd_rate = abs(round(float((a1-a2)/a1),2))
        
        SumAll = 0
        AbsDif = 0.0
        for itemset in r_fd:
            SumAll +=  Apriori_results_init[itemset]
            if itemset in Apriori_results:
                AbsDif +=  float(abs(Apriori_results_init[itemset] - Apriori_results[itemset]))
            else:
                AbsDif +=  float(Apriori_results_init[itemset])
                
        if SumAll == 0:
            inls =  round(float(AbsDif), 3)
        else:
            inls =  round(float(AbsDif/SumAll), 3)

        conn.send((side_effects, inls,))
        conn.close()
        return
Example #18
def retrievePatterns(ingredient):
    # For generating local patterns instead of global patterns

    # db = connect(host="localhost", db="sandwiches", user="******", passwd="root")
    # cursor = db.cursor()
    # cursor1 = db.cursor()
    # cursor2 = db.cursor()
    # cursor.execute("""SELECT recipes.recipeId, recipeName FROM ingredients, recipes, ingredientsForRecipe WHERE ingredients.ingredientId = ingredientsForRecipe.ingredientId and recipes.recipeId = ingredientsForRecipe.recipeId and ingredientName = %s """, ingredient)
    # l = []
    # for row in cursor:
    #     cursor1.execute("""SELECT ingredientId FROM ingredientsForRecipe WHERE recipeId = %s""", row[0])
    #     temp = []
    #     for r in cursor1:
    #         # temp.append(int(r[0]))
    #         cursor2.execute("""SELECT ingredientName FROM ingredients WHERE ingredientId = %s""", r[0])
    #         t = cursor2.fetchone()
    #         temp.append(t)
    #     l.append(temp)
    # result, supp = ap.apriori(l, minsupport=0.02)
    return ap.readFromFile()
def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    # Generate candidate rule sets: derive further association rules from the
    # initial itemset.
    # freqSet: a frequent itemset, e.g. [2,3,5]
    # H: list of elements that may appear on the right-hand side of a rule,
    #    e.g. [set([2]), set([3]), set([5])]
    # m: size of the frequent itemsets in H, e.g. 1
    m = len(H[0])
    print('m:')
    print m
    print('freqSet:')
    print freqSet
    # Check whether freqSet is large enough to remove a subset of size m
    if (len(freqSet) > (m + 1)):
        # Generate all distinct (m+1)-combinations from H,
        # e.g. [set([2,3]), set([2,5]), set([3,5])]
        Hmp1 = Apriori.aprioriGen(H, m+1)
        print('Hmp1:')
        print Hmp1
        # Test whether the combinations in Hmp1 can serve as rule right-hand
        # sides, i.e. meet the minimum confidence requirement
        Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)
        # If more than one rule passes, recurse on Hmp1
        if (len(Hmp1) > 1):
            rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)
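# For reference, a sketch of the calcConf helper called above, following the
# classic "Machine Learning in Action" version (an assumption; the snippet's
# own calcConf is not shown):
def calcConf(freqSet, H, supportData, brl, minConf=0.7):
    prunedH = []  # consequents whose rules meet the confidence threshold
    for conseq in H:
        # confidence(A -> B) = support(A | B) / support(A)
        conf = supportData[freqSet] / supportData[freqSet - conseq]
        if conf >= minConf:
            brl.append((freqSet - conseq, conseq, conf))
            prunedH.append(conseq)
    return prunedH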
def metricWorker(fname, sanitized, sens, sup, conn):
    Apriori_results_init = readLargeData(fname)
    S = minSet(readSensitiveSet(sens))
    SS = supersets(S, Apriori_results_init.keys())

    r_fd = list(set(Apriori_results_init) - SS)
    Apriori_results = Apriori.Apriori_main(sanitized, sup)[0]
    side_effects = len(r_fd) - len(Apriori_results)

    if side_effects < 0:
        conn.send((
            side_effects,
            0,
        ))
        conn.close()
        return
    else:

        SumAll = 0
        AbsDif = 0.0
        for itemset in r_fd:
            SumAll += Apriori_results_init[itemset]
            if itemset in Apriori_results:
                AbsDif += float(
                    abs(Apriori_results_init[itemset] -
                        Apriori_results[itemset]))
            else:
                AbsDif += float(Apriori_results_init[itemset])

        if SumAll == 0:
            inls = round(float(AbsDif), 3)
        else:
            inls = round(float(AbsDif / SumAll), 3)

        conn.send((
            side_effects,
            inls,
        ))
        conn.close()
        return
Example #21
import Apriori

dataSet = Apriori.loadDataSet()
# C1 = Apriori.createC1(dataSet)
# D = map(set, dataSet)
# L1, suppData = Apriori.scanD(D, C1, 0.5)
L, suppData = Apriori.apriori(dataSet, minSupport=0.5)
rules = Apriori.generateRules(L, suppData, minConf=0.7)

'''
Apriori.mushTest("E:/TestDatas/MachineLearningInAction/Ch11/mushroom.dat")
'''
Example #22
    #buckets = LSH.MyLSHashing(signature_ls,bandNum,s,r)
    # buckets = LSH.MylocalitySensitiveHashing(signature_ls,bandNum,s,r)

    buckets = LSH.localitySensitiveHashing(signature_ls, bandNum, s, r)
    I_set_list = LSH.buckets_sort(buckets)

    for ii in I_set_list:
        print(ii)

    buckets_dict = LSH.hash_buckets(I_set_list)

    for i in buckets_dict:
        print(i, buckets_dict[i])

    #Apriori Method
    dataset = Apriori.loadDataSet(buckets_dict)
    C1 = Apriori.createC1(dataset)
    print("C1:")
    print(C1)
    D = list(map(set, dataset))
    print("D:")
    print(D)
    #L1, supportData0 = Apriori.scanD(D,C1, minSupport = 0.5)
    L, supportData = Apriori.apriori(dataset, minSupport=0.5)
    rules = Apriori.generateRules(L, supportData)
    print("*" * 100)
    print(L)
    print("*" * 100)
    print(supportData)
    print("*" * 100)
    print(rules)
Example #23
import Apriori

transSet={}
transSet['T100']=(1,2,5)
transSet['T200']=(2,4)
transSet['T300']=(2,3)
transSet['T400']=(1,2,4)
transSet['T500']=(1,3)
transSet['T600']=(2,3)
transSet['T700']=(1,3)
transSet['T800']=(1,2,3,5)
transSet['T900']=(1,2,3)

frequentSets=Apriori.apriori(transSet, 2)

rules=Apriori.generateAssociationRules(frequentSets,0.4,transSet)
for rule in rules:
    prioriSet,inferredSet,confidence=rule
    print prioriSet,'=>',inferredSet,' confidence=',confidence
import Apriori
import AuxiliaryFunctions
dataSet = AuxiliaryFunctions.loadDataSet()
print dataSet
C1 = AuxiliaryFunctions.createC1(dataSet)
print('C1:')
print C1
D = map(set, dataSet)
print('D:')
print D
L1, supportData1 = AuxiliaryFunctions.scanD(D, C1, 0.5)
print('L1:')
print L1
print('supportData1:')
print supportData1
L, supportData = Apriori.apriori(dataSet, 0.5)
print('Frequent itemsets:')
print('k=1:')
print L[0]
print('k=2:')
print L[1]
print('k=3:')
print L[2]
print('Support:')
print(supportData)
print('Candidate sets:')
print('C1:')
print C1
print('C2:')
print Apriori.aprioriGen(L[0], 2)
print('C3:')
print Apriori.aprioriGen(L[1], 3)
Example #25
                        default=False)

    args = parser.parse_args()

    print("Parameter values:")
    print("Number of k-tuples: ", args.k_tuples)
    print("Support: ", args.support)
    print("Confidence threshold: ", args.confidence)
    print("Plotting: ", args.plot)

    k_tuples = args.k_tuples
    support = args.support
    confidence = args.confidence

    filename = "../data/T10I4D100K.dat"
    apriori = Apriori(filename, support)
    apriori.load_dataset()
    baskets = apriori.baskets
    times = []

    start = time.time()
    print("\nStart Apriori Pipeline\n")

    for i in range(k_tuples):
        if (i == 0):
            start_stage = time.time()

            apriori.generate_candidates_ck(baskets, None, 1)
            apriori.filter_candidates(apriori.candidates[i], support)

            end_stage = time.time()
Example #26
#-*-coding:utf8-*-

import pandas as pd
from apriori import *
import sys
import Apriori

reload(sys)
sys.setdefaultencoding('utf-8')

inputfile = 'G://PyCharm//data//menu_orders.xls'
outputfile = 'apriori_rules.csv' # output file
data = pd.read_excel(inputfile)

print(u'\nConverting the raw data to a 0-1 matrix...')
ct = lambda x: pd.Series(1, index=x[pd.notnull(x)])
b = map(ct, data.as_matrix())
data = pd.DataFrame(list(b)).fillna(0)
print(u'\nConversion complete.')
del b

support = 0.2 # minimum support
confidence = 0.5 # minimum confidence
ms = '---'

Apriori.find_rule(data, support, confidence, ms).to_csv(outputfile) # save the results
Example #27
support = 50
threshold = 70

createCombiner = (lambda line: [line])
mergeValue = (lambda exist, new: exist + [new])
mergeCombiner = (lambda exist1, exist2: exist1 + exist2)

userRDD = userRDD.filter(lambda line: line != header) \
    .map(lambda line: (line.split(',')[0], line.split(',')[1])) \
    .combineByKey(createCombiner, mergeValue, mergeCombiner) \
    .filter(lambda line: len(line[1]) >= threshold)

# userRDD.foreach(print)

businessRDD = userRDD.flatMap(lambda line: A.convert(line)) \
    .groupByKey() \
    .mapValues(set)

business = businessRDD.collect()
businessdict = {item[0]: item[1] for item in business}

numOfPar = userRDD.getNumPartitions()

candidates = userRDD.mapPartitions(lambda partition: A.Apriori(partition, support / numOfPar)) \
    .reduceByKey(lambda a, b: a | b) \
    .collect()

candidates = sorted([(key, sorted([list(sets) for sets in value]))
                     for key, value in candidates])
Example #28
binary_os = operating_sys_cleaning_binary(cleaned_os) # for Kmeans and KNN 


#make arrays of the selected features with one person per array for apriori
apriori_features = []

for i in xrange(len(snow_letter)):
	temp = []
	temp.append(string_cleaned_prog_skills[i])
	temp.append(snow_letter[i])
	temp.append(cleaned_os[i])
	apriori_features.append(temp)


#Calling Apriori
prunedlen3 = Apriori.apriori(apriori_features)
print "*" *45
print "Apriori"
print "*" * 45
print "Longest frequent pattern:", prunedlen3

allpossible = Apriori.possible_comb(prunedlen3)

permutated = Apriori.permutations(allpossible)

Rules = Apriori.confidence(permutated, apriori_features)

print "Rules from longest frequent pattern and their confidence:", Rules

#make arrays of the selected features with one person per array for Kmeans and KNN
num_features = []
# coding:utf8
'''
Created on 2018-02-22
@author: XuXianda

'''
import CreateRules
import Apriori
import AuxiliaryFunctions
dataSet = AuxiliaryFunctions.loadDataSet()
L, supportData = Apriori.apriori(dataSet, minSupport=0.5)
print('L:')
print L
print('supportData:')
print supportData
rules = CreateRules.generateRules(L, supportData, minConf=0.5)
print('rule1:')
print rules
#rules2=CreateRules.generateRules(L,supportData,minConf=0.5)
#print('rule2:')
#print rules2
Example #30
import json, sys
import Apriori
import Parser


if __name__ == '__main__':

	if len(sys.argv) < 4:
		print "incorrect input format"
		print "Correct Format: <CSV FILE> <min_sup> <min_conf>"

	else:

		csvFile = sys.argv[1]
		min_sup = float(sys.argv[2])
		min_conf = float(sys.argv[3])

		print "Finding Associated Pair..."

		# data_set is a list of lists, each item is a row in csv file
		# attributes : DBA, BORO, CUSINE DESCRIPTION, INSPECTION DATE, ACTION, VIOLATION CODE,
		# VIOLATION DESCRIPTION, CRITICAL FLAG, SCORE, GRADE, GRADE DATE, RECORD DATE, INSPECTION TYPE
		data_set = []
		data_set = Parser.parse(csvFile)

		frequentSets = Apriori.getFrequentSets(data_set, min_sup)
		# frequentSets = []

		associatedRules = Apriori.getAssociatedRulesWith(data_set, frequentSets, min_sup, min_conf)

		number_of_rows = len(data_set)
		Apriori.exportWith(frequentSets, associatedRules, min_sup, min_conf, number_of_rows)
Example #31
def rule_generate(request):
    operationtype=str(request.POST.get("OperationType"))
    global datagridpagesize
    global TotalFileList
    global mylist
    global patterns
    global lengthofmylist,pages,nextpage,previouspage,pagelist,ruleslist,TopList,interests
    global protocol
    global label_list

    lengthofmylist=len(TotalFileList)

    pages=int(lengthofmylist / datagridpagesize) + (lengthofmylist % datagridpagesize > 0)

    nextpage=pages
    previouspage=0
    pagelist=[]
    ruleslist=[]
    TopList=[]
    interests=[]
    templist=[]

    try:
        for tab in range(datagridpagesize):
            TopList.append(TotalFileList[tab+(gloffset-1)*datagridpagesize])
    except:
        pass
    for tab in range(pages):
        pagelist.append(str(tab+1))




    protocol=str(request.POST.get("strProtocol")).strip()
    label_list=str(request.POST.get("strAtributes")).strip().split(',')
    #print("@@@@@@@@@@@@@@@@@@@@@@@@@")
    #print(label_list)
    rulesfolder="inputrules"

    #starttime = time.time()
    #timestart = time.clock()

    if not os.path.isdir(ProjectPath+'/'+rulesfolder):
        os.makedirs(ProjectPath+'/'+rulesfolder)
    if not os.path.isdir(str(ProjectPath+'/results')):
        os.makedirs(str(ProjectPath+'/results'))

    for each in interests:
        #InputForRules.GenerateInputForRules(eachgroup)
        with open(ProjectPath+'/'+rulesfolder+"/input_"+each,"w")as fout:
            pass
        with open(ProjectPath+"/results/RulesFor_"+each,"w")as fout:
            pass
    for each in TotalFileList:
        templist.append(each.filename)
    if operationtype=="Association Rule":
        para=Parameter(float(request.POST.get("paraMinSupp")),float(request.POST.get("paraMinCond"))\
                   ,float(request.POST.get("paraMinLift")),float(request.POST.get("paraMinKulc")),float(request.POST.get("paraThreshIR")))
        patterns=str(request.POST.get("strPattern"))
        interests.append(patterns)


        for each in interests:
            #print(each+" is processing......")
            InPutForRulesVersion2.MainFunc(templist,os.path.join(ProjectPath,FilesStoreFolder),protocol,each.strip(),rulesfolder,label_list)
            a = Apriori(para.min_supp,ProjectPath+'/'+rulesfolder+"/input_"+each.strip())
            ls = a.do()
            rules = a.ralationRules(ls.get(ls.size()).items,para.min_cond,para.min_lift,para.min_kulc,para.thresh_ir)
            rule_count=0
            for rule in rules:
                rule_count += 1
                ruleslist.append(str(rule_count)+'th'+str(rule))
            with open(ProjectPath+"/results/RulesFor_"+each,"a")as fout:
                fout.write("min_support is "+str(para.min_supp)+".  min_confidence is "+str(para.min_cond)+"\n")
                for rule in rules:
                    fout.write(str(rule)+'\n')
                fout.write("---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n")
        return rule_show(request)
    elif operationtype=="Bayes":
        O=[]
        Result,TotalIdf,E,D1,D2=BayesEntropy2.Main(templist,os.path.join(ProjectPath,FilesStoreFolder),protocol,label_list)
        for eachk,eachv in D1.items():
            O.append("***********************************\nThe commander is *************************:    "+eachk+'\n')
            O.append("The Entropy is    :    "+str(E[eachk])+'\n')
            O.append("The Total appear times is       :   "+str(sum(D2[eachk]))+'.\n')
            for e2,k2 in eachv.items():
                O.append("The \""+ e2 + "\" appears "+ str(k2)+" times and appears in "+str(TotalIdf[e2])+" files. \n")
                O.append("The result for \""+e2+"\"    is         :    "+str(Result[e2])+'!\n')

        ruleslist=O

        return rule_show(request)
# -*- coding: utf-8 -*-
#-------------------------------------------------------------------------------
#Name    :
#Author : Xueshijun
#MailTo : [email protected]    / [email protected]
#QQ     : 324858038
#Blog   : http://blog.csdn.net/xueshijun666
#Created on Tue Feb 23 11:08:20 2016
#Version: 1.0
#-------------------------------------------------------------------------------
'''mushroom.dat'''
import Apriori  as ap
mushDataSet = [line.split() for line in open('mushroom.dat').readlines()]
L,supportData = ap.apriori(mushDataSet,minSupport=0.3)

for item in L[1]:
    if item.intersection('2'):print item

for item in L[3]:
    if item.intersection('2'):print item
if __name__ == '__main__':
    args = main(sys.argv[1:])

    min_sup = args[1][0]
    min_conf = args[1][1]
    input_filename = args[1][2]
    output_filename = args[1][3]

    L = [[]]
    C = [[], []]
    C_support = [[]]

    data_set = load_data(input_filename)
    data_count = float(len(data_set))
    C_support.append(support_count_C1(data_set))
    L.append(Apriori_L1(C_support[1], data_count, min_sup))

    # print(L[1])
    k = 1

    while L[k]:
        k += 1
        C.append(Apriori_gen(L[k - 1], k))
        C_support.append(support_count(data_set, C[k]))
        L.append(Apriori(C_support[k], data_count, min_sup))

    rule = []
    rule = generate_Rule(L, C_support, min_sup=min_sup, min_conf=min_conf)

    association_rule_output(rule, output_filename)
Example #34
    def GET(self, cnpj):
        cnpjs = Banco().searchCNPJS(cnpj)
        regras = Apriori().extractRules(cnpjs)
        cnpjs = Banco().extractCNPJs(regras)

        return Banco().formatCNPJS(regras, cnpj)
Example #35
# The raw data is an inverted (transaction-item) list

# In[ ]:

inverted = pd.read_csv(
    r'D:\Python_Training\script_Python\15Association\bank.csv', encoding='gbk')
inverted.head()

# ## Data conversion

# Convert the inverted list into the corresponding two-dimensional list of transactions

# In[ ]:

idataset = apri.dataconvert(inverted,
                            tidvar='CSR_ID',
                            itemvar='PROD',
                            data_type='inverted')
idataset[:5]
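# In[ ]:
# For intuition, a rough pandas equivalent of the conversion above (an
# approximation; apri.dataconvert's exact output format may differ):
idataset_approx = inverted.groupby('CSR_ID')['PROD'].apply(list).tolist()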

# ## Association rules

# Parameter descriptions:
#
# + minSupport: minimum support threshold
# + minConf: minimum confidence threshold
# + minlen: minimum rule length
# + maxlen: maximum rule length

# The lower minSupport or minConf is set, the more rules are produced and the
# heavier the computation.
#
# Parameters used here: minSupport=0.05, minConf=0.5, minlen=1, maxlen=10
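# In[ ]:
# A hypothetical call with those parameter values (apri.arules with this
# signature is assumed from the other examples on this page):
rules = apri.arules(idataset, minSupport=0.05, minConf=0.5, minlen=1, maxlen=10)
rules.head()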
Example #36
    if v.find("BROWN") != -1:
        return "brown"
    return v


df.FavSQLServ = df.FavSQLServ.map(sql_standardize)
df.FavColor = df.FavColor.map(color_standardize)
df.OS = df.OS.map(os_standardize)

# Fill missing. (Not for Apriori)
#
#df =  df.fillna(df.mean())

#Pre Apriori:
#Combine attributes with their values, so we mine on key/values that are equal, not just same values.
r = []
for i in range(len(df)):
    f = set([])
    for j in range(len(df.iloc[i])):
        a = str(df.iloc[i].index[j])
        b = str(df.iloc[i][j])
        c = a + ": " + b
        f.add(c)
    r += [f]

print "The output reads: 20 of 66 students has these values alike:"
a = Apriori.Apriori()
for i in range(20):
    print str(20 - i)
    print a.find(r, 20 - i)
Example #37
        Lk, supk = ap.genFreqSet(baskets, Ck, minSupport)
        L.append(Lk)
        sup.append(supk)
        if k == maxK:
            break
        k += 1
        Ck = ap.genCk(L[k - 1], k)
    return sup, bitmap1, ap.genRules(L, sup, minConfidence)


if __name__ == '__main__':
    minConf = 0.5
    minSup = 0.005
    maxk = 4
    bucketSize = 4999
    idMap, dataSet = ap.loadData("./src/Groceries.csv")
    itemBaskets = list(map(frozenset, dataSet))

    def hashFuncForPair1(pair: set) -> int:
        # sha1 = hashlib.sha1()
        hashVal = 1
        for item in pair:
            # sha1.update(idMap[item].encode("utf-8"))
            hashVal *= hash(idMap[item])
            # hashVal += hash(idMap[item])
        # return int(sha1.hexdigest(), 16)
        return hashVal

    def hashFuncForPair2(pair: set) -> int:
        # A second, independent hash: SHA-1 over the concatenated item names.
        sha1 = hashlib.sha1()
        for item in pair:
            sha1.update(idMap[item].encode("utf-8"))
        return int(sha1.hexdigest(), 16)
Example #38
'''
# Mine the frequent itemsets and association rules
itemsets, rules = apriori(data, min_support=0.5, min_confidence=0.1)
print(itemsets)
print()
print(rules)
'''
'''
The advantage of this package is that it is simple to use, and efficient-apriori
puts the items of every transaction into a set before computing, without
considering their order. In practice, items in the same shopping basket need
not be considered in purchase order, and other Apriori implementations can
produce incorrect frequent-itemset results precisely because they take order
into account. That is why the efficient-apriori package is used here.
'''
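# In[]:
# A self-contained efficient-apriori example (pip install efficient-apriori);
# the transactions here are made up for illustration:
from efficient_apriori import apriori as ea_apriori

transactions = [('bread', 'milk'), ('bread', 'beer'), ('milk', 'beer'),
                ('bread', 'milk', 'beer')]
itemsets, rules = ea_apriori(transactions, min_support=0.5, min_confidence=0.1)
print(itemsets)
print(rules)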

# In[]:
# 2. Custom library:
ress = apri.arules(data, minSupport=0.5, minConf=0.1, minlen=1,
                   maxlen=4)  # DataFrame
print(type(ress))

# In[]:
# *****************************************************************************

# In[]:
# II. Example: database-format dataset:
inverted = pd.read_csv("Transactions.csv")
inverted.head()


# In[]:
def encode_unit(x):
    if x <= 0:
        return False
Example #39
#!usr/bin/python
# coding:utf-8
import Apriori

dataSet = Apriori.loadDataSet()
print dataSet
# c1 = Apriori.createC1(dataSet)
# print c1
# D = map(set, dataSet)
# L1, supportData0 = Apriori.scanD(D, c1, 0.5)
# print L1
# print supportData0
L, supportData = Apriori.apriori(dataSet)
print L
print supportData
print '\n'
rules = Apriori.generateRules(L, supportData, 0.7)
print rules

print dataSet[0][:1]
def test_apriori():
    data_set = Apriori.load_data_set()
    l, support_data = Apriori.apriori(data_set, 0.5)
    print l
    print support_data
Example #41
def test_apriori():
    data_set = Apriori.load_data_set()
    l, support_data = Apriori.apriori(data_set, 0.5)
    print l
    print support_data
Example #42
__author__ = 'bigship'

import Apriori

def split(str, cha):
    # Return the characters of `str` that are not delimiter characters.
    # Note: this yields one item per character, which suits the
    # single-letter mushroom encoding used below.
    retList = []
    for x in str:
        if x != cha[0] and x != cha[1]:
            retList.append(x)
    return retList
mushDataSet = []
data = open('mushroom.txt')
cha = [',','\n']
for line in data.readlines():
    mushDataSet.append(split(line,cha))
data.close()
smallDataSet = mushDataSet[:10]
print smallDataSet
L , supportData = Apriori.apriori(smallDataSet,0.7)
for item in L[4]:
    if item.intersection('e'):
        print item
result = Apriori.generateRules(L,supportData,0.85)
# The raw data is an inverted (transaction-item) list

# In[ ]:
#Transactions --- sales data for bicycles and related items

inverted = pd.read_csv(r'D:\Python_book\15Association\Transactions.csv')
inverted.head()

# ## Data conversion

# Convert the inverted list into the corresponding two-dimensional list of transactions

# In[ ]:

idataset = apri.dataconvert(inverted,
                            tidvar='OrderNumber',
                            itemvar='Model',
                            data_type='inverted')
idataset[:5]

# ## Association rules

# Parameter descriptions:
#
# + minSupport: minimum support threshold
# + minConf: minimum confidence threshold
# + minlen: minimum rule length
# + maxlen: maximum rule length

# The lower minSupport or minConf is set, the more rules are produced and the
# heavier the computation.
#
# Parameters used here: minSupport=0.05, minConf=0.5, minlen=1, maxlen=10
        # print date[0]
        # print start_date

        #print bigRise[i]



        if len(bigRise) == 23:
            if bigRise[day] == '1':
                calcu_list.append(code[day])
        #else:
        #    print code[0] + ' does not have enough days'
        #print calcu_list

    sum_list.append(calcu_list)
    #print sum_list


print sum_list
print len(sum_list)

minSupport = 0.1
minConf = 0.1

L,SupportData = Apriori.apriori(sum_list,minSupport)
rules = Apriori.generateRules(L,SupportData,minConf)
print rules

time_end=time.time()
print u"总运行时间为:"
print time_end-time_start