def test_apriori():
    data = ("a,b,c,d,e,f\n"
            "g,h,i,j,k,l\n"
            "z,x\n"
            "z,x\n"
            "z,x,y\n"
            "z,x,y,i\n")
    expectedItemSets = {
        ItemSet("i"): 2 / 6,
        ItemSet("z"): 4 / 6,
        ItemSet("x"): 4 / 6,
        ItemSet("y"): 2 / 6,
        ItemSet("xz"): 4 / 6,
        ItemSet("yz"): 2 / 6,
        ItemSet("xy"): 2 / 6,
        ItemSet("xyz"): 2 / 6,
    }

    index = InvertedIndex()
    index.load(data)
    itemsets = apriori(index, 2 / 6)
    assert set(expectedItemSets.keys()) == set(itemsets)
    for itemset in itemsets:
        assert expectedItemSets[itemset] == index.support(itemset)

    print("Itemsets={}".format([i for i in itemsets if len(i) > 1]))

    # (antecedent, consequent, confidence, lift, support)
    expectedRules = {
        (frozenset({Item("x"), Item("y")}), frozenset({Item("z")}), 1, 1.5, 1 / 3),
        (frozenset({Item("x")}), frozenset({Item("y")}), 0.5, 1.5, 1 / 3),
        (frozenset({Item("x")}), frozenset({Item("z"), Item("y")}), 0.5, 1.5, 1 / 3),
        (frozenset({Item("x")}), frozenset({Item("z")}), 1, 1.5, 2 / 3),
        (frozenset({Item("y")}), frozenset({Item("x")}), 1, 1.5, 1 / 3),
        (frozenset({Item("y")}), frozenset({Item("z"), Item("x")}), 1, 1.5, 1 / 3),
        (frozenset({Item("y")}), frozenset({Item("z")}), 1, 1.5, 1 / 3),
        (frozenset({Item("z"), Item("x")}), frozenset({Item("y")}), 0.5, 1.5, 1 / 3),
        (frozenset({Item("z"), Item("y")}), frozenset({Item("x")}), 1, 1.5, 1 / 3),
        (frozenset({Item("z")}), frozenset({Item("x"), Item("y")}), 0.5, 1.5, 1 / 3),
        (frozenset({Item("z")}), frozenset({Item("x")}), 1, 1.5, 2 / 3),
        (frozenset({Item("z")}), frozenset({Item("y")}), 0.5, 1.5, 1 / 3),
    }

    rules = set(generate_rules(itemsets, 0, 0, index))
    for (antecedent, consequent, confidence, lift, support) in rules:
        print("{}, {} conf={:.4f}, {:.4f}, {:.4f}".format(
            antecedent, consequent, confidence, lift, support))
    assert rules == expectedRules
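# --- Illustration (not part of the repo): the support/confidence/lift
# arithmetic behind the expected values above, checked by hand for the rule
# x -> z over the same six transactions. Helper names here are hypothetical.
def _check_rule_arithmetic():
    transactions = [
        {"a", "b", "c", "d", "e", "f"},
        {"g", "h", "i", "j", "k", "l"},
        {"z", "x"},
        {"z", "x"},
        {"z", "x", "y"},
        {"z", "x", "y", "i"},
    ]
    n = len(transactions)

    def support(items):
        # Fraction of transactions containing every item in `items`.
        return sum(1 for t in transactions if items <= t) / n

    antecedent, consequent = {"x"}, {"z"}
    confidence = support(antecedent | consequent) / support(antecedent)
    lift = confidence / support(consequent)
    # Matches the (conf=1, lift=1.5, support=2/3) entry for x -> z above.
    assert confidence == 1.0 and lift == 1.5 and support({"x", "z"}) == 4 / 6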
def main():
    parser = ArgumentParser(
        description="Association rule data mining in Python")
    parser.add_argument("--input", dest="input", required=True)
    parser.add_argument("--output", dest="output", required=True)
    parser.add_argument("--min-confidence", dest="min_confidence",
                        type=float_between_0_and_1, required=True)
    parser.add_argument("--min-support", dest="min_support",
                        type=float_between_0_and_1, required=True)
    parser.add_argument("--min-lift", dest="min_lift",
                        type=float_gteq_1, required=True)
    args = parser.parse_args()

    program_start = time.time()
    start = program_start
    print("ARMPY - Association Rule Mining using Python.")
    print("Input file: {}".format(args.input))
    print("Output file: {}".format(args.output))
    print("Minimum support: {}".format(args.min_support))
    print("Minimum confidence: {}".format(args.min_confidence))
    print("Minimum lift: {}".format(args.min_lift))

    print("Generating frequent itemsets using FPGrowth...", flush=True)
    reader = DatasetReader(args.input)
    (itemsets, itemset_counts, num_transactions) = mine_fp_tree(
        reader, args.min_support)
    duration = time.time() - start
    print("FPGrowth mined {} items in {:.2f} seconds".format(
        len(itemsets), duration), flush=True)

    start = time.time()
    rules = list(
        generate_rules(itemsets, itemset_counts, num_transactions,
                       args.min_confidence, args.min_lift))
    duration = time.time() - start
    print("Generated {} rules in {:.2f} seconds".format(len(rules), duration),
          flush=True)

    start = time.time()
    with open(args.output, "w") as f:
        f.write("Antecedent->Consequent,Confidence,Lift,Support\n")
        for (antecedent, consequent, confidence, lift, support) in rules:
            f.write("{} -> {},{:.4f},{:.4f},{:.4f}\n".format(
                set_to_string(antecedent), set_to_string(consequent),
                confidence, lift, support))
    duration = time.time() - start
    print("Wrote rules to disk in {:.2f} seconds".format(duration), flush=True)

    duration = time.time() - program_start
    print("Total runtime {:.2f} seconds".format(duration))
    return 0
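# --- Sketch (assumption): float_between_0_and_1 and float_gteq_1 are not
# shown in this excerpt. Plausible argparse `type` callables, inferred only
# from their names and how they are used above:
from argparse import ArgumentTypeError


def float_between_0_and_1(value):
    # Parse a CLI string as a float and require it to lie in [0, 1].
    f = float(value)
    if not 0.0 <= f <= 1.0:
        raise ArgumentTypeError("{} is not in the range [0,1]".format(value))
    return f


def float_gteq_1(value):
    # Parse a CLI string as a float and require it to be at least 1.
    f = float(value)
    if f < 1.0:
        raise ArgumentTypeError("{} is not >= 1".format(value))
    return f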
def main():
    args = parse_args()
    program_start = time.time()

    if args.trace_malloc:
        tracemalloc.start()

    print("Virtual Change/Drift Detection - Association Rule Mining using Python.")
    print("Drift Algorithm: {}".format(args.drift_algorithm))
    print("Input file: {}".format(args.input))
    print("Output file prefix: {}".format(args.output))
    print("Training window size: {}".format(args.training_window_size))
    print("Minimum confidence: {}".format(args.min_confidence))
    print("Minimum support: {}".format(args.min_support))
    print("Minimum lift: {}".format(args.min_lift))
    if args.fixed_drift_confidence is not None:
        print("Fixed drift confidence of: {}".format(
            args.fixed_drift_confidence))
    print("Tracing memory allocations: {}".format(args.trace_malloc))
    print("Save rules: {}".format(args.save_rules))
    print("Generating maximal itemsets: {}".format(args.maximal_itemsets))

    reader = iter(DatasetReader(args.input))
    transaction_num = 0
    end_of_last_window = 0
    cohort_num = 1
    volatility_detector = make_volatility_detector(args)

    while True:
        window = take(args.training_window_size, reader)
        if len(window) == 0:
            break

        print("")
        print("Mining window [{},{}]".format(transaction_num,
                                             transaction_num + len(window)))
        end_of_last_window = transaction_num + len(window)
        transaction_num += len(window)

        print("Running FP-Growth...", flush=True)
        start = time.time()
        (itemsets, itemset_counts, num_transactions) = mine_fp_tree(
            window, args.min_support, args.maximal_itemsets)
        assert num_transactions == len(window)
        duration = time.time() - start
        print("FPGrowth mined {} items in {:.2f} seconds".format(
            len(itemsets), duration))

        print("Generating rules...", flush=True)
        start = time.time()
        rules = list(
            generate_rules(itemsets, itemset_counts, num_transactions,
                           args.min_confidence, args.min_lift))
        duration = time.time() - start
        print("Generated {} rules in {:.2f} seconds".format(
            len(rules), duration), flush=True)

        if len(rules) == 0:
            print("No rules; just noise. Skipping change detection.")
            print("Consider increasing training window size or lowering "
                  "minsup/conf.")
            continue

        if args.save_rules:
            start = time.time()
            output_filename = args.output + "." + str(cohort_num)
            write_rules_to_file(rules, output_filename)
            duration = time.time() - start
            print("Wrote rules for cohort {} to file {} in {:.2f} seconds".format(
                cohort_num, output_filename, duration), flush=True)
            cohort_num += 1

        drift_detector = make_drift_detector(args, volatility_detector)
        drift_detector.train(window, rules)

        # Read transactions until a drift is detected.
        for transaction in reader:
            transaction_num += 1
            drift = drift_detector.check_for_drift(transaction, transaction_num)
            if drift is not None:
                print("Detected drift of type {} at transaction {}, {} after "
                      "end of training window".format(
                          drift.drift_type, transaction_num,
                          transaction_num - end_of_last_window))
                if drift.hellinger_value is not None:
                    print("Hellinger value: {}, confidence interval: {} ± {} "
                          "([{},{}])".format(
                              drift.hellinger_value, drift.mean,
                              drift.confidence,
                              drift.mean - drift.confidence,
                              drift.mean + drift.confidence))
                # Record the drift in the volatility detector. This is used
                # inside the drift detector to help determine how large a
                # confidence interval is required when detecting drifts.
                if volatility_detector is not None:
                    volatility_detector.add(transaction_num)
                # Break out of the inner loop; we'll jump back up to the top
                # and mine a new training window.
                break

        if len(window) < args.training_window_size:
            break

    print("\nEnd of stream\n")

    duration = time.time() - program_start
    print("Total runtime {:.2f} seconds".format(duration))

    if args.trace_malloc:
        (_, peak_memory) = tracemalloc.get_traced_memory()
        tracemalloc_memory = tracemalloc.get_tracemalloc_memory()
        print("Peak memory usage: {:.3f} MB".format(peak_memory / 10**6))
        print("tracemalloc overhead: {:.3f} MB".format(
            tracemalloc_memory / 10**6))
        print("Peak memory usage minus tracemalloc overhead: {:.3f} MB".format(
            (peak_memory - tracemalloc_memory) / 10**6))
        snapshot = tracemalloc.take_snapshot()
        bytes_allocated = sum(x.size for x in snapshot.traces)
        print("Total traced memory allocated: {:.3f} MB".format(
            bytes_allocated / 10**6))
        tracemalloc.stop()

    return 0
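# --- Sketch (assumption): `take` is not defined in this excerpt. The standard
# itertools recipe below matches how it is used above: it returns up to n
# transactions, and a short (or empty) list at the end of the stream.
from itertools import islice


def take(n, iterable):
    # Return the first n items of the iterable as a list.
    return list(islice(iterable, n))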
def test_apriori():
    data = ("a,b,c,d,e,f\n"
            "g,h,i,j,k,l\n"
            "z,x\n"
            "z,x\n"
            "z,x,y\n"
            "z,x,y,i\n")

    expectedItemSets = {
        ItemSet("i"): 2 / 6,
        ItemSet("z"): 4 / 6,
        ItemSet("x"): 4 / 6,
        ItemSet("y"): 2 / 6,
        ItemSet("xz"): 4 / 6,
        ItemSet("yz"): 2 / 6,
        ItemSet("xy"): 2 / 6,
        ItemSet("xyz"): 2 / 6,
    }

    index = InvertedIndex()
    index.load(data)
    itemsets = apriori(index, 2 / 6)
    assert len(itemsets) == len(expectedItemSets)
    for itemset in itemsets:
        assert frozenset(itemset) in expectedItemSets
    for itemset in itemsets:
        assert expectedItemSets[frozenset(itemset)] == index.support(itemset)

    print("Itemsets={}".format([i for i in itemsets if len(i) > 1]))

    def itemize(a):
        return list(map(item_id, a))

    # (antecedent, consequent, confidence, lift, support)
    rx = [
        (['y'], ['x'], 1.0, 1.5, 1 / 3),
        (['x'], ['y'], 0.5, 1.5, 1 / 3),
        (['y'], ['z'], 1.0, 1.5, 1 / 3),
        (['z'], ['y'], 0.5, 1.5, 1 / 3),
        (['x'], ['z'], 1.0, 1.5, 2 / 3),
        (['z'], ['x'], 1.0, 1.5, 2 / 3),
        (['x', 'y'], ['z'], 1.0, 1.5, 1 / 3),
        (['z', 'y'], ['x'], 1.0, 1.5, 1 / 3),
        (['z', 'x'], ['y'], 0.5, 1.5, 1 / 3),
        (['y'], ['z', 'x'], 1.0, 1.5, 1 / 3),
        (['x'], ['z', 'y'], 0.5, 1.5, 1 / 3),
        (['z'], ['x', 'y'], 0.5, 1.5, 1 / 3),
    ]
    expectedRules = [(itemize(a), itemize(c), conf, lift, supp)
                     for (a, c, conf, lift, supp) in rx]

    itemset_counts = {tuple(i): index.count(i) for i in itemsets}
    rules = list(generate_rules(
        itemsets, itemset_counts, index.num_transactions, 0, 0))

    def deitemize(a):
        return list(map(item_str, a))

    p = [(deitemize(a), deitemize(c), conf, lift, supp)
         for (a, c, conf, lift, supp) in rules]
    print("rules")
    print(p)
    for (antecedent, consequent, confidence, lift, support) in rules:
        print("{}, {} conf={:.4f}, {:.4f}, {:.4f}".format(
            antecedent, consequent, confidence, lift, support))

    assert len(rules) == len(expectedRules)
    for expected in expectedRules:
        assert expected in rules
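# --- Sketch (assumption): item_id and item_str are not defined in this
# excerpt. The test only needs them to map item strings to integer ids and
# back; a minimal hypothetical interning table would behave like this:
_item_ids = {}
_item_strs = []


def item_id(s):
    # Intern the item string, assigning the next free integer id.
    if s not in _item_ids:
        _item_ids[s] = len(_item_strs)
        _item_strs.append(s)
    return _item_ids[s]


def item_str(i):
    # Inverse of item_id: recover the original item string.
    return _item_strs[i]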