Example #1
0
def test_apriori():
    """Mine frequent itemsets with apriori at minsup 2/6 over a small
    transaction set, then verify both the itemset supports and the full
    rule set produced by generate_rules with no confidence/lift cutoff.
    """
    data = ("a,b,c,d,e,f\n"
            "g,h,i,j,k,l\n"
            "z,x\n"
            "z,x\n"
            "z,x,y\n"
            "z,x,y,i\n")

    # Every itemset expected at minsup 2/6, mapped to its exact support.
    expected_supports = {
        ItemSet("i"): 2 / 6,
        ItemSet("z"): 4 / 6,
        ItemSet("x"): 4 / 6,
        ItemSet("y"): 2 / 6,
        ItemSet("xz"): 4 / 6,
        ItemSet("yz"): 2 / 6,
        ItemSet("xy"): 2 / 6,
        ItemSet("xyz"): 2 / 6,
    }

    inverted_index = InvertedIndex()
    inverted_index.load(data)
    mined = apriori(inverted_index, 2 / 6)

    # Same itemsets, and each with the support the index reports.
    assert set(mined) == set(expected_supports.keys())
    for candidate in mined:
        assert inverted_index.support(candidate) == expected_supports[candidate]

    print("Itemsets={}".format([i for i in mined if len(i) > 1]))

    # (antecedent, consequent, confidence, lift, support)
    expectedRules = {
        (frozenset({Item("x"),
                    Item("y")}), frozenset({Item("z")}), 1, 1.5, 1 / 3),
        (frozenset({Item("x")}), frozenset({Item("y")}), 0.5, 1.5, 1 / 3),
        (frozenset({Item("x")}), frozenset({Item("z"),
                                            Item("y")}), 0.5, 1.5, 1 / 3),
        (frozenset({Item("x")}), frozenset({Item("z")}), 1, 1.5, 2 / 3),
        (frozenset({Item("y")}), frozenset({Item("x")}), 1, 1.5, 1 / 3),
        (frozenset({Item("y")}), frozenset({Item("z"),
                                            Item("x")}), 1, 1.5, 1 / 3),
        (frozenset({Item("y")}), frozenset({Item("z")}), 1, 1.5, 1 / 3),
        (frozenset({Item("z"),
                    Item("x")}), frozenset({Item("y")}), 0.5, 1.5, 1 / 3),
        (frozenset({Item("z"),
                    Item("y")}), frozenset({Item("x")}), 1, 1.5, 1 / 3),
        (frozenset({Item("z")}), frozenset({Item("x"),
                                            Item("y")}), 0.5, 1.5, 1 / 3),
        (frozenset({Item("z")}), frozenset({Item("x")}), 1, 1.5, 2 / 3),
        (frozenset({Item("z")}), frozenset({Item("y")}), 0.5, 1.5, 1 / 3),
    }

    # No minimum confidence or lift: every rule over the mined itemsets.
    rules = set(generate_rules(mined, 0, 0, inverted_index))

    for (antecedent, consequent, confidence, lift, support) in rules:
        print("{}, {} conf={:.4f}, {:.4f}, {:.4f}".format(
            antecedent, consequent, confidence, lift, support))

    assert rules == expectedRules
Example #2
0
def main():
    """Command-line entry point: mine frequent itemsets from the input
    dataset with FP-Growth, generate association rules satisfying the
    given confidence/lift thresholds, and write them to the output CSV.

    Returns:
        0 on success (conventional process exit code).
    """
    parser = ArgumentParser(
        description="Association rule data mining in Python")
    parser.add_argument("--input", dest="input", required=True)
    parser.add_argument("--output", dest="output", required=True)
    parser.add_argument("--min-confidence",
                        dest="min_confidence",
                        type=float_between_0_and_1,
                        required=True)
    parser.add_argument("--min-support",
                        dest="min_support",
                        type=float_between_0_and_1,
                        required=True)
    parser.add_argument("--min-lift",
                        dest="min_lift",
                        type=float_gteq_1,
                        required=True)
    args = parser.parse_args()

    program_start = time.time()
    start = program_start
    print("ARMPY - Association Rule Mining using Python.")
    print("Input file: {}".format(args.input))
    print("Output file: {}".format(args.output))
    # BUG FIX: these two lines previously printed the swapped values
    # (support label showed min_confidence and vice versa).
    print("Minimum support: {}".format(args.min_support))
    print("Minimum confidence: {}".format(args.min_confidence))
    print("Minimum lift: {}".format(args.min_lift))

    print("Generating frequent itemsets using FPGrowth...", flush=True)
    reader = DatasetReader(args.input)
    (itemsets, itemset_counts,
     num_transactions) = mine_fp_tree(reader, args.min_support)
    duration = time.time() - start
    print("FPGrowth mined {} items in {:.2f} seconds".format(
        len(itemsets), duration),
          flush=True)

    start = time.time()
    rules = generate_rules(itemsets, itemset_counts, num_transactions,
                           args.min_confidence, args.min_lift)
    duration = time.time() - start
    print("Generated {} rules in {:.2f} seconds".format(len(rules), duration),
          flush=True)

    start = time.time()
    with open(args.output, "w") as f:
        f.write("Antecedent->Consequent,Confidence,Lift,Support\n")
        for (antecedent, consequent, confidence, lift, support) in rules:
            f.write("{} -> {},{:.4f},{:.4f},{:.4f}\n".format(
                set_to_string(antecedent), set_to_string(consequent),
                confidence, lift, support))
    # BUG FIX: recompute the elapsed time after writing; previously this
    # reported the stale rule-generation duration instead of the write time.
    duration = time.time() - start
    print("Wrote rules to disk in {:.2f} seconds".format(duration), flush=True)

    duration = time.time() - program_start
    print("Total runtime {:.2f} seconds".format(duration))

    return 0
Example #3
0
def main():
    """Command-line entry point for drift detection over a transaction stream.

    Repeatedly mines a training window with FP-Growth, generates association
    rules, trains a drift detector on them, then consumes transactions until
    a drift is detected — at which point a new training window is mined.

    Returns:
        0 on success (conventional process exit code).
    """
    args = parse_args()
    program_start = time.time()

    if args.trace_malloc:
        tracemalloc.start()

    print(
        "Virtual Change/Drift Detection - Association Rule Mining using Python."
    )
    print("Drift Algorithm: {}".format(args.drift_algorithm))
    print("Input file: {}".format(args.input))
    print("Output file prefix: {}".format(args.output))
    print("Training window size: {}".format(args.training_window_size))
    print("Minimum confidence: {}".format(args.min_confidence))
    print("Minimum support: {}".format(args.min_support))
    print("Minimum lift: {}".format(args.min_lift))
    if args.fixed_drift_confidence is not None:
        print("Fixed drift confidence of: {}".format(
            args.fixed_drift_confidence))
    print("Tracing memory allocations: {}".format(args.trace_malloc))
    print("Save rules: {}".format(args.save_rules))
    print("Generating maximal itemsets: {}".format(args.maximal_itemsets))

    reader = iter(DatasetReader(args.input))
    transaction_num = 0
    end_of_last_window = 0
    cohort_num = 1
    volatility_detector = make_volatility_detector(args)
    while True:
        # Mine the next training window; an empty window means end of stream.
        window = take(args.training_window_size, reader)
        if len(window) == 0:
            break
        print("")
        print("Mining window [{},{}]".format(transaction_num,
                                             transaction_num + len(window)))
        end_of_last_window = transaction_num + len(window)
        transaction_num += len(window)
        print("Running FP-Growth...", flush=True)
        start = time.time()

        (itemsets, itemset_counts,
         num_transactions) = mine_fp_tree(window, args.min_support,
                                          args.maximal_itemsets)
        assert (num_transactions == len(window))

        duration = time.time() - start
        print("FPGrowth mined {} items in {:.2f} seconds".format(
            len(itemsets), duration))

        print("Generating rules...", flush=True)
        start = time.time()
        rules = list(
            generate_rules(itemsets, itemset_counts, num_transactions,
                           args.min_confidence, args.min_lift))
        duration = time.time() - start
        print("Generated {} rules in {:.2f} seconds".format(
            len(rules), duration),
              flush=True)

        if len(rules) == 0:
            print("No rules; just noise. Skipping change detection.")
            print(
                "Consider increasing training window size or lowering minsup/conf."
            )
            continue

        if args.save_rules:
            start = time.time()
            output_filename = args.output + "." + str(cohort_num)
            write_rules_to_file(rules, output_filename)
            # BUG FIX: measure the write itself (duration previously still
            # held the rule-generation time) and report the cohort number
            # actually used in the filename (previously printed after the
            # increment, so it was off by one).
            duration = time.time() - start
            print("Wrote rules for cohort {} to file {} in {:.2f} seconds".
                  format(cohort_num, output_filename, duration),
                  flush=True)
            cohort_num += 1

        drift_detector = make_drift_detector(args, volatility_detector)
        drift_detector.train(window, rules)

        # Read transactions until a drift is detected.
        for transaction in reader:
            transaction_num += 1
            drift = drift_detector.check_for_drift(transaction,
                                                   transaction_num)
            if drift is not None:
                print(
                    "Detected drift of type {} at transaction {}, {} after end of training window"
                    .format(drift.drift_type, transaction_num,
                            transaction_num - end_of_last_window))
                if drift.hellinger_value is not None:
                    print(
                        "Hellinger value: {}, confidence interval: {} ± {} ([{},{}])"
                        .format(drift.hellinger_value, drift.mean,
                                drift.confidence,
                                drift.mean - drift.confidence,
                                drift.mean + drift.confidence))
                # Record the drift in the volatility detector. This is used inside
                # the drift detector to help determine how large a confidence interval
                # is required when detecting drifts.
                if volatility_detector is not None:
                    volatility_detector.add(transaction_num)
                # Break out of the inner loop, we'll jump back up to the top and mine
                # a new training window.
                break

        # A short window means the stream is exhausted.
        if len(window) < args.training_window_size:
            break

    print("\nEnd of stream\n")

    duration = time.time() - program_start
    print("Total runtime {:.2f} seconds".format(duration))

    if args.trace_malloc:
        (_, peak_memory) = tracemalloc.get_traced_memory()
        tracemalloc_memory = tracemalloc.get_tracemalloc_memory()
        print("Peak memory usage: {:.3f} MB".format(peak_memory / 10**6))
        print("tracemalloc overhead: {:.3f} MB".format(
            (tracemalloc_memory / 10**6)))
        print("Peak memory usage minus tracemalloc overhead: {:.3f} MB".format(
            (peak_memory - tracemalloc_memory) / 10**6))
        snapshot = tracemalloc.take_snapshot()
        bytes_allocated = sum(x.size for x in snapshot.traces)
        print("Total traced memory allocated: {:.3f} MB".format(
            bytes_allocated / 10**6))

        tracemalloc.stop()

    return 0
Example #4
0
def test_apriori():
    """Mine frequent itemsets with apriori at minsup 2/6, then check the
    itemset supports and the rules produced by generate_rules driven by
    per-itemset counts and the transaction total.
    """
    data = ("a,b,c,d,e,f\n"
            "g,h,i,j,k,l\n"
            "z,x\n"
            "z,x\n"
            "z,x,y\n"
            "z,x,y,i\n")

    # Expected frequent itemsets mapped to their exact support.
    expectedItemSets = {
        ItemSet("i"): 2 / 6,
        ItemSet("z"): 4 / 6,
        ItemSet("x"): 4 / 6,
        ItemSet("y"): 2 / 6,
        ItemSet("xz"): 4 / 6,
        ItemSet("yz"): 2 / 6,
        ItemSet("xy"): 2 / 6,
        ItemSet("xyz"): 2 / 6,
    }

    index = InvertedIndex()
    index.load(data)
    itemsets = apriori(index, 2 / 6)

    assert len(itemsets) == len(expectedItemSets)
    for found in itemsets:
        assert frozenset(found) in expectedItemSets
    for found in itemsets:
        assert expectedItemSets[frozenset(found)] == index.support(found)

    print("Itemsets={}".format([i for i in itemsets if len(i) > 1]))

    def itemize(names):
        # Map readable item names to their internal item ids.
        return [item_id(name) for name in names]

    # (antecedent, consequent, confidence, lift, support)
    raw_rules = [
        (['y'], ['x'], 1.0, 1.5, 0.3333333333333333),
        (['x'], ['y'], 0.5, 1.5, 0.3333333333333333),
        (['y'], ['z'], 1.0, 1.5, 0.3333333333333333),
        (['z'], ['y'], 0.5, 1.5, 0.3333333333333333),
        (['x'], ['z'], 1.0, 1.5, 0.6666666666666666),
        (['z'], ['x'], 1.0, 1.5, 0.6666666666666666),
        (['x', 'y'], ['z'], 1.0, 1.5, 0.3333333333333333),
        (['z', 'y'], ['x'], 1.0, 1.5, 0.3333333333333333),
        (['z', 'x'], ['y'], 0.5, 1.5, 0.3333333333333333),
        (['y'], ['z', 'x'], 1.0, 1.5, 0.3333333333333333),
        (['x'], ['z', 'y'], 0.5, 1.5, 0.3333333333333333),
        (['z'], ['x', 'y'], 0.5, 1.5, 0.3333333333333333),
    ]

    expectedRules = [(itemize(ante), itemize(cons), conf, lift, sup)
                     for (ante, cons, conf, lift, sup) in raw_rules]

    # Count every mined itemset once, keyed by its tuple form.
    itemset_counts = {tuple(i): index.count(i) for i in itemsets}
    rules = generate_rules(itemsets, itemset_counts, index.num_transactions,
                           0, 0)

    def deitemize(ids):
        # Map internal item ids back to readable strings for printing.
        return [item_str(i) for i in ids]

    p = [(deitemize(ante), deitemize(cons), conf, lift, sup)
         for (ante, cons, conf, lift, sup) in rules]
    print("rules")
    print(p)

    for antecedent, consequent, confidence, lift, support in rules:
        print("{}, {} conf={:.4f}, {:.4f}, {:.4f}".
              format(antecedent, consequent, confidence, lift, support))

    assert len(rules) == len(expectedRules)
    for expected in expectedRules:
        assert expected in rules