예제 #1
0
def misere(data, target_class, time_budget=conf.TIME_BUDGET, top_k=conf.TOP_K, iterations_limit=conf.ITERATIONS_NUMBER,
           theta=conf.THETA, quality_measure=conf.QUALITY_MEASURE):
    """Sample random sub-patterns of the data sequences and rank them by quality.

    Until the time budget or the iteration limit is exhausted, an entry is
    drawn at random from ``data``, its first element is dropped (presumably
    the class label -- TODO confirm against the data loader), and
    ``int(log(ads))`` candidate patterns are derived from it, where ``ads`` is
    the value returned by ``count_subsequences_number``. Each candidate is
    obtained by removing a random number of items from a copy of the sequence,
    is scored with ``compute_quality_vertical`` and stored in a
    ``PrioritySet``.

    :param data: collection of sequences; each entry is indexable and its
        tail (``entry[1:]``) is a list of mutable itemsets
    :param target_class: class forwarded to the quality computation
    :param time_budget: maximum running time, in seconds
    :param top_k: number of patterns requested from the final priority set
    :param iterations_limit: maximum number of scored candidates
    :param theta: forwarded to ``PrioritySet``
    :param quality_measure: forwarded to ``compute_quality_vertical``
    :return: ``sorted_patterns.get_top_k_non_redundant(data, top_k)``
    """
    begin = datetime.datetime.utcnow()
    time_budget = datetime.timedelta(seconds=time_budget)

    sorted_patterns = PrioritySet(theta=theta)

    # Length of the longest entry minus one (the dropped first element).
    bitset_slot_size = len(max(data, key=len)) - 1
    first_zero_mask = compute_first_zero_mask(len(data), bitset_slot_size)
    last_ones_mask = compute_last_ones_mask(len(data), bitset_slot_size)
    class_data_count = count_target_class_data(data, target_class)
    # Cache dictionary handed to compute_quality_vertical on every call.
    itemsets_bitsets = {}

    iterations_count = 0

    while datetime.datetime.utcnow() - begin < time_budget and iterations_count < iterations_limit:
        sequence = copy.deepcopy(random.choice(data))
        sequence = sequence[1:]

        ads = count_subsequences_number(sequence)

        for _ in range(int(math.log(ads))):
            if iterations_count >= iterations_limit:
                break

            # Work on a private copy: the removals below mutate the itemsets.
            subsequence = copy.deepcopy(sequence)

            # We remove z items randomly; z is at most (item count - 1), so at
            # least one item always survives and the subsequence never empties.
            seq_items_nb = sum(len(itemset) for itemset in subsequence)
            z = random.randint(1, seq_items_nb - 1)

            for _ in range(z):
                chosen_itemset_i = random.randint(0, len(subsequence) - 1)
                chosen_itemset = subsequence[chosen_itemset_i]

                # random.sample() rejects sets since Python 3.11; sampling from
                # a tuple works for sets and lists alike and reproduces what
                # older interpreters did internally.
                chosen_itemset.remove(random.sample(tuple(chosen_itemset), 1)[0])

                # Drop itemsets that became empty.
                if len(chosen_itemset) == 0:
                    subsequence.pop(chosen_itemset_i)

            quality, _ = compute_quality_vertical(data, subsequence, target_class,
                                                bitset_slot_size,
                                                itemsets_bitsets, class_data_count,
                                                first_zero_mask, last_ones_mask, quality_measure=quality_measure)

            iterations_count += 1
            sorted_patterns.add(sequence_mutable_to_immutable(subsequence),
                                quality)

    return sorted_patterns.get_top_k_non_redundant(data, top_k)
예제 #2
0
def exhaustive(data, target_class, top_k=5, enable_i=True):
    """Enumerate patterns breadth-first and return the best non-redundant ones.

    Starting from the empty pattern, every child produced by
    ``compute_children`` is scored with ``compute_quality_vertical`` and
    recorded in a ``PrioritySet``. A child is queued for further expansion
    only if it has not been queued before and its bitset is non-zero.
    Progress is reported through ``display_info`` each time the length of the
    expanded seed (``k_length``) grows.

    :param data: collection of sequences to mine
    :param target_class: class forwarded to the quality computation
    :param top_k: number of patterns requested from the final priority set
    :param enable_i: forwarded to ``compute_children``
    :return: ``sorted_patterns.get_top_k_non_redundant(data, top_k)``
    """
    from collections import deque

    begin = datetime.datetime.utcnow()

    # The priority set is created with a large size argument (500).
    # NOTE(review): presumably this keeps more candidates than top_k so the
    # final non-redundancy filtering has enough material -- confirm.
    sorted_patterns = PrioritySet(500)

    bitset_slot_size = len(max(data, key=len)) - 1
    first_zero_mask = compute_first_zero_mask(len(data), bitset_slot_size)
    last_ones_mask = compute_last_ones_mask(len(data), bitset_slot_size)
    class_data_count = count_target_class_data(data, target_class)
    itemsets_bitsets = {}

    items = extract_items(data)

    # deque gives O(1) pops from the left; list.pop(0) is O(n).
    fifo = deque([[]])

    # to know if elements have already been added
    fifo_elements = set()

    stage = 0
    compute_count = 0

    while fifo:
        seed = fifo.popleft()
        children = compute_children(seed, items, enable_i)

        seed_length = k_length(seed)
        if seed_length > stage:
            stage = seed_length
            display_info(stage, compute_count, sorted_patterns, begin, data, top_k)

        for child in children:
            quality, bitset = compute_quality_vertical(data, child, target_class,
                                                       bitset_slot_size,
                                                       itemsets_bitsets,
                                                       class_data_count,
                                                       first_zero_mask,
                                                       last_ones_mask)

            sorted_patterns.add_preserve_memory(child, quality, data)

            # we do not explore elements with a null support
            if child not in fifo_elements and bitset != 0:
                fifo.append(child)
                fifo_elements.add(child)

        # One quality evaluation per child: count them once per seed rather
        # than adding len(children) on every loop iteration.
        compute_count += len(children)

    print("The algorithm took:{}".format(datetime.datetime.utcnow() - begin))

    return sorted_patterns.get_top_k_non_redundant(data, top_k)
예제 #3
0
def optimizer(data, target, alg_name, time_budget, iteration_limit, enable_i):
    """Run the named mining algorithm, then post-optimize its patterns.

    :param data: collection of sequences to mine
    :param target: target class handed to the chosen algorithm
    :param alg_name: one of 'misere', 'BeamSearch', 'SeqScout', 'MCTSExtent'
    :param time_budget: time budget forwarded to the algorithm
    :param iteration_limit: forwarded as ``iterations_limit``
    :param enable_i: forwarded to the algorithms that accept it and to
        ``optimize_pattern``
    :return: ``(results, results_post_opti)``, or ``None`` (after printing an
        error message) when ``alg_name`` is not recognised
    """
    # Budget arguments common to every algorithm.
    budget = {'time_budget': time_budget, 'iterations_limit': iteration_limit}

    if alg_name == 'misere':
        results = misere(data, target, **budget)
    elif alg_name == 'BeamSearch':
        results = beam_search(data, target, enable_i=enable_i, **budget)
    elif alg_name == 'SeqScout':
        results = seq_scout(data, target, enable_i=enable_i, **budget)
    elif alg_name == 'MCTSExtent':
        results = launch_mcts(data, target, **budget)
    else:
        print('Error algo name')
        return

    # Seed a priority set with the raw results; each entry is indexed as
    # (quality, pattern) while PrioritySet.add takes (pattern, quality).
    seeded_patterns = PrioritySet(theta=THETA)
    for entry in results:
        seeded_patterns.add(entry[1], entry[0])

    all_items = extract_items(data)
    results_post_opti = optimize_pattern(results, all_items, data, [], target,
                                         TOP_K, seeded_patterns, enable_i)

    return results, results_post_opti
예제 #4
0
def launch_mcts(data, target_class, time_budget=conf.TIME_BUDGET, top_k=conf.TOP_K, theta=conf.THETA,
                iterations_limit=conf.ITERATIONS_NUMBER, quality_measure=conf.QUALITY_MEASURE):
    """Mine patterns with a Monte Carlo tree search over the pattern space.

    Every iteration selects a node, expands it, performs a roll-out from the
    expanded node and propagates the reward with ``update``. Both the
    expanded node's intent and the roll-out sequence are recorded in a
    ``PrioritySet``. The search stops when the time budget or the iteration
    limit is exhausted, or when ``select`` returns ``'finished'``.

    :param data: collection of sequences to mine
    :param target_class: class whose patterns are searched
    :param time_budget: maximum running time, in seconds
    :param top_k: size of the priority set and of the returned selection
    :param theta: forwarded to ``PrioritySet``
    :param iterations_limit: maximum number of search iterations
    :param quality_measure: forwarded to ``expand`` and ``roll_out``
    :return: ``sorted_patterns.get_top_k_non_redundant(data, top_k)``
    """
    started_at = datetime.datetime.utcnow()
    max_duration = datetime.timedelta(seconds=time_budget)

    # Positives are extracted first, then empty sequences are filtered out of
    # the data set used by the search.
    data_positive = filter_positive(data, target_class)
    data = filter_empty_sequences(data)

    node_hashmap = {}
    root_node = Node(None, None, data, data_positive, target_class, node_hashmap)
    # The root is registered under the plain string key '.'.
    node_hashmap['.'] = root_node

    sorted_patterns = PrioritySet(k=top_k, theta=theta)
    iteration_count = 0

    def budget_left():
        # Time is checked first, then the iteration counter.
        return (datetime.datetime.utcnow() - started_at <= max_duration
                and iteration_count < iterations_limit)

    while budget_left():
        selected = select(root_node)

        if selected == 'finished':
            print('Finished')
            break

        expanded = selected.expand(data, data_positive, target_class, quality_measure=quality_measure)
        expanded_pattern = sequence_mutable_to_immutable(expanded.intent)
        sorted_patterns.add(expanded_pattern, expanded.quality)

        rollout_sequence, reward = roll_out(expanded, data, target_class, quality_measure=quality_measure)
        rollout_pattern = sequence_mutable_to_immutable(rollout_sequence)
        sorted_patterns.add(rollout_pattern, reward)

        update(expanded, reward)
        iteration_count += 1

    print('Number iteration mcts: {}'.format(iteration_count))
    return sorted_patterns.get_top_k_non_redundant(data, top_k)
예제 #5
0
def beam_search(data, target_class, time_budget=conf.TIME_BUDGET, enable_i=True, top_k=conf.TOP_K,
                beam_width=conf.BEAM_WIDTH,
                iterations_limit=conf.ITERATIONS_NUMBER,
                theta=conf.THETA,
                quality_measure=conf.QUALITY_MEASURE,
                diverse=True):
    """Mine patterns level by level, keeping only a beam of the best ones.

    Starting from the empty pattern, every candidate in the queue is expanded
    with ``compute_children``; each child is scored with ``compute_quality``
    and stored both in the global priority set and in the beam of the current
    level. The next queue is made of the ``beam_width`` best patterns of the
    beam.

    :param data: collection of sequences to mine
    :param target_class: class forwarded to the quality computation
    :param time_budget: maximum running time in seconds, checked between levels
    :param enable_i: forwarded to ``compute_children``
    :param top_k: size of the global priority set and of the returned selection
    :param beam_width: number of patterns kept from one level to the next
    :param iterations_limit: maximum number of scored children
    :param theta: forwarded to the global ``PrioritySet``
    :param quality_measure: forwarded to ``compute_quality``
    :param diverse: if true the beam is selected with
        ``get_top_k_non_redundant``, otherwise with ``get_top_k``
    :return: ``sorted_patterns.get_top_k_non_redundant(data, top_k)``
    """
    items = extract_items(data)

    started_at = datetime.datetime.utcnow()
    max_duration = datetime.timedelta(seconds=time_budget)

    # The search starts from the empty pattern.
    candidate_queue = [[]]

    sorted_patterns = PrioritySet(top_k, theta=theta)

    nb_iteration = 0
    while datetime.datetime.utcnow() - started_at < max_duration and nb_iteration < iterations_limit:
        level_beam = PrioritySet()

        while candidate_queue and nb_iteration < iterations_limit:
            parent = candidate_queue.pop(0)

            for candidate in compute_children(parent, items, enable_i):
                if nb_iteration >= iterations_limit:
                    break

                score = compute_quality(data, candidate, target_class, quality_measure=quality_measure)

                sorted_patterns.add(candidate, score)
                level_beam.add(candidate, score)
                nb_iteration += 1

        # Select the patterns that seed the next level.
        if diverse:
            selected = level_beam.get_top_k_non_redundant(data, beam_width)
        else:
            selected = level_beam.get_top_k(beam_width)
        candidate_queue = [pattern for _, pattern in selected]

    return sorted_patterns.get_top_k_non_redundant(data, top_k)
예제 #6
0
def test_over_top_k():
    """Requesting more patterns than were stored yields only the stored ones."""
    pattern_store = PrioritySet()
    pattern_store.add(frozenset([1, 2]), 0.1)

    # Only one pattern was added, so asking for two must return one.
    retrieved = pattern_store.get_top_k(2)
    assert len(retrieved) == 1
예제 #7
0
def seq_scout(data,
              target_class,
              time_budget=conf.TIME_BUDGET,
              top_k=conf.TOP_K,
              enable_i=True,
              vertical=True,
              iterations_limit=conf.ITERATIONS_NUMBER,
              theta=conf.THETA,
              quality_measure=conf.QUALITY_MEASURE):
    """Mine patterns by treating each target-class sequence as a bandit arm.

    Every sequence returned by ``filter_target_class`` is registered as an arm
    with an infinite initial UCB score. The main loop repeatedly pops the arm
    with the best score, plays it with ``play_arm``, stores the resulting
    pattern and re-inserts the arm with its updated mean quality and UCB
    score.

    Side effects: sets the module globals ``VERTICAL_RPZ`` and
    ``VERTICAL_TOOLS``.

    :param data: collection of sequences to mine
    :param target_class: class whose sequences are used as arms
    :param time_budget: maximum running time, in seconds
    :param top_k: size of the priority set and of the returned selection
    :param enable_i: not used in this function body
    :param vertical: value stored in the global ``VERTICAL_RPZ``
    :param iterations_limit: the loop runs while the play counter, which
        starts at 1, is below this value
    :param theta: forwarded to ``PrioritySet``
    :param quality_measure: forwarded to ``play_arm``
    :return: ``sorted_patterns.get_top_k_non_redundant(data, top_k)``
    """
    # Result unused below; the call is kept as in the original implementation.
    items = extract_items(data)
    started_at = datetime.datetime.utcnow()
    max_duration = datetime.timedelta(seconds=time_budget)

    arms = filter_target_class(data, target_class)
    sorted_patterns = PrioritySet(k=top_k, theta=theta)
    UCB_scores = PrioritySetUCB()
    # Result unused below; the call is kept as in the original implementation.
    itemsets_memory = get_itemset_memory(data)

    # Longest entry, minus its first element (the class).
    bitset_slot_size = len(max(data, key=len)) - 1

    global VERTICAL_RPZ, VERTICAL_TOOLS
    VERTICAL_RPZ = vertical
    VERTICAL_TOOLS = {
        "bitset_slot_size": bitset_slot_size,
        "first_zero_mask": compute_first_zero_mask(len(data), bitset_slot_size),
        "last_ones_mask": compute_last_ones_mask(len(data), bitset_slot_size),
        "class_data_count": count_target_class_data(data, target_class),
        "itemsets_bitsets": {},
    }

    N = 1

    # Initialisation: each arm gets the best possible UCB so that all of them
    # are played once by the main loop. The play count and mean quality start
    # at zero, which keeps the running mean correct after the first play.
    for arm in arms:
        arm_key = sequence_mutable_to_immutable(arm[1:])
        initial_score = UCB(float("inf"), 1, N)
        UCB_scores.add(arm_key, (initial_score, 0, 0))

    # Play arms until the time budget or the iteration limit is reached.
    while datetime.datetime.utcnow() - started_at < max_duration and N < iterations_limit:
        # Take the arm with the best UCB.
        _, Ni, mean_quality, sequence = UCB_scores.pop()

        pattern, quality = play_arm(sequence,
                                    data,
                                    target_class,
                                    quality_measure=quality_measure)
        sorted_patterns.add(sequence_mutable_to_immutable(pattern), quality)

        # Incremental mean of the arm's quality, then its refreshed UCB.
        plays = Ni + 1
        updated_quality = (Ni * mean_quality + quality) / plays
        refreshed_score = UCB(updated_quality, plays, N)
        UCB_scores.add(sequence, (refreshed_score, plays, updated_quality))

        N += 1

    return sorted_patterns.get_top_k_non_redundant(data, top_k)
    '''