def misere(data, target_class, time_budget=conf.TIME_BUDGET, top_k=conf.TOP_K,
           iterations_limit=conf.ITERATIONS_NUMBER, theta=conf.THETA,
           quality_measure=conf.QUALITY_MEASURE):
    begin = datetime.datetime.utcnow()
    time_budget = datetime.timedelta(seconds=time_budget)

    sorted_patterns = PrioritySet(theta=theta)

    bitset_slot_size = len(max(data, key=lambda x: len(x))) - 1
    first_zero_mask = compute_first_zero_mask(len(data), bitset_slot_size)
    last_ones_mask = compute_last_ones_mask(len(data), bitset_slot_size)
    class_data_count = count_target_class_data(data, target_class)
    itemsets_bitsets = {}

    iterations_count = 0

    while datetime.datetime.utcnow() - begin < time_budget and iterations_count < iterations_limit:
        sequence = copy.deepcopy(random.choice(data))
        # drop the class label stored in first position
        sequence = sequence[1:]

        ads = count_subsequences_number(sequence)

        # sample ~log(#subsequences) generalizations of this sequence
        for i in range(int(math.log(ads))):
            if iterations_count >= iterations_limit:
                break

            subsequence = copy.deepcopy(sequence)

            # we remove z items randomly
            seq_items_nb = len([i for j_set in subsequence for i in j_set])
            z = random.randint(1, seq_items_nb - 1)

            for _ in range(z):
                chosen_itemset_i = random.randint(0, len(subsequence) - 1)
                chosen_itemset = subsequence[chosen_itemset_i]

                chosen_itemset.remove(random.sample(chosen_itemset, 1)[0])

                if len(chosen_itemset) == 0:
                    subsequence.pop(chosen_itemset_i)

            quality, _ = compute_quality_vertical(data, subsequence, target_class,
                                                  bitset_slot_size, itemsets_bitsets,
                                                  class_data_count, first_zero_mask,
                                                  last_ones_mask,
                                                  quality_measure=quality_measure)
            iterations_count += 1
            sorted_patterns.add(sequence_mutable_to_immutable(subsequence), quality)

    return sorted_patterns.get_top_k_non_redundant(data, top_k)
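# Usage sketch (not part of the original module): a hypothetical toy dataset in
# the format the algorithms above expect, i.e. each row is [class_label,
# itemset, itemset, ...]. The '+'/'-' labels and the helper name _demo_misere
# are illustrative only; results are assumed to be (quality, pattern) pairs, as
# consumed by optimizer() below.
def _demo_misere():
    toy_data = [
        ['+', {'a'}, {'b', 'c'}, {'a'}],
        ['+', {'a'}, {'c'}],
        ['-', {'b'}, {'b', 'c'}],
    ]
    for quality, pattern in misere(toy_data, '+', time_budget=5,
                                   iterations_limit=1000):
        print(quality, pattern)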
def exhaustive(data, target_class, top_k=5, enable_i=True):
    begin = datetime.datetime.utcnow()

    # a large capacity avoids evicting patterns while the search is running
    sorted_patterns = PrioritySet(500)

    bitset_slot_size = len(max(data, key=lambda x: len(x))) - 1
    first_zero_mask = compute_first_zero_mask(len(data), bitset_slot_size)
    last_ones_mask = compute_last_ones_mask(len(data), bitset_slot_size)
    class_data_count = count_target_class_data(data, target_class)
    itemsets_bitsets = {}

    items = extract_items(data)

    fifo = [[]]
    # to know if elements have already been added
    fifo_elements = set()

    stage = 0
    compute_count = 0

    while len(fifo) != 0:
        seed = fifo.pop(0)
        children = compute_children(seed, items, enable_i)

        if k_length(seed) > stage:
            stage = k_length(seed)
            display_info(stage, compute_count, sorted_patterns, begin, data, top_k)

        for child in children:
            quality, bitset = compute_quality_vertical(data, child, target_class,
                                                       bitset_slot_size, itemsets_bitsets,
                                                       class_data_count, first_zero_mask,
                                                       last_ones_mask)

            sorted_patterns.add_preserve_memory(child, quality, data)

            # we do not explore elements with a null support
            if child not in fifo_elements and bitset != 0:
                fifo.append(child)
                fifo_elements.add(child)

        compute_count += len(children)

    print("The algorithm took: {}".format(datetime.datetime.utcnow() - begin))

    return sorted_patterns.get_top_k_non_redundant(data, top_k)
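# For intuition (assumed semantics, mirroring standard sequential pattern
# enumeration): from a seed pattern, compute_children is expected to generate
# s-extensions, which append a new itemset, and, when enable_i is True,
# i-extensions, which grow the last itemset. E.g. from [{'a'}] with items
# {'a', 'b'}:
#   s-extensions: [{'a'}, {'a'}], [{'a'}, {'b'}]
#   i-extension:  [{'a', 'b'}]
# Pruning children with a null support bitset is what keeps this BFS finite.
# Exhaustive enumeration is only tractable on small alphabets; this demo
# helper (illustrative name) keeps the search space tiny.
def _demo_exhaustive():
    toy_data = [
        ['+', {'a'}, {'b'}],
        ['-', {'b'}],
    ]
    print(exhaustive(toy_data, '+', top_k=3, enable_i=True))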
def optimizer(data, target, alg_name, time_budget, iteration_limit, enable_i):
    if alg_name == 'misere':
        results = misere(data, target, time_budget=time_budget,
                         iterations_limit=iteration_limit)
    elif alg_name == 'BeamSearch':
        results = beam_search(data, target, time_budget=time_budget,
                              iterations_limit=iteration_limit, enable_i=enable_i)
    elif alg_name == 'SeqScout':
        results = seq_scout(data, target, time_budget=time_budget,
                            iterations_limit=iteration_limit, enable_i=enable_i)
    elif alg_name == 'MCTSExtent':
        results = launch_mcts(data, target, time_budget=time_budget,
                              iterations_limit=iteration_limit)
    else:
        print('Error: unknown algorithm name {}'.format(alg_name))
        return

    sorted_patterns = PrioritySet(theta=THETA)
    for result in results:
        sorted_patterns.add(result[1], result[0])

    results_post_opti = optimize_pattern(results, extract_items(data), data, [],
                                         target, TOP_K, sorted_patterns, enable_i)

    return results, results_post_opti
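# Usage sketch (illustrative helper name): run one of the samplers, then
# locally optimize its patterns; alg_name must match one of the branches above.
def _demo_optimizer(data):
    raw_patterns, optimized_patterns = optimizer(
        data, '+', 'SeqScout',
        time_budget=10, iteration_limit=5000, enable_i=True)
    return raw_patterns, optimized_patterns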
def launch_mcts(data, target_class, time_budget=conf.TIME_BUDGET, top_k=conf.TOP_K,
                theta=conf.THETA, iterations_limit=conf.ITERATIONS_NUMBER,
                quality_measure=conf.QUALITY_MEASURE):
    begin = datetime.datetime.utcnow()
    time_budget = datetime.timedelta(seconds=time_budget)

    data_positive = filter_positive(data, target_class)
    data = filter_empty_sequences(data)

    node_hashmap = {}

    root_node = Node(None, None, data, data_positive, target_class, node_hashmap)
    node_hashmap['.'] = root_node

    sorted_patterns = PrioritySet(k=top_k, theta=theta)

    iteration_count = 0

    while datetime.datetime.utcnow() - begin <= time_budget and iteration_count < iterations_limit:
        node_sel = select(root_node)

        if node_sel == 'finished':
            print('Finished')
            break

        node_expand = node_sel.expand(data, data_positive, target_class,
                                      quality_measure=quality_measure)
        sorted_patterns.add(sequence_mutable_to_immutable(node_expand.intent),
                            node_expand.quality)

        sequence_reward, reward = roll_out(node_expand, data, target_class,
                                           quality_measure=quality_measure)
        sorted_patterns.add(sequence_mutable_to_immutable(sequence_reward), reward)

        update(node_expand, reward)

        iteration_count += 1

        # if iteration_count % int(iterations_limit * 0.1) == 0:
        #     print('{}%'.format(iteration_count / iterations_limit * 100))

    print('Number of MCTS iterations: {}'.format(iteration_count))

    return sorted_patterns.get_top_k_non_redundant(data, top_k)
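# Usage sketch: one MCTS run. Each iteration performs the classic
# select / expand / roll-out / back-propagate cycle, and both the expanded
# node's intent and the roll-out pattern feed the top-k pool, so good patterns
# found anywhere along the way are kept. The helper name and '+' label are
# illustrative.
def _demo_mcts(data):
    return launch_mcts(data, '+', time_budget=10, top_k=5)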
def beam_search(data, target_class, time_budget=conf.TIME_BUDGET, enable_i=True,
                top_k=conf.TOP_K, beam_width=conf.BEAM_WIDTH,
                iterations_limit=conf.ITERATIONS_NUMBER, theta=conf.THETA,
                quality_measure=conf.QUALITY_MEASURE, diverse=True):
    items = extract_items(data)
    begin = datetime.datetime.utcnow()
    time_budget = datetime.timedelta(seconds=time_budget)

    # candidate_queue = items_to_sequences(items)
    candidate_queue = [[]]

    sorted_patterns = PrioritySet(top_k, theta=theta)

    nb_iteration = 0

    while datetime.datetime.utcnow() - begin < time_budget and nb_iteration < iterations_limit:
        beam = PrioritySet()

        while (len(candidate_queue) != 0) and nb_iteration < iterations_limit:
            seed = candidate_queue.pop(0)
            children = compute_children(seed, items, enable_i)

            for child in children:
                if nb_iteration >= iterations_limit:
                    break

                quality = compute_quality(data, child, target_class,
                                          quality_measure=quality_measure)

                # sorted_patterns.add_preserve_memory(child, quality, data)
                sorted_patterns.add(child, quality)
                beam.add(child, quality)

                nb_iteration += 1

        if diverse:
            candidate_queue = [j for i, j in beam.get_top_k_non_redundant(data, beam_width)]
        else:
            candidate_queue = [j for i, j in beam.get_top_k(beam_width)]

    # print("Number iterations beam search: {}".format(nb_iteration))

    return sorted_patterns.get_top_k_non_redundant(data, top_k)
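# Usage sketch (illustrative helper name): with diverse=True the next beam is
# built from non-redundant top patterns, trading a little raw quality for
# diversity across levels; wider beams explore more patterns per level at a
# higher cost per iteration.
def _demo_beam_search(data):
    return beam_search(data, '+', beam_width=20, diverse=True)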
def test_over_top_k():
    priority = PrioritySet()
    priority.add(frozenset([1, 2]), 0.1)

    # asking for more patterns than are stored must not fail
    assert len(priority.get_top_k(2)) == 1
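# Companion sketch (assumed PrioritySet semantics): get_top_k should cap its
# output at k even when more patterns are stored.
def test_under_top_k_sketch():
    priority = PrioritySet()
    priority.add(frozenset([1]), 0.1)
    priority.add(frozenset([2]), 0.2)
    priority.add(frozenset([3]), 0.3)
    assert len(priority.get_top_k(2)) == 2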
def seq_scout(data, target_class, time_budget=conf.TIME_BUDGET, top_k=conf.TOP_K,
              enable_i=True, vertical=True, iterations_limit=conf.ITERATIONS_NUMBER,
              theta=conf.THETA, quality_measure=conf.QUALITY_MEASURE):
    items = extract_items(data)
    begin = datetime.datetime.utcnow()
    time_budget = datetime.timedelta(seconds=time_budget)

    data_target_class = filter_target_class(data, target_class)
    sorted_patterns = PrioritySet(k=top_k, theta=theta)
    UCB_scores = PrioritySetUCB()
    itemsets_memory = get_itemset_memory(data)

    # removing class
    bitset_slot_size = len(max(data, key=lambda x: len(x))) - 1

    global VERTICAL_RPZ
    VERTICAL_RPZ = vertical

    global VERTICAL_TOOLS
    VERTICAL_TOOLS = {
        "bitset_slot_size": bitset_slot_size,
        "first_zero_mask": compute_first_zero_mask(len(data), bitset_slot_size),
        "last_ones_mask": compute_last_ones_mask(len(data), bitset_slot_size),
        "class_data_count": count_target_class_data(data, target_class),
        "itemsets_bitsets": {}
    }

    N = 1

    # init: we add objects with the best UCB so that they are all played one
    # time in the main procedure. By putting a null Ni, we ensure the mean of
    # the quality will be correct.
    for sequence in data_target_class:
        sequence_i = sequence_mutable_to_immutable(sequence[1:])
        UCB_score = UCB(float("inf"), 1, N)
        UCB_scores.add(sequence_i, (UCB_score, 0, 0))

    # play with time budget
    while datetime.datetime.utcnow() - begin < time_budget and N < iterations_limit:
        # we take the arm with the best UCB
        _, Ni, mean_quality, sequence = UCB_scores.pop()
        pattern, quality = play_arm(sequence, data, target_class,
                                    quality_measure=quality_measure)
        pattern = sequence_mutable_to_immutable(pattern)
        sorted_patterns.add(pattern, quality)

        # we update the arm's empirical mean and UCB score
        updated_quality = (Ni * mean_quality + quality) / (Ni + 1)
        UCB_score = UCB(updated_quality, Ni + 1, N)
        UCB_scores.add(sequence, (UCB_score, Ni + 1, updated_quality))

        N += 1

    # print("SeqScout iterations: {}".format(N))

    return sorted_patterns.get_top_k_non_redundant(data, top_k)
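# The UCB helper used above is not shown here. For reference, a standard UCB1
# bound looks like the sketch below; this is an assumption about its
# implementation, and the actual exploration constant and normalization may
# differ. The module already imports math (see misere), so no extra import is
# needed.
def _ucb1_sketch(mean_quality, n_i, n_total, c=0.5):
    # exploitation term plus exploration bonus; unplayed arms get priority
    if n_i == 0:
        return float('inf')
    return mean_quality + c * math.sqrt(2 * math.log(n_total) / n_i)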