def expand(self, data, data_positive, target_class, quality_measure=conf.QUALITY_MEASURE):
    """Expand this node with one randomly chosen candidate and return the child.

    A candidate is drawn from ``self.candidate_sequences_expand`` and removed
    from it, so it is not drawn again from this node. The child's sequence is
    the candidate itself when this node has no intent, otherwise the result
    of ``find_LCS(candidate, self.intent)``. If a node with that sequence is
    already registered in ``self.node_hashmap`` it is reused and linked to
    this node; otherwise a new ``Node`` is created and registered.

    :param data: forwarded unchanged to the ``Node`` constructor
    :param data_positive: forwarded unchanged to the ``Node`` constructor
    :param target_class: forwarded unchanged to the ``Node`` constructor
    :param quality_measure: forwarded to the ``Node`` constructor
    :return: the child node (reused or newly created)
    """
    # random.sample() only accepts sequences since Python 3.11; the tuple()
    # copy keeps the draw working whether the candidates are a list or a set.
    random_object = random.sample(tuple(self.candidate_sequences_expand), 1)[0]
    self.candidate_sequences_expand.remove(random_object)

    if self.intent is None:
        # for the root node, it is directly database sequences
        sequence_children = sequence_mutable_to_immutable(random_object)
    else:
        sequence_children = sequence_mutable_to_immutable(
            find_LCS(random_object, self.intent))

    if sequence_children in self.node_hashmap:
        # The same sequence was already reached through another path:
        # reuse that node and link it to this one in both directions.
        child = self.node_hashmap[sequence_children]
        child.parents.append(self)
        self.children.append(child)
    else:
        child = Node(sequence_children, self, data, data_positive,
                     target_class, self.node_hashmap,
                     quality_measure=quality_measure)
        self.node_hashmap[sequence_children] = child

    return child
def test_is_subsequence():
    """Check is_subsequence on tuple, list, immutable and string-item inputs."""
    # Sequences given as tuples of itemsets.
    pattern = ({1, 2}, {2, 3})
    container = ({1, 2, 3}, {2, 4, 3})
    three_itemsets = ({1}, {2}, {2})
    assert not is_subsequence(three_itemsets, pattern)
    assert is_subsequence(pattern, container)

    # Same pattern as a list, with an extra itemset in the middle of the container.
    pattern = [{1, 2}, {2, 3}]
    container = [{1, 2, 3}, {1}, {2, 4, 3}]
    assert is_subsequence(pattern, container)

    # Item 5 of the first itemset appears nowhere in the container.
    pattern = [{1, 5, 2}, {2, 3}]
    container = [{1, 2, 3}, {1}, {2, 4, 3}]
    assert not is_subsequence(pattern, container)

    # The pattern has more itemsets than the container.
    pattern = [{1, 5, 2}, {2, 3}, {5}]
    container = [{1, 5, 2}, {2, 4, 3}]
    assert not is_subsequence(pattern, container)

    # Singleton itemsets; the relation must not hold in the other direction.
    pattern = [{1}, {2}]
    container = [{1, 2, 3}, {1}, {2, 4, 3}]
    assert is_subsequence(pattern, container)
    assert not is_subsequence(container, pattern)

    # The same pair after conversion to the immutable representation.
    pattern = sequence_mutable_to_immutable(pattern)
    container = sequence_mutable_to_immutable(container)
    assert is_subsequence(pattern, container)

    # String items instead of integers.
    pattern = [{'1'}, {'2'}]
    container = [{'1', '2', '3'}, {'1'}, {'2', '4', '3'}]
    assert is_subsequence(pattern, container)
def misere(data, target_class, time_budget=conf.TIME_BUDGET, top_k=conf.TOP_K,
           iterations_limit=conf.ITERATIONS_NUMBER, theta=conf.THETA,
           quality_measure=conf.QUALITY_MEASURE):
    """Score randomly generated sub-patterns of database sequences.

    Until the time budget or the iteration limit is exhausted, a sequence is
    picked at random from ``data``, its first element is dropped, and several
    sub-patterns are derived from it by deleting a random number of items.
    Each sub-pattern is scored with ``compute_quality_vertical`` and stored
    in a ``PrioritySet``.

    :param data: database of sequences; the first element of each sequence is
        skipped before sampling (presumably the class label - confirm against
        the loader)
    :param target_class: class the quality is computed for
    :param time_budget: maximum running time, in seconds
    :param top_k: number of patterns requested from the priority set
    :param iterations_limit: maximum number of scored sub-patterns
    :param theta: forwarded to ``PrioritySet``
    :param quality_measure: forwarded to ``compute_quality_vertical``
    :return: ``sorted_patterns.get_top_k_non_redundant(data, top_k)``
    """
    begin = datetime.datetime.utcnow()
    time_budget = datetime.timedelta(seconds=time_budget)

    sorted_patterns = PrioritySet(theta=theta)

    # Longest sequence length, minus the leading element that is skipped below.
    bitset_slot_size = len(max(data, key=len)) - 1
    first_zero_mask = compute_first_zero_mask(len(data), bitset_slot_size)
    last_ones_mask = compute_last_ones_mask(len(data), bitset_slot_size)
    class_data_count = count_target_class_data(data, target_class)
    itemsets_bitsets = {}

    iterations_count = 0

    while (datetime.datetime.utcnow() - begin < time_budget
           and iterations_count < iterations_limit):
        sequence = copy.deepcopy(random.choice(data))
        sequence = sequence[1:]

        ads = count_subsequences_number(sequence)

        # The number of sub-patterns drawn from one sequence grows with the
        # logarithm of its number of subsequences.
        for _ in range(int(math.log(ads))):
            if iterations_count >= iterations_limit:
                break

            subsequence = copy.deepcopy(sequence)

            # we remove z items randomly; z is strictly smaller than the item
            # count, so at least one item always survives
            seq_items_nb = sum(len(itemset) for itemset in subsequence)
            z = random.randint(1, seq_items_nb - 1)

            for _ in range(z):
                chosen_itemset_i = random.randint(0, len(subsequence) - 1)
                chosen_itemset = subsequence[chosen_itemset_i]

                # random.sample() rejects sets since Python 3.11, so draw
                # from a tuple copy of the itemset.
                chosen_itemset.remove(
                    random.sample(tuple(chosen_itemset), 1)[0])

                # Drop itemsets that became empty.
                if len(chosen_itemset) == 0:
                    subsequence.pop(chosen_itemset_i)

            quality, _ = compute_quality_vertical(
                data, subsequence, target_class, bitset_slot_size,
                itemsets_bitsets, class_data_count, first_zero_mask,
                last_ones_mask, quality_measure=quality_measure)

            iterations_count += 1
            sorted_patterns.add(sequence_mutable_to_immutable(subsequence),
                                quality)

    return sorted_patterns.get_top_k_non_redundant(data, top_k)
def launch_mcts(data, target_class, time_budget=conf.TIME_BUDGET, top_k=conf.TOP_K,
                theta=conf.THETA, iterations_limit=conf.ITERATIONS_NUMBER,
                quality_measure=conf.QUALITY_MEASURE):
    """Run the select / expand / roll-out / update loop and return the best patterns.

    The loop stops when the time budget (in seconds) is exceeded, when
    ``iterations_limit`` iterations have run, or when ``select`` returns the
    string ``'finished'``. Both the expanded node's intent and the roll-out
    result are recorded in a ``PrioritySet`` at every iteration.

    :return: ``get_top_k_non_redundant(data, top_k)`` of the collected patterns
    """
    started_at = datetime.datetime.utcnow()
    allowed = datetime.timedelta(seconds=time_budget)

    def within_budget():
        # Inclusive bound: an elapsed time equal to the budget still counts.
        return datetime.datetime.utcnow() - started_at <= allowed

    # Positives are extracted from the unfiltered data, before empty
    # sequences are removed.
    data_positive = filter_positive(data, target_class)
    data = filter_empty_sequences(data)

    node_hashmap = {}
    root_node = Node(None, None, data, data_positive, target_class, node_hashmap)
    # ('.') is just the string '.', not a 1-tuple.
    node_hashmap['.'] = root_node

    best_patterns = PrioritySet(k=top_k, theta=theta)

    done = 0
    while within_budget() and done < iterations_limit:
        selected = select(root_node)
        if selected == 'finished':
            print('Finished')
            break

        expanded = selected.expand(data, data_positive, target_class,
                                   quality_measure=quality_measure)
        best_patterns.add(sequence_mutable_to_immutable(expanded.intent),
                          expanded.quality)

        rollout_sequence, rollout_reward = roll_out(
            expanded, data, target_class, quality_measure=quality_measure)
        best_patterns.add(sequence_mutable_to_immutable(rollout_sequence),
                          rollout_reward)

        update(expanded, rollout_reward)
        done += 1

    print('Number iteration mcts: {}'.format(done))

    return best_patterns.get_top_k_non_redundant(data, top_k)
def test_all_lcs():
    """Check find_LCS(..., all=True) on two pairs of sequences."""
    # First pair: [{'a', 'b'}, {'e'}] must be among the returned results.
    left = [{'a', 'b'}, {'e'}, {'c'}]
    right = [{'a'}, {'d'}, {'a', 'b'}, {'f'}, {'e'}]
    found = find_LCS(left, right, all=True)
    expected_member = sequence_mutable_to_immutable([{'a', 'b'}, {'e'}])
    assert expected_member in found

    # Second pair: exactly three results are expected.
    left = [{'a'}, {'a', 'b'}, {'e'}, {'c'}, {'b', 'd'}]
    right = [{'a'}, {'b', 'c', 'd'}, {'a', 'd'}]
    found = find_LCS(left, right, all=True)
    assert len(found) == 3
def jaccard_measure_misere(sequence1, sequence2, data):
    """Return the Jaccard index of the supports of two patterns over ``data``.

    Each data sequence is considered without its first element. The result
    is (sequences containing both patterns) / (sequences containing at least
    one of them), or 0 when no sequence contains either pattern.
    """
    covered_by_both = 0
    covered_by_any = 0

    for row in data:
        candidate = sequence_mutable_to_immutable(row[1:])

        # Both checks are always evaluated, whatever the first one returns.
        has_first = bool(is_subsequence(sequence1, candidate))
        has_second = bool(is_subsequence(sequence2, candidate))

        if has_first or has_second:
            covered_by_any += 1
            if has_first and has_second:
                covered_by_both += 1

    if covered_by_any == 0:
        return 0
    return covered_by_both / covered_by_any
def optimize_pattern(patterns, items, data, itemsets_memory, target_class, top_k,
                     sorted_patterns, enable_i=True,
                     quality_measure=conf.QUALITY_MEASURE):
    """Run ``exploit_arm`` on every pattern and record the results.

    :param patterns: iterable of entries where index 0 is passed to
        ``exploit_arm`` as its second argument (presumably the pattern's
        quality - confirm) and index 1 is the immutable pattern
    :param sorted_patterns: priority set receiving every optimized pattern
    :return: ``sorted_patterns.get_top_k_non_redundant(data, top_k)``
    """
    for entry in patterns:
        working_copy = sequence_immutable_to_mutable(entry[1])

        refined, refined_quality = exploit_arm(
            working_copy, entry[0], items, data, itemsets_memory,
            target_class, enable_i=enable_i, quality_measure=quality_measure)

        sorted_patterns.add(sequence_mutable_to_immutable(refined),
                            refined_quality)

    return sorted_patterns.get_top_k_non_redundant(data, top_k)
def seq_scout(data, target_class, time_budget=conf.TIME_BUDGET, top_k=conf.TOP_K, enable_i=True, vertical=True, iterations_limit=conf.ITERATIONS_NUMBER, theta=conf.THETA, quality_measure=conf.QUALITY_MEASURE): items = extract_items(data) begin = datetime.datetime.utcnow() time_budget = datetime.timedelta(seconds=time_budget) data_target_class = filter_target_class(data, target_class) sorted_patterns = PrioritySet(k=top_k, theta=theta) UCB_scores = PrioritySetUCB() itemsets_memory = get_itemset_memory(data) # removing class bitset_slot_size = len(max(data, key=lambda x: len(x))) - 1 global VERTICAL_RPZ VERTICAL_RPZ = vertical global VERTICAL_TOOLS VERTICAL_TOOLS = { "bitset_slot_size": bitset_slot_size, "first_zero_mask": compute_first_zero_mask(len(data), bitset_slot_size), "last_ones_mask": compute_last_ones_mask(len(data), bitset_slot_size), "class_data_count": count_target_class_data(data, target_class), "itemsets_bitsets": {} } N = 1 # init: we add objects with the best ucb so that they are all played one time in the main procedure. # By putting a null N, we ensure the mean of the quality will be correct for sequence in data_target_class: sequence_i = sequence_mutable_to_immutable(sequence[1:]) UCB_score = UCB(float("inf"), 1, N) UCB_scores.add(sequence_i, (UCB_score, 0, 0)) # play with time budget while datetime.datetime.utcnow( ) - begin < time_budget and N < iterations_limit: # we take the best UCB _, Ni, mean_quality, sequence = UCB_scores.pop() pattern, quality = play_arm(sequence, data, target_class, quality_measure=quality_measure) pattern = sequence_mutable_to_immutable(pattern) sorted_patterns.add(pattern, quality) # we update scores updated_quality = (Ni * mean_quality + quality) / (Ni + 1) UCB_score = UCB(updated_quality, Ni + 1, N) UCB_scores.add(sequence, (UCB_score, Ni + 1, updated_quality)) N += 1 # print("SeqScout iterations: {}".format(N)) return sorted_patterns.get_top_k_non_redundant(data, top_k) '''