Example #1
import operator
from functools import reduce

import numpy as np

# dpu (data-processing utilities) and mva (materialized-view analysis) are
# project modules assumed to be importable in the surrounding codebase.


def rank_materializable_join_graphs(materializable_join_paths, table_path):
    def score_for_key(keys_score, target):
        # keys_score is a list of (column, nunique, score) tuples; return the
        # score of the target column, or None if it is not a candidate key.
        for c, nunique, score in keys_score:
            if target == c:
                return score

    def aggr_avg(scores):
        scores = np.asarray(scores)
        return np.average(scores)

    def aggr_mul(scores):
        return reduce(operator.mul, scores)

    rank_jps = []
    keys_cache = dict()
    for mjp in materializable_join_paths:
        jump_scores = []
        for filter, l, r in mjp:
            # Each hop is (filter, left_hit, right_hit); score the left side's
            # join column by how key-like it is in its table.
            table = l.source_name
            if table not in keys_cache:
                # Profile each table only once and cache its ranked candidate keys
                path = table_path[table]
                table_df = dpu.get_dataframe(path + "/" + table)
                likely_keys_sorted = mva.most_likely_key(table_df)
                keys_cache[table] = likely_keys_sorted
            likely_keys_sorted = keys_cache[table]
            jump_score = score_for_key(likely_keys_sorted, l.field_name)
            jump_scores.append(jump_score)
        jp_score_avg = aggr_avg(jump_scores)
        jp_score_mul = aggr_mul(jump_scores)
        rank_jps.append((mjp, jp_score_avg, jp_score_mul))
    # rank join paths by average hop score, best first
    rank_jps = sorted(rank_jps, key=lambda x: x[1], reverse=True)
    return rank_jps
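
The ranking in Example #1 boils down to: score each hop's left join column by how key-like it is, then order whole join paths by the average (and product) of those hop scores. Below is a minimal, self-contained sketch of that idea; the Hit namedtuple and the key_scores values are made-up stand-ins for the real hits and the dpu/mva profiling.

from collections import namedtuple
from functools import reduce
import operator

import numpy as np

# Hypothetical stand-in for the hit objects above: only the two attributes
# the ranking code reads.
Hit = namedtuple("Hit", ["source_name", "field_name"])

# Made-up key-likelihood scores per (table, column), playing the role of
# mva.most_likely_key over the real dataframes.
key_scores = {
    ("orders", "order_id"): 0.95,
    ("orders", "status"): 0.10,
}

# Two candidate join paths, each a list of (filter, left_hit, right_hit) hops.
paths = [
    [(None, Hit("orders", "order_id"), Hit("customers", "customer_id"))],
    [(None, Hit("orders", "status"), Hit("customers", "customer_id"))],
]

ranked = []
for path in paths:
    hop_scores = [key_scores[(l.source_name, l.field_name)] for _, l, _ in path]
    ranked.append((path, np.average(hop_scores),
                   reduce(operator.mul, hop_scores)))
ranked.sort(key=lambda x: x[1], reverse=True)  # best average score first
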
Example #2
def rank_materializable_join_paths_piece(materializable_join_paths,
                                         candidate_group, table_path, dod):
    # compute rank list of likely keys for each table
    table_keys = dict()
    table_field_rank = dict()
    for table in candidate_group:
        if table in table_path:
            path = table_path[table]
        else:
            nid = (dod.aurum_api.make_drs(table)).data[0].nid
            path = dod.aurum_api.helper.get_path_nid(nid)
            table_path[table] = path
        table_df = dpu.get_dataframe(path + "/" + table)
        likely_keys_sorted = mva.most_likely_key(table_df)
        table_keys[table] = likely_keys_sorted
        field_rank = {
            payload[0]: i
            for i, payload in enumerate(likely_keys_sorted)
        }
        table_field_rank[table] = field_rank

    # 1) Split each join path into its pairs, then 2) sort each pair position individually, then 3) assemble again

    num_jumps = max(len(x) for x in materializable_join_paths)
    jump_joins = {i: [] for i in range(num_jumps)}

    # 1) split
    for annotated_jp in materializable_join_paths:
        for i, jp in enumerate(annotated_jp):
            jump_joins[i].append(jp)

    def field_to_rank(table, field):
        return table_field_rank[table][field]

    # 2) sort
    for jump, joins in jump_joins.items():
        joins = sorted(
            joins,
            key=lambda x: field_to_rank(x[1].source_name, x[1].field_name))
        jump_joins[jump] = joins

    # 3) assemble
    ranked_materialized_join_paths = [
        [] for _ in range(len(materializable_join_paths))
    ]
    for jump, joins in jump_joins.items():
        for i, join in enumerate(joins):
            ranked_materialized_join_paths[i].append(join)

    return ranked_materialized_join_paths
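
The split / sort / assemble step in Example #2 can be viewed in isolation: group hops by their position in the path, sort the hops at each position by the key-likelihood rank of their join column, and re-zip by index. A small self-contained sketch, with made-up field ranks standing in for mva.most_likely_key:

# Made-up per-table field ranks (0 = most key-like column).
field_rank = {
    "customers": {"customer_id": 0, "name": 1, "city": 2},
    "orders": {"order_id": 0, "customer_id": 1, "status": 2},
}

# Three one-hop join paths; each hop is a (left, right) pair of (table, field).
paths = [
    [(("a", "x"), ("orders", "status"))],
    [(("a", "x"), ("orders", "order_id"))],
    [(("a", "x"), ("customers", "name"))],
]

num_jumps = max(len(p) for p in paths)
jump_joins = {i: [] for i in range(num_jumps)}
for path in paths:                       # 1) split hops by position
    for i, hop in enumerate(path):
        jump_joins[i].append(hop)
for i, hops in jump_joins.items():       # 2) sort each position by field rank
    jump_joins[i] = sorted(hops, key=lambda h: field_rank[h[1][0]][h[1][1]])
ranked = [[] for _ in paths]             # 3) re-assemble paths by index
for i, hops in jump_joins.items():
    for j, hop in enumerate(hops):
        ranked[j].append(hop)
# ranked[0] now starts with the hop joining on orders.order_id
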
Example #3
    def virtual_schema_iterative_search(self,
                                        list_attributes: [str],
                                        list_samples: [str],
                                        max_hops=2,
                                        debug_enumerate_all_jps=False):
        # Align schema definition and samples
        assert len(list_attributes) == len(list_samples)
        sch_def = {
            attr: value
            for attr, value in zip(list_attributes, list_samples)
        }

        sch_def = OrderedDict(
            sorted(sch_def.items(), key=lambda x: x[0], reverse=True))

        filter_drs = self.joint_filters(sch_def)

        # We now group tables into groups that fulfill multiple filters.
        # Obtain the list of tables ordered from more to fewer filters.
        table_fulfilled_filters = defaultdict(list)
        # collect nids -- used later to obtain an access path to the tables
        table_nid = dict()
        for filter, drs in filter_drs.items():
            drs.set_table_mode()
            # All these tables fulfill the filter above
            for table in drs:
                # table_fulfilled_filters[table].append(filter)
                if filter[1] == FilterType.ATTR:
                    columns = [c for c in drs.data]  # copy
                    for c in columns:
                        if c.source_name == table:
                            table_nid[table] = c.nid
                    # if filter not in table_fulfilled_filters[table]:
                    if filter[2] not in [
                            id for _, _, id in table_fulfilled_filters[table]
                    ]:
                        table_fulfilled_filters[table].append(
                            ((filter[0], None), FilterType.ATTR, filter[2]))
                elif filter[1] == FilterType.CELL:
                    columns = [c for c in drs.data]  # copy
                    for c in columns:
                        if c.source_name == table:  # filter in this column
                            table_nid[table] = c.nid
                            # if filter not in table_fulfilled_filters[table]:
                            if filter[2] not in [
                                    id for _, _, id in
                                    table_fulfilled_filters[table]
                            ]:
                                table_fulfilled_filters[table].append(
                                    ((filter[0], c.field_name),
                                     FilterType.CELL, filter[2]))

        table_path = obtain_table_paths(table_nid, self)

        # Sort tables by number of unique fulfilled filters, then by table name
        table_fulfilled_filters = OrderedDict(
            sorted(table_fulfilled_filters.items(),
                   key=lambda el: (len({filter_id for _, _, filter_id in el[1]}),
                                   el[0]),
                   reverse=True))  # unique-filter count, then lexicographic (both descending)

        # Ordering filters for more determinism
        for k, v in table_fulfilled_filters.items():
            v = sorted(v, key=lambda el: (el[2], el[0][0]),
                       reverse=True)  # sort by id, then filter_name
            table_fulfilled_filters[k] = v

        def eager_candidate_exploration():
            def covers_filters(candidate_filters, all_filters):
                # Note: all_filters is unused; coverage is checked against the full
                # set of filter ids in filter_drs from the enclosing scope.
                all_filters_set = set([id for _, _, id in filter_drs.keys()])
                candidate_filters_set = set(
                    [id for _, _, id in candidate_filters])
                return len(candidate_filters_set) == len(all_filters_set)

            def compute_size_filter_ix(filters,
                                       candidate_group_filters_covered):
                new_fs_set = set([id for _, _, id in filters])
                candidate_fs_set = set(
                    [id for _, _, id in candidate_group_filters_covered])
                ix_size = len(
                    new_fs_set.union(candidate_fs_set)) - len(candidate_fs_set)
                return ix_size

            def clear_state():
                candidate_group.clear()
                candidate_group_filters_covered.clear()

            # Eagerly obtain groups of tables that cover as many filters as possible
            backup = []
            go_on = True
            while go_on:
                candidate_group = []
                candidate_group_filters_covered = set()
                for i in range(len(list(table_fulfilled_filters.items()))):
                    table_pivot, filters_pivot = list(
                        table_fulfilled_filters.items())[i]
                    # Eagerly add pivot
                    candidate_group.append(table_pivot)
                    candidate_group_filters_covered.update(filters_pivot)
                    # Did it cover all filters?
                    # if len(candidate_group_filters_covered) == len(filter_drs.items()):
                    if covers_filters(candidate_group_filters_covered,
                                      filter_drs.items()):
                        candidate_group = sorted(candidate_group)
                        # print("1: " + str(table_pivot))
                        yield (candidate_group,
                               candidate_group_filters_covered)  # early stop
                        # Cleaning
                        clear_state()
                        continue
                    for j in range(len(list(table_fulfilled_filters.items()))):
                        idx = i + j + 1
                        if idx == len(table_fulfilled_filters.items()):
                            break
                        table, filters = list(
                            table_fulfilled_filters.items())[idx]
                        # new_filters = len(set(filters).union(candidate_group_filters_covered)) - len(candidate_group_filters_covered)
                        new_filters = compute_size_filter_ix(
                            filters, candidate_group_filters_covered)
                        if new_filters > 0:  # add table only if it adds new filters
                            candidate_group.append(table)
                            candidate_group_filters_covered.update(filters)
                            if covers_filters(candidate_group_filters_covered,
                                              filter_drs.items()):
                                # if len(candidate_group_filters_covered) == len(filter_drs.items()):
                                candidate_group = sorted(candidate_group)
                                # print("2: " + str(table_pivot))
                                yield (candidate_group,
                                       candidate_group_filters_covered)
                                clear_state()
                                # Re-add the current pivot, only necessary in this case
                                candidate_group.append(table_pivot)
                                candidate_group_filters_covered.update(
                                    filters_pivot)
                    candidate_group = sorted(candidate_group)
                    # print("3: " + str(table_pivot))
                    if covers_filters(candidate_group_filters_covered,
                                      filter_drs.items()):
                        yield (candidate_group,
                               candidate_group_filters_covered)
                    else:
                        backup.append((list(candidate_group),
                                       set(candidate_group_filters_covered)))
                    # Cleaning
                    clear_state()
                # Before exiting, yield the backup groups in case they are useful
                for candidate_group, candidate_group_filters_covered in backup:
                    yield (candidate_group, candidate_group_filters_covered)
                go_on = False  # finished exploring all groups

        # Find ways of joining together each group
        cache_unjoinable_pairs = defaultdict(int)
        for candidate_group, candidate_group_filters_covered in eager_candidate_exploration():
            print("")
            print("Candidate group: " + str(candidate_group))
            num_unique_filters = len(
                {f_id for _, _, f_id in candidate_group_filters_covered})
            print("Covers #Filters: " + str(num_unique_filters))

            if len(candidate_group) == 1:
                table = candidate_group[0]
                path = table_path[table]
                materialized_virtual_schema = dpu.get_dataframe(path + "/" +
                                                                table)
                attrs_to_project = dpu.obtain_attributes_to_project(
                    candidate_group_filters_covered)
                # Create metadata to document this view
                view_metadata = dict()
                view_metadata["#join_graphs"] = 1
                view_metadata["join_graph"] = {
                    "nodes": [{
                        "id": -101010,
                        "label": table
                    }],
                    "edges": []
                }
                yield materialized_virtual_schema, attrs_to_project, view_metadata
                continue  # to go to the next group

            # Pre-check
            # TODO: with a connected components index we can pre-filter many of those groups without checking
            #group_with_all_relations, join_path_groups = self.joinable(candidate_group, cache_unjoinable_pairs)
            # We find the different join graphs that would join the candidate_group
            join_graphs = self.joinable(candidate_group,
                                        cache_unjoinable_pairs,
                                        max_hops=max_hops)
            if debug_enumerate_all_jps:
                for i, group in enumerate(join_graphs):
                    print("Group: " + str(i))
                    for el in group:
                        print(el)
                continue  # We are just interested in all JPs for all candidate groups

            # if not graphs skip next
            if len(join_graphs) == 0:
                print("Group: " + str(candidate_group) +
                      " is Non-Joinable with max_hops=" + str(max_hops))
                continue

            # Now we need to check every join graph individually and see if it's materializable. Only once we've
            # exhausted these join graphs do we move on to the next candidate group. We already know that each of the
            # join graphs covers all tables in candidate_group, so if they're materializable we're good.
            # materializable_join_graphs = []
            for jpg in join_graphs:
                # Obtain filters that apply to this join graph
                filters = set()
                for l, r in jpg:
                    if l.source_name in table_fulfilled_filters:
                        filters.update(table_fulfilled_filters[l.source_name])
                    if r.source_name in table_fulfilled_filters:
                        filters.update(table_fulfilled_filters[r.source_name])

                # TODO: obtain join_graph score for diff metrics. useful for ranking later
                # rank_materializable_join_graphs(materializable_join_paths, table_path, dod)
                is_join_graph_valid = self.is_join_graph_materializable(
                    jpg, table_fulfilled_filters)

                if is_join_graph_valid:
                    attrs_to_project = dpu.obtain_attributes_to_project(
                        filters)
                    materialized_virtual_schema = dpu.materialize_join_graph(
                        jpg, self)
                    # Create metadata to document this view
                    view_metadata = dict()
                    view_metadata["#join_graphs"] = len(join_graphs)
                    # view_metadata["join_graph"] = self.format_join_paths_pairhops(jpg)
                    view_metadata["join_graph"] = (
                        self.format_join_graph_into_nodes_edges(jpg))
                    yield materialized_virtual_schema, attrs_to_project, view_metadata

        print("Finished enumerating groups")
        cache_unjoinable_pairs = OrderedDict(
            sorted(cache_unjoinable_pairs.items(),
                   key=lambda x: x[1],
                   reverse=True))
        for k, v in cache_unjoinable_pairs.items():
            print(str(k) + " => " + str(v))
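
At its core, eager_candidate_exploration (Examples #3 and #4) is a greedy covering routine: tables are ordered by how many unique filters they fulfill, and a table joins the candidate group only if it contributes filter ids not yet covered, stopping early once every filter is covered. A minimal self-contained sketch of that idea with made-up filter ids:

# Made-up mapping of table -> set of fulfilled filter ids.
table_filters = {
    "t1": {0, 1},
    "t2": {1, 2},
    "t3": {3},
}
all_filters = {0, 1, 2, 3}

ordered = sorted(table_filters.items(), key=lambda kv: len(kv[1]), reverse=True)
group, covered = [], set()
for table, fids in ordered:
    if fids - covered:             # add the table only if it adds new filters
        group.append(table)
        covered |= fids
    if covered == all_filters:     # early stop: the group covers every filter
        break
print(sorted(group), covered)      # ['t1', 't2', 't3'] {0, 1, 2, 3}
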
Example #4
    def virtual_schema_iterative_search(self,
                                        list_attributes: [str],
                                        list_samples: [str],
                                        debug_enumerate_all_jps=False):
        # Align schema definition and samples
        assert len(list_attributes) == len(list_samples)
        sch_def = {
            attr: value
            for attr, value in zip(list_attributes, list_samples)
        }

        sch_def = OrderedDict(
            sorted(sch_def.items(), key=lambda x: x[0], reverse=True))

        filter_drs = self.joint_filters(sch_def)

        # We now group tables into groups that fulfill multiple filters.
        # Obtain the list of tables ordered from more to fewer filters.
        table_fulfilled_filters = defaultdict(list)
        # collect nids -- used later to obtain an access path to the tables
        table_nid = dict()
        for filter, drs in filter_drs.items():
            drs.set_table_mode()
            # All these tables fulfill the filter above
            for table in drs:
                # table_fulfilled_filters[table].append(filter)
                if filter[1] == FilterType.ATTR:
                    columns = [c for c in drs.data]  # copy
                    for c in columns:
                        if c.source_name == table:
                            table_nid[table] = c.nid
                    # if filter not in table_fulfilled_filters[table]:
                    if filter[2] not in [
                            id for _, _, id in table_fulfilled_filters[table]
                    ]:
                        table_fulfilled_filters[table].append(
                            ((filter[0], None), FilterType.ATTR, filter[2]))
                elif filter[1] == FilterType.CELL:
                    columns = [c for c in drs.data]  # copy
                    for c in columns:
                        if c.source_name == table:  # filter in this column
                            table_nid[table] = c.nid
                            # if filter not in table_fulfilled_filters[table]:
                            if filter[2] not in [
                                    id for _, _, id in
                                    table_fulfilled_filters[table]
                            ]:
                                table_fulfilled_filters[table].append(
                                    ((filter[0], c.field_name),
                                     FilterType.CELL, filter[2]))

        table_path = obtain_table_paths(table_nid, self)

        # Sort tables by number of unique fulfilled filters, then by table name
        table_fulfilled_filters = OrderedDict(
            sorted(table_fulfilled_filters.items(),
                   key=lambda el: (len({filter_id for _, _, filter_id in el[1]}),
                                   el[0]),
                   reverse=True))  # unique-filter count, then lexicographic (both descending)

        # Ordering filters for more determinism
        for k, v in table_fulfilled_filters.items():
            v = sorted(v, key=lambda el: (el[2], el[0][0]),
                       reverse=True)  # sort by id, then filter_name
            table_fulfilled_filters[k] = v

        def eager_candidate_exploration():
            def covers_filters(candidate_filters, all_filters):
                # Note: all_filters is unused; coverage is checked against the full
                # set of filter ids in filter_drs from the enclosing scope.
                all_filters_set = set([id for _, _, id in filter_drs.keys()])
                candidate_filters_set = set(
                    [id for _, _, id in candidate_filters])
                return len(candidate_filters_set) == len(all_filters_set)

            def compute_size_filter_ix(filters,
                                       candidate_group_filters_covered):
                new_fs_set = set([id for _, _, id in filters])
                candidate_fs_set = set(
                    [id for _, _, id in candidate_group_filters_covered])
                ix_size = len(
                    new_fs_set.union(candidate_fs_set)) - len(candidate_fs_set)
                return ix_size

            def clear_state():
                candidate_group.clear()
                candidate_group_filters_covered.clear()

            # Eagerly obtain groups of tables that cover as many filters as possible
            go_on = True
            while go_on:
                candidate_group = []
                candidate_group_filters_covered = set()
                for i in range(len(list(table_fulfilled_filters.items()))):
                    table_pivot, filters_pivot = list(
                        table_fulfilled_filters.items())[i]
                    # Eagerly add pivot
                    candidate_group.append(table_pivot)
                    candidate_group_filters_covered.update(filters_pivot)
                    # Did it cover all filters?
                    # if len(candidate_group_filters_covered) == len(filter_drs.items()):
                    if covers_filters(candidate_group_filters_covered,
                                      filter_drs.items()):
                        candidate_group = sorted(candidate_group)
                        # print("1: " + str(table_pivot))
                        yield (candidate_group,
                               candidate_group_filters_covered)  # early stop
                        # Cleaning
                        clear_state()
                        continue
                    for j in range(len(list(table_fulfilled_filters.items()))):
                        idx = i + j + 1
                        if idx == len(table_fulfilled_filters.items()):
                            break
                        table, filters = list(
                            table_fulfilled_filters.items())[idx]
                        # new_filters = len(set(filters).union(candidate_group_filters_covered)) - len(candidate_group_filters_covered)
                        new_filters = compute_size_filter_ix(
                            filters, candidate_group_filters_covered)
                        if new_filters > 0:  # add table only if it adds new filters
                            candidate_group.append(table)
                            candidate_group_filters_covered.update(filters)
                            if covers_filters(candidate_group_filters_covered,
                                              filter_drs.items()):
                                # if len(candidate_group_filters_covered) == len(filter_drs.items()):
                                candidate_group = sorted(candidate_group)
                                # print("2: " + str(table_pivot))
                                yield (candidate_group,
                                       candidate_group_filters_covered)
                                clear_state()
                                # Re-add the current pivot, only necessary in this case
                                candidate_group.append(table_pivot)
                                candidate_group_filters_covered.update(
                                    filters_pivot)
                    candidate_group = sorted(candidate_group)
                    # print("3: " + str(table_pivot))
                    yield (candidate_group, candidate_group_filters_covered)
                    # Cleaning
                    clear_state()
                go_on = False  # finished exploring all groups

        # Find ways of joining together each group
        cache_unjoinable_pairs = defaultdict(int)
        for candidate_group, candidate_group_filters_covered in eager_candidate_exploration():
            print("")
            print("Candidate group: " + str(candidate_group))
            num_unique_filters = len(
                {f_id for _, _, f_id in candidate_group_filters_covered})
            print("Covers #Filters: " + str(num_unique_filters))

            if len(candidate_group) == 1:
                table = candidate_group[0]
                path = table_path[table]
                materialized_virtual_schema = dpu.get_dataframe(path + "/" +
                                                                table)
                attrs_to_project = dpu.obtain_attributes_to_project(
                    (candidate_group_filters_covered, None))
                yield materialized_virtual_schema, attrs_to_project
                continue  # to go to the next group

            # Pre-check
            # TODO: with a connected components index we can pre-filter many of those groups without checking
            group_with_all_relations, join_path_groups = self.joinable(
                candidate_group, cache_unjoinable_pairs)
            if debug_enumerate_all_jps:
                print("Join paths which cover candidate group:")
                for jp in group_with_all_relations:
                    print(jp)
                print("Join graphs which cover candidate group: ")
                for i, group in enumerate(join_path_groups):
                    print("Group: " + str(i))
                    for el in group:
                        print(el)
                continue  # We are just interested in all JPs for all candidate groups

            # if not paths or graphs skip next
            if len(join_path_groups) == 0 and len(
                    group_with_all_relations) == 0:
                print("Group: " + str(candidate_group) + " is Non-Joinable")
                continue

            # We first check if the group_with_all_relations is materializable
            materializable_join_paths = []
            if len(group_with_all_relations) > 0:
                join_paths = self.tx_join_paths_to_pair_hops(
                    group_with_all_relations)
                annotated_join_paths = self.annotate_join_paths_with_filter(
                    join_paths, table_fulfilled_filters, candidate_group)
                # Check JP materialization
                print("Found " + str(len(annotated_join_paths)) +
                      " candidate join paths")
                valid_join_paths = self.verify_candidate_join_paths(
                    annotated_join_paths)
                print("Found " + str(len(valid_join_paths)) +
                      " materializable join paths")
                materializable_join_paths.extend(valid_join_paths)

            # We need that at least one JP from each group is materializable
            if len(materializable_join_paths) == 0 and len(
                    join_path_groups) == 0:
                print("No join graphs for this candidate group")
                continue
            print("Processing join graphs...")
            materializable_join_graphs = dict()
            for k, v in join_path_groups.items():
                print("Pair: " + str(k))
                join_paths = self.tx_join_paths_to_pair_hops(v)
                annotated_join_paths = self.annotate_join_paths_with_filter(
                    join_paths, table_fulfilled_filters, candidate_group)

                # Check JP materialization
                print("Found " + str(len(annotated_join_paths)) +
                      " candidate join paths for join graph")

                # For each candidate join_path, check whether it can be materialized or not,
                # then show to user (or the other way around)
                valid_join_paths = self.verify_candidate_join_paths(
                    annotated_join_paths)

                print("Found " + str(len(valid_join_paths)) +
                      " materializable join paths for join graph")

                if len(valid_join_paths) > 0:
                    materializable_join_graphs[k] = valid_join_paths
                else:
                    # This pair is non-materializable, but other groups of pairs may still cover
                    # the same tables, so we cannot yet conclude that the whole group is
                    # non-materializable; just continue.
                    continue
            # Verify whether the join_graphs cover the group or not
            covered_tables = set(candidate_group)
            for k, _ in materializable_join_graphs.items():
                (t1, t2) = k
                if t1 in covered_tables:
                    covered_tables.remove(t1)
                if t2 in covered_tables:
                    covered_tables.remove(t2)
            if len(covered_tables) > 0:
                # Now we know there are no join graphs in this group, so we explicitly mark it as such
                materializable_join_graphs.clear()
                # the next block of processing expects a list
                materializable_join_graphs = list()
            else:
                # 1) find key-groups
                keygroups = defaultdict(list)
                current_id = 0
                for keygroup in itertools.combinations(
                        list(materializable_join_graphs.keys()),
                        len(candidate_group) - 1):
                    for key in keygroup:
                        keygroups[current_id].append(
                            materializable_join_graphs[key])
                    current_id += 1

                # 2) for each key-group, enumerate all paths
                unit_jp = []
                for _, keygroup in keygroups.items():
                    # def unpack(packed_list):
                    #     for el in packed_list:
                    #         yield [v[0] for v in el]
                    args = keygroup
                    for comb in itertools.product(*args):
                        unit_jp.append(comb)

                # pack units into more compact format
                # TODO: note we are rewriting the type of a var in scope
                materializable_join_graphs = []
                for unit in unit_jp:
                    packed_unit = []
                    for el in unit:
                        packed_unit.append(el[0])
                    materializable_join_graphs.append(packed_unit)
            print("Processing join graphs...OK")

            # Merge join paths and join graphs, at this point the difference is meaningless
            # TODO: are paths necessarily contained in graphs? if so, simplify code above

            all_jgs = materializable_join_graphs + materializable_join_paths

            print("Processing materializable join paths...")

            # Sort materializable_join_paths by likely joining on key
            all_jgs_scores = rank_materializable_join_graphs(
                all_jgs, table_path, self)

            clean_jp = []
            for annotated_jp, aggr_score, mul_score in all_jgs_scores:
                jp = []
                filters = set()
                for filter, l, r in annotated_jp:
                    # Filters are dragged along via special leaf tuples where r is None;
                    # that sentinel is no longer needed here, so such hops are not included.
                    if r is not None:
                        jp.append((l, r))
                    if filter is not None:
                        filters.update(filter)
                clean_jp.append((filters, jp))

            # Debug aid: persist the cleaned join paths for offline inspection
            import pickle
            with open("check_debug.pkl", 'wb') as f:
                pickle.dump(clean_jp, f)

            for mjp in clean_jp:
                attrs_to_project = dpu.obtain_attributes_to_project(mjp)
                # materialized_virtual_schema = dpu.materialize_join_path(mjp, self)
                materialized_virtual_schema = dpu.materialize_join_graph(
                    mjp, self)
                yield materialized_virtual_schema, attrs_to_project

        print("Finished enumerating groups")
        cache_unjoinable_pairs = OrderedDict(
            sorted(cache_unjoinable_pairs.items(),
                   key=lambda x: x[1],
                   reverse=True))
        for k, v in cache_unjoinable_pairs.items():
            print(str(k) + " => " + str(v))