Example No. 1
def main(path_to_serialized_model):
    print('Loading: ' + str(path_to_serialized_model))
    network = fieldnetwork.deserialize_network(path_to_serialized_model)
    store_client = StoreHandler()
    api = API(network, store_client)
    ip_shell = InteractiveShellEmbed(banner1=init_banner, exit_msg=exit_banner)
    ip_shell()
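
A minimal sketch of how this entry point might be invoked from the command line; the argument handling below is an assumption for illustration, not part of the original example:

if __name__ == "__main__":
    import sys
    # hypothetical invocation: python main.py /path/to/serialized/model/
    if len(sys.argv) != 2:
        print("Usage: main.py <path_to_serialized_model>")
        sys.exit(1)
    main(sys.argv[1])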
Example No. 2
def init_system(path_to_serialized_model, create_reporting=False):
    print_md('Loading: *' + str(path_to_serialized_model) + "*")
    sl = time.time()
    network = fieldnetwork.deserialize_network(path_to_serialized_model)
    store_client = StoreHandler()
    api = API(network=network, store_client=store_client)
    reporting = None
    if create_reporting:
        reporting = Report(network)
    api.helper.help()
    el = time.time()
    print("Took " + str(el - sl) + " to load model")
    return api, reporting
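
A short usage sketch, assuming a serialized model exists at the placeholder path:

api, reporting = init_system("/path/to/serialized/model/", create_reporting=True)
print(reporting)  # None when create_reporting=False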
Example No. 3
    def __init__(self, network, store_client, schema_sim_index, content_sim_index):
        self.network = network
        self.store_client = store_client
        self.schema_sim_index = schema_sim_index
        self.content_sim_index = content_sim_index
        self.srql = API(self.network, self.store_client)
        self.krs = []
        self.kr_handlers = dict()

        # SS indexes
        self.num_vectors_tables = 0  # Total number of semantic vectors for tables
        self.sskey_to_ss = dict()  # ss_key to (table_name, semantic_vectors)
        self.sskey_to_vkeys = dict()  # ss_key to [vector_keys]
        self.vkey_to_sskey = dict()  # vector_key to ss_key (the ss_key that contains vector_key)
        vector_feature_dim = glove_api.get_lang_model_feature_size()
        self.ss_lsh_idx = LSHRandomProjectionsIndex(vector_feature_dim)
Example No. 4
# Ignore in-table results of neighbor searches
# Exclude certain tables
# keyword_search and neighbor_search, but on multiple contexts

import networkx as nx
from api.apiutils import Relation

from modelstore.elasticstore import StoreHandler, KWType
from knowledgerepr import fieldnetwork
from algebra import API

path_to_serialized_model = "/Users/arcarter/code/datadiscovery/test/testmodel/"
network = fieldnetwork.deserialize_network(path_to_serialized_model)
store_client = StoreHandler()

api = API(network, store_client)

# Short variables for Scope
# These are used in keyword searches
# to specify which parts of a file will be searched
source = KWType.KW_TABLE  # table/file/source name
field = KWType.KW_SCHEMA  # column names/fields
content = KWType.KW_TEXT  # content of the columns

# Short variables for Relation
# These represent edge types in the graph
# and are used for neighbor searches
# schema = Relation.SCHEMA  # similar schemas
schema_sim = Relation.SCHEMA_SIM  # Similar Schema Names
# similar content values. i.e. matching substrings and numbers
content_sim = Relation.CONTENT_SIM
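
Building on the variables defined above, a hedged sketch of the two search styles; the call signatures mirror the unit tests further below, and the keyword values and node tuple are placeholders:

# keyword search over table/file names
res_source = api.keyword_search(kw="employee", kw_type=source, max_results=10)

# keyword search over column contents
res_content = api.keyword_search(kw="Cambridge", kw_type=content, max_results=10)

# neighbor search: columns with content similar to a given (db, table, column) node
node = ("dwh", "employees.csv", "name")  # hypothetical node
neighbors = api.neighbor_search(general_input=node, relation=content_sim, max_hops=2)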
Example No. 5
    def __init__(self, network, store_client, csv_separator=","):
        self.aurum_api = API(network=network, store_client=store_client)
        self.paths_cache = dict()
        dpu.configure_csv_separator(csv_separator)
Example No. 6
class DoD:
    def __init__(self, network, store_client, csv_separator=","):
        self.aurum_api = API(network=network, store_client=store_client)
        self.paths_cache = dict()
        dpu.configure_csv_separator(csv_separator)

    def place_paths_in_cache(self, t1, t2, paths):
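        # Cache paths under both (t1, t2) and (t2, t1) so lookups are
        # insensitive to the order in which the two tables are given.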
        self.paths_cache[(t1, t2)] = paths
        self.paths_cache[(t2, t1)] = paths

    def are_paths_in_cache(self, t1, t2):
        if (t1, t2) in self.paths_cache:
            print("HIT!")
            return self.paths_cache[(t1, t2)]
        elif (t2, t1) in self.paths_cache:
            print("HIT!")
            return self.paths_cache[(t2, t1)]
        else:
            return None

    def individual_filters(self, sch_def):
        # Obtain sets that fulfill individual filters
        filter_drs = dict()
        filter_id = 0
        for attr in sch_def.keys():
            drs = self.aurum_api.search_exact_attribute(attr, max_results=200)
            filter_drs[(attr, FilterType.ATTR, filter_id)] = drs
            filter_id += 1

        for cell in sch_def.values():
            drs = self.aurum_api.search_content(cell, max_results=200)
            filter_drs[(cell, FilterType.CELL, filter_id)] = drs
            filter_id += 1
        return filter_drs

    def joint_filters(self, sch_def):
        # Obtain sets that fulfill individual filters
        filter_drs = dict()
        filter_id = 0

        for attr, cell in sch_def.items():
            if cell == "":
                drs = self.aurum_api.search_exact_attribute(attr,
                                                            max_results=50)
                filter_drs[(attr, FilterType.ATTR, filter_id)] = drs
            else:
                drs_attr = self.aurum_api.search_exact_attribute(
                    attr, max_results=50)
                drs_cell = self.aurum_api.search_content(cell, max_results=500)
                drs = self.aurum_api.intersection(drs_attr, drs_cell)
                filter_drs[(cell, FilterType.CELL, filter_id)] = drs
            filter_id += 1
        return filter_drs

    def virtual_schema_iterative_search(self,
                                        list_attributes: [str],
                                        list_samples: [str],
                                        max_hops=2,
                                        debug_enumerate_all_jps=False):
        # Align schema definition and samples
        assert len(list_attributes) == len(list_samples)
        sch_def = {
            attr: value
            for attr, value in zip(list_attributes, list_samples)
        }

        sch_def = OrderedDict(
            sorted(sch_def.items(), key=lambda x: x[0], reverse=True))

        filter_drs = self.joint_filters(sch_def)

        # Now group tables into groups that cover multiple filters.
        # Obtain the list of tables, ordered from more to fewer fulfilled filters.
        table_fulfilled_filters = defaultdict(list)
        table_nid = dict()  # collect nids -- used later to obtain an access path to the tables
        for filter, drs in filter_drs.items():
            drs.set_table_mode()
            # All these tables fulfill the filter above
            for table in drs:
                # table_fulfilled_filters[table].append(filter)
                if filter[1] == FilterType.ATTR:
                    columns = [c for c in drs.data]  # copy
                    for c in columns:
                        if c.source_name == table:
                            table_nid[table] = c.nid
                    # if filter not in table_fulfilled_filters[table]:
                    if filter[2] not in [
                            id for _, _, id in table_fulfilled_filters[table]
                    ]:
                        table_fulfilled_filters[table].append(
                            ((filter[0], None), FilterType.ATTR, filter[2]))
                elif filter[1] == FilterType.CELL:
                    columns = [c for c in drs.data]  # copy
                    for c in columns:
                        if c.source_name == table:  # filter in this column
                            table_nid[table] = c.nid
                            # if filter not in table_fulfilled_filters[table]:
                            if filter[2] not in [
                                    id for _, _, id in
                                    table_fulfilled_filters[table]
                            ]:
                                table_fulfilled_filters[table].append(
                                    ((filter[0], c.field_name),
                                     FilterType.CELL, filter[2]))

        table_path = obtain_table_paths(table_nid, self)

        # sort tables by number of fulfilled filters (the length of each value list)
        table_fulfilled_filters = OrderedDict(
            sorted(table_fulfilled_filters.items(),
                   key=lambda el:
                   (len({filter_id
                         for _, _, filter_id in el[1]}), el[0]),
                   reverse=True))  # len of unique filters, then lexico

        # Ordering filters for more determinism
        for k, v in table_fulfilled_filters.items():
            v = sorted(v, key=lambda el: (el[2], el[0][0]),
                       reverse=True)  # sort by id, then filter_name
            table_fulfilled_filters[k] = v

        def eager_candidate_exploration():
            def covers_filters(candidate_filters, all_filters):
                all_filters_set = set([id for _, _, id in filter_drs.keys()])
                candidate_filters_set = set(
                    [id for _, _, id in candidate_filters])
                if len(candidate_filters_set) == len(all_filters_set):
                    return True
                return False

            def compute_size_filter_ix(filters,
                                       candidate_group_filters_covered):
                new_fs_set = set([id for _, _, id in filters])
                candidate_fs_set = set(
                    [id for _, _, id in candidate_group_filters_covered])
                ix_size = len(
                    new_fs_set.union(candidate_fs_set)) - len(candidate_fs_set)
                return ix_size

            def clear_state():
                candidate_group.clear()
                candidate_group_filters_covered.clear()

            # Eagerly obtain groups of tables that cover as many filters as possible
            backup = []
            go_on = True
            while go_on:
                candidate_group = []
                candidate_group_filters_covered = set()
                for i in range(len(list(table_fulfilled_filters.items()))):
                    table_pivot, filters_pivot = list(
                        table_fulfilled_filters.items())[i]
                    # Eagerly add pivot
                    candidate_group.append(table_pivot)
                    candidate_group_filters_covered.update(filters_pivot)
                    # Did it cover all filters?
                    # if len(candidate_group_filters_covered) == len(filter_drs.items()):
                    if covers_filters(candidate_group_filters_covered,
                                      filter_drs.items()):
                        candidate_group = sorted(candidate_group)
                        # print("1: " + str(table_pivot))
                        yield (candidate_group,
                               candidate_group_filters_covered)  # early stop
                        # Cleaning
                        clear_state()
                        continue
                    for j in range(len(list(table_fulfilled_filters.items()))):
                        idx = i + j + 1
                        if idx == len(table_fulfilled_filters.items()):
                            break
                        table, filters = list(
                            table_fulfilled_filters.items())[idx]
                        # new_filters = len(set(filters).union(candidate_group_filters_covered)) - len(candidate_group_filters_covered)
                        new_filters = compute_size_filter_ix(
                            filters, candidate_group_filters_covered)
                        if new_filters > 0:  # add table only if it adds new filters
                            candidate_group.append(table)
                            candidate_group_filters_covered.update(filters)
                            if covers_filters(candidate_group_filters_covered,
                                              filter_drs.items()):
                                # if len(candidate_group_filters_covered) == len(filter_drs.items()):
                                candidate_group = sorted(candidate_group)
                                # print("2: " + str(table_pivot))
                                yield (candidate_group,
                                       candidate_group_filters_covered)
                                clear_state()
                                # Re-add the current pivot, only necessary in this case
                                candidate_group.append(table_pivot)
                                candidate_group_filters_covered.update(
                                    filters_pivot)
                    candidate_group = sorted(candidate_group)
                    # print("3: " + str(table_pivot))
                    if covers_filters(candidate_group_filters_covered,
                                      filter_drs.items()):
                        yield (candidate_group,
                               candidate_group_filters_covered)
                    else:
                        backup.append((list(candidate_group),
                                       set(candidate_group_filters_covered)))
                    # Cleaning
                    clear_state()
                # before exiting, yield the backup groups in case they are useful
                for candidate_group, candidate_group_filters_covered in backup:
                    yield (candidate_group, candidate_group_filters_covered)
                go_on = False  # finished exploring all groups

        # Find ways of joining together each group
        cache_unjoinable_pairs = defaultdict(int)
        for candidate_group, candidate_group_filters_covered in eager_candidate_exploration():
            print("")
            print("Candidate group: " + str(candidate_group))
            num_unique_filters = len(
                {f_id for _, _, f_id in candidate_group_filters_covered})
            print("Covers #Filters: " + str(num_unique_filters))

            if len(candidate_group) == 1:
                table = candidate_group[0]
                path = table_path[table]
                materialized_virtual_schema = dpu.get_dataframe(path + "/" +
                                                                table)
                attrs_to_project = dpu.obtain_attributes_to_project(
                    candidate_group_filters_covered)
                # Create metadata to document this view
                view_metadata = dict()
                view_metadata["#join_graphs"] = 1
                view_metadata["join_graph"] = {
                    "nodes": [{
                        "id": -101010,
                        "label": table
                    }],
                    "edges": []
                }
                yield materialized_virtual_schema, attrs_to_project, view_metadata
                continue  # to go to the next group

            # Pre-check
            # TODO: with a connected components index we can pre-filter many of those groups without checking
            #group_with_all_relations, join_path_groups = self.joinable(candidate_group, cache_unjoinable_pairs)
            # We find the different join graphs that would join the candidate_group
            join_graphs = self.joinable(candidate_group,
                                        cache_unjoinable_pairs,
                                        max_hops=max_hops)
            if debug_enumerate_all_jps:
                for i, group in enumerate(join_graphs):
                    print("Group: " + str(i))
                    for el in group:
                        print(el)
                continue  # We are just interested in all JPs for all candidate groups

            # if there are no join graphs, skip to the next group
            if len(join_graphs) == 0:
                print("Group: " + str(candidate_group) +
                      " is Non-Joinable with max_hops=" + str(max_hops))
                continue

            # Now we need to check every join graph individually and see whether it's materializable. Only once
            # we've exhausted these join graphs do we move on to the next candidate group. We already know that
            # each join graph covers all tables in candidate_group, so if any of them is materializable we're good.
            # materializable_join_graphs = []
            for jpg in join_graphs:
                # Obtain filters that apply to this join graph
                filters = set()
                for l, r in jpg:
                    if l.source_name in table_fulfilled_filters:
                        filters.update(table_fulfilled_filters[l.source_name])
                    if r.source_name in table_fulfilled_filters:
                        filters.update(table_fulfilled_filters[r.source_name])

                # TODO: obtain join_graph score for diff metrics. useful for ranking later
                # rank_materializable_join_graphs(materializable_join_paths, table_path, dod)
                is_join_graph_valid = self.is_join_graph_materializable(
                    jpg, table_fulfilled_filters)

                if is_join_graph_valid:
                    attrs_to_project = dpu.obtain_attributes_to_project(
                        filters)
                    materialized_virtual_schema = dpu.materialize_join_graph(
                        jpg, self)
                    # Create metadata to document this view
                    view_metadata = dict()
                    view_metadata["#join_graphs"] = len(join_graphs)
                    # view_metadata["join_graph"] = self.format_join_paths_pairhops(jpg)
                    view_metadata["join_graph"] = \
                        self.format_join_graph_into_nodes_edges(jpg)
                    yield materialized_virtual_schema, attrs_to_project, view_metadata

        print("Finished enumerating groups")
        cache_unjoinable_pairs = OrderedDict(
            sorted(cache_unjoinable_pairs.items(),
                   key=lambda x: x[1],
                   reverse=True))
        for k, v in cache_unjoinable_pairs.items():
            print(str(k) + " => " + str(v))

    def compute_join_graph_id(self, join_graph):
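        # A join graph is identified by the frozenset of all column nids it touches,
        # so two graphs with the same hops in a different order share the same id.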
        all_nids = []
        for hop_l, hop_r in join_graph:
            all_nids.append(hop_r.nid)
            all_nids.append(hop_l.nid)
        path_id = frozenset(all_nids)
        return path_id

    def joinable(self,
                 group_tables: [str],
                 cache_unjoinable_pairs: defaultdict(int),
                 max_hops=2):
        """
        Find all join graphs that connect the tables in group_tables. This boils down to checking
        whether there is (at least) one set of join paths that connects all tables, but we need to find
        all possible join graphs, not only one.
        :param group_tables:
        :param cache_unjoinable_pairs: this set contains pairs of tables that do not join with each other
        :return:
        """
        assert len(group_tables) > 1

        # if max_hops is smaller than the size of group_tables, no single jp can cover all tables;
        # that may not be a problem though

        # for each pair of tables in group keep list of (path, tables_covered)
        paths_per_pair = defaultdict(list)

        for table1, table2 in itertools.combinations(group_tables, 2):
            # Check if tables are already known to be unjoinable
            if (table1, table2) in cache_unjoinable_pairs.keys() or (
                    table2, table1) in cache_unjoinable_pairs.keys():
                continue
            t1 = self.aurum_api.make_drs(table1)
            t2 = self.aurum_api.make_drs(table2)
            t1.set_table_mode()
            t2.set_table_mode()
            # Check cache first, if not in cache then do the search
            drs = self.are_paths_in_cache(table1, table2)
            if drs is None:
                print("Finding paths between " + str(table1) + " and " +
                      str(table2))
                print("max hops: " + str(max_hops))
                s = time.time()
                drs = self.aurum_api.paths(t1,
                                           t2,
                                           Relation.PKFK,
                                           max_hops=max_hops)
                e = time.time()
                print("Total time: " + str((e - s)))
                self.place_paths_in_cache(table1, table2, drs)
            paths = drs.paths()  # list of lists
            # If we didn't find paths, update unjoinable_pairs cache with this pair
            if len(paths) == 0:  # then store this info, these tables do not join
                cache_unjoinable_pairs[(table1, table2)] += 1
                cache_unjoinable_pairs[(table2, table1)] += 1
            for p in paths:
                tables_covered = set()
                tables_in_group = set(group_tables)
                for hop in p:
                    if hop.source_name in tables_in_group:
                        # this is a table covered by this path
                        tables_covered.add(hop.source_name)
                paths_per_pair[(table1, table2)].append((p, tables_covered))

        if len(paths_per_pair) == 0:
            return []

        # enumerate all possible join graphs
        all_combinations = [
            el for el in itertools.product(*list(paths_per_pair.values()))
        ]
        deduplicated_paths = dict()
        # Add combinations
        for path_combination in all_combinations:
            # TODO: is this general if max_hops > 2?
            for p1, p2 in itertools.combinations(path_combination, 2):
                path1, tables_covered1 = p1
                path2, tables_covered2 = p2
                # does combining these two paths help to cover more tables?
                if len(tables_covered1) > len(tables_covered2):
                    current_cover_len = len(tables_covered1)
                else:
                    current_cover_len = len(tables_covered2)
                potential_cover = tables_covered1.union(tables_covered2)
                joinable_paths = tables_covered1.intersection(tables_covered2)
                potential_cover_len = len(potential_cover)
                # if we cover more tables, and the paths are joinable (at least one table in common)
                if potential_cover_len > current_cover_len and len(
                        joinable_paths) > 0:
                    # Transform paths into pair-hops so we can join them together
                    tx_path1 = self.transform_join_path_to_pair_hop(path1)
                    tx_path2 = self.transform_join_path_to_pair_hop(path2)
                    # combine the paths
                    combined_path = tx_path1 + tx_path2
                    path_id = self.compute_join_graph_id(combined_path)
                    # If I haven't generated this path elsewhere, then I add it along with the tables it covers
                    if path_id not in deduplicated_paths:
                        deduplicated_paths[path_id] = (combined_path,
                                                       potential_cover)

        # Add initial paths that may cover all tables and remove those that do not
        for p, tables_covered in list(paths_per_pair.values())[0]:
            if len(tables_covered) == len(group_tables):
                tx_p = self.transform_join_path_to_pair_hop(p)
                path_id = self.compute_join_graph_id(tx_p)
                deduplicated_paths[path_id] = (tx_p, tables_covered)

        # Now we filter out all paths that do not cover all the tables in the group
        covering_join_graphs = [
            jg[0] for _, jg in deduplicated_paths.items()
            if len(jg[1]) == len(group_tables)
        ]

        # Finally sort by number of required joins
        join_graphs = sorted(covering_join_graphs, key=lambda x: len(x))
        return join_graphs

    def format_join_graph_into_nodes_edges(self, join_graph):
        nodes = dict()
        edges = []
        for jp in join_graph:
            # Add nodes
            for hop in jp:
                label = hop.db_name + ":" + hop.source_name
                node_descr = {
                    "id": hash(label),
                    "label": label
                }  # cannot use nid cause that's for cols not rels
                node_id = hash(label)
                if node_id not in nodes:
                    nodes[node_id] = node_descr
            l, r = jp
            l_label = l.db_name + ":" + l.source_name
            r_label = r.db_name + ":" + r.source_name
            edge_descr = {"from": hash(l_label), "to": hash(r_label)}
            edges.append(edge_descr)
        return {"nodes": list(nodes.values()), "edges": list(edges)}

    def transform_join_path_to_pair_hop(self, join_path):
        """
        1. take a join path, 2. split it into consecutive (l, r) hop pairs, 3. drop intra-table pairs
        :param join_path:
        :return:
        """
        jp_hops = []
        pair = []
        for hop in join_path:
            pair.append(hop)
            if len(pair) == 2:
                jp_hops.append(tuple(pair))
                pair.clear()
                pair.append(hop)
        # Now remove pairs with pointers within same relation, as we don't need to join these
        jp_hops = [(l, r) for l, r in jp_hops
                   if l.source_name != r.source_name]
        return jp_hops

    def is_join_graph_materializable(self, join_graph,
                                     table_fulfilled_filters):
        # FIXME: add a way of collecting the join cardinalities and propagating them outside as well
        local_intermediates = dict()

        for l, r in join_graph:
            # apply filters to l
            if l.source_name not in local_intermediates:
                l_path = self.aurum_api.helper.get_path_nid(l.nid)
                # If there are filters apply them
                if l.source_name in table_fulfilled_filters:
                    filters_l = table_fulfilled_filters[l.source_name]
                    filtered_l = None
                    for info, filter_type, filter_id in filters_l:
                        if filter_type == FilterType.ATTR:
                            filtered_l = dpu.read_relation(l_path +
                                                           l.source_name)
                            continue  # no need to filter anything if the filter is only attribute type
                        attribute = info[1]
                        cell_value = info[0]
                        filtered_l = dpu.apply_filter(l_path + l.source_name,
                                                      attribute, cell_value)

                    if len(filtered_l) == 0:
                        return False  # filter does not leave any data => non-joinable
                # If there are no filters, do not apply any
                else:
                    filtered_l = dpu.read_relation(l_path + l.source_name)
            else:
                filtered_l = local_intermediates[l.source_name]
            local_intermediates[l.source_name] = filtered_l

            # apply filters to r
            if r.source_name not in local_intermediates:
                r_path = self.aurum_api.helper.get_path_nid(r.nid)
                # If there are filters apply them
                if r.source_name in table_fulfilled_filters:
                    filters_r = table_fulfilled_filters[r.source_name]
                    filtered_r = None
                    for info, filter_type, filter_id in filters_r:
                        if filter_type == FilterType.ATTR:
                            filtered_r = dpu.read_relation(r_path +
                                                           r.source_name)
                            continue  # no need to filter anything if the filter is only attribute type
                        attribute = info[1]
                        cell_value = info[0]
                        filtered_r = dpu.apply_filter(r_path + r.source_name,
                                                      attribute, cell_value)

                    if len(filtered_r) == 0:
                        return False  # filter does not leave any data => non-joinable
                # If there are no filters, do not apply any
                else:
                    filtered_r = dpu.read_relation(r_path + r.source_name)
            else:
                filtered_r = local_intermediates[r.source_name]
            local_intermediates[r.source_name] = filtered_r

            # check that the cardinality of the materialized join is > 0
            joined = dpu.join_ab_on_key(filtered_l,
                                        filtered_r,
                                        l.field_name,
                                        r.field_name,
                                        suffix_str="_x")

            if len(joined) == 0:
                return False  # non-joinable hop enough to discard join graph
        # if we make it through all hops, then the join graph is materializable (i.e., verified)
        return True
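
A sketch of how the DoD class above might be driven end to end; network and store_client are assumed to come from an already-deserialized model, the attribute/sample values are placeholders, and treating the yielded view as a pandas DataFrame is an assumption about dpu's return type:

# network = fieldnetwork.deserialize_network("/path/to/serialized/model/")
# store_client = StoreHandler()
dod = DoD(network, store_client, csv_separator=",")
attrs = ["Employee Name", "Department"]  # hypothetical attributes
samples = ["", "Engineering"]            # empty string => attribute-only filter
for view, attrs_to_project, view_metadata in dod.virtual_schema_iterative_search(
        attrs, samples, max_hops=2):
    print(view_metadata["#join_graphs"], attrs_to_project)
    print(view.head())  # assumes dpu materializes views as pandas DataFrames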
Example No. 7
class testAlgebra(unittest.TestCase):

    def setUp(self):
        self.m_network = MagicMock()
        self.m_store_client = MagicMock()
        self.api = API(self.m_network, self.m_store_client)

    def test_keyword_search_db(self):
        # not implemented
        pass

    @patch('algebra.DRS', MagicMock(return_value='return_drs'))
    def test_keyword_search_source(self, *args):
        kw = 'foo'
        kw_type = 0
        max_results = 11
        search_keywords = self.m_store_client.search_keywords

        result = self.api.keyword_search(
            kw=kw, kw_type=kw_type, max_results=max_results)

        self.m_network.assert_not_called()
        search_keywords.assert_called_with(
            keywords=kw, elasticfieldname=0,
            max_hits=max_results)
        self.assertEqual(result, 'return_drs')

    @patch('algebra.DRS', MagicMock(return_value='return_drs'))
    def test_keyword_search_field(self):
        kw = 'foo'
        kw_type = 0
        max_results = 11
        search_keywords = self.m_store_client.search_keywords

        result = self.api.keyword_search(
            kw=kw, kw_type=kw_type, max_results=max_results)

        self.m_network.assert_not_called()
        search_keywords.assert_called_with(
            keywords=kw, elasticfieldname=0,
            max_hits=max_results)
        self.assertEqual(result, 'return_drs')

    @patch('algebra.DRS', MagicMock(return_value='return_drs'))
    def test_keyword_search_content(self):
        kw = 'foo'
        kw_type = 0
        max_results = 11
        search_keywords = self.m_store_client.search_keywords

        result = self.api.keyword_search(
            kw=kw, kw_type=kw_type, max_results=max_results)

        self.m_network.assert_not_called()
        search_keywords.assert_called_with(
            keywords=kw, elasticfieldname=0,
            max_hits=max_results)
        self.assertEqual(result, 'return_drs')

    def test_union(self):
        a = MagicMock()
        b = MagicMock()
        self.api._general_to_drs = MagicMock(return_value=a)

        res = self.api.union(a, b)

        self.api._general_to_drs.assert_called()
        self.assertEqual(a.union(b), res)

    def test_intersection(self):
        a = MagicMock()
        b = MagicMock()
        self.api._general_to_drs = MagicMock(return_value=a)
        self.api._assert_same_mode = MagicMock()

        res = self.api.intersection(a, b)

        self.api._general_to_drs.assert_called()
        self.assertEqual(a.intersection(b), res)
        pass

    def test_difference(self):
        a = MagicMock()
        b = MagicMock()
        self.api._general_to_drs = MagicMock(return_value=a)
        self.api._assert_same_mode = MagicMock()

        res = self.api.difference(a, b)

        self.api._general_to_drs.assert_called()
        self.assertEqual(a.set_difference(b), res)
        pass

    def test_paths(self):
        self.api._general_to_drs = MagicMock()
        pass

    @patch('algebra.DRS', MagicMock())
    def test_traverse(self):
        self.api._general_to_drs = MagicMock()
        res = self.api.traverse(a='drs', primitive='primitive')

        self.api._general_to_drs.assert_called()
        res.absorb_provenance.assert_called()

        pass

    """
    Neighbor Search
    """

    @patch('algebra.DRS', MagicMock(return_value=MagicMock()))
    def test_neighbor_search_pkfk_node(self):
        db = 'db'
        source = 'source_table'
        field = 'column'
        node = (db, source, field)

        relation = Relation.PKFK
        max_hops = 11
        # algebra.DRS.mode = 'foo'

        # mock out i_drs in neighbor_search. Make it iterable
        self.api._general_to_drs = MagicMock()
        # self.api._general_to_drs.return_value.mode = True
        # self.api._general_to_drs.return_value = iter(['foo', 'bar'])

        self.api.neighbor_search(
            general_input=node, relation=relation, max_hops=max_hops)

        self.api._general_to_drs.assert_called_with(node)
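
A minimal way to make the test class above directly runnable with the standard unittest runner, assuming it lives in its own module:

if __name__ == '__main__':
    unittest.main()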
Example No. 8
    def setUp(self):
        self.m_network = MagicMock()
        self.m_store_client = MagicMock()
        self.api = API(self.m_network, self.m_store_client)
Example No. 9
class TestAlgebraHelpers(unittest.TestCase):

    def setUp(self):
        self.m_network = MagicMock()
        self.m_store_client = MagicMock()
        self.api = API(self.m_network, self.m_store_client)

    @patch('algebra.Hit', MagicMock(return_value='result_hit'))
    def test_nid_to_node(self):
        self.api._network.get_info_for = MagicMock(
            return_value=[('t', 'o', 'op', 'le')])
        nid = 123
        result = self.api._nid_to_hit(nid=nid)
        self.assertEqual(result, 'result_hit')

    @patch('algebra.Hit', MagicMock(return_value='result_hit'))
    @patch('algebra.id_from', MagicMock())
    def test_node_to_hit(self):
        node = ('foo', 'bar', 'fizz')
        result = self.api._node_to_hit(node=node)
        self.assertEqual(result, 'result_hit')

    @patch('algebra.DRS', MagicMock(return_value='result_drs'))
    def test_hit_to_drs_no_table_mode(self):
        self.api._network.get_hits_from_table = MagicMock()
        hit = 'hit'
        result = self.api._hit_to_drs(hit=hit)
        self.assertEqual(result, 'result_drs')
        self.api._network.get_hits_from_table.assert_not_called()

    @patch('algebra.DRS', MagicMock())
    def test_hit_to_drs_with_table_mode(self):
        self.api._network.get_hits_from_table = MagicMock()
        hit = namedtuple(
            'Hit', 'nid, db_name, source_name, field_name, score')
        hit.source_name = 'table'

        self.api._hit_to_drs(hit=hit, table_mode=True)
        self.assertEqual(self.api._network.get_hits_from_table.called, True)

    @patch('algebra.id_from', MagicMock())
    @patch('algebra.isinstance', MagicMock(
        side_effect=[False, True, True, True, True, True, True]))
    @patch('algebra.DRS', MagicMock())
    def test_general_to_drs(self):
        nid = 1
        self.api._nid_to_hit = MagicMock()
        self.api._node_to_hit = MagicMock(return_value='t_hit')
        self.api._hit_to_drs = MagicMock(return_value='drs')

        result = self.api._general_to_drs(nid)

        self.api._nid_to_hit.assert_called_with(nid)
        # self.api._node_to_hit.assert_called_with('n_hit')
        # self.api._hit_to_drs.assert_called_with('n_hit')

        self.assertEqual(result, 'drs')

    @patch('algebra.isinstance', MagicMock(return_value=False))
    def test_general_to_drs_fail_case(self):
        self.assertRaises(ValueError, self.api._general_to_drs, 'bad input')

    def test_table_drs_to_field_drs(self):
        m_drs = MagicMock()
        m_drs.set_field_mode = MagicMock()

        self.api._general_to_drs = MagicMock(return_value=m_drs)
        self.api._hit_to_drs = MagicMock(return_value=True)

        self.api._general_to_field_drs('foo')

        # assert that this was called
        self.assertTrue(m_drs.set_fields_mode.called)

        # Having trouble testing anything inside of the iteration
        # there should be a test to make sure that
        # o_drs.absorb_provenance(i_drs) is called.

        # We should be able to check this, but have trouble because the i_drs
        # mock isn't fleshed out as well as it should be.
        # self.m_network.assert_called_with('return_hit', Relation.PKFK)
        # self.assertEqual(result, 'return_drs')
        pass

    @patch('algebra.isinstance', MagicMock(return_value=True))
    def test_make_drs(self):
        general_input = ['foo', 'bar']
        self.api._general_to_drs = MagicMock(return_value=True)
        self.api.union = MagicMock(return_value=True)
        res = self.api.make_drs(general_input)

        self.assertTrue(res)
Example No. 10
    def test_initialization(self, *args):
        api = API(self.m_network, self.m_store_client)

        self.assertEqual(api._network, self.m_network)
        self.assertEqual(api._store_client, self.m_store_client)
Example No. 11
class DoD:
    def __init__(self, network, store_client, csv_separator=","):
        self.aurum_api = API(network=network, store_client=store_client)
        dpu.configure_csv_separator(csv_separator)

    def individual_filters(self, sch_def):
        # Obtain sets that fulfill individual filters
        filter_drs = dict()
        filter_id = 0
        for attr in sch_def.keys():
            drs = self.aurum_api.search_exact_attribute(attr, max_results=200)
            filter_drs[(attr, FilterType.ATTR, filter_id)] = drs
            filter_id += 1

        for cell in sch_def.values():
            drs = self.aurum_api.search_content(cell, max_results=200)
            filter_drs[(cell, FilterType.CELL, filter_id)] = drs
            filter_id += 1
        return filter_drs

    def joint_filters(self, sch_def):
        # Obtain sets that fulfill individual filters
        filter_drs = dict()
        filter_id = 0

        for attr, cell in sch_def.items():
            if cell == "":
                drs = self.aurum_api.search_exact_attribute(attr,
                                                            max_results=50)
                filter_drs[(attr, FilterType.ATTR, filter_id)] = drs
            else:
                drs_attr = self.aurum_api.search_exact_attribute(
                    attr, max_results=50)
                drs_cell = self.aurum_api.search_content(cell, max_results=500)
                drs = self.aurum_api.intersection(drs_attr, drs_cell)
                filter_drs[(cell, FilterType.CELL, filter_id)] = drs
            filter_id += 1
        return filter_drs

    def virtual_schema_iterative_search(self,
                                        list_attributes: [str],
                                        list_samples: [str],
                                        debug_enumerate_all_jps=False):
        # Align schema definition and samples
        assert len(list_attributes) == len(list_samples)
        sch_def = {
            attr: value
            for attr, value in zip(list_attributes, list_samples)
        }

        sch_def = OrderedDict(
            sorted(sch_def.items(), key=lambda x: x[0], reverse=True))

        filter_drs = self.joint_filters(sch_def)

        # Now group tables into groups that cover multiple filters.
        # Obtain the list of tables, ordered from more to fewer fulfilled filters.
        table_fulfilled_filters = defaultdict(list)
        table_nid = dict()  # collect nids -- used later to obtain an access path to the tables
        for filter, drs in filter_drs.items():
            drs.set_table_mode()
            # All these tables fulfill the filter above
            for table in drs:
                # table_fulfilled_filters[table].append(filter)
                if filter[1] == FilterType.ATTR:
                    columns = [c for c in drs.data]  # copy
                    for c in columns:
                        if c.source_name == table:
                            table_nid[table] = c.nid
                    # if filter not in table_fulfilled_filters[table]:
                    if filter[2] not in [
                            id for _, _, id in table_fulfilled_filters[table]
                    ]:
                        table_fulfilled_filters[table].append(
                            ((filter[0], None), FilterType.ATTR, filter[2]))
                elif filter[1] == FilterType.CELL:
                    columns = [c for c in drs.data]  # copy
                    for c in columns:
                        if c.source_name == table:  # filter in this column
                            table_nid[table] = c.nid
                            # if filter not in table_fulfilled_filters[table]:
                            if filter[2] not in [
                                    id for _, _, id in
                                    table_fulfilled_filters[table]
                            ]:
                                table_fulfilled_filters[table].append(
                                    ((filter[0], c.field_name),
                                     FilterType.CELL, filter[2]))

        table_path = obtain_table_paths(table_nid, self)

        # sort tables by number of fulfilled filters (the length of each value list)
        table_fulfilled_filters = OrderedDict(
            sorted(table_fulfilled_filters.items(),
                   key=lambda el:
                   (len({filter_id
                         for _, _, filter_id in el[1]}), el[0]),
                   reverse=True))  # len of unique filters, then lexico

        # Ordering filters for more determinism
        for k, v in table_fulfilled_filters.items():
            v = sorted(v, key=lambda el: (el[2], el[0][0]),
                       reverse=True)  # sort by id, then filter_name
            table_fulfilled_filters[k] = v

        def eager_candidate_exploration():
            def covers_filters(candidate_filters, all_filters):
                all_filters_set = set([id for _, _, id in filter_drs.keys()])
                candidate_filters_set = set(
                    [id for _, _, id in candidate_filters])
                if len(candidate_filters_set) == len(all_filters_set):
                    return True
                return False

            def compute_size_filter_ix(filters,
                                       candidate_group_filters_covered):
                new_fs_set = set([id for _, _, id in filters])
                candidate_fs_set = set(
                    [id for _, _, id in candidate_group_filters_covered])
                ix_size = len(
                    new_fs_set.union(candidate_fs_set)) - len(candidate_fs_set)
                return ix_size

            def clear_state():
                candidate_group.clear()
                candidate_group_filters_covered.clear()

            # Eagerly obtain groups of tables that cover as many filters as possible
            go_on = True
            while go_on:
                candidate_group = []
                candidate_group_filters_covered = set()
                for i in range(len(list(table_fulfilled_filters.items()))):
                    table_pivot, filters_pivot = list(
                        table_fulfilled_filters.items())[i]
                    # Eagerly add pivot
                    candidate_group.append(table_pivot)
                    candidate_group_filters_covered.update(filters_pivot)
                    # Did it cover all filters?
                    # if len(candidate_group_filters_covered) == len(filter_drs.items()):
                    if covers_filters(candidate_group_filters_covered,
                                      filter_drs.items()):
                        candidate_group = sorted(candidate_group)
                        # print("1: " + str(table_pivot))
                        yield (candidate_group,
                               candidate_group_filters_covered)  # early stop
                        # Cleaning
                        clear_state()
                        continue
                    for j in range(len(list(table_fulfilled_filters.items()))):
                        idx = i + j + 1
                        if idx == len(table_fulfilled_filters.items()):
                            break
                        table, filters = list(
                            table_fulfilled_filters.items())[idx]
                        # new_filters = len(set(filters).union(candidate_group_filters_covered)) - len(candidate_group_filters_covered)
                        new_filters = compute_size_filter_ix(
                            filters, candidate_group_filters_covered)
                        if new_filters > 0:  # add table only if it adds new filters
                            candidate_group.append(table)
                            candidate_group_filters_covered.update(filters)
                            if covers_filters(candidate_group_filters_covered,
                                              filter_drs.items()):
                                # if len(candidate_group_filters_covered) == len(filter_drs.items()):
                                candidate_group = sorted(candidate_group)
                                # print("2: " + str(table_pivot))
                                yield (candidate_group,
                                       candidate_group_filters_covered)
                                clear_state()
                                # Re-add the current pivot, only necessary in this case
                                candidate_group.append(table_pivot)
                                candidate_group_filters_covered.update(
                                    filters_pivot)
                    candidate_group = sorted(candidate_group)
                    # print("3: " + str(table_pivot))
                    yield (candidate_group, candidate_group_filters_covered)
                    # Cleaning
                    clear_state()
                go_on = False  # finished exploring all groups

        # Find ways of joining together each group
        cache_unjoinable_pairs = defaultdict(int)
        for candidate_group, candidate_group_filters_covered in eager_candidate_exploration():
            print("")
            print("Candidate group: " + str(candidate_group))
            num_unique_filters = len(
                {f_id for _, _, f_id in candidate_group_filters_covered})
            print("Covers #Filters: " + str(num_unique_filters))

            if len(candidate_group) == 1:
                table = candidate_group[0]
                path = table_path[table]
                materialized_virtual_schema = dpu.get_dataframe(path + "/" +
                                                                table)
                attrs_to_project = dpu.obtain_attributes_to_project(
                    (candidate_group_filters_covered, None))
                yield materialized_virtual_schema, attrs_to_project
                continue  # to go to the next group

            # Pre-check
            # TODO: with a connected components index we can pre-filter many of those groups without checking
            group_with_all_relations, join_path_groups = self.joinable(
                candidate_group, cache_unjoinable_pairs)
            if debug_enumerate_all_jps:
                print("Join paths which cover candidate group:")
                for jp in group_with_all_relations:
                    print(jp)
                print("Join graphs which cover candidate group: ")
                for i, group in enumerate(join_path_groups):
                    print("Group: " + str(i))
                    for el in group:
                        print(el)
                continue  # We are just interested in all JPs for all candidate groups

            # if there are neither join paths nor join graphs, skip to the next group
            if len(join_path_groups) == 0 and len(
                    group_with_all_relations) == 0:
                print("Group: " + str(candidate_group) + " is Non-Joinable")
                continue

            # We first check if the group_with_all_relations is materializable
            materializable_join_paths = []
            if len(group_with_all_relations) > 0:
                join_paths = self.tx_join_paths_to_pair_hops(
                    group_with_all_relations)
                annotated_join_paths = self.annotate_join_paths_with_filter(
                    join_paths, table_fulfilled_filters, candidate_group)
                # Check JP materialization
                print("Found " + str(len(annotated_join_paths)) +
                      " candidate join paths")
                valid_join_paths = self.verify_candidate_join_paths(
                    annotated_join_paths)
                print("Found " + str(len(valid_join_paths)) +
                      " materializable join paths")
                materializable_join_paths.extend(valid_join_paths)

            # We need that at least one JP from each group is materializable
            if len(materializable_join_paths) == 0 and len(
                    join_path_groups) == 0:
                print("No join graphs for this candidate group")
                continue
            print("Processing join graphs...")
            materializable_join_graphs = dict()
            for k, v in join_path_groups.items():
                print("Pair: " + str(k))
                join_paths = self.tx_join_paths_to_pair_hops(v)
                annotated_join_paths = self.annotate_join_paths_with_filter(
                    join_paths, table_fulfilled_filters, candidate_group)

                # Check JP materialization
                print("Found " + str(len(annotated_join_paths)) +
                      " candidate join paths for join graph")

                # For each candidate join_path, check whether it can be materialized or not,
                # then show to user (or the other way around)
                valid_join_paths = self.verify_candidate_join_paths(
                    annotated_join_paths)

                print("Found " + str(len(valid_join_paths)) +
                      " materializable join paths for join graph")

                if len(valid_join_paths) > 0:
                    materializable_join_graphs[k] = valid_join_paths
                else:
                    # This pair is non-materializable, but there may be other groups of pairs that cover
                    # the same tables, so we just continue; we cannot determine yet whether the whole
                    # group is non-materializable.
                    continue
            # Verify whether the join_graphs cover the group or not
            covered_tables = set(candidate_group)
            for k, _ in materializable_join_graphs.items():
                (t1, t2) = k
                if t1 in covered_tables:
                    covered_tables.remove(t1)
                if t2 in covered_tables:
                    covered_tables.remove(t2)
            if len(covered_tables) > 0:
                # now we know there are no join graphs in this group, so we explicitly mark it as such
                materializable_join_graphs.clear()
                materializable_join_graphs = list()  # the next processing block expects a list
            else:
                # 1) find key-groups
                keygroups = defaultdict(list)
                current_id = 0
                for keygroup in itertools.combinations(
                        list(materializable_join_graphs.keys()),
                        len(candidate_group) - 1):
                    for key in keygroup:
                        keygroups[current_id].append(
                            materializable_join_graphs[key])
                    current_id += 1

                # 2) for each key-group, enumerate all paths
                unit_jp = []
                for _, keygroup in keygroups.items():
                    # def unpack(packed_list):
                    #     for el in packed_list:
                    #         yield [v[0] for v in el]
                    args = keygroup
                    for comb in itertools.product(*args):
                        unit_jp.append(comb)

                # pack units into more compact format
                materializable_join_graphs = []  # TODO: note we are rewriting the type of a var in scope
                for unit in unit_jp:
                    packed_unit = []
                    for el in unit:
                        packed_unit.append(el[0])
                    materializable_join_graphs.append(packed_unit)
            print("Processing join graphs...OK")

            # Merge join paths and join graphs, at this point the difference is meaningless
            # TODO: are paths necessarily contained in graphs? if so, simplify code above

            all_jgs = materializable_join_graphs + materializable_join_paths

            print("Processing materializable join paths...")

            # Sort materializable_join_paths by likely joining on key
            all_jgs_scores = rank_materializable_join_graphs(
                all_jgs, table_path, self)

            clean_jp = []
            for annotated_jp, aggr_score, mul_score in all_jgs_scores:
                jp = []
                filters = set()
                for filter, l, r in annotated_jp:
                    # To drag filters along, there is a special leaf tuple where r may be None;
                    # since we no longer need it at this point, we check for it and do not include it
                    if r is not None:
                        jp.append((l, r))
                    if filter is not None:
                        filters.update(filter)
                clean_jp.append((filters, jp))

            import pickle
            with open("check_debug.pkl", 'wb') as f:
                pickle.dump(clean_jp, f)

            for mjp in clean_jp:
                attrs_to_project = dpu.obtain_attributes_to_project(mjp)
                # materialized_virtual_schema = dpu.materialize_join_path(mjp, self)
                materialized_virtual_schema = dpu.materialize_join_graph(
                    mjp, self)
                yield materialized_virtual_schema, attrs_to_project

        print("Finished enumerating groups")
        cache_unjoinable_pairs = OrderedDict(
            sorted(cache_unjoinable_pairs.items(),
                   key=lambda x: x[1],
                   reverse=True))
        for k, v in cache_unjoinable_pairs.items():
            print(str(k) + " => " + str(v))

    def joinable(self, group_tables: [str],
                 cache_unjoinable_pairs: defaultdict(int)):
        """
        Check whether there is a join graph that connects the tables in the group. This boils down to
        checking whether there is a set of join paths that connects all tables.
        :param group_tables:
        :param cache_unjoinable_pairs: this set contains pairs of tables that do not join with each other
        :return:
        """
        assert len(group_tables) > 1

        # Check first with the cache whether these are unjoinable
        for table1, table2 in itertools.combinations(group_tables, 2):
            if (table1, table2) in cache_unjoinable_pairs.keys() or (
                    table2, table1) in cache_unjoinable_pairs.keys():
                # We count the attempt
                cache_unjoinable_pairs[(table1, table2)] += 1
                cache_unjoinable_pairs[(table2, table1)] += 1
                print(table1 + " unjoinable to: " + table2 + " skipping...")
                return [], []

        # if max_hops is smaller than the size of group_tables, no single jp can cover all tables;
        # that may not be a problem though
        max_hops = 2

        group_with_all_tables = []

        join_path_groups_dict = dict()  # store groups, as many as pairs of tables in the group
        for table1, table2 in itertools.combinations(group_tables, 2):
            # if table1 == "Drupal_employee_directory.csv" and\
            #                 table2 == "Employee_directory.csv" or\
            #                 table2 == "Drupal_employee_directory.csv" and\
            #                 table1 == "Employee_directory.csv":
            #     a = 1
            t1 = self.aurum_api.make_drs(table1)
            t2 = self.aurum_api.make_drs(table2)
            t1.set_table_mode()
            t2.set_table_mode()
            drs = self.aurum_api.paths(t1,
                                       t2,
                                       Relation.PKFK,
                                       max_hops=max_hops)
            paths = drs.paths()  # list of lists
            group = []
            if len(paths) == 0:  # then store this info, these tables do not join
                cache_unjoinable_pairs[(table1, table2)] += 1
                cache_unjoinable_pairs[(table2, table1)] += 1
            for p in paths:
                tables_covered = set(group_tables)
                for hop in p:
                    if hop.source_name in tables_covered:
                        tables_covered.remove(hop.source_name)
                if len(tables_covered) == 0:
                    group_with_all_tables.append(p)  # this path covers all tables in group
                else:
                    group.append(p)
            join_path_groups_dict[(table1, table2)] = group

        # Remove invalid combinations from join_path_groups_dict
        for i in range(len(group_tables)):
            invalid = False
            table1 = group_tables[i]
            for table2 in group_tables[i + 1:]:
                if len(join_path_groups_dict[(table1, table2)]) == 0:
                    invalid = True
            if invalid:
                # table 1 does not join to all the others, so no join graph here
                to_remove = set()
                for k in join_path_groups_dict.keys():
                    if k[0] == table1:
                        to_remove.add(k)
                for el in to_remove:
                    del join_path_groups_dict[el]

        # Now verify that it's possible to obtain a join graph from these jps
        if len(group_with_all_tables) == 0:
            if len(join_path_groups_dict.items()) == 0:
                return [], []  # no jps covering all tables and empty groups => no join graph
        return group_with_all_tables, join_path_groups_dict
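
    # Illustrative note (not part of the original class): joinable() returns two things that the caller
    # is expected to combine. A hypothetical usage sketch:
    #
    #   covering_paths, pair_groups = dod.joinable(["a.csv", "b.csv", "c.csv"], defaultdict(int))
    #   # covering_paths: join paths that already touch every table in the group
    #   # pair_groups: dict keyed by (table1, table2) with the remaining join paths found for that pair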

    def format_join_paths(self, join_paths):
        """
        Transform this into something readable
        :param join_paths: [(hit, hit)]
        :return:
        """
        formatted_jps = []
        for jp in join_paths:
            formatted_jp = ""
            for hop in jp:
                hop_str = hop.db_name + "." + hop.source_name + "." + hop.field_name
                if formatted_jp == "":
                    formatted_jp += hop_str
                else:
                    formatted_jp += " -> " + hop_str
            formatted_jps.append(formatted_jp)
        return formatted_jps
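
    # For illustration only (hypothetical hit names): a two-hop join path formats to an arrow-separated
    # string of db_name.source_name.field_name entries, e.g.
    #   "dwh.tableA.csv.colX -> dwh.tableB.csv.colY"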

    def tx_join_paths_to_pair_hops(self, join_paths):
        """
        Transform each join path (a list of hits) into a list of (l, r) hop pairs,
        dropping pairs whose two ends belong to the same relation.
        :param join_paths:
        :return:
        """
        join_paths_hops = []
        for jp in join_paths:
            jp_hops = []
            pair = []
            for hop in jp:
                pair.append(hop)
                if len(pair) == 2:
                    jp_hops.append(tuple(pair))
                    pair.clear()
                    pair.append(hop)
            # Now remove pairs with pointers within same relation
            jp_hops = [(l, r) for l, r in jp_hops
                       if l.source_name != r.source_name]
            join_paths_hops.append(jp_hops)
        return join_paths_hops

    def annotate_join_paths_with_filter(self, join_paths,
                                        table_fulfilled_filters,
                                        candidate_group):
        # FIXME: does this keep *all* relevant filters?
        annotated_jps = []
        l = None  # memory for last hop
        r = None
        for jp in join_paths:
            # For each hop
            annotated_jp = []
            for l, r in jp:
                # each filter is a (attr, filter-type)
                # Check if l side is a table in the group or just an intermediary
                if l.source_name in candidate_group:  # it's a table in group, so retrieve filters
                    filters = table_fulfilled_filters[l.source_name]
                else:
                    filters = None  # indicating no need to check filters for intermediary node
                annotated_hop = (filters, l, r)
                annotated_jp.append(annotated_hop)
            annotated_jps.append(annotated_jp)
            # Finally we must check if the very last table was also part of the jp, so we can add the filters for it
            if r.source_name in candidate_group:
                filters = table_fulfilled_filters[r.source_name]
                annotated_hop = (
                    filters, r, None
                )  # r becomes left and we insert a None to indicate the end
                last_hop = annotated_jps[-1]
                last_hop.append(annotated_hop)
        return annotated_jps

    def verify_candidate_join_paths(self, annotated_join_paths):
        materializable_join_paths = []
        total_jps = len(annotated_join_paths)
        i = 0
        for annotated_join_path in annotated_join_paths:
            print("Verifying " + str(i) + "/" + str(total_jps),
                  end="",
                  flush=True)
            valid, filters = self.verify_candidate_join_path(
                annotated_join_path)
            i += 1
            if valid:
                materializable_join_paths.append(annotated_join_path)
        return materializable_join_paths

    def verify_candidate_join_path(self, annotated_join_path):
        tree_valid_filters = dict()
        x = 0
        for filters, l, r in annotated_join_path:  # for each hop
            l_path = self.aurum_api.helper.get_path_nid(l.nid)
            tree_for_level = dict()

            # Before checking for filters, translate carrying values into hook attribute in l
            if len(tree_valid_filters) != 0:  # i.e., not first hop
                x_to_remove = set()
                for x, payload in tree_valid_filters.items():
                    carrying_filters, carrying_values = payload
                    if carrying_values[0] is None and carrying_values[
                            1] is None:
                        # tree_valid_filters[x] = (carrying_filters, (None, None))
                        continue  # in this case nothing to hook
                    attr = carrying_values[1]
                    if attr == l.field_name:
                        continue  # no need to translate values to hook in this case
                    hook_values = set()
                    for carrying_value in carrying_values[0]:
                        values = dpu.find_key_for(l_path + "/" + l.source_name,
                                                  l.field_name, attr,
                                                  carrying_value)
                        hook_values.update(values)
                    if len(hook_values) > 0:
                        tree_valid_filters[x] = (carrying_filters,
                                                 (hook_values,
                                                  l.field_name))  # update tree
                    else:  # does this even make sense?
                        x_to_remove.add(x)
                for x in x_to_remove:
                    del tree_valid_filters[x]
                if len(tree_valid_filters.items()) == 0:
                    return False, set()

            if filters is None:
                # This means we are in an intermediate hop with no filters, as it's only connecting
                continue
            if filters is not None:
                # sort filters so cell type come first
                filters = sorted(filters, key=lambda x: x[1].value)
                # pre-filter carrying values
                for info, filter_type, filter_id in filters:
                    if filter_type == FilterType.CELL:
                        attribute = info[1]
                        # this will always be one (?) FIXME: no! only when using the pre-interface
                        cell_value_specified_by_user = info[0]
                        path = l_path + "/" + l.source_name
                        keys_l = dpu.find_key_for(
                            path, l.field_name, attribute,
                            cell_value_specified_by_user)
                        # Check for the first addition
                        if len(tree_valid_filters.items()) == 0:
                            x += 1
                            tree_for_level[x] = ({
                                (info, filter_type, filter_id)
                            }, (set(keys_l), l.field_name))
                        # Now update carrying_values with the first filter
                        for x, payload in tree_valid_filters.items():
                            carrying_filters, carrying_values = payload
                            ix = carrying_values[0].intersection(set(keys_l))
                            if len(ix) > 0:  # if keeps it valid, create branch
                                carrying_filters.add(
                                    (info, filter_type, filter_id))
                                x += 1
                                tree_for_level[x] = (carrying_filters,
                                                     (ix, l.field_name))
                    elif filter_type == FilterType.ATTR:
                        # attr filters work with everyone, so just append
                        # Check for the first addition, TODO: tree_for_level ?
                        if len(tree_for_level.items()) == 0:
                            x += 1
                            tree_for_level[x] = ({(info, filter_type,
                                                   filter_id)}, (None, None))
                        for x, payload in tree_for_level.items():
                            carrying_filters, carrying_values = payload
                            carrying_filters.add(
                                (info, filter_type, filter_id))
                            tree_for_level[x] = (carrying_filters,
                                                 carrying_values)
            # Now filter with r
            if r is not None:  # if none, we processed the last step already, so time to check the tree
                r_path = self.aurum_api.helper.get_path_nid(r.nid)
                x_to_remove = set()
                for x, payload in tree_for_level.items():
                    carrying_filters, carrying_values = payload
                    if carrying_values[0] is None and carrying_values[
                            1] is None:
                        # propagate the None None because all values work
                        tree_for_level[x] = (carrying_filters, (None, None))
                        continue
                    values_to_carry = set()
                    for carrying_value in carrying_values[0]:
                        path = r_path + "/" + r.source_name
                        exists = dpu.is_value_in_column(
                            carrying_value, path, r.field_name)
                        if exists:
                            values_to_carry.add(
                                carrying_value)  # this one checks
                    if len(values_to_carry) > 0:
                        # here we update the tree at the current level
                        tree_for_level[x] = (carrying_filters,
                                             (values_to_carry, r.field_name))
                    else:
                        x_to_remove.add(x)
                # remove if any
                for x in x_to_remove:
                    del tree_for_level[x]  # no more results here, need to prune
                tree_valid_filters = tree_for_level
                if len(tree_valid_filters.items()) == 0:
                    return False, set()  # early stop
        # Check if the join path was valid, also retrieve the number of filters covered by this JP
        if len(tree_valid_filters.items()) > 0:
            for k, v in tree_for_level.items():
                tree_valid_filters[k] = v  # merge trees
            unique_filters = set()
            for k, v in tree_valid_filters.items():
                unique_filters.update(v[0])
            return True, len(unique_filters)
        else:
            return False, set()
Exemplo n.º 12
0
    def __init__(self, network, store_client):
        self.api = API(network=network, store_client=store_client)
Exemplo n.º 13
0
class DoD:
    def __init__(self, network, store_client, csv_separator=","):
        self.aurum_api = API(network=network, store_client=store_client)
        dpu.configure_csv_separator(csv_separator)

    def individual_filters(self, sch_def):
        # Obtain sets that fulfill individual filters
        filter_drs = dict()
        filter_id = 0
        for attr in sch_def.keys():
            drs = self.aurum_api.search_exact_attribute(attr, max_results=200)
            filter_drs[(attr, FilterType.ATTR, filter_id)] = drs
            filter_id += 1

        for cell in sch_def.values():
            drs = self.aurum_api.search_content(cell, max_results=200)
            filter_drs[(cell, FilterType.CELL, filter_id)] = drs
            filter_id += 1
        return filter_drs

    def joint_filters(self, sch_def):
        # Obtain sets that fulfill individual filters
        filter_drs = dict()
        filter_id = 0

        for attr, cell in sch_def.items():
            if cell == "":
                drs = self.aurum_api.search_exact_attribute(attr,
                                                            max_results=50)
                filter_drs[(attr, FilterType.ATTR, filter_id)] = drs
            else:
                drs_attr = self.aurum_api.search_exact_attribute(
                    attr, max_results=50)
                drs_cell = self.aurum_api.search_content(cell, max_results=500)
                drs = self.aurum_api.intersection(drs_attr, drs_cell)
                filter_drs[(cell, FilterType.CELL, filter_id)] = drs
            filter_id += 1
        return filter_drs
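
    # Minimal usage sketch with made-up attribute/value pairs (not from the original code):
    #
    #   sch_def = {"Building Name": "", "Room": "32-123"}
    #   filter_drs = dod.joint_filters(sch_def)
    #   # an empty sample value yields an (attr, FilterType.ATTR, id) key backed by an attribute search,
    #   # while a non-empty value yields a (cell, FilterType.CELL, id) key backed by the intersection of
    #   # the attribute search and the content search.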

    def virtual_schema_iterative_search(self,
                                        list_attributes: [str],
                                        list_samples: [str],
                                        max_hops=2,
                                        debug_enumerate_all_jps=False):
        # Align schema definition and samples
        assert len(list_attributes) == len(list_samples)
        sch_def = {
            attr: value
            for attr, value in zip(list_attributes, list_samples)
        }

        sch_def = OrderedDict(
            sorted(sch_def.items(), key=lambda x: x[0], reverse=True))

        filter_drs = self.joint_filters(sch_def)

        # We now group tables into groups that cover multiple filters.
        # Obtain a list of tables ordered from more to fewer fulfilled filters.
        table_fulfilled_filters = defaultdict(list)
        table_nid = dict()  # collect nids -- used later to obtain an access path to the tables
        for filter, drs in filter_drs.items():
            drs.set_table_mode()
            # All these tables fulfill the filter above
            for table in drs:
                # table_fulfilled_filters[table].append(filter)
                if filter[1] == FilterType.ATTR:
                    columns = [c for c in drs.data]  # copy
                    for c in columns:
                        if c.source_name == table:
                            table_nid[table] = c.nid
                    # if filter not in table_fulfilled_filters[table]:
                    if filter[2] not in [
                            id for _, _, id in table_fulfilled_filters[table]
                    ]:
                        table_fulfilled_filters[table].append(
                            ((filter[0], None), FilterType.ATTR, filter[2]))
                elif filter[1] == FilterType.CELL:
                    columns = [c for c in drs.data]  # copy
                    for c in columns:
                        if c.source_name == table:  # filter in this column
                            table_nid[table] = c.nid
                            # if filter not in table_fulfilled_filters[table]:
                            if filter[2] not in [
                                    id for _, _, id in
                                    table_fulfilled_filters[table]
                            ]:
                                table_fulfilled_filters[table].append(
                                    ((filter[0], c.field_name),
                                     FilterType.CELL, filter[2]))

        table_path = obtain_table_paths(table_nid, self)

        # sort by value length -> number of fulfilled filters
        table_fulfilled_filters = OrderedDict(
            sorted(table_fulfilled_filters.items(),
                   key=lambda el:
                   (len({filter_id
                         for _, _, filter_id in el[1]}), el[0]),
                   reverse=True))  # len of unique filters, then lexico

        # Ordering filters for more determinism
        for k, v in table_fulfilled_filters.items():
            v = sorted(v, key=lambda el: (el[2], el[0][0]),
                       reverse=True)  # sort by id, then filter_name
            table_fulfilled_filters[k] = v

        def eager_candidate_exploration():
            def covers_filters(candidate_filters, all_filters):
                all_filters_set = set([id for _, _, id in filter_drs.keys()])
                candidate_filters_set = set(
                    [id for _, _, id in candidate_filters])
                if len(candidate_filters_set) == len(all_filters_set):
                    return True
                return False

            def compute_size_filter_ix(filters,
                                       candidate_group_filters_covered):
                new_fs_set = set([id for _, _, id in filters])
                candidate_fs_set = set(
                    [id for _, _, id in candidate_group_filters_covered])
                ix_size = len(
                    new_fs_set.union(candidate_fs_set)) - len(candidate_fs_set)
                return ix_size

            def clear_state():
                candidate_group.clear()
                candidate_group_filters_covered.clear()

            # Eagerly obtain groups of tables that cover as many filters as possible
            go_on = True
            while go_on:
                candidate_group = []
                candidate_group_filters_covered = set()
                for i in range(len(list(table_fulfilled_filters.items()))):
                    table_pivot, filters_pivot = list(
                        table_fulfilled_filters.items())[i]
                    # Eagerly add pivot
                    candidate_group.append(table_pivot)
                    candidate_group_filters_covered.update(filters_pivot)
                    # Did it cover all filters?
                    # if len(candidate_group_filters_covered) == len(filter_drs.items()):
                    if covers_filters(candidate_group_filters_covered,
                                      filter_drs.items()):
                        candidate_group = sorted(candidate_group)
                        # print("1: " + str(table_pivot))
                        yield (candidate_group, candidate_group_filters_covered
                               )  # early stop
                        # Cleaning
                        clear_state()
                        continue
                    for j in range(len(list(table_fulfilled_filters.items()))):
                        idx = i + j + 1
                        if idx == len(table_fulfilled_filters.items()):
                            break
                        table, filters = list(
                            table_fulfilled_filters.items())[idx]
                        # new_filters = len(set(filters).union(candidate_group_filters_covered)) - len(candidate_group_filters_covered)
                        new_filters = compute_size_filter_ix(
                            filters, candidate_group_filters_covered)
                        if new_filters > 0:  # add table only if it adds new filters
                            candidate_group.append(table)
                            candidate_group_filters_covered.update(filters)
                            if covers_filters(candidate_group_filters_covered,
                                              filter_drs.items()):
                                # if len(candidate_group_filters_covered) == len(filter_drs.items()):
                                candidate_group = sorted(candidate_group)
                                # print("2: " + str(table_pivot))
                                yield (candidate_group,
                                       candidate_group_filters_covered)
                                clear_state()
                                # Re-add the current pivot, only necessary in this case
                                candidate_group.append(table_pivot)
                                candidate_group_filters_covered.update(
                                    filters_pivot)
                    candidate_group = sorted(candidate_group)
                    # print("3: " + str(table_pivot))
                    yield (candidate_group, candidate_group_filters_covered)
                    # Cleaning
                    clear_state()
                go_on = False  # finished exploring all groups

        # Find ways of joining together each group
        cache_unjoinable_pairs = defaultdict(int)
        for candidate_group, candidate_group_filters_covered in eager_candidate_exploration():
            print("")
            print("Candidate group: " + str(candidate_group))
            num_unique_filters = len(
                {f_id
                 for _, _, f_id in candidate_group_filters_covered})
            print("Covers #Filters: " + str(num_unique_filters))

            if len(candidate_group) == 1:
                table = candidate_group[0]
                path = table_path[table]
                materialized_virtual_schema = dpu.get_dataframe(path + "/" +
                                                                table)
                attrs_to_project = dpu.obtain_attributes_to_project(
                    candidate_group_filters_covered)
                # Create metadata to document this view
                view_metadata = dict()
                view_metadata["#join_graphs"] = 1
                view_metadata["join_graph"] = {
                    "nodes": [{
                        "id": -101010,
                        "label": table
                    }],
                    "edges": []
                }
                yield materialized_virtual_schema, attrs_to_project, view_metadata
                continue  # to go to the next group

            # Pre-check
            # TODO: with a connected components index we can pre-filter many of those groups without checking
            #group_with_all_relations, join_path_groups = self.joinable(candidate_group, cache_unjoinable_pairs)
            # We find the different join graphs that would join the candidate_group
            join_graphs = self.joinable(candidate_group,
                                        cache_unjoinable_pairs,
                                        max_hops=max_hops)
            if debug_enumerate_all_jps:
                for i, group in enumerate(join_graphs):
                    print("Group: " + str(i))
                    for el in group:
                        print(el)
                continue  # We are just interested in all JPs for all candidate groups

            # if no join graphs were found, skip to the next candidate group
            if len(join_graphs) == 0:
                print("Group: " + str(candidate_group) +
                      " is Non-Joinable with max_hops=" + str(max_hops))
                continue

            # Now we need to check every join graph individually and see if it's materializable. Only once we've
            # exhausted these join graphs we move on to the next candidate group. We know already that each of the
            # join graphs covers all tables in candidate_group, so if they're materializable we're good.
            # materializable_join_graphs = []
            for jpg in join_graphs:
                # Obtain filters that apply to this join graph
                filters = set()
                for l, r in jpg:
                    if l.source_name in table_fulfilled_filters:
                        filters.update(table_fulfilled_filters[l.source_name])
                    if r.source_name in table_fulfilled_filters:
                        filters.update(table_fulfilled_filters[r.source_name])

                is_join_graph_valid = self.is_join_graph_materializable(
                    jpg, table_fulfilled_filters)

                if is_join_graph_valid:
                    attrs_to_project = dpu.obtain_attributes_to_project(
                        filters)
                    materialized_virtual_schema = dpu.materialize_join_graph(
                        jpg, self)
                    # Create metadata to document this view
                    view_metadata = dict()
                    view_metadata["#join_graphs"] = len(join_graphs)
                    # view_metadata["join_graph"] = self.format_join_paths_pairhops(jpg)
                    view_metadata["join_graph"] = self.format_join_graph_into_nodes_edges(jpg)
                    yield materialized_virtual_schema, attrs_to_project, view_metadata

            # # FIXME: fixing stitching downstream from here
            #
            # # We have now all the materializable join graphs for this candidate group
            # # We can sort them by how likely they use 'keys'
            # all_jgs_scores = rank_materializable_join_graphs(materializable_join_graphs, table_path, self)
            #
            # # Do some clean up
            # clean_jp = []
            # for annotated_jp, aggr_score, mul_score in all_jgs_scores:
            #     jp = []
            #     filters = set()
            #     for filter, l, r in annotated_jp:
            #         # To drag filters along, there's a leaf special tuple where r may be None
            #         # since we don't need it at this point anymore, we check for its existence and do not include it
            #         if r is not None:
            #             jp.append((l, r))
            #         if filter is not None:
            #             filters.update(filter)
            #     clean_jp.append((filters, jp))
            #
            # for mjp in clean_jp:
            #     attrs_to_project = dpu.obtain_attributes_to_project(mjp)
            #     # materialized_virtual_schema = dpu.materialize_join_path(mjp, self)
            #     materialized_virtual_schema = dpu.materialize_join_graph(mjp, self)
            #     yield materialized_virtual_schema, attrs_to_project

        print("Finished enumerating groups")
        cache_unjoinable_pairs = OrderedDict(
            sorted(cache_unjoinable_pairs.items(),
                   key=lambda x: x[1],
                   reverse=True))
        for k, v in cache_unjoinable_pairs.items():
            print(str(k) + " => " + str(v))

    def compute_join_graph_id(self, join_graph):
        all_nids = []
        for hop_l, hop_r in join_graph:
            all_nids.append(hop_r.nid)
            all_nids.append(hop_l.nid)
        path_id = frozenset(all_nids)
        return path_id
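
    # The frozenset of nids gives an order-insensitive identifier, so two join graphs that traverse the
    # same columns through hops listed in a different order collapse to the same id. Sketch with made-up nids:
    #
    #   frozenset([1, 2, 3]) == frozenset([3, 2, 1])  # True -> a single entry in deduplicated_paths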

    def joinable(self,
                 group_tables: [str],
                 cache_unjoinable_pairs: defaultdict(int),
                 max_hops=2):
        """
        Find all join graphs that connect the tables in group_tables. This boils down to checking
        whether there is (at least) one set of join paths that connects all the tables; however, we need to
        find all possible join graphs, not only one.
        :param group_tables:
        :param cache_unjoinable_pairs: this set contains pairs of tables that do not join with each other
        :param max_hops: maximum number of hops allowed in a join path between a pair of tables
        :return:
        """
        assert len(group_tables) > 1

        # if max_hops is not at least the size of group_tables, there won't be unique jps covering all tables.
        # that may not be good though

        # for each pair of tables in group keep list of (path, tables_covered)
        paths_per_pair = defaultdict(list)

        for table1, table2 in itertools.combinations(group_tables, 2):
            # Check if tables are already known to be unjoinable
            if (table1, table2) in cache_unjoinable_pairs.keys() or (
                    table2, table1) in cache_unjoinable_pairs.keys():
                continue
            t1 = self.aurum_api.make_drs(table1)
            t2 = self.aurum_api.make_drs(table2)
            t1.set_table_mode()
            t2.set_table_mode()
            drs = self.aurum_api.paths(t1,
                                       t2,
                                       Relation.PKFK,
                                       max_hops=max_hops)
            paths = drs.paths()  # list of lists
            # If we didn't find paths, update unjoinable_pairs cache with this pair
            if len(paths) == 0:  # then store this info, these tables do not join
                cache_unjoinable_pairs[(table1, table2)] += 1
                cache_unjoinable_pairs[(table2, table1)] += 1
            for p in paths:
                tables_covered = set()
                tables_in_group = set(group_tables)
                for hop in p:
                    if hop.source_name in tables_in_group:
                        # this is a table covered by this path
                        tables_covered.add(hop.source_name)
                paths_per_pair[(table1, table2)].append((p, tables_covered))

        if len(paths_per_pair) == 0:
            return []

        # enumerate all possible join graphs
        all_combinations = list(itertools.product(*paths_per_pair.values()))
        deduplicated_paths = dict()
        # Add combinations
        for path_combination in all_combinations:
            # TODO: is this general if max_hops > 2?
            for p1, p2 in itertools.combinations(path_combination, 2):
                path1, tables_covered1 = p1
                path2, tables_covered2 = p2
                # does combining these two paths help to cover more tables?
                if len(tables_covered1) > len(tables_covered2):
                    current_cover_len = len(tables_covered1)
                else:
                    current_cover_len = len(tables_covered2)
                potential_cover = tables_covered1.union(tables_covered2)
                joinable_paths = tables_covered1.intersection(tables_covered2)
                potential_cover_len = len(potential_cover)
                # if we cover more tables, and the paths are joinable (at least one table in common)
                if potential_cover_len > current_cover_len and len(
                        joinable_paths) > 0:
                    # Transform paths into pair-hops so we can join them together
                    tx_path1 = self.transform_join_path_to_pair_hop(path1)
                    tx_path2 = self.transform_join_path_to_pair_hop(path2)
                    # combine the paths
                    combined_path = tx_path1 + tx_path2
                    path_id = self.compute_join_graph_id(combined_path)
                    # If I haven't generated this path elsewhere, then I add it along with the tables it covers
                    if path_id not in deduplicated_paths:
                        deduplicated_paths[path_id] = (combined_path,
                                                       potential_cover)

        # Add initial (single) paths that already cover all tables on their own
        for p, tables_covered in list(paths_per_pair.values())[0]:
            if len(tables_covered) == len(group_tables):
                tx_p = self.transform_join_path_to_pair_hop(p)
                path_id = self.compute_join_graph_id(tx_p)
                deduplicated_paths[path_id] = (tx_p, tables_covered)

        # Now we filter out all paths that do not cover all the tables in the group
        covering_join_graphs = [
            jg[0] for _, jg in deduplicated_paths.items()
            if len(jg[1]) == len(group_tables)
        ]

        # Finally sort by number of required joins
        join_graphs = sorted(covering_join_graphs, key=lambda x: len(x))
        return join_graphs
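
    # Rough usage sketch (mirrors virtual_schema_iterative_search above): each returned join graph is a
    # list of (Hit, Hit) pair-hops, sorted so that graphs needing fewer joins come first.
    #
    #   join_graphs = dod.joinable(candidate_group, defaultdict(int), max_hops=2)
    #   for jg in join_graphs:
    #       if dod.is_join_graph_materializable(jg, table_fulfilled_filters):
    #           view = dpu.materialize_join_graph(jg, dod)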

    def format_join_paths_pairhops(self, join_paths):
        """
        Transform this into something readable
        :param join_paths: [(hit, hit)]
        :return:
        """
        formatted_jps = []
        for jp in join_paths:
            formatted_jp = ""
            for hop in jp:
                hop_str = hop.db_name + ":" + hop.source_name + ":" + hop.field_name
                if formatted_jp == "":
                    formatted_jp += hop_str
                else:
                    formatted_jp += " -> " + hop_str
            formatted_jps.append(formatted_jp)
        return formatted_jps

    def format_join_graph_into_nodes_edges(self, join_graph):
        nodes = dict()
        edges = []
        for jp in join_graph:
            # Add nodes
            for hop in jp:
                label = hop.db_name + ":" + hop.source_name
                node_descr = {
                    "id": hash(label),
                    "label": label
                }  # cannot use nid cause that's for cols not rels
                node_id = hash(label)
                if node_id not in nodes:
                    nodes[node_id] = node_descr
            l, r = jp
            l_label = l.db_name + ":" + l.source_name
            r_label = r.db_name + ":" + r.source_name
            edge_descr = {"from": hash(l_label), "to": hash(r_label)}
            edges.append(edge_descr)
        return {"nodes": list(nodes.values()), "edges": list(edges)}

    def transform_join_path_to_pair_hop(self, join_path):
        jp_hops = []
        pair = []
        for hop in join_path:
            pair.append(hop)
            if len(pair) == 2:
                jp_hops.append(tuple(pair))
                pair.clear()
                pair.append(hop)
        # Now remove pairs with pointers within same relation, as we don't need to join these
        jp_hops = [(l, r) for l, r in jp_hops
                   if l.source_name != r.source_name]
        return jp_hops
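
    # Worked example (hypothetical hits): a path A.x -> B.x -> B.y -> C.y first becomes the pair-hops
    # [(A.x, B.x), (B.x, B.y), (B.y, C.y)]; the same-relation pair (B.x, B.y) is then dropped, leaving
    # [(A.x, B.x), (B.y, C.y)] -- exactly the joins that must be executed to materialize the path.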

    def transform_join_paths_to_pair_hops(self, join_paths):
        """
        Transform each join path into a list of (l, r) hop pairs via transform_join_path_to_pair_hop.
        :param join_paths:
        :return:
        """
        join_paths_hops = []
        for jp in join_paths:
            jp_hops = self.transform_join_path_to_pair_hop(jp)
            join_paths_hops.append(jp_hops)
        return join_paths_hops

    def annotate_join_graph_with_filters(self, join_graph,
                                         table_fulfilled_filters,
                                         candidate_group):
        filters = set()
        for l, r in join_graph:
            filters.update(table_fulfilled_filters[l.source_name])
            filters.update(table_fulfilled_filters[r.source_name])
        return list(filters)

        # # FIXME: BROKEN: not annotating all filters; if a table falls on the 'r' side it is not annotated
        # annotated_join_graph = []
        # r = None  # memory for last hop
        # for l, r in join_graph:
        #     # each filter is a (attr, filter-type)
        #     # Check if l side is a table in the group or just an intermediary
        #     if l.source_name in candidate_group:  # it's a table in group, so retrieve filters
        #         filters = table_fulfilled_filters[l.source_name]
        #     else:
        #         filters = None  # indicating no need to check filters for intermediary node
        #     annotated_hop = (filters, l, r)
        #     annotated_join_graph.append(annotated_hop)
        # # Finally we must check if the very last table was also part of the jp, so we can add the filters for it
        # if r.source_name in candidate_group:
        #     filters = table_fulfilled_filters[r.source_name]
        #     annotated_hop = (filters, r, None)  # r becomes left and we insert a None to indicate the end
        #     annotated_join_graph.append(annotated_hop)
        # return annotated_join_graph

    def _annotate_join_graph_with_filters(self, join_graph,
                                          table_fulfilled_filters,
                                          candidate_group):
        # FIXME: does this keep *all* relevant filters?
        # FIXME: this would be much simpler if we just attached the filters to the tables; that would only work
        # FIXME: assuming that we are receiving all paths in order
        annotated_jps = []
        l = None  # memory for last hop
        r = None
        for jp in join_graph:
            # For each hop
            annotated_jp = []
            for l, r in jp:
                # each filter is a (attr, filter-type)
                # Check if l side is a table in the group or just an intermediary
                if l.source_name in candidate_group:  # it's a table in group, so retrieve filters
                    filters = table_fulfilled_filters[l.source_name]
                else:
                    filters = None  # indicating no need to check filters for intermediary node
                annotated_hop = (filters, l, r)
                annotated_jp.append(annotated_hop)
            annotated_jps.append(annotated_jp)
            # Finally we must check if the very last table was also part of the jp, so we can add the filters for it
            if r.source_name in candidate_group:
                filters = table_fulfilled_filters[r.source_name]
                annotated_hop = (
                    filters, r, None
                )  # r becomes left and we insert a None to indicate the end
                last_hop = annotated_jps[-1]
                last_hop.append(annotated_hop)
        return annotated_jps

    def verify_candidate_join_paths(self, annotated_join_paths):
        materializable_join_paths = []
        total_jps = len(annotated_join_paths)
        i = 0
        for annotated_join_path in annotated_join_paths:
            print("Verifying " + str(i) + "/" + str(total_jps),
                  end="",
                  flush=True)
            valid, filters = self.verify_candidate_join_path(
                annotated_join_path)
            i += 1
            if valid:
                materializable_join_paths.append(annotated_join_path)
        return materializable_join_paths

    def is_join_graph_materializable(self, join_graph,
                                     table_fulfilled_filters):

        local_intermediates = dict()

        for l, r in join_graph:
            # apply filters to l
            if l.source_name not in local_intermediates:
                l_path = self.aurum_api.helper.get_path_nid(l.nid)
                # If there are filters apply them
                if l.source_name in table_fulfilled_filters:
                    filters_l = table_fulfilled_filters[l.source_name]
                    filtered_l = None
                    for info, filter_type, filter_id in filters_l:
                        if filter_type == FilterType.ATTR:
                            filtered_l = dpu.read_relation(l_path +
                                                           l.source_name)
                            continue  # no need to filter anything if the filter is only attribute type
                        attribute = info[1]
                        cell_value = info[0]
                        filtered_l = dpu.apply_filter(l_path + l.source_name,
                                                      attribute, cell_value)

                    if len(filtered_l) == 0:
                        return False  # filter does not leave any data => non-joinable
                # If there are not filters, then do not apply them
                else:
                    filtered_l = dpu.read_relation(l_path + l.source_name)
            else:
                filtered_l = local_intermediates[l.source_name]
            local_intermediates[l.source_name] = filtered_l

            # apply filters to r
            if r.source_name not in local_intermediates:
                r_path = self.aurum_api.helper.get_path_nid(r.nid)
                # If there are filters apply them
                if r.source_name in table_fulfilled_filters:
                    filters_r = table_fulfilled_filters[r.source_name]
                    filtered_r = None
                    for info, filter_type, filter_id in filters_r:
                        if filter_type == FilterType.ATTR:
                            filtered_r = dpu.read_relation(r_path +
                                                           r.source_name)
                            continue  # no need to filter anything if the filter is only attribute type
                        attribute = info[1]
                        cell_value = info[0]
                        filtered_r = dpu.apply_filter(r_path + r.source_name,
                                                      attribute, cell_value)

                    if len(filtered_r) == 0:
                        return False  # filter does not leave any data => non-joinable
                # If there are not filters, then do not apply them
                else:
                    filtered_r = dpu.read_relation(r_path + r.source_name)
            else:
                filtered_r = local_intermediates[r.source_name]
            local_intermediates[r.source_name] = filtered_r

            # check if the materialized version join's cardinality > 0
            joined = dpu.join_ab_on_key(filtered_l,
                                        filtered_r,
                                        l.field_name,
                                        r.field_name,
                                        suffix_str="_x")

            if len(joined) == 0:
                return False  # non-joinable hop enough to discard join graph
        # if we make it through all hops, then the join graph is materializable (i.e., verified)
        return True
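
    # Note on the expected filter structure (inferred from the calling code above): table_fulfilled_filters
    # maps a table name to entries of the form ((value, attribute), FilterType.CELL, filter_id) or
    # ((attribute, None), FilterType.ATTR, filter_id). CELL filters are applied with dpu.apply_filter,
    # ATTR filters simply load the relation, and an empty join at any hop discards the whole join graph.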

    def _is_join_graph_materializable(self, annotated_join_paths):
        for idx, annotated_join_path in enumerate(annotated_join_paths):
            valid, filters = self.verify_candidate_join_path(
                annotated_join_path)
            if not valid:
                return False
        return True

    def verify_candidate_join_path(self, annotated_join_path):
        """
        TODO: what about materializing views of the tables that are filtered by the selection predicates?
        """
        tree_valid_filters = dict()
        x = 0
        for filters, l, r in annotated_join_path:  # for each hop
            l_path = self.aurum_api.helper.get_path_nid(l.nid)
            tree_for_level = dict()

            # Before checking for filters, translate carrying values into hook attribute in l
            if len(tree_valid_filters) != 0:  # i.e., not first hop
                x_to_remove = set()
                for x, payload in tree_valid_filters.items():
                    carrying_filters, carrying_values = payload
                    if carrying_values[0] is None and carrying_values[
                            1] is None:
                        # tree_valid_filters[x] = (carrying_filters, (None, None))
                        continue  # in this case nothing to hook
                    attr = carrying_values[1]
                    if attr == l.field_name:
                        continue  # no need to translate values to hook in this case
                    hook_values = set()
                    for carrying_value in carrying_values[0]:
                        values = dpu.find_key_for(l_path + "/" + l.source_name,
                                                  l.field_name, attr,
                                                  carrying_value)
                        hook_values.update(values)
                    if len(hook_values) > 0:
                        # FIXME: EXPERIMENTAL HERE!!
                        tree_valid_filters[x] = (carrying_filters,
                                                 (hook_values,
                                                  l.field_name))  # update tree
                    else:  # does this even make sense?
                        x_to_remove.add(x)
                for x in x_to_remove:
                    del tree_valid_filters[x]
                if len(tree_valid_filters.items()) == 0:
                    return False, set()

            if filters is None:
                # This means we are in an intermediate hop with no filters, as it's only connecting
                # we still need to hook the carrying values onto the connecting attribute
                r_path = self.aurum_api.helper.get_path_nid(r.nid)
                x_to_remove = set()
                for x, payload in tree_valid_filters.items():
                    carrying_filters, carrying_values = payload
                    if carrying_values[0] is None and carrying_values[
                            1] is None:
                        # propagate the None None because all values work
                        tree_for_level[x] = (carrying_filters, (None, None))
                        continue
                    values_to_carry = set()
                    # attr = carrying_values[1]
                    for carrying_value in carrying_values[0]:
                        path = r_path + "/" + r.source_name
                        exists = dpu.is_value_in_column(
                            carrying_value, path, r.field_name)
                        if exists:
                            values_to_carry.add(
                                carrying_value)  # this one checks
                    if len(values_to_carry) > 0:
                        # here we update the tree at the current level
                        tree_for_level[x] = (carrying_filters,
                                             (values_to_carry, r.field_name))
                    else:
                        return False, set()  # non joinable, stop trying
                        # x_to_remove.add(x)
                # remove if any
                for x in x_to_remove:
                    del tree_for_level[x]  # no more results here, need to prune
                tree_valid_filters = tree_for_level
                if len(tree_valid_filters.items()) == 0:
                    return False, set()  # early stop
                continue
            if filters is not None:
                # sort filters so cell type come first
                filters = sorted(filters, key=lambda x: x[1].value)
                # pre-filter carrying values
                for info, filter_type, filter_id in filters:
                    if filter_type == FilterType.CELL:
                        attribute = info[1]
                        # this will always be one (?) FIXME: no! only when using the pre-interface
                        cell_value_specified_by_user = info[0]
                        path = l_path + "/" + l.source_name
                        keys_l = dpu.find_key_for(
                            path, l.field_name, attribute,
                            cell_value_specified_by_user)
                        # Check for the first addition
                        if len(tree_valid_filters.items()) == 0:
                            x += 1
                            tree_for_level[x] = ({
                                (info, filter_type, filter_id)
                            }, (set(keys_l), l.field_name))
                        # Now update carrying_values with the first filter
                        for x, payload in tree_valid_filters.items():
                            carrying_filters, carrying_values = payload
                            ix = carrying_values[0].intersection(set(keys_l))
                            if len(ix) > 0:  # if keeps it valid, create branch
                                carrying_filters.add(
                                    (info, filter_type, filter_id))
                                x += 1
                                tree_for_level[x] = (carrying_filters,
                                                     (ix, l.field_name))
                    elif filter_type == FilterType.ATTR:
                        # attr filters work with everyone, so just append
                        # Check for the first addition, TODO: tree_for_level ?
                        if len(tree_for_level.items()) == 0:
                            x += 1
                            tree_for_level[x] = ({(info, filter_type,
                                                   filter_id)}, (None, None))
                        for x, payload in tree_for_level.items():
                            carrying_filters, carrying_values = payload
                            carrying_filters.add(
                                (info, filter_type, filter_id))
                            tree_for_level[x] = (carrying_filters,
                                                 carrying_values)
            # Now filter with r
            if r is not None:  # if none, we processed the last step already, so time to check the tree
                r_path = self.aurum_api.helper.get_path_nid(r.nid)
                x_to_remove = set()
                for x, payload in tree_for_level.items():
                    carrying_filters, carrying_values = payload
                    if carrying_values[0] is None and carrying_values[
                            1] is None:
                        # propagate the None None because all values work
                        tree_for_level[x] = (carrying_filters, (None, None))
                        continue
                    values_to_carry = set()
                    for carrying_value in carrying_values[0]:
                        path = r_path + "/" + r.source_name
                        exists = dpu.is_value_in_column(
                            carrying_value, path, r.field_name)
                        if exists:
                            values_to_carry.add(
                                carrying_value)  # this one checks
                    if len(values_to_carry) > 0:
                        # here we update the tree at the current level
                        tree_for_level[x] = (carrying_filters,
                                             (values_to_carry, r.field_name))
                    else:
                        x_to_remove.add(x)
                # remove if any
                for x in x_to_remove:
                    del tree_for_level[x]  # no more results here, need to prune
                tree_valid_filters = tree_for_level
                if len(tree_valid_filters.items()) == 0:
                    return False, set()  # early stop
        # Check if the join path was valid, also retrieve the number of filters covered by this JP
        if len(tree_valid_filters.items()) > 0:
            for k, v in tree_for_level.items():
                tree_valid_filters[k] = v  # merge trees
            unique_filters = set()
            for k, v in tree_valid_filters.items():
                unique_filters.update(v[0])
            return True, len(unique_filters)
        else:
            return False, set()