def test_dpu(dod):
    # Replay join graphs previously pickled by virtual_schema_iterative_search
    with open("check_debug.pkl", 'rb') as f:
        clean_jp = pickle.load(f)
    for mjp in clean_jp:
        attrs_to_project = dpu.obtain_attributes_to_project(mjp)
        # materialized_virtual_schema = dpu.materialize_join_path(mjp, dod)
        materialized_virtual_schema = dpu.materialize_join_graph(mjp, dod)
        yield materialized_virtual_schema, attrs_to_project
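# Usage sketch (assumption: illustrative only, not part of the original code).
# test_dpu replays the join graphs that the second variant of
# virtual_schema_iterative_search below dumps into check_debug.pkl, which is
# useful for debugging materialization in isolation. `dod` is assumed to be an
# initialized engine instance exposing the API that materialize_join_graph
# expects; module-level imports of pickle and dpu are assumed to exist above.
#
#   for view_df, attrs_to_project in test_dpu(dod):
#       print(view_df.head())    # inspect each re-materialized view
#       print(attrs_to_project)  # attributes that would be projected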
def materialize_join_graphs(self, materializable_join_graphs):
    to_return = []
    for mjg, filters in materializable_join_graphs:
        # if is_join_graph_valid:
        attrs_to_project = dpu.obtain_attributes_to_project(filters)
        # continue  # test
        materialized_virtual_schema = dpu.materialize_join_graph_sample(
            mjg, self, sample_size=1000)
        # materialized_virtual_schema = dpu.materialize_join_graph(mjg, self)
        if materialized_virtual_schema is False:
            continue  # happens when the join was an outlier
        # Create metadata to document this view
        view_metadata = dict()
        view_metadata["#join_graphs"] = len(materializable_join_graphs)
        # view_metadata["join_graph"] = self.format_join_paths_pairhops(jpg)
        view_metadata["join_graph"] = self.format_join_graph_into_nodes_edges(mjg)
        to_return.append(
            (materialized_virtual_schema, attrs_to_project, view_metadata))
        # yield materialized_virtual_schema, attrs_to_project, view_metadata
    return to_return
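# Input-shape sketch (assumption: illustrative only). Each entry pairs a
# materializable join graph with the set of filters it satisfies, as
# accumulated by the caller (see the perf-instrumented search variant below):
#
#   materializable_join_graphs = [(jpg_1, filters_1), (jpg_2, filters_2)]
#   views = dod.materialize_join_graphs(materializable_join_graphs)
#   for view_df, attrs_to_project, view_metadata in views:
#       ...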
def virtual_schema_iterative_search(self,
                                    list_attributes: [str],
                                    list_samples: [str],
                                    max_hops=2,
                                    debug_enumerate_all_jps=False):
    # Align schema definition and samples
    assert len(list_attributes) == len(list_samples)
    sch_def = {
        attr: value
        for attr, value in zip(list_attributes, list_samples)
    }
    sch_def = OrderedDict(
        sorted(sch_def.items(), key=lambda x: x[0], reverse=True))
    filter_drs = self.joint_filters(sch_def)

    # Group tables into groups that fulfill multiple filters.
    # Obtain the list of tables ordered from more to fewer filters.
    table_fulfilled_filters = defaultdict(list)
    table_nid = dict()  # collect nids -- used later to obtain an access path to the tables
    for filter, drs in filter_drs.items():
        drs.set_table_mode()
        # All these tables fulfill the filter above
        for table in drs:
            # table_fulfilled_filters[table].append(filter)
            if filter[1] == FilterType.ATTR:
                columns = [c for c in drs.data]  # copy
                for c in columns:
                    if c.source_name == table:
                        table_nid[table] = c.nid
                # if filter not in table_fulfilled_filters[table]:
                if filter[2] not in [
                        id for _, _, id in table_fulfilled_filters[table]
                ]:
                    table_fulfilled_filters[table].append(
                        ((filter[0], None), FilterType.ATTR, filter[2]))
            elif filter[1] == FilterType.CELL:
                columns = [c for c in drs.data]  # copy
                for c in columns:
                    if c.source_name == table:  # filter in this column
                        table_nid[table] = c.nid
                        # if filter not in table_fulfilled_filters[table]:
                        if filter[2] not in [
                                id for _, _, id in table_fulfilled_filters[table]
                        ]:
                            table_fulfilled_filters[table].append(
                                ((filter[0], c.field_name), FilterType.CELL,
                                 filter[2]))
    table_path = obtain_table_paths(table_nid, self)

    # Sort by number of unique fulfilled filters (desc), then lexicographically
    table_fulfilled_filters = OrderedDict(
        sorted(table_fulfilled_filters.items(),
               key=lambda el:
               (len({filter_id for _, _, filter_id in el[1]}), el[0]),
               reverse=True))

    # Order filters for more determinism
    for k, v in table_fulfilled_filters.items():
        v = sorted(v, key=lambda el: (el[2], el[0][0]),
                   reverse=True)  # sort by id, then filter name
        table_fulfilled_filters[k] = v

    def eager_candidate_exploration():
        def covers_filters(candidate_filters, all_filters):
            # note: all_filters is unused; coverage is computed against filter_drs
            all_filters_set = set([id for _, _, id in filter_drs.keys()])
            candidate_filters_set = set(
                [id for _, _, id in candidate_filters])
            if len(candidate_filters_set) == len(all_filters_set):
                return True
            return False

        def compute_size_filter_ix(filters, candidate_group_filters_covered):
            new_fs_set = set([id for _, _, id in filters])
            candidate_fs_set = set(
                [id for _, _, id in candidate_group_filters_covered])
            ix_size = len(
                new_fs_set.union(candidate_fs_set)) - len(candidate_fs_set)
            return ix_size

        def clear_state():
            candidate_group.clear()
            candidate_group_filters_covered.clear()

        # Eagerly obtain groups of tables that cover as many filters as possible
        backup = []
        go_on = True
        while go_on:
            candidate_group = []
            candidate_group_filters_covered = set()
            for i in range(len(list(table_fulfilled_filters.items()))):
                table_pivot, filters_pivot = list(
                    table_fulfilled_filters.items())[i]
                # Eagerly add pivot
                candidate_group.append(table_pivot)
                candidate_group_filters_covered.update(filters_pivot)
                # Did it cover all filters?
                # if len(candidate_group_filters_covered) == len(filter_drs.items()):
                if covers_filters(candidate_group_filters_covered,
                                  filter_drs.items()):
                    candidate_group = sorted(candidate_group)
                    # print("1: " + str(table_pivot))
                    yield (candidate_group,
                           candidate_group_filters_covered)  # early stop
                    # Cleaning
                    clear_state()
                    continue
                for j in range(len(list(table_fulfilled_filters.items()))):
                    idx = i + j + 1
                    if idx == len(table_fulfilled_filters.items()):
                        break
                    table, filters = list(
                        table_fulfilled_filters.items())[idx]
                    # new_filters = len(set(filters).union(candidate_group_filters_covered)) - len(candidate_group_filters_covered)
                    new_filters = compute_size_filter_ix(
                        filters, candidate_group_filters_covered)
                    if new_filters > 0:  # add table only if it adds new filters
                        candidate_group.append(table)
                        candidate_group_filters_covered.update(filters)
                    if covers_filters(candidate_group_filters_covered,
                                      filter_drs.items()):
                        # if len(candidate_group_filters_covered) == len(filter_drs.items()):
                        candidate_group = sorted(candidate_group)
                        # print("2: " + str(table_pivot))
                        yield (candidate_group,
                               candidate_group_filters_covered)
                        clear_state()
                        # Re-add the current pivot, only necessary in this case
                        candidate_group.append(table_pivot)
                        candidate_group_filters_covered.update(filters_pivot)
                candidate_group = sorted(candidate_group)
                # print("3: " + str(table_pivot))
                if covers_filters(candidate_group_filters_covered,
                                  filter_drs.items()):
                    yield (candidate_group, candidate_group_filters_covered)
                else:
                    backup.append(
                        ([el for el in candidate_group],
                         set([el for el in candidate_group_filters_covered])))
                # Cleaning
                clear_state()
            # Before exiting, return the backup groups in case they are useful
            for candidate_group, candidate_group_filters_covered in backup:
                yield (candidate_group, candidate_group_filters_covered)
            go_on = False  # finished exploring all groups

    # Find ways of joining together each group
    cache_unjoinable_pairs = defaultdict(int)
    for candidate_group, candidate_group_filters_covered in eager_candidate_exploration():
        print("")
        print("Candidate group: " + str(candidate_group))
        num_unique_filters = len(
            {f_id for _, _, f_id in candidate_group_filters_covered})
        print("Covers #Filters: " + str(num_unique_filters))

        if len(candidate_group) == 1:
            table = candidate_group[0]
            path = table_path[table]
            materialized_virtual_schema = dpu.get_dataframe(path + "/" + table)
            attrs_to_project = dpu.obtain_attributes_to_project(
                candidate_group_filters_covered)
            # Create metadata to document this view
            view_metadata = dict()
            view_metadata["#join_graphs"] = 1
            view_metadata["join_graph"] = {
                "nodes": [{
                    "id": -101010,
                    "label": table
                }],
                "edges": []
            }
            yield materialized_virtual_schema, attrs_to_project, view_metadata
            continue  # go to the next group

        # Pre-check
        # TODO: with a connected-components index we could pre-filter many of these groups without checking
        # group_with_all_relations, join_path_groups = self.joinable(candidate_group, cache_unjoinable_pairs)
        # Find the different join graphs that would join the candidate_group
        join_graphs = self.joinable(candidate_group,
                                    cache_unjoinable_pairs,
                                    max_hops=max_hops)
        if debug_enumerate_all_jps:
            for i, group in enumerate(join_graphs):
                print("Group: " + str(i))
                for el in group:
                    print(el)
            continue  # we are just interested in all JPs for all candidate groups

        # If there are no join graphs, skip to the next group
        if len(join_graphs) == 0:
            print("Group: " + str(candidate_group) +
                  " is Non-Joinable with max_hops=" + str(max_hops))
            continue

        # Now we check every join graph individually to see if it is materializable. Only once
        # we have exhausted these join graphs do we move on to the next candidate group. We
        # already know that each join graph covers all tables in candidate_group, so if it is
        # materializable we are good.
        # materializable_join_graphs = []
        for jpg in join_graphs:
            # Obtain filters that apply to this join graph
            filters = set()
            for l, r in jpg:
                if l.source_name in table_fulfilled_filters:
                    filters.update(table_fulfilled_filters[l.source_name])
                if r.source_name in table_fulfilled_filters:
                    filters.update(table_fulfilled_filters[r.source_name])
            # TODO: obtain join_graph score for diff metrics; useful for ranking later
            # rank_materializable_join_graphs(materializable_join_paths, table_path, dod)
            is_join_graph_valid = self.is_join_graph_materializable(
                jpg, table_fulfilled_filters)
            if is_join_graph_valid:
                attrs_to_project = dpu.obtain_attributes_to_project(filters)
                materialized_virtual_schema = dpu.materialize_join_graph(
                    jpg, self)
                # Create metadata to document this view
                view_metadata = dict()
                view_metadata["#join_graphs"] = len(join_graphs)
                # view_metadata["join_graph"] = self.format_join_paths_pairhops(jpg)
                view_metadata["join_graph"] = \
                    self.format_join_graph_into_nodes_edges(jpg)
                yield materialized_virtual_schema, attrs_to_project, view_metadata

    print("Finished enumerating groups")
    cache_unjoinable_pairs = OrderedDict(
        sorted(cache_unjoinable_pairs.items(),
               key=lambda x: x[1],
               reverse=True))
    for k, v in cache_unjoinable_pairs.items():
        print(str(k) + " => " + str(v))
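# Usage sketch (assumption: illustrative only; `dod` is an initialized
# instance of the class defining virtual_schema_iterative_search, and the
# attribute/sample values are made up). The search is a generator, so views
# stream out as soon as they are materialized:
#
#   attrs = ["school_name", "zip"]
#   samples = ["Cambridge High", ""]
#   for view, attrs_to_project, metadata in \
#           dod.virtual_schema_iterative_search(attrs, samples, max_hops=2):
#       print(metadata["#join_graphs"], metadata["join_graph"])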
def virtual_schema_iterative_search(self,
                                    list_attributes: [str],
                                    list_samples: [str],
                                    debug_enumerate_all_jps=False):
    # Align schema definition and samples
    assert len(list_attributes) == len(list_samples)
    sch_def = {
        attr: value
        for attr, value in zip(list_attributes, list_samples)
    }
    sch_def = OrderedDict(
        sorted(sch_def.items(), key=lambda x: x[0], reverse=True))
    filter_drs = self.joint_filters(sch_def)

    # Group tables into groups that fulfill multiple filters.
    # Obtain the list of tables ordered from more to fewer filters.
    table_fulfilled_filters = defaultdict(list)
    table_nid = dict()  # collect nids -- used later to obtain an access path to the tables
    for filter, drs in filter_drs.items():
        drs.set_table_mode()
        # All these tables fulfill the filter above
        for table in drs:
            # table_fulfilled_filters[table].append(filter)
            if filter[1] == FilterType.ATTR:
                columns = [c for c in drs.data]  # copy
                for c in columns:
                    if c.source_name == table:
                        table_nid[table] = c.nid
                # if filter not in table_fulfilled_filters[table]:
                if filter[2] not in [
                        id for _, _, id in table_fulfilled_filters[table]
                ]:
                    table_fulfilled_filters[table].append(
                        ((filter[0], None), FilterType.ATTR, filter[2]))
            elif filter[1] == FilterType.CELL:
                columns = [c for c in drs.data]  # copy
                for c in columns:
                    if c.source_name == table:  # filter in this column
                        table_nid[table] = c.nid
                        # if filter not in table_fulfilled_filters[table]:
                        if filter[2] not in [
                                id for _, _, id in table_fulfilled_filters[table]
                        ]:
                            table_fulfilled_filters[table].append(
                                ((filter[0], c.field_name), FilterType.CELL,
                                 filter[2]))
    table_path = obtain_table_paths(table_nid, self)

    # Sort by number of unique fulfilled filters (desc), then lexicographically
    table_fulfilled_filters = OrderedDict(
        sorted(table_fulfilled_filters.items(),
               key=lambda el:
               (len({filter_id for _, _, filter_id in el[1]}), el[0]),
               reverse=True))

    # Order filters for more determinism
    for k, v in table_fulfilled_filters.items():
        v = sorted(v, key=lambda el: (el[2], el[0][0]),
                   reverse=True)  # sort by id, then filter name
        table_fulfilled_filters[k] = v

    def eager_candidate_exploration():
        def covers_filters(candidate_filters, all_filters):
            # note: all_filters is unused; coverage is computed against filter_drs
            all_filters_set = set([id for _, _, id in filter_drs.keys()])
            candidate_filters_set = set(
                [id for _, _, id in candidate_filters])
            if len(candidate_filters_set) == len(all_filters_set):
                return True
            return False

        def compute_size_filter_ix(filters, candidate_group_filters_covered):
            new_fs_set = set([id for _, _, id in filters])
            candidate_fs_set = set(
                [id for _, _, id in candidate_group_filters_covered])
            ix_size = len(
                new_fs_set.union(candidate_fs_set)) - len(candidate_fs_set)
            return ix_size

        def clear_state():
            candidate_group.clear()
            candidate_group_filters_covered.clear()

        # Eagerly obtain groups of tables that cover as many filters as possible
        go_on = True
        while go_on:
            candidate_group = []
            candidate_group_filters_covered = set()
            for i in range(len(list(table_fulfilled_filters.items()))):
                table_pivot, filters_pivot = list(
                    table_fulfilled_filters.items())[i]
                # Eagerly add pivot
                candidate_group.append(table_pivot)
                candidate_group_filters_covered.update(filters_pivot)
                # Did it cover all filters?
                # if len(candidate_group_filters_covered) == len(filter_drs.items()):
                if covers_filters(candidate_group_filters_covered,
                                  filter_drs.items()):
                    candidate_group = sorted(candidate_group)
                    # print("1: " + str(table_pivot))
                    yield (candidate_group,
                           candidate_group_filters_covered)  # early stop
                    # Cleaning
                    clear_state()
                    continue
                for j in range(len(list(table_fulfilled_filters.items()))):
                    idx = i + j + 1
                    if idx == len(table_fulfilled_filters.items()):
                        break
                    table, filters = list(
                        table_fulfilled_filters.items())[idx]
                    # new_filters = len(set(filters).union(candidate_group_filters_covered)) - len(candidate_group_filters_covered)
                    new_filters = compute_size_filter_ix(
                        filters, candidate_group_filters_covered)
                    if new_filters > 0:  # add table only if it adds new filters
                        candidate_group.append(table)
                        candidate_group_filters_covered.update(filters)
                    if covers_filters(candidate_group_filters_covered,
                                      filter_drs.items()):
                        # if len(candidate_group_filters_covered) == len(filter_drs.items()):
                        candidate_group = sorted(candidate_group)
                        # print("2: " + str(table_pivot))
                        yield (candidate_group,
                               candidate_group_filters_covered)
                        clear_state()
                        # Re-add the current pivot, only necessary in this case
                        candidate_group.append(table_pivot)
                        candidate_group_filters_covered.update(filters_pivot)
                candidate_group = sorted(candidate_group)
                # print("3: " + str(table_pivot))
                yield (candidate_group, candidate_group_filters_covered)
                # Cleaning
                clear_state()
            go_on = False  # finished exploring all groups

    # Find ways of joining together each group
    cache_unjoinable_pairs = defaultdict(int)
    for candidate_group, candidate_group_filters_covered in eager_candidate_exploration():
        print("")
        print("Candidate group: " + str(candidate_group))
        num_unique_filters = len(
            {f_id for _, _, f_id in candidate_group_filters_covered})
        print("Covers #Filters: " + str(num_unique_filters))

        if len(candidate_group) == 1:
            table = candidate_group[0]
            path = table_path[table]
            materialized_virtual_schema = dpu.get_dataframe(path + "/" + table)
            attrs_to_project = dpu.obtain_attributes_to_project(
                (candidate_group_filters_covered, None))
            yield materialized_virtual_schema, attrs_to_project
            continue  # go to the next group

        # Pre-check
        # TODO: with a connected-components index we could pre-filter many of these groups without checking
        group_with_all_relations, join_path_groups = self.joinable(
            candidate_group, cache_unjoinable_pairs)
        if debug_enumerate_all_jps:
            print("Join paths which cover candidate group:")
            for jp in group_with_all_relations:
                print(jp)
            print("Join graphs which cover candidate group: ")
            for i, group in enumerate(join_path_groups):
                print("Group: " + str(i))
                for el in group:
                    print(el)
            continue  # we are just interested in all JPs for all candidate groups

        # If there are neither paths nor graphs, skip to the next group
        if len(join_path_groups) == 0 and len(group_with_all_relations) == 0:
            print("Group: " + str(candidate_group) + " is Non-Joinable")
            continue

        # First check whether group_with_all_relations is materializable
        materializable_join_paths = []
        if len(group_with_all_relations) > 0:
            join_paths = self.tx_join_paths_to_pair_hops(
                group_with_all_relations)
            annotated_join_paths = self.annotate_join_paths_with_filter(
                join_paths, table_fulfilled_filters, candidate_group)
            # Check JP materialization
            print("Found " + str(len(annotated_join_paths)) +
                  " candidate join paths")
            valid_join_paths = self.verify_candidate_join_paths(
                annotated_join_paths)
            print("Found " + str(len(valid_join_paths)) +
                  " materializable join paths")
            materializable_join_paths.extend(valid_join_paths)

        # We need at least one materializable JP from each group
        if len(materializable_join_paths) == 0 and len(join_path_groups) == 0:
            print("No join graphs for this candidate group")
            continue

        print("Processing join graphs...")
        materializable_join_graphs = dict()
        for k, v in join_path_groups.items():
            print("Pair: " + str(k))
            join_paths = self.tx_join_paths_to_pair_hops(v)
            annotated_join_paths = self.annotate_join_paths_with_filter(
                join_paths, table_fulfilled_filters, candidate_group)
            # Check JP materialization
            print("Found " + str(len(annotated_join_paths)) +
                  " candidate join paths for join graph")
            # For each candidate join path, check whether it can be materialized,
            # then show it to the user (or the other way around)
            valid_join_paths = self.verify_candidate_join_paths(
                annotated_join_paths)
            print("Found " + str(len(valid_join_paths)) +
                  " materializable join paths for join graph")
            if len(valid_join_paths) > 0:
                materializable_join_graphs[k] = valid_join_paths
            else:
                # This pair is non-materializable, but other groups of pairs may
                # cover the same tables, so we can only continue; we cannot yet
                # determine that the group is non-materializable.
                continue

        # Verify whether the join graphs cover the group
        covered_tables = set(candidate_group)
        for k, _ in materializable_join_graphs.items():
            (t1, t2) = k
            if t1 in covered_tables:
                covered_tables.remove(t1)
            if t2 in covered_tables:
                covered_tables.remove(t2)
        if len(covered_tables) > 0:
            # Now we know there are no join graphs in this group, so we
            # explicitly mark it as such
            materializable_join_graphs.clear()
            materializable_join_graphs = list()  # next block of processing expects a list
        else:
            # 1) find key-groups
            keygroups = defaultdict(list)
            current_id = 0
            for keygroup in itertools.combinations(
                    list(materializable_join_graphs.keys()),
                    len(candidate_group) - 1):
                for key in keygroup:
                    keygroups[current_id].append(
                        materializable_join_graphs[key])
                current_id += 1

            # 2) for each key-group, enumerate all paths
            unit_jp = []
            for _, keygroup in keygroups.items():
                # def unpack(packed_list):
                #     for el in packed_list:
                #         yield [v[0] for v in el]
                args = keygroup
                for comb in itertools.product(*args):
                    unit_jp.append(comb)

            # Pack units into a more compact format
            materializable_join_graphs = []  # TODO: note we are rebinding the type of a var in scope
            for unit in unit_jp:
                packed_unit = []
                for el in unit:
                    packed_unit.append(el[0])
                materializable_join_graphs.append(packed_unit)
        print("Processing join graphs...OK")

        # Merge join paths and join graphs; at this point the difference is meaningless
        # TODO: are paths necessarily contained in graphs? If so, simplify the code above
        all_jgs = materializable_join_graphs + materializable_join_paths

        print("Processing materializable join paths...")
        # Sort materializable join paths by how likely they join on a key
        all_jgs_scores = rank_materializable_join_graphs(
            all_jgs, table_path, self)

        clean_jp = []
        for annotated_jp, aggr_score, mul_score in all_jgs_scores:
            jp = []
            filters = set()
            for filter, l, r in annotated_jp:
                # To drag filters along, there is a special leaf tuple where r may be
                # None. Since we no longer need it at this point, we check for its
                # existence and do not include it.
                if r is not None:
                    jp.append((l, r))
                if filter is not None:
                    filters.update(filter)
            clean_jp.append((filters, jp))

        import pickle
        with open("check_debug.pkl", 'wb') as f:
            pickle.dump(clean_jp, f)

        for mjp in clean_jp:
            attrs_to_project = dpu.obtain_attributes_to_project(mjp)
            # materialized_virtual_schema = dpu.materialize_join_path(mjp, self)
            materialized_virtual_schema = dpu.materialize_join_graph(mjp, self)
            yield materialized_virtual_schema, attrs_to_project

    print("Finished enumerating groups")
    cache_unjoinable_pairs = OrderedDict(
        sorted(cache_unjoinable_pairs.items(),
               key=lambda x: x[1],
               reverse=True))
    for k, v in cache_unjoinable_pairs.items():
        print(str(k) + " => " + str(v))
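# Hedged sketch (not part of the original code): illustrates, on toy data, the
# key-group logic in the function above. Each pair of tables maps to its list
# of verified join paths; choosing len(candidate_group) - 1 pairs and taking
# the cartesian product of their path lists enumerates candidate join graphs.
# All names and values here are made up for the demonstration.
def _demo_keygroup_enumeration():
    import itertools
    from collections import defaultdict

    candidate_group = ["A", "B", "C"]
    # Hypothetical verified join paths per table pair
    materializable_join_graphs = {
        ("A", "B"): [["jp_ab_1"], ["jp_ab_2"]],
        ("B", "C"): [["jp_bc_1"]],
    }
    # 1) find key-groups: combinations of len(group) - 1 pairs
    keygroups = defaultdict(list)
    current_id = 0
    for keygroup in itertools.combinations(
            list(materializable_join_graphs.keys()),
            len(candidate_group) - 1):
        for key in keygroup:
            keygroups[current_id].append(materializable_join_graphs[key])
        current_id += 1
    # 2) for each key-group, enumerate all path combinations
    for _, keygroup in keygroups.items():
        for comb in itertools.product(*keygroup):
            print(comb)  # one candidate combination of join paths per pair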
def virtual_schema_iterative_search(self,
                                    list_attributes: [str],
                                    list_samples: [str],
                                    perf_stats,
                                    max_hops=2,
                                    debug_enumerate_all_jps=False):
    # Align schema definition and samples
    st_stage1 = time.time()
    assert len(list_attributes) == len(list_samples)
    sch_def = {
        attr: value
        for attr, value in zip(list_attributes, list_samples)
    }
    sch_def = OrderedDict(
        sorted(sch_def.items(), key=lambda x: x[0], reverse=True))
    filter_drs = self.joint_filters(sch_def)
    et_stage1 = time.time()
    perf_stats['t_stage1'] = (et_stage1 - st_stage1)

    st_stage2 = time.time()
    # Group tables into groups that fulfill multiple filters.
    # Obtain the list of tables ordered from more to fewer filters.
    table_fulfilled_filters = defaultdict(list)
    table_nid = dict()  # collect nids -- used later to obtain an access path to the tables
    for filter, drs in filter_drs.items():
        drs.set_table_mode()
        # All these tables fulfill the filter above
        for table in drs:
            # table_fulfilled_filters[table].append(filter)
            if filter[1] == FilterType.ATTR:
                columns = [c for c in drs.data]  # copy
                for c in columns:
                    if c.source_name == table:
                        table_nid[table] = c.nid
                # if filter not in table_fulfilled_filters[table]:
                if filter[2] not in [
                        id for _, _, id in table_fulfilled_filters[table]
                ]:
                    table_fulfilled_filters[table].append(
                        ((filter[0], None), FilterType.ATTR, filter[2]))
            elif filter[1] == FilterType.CELL:
                columns = [c for c in drs.data]  # copy
                for c in columns:
                    if c.source_name == table:  # filter in this column
                        table_nid[table] = c.nid
                        # if filter not in table_fulfilled_filters[table]:
                        if filter[2] not in [
                                id for _, _, id in table_fulfilled_filters[table]
                        ]:
                            table_fulfilled_filters[table].append(
                                ((filter[0], c.field_name), FilterType.CELL,
                                 filter[2]))
    table_path = obtain_table_paths(table_nid, self)

    # Sort by number of unique fulfilled filters (desc), then lexicographically
    table_fulfilled_filters = OrderedDict(
        sorted(table_fulfilled_filters.items(),
               key=lambda el:
               (len({filter_id for _, _, filter_id in el[1]}), el[0]),
               reverse=True))

    # Order filters for more determinism
    for k, v in table_fulfilled_filters.items():
        v = sorted(v, key=lambda el: (el[2], el[0][0]),
                   reverse=True)  # sort by id, then filter name
        table_fulfilled_filters[k] = v

    def eager_candidate_exploration():
        def covers_filters(candidate_filters, all_filters):
            # note: all_filters is unused; coverage is computed against filter_drs
            all_filters_set = set([id for _, _, id in filter_drs.keys()])
            candidate_filters_set = set(
                [id for _, _, id in candidate_filters])
            if len(candidate_filters_set) == len(all_filters_set):
                return True
            return False

        def compute_size_filter_ix(filters, candidate_group_filters_covered):
            new_fs_set = set([id for _, _, id in filters])
            candidate_fs_set = set(
                [id for _, _, id in candidate_group_filters_covered])
            ix_size = len(
                new_fs_set.union(candidate_fs_set)) - len(candidate_fs_set)
            return ix_size

        def clear_state():
            candidate_group.clear()
            candidate_group_filters_covered.clear()

        # Eagerly obtain groups of tables that cover as many filters as possible
        backup = []
        go_on = True
        while go_on:
            candidate_group = []
            candidate_group_filters_covered = set()
            for i in range(len(list(table_fulfilled_filters.items()))):
                table_pivot, filters_pivot = list(
                    table_fulfilled_filters.items())[i]
                # Eagerly add pivot
                candidate_group.append(table_pivot)
                candidate_group_filters_covered.update(filters_pivot)
                # Did it cover all filters?
                # if len(candidate_group_filters_covered) == len(filter_drs.items()):
                if covers_filters(candidate_group_filters_covered,
                                  filter_drs.items()):
                    candidate_group = sorted(candidate_group)
                    # print("1: " + str(table_pivot))
                    yield (candidate_group,
                           candidate_group_filters_covered)  # early stop
                    # Cleaning
                    clear_state()
                    continue
                for j in range(len(list(table_fulfilled_filters.items()))):
                    idx = i + j + 1
                    if idx == len(table_fulfilled_filters.items()):
                        break
                    table, filters = list(
                        table_fulfilled_filters.items())[idx]
                    # new_filters = len(set(filters).union(candidate_group_filters_covered)) - len(candidate_group_filters_covered)
                    new_filters = compute_size_filter_ix(
                        filters, candidate_group_filters_covered)
                    if new_filters > 0:  # add table only if it adds new filters
                        candidate_group.append(table)
                        candidate_group_filters_covered.update(filters)
                    if covers_filters(candidate_group_filters_covered,
                                      filter_drs.items()):
                        # if len(candidate_group_filters_covered) == len(filter_drs.items()):
                        candidate_group = sorted(candidate_group)
                        # print("2: " + str(table_pivot))
                        yield (candidate_group,
                               candidate_group_filters_covered)
                        clear_state()
                        # Re-add the current pivot, only necessary in this case
                        candidate_group.append(table_pivot)
                        candidate_group_filters_covered.update(filters_pivot)
                candidate_group = sorted(candidate_group)
                # print("3: " + str(table_pivot))
                if covers_filters(candidate_group_filters_covered,
                                  filter_drs.items()):
                    yield (candidate_group, candidate_group_filters_covered)
                else:
                    backup.append(
                        ([el for el in candidate_group],
                         set([el for el in candidate_group_filters_covered])))
                # Cleaning
                clear_state()
            # Before exiting, return the backup groups in case they are useful
            for candidate_group, candidate_group_filters_covered in backup:
                yield (candidate_group, candidate_group_filters_covered)
            go_on = False  # finished exploring all groups

    # """
    # # FIXME: obtaining pairs of tables to join?
    # """
    # all_pairs = 0
    # candidate_groups = 0
    # for cg, _ in eager_candidate_exploration():
    #     candidate_groups += 1
    #     all_pairs += len([el for el in list(itertools.combinations(cg, 2))])
    # # all_pairs_to_join = [len([el for el in list(itertools.combinations(group_tables, 2))])
    # #                      for group_tables in all_candidate_groups]
    # # all_pairs += all_pairs_to_join[0]
    # # print([el for el in all_candidate_groups])
    # # print("all pairs to join: " + str(all_pairs))
    # print("CG: " + str(candidate_groups))
    # print("TOTAL: " + str(all_pairs))
    # exit()
    # """
    # # FIXME
    # """

    et_stage2 = time.time()
    perf_stats['t_stage2'] = (et_stage2 - st_stage2)

    # Find ways of joining together each group
    cache_unjoinable_pairs = defaultdict(int)
    perf_stats['time_joinable'] = 0
    perf_stats['time_is_materializable'] = 0
    perf_stats['time_materialize'] = 0
    num_candidate_groups = 0
    for candidate_group, candidate_group_filters_covered in eager_candidate_exploration():
        num_candidate_groups += 1
        print("")
        print("Candidate group: " + str(candidate_group))
        num_unique_filters = len(
            {f_id for _, _, f_id in candidate_group_filters_covered})
        print("Covers #Filters: " + str(num_unique_filters))

        if len(candidate_group) == 1:
            table = candidate_group[0]
            path = table_path[table]
            # materialized_virtual_schema = dpu.get_dataframe(path + "/" + table)
            materialized_virtual_schema = dpu.read_relation(path + "/" + table)
            attrs_to_project = dpu.obtain_attributes_to_project(
                candidate_group_filters_covered)
            # Create metadata to document this view
            view_metadata = dict()
            view_metadata["#join_graphs"] = 1
            view_metadata["join_graph"] = {
                "nodes": [{
                    "id": -101010,
                    "label": table
                }],
                "edges": []
            }
            if 'single_relation_group' not in perf_stats:
                perf_stats['single_relation_group'] = 0
            perf_stats['single_relation_group'] += 1
            yield materialized_virtual_schema, attrs_to_project, view_metadata
            continue  # go to the next group

        # Pre-check
        # TODO: with a connected-components index we could pre-filter many of these groups without checking
        # group_with_all_relations, join_path_groups = self.joinable(candidate_group, cache_unjoinable_pairs)
        # Find the different join graphs that would join the candidate_group
        st_joinable = time.time()
        join_graphs = self.joinable(candidate_group,
                                    cache_unjoinable_pairs,
                                    max_hops=max_hops)
        et_joinable = time.time()
        perf_stats['time_joinable'] += (et_joinable - st_joinable)
        if debug_enumerate_all_jps:
            for i, group in enumerate(join_graphs):
                print("Group: " + str(i))
                for el in group:
                    print(el)
            continue  # we are just interested in all JPs for all candidate groups

        # If there are no join graphs, skip to the next group
        if len(join_graphs) == 0:
            if 'unjoinable_candidate_group' not in perf_stats:
                perf_stats['unjoinable_candidate_group'] = 0
            perf_stats['unjoinable_candidate_group'] += 1
            print("Group: " + str(candidate_group) +
                  " is Non-Joinable with max_hops=" + str(max_hops))
            continue
        if 'joinable_candidate_group' not in perf_stats:
            perf_stats['joinable_candidate_group'] = 0
        perf_stats['joinable_candidate_group'] += 1
        if 'num_join_graphs_per_candidate_group' not in perf_stats:
            perf_stats['num_join_graphs_per_candidate_group'] = []
        perf_stats['num_join_graphs_per_candidate_group'].append(
            len(join_graphs))

        # Now we check every join graph individually to see if it is materializable. Only once
        # we have exhausted these join graphs do we move on to the next candidate group. We
        # already know that each join graph covers all tables in candidate_group, so if it is
        # materializable we are good.
        total_materializable_join_graphs = 0
        materializable_join_graphs = []
        for jpg in join_graphs:
            # Obtain filters that apply to this join graph
            filters = set()
            for l, r in jpg:
                if l.source_name in table_fulfilled_filters:
                    filters.update(table_fulfilled_filters[l.source_name])
                if r.source_name in table_fulfilled_filters:
                    filters.update(table_fulfilled_filters[r.source_name])
            # TODO: obtain join_graph score for diff metrics; useful for ranking later
            # rank_materializable_join_graphs(materializable_join_paths, table_path, dod)
            st_is_materializable = time.time()
            # If the query view specifies only attributes (no sample values), any join
            # graph is considered materializable; alternatively we could join on a
            # small sample -- two different impls are possible.
            if any(el != '' for el in list_samples):
                is_join_graph_valid = self.is_join_graph_materializable(
                    jpg, table_fulfilled_filters)
            else:
                is_join_graph_valid = True
            et_is_materializable = time.time()
            perf_stats['time_is_materializable'] += (et_is_materializable -
                                                     st_is_materializable)
            # Collect all materializable graphs, then materialize
            if is_join_graph_valid:
                total_materializable_join_graphs += 1
                materializable_join_graphs.append((jpg, filters))

        # At this point we could empty the is-join-graph-materializable cache and create a new one
        # dpu.empty_relation_cache()
        # TODO: if df.copy() works, then this is a nice reuse
        st_materialize = time.time()
        to_return = self.materialize_join_graphs(materializable_join_graphs)
        et_materialize = time.time()
        perf_stats['time_materialize'] += (et_materialize - st_materialize)
        # yield to_return
        for el in to_return:
            if 'actually_materialized' not in perf_stats:
                perf_stats['actually_materialized'] = 0
            perf_stats['actually_materialized'] += 1
            yield el
        if 'materializable_join_graphs' not in perf_stats:
            perf_stats['materializable_join_graphs'] = []
        perf_stats['materializable_join_graphs'].append(
            total_materializable_join_graphs)

    perf_stats["num_candidate_groups"] = num_candidate_groups
    print("Finished enumerating groups")
    cache_unjoinable_pairs = OrderedDict(
        sorted(cache_unjoinable_pairs.items(),
               key=lambda x: x[1],
               reverse=True))
    for k, v in cache_unjoinable_pairs.items():
        print(str(k) + " => " + str(v))
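# Usage sketch (assumption: illustrative only; attribute/sample values are made
# up). This variant threads a perf_stats dict through the search so the caller
# can inspect where time went after (or while) consuming views:
#
#   perf_stats = dict()
#   for view, attrs_to_project, metadata in dod.virtual_schema_iterative_search(
#           ["school_name", "zip"], ["", ""], perf_stats, max_hops=2):
#       pass  # consume views
#   print(perf_stats['t_stage1'], perf_stats['time_joinable'],
#         perf_stats.get('materializable_join_graphs'))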