def test_content_sim_num():
    '''
    SETUP
    '''
    start_all = time.time()
    network = FieldNetwork()
    store = StoreHandler()

    # Get all fields from store
    fields_gen = store.get_all_fields()

    # Network skeleton and hierarchical relations (table - field), etc
    start_schema = time.time()
    network.init_meta_schema(fields_gen)
    end_schema = time.time()
    print("Total skeleton: {0}".format(str(end_schema - start_schema)))

    '''
    ACTUAL TEST
    '''
    # Content_sim num relation
    start_num_sig_sim = time.time()
    id_sig = store.get_all_fields_num_signatures()
    # networkbuilder.build_content_sim_relation_num(network, id_sig)
    networkbuilder.build_content_sim_relation_num_overlap_distr(network, id_sig)
    end_num_sig_sim = time.time()
    print("Total num-sig-sim: {0}".format(str(end_num_sig_sim - start_num_sig_sim)))
def test(path_to_serialized_model):
    # Deserialize model
    network = fieldnetwork.deserialize_network(path_to_serialized_model)
    # Create client
    store_client = StoreHandler()

    # Load glove model
    print("Loading language model...")
    path_to_glove_model = "../glove/glove.6B.100d.txt"
    glove_api.load_model(path_to_glove_model)
    print("Loading language model...OK")

    # Retrieve indexes
    schema_sim_index = io.deserialize_object(path_to_serialized_model + 'schema_sim_index.pkl')
    content_sim_index = io.deserialize_object(path_to_serialized_model + 'content_sim_index.pkl')

    # Create ontomatch api
    om = SSAPI(network, store_client, schema_sim_index, content_sim_index)
    # Load parsed ontologies
    om.add_krs([("efo", "cache_onto/efo.pkl")], parsed=True)
    om.add_krs([("clo", "cache_onto/clo.pkl")], parsed=True)
    om.add_krs([("bao", "cache_onto/bao.pkl")], parsed=True)
    # om.add_krs([("go", "cache_onto/go.pkl")], parsed=True)  # parse again

    print("Finding matchings...")
    st = time.time()
    matchings = om.find_matchings()
    et = time.time()
    print("Finding matchings...OK")
    print("Took: " + str(et - st))

    for k, v in matchings:
        print(v)

    return om
def main(path_to_serialized_model):
    print('Loading: ' + str(path_to_serialized_model))
    network = fieldnetwork.deserialize_network(path_to_serialized_model)
    store_client = StoreHandler()
    api = API(network, store_client)

    ip_shell = InteractiveShellEmbed(banner1=init_banner, exit_msg=exit_banner)
    ip_shell()
def test_find_links(path_to_serialized_model, matchings):
    # Deserialize model
    network = fieldnetwork.deserialize_network(path_to_serialized_model)
    # Create client
    store_client = StoreHandler()

    # Load glove model
    print("Loading language model...")
    path_to_glove_model = "../glove/glove.6B.100d.txt"
    glove_api.load_model(path_to_glove_model)
    print("Loading language model...OK")

    # Retrieve indexes
    schema_sim_index = io.deserialize_object(path_to_serialized_model + 'schema_sim_index.pkl')
    content_sim_index = io.deserialize_object(path_to_serialized_model + 'content_sim_index.pkl')

    om = SSAPI(network, store_client, schema_sim_index, content_sim_index)
    om.add_krs([("efo", "cache_onto/efo.pkl")], parsed=True)
    om.add_krs([("clo", "cache_onto/clo.pkl")], parsed=True)
    om.add_krs([("bao", "cache_onto/bao.pkl")], parsed=True)

    links = om.find_links(matchings)
    for link in links:
        print(link)
def generate_matchings(input_model_path, input_ontology_name_path, output_file):
    # Deserialize model
    network = fieldnetwork.deserialize_network(input_model_path)
    # Create client
    store_client = StoreHandler()

    # Load glove model
    print("Loading language model...")
    path_to_glove_model = "../glove/glove.6B.100d.txt"
    glove_api.load_model(path_to_glove_model)
    print("Loading language model...OK")

    # Retrieve indexes
    schema_sim_index = io.deserialize_object(input_model_path + 'schema_sim_index.pkl')
    content_sim_index = io.deserialize_object(input_model_path + 'content_sim_index.pkl')

    # Create ontomatch api
    om = SSAPI(network, store_client, schema_sim_index, content_sim_index)
    for onto_name, onto_parsed_path in input_ontology_name_path:
        # Load parsed ontology
        om.add_krs([(onto_name, onto_parsed_path)], parsed=True)

    matchings = om.find_matchings()

    with open(output_file, 'w') as f:
        for m in matchings:
            f.write(str(m) + '\n')
    print("Done!")
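# A minimal, hypothetical invocation of generate_matchings. The model directory,
# ontology cache paths and output file below are placeholders chosen for
# illustration, not files guaranteed to ship with the repository.
if __name__ == "__main__":
    generate_matchings("../models/chemical/",
                       [("efo", "cache_onto/efo.pkl"), ("clo", "cache_onto/clo.pkl")],
                       "matchings.txt")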
def main(path_to_serialized_model):
    # Deserialize model
    network = fieldnetwork.deserialize_network(path_to_serialized_model)
    # Create client
    store_client = StoreHandler()

    # Load glove model
    print("Loading language model...")
    path_to_glove_model = "../glove/glove.6B.100d.txt"
    glove_api.load_model(path_to_glove_model)
    print("Loading language model...OK")

    # Retrieve indexes
    schema_sim_index = io.deserialize_object(path_to_serialized_model + 'schema_sim_index.pkl')
    content_sim_index = io.deserialize_object(path_to_serialized_model + 'content_sim_index.pkl')

    om = SSAPI(network, store_client, schema_sim_index, content_sim_index)
    om.add_krs([("dbpedia", "cache_onto/dbpedia.pkl")], parsed=True)

    matchings = om.find_matchings()
    print("Found: " + str(len(matchings)))
    for m in matchings:
        print(m)

    return om
def plot_num():
    network = FieldNetwork()
    store = StoreHandler()
    fields, num_signatures = store.get_all_fields_num_signatures()

    xaxis = []
    yaxis = []
    numpoints = 0
    for x, y in num_signatures:
        numpoints = numpoints + 1
        xaxis.append(x)
        yaxis.append(y)
    print("Num points: " + str(numpoints))

    import matplotlib.pyplot as plt
    plt.plot(xaxis, yaxis, 'ro')
    plt.axis([0, 600000, 0, 600000])
    # plt.axis([0, 10000, 0, 10000])
    # plt.axis([0, 500, 0, 500])
    plt.show()
def init_system(path_to_serialized_model, create_reporting=False):
    print_md('Loading: *' + str(path_to_serialized_model) + "*")
    sl = time.time()
    network = fieldnetwork.deserialize_network(path_to_serialized_model)
    store_client = StoreHandler()
    api = API(network=network, store_client=store_client)
    reporting = None  # only built on demand; avoids returning an unbound name
    if create_reporting:
        reporting = Report(network)
    api.helper.help()
    el = time.time()
    print("Took " + str(el - sl) + " to load model")
    return api, reporting
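# Minimal usage sketch for init_system; the model directory and the keyword are
# placeholders for illustration only.
api, reporting = init_system("../models/dwh/", create_reporting=True)
res = api.keyword_search("Madden", max_results=10)
for hit in res:
    print(hit)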
def __init__(self):
    self.store_client = StoreHandler()
    self.network = None
    self.schema_sim_index = None
    self.content_sim_index = None
    self.ontomatch_api = None
    self.matchings = None
    self.l4_matchings = None
    self.l5_matchings = None
    self.l52_matchings = None
    self.l42_matchings = None
    self.l1_matchings = None
    self.l7_matchings = None
    self.l42_summarized = None
    self.l52_summarized = None
def test_fuzzy(path_to_serialized_model):
    # Deserialize model
    network = fieldnetwork.deserialize_network(path_to_serialized_model)
    # Create client
    store_client = StoreHandler()

    # Retrieve indexes
    schema_sim_index = io.deserialize_object(path_to_serialized_model + 'schema_sim_index.pkl')
    content_sim_index = io.deserialize_object(path_to_serialized_model + 'content_sim_index.pkl')

    # Create ontomatch api
    om = SSAPI(network, store_client, schema_sim_index, content_sim_index)
    # Load parsed ontology
    om.add_krs([("efo", "cache_onto/efo.pkl")], parsed=True)

    matchings = matcherlib.find_hierarchy_content_fuzzy(om.kr_handlers, store_client)
    for m in matchings:
        print(m)
class TestReporting(unittest.TestCase):

    # create store handler
    store_client = StoreHandler()
    # read graph
    path = '../test/test4/'
    network = deserialize_network(path)
    api = API(network)
    api.init_store()

    def test_compute_statistics(self):
        r = Report(self.network)
        ncols = r.num_columns
        ntables = r.num_tables
        ncontent = r.num_content_sim_relations
        nschema = r.num_schema_sim_relations
        npkfk = r.num_pkfk_relations
        print("Num cols: " + str(ncols))
        print("Num tables: " + str(ntables))
        print("Num content sim relations: " + str(ncontent))
        print("Num schema sim relations: " + str(nschema))
        print("Num PKFK relations: " + str(npkfk))
def main(args):
    model_path = args.model_path
    separator = args.separator

    store_client = StoreHandler()
    network = fieldnetwork.deserialize_network(model_path)
    dod = DoD(network=network, store_client=store_client, csv_separator=separator)

    attrs = args.list_attributes.split(";")
    values = args.list_values.split(";")
    print(attrs)
    print(values)
    assert len(attrs) == len(values)

    i = 0
    for mjp, attrs_project, metadata in dod.virtual_schema_iterative_search(attrs, values,
                                                                            debug_enumerate_all_jps=False):
        print("JP: " + str(i))
        proj_view = dpu.project(mjp, attrs_project)
        print(str(proj_view.head(10)))
        print("Metadata")
        print(metadata)
        if args.output_path:
            if args.full_view:
                mjp.to_csv(args.output_path + "/raw_view_" + str(i), encoding='latin1', index=False)
            # always store this
            proj_view.to_csv(args.output_path + "/view_" + str(i), encoding='latin1', index=False)
        i += 1
        if args.interactive == "True":
            print("")
            input("Press any key to continue...")
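# A sketched argparse wiring for main(args) above; the flag names mirror the
# attributes the function reads, but the defaults and help strings here are
# assumptions, not the repository's actual CLI definition.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", required=True, help="path to serialized model")
    parser.add_argument("--separator", default=",", help="CSV separator used by the underlying data")
    parser.add_argument("--list_attributes", required=True, help="';'-separated attribute names")
    parser.add_argument("--list_values", required=True, help="';'-separated values aligned with the attributes")
    parser.add_argument("--output_path", default=None, help="directory where views are written")
    parser.add_argument("--full_view", action="store_true", help="also write the raw (unprojected) view")
    parser.add_argument("--interactive", default="False", help="'True' to pause after each view")
    main(parser.parse_args())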
def test_4_n_42(path_to_serialized_model):
    # Deserialize model
    network = fieldnetwork.deserialize_network(path_to_serialized_model)
    # Create client
    store_client = StoreHandler()

    # Load glove model
    print("Loading language model...")
    path_to_glove_model = "../glove/glove.6B.100d.txt"
    glove_api.load_model(path_to_glove_model)
    print("Loading language model...OK")

    # Retrieve indexes
    schema_sim_index = io.deserialize_object(path_to_serialized_model + 'schema_sim_index.pkl')
    content_sim_index = io.deserialize_object(path_to_serialized_model + 'content_sim_index.pkl')

    # Create ontomatch api
    om = SSAPI(network, store_client, schema_sim_index, content_sim_index)
    # Load parsed ontology
    # om.add_krs([("efo", "cache_onto/efo.pkl")], parsed=True)
    # om.add_krs([("clo", "cache_onto/clo.pkl")], parsed=True)
    # om.add_krs([("bao", "cache_onto/bao.pkl")], parsed=True)
    om.add_krs([("dbpedia", "cache_onto/dbpedia.pkl")], parsed=True)  # parse again

    # L6: [Relations] -> [Class names] (semantic groups)
    print("Finding L6 matchings...")
    st = time.time()
    l6_matchings, sem_coh_groups = matcherlib.find_sem_coh_matchings(om.network, om.kr_handlers)
    print("Finding L6 matchings...OK, " + str(len(l6_matchings)) + " found")
    et = time.time()
    print("Took: " + str(et - st))

    for m in l6_matchings:
        print(m)

    for k, v in sem_coh_groups.items():
        print(str(k) + " -> " + str(v))

    exit()

    print("Finding matchings...")
    st = time.time()

    # L4: [Relation names] -> [Class names] (syntax)
    print("Finding L4 matchings...")
    st = time.time()
    l4_matchings = matcherlib.find_relation_class_name_matchings(om.network, om.kr_handlers)
    print("Finding L4 matchings...OK, " + str(len(l4_matchings)) + " found")
    et = time.time()
    print("Took: " + str(et - st))

    print("computing fanout")
    fanout = defaultdict(int)
    for m in l4_matchings:
        sch, cla = m
        fanout[sch] += 1
    ordered = sorted(fanout.items(), key=operator.itemgetter(1), reverse=True)
    for o in ordered:
        print(o)

    # for match in l4_matchings:
    #     print(match)

    # L4.2: [Relation names] -> [Class names] (semantic)
    print("Finding L42 matchings...")
    st = time.time()
    l42_matchings = matcherlib.find_relation_class_name_sem_matchings(om.network, om.kr_handlers)
    print("Finding L42 matchings...OK, " + str(len(l42_matchings)) + " found")
    et = time.time()
    print("Took: " + str(et - st))

    et = time.time()
    print("Finding matchings...OK")
    print("Took: " + str(et - st))

    print("are l4 subsumed by l42?")
    not_in_l42 = 0
    not_subsumed = []
    for m in l4_matchings:
        if m not in l42_matchings:
            not_in_l42 += 1
            not_subsumed.append(m)
    print("NOT-subsumed: " + str(not_in_l42))

    """
    # L5: [Attribute names] -> [Class names] (syntax)
    print("Finding L5 matchings...")
    st = time.time()
    l5_matchings = matcherlib.find_relation_class_attr_name_matching(om.network, om.kr_handlers)
    print("Finding L5 matchings...OK, " + str(len(l5_matchings)) + " found")
    et = time.time()
    print("Took: " + str(et - st))

    # for match in l5_matchings:
    #     print(match)

    # l52_matchings = []

    # L52: [Attribute names] -> [Class names] (semantic)
    print("Finding L52 matchings...")
    st = time.time()
    l52_matchings = matcherlib.find_relation_class_attr_name_sem_matchings(om.network, om.kr_handlers)
    print("Finding L52 matchings...OK, " + str(len(l52_matchings)) + " found")
    et = time.time()
    print("Took: " + str(et - st))
    """

    with open('OUTPUT_442_only', 'w') as f:
        f.write("L4" + '\n')
        for m in l4_matchings:
            f.write(str(m) + '\n')
        f.write("L42" + '\n')
        for m in l42_matchings:
            f.write(str(m) + '\n')
        f.write("L5" + '\n')
def main(output_path=None):
    start_all = time.time()
    network = FieldNetwork()
    store = StoreHandler()

    # Get all fields from store
    fields_gen = store.get_all_fields()

    # Network skeleton and hierarchical relations (table - field), etc
    start_schema = time.time()
    network.init_meta_schema(fields_gen)
    end_schema = time.time()
    print("Total skeleton: {0}".format(str(end_schema - start_schema)))
    print("!!1 " + str(end_schema - start_schema))

    # Schema_sim relation
    start_schema_sim = time.time()
    schema_sim_index = networkbuilder.build_schema_sim_relation(network)
    end_schema_sim = time.time()
    print("Total schema-sim: {0}".format(str(end_schema_sim - start_schema_sim)))
    print("!!2 " + str(end_schema_sim - start_schema_sim))

    # Entity_sim relation
    start_entity_sim = time.time()
    # fields, entities = store.get_all_fields_entities()
    # networkbuilder.build_entity_sim_relation(network, fields, entities)
    end_entity_sim = time.time()
    print("Total entity-sim: {0}".format(str(end_entity_sim - start_entity_sim)))

    """
    # Content_sim text relation (random-projection based)
    start_text_sig_sim = time.time()
    st = time.time()
    text_signatures = store.get_all_fields_text_signatures(network)
    et = time.time()
    print("Time to extract signatures from store: {0}".format(str(et - st)))
    print("!!3 " + str(et - st))
    networkbuilder.build_content_sim_relation_text_lsa(network, text_signatures)
    end_text_sig_sim = time.time()
    print("Total text-sig-sim: {0}".format(str(end_text_sig_sim - start_text_sig_sim)))
    print("!!4 " + str(end_text_sig_sim - start_text_sig_sim))
    """

    # Content_sim text relation (minhash-based)
    start_text_sig_sim = time.time()
    st = time.time()
    mh_signatures = store.get_all_mh_text_signatures()
    et = time.time()
    print("Time to extract minhash signatures from store: {0}".format(str(et - st)))
    print("!!3 " + str(et - st))
    content_sim_index = networkbuilder.build_content_sim_mh_text(network, mh_signatures)
    end_text_sig_sim = time.time()
    print("Total text-sig-sim (minhash): {0}".format(str(end_text_sig_sim - start_text_sig_sim)))
    print("!!4 " + str(end_text_sig_sim - start_text_sig_sim))

    # Content_sim num relation
    start_num_sig_sim = time.time()
    id_sig = store.get_all_fields_num_signatures()
    # networkbuilder.build_content_sim_relation_num(network, id_sig)
    networkbuilder.build_content_sim_relation_num_overlap_distr(network, id_sig)
    # networkbuilder.build_content_sim_relation_num_overlap_distr_indexed(network, id_sig)
    end_num_sig_sim = time.time()
    print("Total num-sig-sim: {0}".format(str(end_num_sig_sim - start_num_sig_sim)))
    print("!!5 " + str(end_num_sig_sim - start_num_sig_sim))

    # Primary Key / Foreign key relation
    start_pkfk = time.time()
    networkbuilder.build_pkfk_relation(network)
    end_pkfk = time.time()
    print("Total PKFK: {0}".format(str(end_pkfk - start_pkfk)))
    print("!!6 " + str(end_pkfk - start_pkfk))

    end_all = time.time()
    print("Total time: {0}".format(str(end_all - start_all)))
    print("!!7 " + str(end_all - start_all))

    path = "test/datagov/"
    if output_path is not None:
        path = output_path
    fieldnetwork.serialize_network(network, path)

    # Serialize indexes
    path_schsim = path + "/schema_sim_index.pkl"
    io.serialize_object(schema_sim_index, path_schsim)
    path_cntsim = path + "/content_sim_index.pkl"
    io.serialize_object(content_sim_index, path_cntsim)

    print("DONE!")
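# Hypothetical invocation of the builder above; the output directory is a
# placeholder and is expected to exist before the network and indexes are
# serialized into it.
if __name__ == "__main__":
    main(output_path="test/datagov/")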
def init_store(self):
    # create store handler
    global store_client
    store_client = StoreHandler()
class TestDDApiPathQueries(unittest.TestCase):

    # create store handler
    store_client = StoreHandler()
    # read graph
    path = 'models/chemical/'
    network = deserialize_network(path)
    api = API(network)
    api.init_store()

    """
    TC primitive API
    """

    def test_paths_between_field_mode(self):
        print(self._testMethodName)

        field1 = ('chembl_21', 'drug_indication', 'record_id')
        field2 = ('chembl_21', 'compound_records', 'record_id')
        drs1 = self.api.drs_from_raw_field(field1)
        drs2 = self.api.drs_from_raw_field(field2)
        res = self.api.paths_between(drs1, drs2, Relation.PKFK)

        data = [x for x in res]
        print("Total results: " + str(len(data)))
        for el in data:
            print(str(el))

    def test_paths_between_table_mode(self):
        print(self._testMethodName)

        field1 = ('chembl_21', 'drug_indication', 'record_id')
        field2 = ('chembl_21', 'compound_records', 'record_id')
        drs1 = self.api.drs_from_raw_field(field1)
        drs2 = self.api.drs_from_raw_field(field2)
        drs1.set_table_mode()
        drs2.set_table_mode()
        res = self.api.paths_between(drs1, drs2, Relation.PKFK)

        data = [x for x in res]
        print("Total results: " + str(len(data)))
        for el in data:
            print(str(el))

        print("Paths: ")
        res.visualize_provenance()
        res.debug_print()
        paths = res.paths()
        for p in paths:
            print(str(p))

    def test_paths_between_from_tables(self):
        print(self._testMethodName)

        table1_name = "drug_indication"
        table2_name = "compound_records"
        table1 = self.api.drs_from_table(table1_name)
        table2 = self.api.drs_from_table(table2_name)
        table1.set_table_mode()
        table2.set_table_mode()
        res = self.api.paths_between(table1, table2, Relation.PKFK)

        data = [x for x in res]
        print("Total results: " + str(len(data)))
        for el in data:
            print(str(el))

        print("Paths: ")
        paths = res.paths()
        for p in paths:
            print(str(p))

    def test_paths(self):
        print(self._testMethodName)
        return

    def test_traverse(self):
        print(self._testMethodName)

        field1 = ('chembl_21', 'drug_indication', 'record_id')
        drs_field = self.api.drs_from_raw_field(field1)
        res = self.api.traverse(drs_field, Relation.SCHEMA_SIM, 1)

        data = [x for x in res]
        print("Total results: " + str(len(data)))
        for el in data:
            print(str(el))
        return
class TestProvenance(unittest.TestCase):

    # create store handler
    store_client = StoreHandler()
    # read graph
    path = '../test/test4/'
    network = deserialize_network(path)
    api = API(network)
    api.init_store()

    def test_keyword_provenance(self):
        print(self._testMethodName)

        res = self.api.keyword_search("Madden", max_results=10)
        print(res.get_provenance().prov_graph().nodes())
        print(res.get_provenance().prov_graph().edges())

        el_interest = [x for x in res][0]
        info = res.why(el_interest)
        print("WHY " + str(el_interest) + "? " + str(info))
        explanation = res.how(el_interest)
        print("HOW " + str(el_interest) + "? " + str(explanation))

        self.assertTrue(True)

    def test_content_sim_provenance(self):
        print(self._testMethodName)

        table = 'Buildings.csv'
        res = self.api.similar_content_to_table(table)
        print(res.get_provenance().prov_graph().nodes())
        print(res.get_provenance().prov_graph().edges())

        el_interest = [x for x in res][0]
        info = res.why(el_interest)
        print("WHY " + str(el_interest) + "? " + str(info))
        explanation = res.how(el_interest)
        print("HOW " + str(el_interest) + "? " + str(explanation))

        self.assertTrue(True)

    def test_intersection_provenance(self):
        print(self._testMethodName)

        res1 = self.api.keyword_search("Madden", max_results=10)
        res2 = self.api.keyword_search("Stonebraker", max_results=10)
        res = res1.intersection(res2)
        print(res.get_provenance().prov_graph().nodes())
        print(res.get_provenance().prov_graph().edges())

        el_interest = [x for x in res][0]
        info = res.why(el_interest)
        print("WHY " + str(el_interest) + "? " + str(info))
        explanation = res.how(el_interest)
        print("HOW " + str(el_interest) + "? " + str(explanation))

        self.assertTrue(True)

    def test_tc_table_mode_provenance(self):
        print(self._testMethodName)

        field1 = ('dwhsmall', 'All_olap2_uentity_desc_uses.csv', 'Entity Owner')
        field2 = ('dwhsmall', 'All_olap_entity_desc_uses.csv', 'Entity Owner')
        drs1 = self.api.drs_from_raw_field(field1)
        drs2 = self.api.drs_from_raw_field(field2)
        drs1.set_table_mode()
        drs2.set_table_mode()
        res = self.api.paths_between(drs1, drs2, Relation.PKFK)
        print(res.get_provenance().prov_graph().nodes())
        print(res.get_provenance().prov_graph().edges())

        el_interest = [x for x in res][0]
        info = res.why(el_interest)
        print("WHY " + str(el_interest) + "? " + str(info))
        explanation = res.how(el_interest)
        print("HOW " + str(el_interest) + "? " + str(explanation))

        self.assertTrue(True)
class TestDDApi(unittest.TestCase):

    # create store handler
    store_client = StoreHandler()
    # read graph
    path = 'models/dwh/'
    network = deserialize_network(path)
    api = API(network)
    api.init_store()

    """
    Seed API
    """

    def test_drs_from_raw_field(self):
        print(self._testMethodName)

        field = ('mitdwh', 'Iap_subject_person.csv', 'Person Mit Affiliation')
        res = self.api.drs_from_raw_field(field)
        for el in res:
            print(str(el))

    def test_drs_from_hit(self):
        print(self._testMethodName)

        field = ('mitdwh', 'Iap_subject_person.csv', 'Person Mit Affiliation')
        res = self.api.drs_from_raw_field(field)
        els = [x for x in res]
        el = els[0]
        res = self.api.drs_from_hit(el)
        for el in res:
            print(str(el))

    def test_drs_from_table(self):
        print(self._testMethodName)

        table = 'Iap_subject_person.csv'
        res = self.api.drs_from_table(table)
        for el in res:
            print(el)

    def test_drs_from_table_hit(self):
        print(self._testMethodName)

        field = ('mitdwh', 'Iap_subject_person.csv', 'Person Mit Affiliation')
        res = self.api.drs_from_raw_field(field)
        els = [x for x in res]
        el = els[0]
        res = self.api.drs_from_table_hit(el)
        for el in res:
            print(str(el))

    """
    Primitive API
    """

    def test_keyword_search(self):
        print(self._testMethodName)

        res = self.api.keyword_search("Madden", max_results=10)
        for el in res:
            print(str(el))

    def test_keywords_search(self):
        print(self._testMethodName)

        res = self.api.keywords_search(["Madden", "Stonebraker", "Liskov"])
        for el in res:
            print(str(el))

    def test_schema_name_search(self):
        print(self._testMethodName)

        res = self.api.schema_name_search("Name", max_results=10)
        for el in res:
            print(str(el))

    def test_schema_names_search(self):
        print(self._testMethodName)

        res = self.api.schema_names_search(["Name", "Last Name", "Employee"])
        for el in res:
            print(str(el))

    def test_entity_search(self):
        print(self._testMethodName)
        print("Future Work...")
        return

    def test_schema_neighbors(self):
        print(self._testMethodName)

        field = ('mitdwh', 'Iap_subject_person.csv', 'Person Mit Affiliation')
        res = self.api.schema_neighbors(field)
        for el in res:
            print(str(el))

    def test_schema_neighbors_of(self):
        print(self._testMethodName)

        field = ('mitdwh', 'Iap_subject_person.csv', 'Person Mit Affiliation')
        res = self.api.schema_neighbors(field)
        res = self.api.schema_neighbors_of(res)
        for el in res:
            print(str(el))

    def test_similar_schema_name_to_field(self):
        print(self._testMethodName)

        field = ('mitdwh', 'Buildings.csv', 'Building Name')
        res = self.api.similar_schema_name_to_field(field)
        print("RES size: " + str(res.size()))
        for el in res:
            print(str(el))

    def test_ids_functions(self):
        print(self._testMethodName)

        field = ('mitdwh', 'Buildings.csv', 'Building Key')
        drs1 = self.api.drs_from_raw_field(field)
        field = ('mitdwh', 'Building Key', 'Buildings.csv')
        drs2 = self.api.drs_from_raw_field(field)
        for el in drs1:
            print(str(el))
        for el in drs2:
            print(str(el))

    def test_similar_schema_name_to_table(self):
        print(self._testMethodName)

        table = 'Buildings.csv'
        res = self.api.similar_schema_name_to_table(table)
        print("RES size: " + str(res.size()))
        for el in res:
            print(str(el))

    def test_similar_schema_name_to(self):
        print(self._testMethodName)

        field = ('mitdwh', 'Buildings.csv', 'Building Key')
        res = self.api.similar_schema_name_to_field(field)
        res = self.api.similar_schema_name_to(res)
        print("RES size: " + str(res.size()))
        for el in res:
            print(str(el))

    def test_similar_content_to_field(self):
        print(self._testMethodName)

        field = ('mitdwh', 'Buildings.csv', 'Building Name')
        res = self.api.similar_content_to_field(field)
        print("RES size: " + str(res.size()))
        for el in res:
            print(str(el))

    def test_similar_content_to_table(self):
        print(self._testMethodName)

        table = 'Buildings.csv'
        res = self.api.similar_content_to_table(table)
        print("RES size: " + str(res.size()))
        for el in res:
            print(str(el))

    def test_similar_content_to(self):
        print(self._testMethodName)

        field = ('mitdwh', 'Buildings.csv', 'Building Name')
        res = self.api.similar_content_to_field(field)
        res = self.api.similar_content_to(res)
        print("RES size: " + str(res.size()))
        for el in res:
            print(str(el))

    def test_pkfk_field(self):
        print(self._testMethodName)

        field = ('mitdwh', 'Buildings.csv', 'Building Name')
        res = self.api.pkfk_field(field)
        print("RES size: " + str(res.size()))
        for el in res:
            print(str(el))

    def test_pkfk_table(self):
        print(self._testMethodName)

        table = 'Buildings.csv'
        res = self.api.pkfk_table(table)
        print("RES size: " + str(res.size()))
        for el in res:
            print(str(el))

    def test_pkfk_of(self):
        print(self._testMethodName)

        field = ('mitdwh', 'Buildings.csv', 'Building Name')
        res = self.api.pkfk_field(field)
        res = self.api.pkfk_of(res)
        print("RES size: " + str(res.size()))
        for el in res:
            print(str(el))

    """
    Combiner API
    """

    def test_intersection(self):
        print(self._testMethodName)

        res1 = self.api.keyword_search("Madden", max_results=10)
        res2 = self.api.keyword_search("Stonebraker", max_results=10)
        res = res1.intersection(res2)
        for el in res:
            print(str(el))

    def test_union(self):
        print(self._testMethodName)

        res1 = self.api.keyword_search("Madden", max_results=10)
        res2 = self.api.schema_name_search("Stonebraker", max_results=10)
        res = res1.union(res2)
        for el in res:
            print(str(el))

    def test_difference(self):
        print(self._testMethodName)

        res1 = self.api.keyword_search("Madden", max_results=10)
        res2 = self.api.keyword_search("Stonebraker", max_results=10)
        res = res1.set_difference(res2)
        for el in res:
            print(str(el))

    """
    Other, bugs, etc
    """

    def test_iter_edges_with_data_bug(self):
        table = "Fac_building.csv"  # The table of interest
        # We get the representation of that table in DRS
        table_drs = self.api.drs_from_table(table)
        # similar tables are those with similar content
        content_similar = self.api.similar_content_to(table_drs)
        # similar attribute names
        schema_similar = self.api.similar_schema_name_to(table_drs)
        # some pkfk relationship involved too
        pkfk_similar = self.api.pkfk_of(table_drs)
        # similar tables are similar in content and schema
        inters1 = self.api.intersection(content_similar, schema_similar)
        similar_tables = self.api.intersection(inters1, pkfk_similar)
        similar_tables.print_tables()
# Ignore in-table results of neighbor searches
# Exclude certain tables
# keyword_search and neighbor_search, but on multiple contexts

import networkx as nx
from api.apiutils import Relation
from modelstore.elasticstore import StoreHandler, KWType
from knowledgerepr import fieldnetwork
from algebra import API

path_to_serialized_model = "/Users/arcarter/code/datadiscovery/test/testmodel/"
network = fieldnetwork.deserialize_network(path_to_serialized_model)
store_client = StoreHandler()
api = API(network, store_client)

# Short variables for scope.
# These are used in keyword searches to specify
# which parts of a file will be searched.
source = KWType.KW_TABLE    # table/file/source name
field = KWType.KW_SCHEMA    # column names/fields
content = KWType.KW_TEXT    # content of the columns

# Short variables for Relation.
# These represent edge types in the graph
# and are used for neighbor searches.
# schema = Relation.SCHEMA  # similar schemas
schema_sim = Relation.SCHEMA_SIM    # similar schema names
# similar content values, i.e. matching substrings and numbers
content_sim = Relation.CONTENT_SIM
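# Illustrative queries against the handles defined above; the call signatures
# mirror those exercised in the test classes in this repository, but the
# keyword and table name are placeholders.
results = api.keyword_search("Madden", max_results=10)
for hit in results:
    print(hit)

table_drs = api.drs_from_table("Buildings.csv")
neighbors = api.traverse(table_drs, content_sim, 1)
for hit in neighbors:
    print(hit)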
class TestRanking(unittest.TestCase):

    # create store handler
    store_client = StoreHandler()
    # create synthetic graph
    network = GENSYN(5, 5, 20, 50, 10)
    api = API(network)
    api.init_store()

    def test_compute_ranking_scores_certainty(self):
        nodes = self.network.fields_degree(3)
        # self.network._visualize_graph()
        nids = [x for x, y in nodes]
        info = self.network.get_info_for(nids)
        hits = self.network.get_hits_from_info(info)
        drs_info = self.api.drs_from_hits(hits)
        # drs_info.visualize_provenance()
        res = self.api.similar_schema_name_to(drs_info)
        # res.visualize_provenance(labels=True)
        res = res.rank_coverage()
        res.pretty_print_columns_with_scores()
        self.assertTrue(True)

    def test_ranking_certainty_chem(self):
        path = '../models/chemical/'
        network = deserialize_network(path)
        api = API(network)
        api.init_store()

        table = 'activities'
        table_drs = api.drs_from_table(table)
        sim_tables = api.similar_content_to(table_drs)

        sim_tables.rank_certainty()
        print("All columns CERTAINTY: ")
        sim_tables.pretty_print_columns_with_scores()
        print("")
        print("All tables CERTAINTY: ")
        sim_tables.print_tables_with_scores()
        print("")

        sim_tables.rank_coverage()
        print("All columns COVERAGE: ")
        sim_tables.pretty_print_columns_with_scores()
        print("")
        print("All tables COVERAGE: ")
        sim_tables.print_tables_with_scores()
        print("")
    """