def test_content_sim_num():
    '''
    SETUP
    '''

    start_all = time.time()
    network = FieldNetwork()
    store = StoreHandler()

    # Get all fields from store
    fields_gen = store.get_all_fields()

    # Network skeleton and hierarchical relations (table - field), etc
    start_schema = time.time()
    network.init_meta_schema(fields_gen)
    end_schema = time.time()
    print("Total skeleton: {0}".format(str(end_schema - start_schema)))
    '''
    ACTUAL TEST
    '''

    # Content_sim num relation
    start_num_sig_sim = time.time()
    id_sig = store.get_all_fields_num_signatures()
    # networkbuilder.build_content_sim_relation_num(network, id_sig)
    networkbuilder.build_content_sim_relation_num_overlap_distr(
        network, id_sig)
    end_num_sig_sim = time.time()
    print("Total num-sig-sim: {0}".format(
        str(end_num_sig_sim - start_num_sig_sim)))
# Example 2
def test(path_to_serialized_model):
    # Deserialize model
    network = fieldnetwork.deserialize_network(path_to_serialized_model)
    # Create client
    store_client = StoreHandler()

    # Load glove model
    print("Loading language model...")
    path_to_glove_model = "../glove/glove.6B.100d.txt"
    glove_api.load_model(path_to_glove_model)
    print("Loading language model...OK")

    # Retrieve indexes
    schema_sim_index = io.deserialize_object(path_to_serialized_model + 'schema_sim_index.pkl')
    content_sim_index = io.deserialize_object(path_to_serialized_model + 'content_sim_index.pkl')

    # Create ontomatch api
    om = SSAPI(network, store_client, schema_sim_index, content_sim_index)
    # Load parsed ontology
    om.add_krs([("efo", "cache_onto/efo.pkl")], parsed=True)
    om.add_krs([("clo", "cache_onto/clo.pkl")], parsed=True)
    om.add_krs([("bao", "cache_onto/bao.pkl")], parsed=True)
    #om.add_krs([("go", "cache_onto/go.pkl")], parsed=True)  # parse again

    print("Finding matchings...")
    st = time.time()
    matchings = om.find_matchings()
    et = time.time()
    print("Finding matchings...OK")
    print("Took: " + str(et-st))

    for k, v in matchings:
        print(v)

    return om
# Example 3
def main(path_to_serialized_model):
    print('Loading: ' + str(path_to_serialized_model))
    network = fieldnetwork.deserialize_network(path_to_serialized_model)
    store_client = StoreHandler()
    api = API(network, store_client)
    ip_shell = InteractiveShellEmbed(banner1=init_banner, exit_msg=exit_banner)
    ip_shell()
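
# A minimal driver sketch for the interactive shell entry point above. The
# CLI wiring is an assumption (the original script's argument handling is not
# shown here); it only illustrates passing the model path through to main().
if __name__ == "__main__":
    import sys
    if len(sys.argv) != 2:
        print("Usage: python <script.py> <path_to_serialized_model>")
        sys.exit(1)
    main(sys.argv[1])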
# Example 4
def test_find_links(path_to_serialized_model, matchings):
    # Deserialize model
    network = fieldnetwork.deserialize_network(path_to_serialized_model)
    # Create client
    store_client = StoreHandler()

    # Load glove model
    print("Loading language model...")
    path_to_glove_model = "../glove/glove.6B.100d.txt"
    glove_api.load_model(path_to_glove_model)
    print("Loading language model...OK")

    # Retrieve indexes
    schema_sim_index = io.deserialize_object(path_to_serialized_model + 'schema_sim_index.pkl')
    content_sim_index = io.deserialize_object(path_to_serialized_model + 'content_sim_index.pkl')

    om = SSAPI(network, store_client, schema_sim_index, content_sim_index)

    om.add_krs([("efo", "cache_onto/efo.pkl")], parsed=True)
    om.add_krs([("clo", "cache_onto/clo.pkl")], parsed=True)
    om.add_krs([("bao", "cache_onto/bao.pkl")], parsed=True)

    links = om.find_links(matchings)
    for link in links:
        print(link)
# Example 5
def generate_matchings(input_model_path, input_ontology_name_path, output_file):
    # Deserialize model
    network = fieldnetwork.deserialize_network(input_model_path)
    # Create client
    store_client = StoreHandler()

    # Load glove model
    print("Loading language model...")
    path_to_glove_model = "../glove/glove.6B.100d.txt"
    glove_api.load_model(path_to_glove_model)
    print("Loading language model...OK")

    # Retrieve indexes
    schema_sim_index = io.deserialize_object(input_model_path + 'schema_sim_index.pkl')
    content_sim_index = io.deserialize_object(input_model_path + 'content_sim_index.pkl')

    # Create ontomatch api
    om = SSAPI(network, store_client, schema_sim_index, content_sim_index)
    for onto_name, onto_parsed_path in input_ontology_name_path:
        # Load parsed ontology
        om.add_krs([(onto_name, onto_parsed_path)], parsed=True)

    matchings = om.find_matchings()

    with open(output_file, 'w') as f:
        for m in matchings:
            f.write(str(m) + '\n')

    print("Done!")
# Example 6
def main(path_to_serialized_model):
    # Deserialize model
    network = fieldnetwork.deserialize_network(path_to_serialized_model)
    # Create client
    store_client = StoreHandler()

    # Load glove model
    print("Loading language model...")
    path_to_glove_model = "../glove/glove.6B.100d.txt"
    glove_api.load_model(path_to_glove_model)
    print("Loading language model...OK")

    # Retrieve indexes
    schema_sim_index = io.deserialize_object(path_to_serialized_model + 'schema_sim_index.pkl')
    content_sim_index = io.deserialize_object(path_to_serialized_model + 'content_sim_index.pkl')

    om = SSAPI(network, store_client, schema_sim_index, content_sim_index)

    om.add_krs([("dbpedia", "cache_onto/dbpedia.pkl")], parsed=True)

    matchings = om.find_matchings()

    print("Found: " + str(len(matchings)))
    for m in matchings:
        print(m)

    return om
# Example 7
def plot_num():
    network = FieldNetwork()
    store = StoreHandler()
    fields, num_signatures = store.get_all_fields_num_signatures()

    xaxis = []
    yaxis = []
    numpoints = 0
    for x, y in num_signatures:
        numpoints = numpoints + 1
        xaxis.append(x)
        yaxis.append(y)
    print("Num points: " + str(numpoints))
    import matplotlib.pyplot as plt
    plt.plot(xaxis, yaxis, 'ro')
    plt.axis([0, 600000, 0, 600000])
    #plt.axis([0, 10000, 0, 10000])
    #plt.axis([0, 500, 0, 500])
    plt.show()
# Example 8
def init_system(path_to_serialized_model, create_reporting=False):
    print_md('Loading: *' + str(path_to_serialized_model) + "*")
    sl = time.time()
    network = fieldnetwork.deserialize_network(path_to_serialized_model)
    store_client = StoreHandler()
    api = API(network=network, store_client=store_client)
    reporting = None
    if create_reporting:
        reporting = Report(network)
    api.helper.help()
    el = time.time()
    print("Took " + str(el - sl) + " to load model")
    return api, reporting
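
# Example call (path illustrative). With the guard above, the function
# returns (api, None) unless create_reporting=True is passed:
#
# api, reporting = init_system("models/dwh/", create_reporting=True)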
# Example 9
def __init__(self):
    self.store_client = StoreHandler()
    self.network = None
    self.schema_sim_index = None
    self.content_sim_index = None
    self.ontomatch_api = None
    self.matchings = None
    # Per-matcher results, named after the matcher levels used in the other
    # examples (L4/L42: relation names vs. class names, syntactic/semantic;
    # L5/L52: attribute names vs. class names, syntactic/semantic):
    self.l4_matchings = None
    self.l5_matchings = None
    self.l52_matchings = None
    self.l42_matchings = None
    self.l1_matchings = None
    self.l7_matchings = None
    self.l42_summarized = None
    self.l52_summarized = None
# Example 10
def test_fuzzy(path_to_serialized_model):
    # Deserialize model
    network = fieldnetwork.deserialize_network(path_to_serialized_model)
    # Create client
    store_client = StoreHandler()

    # Retrieve indexes
    schema_sim_index = io.deserialize_object(path_to_serialized_model + 'schema_sim_index.pkl')
    content_sim_index = io.deserialize_object(path_to_serialized_model + 'content_sim_index.pkl')

    # Create ontomatch api
    om = SSAPI(network, store_client, schema_sim_index, content_sim_index)
    # Load parsed ontology
    om.add_krs([("efo", "cache_onto/efo.pkl")], parsed=True)

    matchings = matcherlib.find_hierarchy_content_fuzzy(om.kr_handlers, store_client)

    for m in matchings:
        print(m)
# Example 11
class TestReporting(unittest.TestCase):
    # create store handler
    store_client = StoreHandler()
    # read graph
    path = '../test/test4/'
    network = deserialize_network(path)
    api = API(network)
    api.init_store()

    def test_compute_statistics(self):
        r = Report(self.network)
        ncols = r.num_columns
        ntables = r.num_tables
        ncontent = r.num_content_sim_relations
        nschema = r.num_schema_sim_relations
        npkfk = r.num_pkfk_relations
        print("Num cols: " + str(ncols))
        print("Num tables: " + str(ntables))
        print("Num content sim relations: " + str(ncontent))
        print("Num schema sim relations: " + str(nschema))
        print("Num PKFK relations: " + str(npkfk))
# Example 12
def main(args):
    model_path = args.model_path
    separator = args.separator

    store_client = StoreHandler()
    network = fieldnetwork.deserialize_network(model_path)
    dod = DoD(network=network,
              store_client=store_client,
              csv_separator=separator)

    attrs = args.list_attributes.split(";")
    values = args.list_values.split(";")
    print(attrs)
    print(values)
    assert len(attrs) == len(values)

    i = 0
    for mjp, attrs_project, metadata in dod.virtual_schema_iterative_search(
            attrs, values, debug_enumerate_all_jps=False):
        print("JP: " + str(i))
        proj_view = dpu.project(mjp, attrs_project)
        print(str(proj_view.head(10)))
        print("Metadata")
        print(metadata)
        if args.output_path:
            if args.full_view:
                mjp.to_csv(args.output_path + "/raw_view_" + str(i),
                           encoding='latin1',
                           index=False)
            proj_view.to_csv(args.output_path + "/view_" + str(i),
                             encoding='latin1',
                             index=False)  # always store this
        i += 1
        if args.interactive == "True":
            print("")
            input("Press any key to continue...")
# Example 13
def test_4_n_42(path_to_serialized_model):
    # Deserialize model
    network = fieldnetwork.deserialize_network(path_to_serialized_model)
    # Create client
    store_client = StoreHandler()

    # Load glove model
    print("Loading language model...")
    path_to_glove_model = "../glove/glove.6B.100d.txt"
    glove_api.load_model(path_to_glove_model)
    print("Loading language model...OK")

    # Retrieve indexes
    schema_sim_index = io.deserialize_object(path_to_serialized_model + 'schema_sim_index.pkl')
    content_sim_index = io.deserialize_object(path_to_serialized_model + 'content_sim_index.pkl')

    # Create ontomatch api
    om = SSAPI(network, store_client, schema_sim_index, content_sim_index)
    # Load parsed ontology
    #om.add_krs([("efo", "cache_onto/efo.pkl")], parsed=True)
    #om.add_krs([("clo", "cache_onto/clo.pkl")], parsed=True)
    #om.add_krs([("bao", "cache_onto/bao.pkl")], parsed=True)
    om.add_krs([("dbpedia", "cache_onto/dbpedia.pkl")], parsed=True)  # parse again

    # L6: [Relations] -> [Class names] (semantic groups)

    print("Finding L6 matchings...")
    st = time.time()
    l6_matchings, sem_coh_groups = matcherlib.find_sem_coh_matchings(om.network, om.kr_handlers)
    print("Finding L6 matchings...OK, " + str(len(l6_matchings)) + " found")
    et = time.time()
    print("Took: " + str(et - st))

    for m in l6_matchings:
        print(m)

    for k, v in sem_coh_groups.items():
        print(str(k) + " -> " + str(v))

    exit()  # NOTE: early exit; the L4/L42 code below never runs as written

    print("Finding matchings...")
    st = time.time()
    # L4: [Relation names] -> [Class names] (syntax)
    print("Finding L4 matchings...")
    st = time.time()
    l4_matchings = matcherlib.find_relation_class_name_matchings(om.network, om.kr_handlers)
    print("Finding L4 matchings...OK, " + str(len(l4_matchings)) + " found")
    et = time.time()
    print("Took: " + str(et - st))

    print("computing fanout")
    fanout = defaultdict(int)
    for m in l4_matchings:
        sch, cla = m
        fanout[sch] += 1
    ordered = sorted(fanout.items(), key=operator.itemgetter(1), reverse=True)
    for o in ordered:
        print(o)

    # for match in l4_matchings:
    #    print(match)

    # L4.2: [Relation names] -> [Class names] (semantic)
    print("Finding L42 matchings...")
    st = time.time()
    l42_matchings = matcherlib.find_relation_class_name_sem_matchings(om.network, om.kr_handlers)
    print("Finding L42 matchings...OK, " + str(len(l42_matchings)) + " found")
    et = time.time()
    print("Took: " + str(et - st))
    et = time.time()
    print("Finding matchings...OK")
    print("Took: " + str(et - st))

    print("are l4 subsumed by l42?")
    not_in_l42 = 0
    not_subsumed = []
    for m in l4_matchings:
        if m not in l42_matchings:
            not_in_l42 += 1
            not_subsumed.append(m)
    print("NOT-subsumed: " + str(not_in_l42))

    """
    # L5: [Attribute names] -> [Class names] (syntax)
    print("Finding L5 matchings...")
    st = time.time()
    l5_matchings = matcherlib.find_relation_class_attr_name_matching(om.network, om.kr_handlers)
    print("Finding L5 matchings...OK, " + str(len(l5_matchings)) + " found")
    et = time.time()
    print("Took: " + str(et - st))

    # for match in l5_matchings:
    #    print(match)

    # l52_matchings = []

    # L52: [Attribute names] -> [Class names] (semantic)
    print("Finding L52 matchings...")
    st = time.time()
    l52_matchings = matcherlib.find_relation_class_attr_name_sem_matchings(om.network, om.kr_handlers)
    print("Finding L52 matchings...OK, " + str(len(l52_matchings)) + " found")
    et = time.time()
    print("Took: " + str(et - st))

    """

    with open('OUTPUT_442_only', 'w') as f:
        f.write("L4" + '\n')
        for m in l4_matchings:
            f.write(str(m) + '\n')
        f.write("L42" + '\n')
        for m in l42_matchings:
            f.write(str(m) + '\n')
        f.write("L5" + '\n')
# Example 14
def main(output_path=None):
    start_all = time.time()
    network = FieldNetwork()
    store = StoreHandler()

    # Get all fields from store
    fields_gen = store.get_all_fields()

    # Network skeleton and hierarchical relations (table - field), etc
    start_schema = time.time()
    network.init_meta_schema(fields_gen)
    end_schema = time.time()
    print("Total skeleton: {0}".format(str(end_schema - start_schema)))
    print("!!1 " + str(end_schema - start_schema))

    # Schema_sim relation
    start_schema_sim = time.time()
    schema_sim_index = networkbuilder.build_schema_sim_relation(network)
    end_schema_sim = time.time()
    print("Total schema-sim: {0}".format(str(end_schema_sim -
                                             start_schema_sim)))
    print("!!2 " + str(end_schema_sim - start_schema_sim))

    # Entity_sim relation (build currently disabled; timing block kept for parity)
    start_entity_sim = time.time()
    #fields, entities = store.get_all_fields_entities()
    #networkbuilder.build_entity_sim_relation(network, fields, entities)
    end_entity_sim = time.time()
    print("Total entity-sim: {0}".format(str(end_entity_sim -
                                             start_entity_sim)))
    """
    # Content_sim text relation (random-projection based)
    start_text_sig_sim = time.time()
    st = time.time()
    text_signatures = store.get_all_fields_text_signatures(network)
    et = time.time()
    print("Time to extract signatures from store: {0}".format(str(et - st)))
    print("!!3 " + str(et - st))

    networkbuilder.build_content_sim_relation_text_lsa(network, text_signatures)
    end_text_sig_sim = time.time()
    print("Total text-sig-sim: {0}".format(str(end_text_sig_sim - start_text_sig_sim)))
    print("!!4 " + str(end_text_sig_sim - start_text_sig_sim))
    """

    # Content_sim text relation (minhash-based)
    start_text_sig_sim = time.time()
    st = time.time()
    mh_signatures = store.get_all_mh_text_signatures()
    et = time.time()
    print("Time to extract minhash signatures from store: {0}".format(
        str(et - st)))
    print("!!3 " + str(et - st))

    content_sim_index = networkbuilder.build_content_sim_mh_text(
        network, mh_signatures)
    end_text_sig_sim = time.time()
    print("Total text-sig-sim (minhash): {0}".format(
        str(end_text_sig_sim - start_text_sig_sim)))
    print("!!4 " + str(end_text_sig_sim - start_text_sig_sim))

    # Content_sim num relation
    start_num_sig_sim = time.time()
    id_sig = store.get_all_fields_num_signatures()
    #networkbuilder.build_content_sim_relation_num(network, id_sig)
    networkbuilder.build_content_sim_relation_num_overlap_distr(
        network, id_sig)
    #networkbuilder.build_content_sim_relation_num_overlap_distr_indexed(network, id_sig)
    end_num_sig_sim = time.time()
    print("Total num-sig-sim: {0}".format(
        str(end_num_sig_sim - start_num_sig_sim)))
    print("!!5 " + str(end_num_sig_sim - start_num_sig_sim))

    # Primary Key / Foreign key relation
    start_pkfk = time.time()
    networkbuilder.build_pkfk_relation(network)
    end_pkfk = time.time()
    print("Total PKFK: {0}".format(str(end_pkfk - start_pkfk)))
    print("!!6 " + str(end_pkfk - start_pkfk))

    end_all = time.time()
    print("Total time: {0}".format(str(end_all - start_all)))
    print("!!7 " + str(end_all - start_all))

    path = "test/datagov/"
    if output_path is not None:
        path = output_path
    fieldnetwork.serialize_network(network, path)

    # Serialize indexes
    path_schsim = path + "/schema_sim_index.pkl"
    io.serialize_object(schema_sim_index, path_schsim)
    path_cntsim = path + "/content_sim_index.pkl"
    io.serialize_object(content_sim_index, path_cntsim)

    print("DONE!")
# Example 15
def init_store(self):
    # create store handler
    global store_client
    store_client = StoreHandler()
# Example 16
class TestDDApiPathQueries(unittest.TestCase):

    # create store handler
    store_client = StoreHandler()
    # read graph
    path = 'models/chemical/'
    network = deserialize_network(path)
    api = API(network)
    api.init_store()
    """
    TC primitive API
    """
    def test_paths_between_field_mode(self):
        print(self._testMethodName)

        field1 = ('chembl_21', 'drug_indication', 'record_id')
        field2 = ('chembl_21', 'compound_records', 'record_id')

        drs1 = self.api.drs_from_raw_field(field1)
        drs2 = self.api.drs_from_raw_field(field2)

        res = self.api.paths_between(drs1, drs2, Relation.PKFK)

        data = [x for x in res]
        print("Total results: " + str(len(data)))
        for el in data:
            print(str(el))

    def test_paths_between_table_mode(self):
        print(self._testMethodName)

        field1 = ('chembl_21', 'drug_indication', 'record_id')
        field2 = ('chembl_21', 'compound_records', 'record_id')

        drs1 = self.api.drs_from_raw_field(field1)
        drs2 = self.api.drs_from_raw_field(field2)

        drs1.set_table_mode()
        drs2.set_table_mode()

        res = self.api.paths_between(drs1, drs2, Relation.PKFK)

        data = [x for x in res]
        print("Total results: " + str(len(data)))
        for el in data:
            print(str(el))

        print("Paths: ")
        res.visualize_provenance()
        res.debug_print()
        paths = res.paths()
        for p in paths:
            print(str(p))

    def test_paths_between_from_tables(self):
        print(self._testMethodName)

        table1_name = "drug_indication"
        table2_name = "compound_records"
        table1 = self.api.drs_from_table(table1_name)
        table2 = self.api.drs_from_table(table2_name)
        table1.set_table_mode()
        table2.set_table_mode()
        res = self.api.paths_between(table1, table2, Relation.PKFK)

        data = [x for x in res]
        print("Total results: " + str(len(data)))
        for el in data:
            print(str(el))

        print("Paths: ")
        paths = res.paths()
        for p in paths:
            print(str(p))

    def test_paths(self):
        print(self._testMethodName)

        return

    def test_traverse(self):
        print(self._testMethodName)

        field1 = ('chembl_21', 'drug_indication', 'record_id')
        drs_field = self.api.drs_from_raw_field(field1)
        res = self.api.traverse(drs_field, Relation.SCHEMA_SIM, 1)

        data = [x for x in res]
        print("Total results: " + str(len(data)))
        for el in data:
            print(str(el))

        return
# Example 17
class TestProvenance(unittest.TestCase):
    # create store handler
    store_client = StoreHandler()
    # read graph
    path = '../test/test4/'
    network = deserialize_network(path)
    api = API(network)
    api.init_store()

    def test_keyword_provenance(self):
        print(self._testMethodName)

        res = self.api.keyword_search("Madden", max_results=10)

        print(res.get_provenance().prov_graph().nodes())
        print(res.get_provenance().prov_graph().edges())

        el_interest = [x for x in res][0]

        info = res.why(el_interest)
        print("WHY " + str(el_interest) + "? " + str(info))

        explanation = res.how(el_interest)
        print("HOW " + str(el_interest) + "? " + str(explanation))

        self.assertTrue(True)

    def test_content_sim_provenance(self):
        print(self._testMethodName)

        table = 'Buildings.csv'
        res = self.api.similar_content_to_table(table)

        print(res.get_provenance().prov_graph().nodes())
        print(res.get_provenance().prov_graph().edges())

        el_interest = [x for x in res][0]

        info = res.why(el_interest)
        print("WHY " + str(el_interest) + "? " + str(info))

        explanation = res.how(el_interest)
        print("HOW " + str(el_interest) + "? " + str(explanation))

        self.assertTrue(True)

    def test_intersection_provenance(self):
        print(self._testMethodName)

        res1 = self.api.keyword_search("Madden", max_results=10)
        res2 = self.api.keyword_search("Stonebraker", max_results=10)

        res = res1.intersection(res2)

        print(res.get_provenance().prov_graph().nodes())
        print(res.get_provenance().prov_graph().edges())

        el_interest = [x for x in res][0]

        info = res.why(el_interest)
        print("WHY " + str(el_interest) + "? " + str(info))

        explanation = res.how(el_interest)
        print("HOW " + str(el_interest) + "? " + str(explanation))

        self.assertTrue(True)

    def test_tc_table_mode_provenance(self):
        print(self._testMethodName)

        field1 = ('dwhsmall', 'All_olap2_uentity_desc_uses.csv',
                  'Entity Owner')
        field2 = ('dwhsmall', 'All_olap_entity_desc_uses.csv', 'Entity Owner')

        drs1 = self.api.drs_from_raw_field(field1)
        drs2 = self.api.drs_from_raw_field(field2)

        drs1.set_table_mode()
        drs2.set_table_mode()

        res = self.api.paths_between(drs1, drs2, Relation.PKFK)

        print(res.get_provenance().prov_graph().nodes())
        print(res.get_provenance().prov_graph().edges())

        el_interest = [x for x in res][0]

        info = res.why(el_interest)
        print("WHY " + str(el_interest) + "? " + str(info))

        explanation = res.how(el_interest)
        print("HOW " + str(el_interest) + "? " + str(explanation))

        self.assertTrue(True)
# Example 18
class TestDDApi(unittest.TestCase):

    # create store handler
    store_client = StoreHandler()
    # read graph
    path = 'models/dwh/'
    network = deserialize_network(path)
    api = API(network)
    api.init_store()
    """
    Seed API
    """
    def test_drs_from_raw_field(self):
        print(self._testMethodName)

        field = ('mitdwh', 'Iap_subject_person.csv', 'Person Mit Affiliation')
        res = self.api.drs_from_raw_field(field)

        for el in res:
            print(str(el))

    def test_drs_from_hit(self):
        print(self._testMethodName)

        field = ('mitdwh', 'Iap_subject_person.csv', 'Person Mit Affiliation')
        res = self.api.drs_from_raw_field(field)

        els = [x for x in res]
        el = els[0]

        res = self.api.drs_from_hit(el)

        for el in res:
            print(str(el))

    def test_drs_from_table(self):
        print(self._testMethodName)

        table = 'Iap_subject_person.csv'
        res = self.api.drs_from_table(table)

        for el in res:
            print(el)

    def test_drs_from_table_hit(self):
        print(self._testMethodName)

        field = ('mitdwh', 'Iap_subject_person.csv', 'Person Mit Affiliation')
        res = self.api.drs_from_raw_field(field)

        els = [x for x in res]
        el = els[0]

        res = self.api.drs_from_table_hit(el)

        for el in res:
            print(str(el))

    """
    Primitive API
    """

    def test_keyword_search(self):
        print(self._testMethodName)

        res = self.api.keyword_search("Madden", max_results=10)

        for el in res:
            print(str(el))

    def test_keywords_search(self):
        print(self._testMethodName)

        res = self.api.keywords_search(["Madden", "Stonebraker", "Liskov"])

        for el in res:
            print(str(el))

    def test_schema_name_search(self):
        print(self._testMethodName)

        res = self.api.schema_name_search("Name", max_results=10)

        for el in res:
            print(str(el))

    def test_schema_names_search(self):
        print(self._testMethodName)

        res = self.api.schema_names_search(["Name", "Last Name", "Employee"])

        for el in res:
            print(str(el))

    def test_entity_search(self):
        print(self._testMethodName)

        print("Future Work...")
        return

    def test_schema_neighbors(self):
        print(self._testMethodName)

        field = ('mitdwh', 'Iap_subject_person.csv', 'Person Mit Affiliation')
        res = self.api.schema_neighbors(field)

        for el in res:
            print(str(el))

    def test_schema_neighbors_of(self):
        print(self._testMethodName)

        field = ('mitdwh', 'Iap_subject_person.csv', 'Person Mit Affiliation')
        res = self.api.schema_neighbors(field)

        res = self.api.schema_neighbors_of(res)

        for el in res:
            print(str(el))

    def test_similar_schema_name_to_field(self):
        print(self._testMethodName)

        field = ('mitdwh', 'Buildings.csv', 'Building Name')
        res = self.api.similar_schema_name_to_field(field)

        print("RES size: " + str(res.size()))
        for el in res:
            print(str(el))

    def test_ids_functions(self):
        print(self._testMethodName)

        field = ('mitdwh', 'Buildings.csv', 'Building Key')
        drs1 = self.api.drs_from_raw_field(field)

        field = ('mitdwh', 'Building Key', 'Buildings.csv')  # note: table and field positions swapped relative to drs1
        drs2 = self.api.drs_from_raw_field(field)

        for el in drs1:
            print(str(el))
        for el in drs2:
            print(str(el))

    def test_similar_schema_name_to_table(self):
        print(self._testMethodName)

        table = 'Buildings.csv'
        res = self.api.similar_schema_name_to_table(table)

        print("RES size: " + str(res.size()))
        for el in res:
            print(str(el))

    def test_similar_schema_name_to(self):
        print(self._testMethodName)

        field = ('mitdwh', 'Buildings.csv', 'Building Key')
        res = self.api.similar_schema_name_to_field(field)

        res = self.api.similar_schema_name_to(res)

        print("RES size: " + str(res.size()))
        for el in res:
            print(str(el))

    def test_similar_content_to_field(self):
        print(self._testMethodName)

        field = ('mitdwh', 'Buildings.csv', 'Building Name')
        res = self.api.similar_content_to_field(field)

        print("RES size: " + str(res.size()))
        for el in res:
            print(str(el))

    def test_similar_content_to_table(self):
        print(self._testMethodName)

        table = 'Buildings.csv'
        res = self.api.similar_content_to_table(table)

        print("RES size: " + str(res.size()))
        for el in res:
            print(str(el))

    def test_similar_content_to(self):
        print(self._testMethodName)

        field = ('mitdwh', 'Buildings.csv', 'Building Name')
        res = self.api.similar_content_to_field(field)

        res = self.api.similar_content_to(res)

        print("RES size: " + str(res.size()))
        for el in res:
            print(str(el))

    def test_pkfk_field(self):
        print(self._testMethodName)

        field = ('mitdwh', 'Buildings.csv', 'Building Name')
        res = self.api.pkfk_field(field)

        print("RES size: " + str(res.size()))
        for el in res:
            print(str(el))

    def test_pkfk_table(self):
        print(self._testMethodName)

        table = 'Buildings.csv'
        res = self.api.pkfk_table(table)

        print("RES size: " + str(res.size()))
        for el in res:
            print(str(el))

    def test_pkfk_of(self):
        print(self._testMethodName)

        field = ('mitdwh', 'Buildings.csv', 'Building Name')
        res = self.api.pkfk_field(field)

        res = self.api.pkfk_of(res)

        print("RES size: " + str(res.size()))
        for el in res:
            print(str(el))

    """
    Combiner API
    """

    def test_intersection(self):
        print(self._testMethodName)

        res1 = self.api.keyword_search("Madden", max_results=10)
        res2 = self.api.keyword_search("Stonebraker", max_results=10)

        res = res1.intersection(res2)

        for el in res:
            print(str(el))

    def test_union(self):
        print(self._testMethodName)

        res1 = self.api.keyword_search("Madden", max_results=10)
        res2 = self.api.schema_name_search("Stonebraker", max_results=10)

        res = res1.union(res2)

        for el in res:
            print(str(el))

    def test_difference(self):
        print(self._testMethodName)

        res1 = self.api.keyword_search("Madden", max_results=10)
        res2 = self.api.keyword_search("Stonebraker", max_results=10)

        res = res1.set_difference(res2)

        for el in res:
            print(str(el))

    """
    Other, bugs, etc
    """

    def test_iter_edges_with_data_bug(self):
        table = "Fac_building.csv"  # The table of interest
        # We get the representation of that table in DRS
        table_drs = self.api.drs_from_table(table)
        # similar tables are those with similar content
        content_similar = self.api.similar_content_to(table_drs)
        schema_similar = self.api.similar_schema_name_to(
            table_drs)  # similar attribute names
        # some pkfk relationship involved too
        pkfk_similar = self.api.pkfk_of(table_drs)
        # similar tables are similar in content and schema
        inters1 = self.api.intersection(content_similar, schema_similar)
        similar_tables = self.api.intersection(inters1, pkfk_similar)
        similar_tables.print_tables()
# Example 19
# Ignore in-table results of neighbor searches
# Exclude certain tables
# keyword_search and neighbor_search, but on multiple contexts

import networkx as nx
from api.apiutils import Relation

from modelstore.elasticstore import StoreHandler, KWType
from knowledgerepr import fieldnetwork
from algebra import API

path_to_serialized_model = "/Users/arcarter/code/datadiscovery/test/testmodel/"
network = fieldnetwork.deserialize_network(path_to_serialized_model)
store_client = StoreHandler()

api = API(network, store_client)

# short variables for Scope
# These are used in keyword searches
# To specify what parts of a file will be searched
source = KWType.KW_TABLE  # table/file/source name
field = KWType.KW_SCHEMA  # column names/fields
content = KWType.KW_TEXT  # content of the columns

# Short variables for Relation
# These represent edge types in the graph
# and are used for neighbor searches
# schema = Relation.SCHEMA  # similar schemas
schema_sim = Relation.SCHEMA_SIM  # Similar Schema Names
# similar content values. i.e. matching substrings and numbers
content_sim = Relation.CONTENT_SIM
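
# With the handles defined above, searches can be issued as in the test
# examples elsewhere in this file; only the keyword_search signature seen
# there is assumed here:
res = api.keyword_search("Madden", max_results=10)
for hit in res:
    print(str(hit))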
# Example 20
class TestRanking(unittest.TestCase):
    # create store handler
    store_client = StoreHandler()

    # create synthetic graph
    network = GENSYN(5, 5, 20, 50, 10)

    api = API(network)
    api.init_store()

    def test_compute_ranking_scores_certainty(self):

        nodes = self.network.fields_degree(3)

        #self.network._visualize_graph()

        nids = [x for x, y in nodes]

        info = self.network.get_info_for(nids)
        hits = self.network.get_hits_from_info(info)

        drs_info = self.api.drs_from_hits(hits)

        #drs_info.visualize_provenance()

        res = self.api.similar_schema_name_to(drs_info)

        #res.visualize_provenance(labels=True)

        res = res.rank_coverage()

        res.pretty_print_columns_with_scores()

        self.assertTrue(True)

    def test_ranking_certainty_chem(self):
        path = '../models/chemical/'
        network = deserialize_network(path)
        api = API(network)
        api.init_store()

        table = 'activities'
        table_drs = api.drs_from_table(table)

        sim_tables = api.similar_content_to(table_drs)

        sim_tables.rank_certainty()

        print("All columns CERTAINTY: ")
        sim_tables.pretty_print_columns_with_scores()
        print("")
        print("All tables CERTAINTY: ")
        sim_tables.print_tables_with_scores()
        print("")

        sim_tables.rank_coverage()

        print("All columns COVERAGE: ")
        sim_tables.pretty_print_columns_with_scores()
        print("")
        print("All tables COVERAGE: ")
        sim_tables.print_tables_with_scores()
        print("")

    """