def test_map_table_properties():
    """The first mapped property of the test table exposes the expected keys.

    Only the first element returned by ``map_table_properties`` is inspected;
    it must contain ``uri``, ``prefixed_name`` and ``score``.
    """
    test_table = GenericTable(TEST_FILENAME)
    test_table.init()
    first_property = map_table_properties(test_table)[0]
    for expected_key in ("uri", "prefixed_name", "score"):
        assert expected_key in first_property.keys()
def test_map_atomic_table_property():
    """Connectivity mapping of the test table yields exactly one rdf:type entry.

    The result of ``map_table_properties_connectivity`` must equal a dict
    with the single key ``'0_1'`` mapped to the rdf:type URI.
    """
    expected = {
        '0_1': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'
    }
    test_table = GenericTable(TEST_FILENAME)
    test_table.init()
    assert map_table_properties_connectivity(test_table) == expected
def test_benchmark_agdistis():
    """Print AGDISTIS disambiguation results next to the gold standard.

    For every regular file found in ``ENTITIES_DIR`` a ``GenericTable`` is
    built from the file of the same name under ``TABLES_DIR``, disambiguated
    with ``AgdistisWrapper.disambiguate_table`` and converted with
    ``map_agdistis_entities_to_gold_standard_format``. The gold-standard
    entities, the converted result and their diff are printed. Nothing is
    asserted; the output is meant to be inspected.

    A failure while processing one table is printed and the benchmark
    continues with the next table.
    """
    agdistis_wrapper = AgdistisWrapper()
    onlyfiles = [
        f for f in listdir(ENTITIES_DIR) if isfile(join(ENTITIES_DIR, f))
    ]
    total = len(onlyfiles)
    # Iterate the listing directly. The former ``while True`` loop advanced
    # its counter only after a successful run, so a failing table (or an
    # empty directory, via IndexError) was retried forever.
    for num, _id in enumerate(onlyfiles):
        try:
            print("process table %d out of %d" % (num, total), flush=True)
            print("table id %s" % (_id), flush=True)
            fixture_entities = get_gold_standard_entities(_id)
            _table = GenericTable(filename=join(TABLES_DIR, _id), _id=_id)
            _table.init()
            agdistis_entities = agdistis_wrapper.disambiguate_table(_table)
            to_compare = map_agdistis_entities_to_gold_standard_format(
                _table, agdistis_entities)
            print("", flush=True)
            print(fixture_entities, flush=True)
            print("", flush=True)
            print(to_compare, flush=True)
            print(diff_entities(fixture_entities, to_compare), flush=True)
        except Exception as e:
            # ``Exception`` instead of ``BaseException`` so KeyboardInterrupt
            # and SystemExit still stop the benchmark.
            print(str(e))
def test_benchmark_dbpedia_lookup_subject_columns_only():
    """Print subject-column-only disambiguation results next to the gold standard.

    For every regular file found in ``ENTITIES_DIR`` a ``GenericTable`` is
    built from the file of the same name under ``TABLES_DIR``. When
    ``SCIdentifier.identify_subject_column`` returns a non-empty result, its
    first element is stored as the table's ``subject_column``. The table is
    then passed to ``disambiguate_table_subject_column_only`` and the result
    is converted with ``map_agdistis_entities_to_gold_standard_format``. The
    gold-standard entities, the converted result and their diff are printed.
    Nothing is asserted; the output is meant to be inspected.

    A failure while processing one table is printed and the benchmark
    continues with the next table.
    """
    onlyfiles = [
        f for f in listdir(ENTITIES_DIR) if isfile(join(ENTITIES_DIR, f))
    ]
    scidentifier = SCIdentifier()
    total = len(onlyfiles)
    # Iterate the listing directly. The former ``while True`` loop advanced
    # its counter only after a successful run, so a failing table (or an
    # empty directory, via IndexError) was retried forever.
    for num, _id in enumerate(onlyfiles):
        try:
            print("process table %d out of %d" % (num, total), flush=True)
            print("table id %s" % (_id), flush=True)
            fixture_entities = get_gold_standard_entities(_id)
            _table = GenericTable(filename=join(TABLES_DIR, _id), _id=_id)
            _table.init()
            _subject_columns = scidentifier.identify_subject_column(_table)
            if _subject_columns:
                _table.subject_column = _subject_columns[0]
            dbpedia_lookup_entities = disambiguate_table_subject_column_only(
                _table)
            to_compare = map_agdistis_entities_to_gold_standard_format(
                _table, dbpedia_lookup_entities)
            print("", flush=True)
            print(fixture_entities, flush=True)
            print("", flush=True)
            print(to_compare, flush=True)
            print(diff_entities(fixture_entities, to_compare), flush=True)
        except Exception as e:
            # ``Exception`` instead of ``BaseException`` so KeyboardInterrupt
            # and SystemExit still stop the benchmark.
            print(str(e))
def test_table_case():
    """Run both row-disambiguation entry points on one row of a fixed table.

    The results are not inspected; the test passes when neither call raises.
    """
    from taipan.pathes import TABLES_DIR

    table_id = "34041816_1_4749054164534706977.csv"
    table_path = join(TABLES_DIR, table_id)
    table = GenericTable(filename=table_path, _id=table_id)
    table.init()
    sample_row = table.table[1]
    # NOTE(review): ``agdistis_wrapper`` is not created in this function;
    # presumably a module-level AgdistisWrapper instance -- confirm.
    agdistis_wrapper.disambiguate_row(sample_row)
    agdistis_wrapper._disambiguate_row(sample_row)
def test_identify_subject_column_table_string():
    """Subject-column identification on an inline CSV returns a non-empty list."""
    csv_rows = (
        '"Region","Currency","Price","Price in ?"',
        '"Australia SA+WA","AUD","24.95","15.91"',
        '"Israel","ILS","79","15.03"',
        '"Australia","AUD","19.99","12.75"',
        '"Kuwait","KWD","4.50","11.08"',
        '"Canada","CAD","14.99","10.02"',
    )
    # Rows are joined with newlines only; there is no trailing newline.
    table = GenericTable("stub", csv_string="\n".join(csv_rows))
    table.init()
    detected = SCIDENTIFIER.identify_subject_column(table)
    # table can not be predicted, [0] is returned by default
    assert isinstance(detected, list)
    assert len(detected) > 0
def test_generate_rdf():
    """RDF generated for the test table parses as N3 into more than one node."""
    source_table = GenericTable(TEST_FILENAME)
    source_table.init()
    serialized = generate_rdf(source_table)
    graph = rdflib.Graph()
    graph.parse(data=serialized, format="n3")
    node_count = len(graph.all_nodes())
    assert node_count > 1
def get_additional_tables(self):
    """Build the tables listed in ``subject_columns.csv``.

    The listing under ``ADDITIONAL_DATA_DIR`` is read with ``self.load_csv``
    and is iterated as ``(table id, subject column)`` pairs. For each pair a
    ``GenericTable`` is created from ``ADDITIONAL_DATA_DIR/tables/<id>``,
    initialised, cut down to its first ``ROWS_TO_ANALYZE`` entries and given
    the listed subject column as an ``int``.

    Returns:
        list: the prepared tables, in listing order.
    """
    index_path = os.path.join(ADDITIONAL_DATA_DIR, "subject_columns.csv")
    loaded = []
    for table_id, column in self.load_csv(index_path):
        entry = GenericTable(
            filename=os.path.join(ADDITIONAL_DATA_DIR, "tables", table_id),
            _id=table_id,
        )
        entry.init()
        entry.table = entry.table[:int(ROWS_TO_ANALYZE)]
        entry.subject_column = int(column)
        loaded.append(entry)
    return loaded
def test_space_delimiter():
    """A space-delimited CSV string loads as an 18x4 table without a subject column."""
    parsed = GenericTable("stub", csv_string=tomtom_csv, delimiter=" ")
    parsed.init()
    expected_shape = (18, 4)
    assert parsed.table.shape == expected_shape
    assert parsed.subject_column is None
def test_from_string():
    """A table built from ``TABLE_STRING`` has 28 entries and no subject column."""
    parsed = GenericTable("stub", csv_string=TABLE_STRING)
    parsed.init()
    entry_count = len(parsed.table)
    assert entry_count == 28
    assert parsed.subject_column is None
def test_init():
    """A table loaded from ``TEST_FILENAME`` has 13 entries and no subject column."""
    loaded = GenericTable(TEST_FILENAME)
    loaded.init()
    entry_count = len(loaded.table)
    assert entry_count == 13
    assert loaded.subject_column is None
def test():
    """Subject-column identification on the ';'-delimited sample yields [4, 6]."""
    sample = GenericTable("stub", csv_string=TABLE_STRING, delimiter=";")
    sample.init()
    identifier = SCIdentifier()
    detected = identifier.identify_subject_column(sample)
    expected_columns = [4, 6]
    assert detected == expected_columns