def test_benchmark_dbpedia_lookup_subject_columns_only():
    """Benchmark DBpedia-lookup disambiguation across all gold-standard tables.

    For every file in ENTITIES_DIR: load the matching table from TABLES_DIR,
    detect its subject column, disambiguate only that column via DBpedia
    lookup, and print a diff against the gold-standard entities.
    """
    onlyfiles = [f for f in listdir(ENTITIES_DIR)
                 if isfile(join(ENTITIES_DIR, f))]
    scidentifier = SCIdentifier()
    # enumerate() replaces the original while/num bookkeeping: there, the
    # `num += 1` lived inside the `try`, so any exception left `num`
    # unchanged and the same failing table was retried forever.
    for num, _id in enumerate(onlyfiles):
        try:
            print("process table %d out of %d" % (num, len(onlyfiles)), flush=True)
            print("table id %s" % (_id), flush=True)
            fixture_entities = get_gold_standard_entities(_id)
            _table = GenericTable(filename=join(TABLES_DIR, _id), _id=_id)
            _table.init()
            _subject_columns = scidentifier.identify_subject_column(_table)
            if _subject_columns:
                _table.subject_column = _subject_columns[0]
            dbpedia_lookup_entities = disambiguate_table_subject_column_only(_table)
            to_compare = map_agdistis_entities_to_gold_standard_format(
                _table, dbpedia_lookup_entities)
            print("", flush=True)
            print(fixture_entities, flush=True)
            print("", flush=True)
            print(to_compare, flush=True)
            print(diff_entities(fixture_entities, to_compare), flush=True)
        except Exception as e:
            # Narrowed from BaseException, which also swallowed
            # KeyboardInterrupt/SystemExit and made the run unstoppable.
            print(str(e))
def test_disambiguate_table_subject_column_only_case_1():
    """Disambiguate CASE_1_TABLE's subject column and check a result is produced."""
    table = GenericTable()
    table.table = CASE_1_TABLE
    table.subject_column = 1
    entities = disambiguate_table_subject_column_only(table)
    # Replaced the leftover `import ipdb; ipdb.set_trace()` debugger
    # breakpoint with a real assertion: an interactive trace hangs any
    # non-interactive test run and verifies nothing.
    assert entities is not None
def get_additional_tables(self):
    """Load the additional evaluation tables listed in subject_columns.csv.

    Reads (table id, subject column) pairs from
    ADDITIONAL_DATA_DIR/subject_columns.csv, loads each table file from
    ADDITIONAL_DATA_DIR/tables/, truncates it to ROWS_TO_ANALYZE rows,
    and tags it with its gold-standard subject column.

    Returns:
        list: initialized GenericTable instances, one per CSV row.
    """
    subject_column_list = os.path.join(ADDITIONAL_DATA_DIR, "subject_columns.csv")
    id_list = self.load_csv(subject_column_list)
    # Hoisted out of the loop: the conversion is loop-invariant.
    max_rows = int(ROWS_TO_ANALYZE)
    tables = []
    for _id, subject_column in id_list:
        table_filename = os.path.join(ADDITIONAL_DATA_DIR, "tables", _id)
        table = GenericTable(filename=table_filename, _id=_id)
        table.init()
        # Limit the number of analyzed rows to keep the run tractable.
        table.table = table.table[:max_rows]
        table.subject_column = int(subject_column)
        tables.append(table)
    return tables