from collections import defaultdict

# Helper functions such as get_entity_data, get_instance_types,
# parse_ttl_line, resolve_uri and get_data_path are assumed to be
# defined elsewhere in this module.


def get_document_bodies(keyword=None, force=False):
    """Build a mapping from each typed entity to its textual body.

    keyword restricts the body to a single source ('short', 'long' or
    'anchor'); if None, all three sources are combined.
    """
    fname = 'document_bodies{}.format'.format('_' + keyword if keyword else '')
    fname = 'document_bodies{}.json'.format('_' + keyword if keyword else '')
    # Reuse the cached file unless a rebuild is forced.
    if not force:
        document_bodies = load_dict_from_json(fname)
        if document_bodies:
            return document_bodies
    print('Creating new entity bodies.')
    entities_with_types = get_all_instance_types()
    # Load only the abstract/anchor sources that the keyword asks for.
    long_abstracts = get_entity_data(
        'long_abstracts_en.ttl') if not keyword or keyword == 'long' else {}
    short_abstracts = get_entity_data(
        'short_abstracts_en.ttl') if not keyword or keyword == 'short' else {}
    anchor_texts = get_entity_data(
        'anchor_text_en.ttl') if not keyword or keyword == 'anchor' else {}
    document_bodies = defaultdict(str)
    for entity in entities_with_types:
        # Take the short abstract if present, otherwise the long one,
        # then append any anchor text.
        if (not keyword or keyword == 'short') and entity in short_abstracts:
            document_bodies[entity] = short_abstracts[entity]
        elif (not keyword or keyword == 'long') and entity in long_abstracts:
            document_bodies[entity] = long_abstracts[entity]
        if (not keyword or keyword == 'anchor') and entity in anchor_texts:
            document_bodies[entity] += anchor_texts[entity]
    save_dict_to_json(document_bodies, fname)
    print(f'Created {len(document_bodies)} document bodies.')
    return document_bodies

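# The JSON cache helpers used by every builder in this module are not
# shown in this section. Below is a minimal sketch of the assumed
# interface, using plain JSON files on disk (hypothetical; the real
# implementations may differ, e.g. in where the files live):

import json
import os


def load_dict_from_json(fname):
    """Return the parsed JSON file, or None if it does not exist."""
    if not os.path.isfile(fname):
        return None
    with open(fname, 'r', encoding='UTF-8') as f:
        return json.load(f)


def save_dict_to_json(data, fname):
    """Serialize data to fname as JSON."""
    with open(fname, 'w', encoding='UTF-8') as f:
        json.dump(data, f)
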
def get_all_instance_types(transitive=False, force=False):
    """Map each entity to its DBpedia types, optionally expanded with
    all ancestor types from the ontology."""
    fname = f'instance_types{"_all" if transitive else ""}.json'
    if not force:
        instance_types = load_dict_from_json(fname)
        if instance_types:
            return instance_types
    print('Creating new instance types mapping...')
    instance_type_filenames = [
        'instance_types_en.ttl',
        'instance_types_sdtyped_dbo_en.ttl'
    ]
    if transitive:
        instance_type_filenames.append('instance_types_transitive_en.ttl')
    instance_types = defaultdict(list)
    for filename in instance_type_filenames:
        instance_types = get_instance_types(filename, instance_types)
    if transitive:
        ontology = get_ontology()
    for entity in instance_types:
        if transitive:
            # Replace the direct types with their full ancestor paths.
            types = []
            for t in instance_types[entity]:
                types.extend(ontology.get(t, {}).get('path', []))
            instance_types[entity] = list(set(types))
        else:
            # Deduplicate types collected from multiple source files.
            instance_types[entity] = list(set(instance_types[entity]))
    save_dict_to_json(instance_types, fname)
    return instance_types

def remove_non_existing_types(filename, type_hierarchy):
    """Drop types that are not part of the type hierarchy from every
    'resource' question in the dataset file."""
    questions = load_dict_from_json(filename)
    num_changed = 0
    for q in questions:
        if q['category'] == 'resource':
            before = len(q['type'])
            q['type'] = [t for t in q['type'] if t in type_hierarchy]
            if len(q['type']) != before:
                num_changed += 1
    print(f'Removed extra types from {num_changed} instances.')
    save_dict_to_json(questions, filename)

def load_baseline_results(self, dataset='train', force=False):
    """Return the top-100 baseline rankings for the dataset, running
    the retrieval model if no cached results exist."""
    fname = f'top100_{self.model}_{self.similarity}_{dataset}'
    if not force:
        results = load_dict_from_json(fname)
        if results:
            return results
    print('Retrieving from index.')
    queries = load_dict_from_json(f'{dataset}_set_fixed.json')
    if not queries:
        print('Cannot find the dataset.')
        return None
    # Dispatch to the retrieval method matching self.model
    # (e.g. a baseline_<model>_retrieval method on this class).
    res = getattr(self, f'baseline_{self.model}_retrieval')(queries)
    save_dict_to_json(res, fname)
    return res

def get_type_weights(force=False):
    """Count, for every type, how many entities carry it (ancestor
    types included)."""
    fname = 'type_weight.json'
    if not force:
        weight_doc = load_dict_from_json(fname)
        if weight_doc:
            return weight_doc
    print('Making new type_weight file.')
    entity_types = get_all_instance_types(True)
    weight_doc = defaultdict(int)
    for types in entity_types.values():
        for t in types:
            weight_doc[t] += 1
    save_dict_to_json(weight_doc, fname)
    return weight_doc

def get_type_entity(ancestors=False, force=False):
    """Invert the entity-to-types mapping into a type-to-entities
    mapping."""
    fname = f'type_entity{"_all" if ancestors else ""}.json'
    if not force:
        documents = load_dict_from_json(fname)
        if documents:
            return documents
    print('Creating new type_entity file.')
    entities_with_types = get_all_instance_types(ancestors)
    documents = defaultdict(list)
    for entity, types in entities_with_types.items():
        for t in types:
            documents[t].append(entity)
    print('Done. Num types:', len(documents))
    save_dict_to_json(documents, fname)
    return documents

def get_EC_documents(doc_body='short', force=False):
    """Build entity-centric (EC) documents: one document per entity,
    using the chosen body text."""
    filename = 'document_EC{}.json'.format('_' + doc_body if doc_body else '')
    if not force:
        document = load_dict_from_json(filename)
        if document:
            return document
    print('Creating new document.')
    bodies = get_document_bodies(doc_body)
    # types = get_all_instance_types(transitive=True)
    document = defaultdict(dict)
    for entity, body in bodies.items():
        document[entity]['body'] = body
        # document[entity]['type'] = ' '.join(types[entity])
    save_dict_to_json(document, filename)
    return document

def get_ontology(force=False):
    """Parse the DBpedia ontology dump into a dict that maps each type
    to its direct parents, its full ancestor path and the path length."""
    fname = 'ontology.json'
    if not force:
        ontology = load_dict_from_json(fname)
        if ontology:
            return ontology
    print('Creating new ontology file.')
    ontology = defaultdict(dict)
    with open(get_data_path('dbpedia_2016-10.nt', True), 'r',
              encoding='UTF-8') as f:
        for line in f:
            if line.startswith('#'):
                continue
            s, p, o = parse_ttl_line(line)
            # Keep only subClassOf triples within the DBpedia ontology.
            if p != 'http://www.w3.org/2000/01/rdf-schema#subClassOf':
                continue
            if not ('owl#Thing' in o
                    or o.startswith('http://dbpedia.org/ontology/')):
                continue
            subj = resolve_uri(s)
            if not subj:
                continue
            subj = 'dbo:' + subj
            obj = resolve_uri(o)
            obj = obj if obj == 'owl#Thing' else 'dbo:' + obj
            if subj in ontology:
                ontology[subj]['parents'].append(obj)
            else:
                ontology[subj]['parents'] = [obj]
    # Precompute each type's ancestor path from the parent links.
    for entity in ontology:
        path = list(set(get_path(ontology, entity)))
        ontology[entity]['path'] = path
        ontology[entity]['num_ancestors'] = len(path)
    print(f'There are {len(ontology)} types')
    save_dict_to_json(ontology, fname)
    return ontology

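# get_ontology() relies on a get_path helper defined elsewhere. Below is
# a minimal sketch of the ancestor walk it presumably performs over the
# 'parents' lists built above (hypothetical; whether the type itself and
# owl#Thing are included depends on the real implementation):


def get_path(ontology, node):
    """Collect node plus all ancestors reachable via 'parents' links,
    assuming the hierarchy is acyclic."""
    path = [node]
    for parent in ontology.get(node, {}).get('parents', []):
        if parent != 'owl#Thing':  # stop at the ontology root
            path.extend(get_path(ontology, parent))
    return path
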
def get_TC_documents(doc_body='anchor', ancestors=False, force=False):
    """Build type-centric (TC) documents: one document per type, made of
    the concatenated bodies of all entities of that type."""
    filename = 'document_TC{}{}.json'.format(
        '_' + doc_body if doc_body else '',
        '_all' if ancestors else '')
    if not force:
        document = load_dict_from_json(filename)
        if document:
            return document
    print('Creating new document.')
    bodies = get_document_bodies(doc_body)
    type_entities = get_type_entity(ancestors)
    num_types = len(type_entities)
    document = defaultdict(dict)
    for i, (t, entities) in enumerate(type_entities.items()):
        print(f'Processing type {i + 1}/{num_types} '
              f'with {len(entities)} entities')
        document[t]['body'] = ' '.join(
            bodies.get(entity, '') for entity in entities)
    save_dict_to_json(document, filename)
    return document

def remove_empty_queries(filename='train_set_fixed.json'):
    """Drop queries whose question text is empty from the dataset file."""
    queries = load_dict_from_json(filename)
    num_queries = len(queries)
    queries = [q for q in queries if q['question']]
    print(f'Removed {num_queries - len(queries)} empty queries '
          f'from the dataset.')
    save_dict_to_json(queries, filename)

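# Example usage, sketching the overall build pipeline (the dump files
# referenced above must be in place; exact layout depends on
# get_data_path):
#
#   ontology = get_ontology()                        # type hierarchy
#   types = get_all_instance_types(transitive=True)  # entity -> all types
#   ec_docs = get_EC_documents(doc_body='short')     # entity-centric docs
#   tc_docs = get_TC_documents(doc_body='anchor', ancestors=True)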