import json
import logging
from typing import List

import pandas as pd

# Shared imports for the scripts below. JSONDumpReader, JSONDumpWriter and
# the other project helpers are assumed to come from the project's own
# modules; their import lines are omitted because the module paths are not
# shown in this excerpt.


def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    with open('paths_config.json') as f:
        config = json.load(f)

    wikidata_path = config['wikidata dump']
    class_ids_path = config['class ids']
    classes_path = config['class dump']
    characteristics_path = config['class characteristics']

    with open(class_ids_path) as f:
        class_ids = set(l.strip() for l in f)
    logging.log(level=logging.INFO, msg='loaded class ids')

    to_charac = to_characteristic(class_ids, JSONDumpReader(wikidata_path))
    logging.log(level=logging.INFO,
                msg='computed subclasses and instances of all classes')

    JSONDumpWriter(characteristics_path).write(
        map(lambda ch: ch.to_dict(),
            map(to_charac, JSONDumpReader(classes_path))))
    logging.log(level=logging.INFO,
                msg='wrote characteristics to {}'.format(characteristics_path))
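# to_characteristic is a project helper that is not shown here. A minimal
# sketch of what it plausibly does, judging from how its output is consumed
# later (each characteristic record carries 'id', 'instances' and
# 'subclasses'): one streaming pass over the full dump collects, for every
# known class id, the ids of its instances (via P31) and direct subclasses
# (via P279); the returned closure then looks these sets up per class.
# Characteristic, get_instance_of_ids and the exact field names are
# assumptions; get_subclass_of_ids is sketched further below.
from collections import defaultdict


class Characteristic:
    def __init__(self, cid, instances, subclasses):
        self.cid = cid
        self.instances = instances
        self.subclasses = subclasses

    def to_dict(self):
        return {'id': self.cid,
                'instances': sorted(self.instances),
                'subclasses': sorted(self.subclasses)}


def to_characteristic(class_ids, entities):
    instances = defaultdict(set)
    subclasses = defaultdict(set)
    for e in entities:
        for cid in get_instance_of_ids(e):
            if cid in class_ids:
                instances[cid].add(e['id'])
        for cid in get_subclass_of_ids(e):
            if cid in class_ids:
                subclasses[cid].add(e['id'])
    return lambda c: Characteristic(c['id'], instances[c['id']],
                                    subclasses[c['id']])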
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    with open('paths_config.json') as f:
        config = json.load(f)

    # Reads from Wikidata JSON dump.
    wikidata_path = config['wikidata dump']
    # Writes and then reads JSON dump containing only classes.
    class_dump_path = config['class dump']
    # Writes JSON dump containing only orphan classes.
    orphan_class_dump_path = config['orphan class dump']

    # get the set of ids which identify classes in the dump
    class_ids = get_class_ids(JSONDumpReader(wikidata_path))
    logging.log(level=logging.INFO,
                msg='found {} classes'.format(len(class_ids)))

    # write all classes into a new JSON dump
    JSONDumpWriter(class_dump_path).write(
        filter(lambda e: is_item(e) and e['id'] in class_ids,
               JSONDumpReader(wikidata_path)))
    logging.log(level=logging.INFO,
                msg='wrote classes to {}'.format(class_dump_path))

    # write all unlinked classes into a new JSON dump
    JSONDumpWriter(orphan_class_dump_path).write(
        filter(is_orphan_class, JSONDumpReader(class_dump_path)))
    logging.log(
        level=logging.INFO,
        msg='wrote orphan classes to {}'.format(orphan_class_dump_path))
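# get_class_ids, is_item and is_orphan_class are project helpers that are
# not shown in this excerpt. A rough sketch under the usual Wikidata
# reading: an item is anything with a Q-prefixed id, a class is anything
# that occurs as the target of an 'instance of' (P31) or 'subclass of'
# (P279) statement, and an orphan class is a class without any P279
# statement of its own. These definitions are assumptions, not verified
# against the project; get_instance_of_ids is a hypothetical analogue of
# the get_subclass_of_ids sketch further below.
def is_item(entity):
    return entity['id'].startswith('Q')


def get_class_ids(entities):
    class_ids = set()
    for e in entities:
        class_ids.update(get_instance_of_ids(e))  # e is an instance of these
        class_ids.update(get_subclass_of_ids(e))  # e is a subclass of these
    return class_ids


def is_orphan_class(entity):
    # no outgoing 'subclass of' link means the class is unlinked
    return not list(get_subclass_of_ids(entity))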
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    with open('paths_config.json') as f:
        config = json.load(f)

    dump_path = config['wikidata dump']
    class_charac_path = config['class characteristics']
    test_data_path = config['test data']
    output_path = config['triple sentences']

    with open(config['irrelevant properties']) as f:
        irrelevant_properties = set(l.strip() for l in f)
    with open(config['relevant class ids']) as f:
        relevant_class_ids = set(l.strip() for l in f)

    # collect every item reachable from a relevant class: its instances,
    # its subclasses and the class itself
    relevant_item_ids = set()
    for charac in filter(lambda c: c['id'] in relevant_class_ids,
                         JSONDumpReader(class_charac_path)):
        relevant_item_ids.update(charac['instances'])
        relevant_item_ids.update(charac['subclasses'])
        relevant_item_ids.add(charac['id'])
    logging.log(level=logging.INFO,
                msg='identified {} relevant items'.format(
                    len(relevant_item_ids)))

    # triple sentences must not leak the test samples
    sources = set(load_test_inputs(test_data_path))
    relation = 'P279'

    def is_forbidden_triple(triple: List[str]) -> bool:
        return triple[1] == relation and triple[0] in sources

    sentences = TripleSentences(
        filter(lambda o: o['id'] in relevant_item_ids,
               JSONDumpReader(dump_path)),
        forbidden_properties=irrelevant_properties,
        is_forbidden_triple=is_forbidden_triple).get_sequences()

    with open(output_path, mode='w') as f:
        for idx, sentence in enumerate(
                map(lambda s: ' '.join(s) + '\n', sentences)):
            f.write(sentence)
            if idx % 10000 == 0:
                logging.log(level=logging.INFO,
                            msg='wrote {} sentences'.format(idx + 1))
    logging.log(level=logging.INFO,
                msg='wrote triple sentences to {}'.format(output_path))
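# For reference: a triple sentence is one whitespace-joined
# subject-property-object triple per line, e.g. (illustrative ids only)
#
#   Q11173 P279 Q79529
#
# Items occupy the even token positions and properties the odd ones, which
# is why the statistics script at the end reads sentence[::2] to collect
# the items of the longer graph walk sentences.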
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    with open('paths_config.json') as f:
        paths_config = json.load(f)

    evaluation_path = paths_config['evaluation']
    results = pd.read_csv(evaluation_path, sep=';')
    algorithm = results[results['accuracy'] ==
                        results['accuracy'].max()].iloc[0]['algorithm']
    logging.info('analyzing best performing algorithm {}'.format(algorithm))

    test_data_path = paths_config['test data']
    golds = dict((gold_sample.input_arg, set(gold_sample.possible_outputs))
                 for gold_sample in load_test_data(test_data_path))

    # split the predicted items into true and false positives
    result_path = paths_config['execution results'].format(algorithm)
    tps = set()
    fps = set()
    with open(result_path) as f:
        for unknown, prediction in map(lambda l: l.strip().split(','), f):
            if prediction in golds[unknown]:
                tps.add(unknown)
            else:
                fps.add(unknown)

    characteristics_path = paths_config['class characteristics']
    tp_analysis_path = paths_config['tp class analysis']
    fp_analysis_path = paths_config['fp class analysis']

    with open(tp_analysis_path, mode='w') as f:
        json.dump(
            analyze_characteristics(
                filter(lambda c: c['id'] in tps,
                       JSONDumpReader(characteristics_path))), f)
    logging.log(level=logging.INFO,
                msg='wrote analysis to {}'.format(tp_analysis_path))

    with open(fp_analysis_path, mode='w') as f:
        json.dump(
            analyze_characteristics(
                filter(lambda c: c['id'] in fps,
                       JSONDumpReader(characteristics_path))), f)
    logging.log(level=logging.INFO,
                msg='wrote analysis to {}'.format(fp_analysis_path))
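# load_test_data is project-internal; the attribute accesses above suggest
# each gold sample pairs an input item with the set of superclasses counted
# as correct. A hypothetical minimal shape:
from collections import namedtuple

GoldSample = namedtuple('GoldSample', ['input_arg', 'possible_outputs'])

# The execution results file parsed above is plain CSV with one
# '<unknown item>,<predicted class>' pair per line, for example
# (illustrative ids only):
#
#   Q123,Q456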
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    with open('paths_config.json') as f:
        config = json.load(f)

    classes_path = config['relevant class dump']
    class_ids_path = config['relevant class ids']

    with open(class_ids_path, mode='w') as f:
        f.write('\n'.join(map(lambda c: c['id'],
                              JSONDumpReader(classes_path))) + '\n')
    logging.log(level=logging.INFO,
                msg='wrote relevant class ids to {}'.format(class_ids_path))
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    with open('paths_config.json') as f:
        config = json.load(f)

    orphan_classes_path = config['orphan class dump']
    characteristics_path = config['class characteristics']
    analysis_path = config['orphan class analysis']

    orphan_class_ids = set(
        map(lambda c: c['id'], JSONDumpReader(orphan_classes_path)))
    logging.log(level=logging.INFO, msg='loaded orphan class ids')

    with open(analysis_path, mode='w') as f:
        json.dump(
            analyze_characteristics(
                filter(lambda c: c['id'] in orphan_class_ids,
                       JSONDumpReader(characteristics_path))), f)
    logging.log(level=logging.INFO,
                msg='wrote analysis to {}'.format(analysis_path))
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    with open('paths_config.json') as f:
        config = json.load(f)

    characteristics_path = config['class characteristics']
    analysis_path = config['class analysis']

    with open(analysis_path, mode='w') as f:
        json.dump(
            analyze_characteristics(JSONDumpReader(characteristics_path)), f)
    logging.log(level=logging.INFO,
                msg='wrote analysis to {}'.format(analysis_path))
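# analyze_characteristics is shared by several of these scripts but not
# shown in the excerpt. Judging from the fact that its result goes straight
# into json.dump, it returns a plain, serializable summary. A hypothetical
# minimal version; the statistics the project actually computes may differ:
def analyze_characteristics(characteristics):
    n_classes = 0
    n_instances = 0
    n_subclasses = 0
    for c in characteristics:
        n_classes += 1
        n_instances += len(c['instances'])
        n_subclasses += len(c['subclasses'])
    return {
        'class count': n_classes,
        'total instance count': n_instances,
        'total subclass count': n_subclasses,
    }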
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    with open('paths_config.json') as f:
        config = json.load(f)

    wikidata_path = config['wikidata dump']
    wikidata_labels_path = config['wikidata labels']

    with open(wikidata_labels_path, mode='w') as f:
        for e in JSONDumpReader(wikidata_path):
            label = get_english_label(e)
            if label:
                f.write(','.join([e['id'], label]) + '\n')
    logging.log(level=logging.INFO,
                msg='wrote labels to {}'.format(wikidata_labels_path))
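# get_english_label presumably reads the standard Wikidata label structure,
# where labels map language codes to {'language': ..., 'value': ...}
# objects. A minimal sketch returning None when no English label exists:
def get_english_label(entity):
    return entity.get('labels', {}).get('en', {}).get('value')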
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    with open('paths_config.json') as f:
        paths_config = json.load(f)

    with open(paths_config['irrelevant properties']) as f:
        irrelevant_properties = set(l.strip() for l in f)

    classes_path = paths_config['class dump']
    relevant_classes_path = paths_config['relevant class dump']

    # a class is relevant iff none of its claims uses an irrelevant property
    JSONDumpWriter(relevant_classes_path).write(
        filter(
            lambda c: set(c['claims'].keys()).isdisjoint(
                irrelevant_properties),
            JSONDumpReader(classes_path)))
    logging.log(
        level=logging.INFO,
        msg='wrote relevant classes to {}'.format(relevant_classes_path))
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    with open('paths_config.json') as f:
        config = json.load(f)

    relevant_class_ids_path = config['relevant class ids']
    characteristics_path = config['class characteristics']
    analysis_path = config['relevant class analysis']

    with open(relevant_class_ids_path) as f:
        relevant_class_ids = set(l.strip() for l in f)
    logging.log(level=logging.INFO, msg='loaded relevant class ids')

    with open(analysis_path, mode='w') as f:
        json.dump(
            analyze_characteristics(
                filter(lambda c: c['id'] in relevant_class_ids,
                       JSONDumpReader(characteristics_path))), f)
    logging.log(level=logging.INFO,
                msg='wrote analysis to {}'.format(analysis_path))
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    with open('paths_config.json') as f:
        config = json.load(f)

    classes_path = config['class dump']
    subclass_of_path = config['subclass of relations']

    with open(subclass_of_path, mode='w') as f:
        f.writelines(','.join([c['id']] + list(get_subclass_of_ids(c))) + '\n'
                     for c in JSONDumpReader(classes_path))
    logging.log(level=logging.INFO,
                msg='wrote subclass of relations to {}'.format(
                    subclass_of_path))
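# get_subclass_of_ids is project-internal. Against the standard Wikidata
# dump layout it would walk the entity's P279 claims and yield each target
# item id; the sketch below assumes well-formed, item-valued snaks:
def get_subclass_of_ids(entity):
    for claim in entity.get('claims', {}).get('P279', []):
        mainsnak = claim['mainsnak']
        if mainsnak['snaktype'] == 'value':
            yield mainsnak['datavalue']['value']['id']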
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    with open('paths_config.json') as f:
        config = json.load(f)

    triple_sentences_path = config['triple sentences']
    graph_walk_sentences_path = config['graph walk sentences']
    relevant_class_ids_path = config['relevant class ids']
    wikidata_dump_path = config['wikidata dump']

    n_triple_sentences = 0
    with open(triple_sentences_path) as f:
        for _ in f:
            n_triple_sentences += 1
    logging.info('loaded triple sentences')

    n_graph_walk_sentences = 0
    unique_sources = set()
    unique_items = set()
    with open(graph_walk_sentences_path) as f:
        for sentence in map(lambda l: l.strip().split(), f):
            n_graph_walk_sentences += 1
            unique_sources.add(sentence[0])
            # items sit at the even token positions of a walk
            unique_items.update(sentence[::2])
    logging.info('loaded graph walk sentences')
    n_unique_sources = len(unique_sources)
    n_unique_items = len(unique_items)

    relevant_classes = set()
    with open(relevant_class_ids_path) as f:
        for cid in map(lambda l: l.strip(), f):
            relevant_classes.add(cid)
    logging.info('loaded relevant class ids')
    n_relevant_classes = len(relevant_classes)

    all_items = set()
    for entity in JSONDumpReader(wikidata_dump_path):
        if entity['id'][0] == 'Q':
            all_items.add(entity['id'])
    logging.info('loaded all items')
    n_all_items = len(all_items)

    relevant_class_coverage = (
        len(relevant_classes.intersection(unique_items))
        / float(n_relevant_classes))
    all_item_coverage = float(n_unique_items) / n_all_items

    print('triple sentences = {}'.format(n_triple_sentences))
    print()
    print('graph walk sentences:')
    print('graph walk sentences = {}'.format(n_graph_walk_sentences))
    print('unique sources = {}'.format(n_unique_sources))
    print('unique items = {}'.format(n_unique_items))
    print('relevant class coverage = {:.2f}%'.format(
        100.0 * relevant_class_coverage))
    print('all item coverage = {:.2f}%'.format(100.0 * all_item_coverage))
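# All of the scripts above resolve their file locations through a shared
# paths_config.json. A hypothetical example covering every key they read
# (all paths are placeholders; note that 'execution results' contains a {}
# slot that the evaluation script fills with the algorithm name):
#
# {
#   "wikidata dump": "data/wikidata-dump.json",
#   "class ids": "data/class_ids.txt",
#   "class dump": "data/classes.json",
#   "class characteristics": "data/class_characteristics.json",
#   "class analysis": "data/class_analysis.json",
#   "orphan class dump": "data/orphan_classes.json",
#   "orphan class analysis": "data/orphan_class_analysis.json",
#   "relevant class dump": "data/relevant_classes.json",
#   "relevant class ids": "data/relevant_class_ids.txt",
#   "relevant class analysis": "data/relevant_class_analysis.json",
#   "irrelevant properties": "data/irrelevant_properties.txt",
#   "wikidata labels": "data/wikidata_labels.csv",
#   "subclass of relations": "data/subclass_of_relations.csv",
#   "triple sentences": "data/triple_sentences.txt",
#   "graph walk sentences": "data/graph_walk_sentences.txt",
#   "test data": "data/test_data.csv",
#   "evaluation": "data/evaluation.csv",
#   "execution results": "data/execution_results_{}.csv",
#   "tp class analysis": "data/tp_class_analysis.json",
#   "fp class analysis": "data/fp_class_analysis.json"
# }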