def main():
    """Compute a characteristic for every class and write them to a dump.

    All paths are read from ``paths_config.json``.  The class ids are loaded
    one per line, a characteristic builder is created from them and the
    Wikidata dump, and the builder is then applied to every entry of the
    class dump.  Each result is converted with ``to_dict()`` and written to
    the characteristics dump.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    with open('paths_config.json') as config_file:
        paths = json.load(config_file)

    wikidata_path = paths['wikidata dump']
    class_ids_path = paths['class ids']
    classes_path = paths['class dump']
    characteristics_path = paths['class characteristics']

    # One class id per line; surrounding whitespace is dropped.
    with open(class_ids_path) as ids_file:
        class_ids = {line.strip() for line in ids_file}
    logging.info('loaded class ids')

    to_charac = to_characteristic(class_ids, JSONDumpReader(wikidata_path))
    logging.info('computed subclasses and instances of all classes')

    # The writer is created before the reader, as in the one-expression form.
    writer = JSONDumpWriter(characteristics_path)
    writer.write(to_charac(cls).to_dict()
                 for cls in JSONDumpReader(classes_path))
    logging.info('wrote characteristics to {}'.format(characteristics_path))
예제 #2
0
def main():
    """Extract the class dump and the orphan class dump.

    Paths come from ``paths_config.json``.  The Wikidata dump is read twice:
    once to collect the class ids and once to copy every item whose id is a
    class id into the class dump.  The class dump is then read back and the
    entries accepted by ``is_orphan_class`` are written to the orphan class
    dump.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    with open('paths_config.json') as config_file:
        paths = json.load(config_file)

    # Input: Wikidata JSON dump.
    wikidata_path = paths['wikidata dump']
    # Written first, then read again: JSON dump containing only classes.
    class_dump_path = paths['class dump']
    # Output: JSON dump containing only orphan classes.
    orphan_class_dump_path = paths['orphan class dump']

    # First pass: ids that identify classes in the dump.
    class_ids = get_class_ids(JSONDumpReader(wikidata_path))
    logging.info('found {} classes'.format(len(class_ids)))

    # Second pass: copy every class item into the class dump.
    class_writer = JSONDumpWriter(class_dump_path)
    class_writer.write(
        entity for entity in JSONDumpReader(wikidata_path)
        if is_item(entity) and entity['id'] in class_ids)
    logging.info('wrote classes to {}'.format(class_dump_path))

    # Copy the unlinked classes from the class dump into their own dump.
    orphan_writer = JSONDumpWriter(orphan_class_dump_path)
    orphan_writer.write(
        cls for cls in JSONDumpReader(class_dump_path)
        if is_orphan_class(cls))
    logging.info(
        'wrote orphan classes to {}'.format(orphan_class_dump_path))
예제 #3
0
def main():
    """Write triple sentences for all relevant items to a text file.

    Paths come from ``paths_config.json``.  Relevant items are the relevant
    classes themselves plus everything listed under ``instances`` and
    ``subclasses`` in their characteristics.  Sentences are produced by
    ``TripleSentences`` from the matching entities of the Wikidata dump and
    written one per line, tokens separated by single spaces.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    with open('paths_config.json') as f:
        config = json.load(f)

    dump_path = config['wikidata dump']
    class_charac_path = config['class characteristics']
    test_data_path = config['test data']
    output_path = config['triple sentences']

    with open(config['irrelevant properties']) as f:
        irrelevant_properties = {line.strip() for line in f}

    with open(config['relevant class ids']) as f:
        relevant_class_ids = {line.strip() for line in f}

    relevant_item_ids = set()
    for charac in JSONDumpReader(class_charac_path):
        if charac['id'] not in relevant_class_ids:
            continue
        relevant_item_ids.update(charac['instances'])
        relevant_item_ids.update(charac['subclasses'])
        relevant_item_ids.add(charac['id'])
    # Fixed typo in the message ('itemds' -> 'items').
    logging.info(
        'identified {} relevant items'.format(len(relevant_item_ids)))

    # Triple sentences should not include the test samples.
    sources = set(load_test_inputs(test_data_path))
    relation = 'P279'

    def is_forbidden_triple(triple: List[str]) -> bool:
        """Return True for a ``relation`` triple whose first element is a test input."""
        return triple[1] == relation and triple[0] in sources

    sentences = TripleSentences(
        filter(lambda o: o['id'] in relevant_item_ids,
               JSONDumpReader(dump_path)),
        forbidden_properties=irrelevant_properties,
        is_forbidden_triple=is_forbidden_triple).get_sequences()

    with open(output_path, mode='w') as f:
        for idx, sentence in enumerate(sentences):
            f.write(' '.join(sentence) + '\n')
            # Progress report after the 1st, 10001st, ... sentence.
            if idx % 10000 == 0:
                logging.info('wrote {} sentences'.format(idx + 1))
    logging.info('wrote triple sentences to {}'.format(output_path))
def main():
    """Analyze the classes the best algorithm got right and wrong.

    Paths come from ``paths_config.json``.  The algorithm from the first row
    with maximal ``accuracy`` in the evaluation CSV is selected.  Its result
    file is read as ``unknown,prediction`` lines; an unknown is a true
    positive when the prediction is among its gold outputs and a false
    positive otherwise.  The characteristics of both groups are analyzed
    and each analysis is dumped as JSON.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    with open('paths_config.json') as config_file:
        paths_config = json.load(config_file)

    evaluation_path = paths_config['evaluation']

    results = pd.read_csv(evaluation_path, sep=';')
    accuracy = results['accuracy']
    best_rows = results[accuracy == accuracy.max()]
    algorithm = best_rows.iloc[0]['algorithm']
    logging.info('analysis best performing algorithm {}'.format(algorithm))

    test_data_path = paths_config['test data']
    golds = {sample.input_arg: set(sample.possible_outputs)
             for sample in load_test_data(test_data_path)}

    result_path = paths_config['execution results'].format(algorithm)
    tps = set()
    fps = set()
    with open(result_path) as result_file:
        for line in result_file:
            unknown, prediction = line.strip().split(',')
            bucket = tps if prediction in golds[unknown] else fps
            bucket.add(unknown)

    characteristics_path = paths_config['class characteristics']
    tp_analysis_path = paths_config['tp class analysis']
    fp_analysis_path = paths_config['fp class analysis']

    def dump_analysis(selected_ids, analysis_path):
        """Analyze the characteristics of ``selected_ids`` and dump them."""
        with open(analysis_path, mode='w') as out:
            selected = (c for c in JSONDumpReader(characteristics_path)
                        if c['id'] in selected_ids)
            json.dump(analyze_characteristics(selected), out)
        logging.info('wrote analysis to {}'.format(analysis_path))

    dump_analysis(tps, tp_analysis_path)
    dump_analysis(fps, fp_analysis_path)
예제 #5
0
def main():
    """Write the id of every entry in the relevant class dump, one per line.

    Both paths come from ``paths_config.json``.  The ids are joined with
    newlines and the output always ends with a single newline.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    with open('paths_config.json') as config_file:
        paths = json.load(config_file)

    classes_path = paths['relevant class dump']
    class_ids_path = paths['relevant class ids']

    with open(class_ids_path, mode='w') as out:
        ids = (cls['id'] for cls in JSONDumpReader(classes_path))
        out.write('\n'.join(ids) + '\n')
    logging.info('wrote relevant class ids to {}'.format(class_ids_path))
예제 #6
0
def main():
    """Analyze the characteristics of the orphan classes.

    Paths come from ``paths_config.json``.  The ids found in the orphan
    class dump select the matching entries of the characteristics dump;
    the result of ``analyze_characteristics`` on them is dumped as JSON.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    with open('paths_config.json') as config_file:
        paths = json.load(config_file)

    orphan_classes_path = paths['orphan class dump']
    characteristics_path = paths['class characteristics']
    analysis_path = paths['orphan class analysis']

    orphan_class_ids = {cls['id']
                        for cls in JSONDumpReader(orphan_classes_path)}
    logging.info('loaded orphan class ids')

    with open(analysis_path, mode='w') as out:
        orphan_characteristics = (
            charac for charac in JSONDumpReader(characteristics_path)
            if charac['id'] in orphan_class_ids)
        json.dump(analyze_characteristics(orphan_characteristics), out)
    logging.info('wrote analysis to {}'.format(analysis_path))
def main():
    """Analyze the characteristics of all classes and dump the result as JSON.

    The characteristics dump path and the output path are read from
    ``paths_config.json``.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    with open('paths_config.json') as config_file:
        paths = json.load(config_file)

    characteristics_path = paths['class characteristics']
    analysis_path = paths['class analysis']

    with open(analysis_path, mode='w') as out:
        analysis = analyze_characteristics(
            JSONDumpReader(characteristics_path))
        json.dump(analysis, out)
    logging.info('wrote analysis to {}'.format(analysis_path))
예제 #8
0
def main():
    """Write ``id,label`` lines for every entity that has an English label.

    Both paths come from ``paths_config.json``.  Entities for which
    ``get_english_label`` returns a falsy value are skipped.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    with open('paths_config.json') as f:
        config = json.load(f)

    wikidata_path = config['wikidata dump']
    wikidata_labels_path = config['wikidata labels']

    with open(wikidata_labels_path, mode='w') as f:
        for entity in JSONDumpReader(wikidata_path):
            # Look the label up once instead of once for the test and once
            # for the write.
            label = get_english_label(entity)
            if label:
                f.write(','.join([entity['id'], label]) + '\n')
    # The file holds labels; the previous message wrongly said 'class ids'.
    logging.info('wrote wikidata labels to {}'.format(wikidata_labels_path))
예제 #9
0
def main():
    """Copy the classes that use no irrelevant property into their own dump.

    Paths come from ``paths_config.json``.  A class is kept when none of the
    keys of its ``claims`` appears in the irrelevant properties file, which
    is read one property per line.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    with open('paths_config.json') as config_file:
        paths_config = json.load(config_file)

    with open(paths_config['irrelevant properties']) as properties_file:
        irrelevant_properties = {line.strip() for line in properties_file}

    classes_path = paths_config['class dump']
    relevant_classes_path = paths_config['relevant class dump']

    def has_no_irrelevant_property(cls):
        """Return True when no claim key is an irrelevant property."""
        return irrelevant_properties.isdisjoint(cls['claims'].keys())

    writer = JSONDumpWriter(relevant_classes_path)
    writer.write(
        filter(has_no_irrelevant_property, JSONDumpReader(classes_path)))
    logging.info(
        'wrote relevant classes to {}'.format(relevant_classes_path))
def main():
    """Analyze the characteristics of the relevant classes.

    Paths come from ``paths_config.json``.  The relevant class ids are read
    one per line and select the matching entries of the characteristics
    dump; the result of ``analyze_characteristics`` is dumped as JSON.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    with open('paths_config.json') as config_file:
        paths = json.load(config_file)

    relevant_class_ids_path = paths['relevant class ids']
    characteristics_path = paths['class characteristics']
    analysis_path = paths['relevant class analysis']

    with open(relevant_class_ids_path) as ids_file:
        relevant_class_ids = {line.strip() for line in ids_file}
    logging.info('loaded relevant class ids')

    with open(analysis_path, mode='w') as out:
        relevant_characteristics = (
            charac for charac in JSONDumpReader(characteristics_path)
            if charac['id'] in relevant_class_ids)
        json.dump(analyze_characteristics(relevant_characteristics), out)
    logging.info('wrote analysis to {}'.format(analysis_path))
예제 #11
0
def main():
    """Write one comma-separated line per class: its id, then its superclass ids.

    Both paths come from ``paths_config.json``.  The superclass ids are
    whatever ``get_subclass_of_ids`` yields for the class.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    with open('paths_config.json') as config_file:
        paths = json.load(config_file)

    classes_path = paths['class dump']
    subclass_of_path = paths['subclass of relations']

    with open(subclass_of_path, mode='w') as out:
        for cls in JSONDumpReader(classes_path):
            row = [cls['id']]
            row.extend(get_subclass_of_ids(cls))
            out.write(','.join(row) + '\n')
    logging.info(
        'wrote subclass of relations to {}'.format(subclass_of_path))
def main():
    """Print statistics about the triple and graph walk sentence files.

    Paths come from ``paths_config.json``.  Reported are the number of
    triple sentences, the number of graph walk sentences, the number of
    distinct first tokens (sources) and distinct items in the graph walk
    sentences, and which share of the relevant classes and of all ``Q``
    entities of the Wikidata dump those items cover.

    Every second token of a graph walk sentence, starting with the first,
    is counted as an item.  Blank lines are ignored, and a coverage is
    reported as 0 when its denominator is 0.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    with open('paths_config.json') as f:
        config = json.load(f)

    triple_sentences_path = config['triple sentences']
    graph_walk_sentences_path = config['graph walk sentences']
    relevant_class_ids_path = config['relevant class ids']
    wikidata_dump_path = config['wikidata dump']

    with open(triple_sentences_path) as f:
        n_triple_sentences = sum(1 for _ in f)
    logging.info('loaded triple sentences')

    n_graph_walk_sentences = 0
    unique_sources = set()
    unique_items = set()

    with open(graph_walk_sentences_path) as f:
        for line in f:
            sentence = line.strip().split()
            if not sentence:
                # A blank line has no source token; indexing it would fail.
                continue
            n_graph_walk_sentences += 1
            unique_sources.add(sentence[0])
            unique_items.update(sentence[::2])
    logging.info('loaded graph walk sentences')

    n_unique_sources = len(unique_sources)
    n_unique_items = len(unique_items)

    with open(relevant_class_ids_path) as f:
        relevant_classes = {line.strip() for line in f}
    logging.info('loaded relevant class ids')

    n_relevant_classes = len(relevant_classes)

    # A set rather than a counter, so repeated ids are counted once.
    all_items = set()
    for entity in JSONDumpReader(wikidata_dump_path):
        if entity['id'].startswith('Q'):
            all_items.add(entity['id'])
    logging.info('loaded all items')

    n_all_items = len(all_items)

    # Guard both ratios so empty inputs do not raise ZeroDivisionError.
    if n_relevant_classes:
        relevant_class_coverage = (
            len(relevant_classes & unique_items) / float(n_relevant_classes))
    else:
        relevant_class_coverage = 0.0
    if n_all_items:
        all_item_coverage = float(n_unique_items) / n_all_items
    else:
        all_item_coverage = 0.0

    print('triple_sentences = {}'.format(n_triple_sentences))
    print()
    print('graph walk sentences:')
    print('graph walk sentences = {}'.format(n_graph_walk_sentences))
    print('unique sources = {}'.format(n_unique_sources))
    print('unique_items = {}'.format(n_unique_items))
    print('relevant class coverage = {:.2f}%'.format(
        100.0 * relevant_class_coverage))
    # '{:2f}' set a minimum width of 2 with six decimals; '{:.2f}' matches
    # the two-decimal output of the line above.
    print('all item coverage = {:.2f}%'.format(100.0 * all_item_coverage))