Example #1
                raise Exception('only N<=2 is supported')
            print(q['freebaseKey'], '    ', relpaths, file=sys.stderr)
        except Exception:
            # print the traceback in the worker process, where it is still
            # informative, then re-raise so the failure surfaces in pool.map()
            traceback.print_exc()
            raise

        return {'qId': q['qId'],
                'relPaths': relpaths}


if __name__ == "__main__":
    split, endpoint, N = sys.argv[1:]

    data = datalib.load_multi_data(split, ['main', 'd-freebase'])

    sparql = SPARQLWrapper(endpoint)
    sparql.setReturnFormat(JSON)

    # XXX: We would like to write the JSON file as we go, but we would
    # need to test for the last element in save_json(), and things would
    # just get needlessly complicated.

    qrpf = QuestionRelPathFinder(sparql, int(N))
    pool = Pool(processes=1)
    qrp = pool.map(qrpf, data.to_list())
    pool.close()
    pool.join()

    with open('d-freebase-rp/%s.json' % (split,), 'w') as f:
        datalib.save_json(qrp, f)
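The body of QuestionRelPathFinder is truncated above; the script takes the split name, a SPARQL endpoint URL and the maximum path length N (at most 2) on the command line. As a rough illustration only, a 1-hop relation probe against such an endpoint could look like the sketch below; the helper and its query are assumptions, not the script's actual code.

# Hypothetical 1-hop probe in the spirit of QuestionRelPathFinder;
# the real class also explores 2-hop paths and its SPARQL is not shown here.
from SPARQLWrapper import SPARQLWrapper, JSON

def relations_from(endpoint, mid):
    """ List the Freebase properties leading out of topic `mid`. """
    sparql = SPARQLWrapper(endpoint)
    sparql.setReturnFormat(JSON)
    sparql.setQuery('''
        PREFIX ns: <http://rdf.freebase.com/ns/>
        SELECT DISTINCT ?p WHERE { ns:%s ?p ?o . }''' % (mid,))
    res = sparql.query().convert()
    return [b['p']['value'] for b in res['results']['bindings']]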
Example #2
    qmids = []
    for q in data.to_list():
        res_line = {}
        res_line['qId'] = q['qId']
        res_line['freebaseMids'] = []

        for c in q['Concept']:
            print('%s (%s) ? %s / %s' %
                  (q['qId'], q['qText'], c['fullLabel'], c['pageID']),
                  file=sys.stderr)
            pair = {}
            pair['concept'] = c['fullLabel']
            pair['mid'] = queryPageID(c['pageID'])
            pair['pageID'] = c['pageID']
            res_line['freebaseMids'].append(pair)

        if 'freebaseKey' in q:
            print('%s (%s) key %s' % (q['qId'], q['qText'], q['freebaseKey']),
                  file=sys.stderr)
            keyPair = queryKey(q['freebaseKey'])
            if keyPair['mid'] not in [
                    p['mid'] for p in res_line['freebaseMids']
            ]:
                res_line['freebaseMids'].append(keyPair)

        # print (json.dumps(res_line))
        qmids.append(res_line)

    with open('d-freebase-mids/%s.json' % (split, ), 'w') as f:
        datalib.save_json(qmids, f)
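queryPageID() and queryKey() are called above but defined elsewhere; hypothetical stand-ins follow, resolving mids over a SPARQL endpoint as in Example #1. The endpoint URL and the key namespaces (/wikipedia/en_id, /en) are assumptions; the real helpers may use a different lookup entirely.

from SPARQLWrapper import SPARQLWrapper, JSON

sparql = SPARQLWrapper('http://localhost:3030/freebase/query')  # assumed endpoint
sparql.setReturnFormat(JSON)

def _first_mid(query):
    """ Run a SELECT ?topic query, return the first topic's mid or None. """
    sparql.setQuery(query)
    bindings = sparql.query().convert()['results']['bindings']
    # http://rdf.freebase.com/ns/m.0xxxx -> m.0xxxx
    return bindings[0]['topic']['value'].split('/')[-1] if bindings else None

def queryPageID(pageID):
    """ mid of the topic keyed by an English Wikipedia pageID. """
    return _first_mid('''
        PREFIX ns: <http://rdf.freebase.com/ns/>
        SELECT ?topic WHERE { ?topic ns:type.object.key "/wikipedia/en_id/%s" . }''' % (pageID,))

def queryKey(key):
    """ {'concept', 'mid'} pair for a Freebase /en/<key> topic. """
    return {'concept': key,
            'mid': _first_mid('''
        PREFIX ns: <http://rdf.freebase.com/ns/>
        SELECT ?topic WHERE { ?topic ns:type.object.key "/en/%s" . }''' % (key,))}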
Example #3
#!/usr/bin/python
#
# fulldata.py: Create JSON files containing full data available for each question
#
# This merges JSON files from main/ and all the d-*/ directories to full/.
#
# Example: mkdir full; for split in devtest val trainmodel test; do ./fulldata.py $split full/ main/ d-*/; done

import sys
import datalib


if __name__ == "__main__":
    split = sys.argv[1]
    outdirname = sys.argv[2]
    indirnames = sys.argv[3:]

    data = datalib.load_multi_data(split, indirnames)
    with open('%s/%s.json' % (outdirname, split), 'w') as f:
        datalib.save_json(data.to_list(), f)
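fulldata.py leans on datalib for the merge itself. A rough sketch of the semantics datalib.load_multi_data() appears to implement, assuming records are joined on qId (the real datalib returns a wrapper object with a to_list() method and may differ in details):

import json

def load_multi_data_sketch(split, dirnames):
    """ Merge <dir>/<split>.json records that share a qId into one dict. """
    by_qid = {}
    for d in dirnames:
        with open('%s/%s.json' % (d, split)) as f:
            for rec in json.load(f):
                by_qid.setdefault(rec['qId'], {}).update(rec)
    return list(by_qid.values())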

Example #4
if __name__ == "__main__":
    split = sys.argv[1]
    data = datalib.load_multi_data(split, ['main', 'd-dump', 'd-freebase'])

    qmids = []
    for q in data.to_list():
        res_line = {}
        res_line['qId'] = q['qId']
        res_line['freebaseMids'] = []

        for c in q['Concept']:
            print('%s (%s) ? %s / %s' % (q['qId'], q['qText'], c['fullLabel'], c['pageID']), file=sys.stderr)
            pair = {}
            pair['concept'] = c['fullLabel']
            pair['mid'] = queryPageID(c['pageID'])
            res_line['freebaseMids'].append(pair)

        if 'freebaseKey' in q:
            print('%s (%s) key %s' % (q['qId'], q['qText'], q['freebaseKey']), file=sys.stderr)
            keyPair = queryKey(q['freebaseKey'])
            if keyPair['mid'] not in [p['mid'] for p in res_line['freebaseMids']]:
                res_line['freebaseMids'].append(keyPair)

        # print (json.dumps(res_line))
        qmids.append(res_line)

    with open('d-freebase-mids/%s.json' % (split,), 'w') as f:
        datalib.save_json(qmids, f)
Example #5
    # Count how many times each path occurs, sort by frequency
    # (to preserve run-by-run stability, secondary sort alphabetically)
    # pl_counter = Counter(path_labels)
    # pl_tuples = [(pl, c) for pl, c in pl_counter.items()]
    # pl_set = sorted(sorted(pl_tuples, key=itemgetter(0)), key=itemgetter(1), reverse=True)
    # pl_set = list(set(path_labels))
    pl_set = remove_duplicates(path_labels)
    return OrderedDict([('qId', q['qId']), ('exploringPaths', pl_set)])


if __name__ == "__main__":
    split = sys.argv[1]
    # NB: `global` is a no-op at module scope; these presumably mirror
    # globals read by get_question_rp() in the truncated part of the script
    global mode
    global apikey
    apikey = sys.argv[2] if len(sys.argv) > 2 else None
    data = datalib.load_multi_data(split, ['main', 'd-freebase-mids', 'd-dump'])

    # XXX: We would like to write the JSON file as we go, but we would
    # need to test for the last element in save_json(), and things would
    # just get needlessly complicated.

    pool = Pool(processes=2)
    # NB: the serial map() below is what actually runs here; the Pool is
    # created but unused. Switch back to pool.map() to parallelize.
    #qrp = pool.map(get_question_rp, data.to_list())
    qrp = map(get_question_rp, data.to_list())
    pool.close()
    pool.join()

    with open('d-relation-dump/%s_.json' % (split,), 'w') as f:
        datalib.save_json(list(qrp), f)
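remove_duplicates() is used above but not defined in this snippet. The commented-out Counter variant shows that ordering stability matters, so a plausible stand-in is an order-preserving dedup rather than a plain set():

def remove_duplicates(items):
    """ Drop repeated items while keeping first-occurrence order. """
    seen = set()
    out = []
    for x in items:
        if x not in seen:
            seen.add(x)
            out.append(x)
    return out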
Example #6
#!/usr/bin/python -u
# Resplit main/train.json to the three sub-splits.

from collections import OrderedDict

import datalib
from rawimport import resplit_train

if __name__ == "__main__":
    questions_main = datalib.load_multi_data('train', ['main'])

    qlist_wqr = []
    qlist_mfb = []
    for q in questions_main.to_list():
        q2 = OrderedDict([('qId', q['qId']), ('answers', q['answers']),
                          ('qText', q['qText'])])
        if q['qId'].startswith('wqr'):
            qlist_wqr.append(q2)
        else:
            qlist_mfb.append(q2)
    qlist = qlist_wqr + qlist_mfb

    devtest_main, val_main, trainmodel_main = resplit_train(qlist)
    for nsplit, nsplit_main in [('devtest', devtest_main), ('val', val_main),
                                ('trainmodel', trainmodel_main)]:
        with open('main/%s.json' % (nsplit, ), 'w') as f:
            datalib.save_json(nsplit_main, f, sort_keys=False)
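rawimport.resplit_train() is not shown in these examples. A toy stand-in illustrating just its contract (one ordered question list in, three disjoint sub-splits out); the fractions are made up, not the ones rawimport actually uses:

def resplit_train_sketch(qlist, devtest_frac=0.1, val_frac=0.2):
    """ Deterministically slice qlist into devtest/val/trainmodel. """
    n_devtest = int(len(qlist) * devtest_frac)
    n_val = int(len(qlist) * val_frac)
    devtest = qlist[:n_devtest]
    val = qlist[n_devtest:n_devtest + n_val]
    trainmodel = qlist[n_devtest + n_val:]
    return devtest, val, trainmodel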
Example #7
def save_data(split, maindir, questions_main, fbdir, questions_fb):
    """ save full dataset for a given split """
    for data, fname in [(questions_main, '%s/%s.json' % (maindir, split)),
                        (questions_fb, '%s/%s.json' % (fbdir, split))]:
        with open(fname, 'w') as f:
            datalib.save_json(data, f)
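A hypothetical call, with directory names as used elsewhere in these examples:

# save_data('train', 'main', questions_main, 'd-freebase', questions_fb)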
Example #9
import json
import sys
import datalib


def jacana_bind(data, jacanajson):
    """ bind jacana json by utterance texts to our dataset and get new prettier json """
    topicmap = dict([(jq['utterance'], jq['topics']) for jq in jacanajson])

    qnlp = []
    for q in data.to_list():
        topics = topicmap[q['qText']]
        topics = [topic.split(' ## ') for topic in topics]
        qnlp.append({'qId': q['qId'], 'entities': topics})
    return qnlp


if __name__ == "__main__":
    split, jacanafile = sys.argv[1:]

    data = datalib.load_multi_data(split, ['main'])

    with open(jacanafile, 'r') as jf:
        jacanajson = json.load(jf)

    qnlp = jacana_bind(data, jacanajson)

    with open('d-entities/%s.json' % (split,), 'w') as f:
        datalib.save_json(qnlp, f)
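A made-up jacana record, just to show the input shape jacana_bind() expects; apart from the ' ## ' separator the topic string format is an assumption:

jacanajson = [{'utterance': 'where was barack obama born?',
               'topics': ['Barack Obama ## /m/02mjmr']}]
# After binding, the matching question yields e.g.
#   {'qId': ..., 'entities': [['Barack Obama', '/m/02mjmr']]}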