relpaths = self.concept_rels2_match(mid, set(q['answers']))
            if self.N > 2:
                raise Exception('only N<=2 is supported')
            print(q['freebaseKey'], '    ', relpaths, file=sys.stderr)
        except:
            traceback.print_exc()
            raise

        return {'qId': q['qId'],
                'relPaths': relpaths}


if __name__ == "__main__":
    split, endpoint, N = sys.argv[1:]

    data = datalib.load_multi_data(split, ['main', 'd-freebase'])

    sparql = SPARQLWrapper(endpoint)
    sparql.setReturnFormat(JSON)

    # XXX: We would like to write the JSON file as we go, but we need
    # to test for last element in save_json() and things would just
    # get too complicated too needlessly.

    qrpf = QuestionRelPathFinder(sparql, int(N))
    pool = Pool(processes=1)
    qrp = pool.map(qrpf, data.to_list())
    pool.close()
    pool.join()

    with open('d-freebase-rp/%s.json' % (split,), 'w') as f:
            keyPair = {
                'concept': r['label']['value'],
                'mid': r['topic']['value'][27:]
            }
            retVal.append(keyPair)
        s.add(r['topic']['value'])

    if (len(retVal) < 1):
        return {}
    else:
        return retVal[0]


if __name__ == "__main__":
    split = sys.argv[1]
    data = datalib.load_multi_data(split, ['main', 'd-dump', 'd-freebase'])

    qmids = []
    for q in data.to_list():
        res_line = {}
        res_line['qId'] = q['qId']
        res_line['freebaseMids'] = []

        for c in q['Concept']:
            print('%s (%s) ? %s / %s' %
                  (q['qId'], q['qText'], c['fullLabel'], c['pageID']),
                  file=sys.stderr)
            pair = {}
            pair['concept'] = c['fullLabel']
            pair['mid'] = queryPageID(c['pageID'])
            pair['pageID'] = c['pageID']
예제 #3
0
#!/usr/bin/python
#
# fulldata.py: Create JSON files containing full data available for each question
#
# This merges JSON files from main/ and all the d-*/ directories to full/.
#
# Example: mkdir full; for split in devtest val trainmodel test; do ./fulldata.py $split full/ main/ d-*/; done

import sys
import datalib


if __name__ == "__main__":
    # CLI: fulldata.py <split> <outdir> <indir> [<indir> ...]
    split, outdirname, *indirnames = sys.argv[1:]

    # Merge the per-directory JSON records for this split and dump the
    # combined question list into <outdir>/<split>.json.
    data = datalib.load_multi_data(split, indirnames)
    with open('%s/%s.json' % (outdirname, split), 'w') as f:
        datalib.save_json(data.to_list(), f)
예제 #4
0
#!/usr/bin/python -u
# Resplit main/train.json to the three sub-splits.

from collections import OrderedDict

import datalib
from rawimport import resplit_train

if __name__ == "__main__":
    questions_main = datalib.load_multi_data('train', ['main'])

    # Trim each question to (qId, answers, qText), partitioning so that
    # wqr-prefixed questions come before all the others.
    wqr_questions = []
    other_questions = []
    for question in questions_main.to_list():
        trimmed = OrderedDict([('qId', question['qId']),
                               ('answers', question['answers']),
                               ('qText', question['qText'])])
        bucket = wqr_questions if question['qId'].startswith('wqr') else other_questions
        bucket.append(trimmed)
    qlist = wqr_questions + other_questions

    # resplit_train() yields the three sub-splits in this fixed order.
    for name, subset in zip(('devtest', 'val', 'trainmodel'),
                            resplit_train(qlist)):
        with open('main/%s.json' % (name,), 'w') as f:
            datalib.save_json(subset, f, sort_keys=False)
    # Count how many times each path occurs, sort by frequency
    # (to preserve run-by-run stability, secondary sort alphabetically)
    # pl_counter = Counter(path_labels)
    # pl_tuples = [(pl, c) for pl, c in pl_counter.items()]
    # pl_set = sorted(sorted(pl_tuples, key=itemgetter(0)), key=itemgetter(1), reverse=True)
    # pl_set = list(set(path_labels))
    pl_set = remove_duplicates(path_labels)
    return OrderedDict([('qId', q['qId']), ('exploringPaths', pl_set)])


if __name__ == "__main__":
    # Usage: <script> <split> [<apikey>]
    split = sys.argv[1]
    # NOTE(review): `global` at module top level is a no-op; presumably
    # `mode` and `apikey` are module globals consumed by get_question_rp()
    # — confirm against its definition.  `mode` is never assigned here.
    global mode
    global apikey
    apikey = sys.argv[2] if len(sys.argv) > 2 else None
    data = datalib.load_multi_data(split, ['main', 'd-freebase-mids', 'd-dump'])

    # XXX: We would like to write the JSON file as we go, but we need
    # to test for last element in save_json() and things would just
    # get too complicated too needlessly.

    # NOTE(review): the pool is created but never used — the active line
    # below calls the builtin map(), so questions are processed serially
    # (lazily, realized by list() at the bottom) and the two worker
    # processes only get closed/joined.  Either restore pool.map or drop
    # the Pool entirely.
    pool = Pool(processes=2)
    #qrp = pool.map(get_question_rp, data.to_list())
    qrp = map(get_question_rp, data.to_list())
    pool.close()
    pool.join()

    # Note the trailing underscore in the output filename pattern.
    with open('d-relation-dump/%s_.json' % (split,), 'w') as f:
        datalib.save_json(list(qrp), f)
#!/usr/bin/python -u
# Resplit main/train.json to the three sub-splits.

from collections import OrderedDict

import datalib
from rawimport import resplit_train


if __name__ == "__main__":
    questions_main = datalib.load_multi_data('train', ['main'])

    # Reduce each question to its (qId, answers, qText) core; wqr-prefixed
    # question ids are kept ahead of the rest in the final ordering.
    wqr, mfb = [], []
    for question in questions_main.to_list():
        trimmed = OrderedDict()
        trimmed['qId'] = question['qId']
        trimmed['answers'] = question['answers']
        trimmed['qText'] = question['qText']
        (wqr if question['qId'].startswith('wqr') else mfb).append(trimmed)
    qlist = wqr + mfb

    # Write the three sub-splits produced by resplit_train() back under main/.
    subsplits = zip(['devtest', 'val', 'trainmodel'], resplit_train(qlist))
    for name, questions in subsplits:
        with open('main/%s.json' % (name,), 'w') as f:
            datalib.save_json(questions, f, sort_keys=False)
import json
import sys
import datalib


def jacana_bind(data, jacanajson):
    """Bind jacana entity-linking output to our dataset by question text.

    data: dataset object whose .to_list() yields question dicts with at
        least 'qId' and 'qText' keys.
    jacanajson: list of dicts with 'utterance' (the question text) and
        'topics' (a list of ' ## '-separated strings).

    Returns a list of {'qId': ..., 'entities': [[...], ...]} records, one
    per question, in dataset order.  Raises KeyError when a question's
    qText has no matching jacana utterance.
    """
    # Map utterance text -> raw topic strings for O(1) lookup per question.
    topicmap = {jq['utterance']: jq['topics'] for jq in jacanajson}

    return [
        {'qId': q['qId'],
         'entities': [topic.split(' ## ') for topic in topicmap[q['qText']]]}
        for q in data.to_list()
    ]


if __name__ == "__main__":
    # Usage: <script> <split> <jacana-json-file>
    split, jacanafile = sys.argv[1:]

    # Load our questions and the jacana entity-linking dump.
    data = datalib.load_multi_data(split, ['main'])
    with open(jacanafile, 'r') as jacana_f:
        jacana_records = json.load(jacana_f)

    # Join on utterance text and write the per-question entity file.
    bound = jacana_bind(data, jacana_records)
    with open('d-entities/%s.json' % (split,), 'w') as out_f:
        datalib.save_json(bound, out_f)