                raise Exception('only N<=2 is supported')
            print(q['freebaseKey'], ' ', relpaths, file=sys.stderr)
        except:
            traceback.print_exc()
            raise
        return {'qId': q['qId'], 'relPaths': relpaths}


if __name__ == "__main__":
    split, endpoint, N = sys.argv[1:]
    data = datalib.load_multi_data(split, ['main', 'd-freebase'])

    sparql = SPARQLWrapper(endpoint)
    sparql.setReturnFormat(JSON)

    # XXX: We would like to write the JSON file as we go, but we need
    # to test for last element in save_json() and things would just
    # get needlessly complicated.
    qrpf = QuestionRelPathFinder(sparql, int(N))
    pool = Pool(processes=1)
    qrp = pool.map(qrpf, data.to_list())
    pool.close()
    pool.join()

    with open('d-freebase-rp/%s.json' % (split,), 'w') as f:
        datalib.save_json(qrp, f)
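# A minimal sketch of the incremental writing that the XXX note above alludes
# to.  save_json_stream() is hypothetical and NOT part of datalib; it writes
# one element at a time, so a long run does not have to finish before any
# output appears.
import json

def save_json_stream(items, f):
    """Write an iterable of dicts as a JSON list, element by element."""
    f.write('[\n')
    first = True
    for item in items:
        if not first:
            f.write(',\n')
        json.dump(item, f)
        first = False
    f.write('\n]\n')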
    qmids = []
    for q in data.to_list():
        res_line = {}
        res_line['qId'] = q['qId']
        res_line['freebaseMids'] = []
        for c in q['Concept']:
            print('%s (%s) ? %s / %s' % (q['qId'], q['qText'], c['fullLabel'], c['pageID']), file=sys.stderr)
            pair = {}
            pair['concept'] = c['fullLabel']
            pair['mid'] = queryPageID(c['pageID'])
            pair['pageID'] = c['pageID']
            res_line['freebaseMids'].append(pair)
        if 'freebaseKey' in q:
            print('%s (%s) key %s' % (q['qId'], q['qText'], q['freebaseKey']), file=sys.stderr)
            keyPair = queryKey(q['freebaseKey'])
            if keyPair['mid'] not in [p['mid'] for p in res_line['freebaseMids']]:
                res_line['freebaseMids'].append(keyPair)
        # print(json.dumps(res_line))
        qmids.append(res_line)

    with open('d-freebase-mids/%s.json' % (split,), 'w') as f:
        datalib.save_json(qmids, f)
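# For reference, the shape of one record appended to qmids above (and saved
# to d-freebase-mids/<split>.json).  All values are illustrative placeholders,
# not real data; the entry appended from 'freebaseKey' has whatever fields
# queryKey() returns.
example_record = {
    'qId': 'wqr000001',
    'freebaseMids': [
        {'concept': 'Some Concept', 'mid': '/m/0xxxx', 'pageID': '12345'},
    ],
}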
#!/usr/bin/python
#
# fulldata.py: Create JSON files containing full data available for each question
#
# This merges JSON files from main/ and all the d-*/ directories to full/.
#
# Example: mkdir full; for split in devtest val trainmodel test; do ./fulldata.py $split full/ main/ d-*/; done

import sys
import datalib

if __name__ == "__main__":
    split = sys.argv[1]
    outdirname = sys.argv[2]
    indirnames = sys.argv[3:]

    data = datalib.load_multi_data(split, indirnames)

    with open('%s/%s.json' % (outdirname, split), 'w') as f:
        datalib.save_json(data.to_list(), f)
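# For orientation: datalib.load_multi_data() is assumed to merge the
# per-question records of each <dir>/<split>.json by qId, roughly like the
# hypothetical sketch below.  The real datalib implementation may differ.
import json
from collections import OrderedDict

def merge_split_dirs(split, indirnames):
    """Merge <dir>/<split>.json files, combining records that share a qId."""
    by_qid = OrderedDict()
    for d in indirnames:
        with open('%s/%s.json' % (d, split)) as f:
            for q in json.load(f):
                by_qid.setdefault(q['qId'], OrderedDict()).update(q)
    return list(by_qid.values())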
if __name__ == "__main__": split = sys.argv[1] data = datalib.load_multi_data(split, ['main', 'd-dump', 'd-freebase']) qmids = [] for q in data.to_list(): res_line = {} res_line['qId'] = q['qId'] res_line['freebaseMids'] = [] for c in q['Concept']: print('%s (%s) ? %s / %s' % (q['qId'], q['qText'], c['fullLabel'], c['pageID']), file=sys.stderr) pair = {} pair['concept'] = c['fullLabel'] pair['mid'] = queryPageID(c['pageID']) res_line['freebaseMids'].append(pair) if 'freebaseKey' in q: print('%s (%s) key %s' % (q['qId'], q['qText'], q['freebaseKey']), file=sys.stderr) keyPair = queryKey(q['freebaseKey']) if keyPair['mid'] not in [p['mid'] for p in res_line['freebaseMids']]: res_line['freebaseMids'].append(keyPair) # print (json.dumps(res_line)) qmids.append(res_line) with open('d-freebase-mids/%s.json' % (split,), 'w') as f: datalib.save_json(qmids, f)
    # Count how many times each path occurs, sort by frequency
    # (to preserve run-by-run stability, secondary sort alphabetically)
    # pl_counter = Counter(path_labels)
    # pl_tuples = [(pl, c) for pl, c in pl_counter.items()]
    # pl_set = sorted(sorted(pl_tuples, key=itemgetter(0)), key=itemgetter(1), reverse=True)
    # pl_set = list(set(path_labels))
    pl_set = remove_duplicates(path_labels)

    return OrderedDict([('qId', q['qId']), ('exploringPaths', pl_set)])


if __name__ == "__main__":
    split = sys.argv[1]
    global mode
    global apikey
    apikey = sys.argv[2] if len(sys.argv) > 2 else None
    data = datalib.load_multi_data(split, ['main', 'd-freebase-mids', 'd-dump'])

    # XXX: We would like to write the JSON file as we go, but we need
    # to test for last element in save_json() and things would just
    # get needlessly complicated.
    pool = Pool(processes=2)
    # qrp = pool.map(get_question_rp, data.to_list())
    qrp = map(get_question_rp, data.to_list())
    pool.close()
    pool.join()

    with open('d-relation-dump/%s_.json' % (split,), 'w') as f:
        datalib.save_json(list(qrp), f)
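# remove_duplicates() is not defined in this fragment; it is assumed to be an
# order-preserving de-duplication helper (unlike the commented-out set()-based
# variant, which would not preserve ordering).  A minimal sketch under that
# assumption:
def remove_duplicates(seq):
    """Return seq with duplicates dropped, keeping first occurrences in order."""
    out = []
    for x in seq:
        if x not in out:
            out.append(x)
    return out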
#!/usr/bin/python -u
# Resplit main/train.json to the three sub-splits.

from collections import OrderedDict
import datalib
from rawimport import resplit_train

if __name__ == "__main__":
    questions_main = datalib.load_multi_data('train', ['main'])

    qlist_wqr = []
    qlist_mfb = []
    for q in questions_main.to_list():
        q2 = OrderedDict([('qId', q['qId']), ('answers', q['answers']), ('qText', q['qText'])])
        if q['qId'].startswith('wqr'):
            qlist_wqr.append(q2)
        else:
            qlist_mfb.append(q2)
    qlist = qlist_wqr + qlist_mfb

    devtest_main, val_main, trainmodel_main = resplit_train(qlist)

    for nsplit, nsplit_main in [('devtest', devtest_main), ('val', val_main), ('trainmodel', trainmodel_main)]:
        with open('main/%s.json' % (nsplit,), 'w') as f:
            datalib.save_json(nsplit_main, f, sort_keys=False)
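# rawimport.resplit_train() is not shown here; it is assumed to split the
# (wqr-first, then mfb) question list deterministically into the devtest, val
# and trainmodel sub-splits.  The stand-in below is purely illustrative; the
# real proportions and ordering live in rawimport.py.
def resplit_train_sketch(qlist):
    n = len(qlist)
    devtest = qlist[:n // 4]
    val = qlist[n // 4:n // 2]
    trainmodel = qlist[n // 2:]
    return devtest, val, trainmodel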
def save_data(split, maindir, questions_main, fbdir, questions_fb):
    """ save full dataset for a given split """
    for data, fname in [(questions_main, '%s/%s.json' % (maindir, split)),
                        (questions_fb, '%s/%s.json' % (fbdir, split))]:
        with open(fname, 'w') as f:
            datalib.save_json(data, f)
import json
import sys

import datalib


def jacana_bind(data, jacanajson):
    """ bind jacana json by utterance texts to our dataset
    and get new prettier json """
    topicmap = dict([(jq['utterance'], jq['topics']) for jq in jacanajson])
    qnlp = []
    for q in data.to_list():
        topics = topicmap[q['qText']]
        topics = [topic.split(' ## ') for topic in topics]
        qnlp.append({'qId': q['qId'], 'entities': topics})
    return qnlp


if __name__ == "__main__":
    split, jacanafile = sys.argv[1:]
    data = datalib.load_multi_data(split, ['main'])
    with open(jacanafile, 'r') as jf:
        jacanajson = json.load(jf)

    qnlp = jacana_bind(data, jacanajson)

    with open('d-entities/%s.json' % (split,), 'w') as f:
        datalib.save_json(qnlp, f)
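# Illustrative example of what jacana_bind() does with one (made-up) jacana
# record; the exact fields packed into each ' ## '-joined topic string are an
# assumption about jacana's output, not something this script defines.
#
#   jacana record:  {"utterance": "what is the capital of france?",
#                    "topics": ["France ## /m/0f8l9c"]}
#   bound entry:    {"qId": "wqr000123",
#                    "entities": [["France", "/m/0f8l9c"]]}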